kernel: 5.4: import wireguard backport

author Jason A. Donenfeld <Jason@zx2c4.com>

Fri, 19 Feb 2021 13:29:04 +0000 (14:29 +0100)

committer David Bauer <mail@david-bauer.net>

Fri, 26 Feb 2021 19:41:01 +0000 (20:41 +0100)
author Jason A. Donenfeld <Jason@zx2c4.com>
Fri, 19 Feb 2021 13:29:04 +0000 (14:29 +0100)
committer David Bauer <mail@david-bauer.net>
Fri, 26 Feb 2021 19:41:01 +0000 (20:41 +0100)
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0001-crypto-lib-tidy-up-lib-crypto-Kconfig-and-Makefile.patch b/target/linux/generic/backport-5.4/080-wireguard-0001-crypto-lib-tidy-up-lib-crypto-Kconfig-and-Makefile.patch

new file mode 100644 (file)

index 0000000..9de7c9c
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0001-crypto-lib-tidy-up-lib-crypto-Kconfig-and-Makefile.patch
@@ -0,0 +1,112 @@
+From 7b5de278d022b3f31bc5b42cd160bea2e8bc4c74 Mon Sep 17 00:00:00 2001
+From: Ard Biesheuvel <ardb@kernel.org>
+Date: Fri, 8 Nov 2019 13:22:07 +0100
+Subject: [PATCH 001/124] crypto: lib - tidy up lib/crypto Kconfig and Makefile
+
+commit 746b2e024c67aa605ac12d135cd7085a49cf9dc4 upstream.
+
+In preparation of introducing a set of crypto library interfaces, tidy
+up the Makefile and split off the Kconfig symbols into a separate file.
+
+Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
+Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ crypto/Kconfig      | 13 +------------
+ lib/crypto/Kconfig  | 15 +++++++++++++++
+ lib/crypto/Makefile | 16 ++++++++--------
+ 3 files changed, 24 insertions(+), 20 deletions(-)
+ create mode 100644 lib/crypto/Kconfig
+
+--- a/crypto/Kconfig
++++ b/crypto/Kconfig
+@@ -878,9 +878,6 @@ config CRYPTO_SHA1_PPC_SPE
+         SHA-1 secure hash standard (DFIPS 180-4) implemented
+         using powerpc SPE SIMD instruction set.
+ 
+-config CRYPTO_LIB_SHA256
+-      tristate
+-
+ config CRYPTO_SHA256
+       tristate "SHA224 and SHA256 digest algorithm"
+       select CRYPTO_HASH
+@@ -1019,9 +1016,6 @@ config CRYPTO_GHASH_CLMUL_NI_INTEL
+ 
+ comment "Ciphers"
+ 
+-config CRYPTO_LIB_AES
+-      tristate
+-
+ config CRYPTO_AES
+       tristate "AES cipher algorithms"
+       select CRYPTO_ALGAPI
+@@ -1150,9 +1144,6 @@ config CRYPTO_ANUBIS
+         <https://www.cosic.esat.kuleuven.be/nessie/reports/>
+         <http://www.larc.usp.br/~pbarreto/AnubisPage.html>
+ 
+-config CRYPTO_LIB_ARC4
+-      tristate
+-
+ config CRYPTO_ARC4
+       tristate "ARC4 cipher algorithm"
+       select CRYPTO_BLKCIPHER
+@@ -1339,9 +1330,6 @@ config CRYPTO_CAST6_AVX_X86_64
+         This module provides the Cast6 cipher algorithm that processes
+         eight blocks parallel using the AVX instruction set.
+ 
+-config CRYPTO_LIB_DES
+-      tristate
+-
+ config CRYPTO_DES
+       tristate "DES and Triple DES EDE cipher algorithms"
+       select CRYPTO_ALGAPI
+@@ -1845,6 +1833,7 @@ config CRYPTO_STATS
+ config CRYPTO_HASH_INFO
+       bool
+ 
++source "lib/crypto/Kconfig"
+ source "drivers/crypto/Kconfig"
+ source "crypto/asymmetric_keys/Kconfig"
+ source "certs/Kconfig"
+--- /dev/null
++++ b/lib/crypto/Kconfig
+@@ -0,0 +1,15 @@
++# SPDX-License-Identifier: GPL-2.0
++
++comment "Crypto library routines"
++
++config CRYPTO_LIB_AES
++      tristate
++
++config CRYPTO_LIB_ARC4
++      tristate
++
++config CRYPTO_LIB_DES
++      tristate
++
++config CRYPTO_LIB_SHA256
++      tristate
+--- a/lib/crypto/Makefile
++++ b/lib/crypto/Makefile
+@@ -1,13 +1,13 @@
+ # SPDX-License-Identifier: GPL-2.0
+ 
+-obj-$(CONFIG_CRYPTO_LIB_AES) += libaes.o
+-libaes-y := aes.o
++obj-$(CONFIG_CRYPTO_LIB_AES)                  += libaes.o
++libaes-y                                      := aes.o
+ 
+-obj-$(CONFIG_CRYPTO_LIB_ARC4) += libarc4.o
+-libarc4-y := arc4.o
++obj-$(CONFIG_CRYPTO_LIB_ARC4)                 += libarc4.o
++libarc4-y                                     := arc4.o
+ 
+-obj-$(CONFIG_CRYPTO_LIB_DES) += libdes.o
+-libdes-y := des.o
++obj-$(CONFIG_CRYPTO_LIB_DES)                  += libdes.o
++libdes-y                                      := des.o
+ 
+-obj-$(CONFIG_CRYPTO_LIB_SHA256) += libsha256.o
+-libsha256-y := sha256.o
++obj-$(CONFIG_CRYPTO_LIB_SHA256)                       += libsha256.o
++libsha256-y                                   := sha256.o
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0002-crypto-chacha-move-existing-library-code-into-lib-cr.patch b/target/linux/generic/backport-5.4/080-wireguard-0002-crypto-chacha-move-existing-library-code-into-lib-cr.patch

new file mode 100644 (file)

index 0000000..a16ca08
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0002-crypto-chacha-move-existing-library-code-into-lib-cr.patch
@@ -0,0 +1,669 @@
+From 6f71439c260ddd0f9a21fee3e34449fe9c017ab6 Mon Sep 17 00:00:00 2001
+From: Ard Biesheuvel <ardb@kernel.org>
+Date: Fri, 8 Nov 2019 13:22:08 +0100
+Subject: [PATCH 002/124] crypto: chacha - move existing library code into
+ lib/crypto
+
+commit 5fb8ef25803ef33e2eb60b626435828b937bed75 upstream.
+
+Currently, our generic ChaCha implementation consists of a permute
+function in lib/chacha.c that operates on the 64-byte ChaCha state
+directly [and which is always included into the core kernel since it
+is used by the /dev/random driver], and the crypto API plumbing to
+expose it as a skcipher.
+
+In order to support in-kernel users that need the ChaCha streamcipher
+but have no need [or tolerance] for going through the abstractions of
+the crypto API, let's expose the streamcipher bits via a library API
+as well, in a way that permits the implementation to be superseded by
+an architecture specific one if provided.
+
+So move the streamcipher code into a separate module in lib/crypto,
+and expose the init() and crypt() routines to users of the library.
+
+Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
+Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ arch/arm/crypto/chacha-neon-glue.c   |  2 +-
+ arch/arm64/crypto/chacha-neon-glue.c |  2 +-
+ arch/x86/crypto/chacha_glue.c        |  2 +-
+ crypto/Kconfig                       |  1 +
+ crypto/chacha_generic.c              | 60 ++--------------------
+ include/crypto/chacha.h              | 77 ++++++++++++++++++++++------
+ include/crypto/internal/chacha.h     | 53 +++++++++++++++++++
+ lib/Makefile                         |  3 +-
+ lib/crypto/Kconfig                   | 26 ++++++++++
+ lib/crypto/Makefile                  |  4 ++
+ lib/{ => crypto}/chacha.c            | 20 ++++----
+ lib/crypto/libchacha.c               | 35 +++++++++++++
+ 12 files changed, 199 insertions(+), 86 deletions(-)
+ create mode 100644 include/crypto/internal/chacha.h
+ rename lib/{ => crypto}/chacha.c (88%)
+ create mode 100644 lib/crypto/libchacha.c
+
+--- a/arch/arm/crypto/chacha-neon-glue.c
++++ b/arch/arm/crypto/chacha-neon-glue.c
+@@ -20,7 +20,7 @@
+  */
+ 
+ #include <crypto/algapi.h>
+-#include <crypto/chacha.h>
++#include <crypto/internal/chacha.h>
+ #include <crypto/internal/simd.h>
+ #include <crypto/internal/skcipher.h>
+ #include <linux/kernel.h>
+--- a/arch/arm64/crypto/chacha-neon-glue.c
++++ b/arch/arm64/crypto/chacha-neon-glue.c
+@@ -20,7 +20,7 @@
+  */
+ 
+ #include <crypto/algapi.h>
+-#include <crypto/chacha.h>
++#include <crypto/internal/chacha.h>
+ #include <crypto/internal/simd.h>
+ #include <crypto/internal/skcipher.h>
+ #include <linux/kernel.h>
+--- a/arch/x86/crypto/chacha_glue.c
++++ b/arch/x86/crypto/chacha_glue.c
+@@ -7,7 +7,7 @@
+  */
+ 
+ #include <crypto/algapi.h>
+-#include <crypto/chacha.h>
++#include <crypto/internal/chacha.h>
+ #include <crypto/internal/simd.h>
+ #include <crypto/internal/skcipher.h>
+ #include <linux/kernel.h>
+--- a/crypto/Kconfig
++++ b/crypto/Kconfig
+@@ -1393,6 +1393,7 @@ config CRYPTO_SALSA20
+ 
+ config CRYPTO_CHACHA20
+       tristate "ChaCha stream cipher algorithms"
++      select CRYPTO_LIB_CHACHA_GENERIC
+       select CRYPTO_BLKCIPHER
+       help
+         The ChaCha20, XChaCha20, and XChaCha12 stream cipher algorithms.
+--- a/crypto/chacha_generic.c
++++ b/crypto/chacha_generic.c
+@@ -8,29 +8,10 @@
+ 
+ #include <asm/unaligned.h>
+ #include <crypto/algapi.h>
+-#include <crypto/chacha.h>
++#include <crypto/internal/chacha.h>
+ #include <crypto/internal/skcipher.h>
+ #include <linux/module.h>
+ 
+-static void chacha_docrypt(u32 *state, u8 *dst, const u8 *src,
+-                         unsigned int bytes, int nrounds)
+-{
+-      /* aligned to potentially speed up crypto_xor() */
+-      u8 stream[CHACHA_BLOCK_SIZE] __aligned(sizeof(long));
+-
+-      while (bytes >= CHACHA_BLOCK_SIZE) {
+-              chacha_block(state, stream, nrounds);
+-              crypto_xor_cpy(dst, src, stream, CHACHA_BLOCK_SIZE);
+-              bytes -= CHACHA_BLOCK_SIZE;
+-              dst += CHACHA_BLOCK_SIZE;
+-              src += CHACHA_BLOCK_SIZE;
+-      }
+-      if (bytes) {
+-              chacha_block(state, stream, nrounds);
+-              crypto_xor_cpy(dst, src, stream, bytes);
+-      }
+-}
+-
+ static int chacha_stream_xor(struct skcipher_request *req,
+                            const struct chacha_ctx *ctx, const u8 *iv)
+ {
+@@ -48,8 +29,8 @@ static int chacha_stream_xor(struct skci
+               if (nbytes < walk.total)
+                       nbytes = round_down(nbytes, CHACHA_BLOCK_SIZE);
+ 
+-              chacha_docrypt(state, walk.dst.virt.addr, walk.src.virt.addr,
+-                             nbytes, ctx->nrounds);
++              chacha_crypt_generic(state, walk.dst.virt.addr,
++                                   walk.src.virt.addr, nbytes, ctx->nrounds);
+               err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
+       }
+ 
+@@ -58,41 +39,10 @@ static int chacha_stream_xor(struct skci
+ 
+ void crypto_chacha_init(u32 *state, const struct chacha_ctx *ctx, const u8 *iv)
+ {
+-      state[0]  = 0x61707865; /* "expa" */
+-      state[1]  = 0x3320646e; /* "nd 3" */
+-      state[2]  = 0x79622d32; /* "2-by" */
+-      state[3]  = 0x6b206574; /* "te k" */
+-      state[4]  = ctx->key[0];
+-      state[5]  = ctx->key[1];
+-      state[6]  = ctx->key[2];
+-      state[7]  = ctx->key[3];
+-      state[8]  = ctx->key[4];
+-      state[9]  = ctx->key[5];
+-      state[10] = ctx->key[6];
+-      state[11] = ctx->key[7];
+-      state[12] = get_unaligned_le32(iv +  0);
+-      state[13] = get_unaligned_le32(iv +  4);
+-      state[14] = get_unaligned_le32(iv +  8);
+-      state[15] = get_unaligned_le32(iv + 12);
++      chacha_init_generic(state, ctx->key, iv);
+ }
+ EXPORT_SYMBOL_GPL(crypto_chacha_init);
+ 
+-static int chacha_setkey(struct crypto_skcipher *tfm, const u8 *key,
+-                       unsigned int keysize, int nrounds)
+-{
+-      struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
+-      int i;
+-
+-      if (keysize != CHACHA_KEY_SIZE)
+-              return -EINVAL;
+-
+-      for (i = 0; i < ARRAY_SIZE(ctx->key); i++)
+-              ctx->key[i] = get_unaligned_le32(key + i * sizeof(u32));
+-
+-      ctx->nrounds = nrounds;
+-      return 0;
+-}
+-
+ int crypto_chacha20_setkey(struct crypto_skcipher *tfm, const u8 *key,
+                          unsigned int keysize)
+ {
+@@ -126,7 +76,7 @@ int crypto_xchacha_crypt(struct skcipher
+ 
+       /* Compute the subkey given the original key and first 128 nonce bits */
+       crypto_chacha_init(state, ctx, req->iv);
+-      hchacha_block(state, subctx.key, ctx->nrounds);
++      hchacha_block_generic(state, subctx.key, ctx->nrounds);
+       subctx.nrounds = ctx->nrounds;
+ 
+       /* Build the real IV */
+--- a/include/crypto/chacha.h
++++ b/include/crypto/chacha.h
+@@ -15,9 +15,8 @@
+ #ifndef _CRYPTO_CHACHA_H
+ #define _CRYPTO_CHACHA_H
+ 
+-#include <crypto/skcipher.h>
++#include <asm/unaligned.h>
+ #include <linux/types.h>
+-#include <linux/crypto.h>
+ 
+ /* 32-bit stream position, then 96-bit nonce (RFC7539 convention) */
+ #define CHACHA_IV_SIZE                16
+@@ -29,26 +28,70 @@
+ /* 192-bit nonce, then 64-bit stream position */
+ #define XCHACHA_IV_SIZE               32
+ 
+-struct chacha_ctx {
+-      u32 key[8];
+-      int nrounds;
+-};
+-
+-void chacha_block(u32 *state, u8 *stream, int nrounds);
++void chacha_block_generic(u32 *state, u8 *stream, int nrounds);
+ static inline void chacha20_block(u32 *state, u8 *stream)
+ {
+-      chacha_block(state, stream, 20);
++      chacha_block_generic(state, stream, 20);
+ }
+-void hchacha_block(const u32 *in, u32 *out, int nrounds);
+ 
+-void crypto_chacha_init(u32 *state, const struct chacha_ctx *ctx, const u8 *iv);
++void hchacha_block_arch(const u32 *state, u32 *out, int nrounds);
++void hchacha_block_generic(const u32 *state, u32 *out, int nrounds);
++
++static inline void hchacha_block(const u32 *state, u32 *out, int nrounds)
++{
++      if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_CHACHA))
++              hchacha_block_arch(state, out, nrounds);
++      else
++              hchacha_block_generic(state, out, nrounds);
++}
+ 
+-int crypto_chacha20_setkey(struct crypto_skcipher *tfm, const u8 *key,
+-                         unsigned int keysize);
+-int crypto_chacha12_setkey(struct crypto_skcipher *tfm, const u8 *key,
+-                         unsigned int keysize);
++void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv);
++static inline void chacha_init_generic(u32 *state, const u32 *key, const u8 *iv)
++{
++      state[0]  = 0x61707865; /* "expa" */
++      state[1]  = 0x3320646e; /* "nd 3" */
++      state[2]  = 0x79622d32; /* "2-by" */
++      state[3]  = 0x6b206574; /* "te k" */
++      state[4]  = key[0];
++      state[5]  = key[1];
++      state[6]  = key[2];
++      state[7]  = key[3];
++      state[8]  = key[4];
++      state[9]  = key[5];
++      state[10] = key[6];
++      state[11] = key[7];
++      state[12] = get_unaligned_le32(iv +  0);
++      state[13] = get_unaligned_le32(iv +  4);
++      state[14] = get_unaligned_le32(iv +  8);
++      state[15] = get_unaligned_le32(iv + 12);
++}
+ 
+-int crypto_chacha_crypt(struct skcipher_request *req);
+-int crypto_xchacha_crypt(struct skcipher_request *req);
++static inline void chacha_init(u32 *state, const u32 *key, const u8 *iv)
++{
++      if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_CHACHA))
++              chacha_init_arch(state, key, iv);
++      else
++              chacha_init_generic(state, key, iv);
++}
++
++void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src,
++                     unsigned int bytes, int nrounds);
++void chacha_crypt_generic(u32 *state, u8 *dst, const u8 *src,
++                        unsigned int bytes, int nrounds);
++
++static inline void chacha_crypt(u32 *state, u8 *dst, const u8 *src,
++                              unsigned int bytes, int nrounds)
++{
++      if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_CHACHA))
++              chacha_crypt_arch(state, dst, src, bytes, nrounds);
++      else
++              chacha_crypt_generic(state, dst, src, bytes, nrounds);
++}
++
++static inline void chacha20_crypt(u32 *state, u8 *dst, const u8 *src,
++                                unsigned int bytes)
++{
++      chacha_crypt(state, dst, src, bytes, 20);
++}
+ 
+ #endif /* _CRYPTO_CHACHA_H */
+--- /dev/null
++++ b/include/crypto/internal/chacha.h
+@@ -0,0 +1,53 @@
++/* SPDX-License-Identifier: GPL-2.0 */
++
++#ifndef _CRYPTO_INTERNAL_CHACHA_H
++#define _CRYPTO_INTERNAL_CHACHA_H
++
++#include <crypto/chacha.h>
++#include <crypto/internal/skcipher.h>
++#include <linux/crypto.h>
++
++struct chacha_ctx {
++      u32 key[8];
++      int nrounds;
++};
++
++void crypto_chacha_init(u32 *state, const struct chacha_ctx *ctx, const u8 *iv);
++
++static inline int chacha_setkey(struct crypto_skcipher *tfm, const u8 *key,
++                              unsigned int keysize, int nrounds)
++{
++      struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
++      int i;
++
++      if (keysize != CHACHA_KEY_SIZE)
++              return -EINVAL;
++
++      for (i = 0; i < ARRAY_SIZE(ctx->key); i++)
++              ctx->key[i] = get_unaligned_le32(key + i * sizeof(u32));
++
++      ctx->nrounds = nrounds;
++      return 0;
++}
++
++static inline int chacha20_setkey(struct crypto_skcipher *tfm, const u8 *key,
++                                unsigned int keysize)
++{
++      return chacha_setkey(tfm, key, keysize, 20);
++}
++
++static int inline chacha12_setkey(struct crypto_skcipher *tfm, const u8 *key,
++                                unsigned int keysize)
++{
++      return chacha_setkey(tfm, key, keysize, 12);
++}
++
++int crypto_chacha20_setkey(struct crypto_skcipher *tfm, const u8 *key,
++                         unsigned int keysize);
++int crypto_chacha12_setkey(struct crypto_skcipher *tfm, const u8 *key,
++                         unsigned int keysize);
++
++int crypto_chacha_crypt(struct skcipher_request *req);
++int crypto_xchacha_crypt(struct skcipher_request *req);
++
++#endif /* _CRYPTO_CHACHA_H */
+--- a/lib/Makefile
++++ b/lib/Makefile
+@@ -26,8 +26,7 @@ endif
+ 
+ lib-y := ctype.o string.o vsprintf.o cmdline.o \
+        rbtree.o radix-tree.o timerqueue.o xarray.o \
+-       idr.o extable.o \
+-       sha1.o chacha.o irq_regs.o argv_split.o \
++       idr.o extable.o sha1.o irq_regs.o argv_split.o \
+        flex_proportions.o ratelimit.o show_mem.o \
+        is_single_threaded.o plist.o decompress.o kobject_uevent.o \
+        earlycpio.o seq_buf.o siphash.o dec_and_lock.o \
+--- a/lib/crypto/Kconfig
++++ b/lib/crypto/Kconfig
+@@ -8,6 +8,32 @@ config CRYPTO_LIB_AES
+ config CRYPTO_LIB_ARC4
+       tristate
+ 
++config CRYPTO_ARCH_HAVE_LIB_CHACHA
++      tristate
++      help
++        Declares whether the architecture provides an arch-specific
++        accelerated implementation of the ChaCha library interface,
++        either builtin or as a module.
++
++config CRYPTO_LIB_CHACHA_GENERIC
++      tristate
++      select CRYPTO_ALGAPI
++      help
++        This symbol can be depended upon by arch implementations of the
++        ChaCha library interface that require the generic code as a
++        fallback, e.g., for SIMD implementations. If no arch specific
++        implementation is enabled, this implementation serves the users
++        of CRYPTO_LIB_CHACHA.
++
++config CRYPTO_LIB_CHACHA
++      tristate "ChaCha library interface"
++      depends on CRYPTO_ARCH_HAVE_LIB_CHACHA || !CRYPTO_ARCH_HAVE_LIB_CHACHA
++      select CRYPTO_LIB_CHACHA_GENERIC if CRYPTO_ARCH_HAVE_LIB_CHACHA=n
++      help
++        Enable the ChaCha library interface. This interface may be fulfilled
++        by either the generic implementation or an arch-specific one, if one
++        is available and enabled.
++
+ config CRYPTO_LIB_DES
+       tristate
+ 
+--- a/lib/crypto/Makefile
++++ b/lib/crypto/Makefile
+@@ -1,5 +1,9 @@
+ # SPDX-License-Identifier: GPL-2.0
+ 
++# chacha is used by the /dev/random driver which is always builtin
++obj-y                                         += chacha.o
++obj-$(CONFIG_CRYPTO_LIB_CHACHA_GENERIC)               += libchacha.o
++
+ obj-$(CONFIG_CRYPTO_LIB_AES)                  += libaes.o
+ libaes-y                                      := aes.o
+ 
+--- a/lib/chacha.c
++++ /dev/null
+@@ -1,113 +0,0 @@
+-// SPDX-License-Identifier: GPL-2.0-or-later
+-/*
+- * The "hash function" used as the core of the ChaCha stream cipher (RFC7539)
+- *
+- * Copyright (C) 2015 Martin Willi
+- */
+-
+-#include <linux/kernel.h>
+-#include <linux/export.h>
+-#include <linux/bitops.h>
+-#include <linux/cryptohash.h>
+-#include <asm/unaligned.h>
+-#include <crypto/chacha.h>
+-
+-static void chacha_permute(u32 *x, int nrounds)
+-{
+-      int i;
+-
+-      /* whitelist the allowed round counts */
+-      WARN_ON_ONCE(nrounds != 20 && nrounds != 12);
+-
+-      for (i = 0; i < nrounds; i += 2) {
+-              x[0]  += x[4];    x[12] = rol32(x[12] ^ x[0],  16);
+-              x[1]  += x[5];    x[13] = rol32(x[13] ^ x[1],  16);
+-              x[2]  += x[6];    x[14] = rol32(x[14] ^ x[2],  16);
+-              x[3]  += x[7];    x[15] = rol32(x[15] ^ x[3],  16);
+-
+-              x[8]  += x[12];   x[4]  = rol32(x[4]  ^ x[8],  12);
+-              x[9]  += x[13];   x[5]  = rol32(x[5]  ^ x[9],  12);
+-              x[10] += x[14];   x[6]  = rol32(x[6]  ^ x[10], 12);
+-              x[11] += x[15];   x[7]  = rol32(x[7]  ^ x[11], 12);
+-
+-              x[0]  += x[4];    x[12] = rol32(x[12] ^ x[0],   8);
+-              x[1]  += x[5];    x[13] = rol32(x[13] ^ x[1],   8);
+-              x[2]  += x[6];    x[14] = rol32(x[14] ^ x[2],   8);
+-              x[3]  += x[7];    x[15] = rol32(x[15] ^ x[3],   8);
+-
+-              x[8]  += x[12];   x[4]  = rol32(x[4]  ^ x[8],   7);
+-              x[9]  += x[13];   x[5]  = rol32(x[5]  ^ x[9],   7);
+-              x[10] += x[14];   x[6]  = rol32(x[6]  ^ x[10],  7);
+-              x[11] += x[15];   x[7]  = rol32(x[7]  ^ x[11],  7);
+-
+-              x[0]  += x[5];    x[15] = rol32(x[15] ^ x[0],  16);
+-              x[1]  += x[6];    x[12] = rol32(x[12] ^ x[1],  16);
+-              x[2]  += x[7];    x[13] = rol32(x[13] ^ x[2],  16);
+-              x[3]  += x[4];    x[14] = rol32(x[14] ^ x[3],  16);
+-
+-              x[10] += x[15];   x[5]  = rol32(x[5]  ^ x[10], 12);
+-              x[11] += x[12];   x[6]  = rol32(x[6]  ^ x[11], 12);
+-              x[8]  += x[13];   x[7]  = rol32(x[7]  ^ x[8],  12);
+-              x[9]  += x[14];   x[4]  = rol32(x[4]  ^ x[9],  12);
+-
+-              x[0]  += x[5];    x[15] = rol32(x[15] ^ x[0],   8);
+-              x[1]  += x[6];    x[12] = rol32(x[12] ^ x[1],   8);
+-              x[2]  += x[7];    x[13] = rol32(x[13] ^ x[2],   8);
+-              x[3]  += x[4];    x[14] = rol32(x[14] ^ x[3],   8);
+-
+-              x[10] += x[15];   x[5]  = rol32(x[5]  ^ x[10],  7);
+-              x[11] += x[12];   x[6]  = rol32(x[6]  ^ x[11],  7);
+-              x[8]  += x[13];   x[7]  = rol32(x[7]  ^ x[8],   7);
+-              x[9]  += x[14];   x[4]  = rol32(x[4]  ^ x[9],   7);
+-      }
+-}
+-
+-/**
+- * chacha_block - generate one keystream block and increment block counter
+- * @state: input state matrix (16 32-bit words)
+- * @stream: output keystream block (64 bytes)
+- * @nrounds: number of rounds (20 or 12; 20 is recommended)
+- *
+- * This is the ChaCha core, a function from 64-byte strings to 64-byte strings.
+- * The caller has already converted the endianness of the input.  This function
+- * also handles incrementing the block counter in the input matrix.
+- */
+-void chacha_block(u32 *state, u8 *stream, int nrounds)
+-{
+-      u32 x[16];
+-      int i;
+-
+-      memcpy(x, state, 64);
+-
+-      chacha_permute(x, nrounds);
+-
+-      for (i = 0; i < ARRAY_SIZE(x); i++)
+-              put_unaligned_le32(x[i] + state[i], &stream[i * sizeof(u32)]);
+-
+-      state[12]++;
+-}
+-EXPORT_SYMBOL(chacha_block);
+-
+-/**
+- * hchacha_block - abbreviated ChaCha core, for XChaCha
+- * @in: input state matrix (16 32-bit words)
+- * @out: output (8 32-bit words)
+- * @nrounds: number of rounds (20 or 12; 20 is recommended)
+- *
+- * HChaCha is the ChaCha equivalent of HSalsa and is an intermediate step
+- * towards XChaCha (see https://cr.yp.to/snuffle/xsalsa-20081128.pdf).  HChaCha
+- * skips the final addition of the initial state, and outputs only certain words
+- * of the state.  It should not be used for streaming directly.
+- */
+-void hchacha_block(const u32 *in, u32 *out, int nrounds)
+-{
+-      u32 x[16];
+-
+-      memcpy(x, in, 64);
+-
+-      chacha_permute(x, nrounds);
+-
+-      memcpy(&out[0], &x[0], 16);
+-      memcpy(&out[4], &x[12], 16);
+-}
+-EXPORT_SYMBOL(hchacha_block);
+--- /dev/null
++++ b/lib/crypto/chacha.c
+@@ -0,0 +1,115 @@
++// SPDX-License-Identifier: GPL-2.0-or-later
++/*
++ * The "hash function" used as the core of the ChaCha stream cipher (RFC7539)
++ *
++ * Copyright (C) 2015 Martin Willi
++ */
++
++#include <linux/bug.h>
++#include <linux/kernel.h>
++#include <linux/export.h>
++#include <linux/bitops.h>
++#include <linux/string.h>
++#include <linux/cryptohash.h>
++#include <asm/unaligned.h>
++#include <crypto/chacha.h>
++
++static void chacha_permute(u32 *x, int nrounds)
++{
++      int i;
++
++      /* whitelist the allowed round counts */
++      WARN_ON_ONCE(nrounds != 20 && nrounds != 12);
++
++      for (i = 0; i < nrounds; i += 2) {
++              x[0]  += x[4];    x[12] = rol32(x[12] ^ x[0],  16);
++              x[1]  += x[5];    x[13] = rol32(x[13] ^ x[1],  16);
++              x[2]  += x[6];    x[14] = rol32(x[14] ^ x[2],  16);
++              x[3]  += x[7];    x[15] = rol32(x[15] ^ x[3],  16);
++
++              x[8]  += x[12];   x[4]  = rol32(x[4]  ^ x[8],  12);
++              x[9]  += x[13];   x[5]  = rol32(x[5]  ^ x[9],  12);
++              x[10] += x[14];   x[6]  = rol32(x[6]  ^ x[10], 12);
++              x[11] += x[15];   x[7]  = rol32(x[7]  ^ x[11], 12);
++
++              x[0]  += x[4];    x[12] = rol32(x[12] ^ x[0],   8);
++              x[1]  += x[5];    x[13] = rol32(x[13] ^ x[1],   8);
++              x[2]  += x[6];    x[14] = rol32(x[14] ^ x[2],   8);
++              x[3]  += x[7];    x[15] = rol32(x[15] ^ x[3],   8);
++
++              x[8]  += x[12];   x[4]  = rol32(x[4]  ^ x[8],   7);
++              x[9]  += x[13];   x[5]  = rol32(x[5]  ^ x[9],   7);
++              x[10] += x[14];   x[6]  = rol32(x[6]  ^ x[10],  7);
++              x[11] += x[15];   x[7]  = rol32(x[7]  ^ x[11],  7);
++
++              x[0]  += x[5];    x[15] = rol32(x[15] ^ x[0],  16);
++              x[1]  += x[6];    x[12] = rol32(x[12] ^ x[1],  16);
++              x[2]  += x[7];    x[13] = rol32(x[13] ^ x[2],  16);
++              x[3]  += x[4];    x[14] = rol32(x[14] ^ x[3],  16);
++
++              x[10] += x[15];   x[5]  = rol32(x[5]  ^ x[10], 12);
++              x[11] += x[12];   x[6]  = rol32(x[6]  ^ x[11], 12);
++              x[8]  += x[13];   x[7]  = rol32(x[7]  ^ x[8],  12);
++              x[9]  += x[14];   x[4]  = rol32(x[4]  ^ x[9],  12);
++
++              x[0]  += x[5];    x[15] = rol32(x[15] ^ x[0],   8);
++              x[1]  += x[6];    x[12] = rol32(x[12] ^ x[1],   8);
++              x[2]  += x[7];    x[13] = rol32(x[13] ^ x[2],   8);
++              x[3]  += x[4];    x[14] = rol32(x[14] ^ x[3],   8);
++
++              x[10] += x[15];   x[5]  = rol32(x[5]  ^ x[10],  7);
++              x[11] += x[12];   x[6]  = rol32(x[6]  ^ x[11],  7);
++              x[8]  += x[13];   x[7]  = rol32(x[7]  ^ x[8],   7);
++              x[9]  += x[14];   x[4]  = rol32(x[4]  ^ x[9],   7);
++      }
++}
++
++/**
++ * chacha_block - generate one keystream block and increment block counter
++ * @state: input state matrix (16 32-bit words)
++ * @stream: output keystream block (64 bytes)
++ * @nrounds: number of rounds (20 or 12; 20 is recommended)
++ *
++ * This is the ChaCha core, a function from 64-byte strings to 64-byte strings.
++ * The caller has already converted the endianness of the input.  This function
++ * also handles incrementing the block counter in the input matrix.
++ */
++void chacha_block_generic(u32 *state, u8 *stream, int nrounds)
++{
++      u32 x[16];
++      int i;
++
++      memcpy(x, state, 64);
++
++      chacha_permute(x, nrounds);
++
++      for (i = 0; i < ARRAY_SIZE(x); i++)
++              put_unaligned_le32(x[i] + state[i], &stream[i * sizeof(u32)]);
++
++      state[12]++;
++}
++EXPORT_SYMBOL(chacha_block_generic);
++
++/**
++ * hchacha_block_generic - abbreviated ChaCha core, for XChaCha
++ * @state: input state matrix (16 32-bit words)
++ * @out: output (8 32-bit words)
++ * @nrounds: number of rounds (20 or 12; 20 is recommended)
++ *
++ * HChaCha is the ChaCha equivalent of HSalsa and is an intermediate step
++ * towards XChaCha (see https://cr.yp.to/snuffle/xsalsa-20081128.pdf).  HChaCha
++ * skips the final addition of the initial state, and outputs only certain words
++ * of the state.  It should not be used for streaming directly.
++ */
++void hchacha_block_generic(const u32 *state, u32 *stream, int nrounds)
++{
++      u32 x[16];
++
++      memcpy(x, state, 64);
++
++      chacha_permute(x, nrounds);
++
++      memcpy(&stream[0], &x[0], 16);
++      memcpy(&stream[4], &x[12], 16);
++}
++EXPORT_SYMBOL(hchacha_block_generic);
+--- /dev/null
++++ b/lib/crypto/libchacha.c
+@@ -0,0 +1,35 @@
++// SPDX-License-Identifier: GPL-2.0-or-later
++/*
++ * The ChaCha stream cipher (RFC7539)
++ *
++ * Copyright (C) 2015 Martin Willi
++ */
++
++#include <linux/kernel.h>
++#include <linux/export.h>
++#include <linux/module.h>
++
++#include <crypto/algapi.h> // for crypto_xor_cpy
++#include <crypto/chacha.h>
++
++void chacha_crypt_generic(u32 *state, u8 *dst, const u8 *src,
++                        unsigned int bytes, int nrounds)
++{
++      /* aligned to potentially speed up crypto_xor() */
++      u8 stream[CHACHA_BLOCK_SIZE] __aligned(sizeof(long));
++
++      while (bytes >= CHACHA_BLOCK_SIZE) {
++              chacha_block_generic(state, stream, nrounds);
++              crypto_xor_cpy(dst, src, stream, CHACHA_BLOCK_SIZE);
++              bytes -= CHACHA_BLOCK_SIZE;
++              dst += CHACHA_BLOCK_SIZE;
++              src += CHACHA_BLOCK_SIZE;
++      }
++      if (bytes) {
++              chacha_block_generic(state, stream, nrounds);
++              crypto_xor_cpy(dst, src, stream, bytes);
++      }
++}
++EXPORT_SYMBOL(chacha_crypt_generic);
++
++MODULE_LICENSE("GPL");
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0003-crypto-x86-chacha-depend-on-generic-chacha-library-i.patch b/target/linux/generic/backport-5.4/080-wireguard-0003-crypto-x86-chacha-depend-on-generic-chacha-library-i.patch

new file mode 100644 (file)

index 0000000..6033938
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0003-crypto-x86-chacha-depend-on-generic-chacha-library-i.patch
@@ -0,0 +1,192 @@
+From 29c84baf5e125aa43265192a08cc4bd904db1d45 Mon Sep 17 00:00:00 2001
+From: Ard Biesheuvel <ardb@kernel.org>
+Date: Fri, 8 Nov 2019 13:22:09 +0100
+Subject: [PATCH 003/124] crypto: x86/chacha - depend on generic chacha library
+ instead of crypto driver
+
+commit 28e8d89b1ce8d2e7badfb5f69971dd635acb8863 upstream.
+
+In preparation of extending the x86 ChaCha driver to also expose the ChaCha
+library interface, drop the dependency on the chacha_generic crypto driver
+as a non-SIMD fallback, and depend on the generic ChaCha library directly.
+This way, we only pull in the code we actually need, without registering
+a set of ChaCha skciphers that we will never use.
+
+Since turning the FPU on and off is cheap these days, simplify the SIMD
+routine by dropping the per-page yield, which makes for a cleaner switch
+to the library API as well. This also allows us to invoke the skcipher
+walk routines in non-atomic mode.
+
+Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
+Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ arch/x86/crypto/chacha_glue.c | 90 ++++++++++++++---------------------
+ crypto/Kconfig                |  2 +-
+ 2 files changed, 36 insertions(+), 56 deletions(-)
+
+--- a/arch/x86/crypto/chacha_glue.c
++++ b/arch/x86/crypto/chacha_glue.c
+@@ -123,37 +123,38 @@ static void chacha_dosimd(u32 *state, u8
+       }
+ }
+ 
+-static int chacha_simd_stream_xor(struct skcipher_walk *walk,
++static int chacha_simd_stream_xor(struct skcipher_request *req,
+                                 const struct chacha_ctx *ctx, const u8 *iv)
+ {
+       u32 *state, state_buf[16 + 2] __aligned(8);
+-      int next_yield = 4096; /* bytes until next FPU yield */
+-      int err = 0;
++      struct skcipher_walk walk;
++      int err;
++
++      err = skcipher_walk_virt(&walk, req, false);
+ 
+       BUILD_BUG_ON(CHACHA_STATE_ALIGN != 16);
+       state = PTR_ALIGN(state_buf + 0, CHACHA_STATE_ALIGN);
+ 
+-      crypto_chacha_init(state, ctx, iv);
++      chacha_init_generic(state, ctx->key, iv);
+ 
+-      while (walk->nbytes > 0) {
+-              unsigned int nbytes = walk->nbytes;
++      while (walk.nbytes > 0) {
++              unsigned int nbytes = walk.nbytes;
+ 
+-              if (nbytes < walk->total) {
+-                      nbytes = round_down(nbytes, walk->stride);
+-                      next_yield -= nbytes;
+-              }
+-
+-              chacha_dosimd(state, walk->dst.virt.addr, walk->src.virt.addr,
+-                            nbytes, ctx->nrounds);
++              if (nbytes < walk.total)
++                      nbytes = round_down(nbytes, walk.stride);
+ 
+-              if (next_yield <= 0) {
+-                      /* temporarily allow preemption */
+-                      kernel_fpu_end();
++              if (!crypto_simd_usable()) {
++                      chacha_crypt_generic(state, walk.dst.virt.addr,
++                                           walk.src.virt.addr, nbytes,
++                                           ctx->nrounds);
++              } else {
+                       kernel_fpu_begin();
+-                      next_yield = 4096;
++                      chacha_dosimd(state, walk.dst.virt.addr,
++                                    walk.src.virt.addr, nbytes,
++                                    ctx->nrounds);
++                      kernel_fpu_end();
+               }
+-
+-              err = skcipher_walk_done(walk, walk->nbytes - nbytes);
++              err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
+       }
+ 
+       return err;
+@@ -163,55 +164,34 @@ static int chacha_simd(struct skcipher_r
+ {
+       struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+       struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
+-      struct skcipher_walk walk;
+-      int err;
+-
+-      if (req->cryptlen <= CHACHA_BLOCK_SIZE || !crypto_simd_usable())
+-              return crypto_chacha_crypt(req);
+ 
+-      err = skcipher_walk_virt(&walk, req, true);
+-      if (err)
+-              return err;
+-
+-      kernel_fpu_begin();
+-      err = chacha_simd_stream_xor(&walk, ctx, req->iv);
+-      kernel_fpu_end();
+-      return err;
++      return chacha_simd_stream_xor(req, ctx, req->iv);
+ }
+ 
+ static int xchacha_simd(struct skcipher_request *req)
+ {
+       struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+       struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
+-      struct skcipher_walk walk;
+-      struct chacha_ctx subctx;
+       u32 *state, state_buf[16 + 2] __aligned(8);
++      struct chacha_ctx subctx;
+       u8 real_iv[16];
+-      int err;
+-
+-      if (req->cryptlen <= CHACHA_BLOCK_SIZE || !crypto_simd_usable())
+-              return crypto_xchacha_crypt(req);
+-
+-      err = skcipher_walk_virt(&walk, req, true);
+-      if (err)
+-              return err;
+ 
+       BUILD_BUG_ON(CHACHA_STATE_ALIGN != 16);
+       state = PTR_ALIGN(state_buf + 0, CHACHA_STATE_ALIGN);
+-      crypto_chacha_init(state, ctx, req->iv);
++      chacha_init_generic(state, ctx->key, req->iv);
+ 
+-      kernel_fpu_begin();
+-
+-      hchacha_block_ssse3(state, subctx.key, ctx->nrounds);
++      if (req->cryptlen > CHACHA_BLOCK_SIZE && crypto_simd_usable()) {
++              kernel_fpu_begin();
++              hchacha_block_ssse3(state, subctx.key, ctx->nrounds);
++              kernel_fpu_end();
++      } else {
++              hchacha_block_generic(state, subctx.key, ctx->nrounds);
++      }
+       subctx.nrounds = ctx->nrounds;
+ 
+       memcpy(&real_iv[0], req->iv + 24, 8);
+       memcpy(&real_iv[8], req->iv + 16, 8);
+-      err = chacha_simd_stream_xor(&walk, &subctx, real_iv);
+-
+-      kernel_fpu_end();
+-
+-      return err;
++      return chacha_simd_stream_xor(req, &subctx, real_iv);
+ }
+ 
+ static struct skcipher_alg algs[] = {
+@@ -227,7 +207,7 @@ static struct skcipher_alg algs[] = {
+               .max_keysize            = CHACHA_KEY_SIZE,
+               .ivsize                 = CHACHA_IV_SIZE,
+               .chunksize              = CHACHA_BLOCK_SIZE,
+-              .setkey                 = crypto_chacha20_setkey,
++              .setkey                 = chacha20_setkey,
+               .encrypt                = chacha_simd,
+               .decrypt                = chacha_simd,
+       }, {
+@@ -242,7 +222,7 @@ static struct skcipher_alg algs[] = {
+               .max_keysize            = CHACHA_KEY_SIZE,
+               .ivsize                 = XCHACHA_IV_SIZE,
+               .chunksize              = CHACHA_BLOCK_SIZE,
+-              .setkey                 = crypto_chacha20_setkey,
++              .setkey                 = chacha20_setkey,
+               .encrypt                = xchacha_simd,
+               .decrypt                = xchacha_simd,
+       }, {
+@@ -257,7 +237,7 @@ static struct skcipher_alg algs[] = {
+               .max_keysize            = CHACHA_KEY_SIZE,
+               .ivsize                 = XCHACHA_IV_SIZE,
+               .chunksize              = CHACHA_BLOCK_SIZE,
+-              .setkey                 = crypto_chacha12_setkey,
++              .setkey                 = chacha12_setkey,
+               .encrypt                = xchacha_simd,
+               .decrypt                = xchacha_simd,
+       },
+--- a/crypto/Kconfig
++++ b/crypto/Kconfig
+@@ -1417,7 +1417,7 @@ config CRYPTO_CHACHA20_X86_64
+       tristate "ChaCha stream cipher algorithms (x86_64/SSSE3/AVX2/AVX-512VL)"
+       depends on X86 && 64BIT
+       select CRYPTO_BLKCIPHER
+-      select CRYPTO_CHACHA20
++      select CRYPTO_LIB_CHACHA_GENERIC
+       help
+         SSSE3, AVX2, and AVX-512VL optimized implementations of the ChaCha20,
+         XChaCha20, and XChaCha12 stream ciphers.
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0004-crypto-x86-chacha-expose-SIMD-ChaCha-routine-as-libr.patch b/target/linux/generic/backport-5.4/080-wireguard-0004-crypto-x86-chacha-expose-SIMD-ChaCha-routine-as-libr.patch

new file mode 100644 (file)

index 0000000..0e916c8
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0004-crypto-x86-chacha-expose-SIMD-ChaCha-routine-as-libr.patch
@@ -0,0 +1,205 @@
+From e7f5b03590beee54da6d02aabe0e1392bc3251e4 Mon Sep 17 00:00:00 2001
+From: Ard Biesheuvel <ardb@kernel.org>
+Date: Fri, 8 Nov 2019 13:22:10 +0100
+Subject: [PATCH 004/124] crypto: x86/chacha - expose SIMD ChaCha routine as
+ library function
+
+commit 84e03fa39fbe95a5567d43bff458c6d3b3a23ad1 upstream.
+
+Wire the existing x86 SIMD ChaCha code into the new ChaCha library
+interface, so that users of the library interface will get the
+accelerated version when available.
+
+Given that calls into the library API will always go through the
+routines in this module if it is enabled, switch to static keys
+to select the optimal implementation available (which may be none
+at all, in which case we defer to the generic implementation for
+all invocations).
+
+Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
+Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ arch/x86/crypto/chacha_glue.c | 91 +++++++++++++++++++++++++----------
+ crypto/Kconfig                |  1 +
+ include/crypto/chacha.h       |  6 +++
+ 3 files changed, 73 insertions(+), 25 deletions(-)
+
+--- a/arch/x86/crypto/chacha_glue.c
++++ b/arch/x86/crypto/chacha_glue.c
+@@ -21,24 +21,24 @@ asmlinkage void chacha_block_xor_ssse3(u
+ asmlinkage void chacha_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
+                                       unsigned int len, int nrounds);
+ asmlinkage void hchacha_block_ssse3(const u32 *state, u32 *out, int nrounds);
+-#ifdef CONFIG_AS_AVX2
++
+ asmlinkage void chacha_2block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
+                                      unsigned int len, int nrounds);
+ asmlinkage void chacha_4block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
+                                      unsigned int len, int nrounds);
+ asmlinkage void chacha_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
+                                      unsigned int len, int nrounds);
+-static bool chacha_use_avx2;
+-#ifdef CONFIG_AS_AVX512
++
+ asmlinkage void chacha_2block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
+                                          unsigned int len, int nrounds);
+ asmlinkage void chacha_4block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
+                                          unsigned int len, int nrounds);
+ asmlinkage void chacha_8block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
+                                          unsigned int len, int nrounds);
+-static bool chacha_use_avx512vl;
+-#endif
+-#endif
++
++static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_simd);
++static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx2);
++static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx512vl);
+ 
+ static unsigned int chacha_advance(unsigned int len, unsigned int maxblocks)
+ {
+@@ -49,9 +49,8 @@ static unsigned int chacha_advance(unsig
+ static void chacha_dosimd(u32 *state, u8 *dst, const u8 *src,
+                         unsigned int bytes, int nrounds)
+ {
+-#ifdef CONFIG_AS_AVX2
+-#ifdef CONFIG_AS_AVX512
+-      if (chacha_use_avx512vl) {
++      if (IS_ENABLED(CONFIG_AS_AVX512) &&
++          static_branch_likely(&chacha_use_avx512vl)) {
+               while (bytes >= CHACHA_BLOCK_SIZE * 8) {
+                       chacha_8block_xor_avx512vl(state, dst, src, bytes,
+                                                  nrounds);
+@@ -79,8 +78,9 @@ static void chacha_dosimd(u32 *state, u8
+                       return;
+               }
+       }
+-#endif
+-      if (chacha_use_avx2) {
++
++      if (IS_ENABLED(CONFIG_AS_AVX2) &&
++          static_branch_likely(&chacha_use_avx2)) {
+               while (bytes >= CHACHA_BLOCK_SIZE * 8) {
+                       chacha_8block_xor_avx2(state, dst, src, bytes, nrounds);
+                       bytes -= CHACHA_BLOCK_SIZE * 8;
+@@ -104,7 +104,7 @@ static void chacha_dosimd(u32 *state, u8
+                       return;
+               }
+       }
+-#endif
++
+       while (bytes >= CHACHA_BLOCK_SIZE * 4) {
+               chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds);
+               bytes -= CHACHA_BLOCK_SIZE * 4;
+@@ -123,6 +123,43 @@ static void chacha_dosimd(u32 *state, u8
+       }
+ }
+ 
++void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds)
++{
++      state = PTR_ALIGN(state, CHACHA_STATE_ALIGN);
++
++      if (!static_branch_likely(&chacha_use_simd) || !crypto_simd_usable()) {
++              hchacha_block_generic(state, stream, nrounds);
++      } else {
++              kernel_fpu_begin();
++              hchacha_block_ssse3(state, stream, nrounds);
++              kernel_fpu_end();
++      }
++}
++EXPORT_SYMBOL(hchacha_block_arch);
++
++void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv)
++{
++      state = PTR_ALIGN(state, CHACHA_STATE_ALIGN);
++
++      chacha_init_generic(state, key, iv);
++}
++EXPORT_SYMBOL(chacha_init_arch);
++
++void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes,
++                     int nrounds)
++{
++      state = PTR_ALIGN(state, CHACHA_STATE_ALIGN);
++
++      if (!static_branch_likely(&chacha_use_simd) || !crypto_simd_usable() ||
++          bytes <= CHACHA_BLOCK_SIZE)
++              return chacha_crypt_generic(state, dst, src, bytes, nrounds);
++
++      kernel_fpu_begin();
++      chacha_dosimd(state, dst, src, bytes, nrounds);
++      kernel_fpu_end();
++}
++EXPORT_SYMBOL(chacha_crypt_arch);
++
+ static int chacha_simd_stream_xor(struct skcipher_request *req,
+                                 const struct chacha_ctx *ctx, const u8 *iv)
+ {
+@@ -143,7 +180,8 @@ static int chacha_simd_stream_xor(struct
+               if (nbytes < walk.total)
+                       nbytes = round_down(nbytes, walk.stride);
+ 
+-              if (!crypto_simd_usable()) {
++              if (!static_branch_likely(&chacha_use_simd) ||
++                  !crypto_simd_usable()) {
+                       chacha_crypt_generic(state, walk.dst.virt.addr,
+                                            walk.src.virt.addr, nbytes,
+                                            ctx->nrounds);
+@@ -246,18 +284,21 @@ static struct skcipher_alg algs[] = {
+ static int __init chacha_simd_mod_init(void)
+ {
+       if (!boot_cpu_has(X86_FEATURE_SSSE3))
+-              return -ENODEV;
++              return 0;
+ 
+-#ifdef CONFIG_AS_AVX2
+-      chacha_use_avx2 = boot_cpu_has(X86_FEATURE_AVX) &&
+-                        boot_cpu_has(X86_FEATURE_AVX2) &&
+-                        cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL);
+-#ifdef CONFIG_AS_AVX512
+-      chacha_use_avx512vl = chacha_use_avx2 &&
+-                            boot_cpu_has(X86_FEATURE_AVX512VL) &&
+-                            boot_cpu_has(X86_FEATURE_AVX512BW); /* kmovq */
+-#endif
+-#endif
++      static_branch_enable(&chacha_use_simd);
++
++      if (IS_ENABLED(CONFIG_AS_AVX2) &&
++          boot_cpu_has(X86_FEATURE_AVX) &&
++          boot_cpu_has(X86_FEATURE_AVX2) &&
++          cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) {
++              static_branch_enable(&chacha_use_avx2);
++
++              if (IS_ENABLED(CONFIG_AS_AVX512) &&
++                  boot_cpu_has(X86_FEATURE_AVX512VL) &&
++                  boot_cpu_has(X86_FEATURE_AVX512BW)) /* kmovq */
++                      static_branch_enable(&chacha_use_avx512vl);
++      }
+       return crypto_register_skciphers(algs, ARRAY_SIZE(algs));
+ }
+ 
+--- a/crypto/Kconfig
++++ b/crypto/Kconfig
+@@ -1418,6 +1418,7 @@ config CRYPTO_CHACHA20_X86_64
+       depends on X86 && 64BIT
+       select CRYPTO_BLKCIPHER
+       select CRYPTO_LIB_CHACHA_GENERIC
++      select CRYPTO_ARCH_HAVE_LIB_CHACHA
+       help
+         SSSE3, AVX2, and AVX-512VL optimized implementations of the ChaCha20,
+         XChaCha20, and XChaCha12 stream ciphers.
+--- a/include/crypto/chacha.h
++++ b/include/crypto/chacha.h
+@@ -25,6 +25,12 @@
+ #define CHACHA_BLOCK_SIZE     64
+ #define CHACHAPOLY_IV_SIZE    12
+ 
++#ifdef CONFIG_X86_64
++#define CHACHA_STATE_WORDS    ((CHACHA_BLOCK_SIZE + 12) / sizeof(u32))
++#else
++#define CHACHA_STATE_WORDS    (CHACHA_BLOCK_SIZE / sizeof(u32))
++#endif
++
+ /* 192-bit nonce, then 64-bit stream position */
+ #define XCHACHA_IV_SIZE               32
+ 
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0005-crypto-arm64-chacha-depend-on-generic-chacha-library.patch b/target/linux/generic/backport-5.4/080-wireguard-0005-crypto-arm64-chacha-depend-on-generic-chacha-library.patch

new file mode 100644 (file)

index 0000000..eca55ed
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0005-crypto-arm64-chacha-depend-on-generic-chacha-library.patch
@@ -0,0 +1,129 @@
+From 527b7f4f3e244c58e07fdb7d850acb45821e1c52 Mon Sep 17 00:00:00 2001
+From: Ard Biesheuvel <ardb@kernel.org>
+Date: Fri, 8 Nov 2019 13:22:11 +0100
+Subject: [PATCH 005/124] crypto: arm64/chacha - depend on generic chacha
+ library instead of crypto driver
+
+commit c77da4867cbb7841177275dbb250f5c09679fae4 upstream.
+
+Depend on the generic ChaCha library routines instead of pulling in the
+generic ChaCha skcipher driver, which is more than we need, and makes
+managing the dependencies between the generic library, generic driver,
+accelerated library and driver more complicated.
+
+While at it, drop the logic to prefer the scalar code on short inputs.
+Turning the NEON on and off is cheap these days, and one major use case
+for ChaCha20 is ChaCha20-Poly1305, which is guaranteed to hit the scalar
+path upon every invocation (when doing the Poly1305 nonce generation).
+
+Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
+Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ arch/arm64/crypto/Kconfig            |  2 +-
+ arch/arm64/crypto/chacha-neon-glue.c | 40 +++++++++++++++-------------
+ 2 files changed, 23 insertions(+), 19 deletions(-)
+
+--- a/arch/arm64/crypto/Kconfig
++++ b/arch/arm64/crypto/Kconfig
+@@ -103,7 +103,7 @@ config CRYPTO_CHACHA20_NEON
+       tristate "ChaCha20, XChaCha20, and XChaCha12 stream ciphers using NEON instructions"
+       depends on KERNEL_MODE_NEON
+       select CRYPTO_BLKCIPHER
+-      select CRYPTO_CHACHA20
++      select CRYPTO_LIB_CHACHA_GENERIC
+ 
+ config CRYPTO_NHPOLY1305_NEON
+       tristate "NHPoly1305 hash function using NEON instructions (for Adiantum)"
+--- a/arch/arm64/crypto/chacha-neon-glue.c
++++ b/arch/arm64/crypto/chacha-neon-glue.c
+@@ -68,7 +68,7 @@ static int chacha_neon_stream_xor(struct
+ 
+       err = skcipher_walk_virt(&walk, req, false);
+ 
+-      crypto_chacha_init(state, ctx, iv);
++      chacha_init_generic(state, ctx->key, iv);
+ 
+       while (walk.nbytes > 0) {
+               unsigned int nbytes = walk.nbytes;
+@@ -76,10 +76,16 @@ static int chacha_neon_stream_xor(struct
+               if (nbytes < walk.total)
+                       nbytes = rounddown(nbytes, walk.stride);
+ 
+-              kernel_neon_begin();
+-              chacha_doneon(state, walk.dst.virt.addr, walk.src.virt.addr,
+-                            nbytes, ctx->nrounds);
+-              kernel_neon_end();
++              if (!crypto_simd_usable()) {
++                      chacha_crypt_generic(state, walk.dst.virt.addr,
++                                           walk.src.virt.addr, nbytes,
++                                           ctx->nrounds);
++              } else {
++                      kernel_neon_begin();
++                      chacha_doneon(state, walk.dst.virt.addr,
++                                    walk.src.virt.addr, nbytes, ctx->nrounds);
++                      kernel_neon_end();
++              }
+               err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
+       }
+ 
+@@ -91,9 +97,6 @@ static int chacha_neon(struct skcipher_r
+       struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+       struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
+ 
+-      if (req->cryptlen <= CHACHA_BLOCK_SIZE || !crypto_simd_usable())
+-              return crypto_chacha_crypt(req);
+-
+       return chacha_neon_stream_xor(req, ctx, req->iv);
+ }
+ 
+@@ -105,14 +108,15 @@ static int xchacha_neon(struct skcipher_
+       u32 state[16];
+       u8 real_iv[16];
+ 
+-      if (req->cryptlen <= CHACHA_BLOCK_SIZE || !crypto_simd_usable())
+-              return crypto_xchacha_crypt(req);
+-
+-      crypto_chacha_init(state, ctx, req->iv);
++      chacha_init_generic(state, ctx->key, req->iv);
+ 
+-      kernel_neon_begin();
+-      hchacha_block_neon(state, subctx.key, ctx->nrounds);
+-      kernel_neon_end();
++      if (crypto_simd_usable()) {
++              kernel_neon_begin();
++              hchacha_block_neon(state, subctx.key, ctx->nrounds);
++              kernel_neon_end();
++      } else {
++              hchacha_block_generic(state, subctx.key, ctx->nrounds);
++      }
+       subctx.nrounds = ctx->nrounds;
+ 
+       memcpy(&real_iv[0], req->iv + 24, 8);
+@@ -134,7 +138,7 @@ static struct skcipher_alg algs[] = {
+               .ivsize                 = CHACHA_IV_SIZE,
+               .chunksize              = CHACHA_BLOCK_SIZE,
+               .walksize               = 5 * CHACHA_BLOCK_SIZE,
+-              .setkey                 = crypto_chacha20_setkey,
++              .setkey                 = chacha20_setkey,
+               .encrypt                = chacha_neon,
+               .decrypt                = chacha_neon,
+       }, {
+@@ -150,7 +154,7 @@ static struct skcipher_alg algs[] = {
+               .ivsize                 = XCHACHA_IV_SIZE,
+               .chunksize              = CHACHA_BLOCK_SIZE,
+               .walksize               = 5 * CHACHA_BLOCK_SIZE,
+-              .setkey                 = crypto_chacha20_setkey,
++              .setkey                 = chacha20_setkey,
+               .encrypt                = xchacha_neon,
+               .decrypt                = xchacha_neon,
+       }, {
+@@ -166,7 +170,7 @@ static struct skcipher_alg algs[] = {
+               .ivsize                 = XCHACHA_IV_SIZE,
+               .chunksize              = CHACHA_BLOCK_SIZE,
+               .walksize               = 5 * CHACHA_BLOCK_SIZE,
+-              .setkey                 = crypto_chacha12_setkey,
++              .setkey                 = chacha12_setkey,
+               .encrypt                = xchacha_neon,
+               .decrypt                = xchacha_neon,
+       }
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0006-crypto-arm64-chacha-expose-arm64-ChaCha-routine-as-l.patch b/target/linux/generic/backport-5.4/080-wireguard-0006-crypto-arm64-chacha-expose-arm64-ChaCha-routine-as-l.patch

new file mode 100644 (file)

index 0000000..69583ec
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0006-crypto-arm64-chacha-expose-arm64-ChaCha-routine-as-l.patch
@@ -0,0 +1,138 @@
+From 8b3fda990212ced164ec776a3ba0acedae022614 Mon Sep 17 00:00:00 2001
+From: Ard Biesheuvel <ardb@kernel.org>
+Date: Fri, 8 Nov 2019 13:22:12 +0100
+Subject: [PATCH 006/124] crypto: arm64/chacha - expose arm64 ChaCha routine as
+ library function
+
+commit b3aad5bad26a01a4bd8c49a5c5f52aec665f3b7c upstream.
+
+Expose the accelerated NEON ChaCha routine directly as a symbol
+export so that users of the ChaCha library API can use it directly.
+
+Given that calls into the library API will always go through the
+routines in this module if it is enabled, switch to static keys
+to select the optimal implementation available (which may be none
+at all, in which case we defer to the generic implementation for
+all invocations).
+
+Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
+Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ arch/arm64/crypto/Kconfig            |  1 +
+ arch/arm64/crypto/chacha-neon-glue.c | 53 ++++++++++++++++++++++------
+ 2 files changed, 43 insertions(+), 11 deletions(-)
+
+--- a/arch/arm64/crypto/Kconfig
++++ b/arch/arm64/crypto/Kconfig
+@@ -104,6 +104,7 @@ config CRYPTO_CHACHA20_NEON
+       depends on KERNEL_MODE_NEON
+       select CRYPTO_BLKCIPHER
+       select CRYPTO_LIB_CHACHA_GENERIC
++      select CRYPTO_ARCH_HAVE_LIB_CHACHA
+ 
+ config CRYPTO_NHPOLY1305_NEON
+       tristate "NHPoly1305 hash function using NEON instructions (for Adiantum)"
+--- a/arch/arm64/crypto/chacha-neon-glue.c
++++ b/arch/arm64/crypto/chacha-neon-glue.c
+@@ -23,6 +23,7 @@
+ #include <crypto/internal/chacha.h>
+ #include <crypto/internal/simd.h>
+ #include <crypto/internal/skcipher.h>
++#include <linux/jump_label.h>
+ #include <linux/kernel.h>
+ #include <linux/module.h>
+ 
+@@ -36,6 +37,8 @@ asmlinkage void chacha_4block_xor_neon(u
+                                      int nrounds, int bytes);
+ asmlinkage void hchacha_block_neon(const u32 *state, u32 *out, int nrounds);
+ 
++static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
++
+ static void chacha_doneon(u32 *state, u8 *dst, const u8 *src,
+                         int bytes, int nrounds)
+ {
+@@ -59,6 +62,37 @@ static void chacha_doneon(u32 *state, u8
+       }
+ }
+ 
++void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds)
++{
++      if (!static_branch_likely(&have_neon) || !crypto_simd_usable()) {
++              hchacha_block_generic(state, stream, nrounds);
++      } else {
++              kernel_neon_begin();
++              hchacha_block_neon(state, stream, nrounds);
++              kernel_neon_end();
++      }
++}
++EXPORT_SYMBOL(hchacha_block_arch);
++
++void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv)
++{
++      chacha_init_generic(state, key, iv);
++}
++EXPORT_SYMBOL(chacha_init_arch);
++
++void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes,
++                     int nrounds)
++{
++      if (!static_branch_likely(&have_neon) || bytes <= CHACHA_BLOCK_SIZE ||
++          !crypto_simd_usable())
++              return chacha_crypt_generic(state, dst, src, bytes, nrounds);
++
++      kernel_neon_begin();
++      chacha_doneon(state, dst, src, bytes, nrounds);
++      kernel_neon_end();
++}
++EXPORT_SYMBOL(chacha_crypt_arch);
++
+ static int chacha_neon_stream_xor(struct skcipher_request *req,
+                                 const struct chacha_ctx *ctx, const u8 *iv)
+ {
+@@ -76,7 +110,8 @@ static int chacha_neon_stream_xor(struct
+               if (nbytes < walk.total)
+                       nbytes = rounddown(nbytes, walk.stride);
+ 
+-              if (!crypto_simd_usable()) {
++              if (!static_branch_likely(&have_neon) ||
++                  !crypto_simd_usable()) {
+                       chacha_crypt_generic(state, walk.dst.virt.addr,
+                                            walk.src.virt.addr, nbytes,
+                                            ctx->nrounds);
+@@ -109,14 +144,7 @@ static int xchacha_neon(struct skcipher_
+       u8 real_iv[16];
+ 
+       chacha_init_generic(state, ctx->key, req->iv);
+-
+-      if (crypto_simd_usable()) {
+-              kernel_neon_begin();
+-              hchacha_block_neon(state, subctx.key, ctx->nrounds);
+-              kernel_neon_end();
+-      } else {
+-              hchacha_block_generic(state, subctx.key, ctx->nrounds);
+-      }
++      hchacha_block_arch(state, subctx.key, ctx->nrounds);
+       subctx.nrounds = ctx->nrounds;
+ 
+       memcpy(&real_iv[0], req->iv + 24, 8);
+@@ -179,14 +207,17 @@ static struct skcipher_alg algs[] = {
+ static int __init chacha_simd_mod_init(void)
+ {
+       if (!cpu_have_named_feature(ASIMD))
+-              return -ENODEV;
++              return 0;
++
++      static_branch_enable(&have_neon);
+ 
+       return crypto_register_skciphers(algs, ARRAY_SIZE(algs));
+ }
+ 
+ static void __exit chacha_simd_mod_fini(void)
+ {
+-      crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
++      if (cpu_have_named_feature(ASIMD))
++              crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
+ }
+ 
+ module_init(chacha_simd_mod_init);
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0007-crypto-arm-chacha-import-Eric-Biggers-s-scalar-accel.patch b/target/linux/generic/backport-5.4/080-wireguard-0007-crypto-arm-chacha-import-Eric-Biggers-s-scalar-accel.patch

new file mode 100644 (file)

index 0000000..bf3ce3e
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0007-crypto-arm-chacha-import-Eric-Biggers-s-scalar-accel.patch
@@ -0,0 +1,480 @@
+From 140ec1877054d2fe67538541b94b4967c0219ff4 Mon Sep 17 00:00:00 2001
+From: Ard Biesheuvel <ardb@kernel.org>
+Date: Fri, 8 Nov 2019 13:22:13 +0100
+Subject: [PATCH 007/124] crypto: arm/chacha - import Eric Biggers's scalar
+ accelerated ChaCha code
+
+commit 29621d099f9c642b22a69dc8e7e20c108473a392 upstream.
+
+Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
+Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ arch/arm/crypto/chacha-scalar-core.S | 461 +++++++++++++++++++++++++++
+ 1 file changed, 461 insertions(+)
+ create mode 100644 arch/arm/crypto/chacha-scalar-core.S
+
+--- /dev/null
++++ b/arch/arm/crypto/chacha-scalar-core.S
+@@ -0,0 +1,461 @@
++/* SPDX-License-Identifier: GPL-2.0 */
++/*
++ * Copyright (C) 2018 Google, Inc.
++ */
++
++#include <linux/linkage.h>
++#include <asm/assembler.h>
++
++/*
++ * Design notes:
++ *
++ * 16 registers would be needed to hold the state matrix, but only 14 are
++ * available because 'sp' and 'pc' cannot be used.  So we spill the elements
++ * (x8, x9) to the stack and swap them out with (x10, x11).  This adds one
++ * 'ldrd' and one 'strd' instruction per round.
++ *
++ * All rotates are performed using the implicit rotate operand accepted by the
++ * 'add' and 'eor' instructions.  This is faster than using explicit rotate
++ * instructions.  To make this work, we allow the values in the second and last
++ * rows of the ChaCha state matrix (rows 'b' and 'd') to temporarily have the
++ * wrong rotation amount.  The rotation amount is then fixed up just in time
++ * when the values are used.  'brot' is the number of bits the values in row 'b'
++ * need to be rotated right to arrive at the correct values, and 'drot'
++ * similarly for row 'd'.  (brot, drot) start out as (0, 0) but we make it such
++ * that they end up as (25, 24) after every round.
++ */
++
++      // ChaCha state registers
++      X0      .req    r0
++      X1      .req    r1
++      X2      .req    r2
++      X3      .req    r3
++      X4      .req    r4
++      X5      .req    r5
++      X6      .req    r6
++      X7      .req    r7
++      X8_X10  .req    r8      // shared by x8 and x10
++      X9_X11  .req    r9      // shared by x9 and x11
++      X12     .req    r10
++      X13     .req    r11
++      X14     .req    r12
++      X15     .req    r14
++
++.Lexpand_32byte_k:
++      // "expand 32-byte k"
++      .word   0x61707865, 0x3320646e, 0x79622d32, 0x6b206574
++
++#ifdef __thumb2__
++#  define adrl adr
++#endif
++
++.macro __rev          out, in,  t0, t1, t2     // out = 'in' with its 4 bytes reversed; t0-t2 are scratch for the pre-ARMv6 fallback only
++.if __LINUX_ARM_ARCH__ >= 6
++      rev             \out, \in
++.else
++      lsl             \t0, \in, #24           // t0 = lowest byte moved to the top byte
++      and             \t1, \in, #0xff00       // t1 = byte 1, still in place
++      and             \t2, \in, #0xff0000     // t2 = byte 2, still in place
++      orr             \out, \t0, \in, lsr #24 // last read of 'in', so out may alias in
++      orr             \out, \out, \t1, lsl #8 // byte 1 -> byte 2
++      orr             \out, \out, \t2, lsr #8 // byte 2 -> byte 1
++.endif
++.endm
++
++.macro _le32_bswap    x,  t0, t1, t2           // byte-swap x in place when __ARMEB__ is defined; expands to nothing otherwise
++#ifdef __ARMEB__
++      __rev           \x, \x,  \t0, \t1, \t2
++#endif
++.endm
++
++.macro _le32_bswap_4x a, b, c, d,  t0, t1, t2  // apply _le32_bswap to a, b, c, d in turn, reusing scratch t0-t2
++      _le32_bswap     \a,  \t0, \t1, \t2
++      _le32_bswap     \b,  \t0, \t1, \t2
++      _le32_bswap     \c,  \t0, \t1, \t2
++      _le32_bswap     \d,  \t0, \t1, \t2
++.endm
++
++.macro __ldrd         a, b, src, offset        // load a from [src + offset] and b from [src + offset + 4]
++#if __LINUX_ARM_ARCH__ >= 6
++      ldrd            \a, \b, [\src, #\offset]        // single doubleword load on ARMv6+
++#else
++      ldr             \a, [\src, #\offset]            // otherwise two word loads
++      ldr             \b, [\src, #\offset + 4]
++#endif
++.endm
++
++.macro __strd         a, b, dst, offset        // store a to [dst + offset] and b to [dst + offset + 4]
++#if __LINUX_ARM_ARCH__ >= 6
++      strd            \a, \b, [\dst, #\offset]        // single doubleword store on ARMv6+
++#else
++      str             \a, [\dst, #\offset]            // otherwise two word stores
++      str             \b, [\dst, #\offset + 4]
++#endif
++.endm
++
++.macro _halfround     a1, b1, c1, d1,  a2, b2, c2, d2  // two ChaCha quarter-rounds, interleaved instruction by instruction
++
++      // a += b; d ^= a; d = rol(d, 16);
++      add             \a1, \a1, \b1, ror #brot        // incoming b still needs 'ror #brot'
++      add             \a2, \a2, \b2, ror #brot
++      eor             \d1, \a1, \d1, ror #drot        // incoming d still needs 'ror #drot'
++      eor             \d2, \a2, \d2, ror #drot
++      // d now needs a pending rotate right by 32 - 16 == 16
++
++      // c += d; b ^= c; b = rol(b, 12);
++      add             \c1, \c1, \d1, ror #16
++      add             \c2, \c2, \d2, ror #16
++      eor             \b1, \c1, \b1, ror #brot
++      eor             \b2, \c2, \b2, ror #brot
++      // b now needs a pending rotate right by 32 - 12 == 20
++
++      // a += b; d ^= a; d = rol(d, 8);
++      add             \a1, \a1, \b1, ror #20
++      add             \a2, \a2, \b2, ror #20
++      eor             \d1, \a1, \d1, ror #16
++      eor             \d2, \a2, \d2, ror #16
++      // d now needs a pending rotate right by 32 - 8 == 24
++
++      // c += d; b ^= c; b = rol(b, 7);
++      add             \c1, \c1, \d1, ror #24
++      add             \c2, \c2, \d2, ror #24
++      eor             \b1, \c1, \b1, ror #20
++      eor             \b2, \c2, \b2, ror #20
++      // b now needs a pending rotate right by 32 - 7 == 25; the caller updates brot/drot
++.endm
++
++.macro _doubleround                            // one column round followed by one diagonal round
++
++      // column round
++
++      // quarterrounds: (x0, x4, x8, x12) and (x1, x5, x9, x13)
++      _halfround      X0, X4, X8_X10, X12,  X1, X5, X9_X11, X13
++
++      // save (x8, x9); restore (x10, x11)
++      __strd          X8_X10, X9_X11, sp, 0   // sp+0: spill slot for (x8, x9)
++      __ldrd          X8_X10, X9_X11, sp, 8   // sp+8: spill slot for (x10, x11)
++
++      // quarterrounds: (x2, x6, x10, x14) and (x3, x7, x11, x15)
++      _halfround      X2, X6, X8_X10, X14,  X3, X7, X9_X11, X15
++
++      .set brot, 25                           // pending rotations left by the two _halfround calls above
++      .set drot, 24
++
++      // diagonal round
++
++      // quarterrounds: (x0, x5, x10, x15) and (x1, x6, x11, x12)
++      _halfround      X0, X5, X8_X10, X15,  X1, X6, X9_X11, X12
++
++      // save (x10, x11); restore (x8, x9)
++      __strd          X8_X10, X9_X11, sp, 8
++      __ldrd          X8_X10, X9_X11, sp, 0
++
++      // quarterrounds: (x2, x7, x8, x13) and (x3, x4, x9, x14)
++      _halfround      X2, X7, X8_X10, X13,  X3, X4, X9_X11, X14
++.endm
++
++.macro _chacha_permute        nrounds          // emit nrounds/2 unrolled double rounds
++      .set brot, 0                            // rows b and d start with no pending rotation
++      .set drot, 0
++      .rept \nrounds / 2                      // unrolled at assembly time, no runtime loop
++       _doubleround
++      .endr
++.endm
++
++.macro _chacha                nrounds          // per-block loop: permute, add original state, XOR with IN into OUT, until LEN is used up
++
++.Lnext_block\@:
++      // Stack: unused0-unused1 x10-x11 x0-x15 OUT IN LEN
++      // Registers contain x0-x9,x12-x15.
++
++      // Do the core ChaCha permutation to update x0-x15.
++      _chacha_permute \nrounds
++
++      add             sp, #8                  // drop unused0-unused1
++      // Stack: x10-x11 orig_x0-orig_x15 OUT IN LEN
++      // Registers contain x0-x9,x12-x15.
++      // x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
++
++      // Free up some registers (r8-r12,r14) by pushing (x8-x9,x12-x15).
++      push            {X8_X10, X9_X11, X12, X13, X14, X15}
++
++      // Load (OUT, IN, LEN).
++      ldr             r14, [sp, #96]          // OUT
++      ldr             r12, [sp, #100]         // IN
++      ldr             r11, [sp, #104]         // LEN
++
++      orr             r10, r14, r12           // r10 = OUT | IN, for the alignment test below
++
++      // Use slow path if fewer than 64 bytes remain.
++      cmp             r11, #64
++      blt             .Lxor_slowpath\@        // NOTE(review): signed condition; LEN is declared size_t in the chacha20_arm prototype
++
++      // Use slow path if IN and/or OUT isn't 4-byte aligned.  Needed even on
++      // ARMv6+, since ldmia and stmia (used below) still require alignment.
++      tst             r10, #3
++      bne             .Lxor_slowpath\@
++
++      // Fast path: XOR 64 bytes of aligned data.
++
++      // Stack: x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
++      // Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is OUT.
++      // x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
++
++      // x0-x3
++      __ldrd          r8, r9, sp, 32          // orig_x0, orig_x1
++      __ldrd          r10, r11, sp, 40        // orig_x2, orig_x3
++      add             X0, X0, r8
++      add             X1, X1, r9
++      add             X2, X2, r10
++      add             X3, X3, r11
++      _le32_bswap_4x  X0, X1, X2, X3,  r8, r9, r10
++      ldmia           r12!, {r8-r11}          // next 16 bytes of IN
++      eor             X0, X0, r8
++      eor             X1, X1, r9
++      eor             X2, X2, r10
++      eor             X3, X3, r11
++      stmia           r14!, {X0-X3}           // 16 bytes to OUT
++
++      // x4-x7
++      __ldrd          r8, r9, sp, 48
++      __ldrd          r10, r11, sp, 56
++      add             X4, r8, X4, ror #brot   // apply the pending rotation while adding orig_x4
++      add             X5, r9, X5, ror #brot
++      ldmia           r12!, {X0-X3}           // r0-r3 are free now; reuse them for IN data
++      add             X6, r10, X6, ror #brot
++      add             X7, r11, X7, ror #brot
++      _le32_bswap_4x  X4, X5, X6, X7,  r8, r9, r10
++      eor             X4, X4, X0
++      eor             X5, X5, X1
++      eor             X6, X6, X2
++      eor             X7, X7, X3
++      stmia           r14!, {X4-X7}
++
++      // x8-x15
++      pop             {r0-r7}                 // (x8-x9,x12-x15,x10-x11)
++      __ldrd          r8, r9, sp, 32
++      __ldrd          r10, r11, sp, 40
++      add             r0, r0, r8              // x8
++      add             r1, r1, r9              // x9
++      add             r6, r6, r10             // x10
++      add             r7, r7, r11             // x11
++      _le32_bswap_4x  r0, r1, r6, r7,  r8, r9, r10
++      ldmia           r12!, {r8-r11}
++      eor             r0, r0, r8              // x8
++      eor             r1, r1, r9              // x9
++      eor             r6, r6, r10             // x10
++      eor             r7, r7, r11             // x11
++      stmia           r14!, {r0,r1,r6,r7}
++      ldmia           r12!, {r0,r1,r6,r7}
++      __ldrd          r8, r9, sp, 48          // r8 = orig_x12 (block counter), kept live for the increment below
++      __ldrd          r10, r11, sp, 56
++      add             r2, r8, r2, ror #drot   // x12
++      add             r3, r9, r3, ror #drot   // x13
++      add             r4, r10, r4, ror #drot  // x14
++      add             r5, r11, r5, ror #drot  // x15
++      _le32_bswap_4x  r2, r3, r4, r5,  r9, r10, r11
++        ldr           r9, [sp, #72]           // load LEN
++      eor             r2, r2, r0              // x12
++      eor             r3, r3, r1              // x13
++      eor             r4, r4, r6              // x14
++      eor             r5, r5, r7              // x15
++        subs          r9, #64                 // decrement and check LEN
++      stmia           r14!, {r2-r5}           // stmia leaves the flags from subs intact
++
++      beq             .Ldone\@
++
++.Lprepare_for_next_block\@:
++
++      // Stack: x0-x15 OUT IN LEN
++
++      // Increment block counter (x12)
++      add             r8, #1
++
++      // Store updated (OUT, IN, LEN)
++      str             r14, [sp, #64]
++      str             r12, [sp, #68]
++      str             r9, [sp, #72]
++
++        mov           r14, sp                 // r14 = &x0 for the reload below
++
++      // Store updated block counter (x12)
++      str             r8, [sp, #48]
++
++        sub           sp, #16                 // room for unused0-unused1 and the x10-x11 copy
++
++      // Reload state and do next block
++      ldmia           r14!, {r0-r11}          // load x0-x11
++      __strd          r10, r11, sp, 8         // store x10-x11 before state
++      ldmia           r14, {r10-r12,r14}      // load x12-x15
++      b               .Lnext_block\@
++
++.Lxor_slowpath\@:
++      // Slow path: < 64 bytes remaining, or unaligned input or output buffer.
++      // We handle it by storing the 64 bytes of keystream to the stack, then
++      // XOR-ing the needed portion with the data.
++
++      // Allocate keystream buffer
++      sub             sp, #64
++      mov             r14, sp
++
++      // Stack: ks0-ks15 x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
++      // Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is &ks0.
++      // x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
++
++      // Save keystream for x0-x3
++      __ldrd          r8, r9, sp, 96
++      __ldrd          r10, r11, sp, 104
++      add             X0, X0, r8
++      add             X1, X1, r9
++      add             X2, X2, r10
++      add             X3, X3, r11
++      _le32_bswap_4x  X0, X1, X2, X3,  r8, r9, r10
++      stmia           r14!, {X0-X3}
++
++      // Save keystream for x4-x7
++      __ldrd          r8, r9, sp, 112
++      __ldrd          r10, r11, sp, 120
++      add             X4, r8, X4, ror #brot
++      add             X5, r9, X5, ror #brot
++      add             X6, r10, X6, ror #brot
++      add             X7, r11, X7, ror #brot
++      _le32_bswap_4x  X4, X5, X6, X7,  r8, r9, r10
++        add           r8, sp, #64             // r8 = address of the saved (x8-x9,x12-x15,x10-x11)
++      stmia           r14!, {X4-X7}
++
++      // Save keystream for x8-x15
++      ldm             r8, {r0-r7}             // (x8-x9,x12-x15,x10-x11)
++      __ldrd          r8, r9, sp, 128
++      __ldrd          r10, r11, sp, 136
++      add             r0, r0, r8              // x8
++      add             r1, r1, r9              // x9
++      add             r6, r6, r10             // x10
++      add             r7, r7, r11             // x11
++      _le32_bswap_4x  r0, r1, r6, r7,  r8, r9, r10
++      stmia           r14!, {r0,r1,r6,r7}
++      __ldrd          r8, r9, sp, 144         // r8 = orig_x12 (block counter), kept live for the next block
++      __ldrd          r10, r11, sp, 152
++      add             r2, r8, r2, ror #drot   // x12
++      add             r3, r9, r3, ror #drot   // x13
++      add             r4, r10, r4, ror #drot  // x14
++      add             r5, r11, r5, ror #drot  // x15
++      _le32_bswap_4x  r2, r3, r4, r5,  r9, r10, r11
++      stmia           r14, {r2-r5}
++
++      // Stack: ks0-ks15 unused0-unused7 x0-x15 OUT IN LEN
++      // Registers: r8 is block counter, r12 is IN.
++
++      ldr             r9, [sp, #168]          // LEN
++      ldr             r14, [sp, #160]         // OUT
++      cmp             r9, #64
++        mov           r0, sp                  // r0 = keystream read pointer (&ks0)
++      movle           r1, r9                  // r1 = min(LEN, 64), using signed conditions
++      movgt           r1, #64
++      // r1 is number of bytes to XOR, in range [1, 64]
++
++.if __LINUX_ARM_ARCH__ < 6
++      orr             r2, r12, r14
++      tst             r2, #3                  // IN or OUT misaligned?
++      bne             .Lxor_next_byte\@
++.endif
++
++      // XOR a word at a time
++.rept 16                                       // at most 16 words == 64 bytes, unrolled
++      subs            r1, #4
++      blt             .Lxor_words_done\@      // fewer than 4 bytes left
++      ldr             r2, [r12], #4
++      ldr             r3, [r0], #4
++      eor             r2, r2, r3
++      str             r2, [r14], #4
++.endr
++      b               .Lxor_slowpath_done\@
++.Lxor_words_done\@:
++      ands            r1, r1, #3              // r1 was (remaining - 4); the low two bits give the 0-3 leftover bytes
++      beq             .Lxor_slowpath_done\@
++
++      // XOR a byte at a time
++.Lxor_next_byte\@:
++      ldrb            r2, [r12], #1
++      ldrb            r3, [r0], #1
++      eor             r2, r2, r3
++      strb            r2, [r14], #1
++      subs            r1, #1
++      bne             .Lxor_next_byte\@
++
++.Lxor_slowpath_done\@:
++      subs            r9, #64
++      add             sp, #96                 // drop keystream (64) + saved words (32); 'add' keeps the flags from subs
++      bgt             .Lprepare_for_next_block\@
++
++.Ldone\@:
++.endm // _chacha
++
++/*
++ * void chacha20_arm(u8 *out, const u8 *in, size_t len, const u32 key[8],
++ *                 const u32 iv[4]);
++ */
++ENTRY(chacha20_arm)                            // builds the 16-word state on the stack, then runs _chacha with 20 rounds
++      cmp             r2, #0                  // len == 0?
++      reteq           lr
++
++      push            {r0-r2,r4-r11,lr}       // 12 words: (OUT, IN, LEN), r4-r11, lr
++
++      // Push state x0-x15 onto stack.
++      // Also store an extra copy of x10-x11 just before the state.
++
++      ldr             r4, [sp, #48]           // iv: first word above the 12 pushed words
++      mov             r0, sp                  // r0 = end of the state area; filled downwards by stmdb
++      sub             sp, #80                 // 64-byte state + 16 bytes (unused0-unused1, x10-x11 copy)
++
++      // iv: x12-x15
++      ldm             r4, {X12,X13,X14,X15}
++      stmdb           r0!, {X12,X13,X14,X15}
++
++      // key: x4-x11
++      __ldrd          X8_X10, X9_X11, r3, 24  // key words 6-7 == x10-x11
++      __strd          X8_X10, X9_X11, sp, 8   // extra copy of x10-x11 before the state
++      stmdb           r0!, {X8_X10, X9_X11}
++      ldm             r3, {X4-X9_X11}         // key words 0-5 == x4-x9
++      stmdb           r0!, {X4-X9_X11}
++
++      // constants: x0-x3
++      adrl            X3, .Lexpand_32byte_k
++      ldm             X3, {X0-X3}
++      __strd          X0, X1, sp, 16
++      __strd          X2, X3, sp, 24
++
++      _chacha         20
++
++      add             sp, #76                 // drop x0-x15 (64) and saved OUT/IN/LEN (12)
++      pop             {r4-r11, pc}
++ENDPROC(chacha20_arm)
++
++/*
++ * void hchacha20_arm(const u32 state[16], u32 out[8]);
++ */
++ENTRY(hchacha20_arm)                           // runs the 20-round permutation on 'state', writes x0-x3 and x12-x15 to 'out'
++      push            {r1,r4-r11,lr}          // r1 ('out') is saved first so it can be popped into r4 later
++
++      mov             r14, r0
++      ldmia           r14!, {r0-r11}          // load x0-x11
++      push            {r10-r11}               // store x10-x11 to stack
++      ldm             r14, {r10-r12,r14}      // load x12-x15
++      sub             sp, #8                  // (x8, x9) spill slot used by _doubleround
++
++      _chacha_permute 20
++
++      // Skip over (unused0-unused1, x10-x11)
++      add             sp, #16
++
++      // Fix up rotations of x12-x15
++      ror             X12, X12, #drot
++      ror             X13, X13, #drot
++        pop           {r4}                    // load 'out'
++      ror             X14, X14, #drot
++      ror             X15, X15, #drot
++
++      // Store (x0-x3,x12-x15) to 'out'
++      stm             r4, {X0,X1,X2,X3,X12,X13,X14,X15}
++
++      pop             {r4-r11,pc}
++ENDPROC(hchacha20_arm)
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0008-crypto-arm-chacha-remove-dependency-on-generic-ChaCh.patch b/target/linux/generic/backport-5.4/080-wireguard-0008-crypto-arm-chacha-remove-dependency-on-generic-ChaCh.patch

new file mode 100644 (file)

index 0000000..7f907f2
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0008-crypto-arm-chacha-remove-dependency-on-generic-ChaCh.patch
@@ -0,0 +1,691 @@
+From a92bd97c758d32511f0deeef84f25c3a1d5e7879 Mon Sep 17 00:00:00 2001
+From: Ard Biesheuvel <ardb@kernel.org>
+Date: Fri, 8 Nov 2019 13:22:14 +0100
+Subject: [PATCH 008/124] crypto: arm/chacha - remove dependency on generic
+ ChaCha driver
+
+commit b36d8c09e710c71f6a9690b6586fea2d1c9e1e27 upstream.
+
+Instead of falling back to the generic ChaCha skcipher driver for
+non-SIMD cases, use a fast scalar implementation for ARM authored
+by Eric Biggers. This removes the module dependency on chacha-generic
+altogether, which also simplifies things when we expose the ChaCha
+library interface from this module.
+
+Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
+Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ arch/arm/crypto/Kconfig              |   4 +-
+ arch/arm/crypto/Makefile             |   3 +-
+ arch/arm/crypto/chacha-glue.c        | 304 +++++++++++++++++++++++++++
+ arch/arm/crypto/chacha-neon-glue.c   | 202 ------------------
+ arch/arm/crypto/chacha-scalar-core.S |  65 +++---
+ arch/arm64/crypto/chacha-neon-glue.c |   2 +-
+ 6 files changed, 340 insertions(+), 240 deletions(-)
+ create mode 100644 arch/arm/crypto/chacha-glue.c
+ delete mode 100644 arch/arm/crypto/chacha-neon-glue.c
+
+--- a/arch/arm/crypto/Kconfig
++++ b/arch/arm/crypto/Kconfig
+@@ -127,10 +127,8 @@ config CRYPTO_CRC32_ARM_CE
+       select CRYPTO_HASH
+ 
+ config CRYPTO_CHACHA20_NEON
+-      tristate "NEON accelerated ChaCha stream cipher algorithms"
+-      depends on KERNEL_MODE_NEON
++      tristate "NEON and scalar accelerated ChaCha stream cipher algorithms"
+       select CRYPTO_BLKCIPHER
+-      select CRYPTO_CHACHA20
+ 
+ config CRYPTO_NHPOLY1305_NEON
+       tristate "NEON accelerated NHPoly1305 hash function (for Adiantum)"
+--- a/arch/arm/crypto/Makefile
++++ b/arch/arm/crypto/Makefile
+@@ -53,7 +53,8 @@ aes-arm-ce-y := aes-ce-core.o aes-ce-glu
+ ghash-arm-ce-y        := ghash-ce-core.o ghash-ce-glue.o
+ crct10dif-arm-ce-y    := crct10dif-ce-core.o crct10dif-ce-glue.o
+ crc32-arm-ce-y:= crc32-ce-core.o crc32-ce-glue.o
+-chacha-neon-y := chacha-neon-core.o chacha-neon-glue.o
++chacha-neon-y := chacha-scalar-core.o chacha-glue.o
++chacha-neon-$(CONFIG_KERNEL_MODE_NEON) += chacha-neon-core.o
+ nhpoly1305-neon-y := nh-neon-core.o nhpoly1305-neon-glue.o
+ 
+ ifdef REGENERATE_ARM_CRYPTO
+--- /dev/null
++++ b/arch/arm/crypto/chacha-glue.c
+@@ -0,0 +1,304 @@
++// SPDX-License-Identifier: GPL-2.0
++/*
++ * ARM NEON accelerated ChaCha and XChaCha stream ciphers,
++ * including ChaCha20 (RFC7539)
++ *
++ * Copyright (C) 2016-2019 Linaro, Ltd. <ard.biesheuvel@linaro.org>
++ * Copyright (C) 2015 Martin Willi
++ */
++
++#include <crypto/algapi.h>
++#include <crypto/internal/chacha.h>
++#include <crypto/internal/simd.h>
++#include <crypto/internal/skcipher.h>
++#include <linux/kernel.h>
++#include <linux/module.h>
++
++#include <asm/cputype.h>
++#include <asm/hwcap.h>
++#include <asm/neon.h>
++#include <asm/simd.h>
++
++asmlinkage void chacha_block_xor_neon(const u32 *state, u8 *dst, const u8 *src,
++                                    int nrounds);
++asmlinkage void chacha_4block_xor_neon(const u32 *state, u8 *dst, const u8 *src,
++                                     int nrounds);
++asmlinkage void hchacha_block_arm(const u32 *state, u32 *out, int nrounds);
++asmlinkage void hchacha_block_neon(const u32 *state, u32 *out, int nrounds);
++
++asmlinkage void chacha_doarm(u8 *dst, const u8 *src, unsigned int bytes,
++                           const u32 *state, int nrounds);
++
++static inline bool neon_usable(void)    /* thin wrapper around crypto_simd_usable() */
++{
++      return crypto_simd_usable();
++}
++
++static void chacha_doneon(u32 *state, u8 *dst, const u8 *src,
++                        unsigned int bytes, int nrounds)     /* XOR 'bytes' bytes from src into dst via the NEON routines */
++{
++      u8 buf[CHACHA_BLOCK_SIZE];      /* bounce buffer for a trailing partial block */
++
++      while (bytes >= CHACHA_BLOCK_SIZE * 4) {        /* four blocks per call while enough data remains */
++              chacha_4block_xor_neon(state, dst, src, nrounds);
++              bytes -= CHACHA_BLOCK_SIZE * 4;
++              src += CHACHA_BLOCK_SIZE * 4;
++              dst += CHACHA_BLOCK_SIZE * 4;
++              state[12] += 4;         /* advance state[12] by the blocks just consumed */
++      }
++      while (bytes >= CHACHA_BLOCK_SIZE) {            /* then one block at a time */
++              chacha_block_xor_neon(state, dst, src, nrounds);
++              bytes -= CHACHA_BLOCK_SIZE;
++              src += CHACHA_BLOCK_SIZE;
++              dst += CHACHA_BLOCK_SIZE;
++              state[12]++;
++      }
++      if (bytes) {                    /* partial tail: process a full block in buf, copy back only 'bytes' */
++              memcpy(buf, src, bytes);
++              chacha_block_xor_neon(state, buf, buf, nrounds);
++              memcpy(dst, buf, bytes);        /* state[12] is not advanced for this partial block */
++      }
++}
++
++static int chacha_stream_xor(struct skcipher_request *req,
++                           const struct chacha_ctx *ctx, const u8 *iv,
++                           bool neon)         /* walk the request, XOR-ing each chunk via the NEON (neon == true) or scalar path */
++{
++      struct skcipher_walk walk;
++      u32 state[16];
++      int err;
++
++      err = skcipher_walk_virt(&walk, req, false);
++
++      chacha_init_generic(state, ctx->key, iv);
++
++      while (walk.nbytes > 0) {
++              unsigned int nbytes = walk.nbytes;
++
++              if (nbytes < walk.total)        /* more data follows: only take whole strides now */
++                      nbytes = round_down(nbytes, walk.stride);
++
++              if (!neon) {
++                      chacha_doarm(walk.dst.virt.addr, walk.src.virt.addr,
++                                   nbytes, state, ctx->nrounds);
++                      state[12] += DIV_ROUND_UP(nbytes, CHACHA_BLOCK_SIZE);   /* chacha_doarm takes state as const; advance state[12] here */
++              } else {
++                      kernel_neon_begin();    /* NEON calls are bracketed by kernel_neon_begin/end */
++                      chacha_doneon(state, walk.dst.virt.addr,
++                                    walk.src.virt.addr, nbytes, ctx->nrounds);
++                      kernel_neon_end();
++              }
++              err = skcipher_walk_done(&walk, walk.nbytes - nbytes);  /* second argument: bytes of this step left unprocessed */
++      }
++
++      return err;
++}
++
++static int do_chacha(struct skcipher_request *req, bool neon)  /* common ChaCha handler; 'neon' picks the code path */
++{
++      struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
++      struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
++
++      return chacha_stream_xor(req, ctx, req->iv, neon);      /* key comes from the tfm context, IV from the request */
++}
++
++static int chacha_arm(struct skcipher_request *req)    /* encrypt/decrypt handler that always uses the scalar path */
++{
++      return do_chacha(req, false);
++}
++
++static int chacha_neon(struct skcipher_request *req)   /* uses NEON when neon_usable() says so, else the scalar path */
++{
++      return do_chacha(req, neon_usable());
++}
++
++static int do_xchacha(struct skcipher_request *req, bool neon) /* XChaCha: derive a subkey with HChaCha, then run the stream cipher with it */
++{
++      struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
++      struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
++      struct chacha_ctx subctx;
++      u32 state[16];
++      u8 real_iv[16];
++
++      chacha_init_generic(state, ctx->key, req->iv);
++
++      if (!neon) {
++              hchacha_block_arm(state, subctx.key, ctx->nrounds);     /* output goes straight into subctx.key */
++      } else {
++              kernel_neon_begin();
++              hchacha_block_neon(state, subctx.key, ctx->nrounds);
++              kernel_neon_end();
++      }
++      subctx.nrounds = ctx->nrounds;
++
++      memcpy(&real_iv[0], req->iv + 24, 8);   /* real_iv = iv[24..31] followed by iv[16..23] */
++      memcpy(&real_iv[8], req->iv + 16, 8);
++      return chacha_stream_xor(req, &subctx, real_iv, neon);
++}
++
++static int xchacha_arm(struct skcipher_request *req)   /* XChaCha handler that always uses the scalar path */
++{
++      return do_xchacha(req, false);
++}
++
++static int xchacha_neon(struct skcipher_request *req)  /* uses NEON when neon_usable() says so, else the scalar path */
++{
++      return do_xchacha(req, neon_usable());
++}
++
++static struct skcipher_alg arm_algs[] = {      /* scalar-only variants ("-arm" drivers), priority 200 */
++      {
++              .base.cra_name          = "chacha20",
++              .base.cra_driver_name   = "chacha20-arm",
++              .base.cra_priority      = 200,
++              .base.cra_blocksize     = 1,
++              .base.cra_ctxsize       = sizeof(struct chacha_ctx),
++              .base.cra_module        = THIS_MODULE,
++
++              .min_keysize            = CHACHA_KEY_SIZE,
++              .max_keysize            = CHACHA_KEY_SIZE,
++              .ivsize                 = CHACHA_IV_SIZE,
++              .chunksize              = CHACHA_BLOCK_SIZE,
++              .setkey                 = chacha20_setkey,
++              .encrypt                = chacha_arm,
++              .decrypt                = chacha_arm,   /* same handler for both directions */
++      }, {
++              .base.cra_name          = "xchacha20",
++              .base.cra_driver_name   = "xchacha20-arm",
++              .base.cra_priority      = 200,
++              .base.cra_blocksize     = 1,
++              .base.cra_ctxsize       = sizeof(struct chacha_ctx),
++              .base.cra_module        = THIS_MODULE,
++
++              .min_keysize            = CHACHA_KEY_SIZE,
++              .max_keysize            = CHACHA_KEY_SIZE,
++              .ivsize                 = XCHACHA_IV_SIZE,
++              .chunksize              = CHACHA_BLOCK_SIZE,
++              .setkey                 = chacha20_setkey,
++              .encrypt                = xchacha_arm,
++              .decrypt                = xchacha_arm,
++      }, {
++              .base.cra_name          = "xchacha12",
++              .base.cra_driver_name   = "xchacha12-arm",
++              .base.cra_priority      = 200,
++              .base.cra_blocksize     = 1,
++              .base.cra_ctxsize       = sizeof(struct chacha_ctx),
++              .base.cra_module        = THIS_MODULE,
++
++              .min_keysize            = CHACHA_KEY_SIZE,
++              .max_keysize            = CHACHA_KEY_SIZE,
++              .ivsize                 = XCHACHA_IV_SIZE,
++              .chunksize              = CHACHA_BLOCK_SIZE,
++              .setkey                 = chacha12_setkey,      /* only difference from xchacha20-arm besides the names */
++              .encrypt                = xchacha_arm,
++              .decrypt                = xchacha_arm,
++      },
++};
++
++static struct skcipher_alg neon_algs[] = {     /* NEON variants ("-neon" drivers), priority 300 unless lowered in chacha_simd_mod_init() */
++      {
++              .base.cra_name          = "chacha20",
++              .base.cra_driver_name   = "chacha20-neon",
++              .base.cra_priority      = 300,
++              .base.cra_blocksize     = 1,
++              .base.cra_ctxsize       = sizeof(struct chacha_ctx),
++              .base.cra_module        = THIS_MODULE,
++
++              .min_keysize            = CHACHA_KEY_SIZE,
++              .max_keysize            = CHACHA_KEY_SIZE,
++              .ivsize                 = CHACHA_IV_SIZE,
++              .chunksize              = CHACHA_BLOCK_SIZE,
++              .walksize               = 4 * CHACHA_BLOCK_SIZE,        /* matches the 4-block NEON routine */
++              .setkey                 = chacha20_setkey,
++              .encrypt                = chacha_neon,
++              .decrypt                = chacha_neon,
++      }, {
++              .base.cra_name          = "xchacha20",
++              .base.cra_driver_name   = "xchacha20-neon",
++              .base.cra_priority      = 300,
++              .base.cra_blocksize     = 1,
++              .base.cra_ctxsize       = sizeof(struct chacha_ctx),
++              .base.cra_module        = THIS_MODULE,
++
++              .min_keysize            = CHACHA_KEY_SIZE,
++              .max_keysize            = CHACHA_KEY_SIZE,
++              .ivsize                 = XCHACHA_IV_SIZE,
++              .chunksize              = CHACHA_BLOCK_SIZE,
++              .walksize               = 4 * CHACHA_BLOCK_SIZE,
++              .setkey                 = chacha20_setkey,
++              .encrypt                = xchacha_neon,
++              .decrypt                = xchacha_neon,
++      }, {
++              .base.cra_name          = "xchacha12",
++              .base.cra_driver_name   = "xchacha12-neon",
++              .base.cra_priority      = 300,
++              .base.cra_blocksize     = 1,
++              .base.cra_ctxsize       = sizeof(struct chacha_ctx),
++              .base.cra_module        = THIS_MODULE,
++
++              .min_keysize            = CHACHA_KEY_SIZE,
++              .max_keysize            = CHACHA_KEY_SIZE,
++              .ivsize                 = XCHACHA_IV_SIZE,
++              .chunksize              = CHACHA_BLOCK_SIZE,
++              .walksize               = 4 * CHACHA_BLOCK_SIZE,
++              .setkey                 = chacha12_setkey,
++              .encrypt                = xchacha_neon,
++              .decrypt                = xchacha_neon,
++      }
++};
++
++static int __init chacha_simd_mod_init(void)   /* register the scalar algs always, the NEON algs only when available */
++{
++      int err;
++
++      err = crypto_register_skciphers(arm_algs, ARRAY_SIZE(arm_algs));
++      if (err)
++              return err;
++
++      if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && (elf_hwcap & HWCAP_NEON)) {  /* built with kernel-mode NEON and the CPU reports NEON */
++              int i;
++
++              switch (read_cpuid_part()) {
++              case ARM_CPU_PART_CORTEX_A7:
++              case ARM_CPU_PART_CORTEX_A5:
++                      /*
++                       * The Cortex-A7 and Cortex-A5 do not perform well with
++                       * the NEON implementation but do incredibly with the
++                       * scalar one and use less power.
++                       */
++                      for (i = 0; i < ARRAY_SIZE(neon_algs); i++)
++                              neon_algs[i].base.cra_priority = 0;     /* below the scalar drivers' 200 */
++                      break;
++              }
++
++              err = crypto_register_skciphers(neon_algs, ARRAY_SIZE(neon_algs));
++              if (err)
++                      crypto_unregister_skciphers(arm_algs, ARRAY_SIZE(arm_algs));    /* roll back the scalar registration on failure */
++      }
++      return err;
++}
++
++static void __exit chacha_simd_mod_fini(void)  /* unregister what chacha_simd_mod_init() registered */
++{
++      crypto_unregister_skciphers(arm_algs, ARRAY_SIZE(arm_algs));
++      if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && (elf_hwcap & HWCAP_NEON))    /* same condition as in the init path */
++              crypto_unregister_skciphers(neon_algs, ARRAY_SIZE(neon_algs));
++}
++
++module_init(chacha_simd_mod_init);
++module_exit(chacha_simd_mod_fini);
++
++MODULE_DESCRIPTION("ChaCha and XChaCha stream ciphers (scalar and NEON accelerated)");
++MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
++MODULE_LICENSE("GPL v2");
++MODULE_ALIAS_CRYPTO("chacha20");
++MODULE_ALIAS_CRYPTO("chacha20-arm");
++MODULE_ALIAS_CRYPTO("xchacha20");
++MODULE_ALIAS_CRYPTO("xchacha20-arm");
++MODULE_ALIAS_CRYPTO("xchacha12");
++MODULE_ALIAS_CRYPTO("xchacha12-arm");
++#ifdef CONFIG_KERNEL_MODE_NEON
++MODULE_ALIAS_CRYPTO("chacha20-neon");
++MODULE_ALIAS_CRYPTO("xchacha20-neon");
++MODULE_ALIAS_CRYPTO("xchacha12-neon");
++#endif
+--- a/arch/arm/crypto/chacha-neon-glue.c
++++ /dev/null
+@@ -1,202 +0,0 @@
+-/*
+- * ARM NEON accelerated ChaCha and XChaCha stream ciphers,
+- * including ChaCha20 (RFC7539)
+- *
+- * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
+- *
+- * This program is free software; you can redistribute it and/or modify
+- * it under the terms of the GNU General Public License version 2 as
+- * published by the Free Software Foundation.
+- *
+- * Based on:
+- * ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code
+- *
+- * Copyright (C) 2015 Martin Willi
+- *
+- * This program is free software; you can redistribute it and/or modify
+- * it under the terms of the GNU General Public License as published by
+- * the Free Software Foundation; either version 2 of the License, or
+- * (at your option) any later version.
+- */
+-
+-#include <crypto/algapi.h>
+-#include <crypto/internal/chacha.h>
+-#include <crypto/internal/simd.h>
+-#include <crypto/internal/skcipher.h>
+-#include <linux/kernel.h>
+-#include <linux/module.h>
+-
+-#include <asm/hwcap.h>
+-#include <asm/neon.h>
+-#include <asm/simd.h>
+-
+-asmlinkage void chacha_block_xor_neon(const u32 *state, u8 *dst, const u8 *src,
+-                                    int nrounds);
+-asmlinkage void chacha_4block_xor_neon(const u32 *state, u8 *dst, const u8 *src,
+-                                     int nrounds);
+-asmlinkage void hchacha_block_neon(const u32 *state, u32 *out, int nrounds);
+-
+-static void chacha_doneon(u32 *state, u8 *dst, const u8 *src,
+-                        unsigned int bytes, int nrounds)
+-{
+-      u8 buf[CHACHA_BLOCK_SIZE];
+-
+-      while (bytes >= CHACHA_BLOCK_SIZE * 4) {
+-              chacha_4block_xor_neon(state, dst, src, nrounds);
+-              bytes -= CHACHA_BLOCK_SIZE * 4;
+-              src += CHACHA_BLOCK_SIZE * 4;
+-              dst += CHACHA_BLOCK_SIZE * 4;
+-              state[12] += 4;
+-      }
+-      while (bytes >= CHACHA_BLOCK_SIZE) {
+-              chacha_block_xor_neon(state, dst, src, nrounds);
+-              bytes -= CHACHA_BLOCK_SIZE;
+-              src += CHACHA_BLOCK_SIZE;
+-              dst += CHACHA_BLOCK_SIZE;
+-              state[12]++;
+-      }
+-      if (bytes) {
+-              memcpy(buf, src, bytes);
+-              chacha_block_xor_neon(state, buf, buf, nrounds);
+-              memcpy(dst, buf, bytes);
+-      }
+-}
+-
+-static int chacha_neon_stream_xor(struct skcipher_request *req,
+-                                const struct chacha_ctx *ctx, const u8 *iv)
+-{
+-      struct skcipher_walk walk;
+-      u32 state[16];
+-      int err;
+-
+-      err = skcipher_walk_virt(&walk, req, false);
+-
+-      crypto_chacha_init(state, ctx, iv);
+-
+-      while (walk.nbytes > 0) {
+-              unsigned int nbytes = walk.nbytes;
+-
+-              if (nbytes < walk.total)
+-                      nbytes = round_down(nbytes, walk.stride);
+-
+-              kernel_neon_begin();
+-              chacha_doneon(state, walk.dst.virt.addr, walk.src.virt.addr,
+-                            nbytes, ctx->nrounds);
+-              kernel_neon_end();
+-              err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
+-      }
+-
+-      return err;
+-}
+-
+-static int chacha_neon(struct skcipher_request *req)
+-{
+-      struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+-      struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
+-
+-      if (req->cryptlen <= CHACHA_BLOCK_SIZE || !crypto_simd_usable())
+-              return crypto_chacha_crypt(req);
+-
+-      return chacha_neon_stream_xor(req, ctx, req->iv);
+-}
+-
+-static int xchacha_neon(struct skcipher_request *req)
+-{
+-      struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+-      struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
+-      struct chacha_ctx subctx;
+-      u32 state[16];
+-      u8 real_iv[16];
+-
+-      if (req->cryptlen <= CHACHA_BLOCK_SIZE || !crypto_simd_usable())
+-              return crypto_xchacha_crypt(req);
+-
+-      crypto_chacha_init(state, ctx, req->iv);
+-
+-      kernel_neon_begin();
+-      hchacha_block_neon(state, subctx.key, ctx->nrounds);
+-      kernel_neon_end();
+-      subctx.nrounds = ctx->nrounds;
+-
+-      memcpy(&real_iv[0], req->iv + 24, 8);
+-      memcpy(&real_iv[8], req->iv + 16, 8);
+-      return chacha_neon_stream_xor(req, &subctx, real_iv);
+-}
+-
+-static struct skcipher_alg algs[] = {
+-      {
+-              .base.cra_name          = "chacha20",
+-              .base.cra_driver_name   = "chacha20-neon",
+-              .base.cra_priority      = 300,
+-              .base.cra_blocksize     = 1,
+-              .base.cra_ctxsize       = sizeof(struct chacha_ctx),
+-              .base.cra_module        = THIS_MODULE,
+-
+-              .min_keysize            = CHACHA_KEY_SIZE,
+-              .max_keysize            = CHACHA_KEY_SIZE,
+-              .ivsize                 = CHACHA_IV_SIZE,
+-              .chunksize              = CHACHA_BLOCK_SIZE,
+-              .walksize               = 4 * CHACHA_BLOCK_SIZE,
+-              .setkey                 = crypto_chacha20_setkey,
+-              .encrypt                = chacha_neon,
+-              .decrypt                = chacha_neon,
+-      }, {
+-              .base.cra_name          = "xchacha20",
+-              .base.cra_driver_name   = "xchacha20-neon",
+-              .base.cra_priority      = 300,
+-              .base.cra_blocksize     = 1,
+-              .base.cra_ctxsize       = sizeof(struct chacha_ctx),
+-              .base.cra_module        = THIS_MODULE,
+-
+-              .min_keysize            = CHACHA_KEY_SIZE,
+-              .max_keysize            = CHACHA_KEY_SIZE,
+-              .ivsize                 = XCHACHA_IV_SIZE,
+-              .chunksize              = CHACHA_BLOCK_SIZE,
+-              .walksize               = 4 * CHACHA_BLOCK_SIZE,
+-              .setkey                 = crypto_chacha20_setkey,
+-              .encrypt                = xchacha_neon,
+-              .decrypt                = xchacha_neon,
+-      }, {
+-              .base.cra_name          = "xchacha12",
+-              .base.cra_driver_name   = "xchacha12-neon",
+-              .base.cra_priority      = 300,
+-              .base.cra_blocksize     = 1,
+-              .base.cra_ctxsize       = sizeof(struct chacha_ctx),
+-              .base.cra_module        = THIS_MODULE,
+-
+-              .min_keysize            = CHACHA_KEY_SIZE,
+-              .max_keysize            = CHACHA_KEY_SIZE,
+-              .ivsize                 = XCHACHA_IV_SIZE,
+-              .chunksize              = CHACHA_BLOCK_SIZE,
+-              .walksize               = 4 * CHACHA_BLOCK_SIZE,
+-              .setkey                 = crypto_chacha12_setkey,
+-              .encrypt                = xchacha_neon,
+-              .decrypt                = xchacha_neon,
+-      }
+-};
+-
+-static int __init chacha_simd_mod_init(void)
+-{
+-      if (!(elf_hwcap & HWCAP_NEON))
+-              return -ENODEV;
+-
+-      return crypto_register_skciphers(algs, ARRAY_SIZE(algs));
+-}
+-
+-static void __exit chacha_simd_mod_fini(void)
+-{
+-      crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
+-}
+-
+-module_init(chacha_simd_mod_init);
+-module_exit(chacha_simd_mod_fini);
+-
+-MODULE_DESCRIPTION("ChaCha and XChaCha stream ciphers (NEON accelerated)");
+-MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
+-MODULE_LICENSE("GPL v2");
+-MODULE_ALIAS_CRYPTO("chacha20");
+-MODULE_ALIAS_CRYPTO("chacha20-neon");
+-MODULE_ALIAS_CRYPTO("xchacha20");
+-MODULE_ALIAS_CRYPTO("xchacha20-neon");
+-MODULE_ALIAS_CRYPTO("xchacha12");
+-MODULE_ALIAS_CRYPTO("xchacha12-neon");
+--- a/arch/arm/crypto/chacha-scalar-core.S
++++ b/arch/arm/crypto/chacha-scalar-core.S
+@@ -41,14 +41,6 @@
+       X14     .req    r12
+       X15     .req    r14
+ 
+-.Lexpand_32byte_k:
+-      // "expand 32-byte k"
+-      .word   0x61707865, 0x3320646e, 0x79622d32, 0x6b206574
+-
+-#ifdef __thumb2__
+-#  define adrl adr
+-#endif
+-
+ .macro __rev          out, in,  t0, t1, t2
+ .if __LINUX_ARM_ARCH__ >= 6
+       rev             \out, \in
+@@ -391,61 +383,65 @@
+ .endm // _chacha
+ 
+ /*
+- * void chacha20_arm(u8 *out, const u8 *in, size_t len, const u32 key[8],
+- *                 const u32 iv[4]);
++ * void chacha_doarm(u8 *dst, const u8 *src, unsigned int bytes,
++ *                 const u32 *state, int nrounds);
+  */
+-ENTRY(chacha20_arm)
++ENTRY(chacha_doarm)
+       cmp             r2, #0                  // len == 0?
+       reteq           lr
+ 
++      ldr             ip, [sp]
++      cmp             ip, #12
++
+       push            {r0-r2,r4-r11,lr}
+ 
+       // Push state x0-x15 onto stack.
+       // Also store an extra copy of x10-x11 just before the state.
+ 
+-      ldr             r4, [sp, #48]           // iv
+-      mov             r0, sp
+-      sub             sp, #80
+-
+-      // iv: x12-x15
+-      ldm             r4, {X12,X13,X14,X15}
+-      stmdb           r0!, {X12,X13,X14,X15}
++      add             X12, r3, #48
++      ldm             X12, {X12,X13,X14,X15}
++      push            {X12,X13,X14,X15}
++      sub             sp, sp, #64
+ 
+-      // key: x4-x11
+-      __ldrd          X8_X10, X9_X11, r3, 24
++      __ldrd          X8_X10, X9_X11, r3, 40
+       __strd          X8_X10, X9_X11, sp, 8
+-      stmdb           r0!, {X8_X10, X9_X11}
+-      ldm             r3, {X4-X9_X11}
+-      stmdb           r0!, {X4-X9_X11}
+-
+-      // constants: x0-x3
+-      adrl            X3, .Lexpand_32byte_k
+-      ldm             X3, {X0-X3}
++      __strd          X8_X10, X9_X11, sp, 56
++      ldm             r3, {X0-X9_X11}
+       __strd          X0, X1, sp, 16
+       __strd          X2, X3, sp, 24
++      __strd          X4, X5, sp, 32
++      __strd          X6, X7, sp, 40
++      __strd          X8_X10, X9_X11, sp, 48
+ 
++      beq             1f
+       _chacha         20
+ 
+-      add             sp, #76
++0:    add             sp, #76
+       pop             {r4-r11, pc}
+-ENDPROC(chacha20_arm)
++
++1:    _chacha         12
++      b               0b
++ENDPROC(chacha_doarm)
+ 
+ /*
+- * void hchacha20_arm(const u32 state[16], u32 out[8]);
++ * void hchacha_block_arm(const u32 state[16], u32 out[8], int nrounds);
+  */
+-ENTRY(hchacha20_arm)
++ENTRY(hchacha_block_arm)
+       push            {r1,r4-r11,lr}
+ 
++      cmp             r2, #12                 // ChaCha12 ?
++
+       mov             r14, r0
+       ldmia           r14!, {r0-r11}          // load x0-x11
+       push            {r10-r11}               // store x10-x11 to stack
+       ldm             r14, {r10-r12,r14}      // load x12-x15
+       sub             sp, #8
+ 
++      beq             1f
+       _chacha_permute 20
+ 
+       // Skip over (unused0-unused1, x10-x11)
+-      add             sp, #16
++0:    add             sp, #16
+ 
+       // Fix up rotations of x12-x15
+       ror             X12, X12, #drot
+@@ -458,4 +454,7 @@ ENTRY(hchacha20_arm)
+       stm             r4, {X0,X1,X2,X3,X12,X13,X14,X15}
+ 
+       pop             {r4-r11,pc}
+-ENDPROC(hchacha20_arm)
++
++1:    _chacha_permute 12
++      b               0b
++ENDPROC(hchacha_block_arm)
+--- a/arch/arm64/crypto/chacha-neon-glue.c
++++ b/arch/arm64/crypto/chacha-neon-glue.c
+@@ -1,5 +1,5 @@
+ /*
+- * ARM NEON accelerated ChaCha and XChaCha stream ciphers,
++ * ARM NEON and scalar accelerated ChaCha and XChaCha stream ciphers,
+  * including ChaCha20 (RFC7539)
+  *
+  * Copyright (C) 2016 - 2017 Linaro, Ltd. <ard.biesheuvel@linaro.org>
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0009-crypto-arm-chacha-expose-ARM-ChaCha-routine-as-libra.patch b/target/linux/generic/backport-5.4/080-wireguard-0009-crypto-arm-chacha-expose-ARM-ChaCha-routine-as-libra.patch

new file mode 100644 (file)

index 0000000..072b50b
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0009-crypto-arm-chacha-expose-ARM-ChaCha-routine-as-libra.patch
@@ -0,0 +1,108 @@
+From 360be1a8f326ec5c0d20a134e228fb96a2eb351d Mon Sep 17 00:00:00 2001
+From: Ard Biesheuvel <ardb@kernel.org>
+Date: Fri, 8 Nov 2019 13:22:15 +0100
+Subject: [PATCH 009/124] crypto: arm/chacha - expose ARM ChaCha routine as
+ library function
+
+commit a44a3430d71bad4ee56788a59fff099b291ea54c upstream.
+
+Expose the accelerated NEON ChaCha routine directly as a symbol
+export so that users of the ChaCha library API can use it directly.
+
+Given that calls into the library API will always go through the
+routines in this module if it is enabled, switch to static keys
+to select the optimal implementation available (which may be none
+at all, in which case we defer to the generic implementation for
+all invocations).
+
+Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
+Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ arch/arm/crypto/Kconfig       |  1 +
+ arch/arm/crypto/chacha-glue.c | 41 ++++++++++++++++++++++++++++++++++-
+ 2 files changed, 41 insertions(+), 1 deletion(-)
+
+--- a/arch/arm/crypto/Kconfig
++++ b/arch/arm/crypto/Kconfig
+@@ -129,6 +129,7 @@ config CRYPTO_CRC32_ARM_CE
+ config CRYPTO_CHACHA20_NEON
+       tristate "NEON and scalar accelerated ChaCha stream cipher algorithms"
+       select CRYPTO_BLKCIPHER
++      select CRYPTO_ARCH_HAVE_LIB_CHACHA
+ 
+ config CRYPTO_NHPOLY1305_NEON
+       tristate "NEON accelerated NHPoly1305 hash function (for Adiantum)"
+--- a/arch/arm/crypto/chacha-glue.c
++++ b/arch/arm/crypto/chacha-glue.c
+@@ -11,6 +11,7 @@
+ #include <crypto/internal/chacha.h>
+ #include <crypto/internal/simd.h>
+ #include <crypto/internal/skcipher.h>
++#include <linux/jump_label.h>
+ #include <linux/kernel.h>
+ #include <linux/module.h>
+ 
+@@ -29,9 +30,11 @@ asmlinkage void hchacha_block_neon(const
+ asmlinkage void chacha_doarm(u8 *dst, const u8 *src, unsigned int bytes,
+                            const u32 *state, int nrounds);
+ 
++static __ro_after_init DEFINE_STATIC_KEY_FALSE(use_neon);
++
+ static inline bool neon_usable(void)
+ {
+-      return crypto_simd_usable();
++      return static_branch_likely(&use_neon) && crypto_simd_usable();
+ }
+ 
+ static void chacha_doneon(u32 *state, u8 *dst, const u8 *src,
+@@ -60,6 +63,40 @@ static void chacha_doneon(u32 *state, u8
+       }
+ }
+ 
++void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds)
++{
++      if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon_usable()) {
++              hchacha_block_arm(state, stream, nrounds);
++      } else {
++              kernel_neon_begin();
++              hchacha_block_neon(state, stream, nrounds);
++              kernel_neon_end();
++      }
++}
++EXPORT_SYMBOL(hchacha_block_arch);
++
++void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv)
++{
++      chacha_init_generic(state, key, iv);
++}
++EXPORT_SYMBOL(chacha_init_arch);
++
++void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes,
++                     int nrounds)
++{
++      if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon_usable() ||
++          bytes <= CHACHA_BLOCK_SIZE) {
++              chacha_doarm(dst, src, bytes, state, nrounds);
++              state[12] += DIV_ROUND_UP(bytes, CHACHA_BLOCK_SIZE);
++              return;
++      }
++
++      kernel_neon_begin();
++      chacha_doneon(state, dst, src, bytes, nrounds);
++      kernel_neon_end();
++}
++EXPORT_SYMBOL(chacha_crypt_arch);
++
+ static int chacha_stream_xor(struct skcipher_request *req,
+                            const struct chacha_ctx *ctx, const u8 *iv,
+                            bool neon)
+@@ -269,6 +306,8 @@ static int __init chacha_simd_mod_init(v
+                       for (i = 0; i < ARRAY_SIZE(neon_algs); i++)
+                               neon_algs[i].base.cra_priority = 0;
+                       break;
++              default:
++                      static_branch_enable(&use_neon);
+               }
+ 
+               err = crypto_register_skciphers(neon_algs, ARRAY_SIZE(neon_algs));
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0010-crypto-mips-chacha-import-32r2-ChaCha-code-from-Zinc.patch b/target/linux/generic/backport-5.4/080-wireguard-0010-crypto-mips-chacha-import-32r2-ChaCha-code-from-Zinc.patch

new file mode 100644 (file)

index 0000000..e6fb4d9
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0010-crypto-mips-chacha-import-32r2-ChaCha-code-from-Zinc.patch
@@ -0,0 +1,452 @@
+From f9b4c68865fdb7f3327f7d82fbc82c76c8773d53 Mon Sep 17 00:00:00 2001
+From: "Jason A. Donenfeld" <Jason@zx2c4.com>
+Date: Fri, 8 Nov 2019 13:22:16 +0100
+Subject: [PATCH 010/124] crypto: mips/chacha - import 32r2 ChaCha code from
+ Zinc
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+commit 49aa7c00eddf8d8f462b0256bd82e81762d7b0c6 upstream.
+
+This imports the accelerated MIPS 32r2 ChaCha20 implementation from the
+Zinc patch set.
+
+Co-developed-by: René van Dorst <opensource@vdorst.com>
+Signed-off-by: René van Dorst <opensource@vdorst.com>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
+Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ arch/mips/crypto/chacha-core.S | 424 +++++++++++++++++++++++++++++++++
+ 1 file changed, 424 insertions(+)
+ create mode 100644 arch/mips/crypto/chacha-core.S
+
+--- /dev/null
++++ b/arch/mips/crypto/chacha-core.S
+@@ -0,0 +1,424 @@
++/* SPDX-License-Identifier: GPL-2.0 OR MIT */
++/*
++ * Copyright (C) 2016-2018 René van Dorst <opensource@vdorst.com>. All Rights Reserved.
++ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
++ */
++
++#define MASK_U32              0x3c
++#define CHACHA20_BLOCK_SIZE   64
++#define STACK_SIZE            32
++
++#define X0    $t0
++#define X1    $t1
++#define X2    $t2
++#define X3    $t3
++#define X4    $t4
++#define X5    $t5
++#define X6    $t6
++#define X7    $t7
++#define X8    $t8
++#define X9    $t9
++#define X10   $v1
++#define X11   $s6
++#define X12   $s5
++#define X13   $s4
++#define X14   $s3
++#define X15   $s2
++/* Use regs which are overwritten on exit for Tx so we don't leak clear data. */
++#define T0    $s1
++#define T1    $s0
++#define T(n)  T ## n
++#define X(n)  X ## n
++
++/* Input arguments */
++#define STATE         $a0
++#define OUT           $a1
++#define IN            $a2
++#define BYTES         $a3
++
++/* Output argument */
++/* NONCE[0] is kept in a register and not in memory.
++ * We don't want to touch original value in memory.
++ * Must be incremented every loop iteration.
++ */
++#define NONCE_0               $v0
++
++/* SAVED_X and SAVED_CA are set in the jump table.
++ * Use regs which are overwritten on exit so we don't leak clear data.
++ * They are used for handling the last bytes, which are not a multiple of 4.
++ */
++#define SAVED_X               X15
++#define SAVED_CA      $s7
++
++#define IS_UNALIGNED  $s7
++
++#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
++#define MSB 0
++#define LSB 3
++#define ROTx rotl
++#define ROTR(n) rotr n, 24
++#define       CPU_TO_LE32(n) \
++      wsbh    n; \
++      rotr    n, 16;
++#else
++#define MSB 3
++#define LSB 0
++#define ROTx rotr
++#define CPU_TO_LE32(n)
++#define ROTR(n)
++#endif
++
++#define FOR_EACH_WORD(x) \
++      x( 0); \
++      x( 1); \
++      x( 2); \
++      x( 3); \
++      x( 4); \
++      x( 5); \
++      x( 6); \
++      x( 7); \
++      x( 8); \
++      x( 9); \
++      x(10); \
++      x(11); \
++      x(12); \
++      x(13); \
++      x(14); \
++      x(15);
++
++#define FOR_EACH_WORD_REV(x) \
++      x(15); \
++      x(14); \
++      x(13); \
++      x(12); \
++      x(11); \
++      x(10); \
++      x( 9); \
++      x( 8); \
++      x( 7); \
++      x( 6); \
++      x( 5); \
++      x( 4); \
++      x( 3); \
++      x( 2); \
++      x( 1); \
++      x( 0);
++
++#define PLUS_ONE_0     1
++#define PLUS_ONE_1     2
++#define PLUS_ONE_2     3
++#define PLUS_ONE_3     4
++#define PLUS_ONE_4     5
++#define PLUS_ONE_5     6
++#define PLUS_ONE_6     7
++#define PLUS_ONE_7     8
++#define PLUS_ONE_8     9
++#define PLUS_ONE_9    10
++#define PLUS_ONE_10   11
++#define PLUS_ONE_11   12
++#define PLUS_ONE_12   13
++#define PLUS_ONE_13   14
++#define PLUS_ONE_14   15
++#define PLUS_ONE_15   16
++#define PLUS_ONE(x)   PLUS_ONE_ ## x
++#define _CONCAT3(a,b,c)       a ## b ## c
++#define CONCAT3(a,b,c)        _CONCAT3(a,b,c)
++
++#define STORE_UNALIGNED(x) \
++CONCAT3(.Lchacha20_mips_xor_unaligned_, PLUS_ONE(x), _b: ;) \
++      .if (x != 12); \
++              lw      T0, (x*4)(STATE); \
++      .endif; \
++      lwl     T1, (x*4)+MSB ## (IN); \
++      lwr     T1, (x*4)+LSB ## (IN); \
++      .if (x == 12); \
++              addu    X ## x, NONCE_0; \
++      .else; \
++              addu    X ## x, T0; \
++      .endif; \
++      CPU_TO_LE32(X ## x); \
++      xor     X ## x, T1; \
++      swl     X ## x, (x*4)+MSB ## (OUT); \
++      swr     X ## x, (x*4)+LSB ## (OUT);
++
++#define STORE_ALIGNED(x) \
++CONCAT3(.Lchacha20_mips_xor_aligned_, PLUS_ONE(x), _b: ;) \
++      .if (x != 12); \
++              lw      T0, (x*4)(STATE); \
++      .endif; \
++      lw      T1, (x*4) ## (IN); \
++      .if (x == 12); \
++              addu    X ## x, NONCE_0; \
++      .else; \
++              addu    X ## x, T0; \
++      .endif; \
++      CPU_TO_LE32(X ## x); \
++      xor     X ## x, T1; \
++      sw      X ## x, (x*4) ## (OUT);
++
++/* Jump table macro.
++ * Used for setup and handling the last bytes, which are not a multiple of 4.
++ * X15 is free to store Xn
++ * Every jumptable entry must be equal in size.
++ */
++#define JMPTBL_ALIGNED(x) \
++.Lchacha20_mips_jmptbl_aligned_ ## x: ; \
++      .set    noreorder; \
++      b       .Lchacha20_mips_xor_aligned_ ## x ## _b; \
++      .if (x == 12); \
++              addu    SAVED_X, X ## x, NONCE_0; \
++      .else; \
++              addu    SAVED_X, X ## x, SAVED_CA; \
++      .endif; \
++      .set    reorder
++
++#define JMPTBL_UNALIGNED(x) \
++.Lchacha20_mips_jmptbl_unaligned_ ## x: ; \
++      .set    noreorder; \
++      b       .Lchacha20_mips_xor_unaligned_ ## x ## _b; \
++      .if (x == 12); \
++              addu    SAVED_X, X ## x, NONCE_0; \
++      .else; \
++              addu    SAVED_X, X ## x, SAVED_CA; \
++      .endif; \
++      .set    reorder
++
++#define AXR(A, B, C, D,  K, L, M, N,  V, W, Y, Z,  S) \
++      addu    X(A), X(K); \
++      addu    X(B), X(L); \
++      addu    X(C), X(M); \
++      addu    X(D), X(N); \
++      xor     X(V), X(A); \
++      xor     X(W), X(B); \
++      xor     X(Y), X(C); \
++      xor     X(Z), X(D); \
++      rotl    X(V), S;    \
++      rotl    X(W), S;    \
++      rotl    X(Y), S;    \
++      rotl    X(Z), S;
++
++.text
++.set  reorder
++.set  noat
++.globl        chacha20_mips
++.ent  chacha20_mips
++chacha20_mips:
++      .frame  $sp, STACK_SIZE, $ra
++
++      addiu   $sp, -STACK_SIZE
++
++      /* Return early if BYTES == 0. */
++      beqz    BYTES, .Lchacha20_mips_end
++
++      lw      NONCE_0, 48(STATE)
++
++      /* Save s0-s7 */
++      sw      $s0,  0($sp)
++      sw      $s1,  4($sp)
++      sw      $s2,  8($sp)
++      sw      $s3, 12($sp)
++      sw      $s4, 16($sp)
++      sw      $s5, 20($sp)
++      sw      $s6, 24($sp)
++      sw      $s7, 28($sp)
++
++      /* Test IN or OUT is unaligned.
++       * IS_UNALIGNED = ( IN | OUT ) & 0x00000003
++       */
++      or      IS_UNALIGNED, IN, OUT
++      andi    IS_UNALIGNED, 0x3
++
++      /* Set number of rounds */
++      li      $at, 20
++
++      b       .Lchacha20_rounds_start
++
++.align 4
++.Loop_chacha20_rounds:
++      addiu   IN,  CHACHA20_BLOCK_SIZE
++      addiu   OUT, CHACHA20_BLOCK_SIZE
++      addiu   NONCE_0, 1
++
++.Lchacha20_rounds_start:
++      lw      X0,  0(STATE)
++      lw      X1,  4(STATE)
++      lw      X2,  8(STATE)
++      lw      X3,  12(STATE)
++
++      lw      X4,  16(STATE)
++      lw      X5,  20(STATE)
++      lw      X6,  24(STATE)
++      lw      X7,  28(STATE)
++      lw      X8,  32(STATE)
++      lw      X9,  36(STATE)
++      lw      X10, 40(STATE)
++      lw      X11, 44(STATE)
++
++      move    X12, NONCE_0
++      lw      X13, 52(STATE)
++      lw      X14, 56(STATE)
++      lw      X15, 60(STATE)
++
++.Loop_chacha20_xor_rounds:
++      addiu   $at, -2
++      AXR( 0, 1, 2, 3,  4, 5, 6, 7, 12,13,14,15, 16);
++      AXR( 8, 9,10,11, 12,13,14,15,  4, 5, 6, 7, 12);
++      AXR( 0, 1, 2, 3,  4, 5, 6, 7, 12,13,14,15,  8);
++      AXR( 8, 9,10,11, 12,13,14,15,  4, 5, 6, 7,  7);
++      AXR( 0, 1, 2, 3,  5, 6, 7, 4, 15,12,13,14, 16);
++      AXR(10,11, 8, 9, 15,12,13,14,  5, 6, 7, 4, 12);
++      AXR( 0, 1, 2, 3,  5, 6, 7, 4, 15,12,13,14,  8);
++      AXR(10,11, 8, 9, 15,12,13,14,  5, 6, 7, 4,  7);
++      bnez    $at, .Loop_chacha20_xor_rounds
++
++      addiu   BYTES, -(CHACHA20_BLOCK_SIZE)
++
++      /* Is data src/dst unaligned? Jump */
++      bnez    IS_UNALIGNED, .Loop_chacha20_unaligned
++
++      /* Set the number of rounds here to fill the delay slot. */
++      li      $at, 20
++
++      /* BYTES < 0, it has no full block. */
++      bltz    BYTES, .Lchacha20_mips_no_full_block_aligned
++
++      FOR_EACH_WORD_REV(STORE_ALIGNED)
++
++      /* BYTES > 0? Loop again. */
++      bgtz    BYTES, .Loop_chacha20_rounds
++
++      /* Place this here to fill delay slot */
++      addiu   NONCE_0, 1
++
++      /* BYTES < 0? Handle last bytes */
++      bltz    BYTES, .Lchacha20_mips_xor_bytes
++
++.Lchacha20_mips_xor_done:
++      /* Restore used registers */
++      lw      $s0,  0($sp)
++      lw      $s1,  4($sp)
++      lw      $s2,  8($sp)
++      lw      $s3, 12($sp)
++      lw      $s4, 16($sp)
++      lw      $s5, 20($sp)
++      lw      $s6, 24($sp)
++      lw      $s7, 28($sp)
++
++      /* Write NONCE_0 back to right location in state */
++      sw      NONCE_0, 48(STATE)
++
++.Lchacha20_mips_end:
++      addiu   $sp, STACK_SIZE
++      jr      $ra
++
++.Lchacha20_mips_no_full_block_aligned:
++      /* Restore the offset on BYTES */
++      addiu   BYTES, CHACHA20_BLOCK_SIZE
++
++      /* Get number of full WORDS */
++      andi    $at, BYTES, MASK_U32
++
++      /* Load upper half of jump table addr */
++      lui     T0, %hi(.Lchacha20_mips_jmptbl_aligned_0)
++
++      /* Calculate lower half jump table offset */
++      ins     T0, $at, 1, 6
++
++      /* Add offset to STATE */
++      addu    T1, STATE, $at
++
++      /* Add lower half jump table addr */
++      addiu   T0, %lo(.Lchacha20_mips_jmptbl_aligned_0)
++
++      /* Read value from STATE */
++      lw      SAVED_CA, 0(T1)
++
++      /* Store remaining bytecounter as negative value */
++      subu    BYTES, $at, BYTES
++
++      jr      T0
++
++      /* Jump table */
++      FOR_EACH_WORD(JMPTBL_ALIGNED)
++
++
++.Loop_chacha20_unaligned:
++      /* Set the number of rounds here to fill the delay slot. */
++      li      $at, 20
++
++      /* BYTES < 0, it has no full block. */
++      bltz    BYTES, .Lchacha20_mips_no_full_block_unaligned
++
++      FOR_EACH_WORD_REV(STORE_UNALIGNED)
++
++      /* BYTES > 0? Loop again. */
++      bgtz    BYTES, .Loop_chacha20_rounds
++
++      /* Write NONCE_0 back to right location in state */
++      sw      NONCE_0, 48(STATE)
++
++      .set noreorder
++      /* Fall through to byte handling */
++      bgez    BYTES, .Lchacha20_mips_xor_done
++.Lchacha20_mips_xor_unaligned_0_b:
++.Lchacha20_mips_xor_aligned_0_b:
++      /* Place this here to fill delay slot */
++      addiu   NONCE_0, 1
++      .set reorder
++
++.Lchacha20_mips_xor_bytes:
++      addu    IN, $at
++      addu    OUT, $at
++      /* First byte */
++      lbu     T1, 0(IN)
++      addiu   $at, BYTES, 1
++      CPU_TO_LE32(SAVED_X)
++      ROTR(SAVED_X)
++      xor     T1, SAVED_X
++      sb      T1, 0(OUT)
++      beqz    $at, .Lchacha20_mips_xor_done
++      /* Second byte */
++      lbu     T1, 1(IN)
++      addiu   $at, BYTES, 2
++      ROTx    SAVED_X, 8
++      xor     T1, SAVED_X
++      sb      T1, 1(OUT)
++      beqz    $at, .Lchacha20_mips_xor_done
++      /* Third byte */
++      lbu     T1, 2(IN)
++      ROTx    SAVED_X, 8
++      xor     T1, SAVED_X
++      sb      T1, 2(OUT)
++      b       .Lchacha20_mips_xor_done
++
++.Lchacha20_mips_no_full_block_unaligned:
++      /* Restore the offset on BYTES */
++      addiu   BYTES, CHACHA20_BLOCK_SIZE
++
++      /* Get number of full WORDS */
++      andi    $at, BYTES, MASK_U32
++
++      /* Load upper half of jump table addr */
++      lui     T0, %hi(.Lchacha20_mips_jmptbl_unaligned_0)
++
++      /* Calculate lower half jump table offset */
++      ins     T0, $at, 1, 6
++
++      /* Add offset to STATE */
++      addu    T1, STATE, $at
++
++      /* Add lower half jump table addr */
++      addiu   T0, %lo(.Lchacha20_mips_jmptbl_unaligned_0)
++
++      /* Read value from STATE */
++      lw      SAVED_CA, 0(T1)
++
++      /* Store remaining bytecounter as negative value */
++      subu    BYTES, $at, BYTES
++
++      jr      T0
++
++      /* Jump table */
++      FOR_EACH_WORD(JMPTBL_UNALIGNED)
++.end chacha20_mips
++.set at
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0011-crypto-mips-chacha-wire-up-accelerated-32r2-code-fro.patch b/target/linux/generic/backport-5.4/080-wireguard-0011-crypto-mips-chacha-wire-up-accelerated-32r2-code-fro.patch

new file mode 100644 (file)

index 0000000..1abfc29
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0011-crypto-mips-chacha-wire-up-accelerated-32r2-code-fro.patch
@@ -0,0 +1,560 @@
+From 01c1104f551dae77125bb3d0f461f4084f2a98df Mon Sep 17 00:00:00 2001
+From: Ard Biesheuvel <ardb@kernel.org>
+Date: Fri, 8 Nov 2019 13:22:17 +0100
+Subject: [PATCH 011/124] crypto: mips/chacha - wire up accelerated 32r2 code
+ from Zinc
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+commit 3a2f58f3ba4f6f44e33d1a48240d5eadb882cb59 upstream.
+
+This integrates the accelerated MIPS 32r2 implementation of ChaCha
+into both the API and library interfaces of the kernel crypto stack.
+
+The significance of this is that, in addition to becoming available
+as an accelerated library implementation, it can also be used by
+existing crypto API code such as Adiantum (for block encryption on
+ultra low performance cores) or IPsec using chacha20poly1305. These
+are use cases that have already opted into using the abstract crypto
+API. In order to support Adiantum, the core assembler routine has
+been adapted to take the round count as a function argument rather
+than hardcoding it to 20.
+
+Co-developed-by: René van Dorst <opensource@vdorst.com>
+Signed-off-by: René van Dorst <opensource@vdorst.com>
+Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
+Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ arch/mips/Makefile             |   2 +-
+ arch/mips/crypto/Makefile      |   4 +
+ arch/mips/crypto/chacha-core.S | 159 ++++++++++++++++++++++++---------
+ arch/mips/crypto/chacha-glue.c | 150 +++++++++++++++++++++++++++++++
+ crypto/Kconfig                 |   6 ++
+ 5 files changed, 277 insertions(+), 44 deletions(-)
+ create mode 100644 arch/mips/crypto/chacha-glue.c
+
+--- a/arch/mips/Makefile
++++ b/arch/mips/Makefile
+@@ -334,7 +334,7 @@ libs-$(CONFIG_MIPS_FP_SUPPORT) += arch/m
+ # See arch/mips/Kbuild for content of core part of the kernel
+ core-y += arch/mips/
+ 
+-drivers-$(CONFIG_MIPS_CRC_SUPPORT) += arch/mips/crypto/
++drivers-y                     += arch/mips/crypto/
+ drivers-$(CONFIG_OPROFILE)    += arch/mips/oprofile/
+ 
+ # suspend and hibernation support
+--- a/arch/mips/crypto/Makefile
++++ b/arch/mips/crypto/Makefile
+@@ -4,3 +4,7 @@
+ #
+ 
+ obj-$(CONFIG_CRYPTO_CRC32_MIPS) += crc32-mips.o
++
++obj-$(CONFIG_CRYPTO_CHACHA_MIPS) += chacha-mips.o
++chacha-mips-y := chacha-core.o chacha-glue.o
++AFLAGS_chacha-core.o += -O2 # needed to fill branch delay slots
+--- a/arch/mips/crypto/chacha-core.S
++++ b/arch/mips/crypto/chacha-core.S
+@@ -125,7 +125,7 @@
+ #define CONCAT3(a,b,c)        _CONCAT3(a,b,c)
+ 
+ #define STORE_UNALIGNED(x) \
+-CONCAT3(.Lchacha20_mips_xor_unaligned_, PLUS_ONE(x), _b: ;) \
++CONCAT3(.Lchacha_mips_xor_unaligned_, PLUS_ONE(x), _b: ;) \
+       .if (x != 12); \
+               lw      T0, (x*4)(STATE); \
+       .endif; \
+@@ -142,7 +142,7 @@ CONCAT3(.Lchacha20_mips_xor_unaligned_,
+       swr     X ## x, (x*4)+LSB ## (OUT);
+ 
+ #define STORE_ALIGNED(x) \
+-CONCAT3(.Lchacha20_mips_xor_aligned_, PLUS_ONE(x), _b: ;) \
++CONCAT3(.Lchacha_mips_xor_aligned_, PLUS_ONE(x), _b: ;) \
+       .if (x != 12); \
+               lw      T0, (x*4)(STATE); \
+       .endif; \
+@@ -162,9 +162,9 @@ CONCAT3(.Lchacha20_mips_xor_aligned_, PL
+  * Every jumptable entry must be equal in size.
+  */
+ #define JMPTBL_ALIGNED(x) \
+-.Lchacha20_mips_jmptbl_aligned_ ## x: ; \
++.Lchacha_mips_jmptbl_aligned_ ## x: ; \
+       .set    noreorder; \
+-      b       .Lchacha20_mips_xor_aligned_ ## x ## _b; \
++      b       .Lchacha_mips_xor_aligned_ ## x ## _b; \
+       .if (x == 12); \
+               addu    SAVED_X, X ## x, NONCE_0; \
+       .else; \
+@@ -173,9 +173,9 @@ CONCAT3(.Lchacha20_mips_xor_aligned_, PL
+       .set    reorder
+ 
+ #define JMPTBL_UNALIGNED(x) \
+-.Lchacha20_mips_jmptbl_unaligned_ ## x: ; \
++.Lchacha_mips_jmptbl_unaligned_ ## x: ; \
+       .set    noreorder; \
+-      b       .Lchacha20_mips_xor_unaligned_ ## x ## _b; \
++      b       .Lchacha_mips_xor_unaligned_ ## x ## _b; \
+       .if (x == 12); \
+               addu    SAVED_X, X ## x, NONCE_0; \
+       .else; \
+@@ -200,15 +200,18 @@ CONCAT3(.Lchacha20_mips_xor_aligned_, PL
+ .text
+ .set  reorder
+ .set  noat
+-.globl        chacha20_mips
+-.ent  chacha20_mips
+-chacha20_mips:
++.globl        chacha_crypt_arch
++.ent  chacha_crypt_arch
++chacha_crypt_arch:
+       .frame  $sp, STACK_SIZE, $ra
+ 
++      /* Load number of rounds */
++      lw      $at, 16($sp)
++
+       addiu   $sp, -STACK_SIZE
+ 
+       /* Return bytes = 0. */
+-      beqz    BYTES, .Lchacha20_mips_end
++      beqz    BYTES, .Lchacha_mips_end
+ 
+       lw      NONCE_0, 48(STATE)
+ 
+@@ -228,18 +231,15 @@ chacha20_mips:
+       or      IS_UNALIGNED, IN, OUT
+       andi    IS_UNALIGNED, 0x3
+ 
+-      /* Set number of rounds */
+-      li      $at, 20
+-
+-      b       .Lchacha20_rounds_start
++      b       .Lchacha_rounds_start
+ 
+ .align 4
+-.Loop_chacha20_rounds:
++.Loop_chacha_rounds:
+       addiu   IN,  CHACHA20_BLOCK_SIZE
+       addiu   OUT, CHACHA20_BLOCK_SIZE
+       addiu   NONCE_0, 1
+ 
+-.Lchacha20_rounds_start:
++.Lchacha_rounds_start:
+       lw      X0,  0(STATE)
+       lw      X1,  4(STATE)
+       lw      X2,  8(STATE)
+@@ -259,7 +259,7 @@ chacha20_mips:
+       lw      X14, 56(STATE)
+       lw      X15, 60(STATE)
+ 
+-.Loop_chacha20_xor_rounds:
++.Loop_chacha_xor_rounds:
+       addiu   $at, -2
+       AXR( 0, 1, 2, 3,  4, 5, 6, 7, 12,13,14,15, 16);
+       AXR( 8, 9,10,11, 12,13,14,15,  4, 5, 6, 7, 12);
+@@ -269,31 +269,31 @@ chacha20_mips:
+       AXR(10,11, 8, 9, 15,12,13,14,  5, 6, 7, 4, 12);
+       AXR( 0, 1, 2, 3,  5, 6, 7, 4, 15,12,13,14,  8);
+       AXR(10,11, 8, 9, 15,12,13,14,  5, 6, 7, 4,  7);
+-      bnez    $at, .Loop_chacha20_xor_rounds
++      bnez    $at, .Loop_chacha_xor_rounds
+ 
+       addiu   BYTES, -(CHACHA20_BLOCK_SIZE)
+ 
+       /* Is data src/dst unaligned? Jump */
+-      bnez    IS_UNALIGNED, .Loop_chacha20_unaligned
++      bnez    IS_UNALIGNED, .Loop_chacha_unaligned
+ 
+       /* Set number rounds here to fill delayslot. */
+-      li      $at, 20
++      lw      $at, (STACK_SIZE+16)($sp)
+ 
+       /* BYTES < 0, it has no full block. */
+-      bltz    BYTES, .Lchacha20_mips_no_full_block_aligned
++      bltz    BYTES, .Lchacha_mips_no_full_block_aligned
+ 
+       FOR_EACH_WORD_REV(STORE_ALIGNED)
+ 
+       /* BYTES > 0? Loop again. */
+-      bgtz    BYTES, .Loop_chacha20_rounds
++      bgtz    BYTES, .Loop_chacha_rounds
+ 
+       /* Place this here to fill delay slot */
+       addiu   NONCE_0, 1
+ 
+       /* BYTES < 0? Handle last bytes */
+-      bltz    BYTES, .Lchacha20_mips_xor_bytes
++      bltz    BYTES, .Lchacha_mips_xor_bytes
+ 
+-.Lchacha20_mips_xor_done:
++.Lchacha_mips_xor_done:
+       /* Restore used registers */
+       lw      $s0,  0($sp)
+       lw      $s1,  4($sp)
+@@ -307,11 +307,11 @@ chacha20_mips:
+       /* Write NONCE_0 back to right location in state */
+       sw      NONCE_0, 48(STATE)
+ 
+-.Lchacha20_mips_end:
++.Lchacha_mips_end:
+       addiu   $sp, STACK_SIZE
+       jr      $ra
+ 
+-.Lchacha20_mips_no_full_block_aligned:
++.Lchacha_mips_no_full_block_aligned:
+       /* Restore the offset on BYTES */
+       addiu   BYTES, CHACHA20_BLOCK_SIZE
+ 
+@@ -319,7 +319,7 @@ chacha20_mips:
+       andi    $at, BYTES, MASK_U32
+ 
+       /* Load upper half of jump table addr */
+-      lui     T0, %hi(.Lchacha20_mips_jmptbl_aligned_0)
++      lui     T0, %hi(.Lchacha_mips_jmptbl_aligned_0)
+ 
+       /* Calculate lower half jump table offset */
+       ins     T0, $at, 1, 6
+@@ -328,7 +328,7 @@ chacha20_mips:
+       addu    T1, STATE, $at
+ 
+       /* Add lower half jump table addr */
+-      addiu   T0, %lo(.Lchacha20_mips_jmptbl_aligned_0)
++      addiu   T0, %lo(.Lchacha_mips_jmptbl_aligned_0)
+ 
+       /* Read value from STATE */
+       lw      SAVED_CA, 0(T1)
+@@ -342,31 +342,31 @@ chacha20_mips:
+       FOR_EACH_WORD(JMPTBL_ALIGNED)
+ 
+ 
+-.Loop_chacha20_unaligned:
++.Loop_chacha_unaligned:
+       /* Set number rounds here to fill delayslot. */
+-      li      $at, 20
++      lw      $at, (STACK_SIZE+16)($sp)
+ 
+       /* BYTES > 0, it has no full block. */
+-      bltz    BYTES, .Lchacha20_mips_no_full_block_unaligned
++      bltz    BYTES, .Lchacha_mips_no_full_block_unaligned
+ 
+       FOR_EACH_WORD_REV(STORE_UNALIGNED)
+ 
+       /* BYTES > 0? Loop again. */
+-      bgtz    BYTES, .Loop_chacha20_rounds
++      bgtz    BYTES, .Loop_chacha_rounds
+ 
+       /* Write NONCE_0 back to right location in state */
+       sw      NONCE_0, 48(STATE)
+ 
+       .set noreorder
+       /* Fall through to byte handling */
+-      bgez    BYTES, .Lchacha20_mips_xor_done
+-.Lchacha20_mips_xor_unaligned_0_b:
+-.Lchacha20_mips_xor_aligned_0_b:
++      bgez    BYTES, .Lchacha_mips_xor_done
++.Lchacha_mips_xor_unaligned_0_b:
++.Lchacha_mips_xor_aligned_0_b:
+       /* Place this here to fill delay slot */
+       addiu   NONCE_0, 1
+       .set reorder
+ 
+-.Lchacha20_mips_xor_bytes:
++.Lchacha_mips_xor_bytes:
+       addu    IN, $at
+       addu    OUT, $at
+       /* First byte */
+@@ -376,22 +376,22 @@ chacha20_mips:
+       ROTR(SAVED_X)
+       xor     T1, SAVED_X
+       sb      T1, 0(OUT)
+-      beqz    $at, .Lchacha20_mips_xor_done
++      beqz    $at, .Lchacha_mips_xor_done
+       /* Second byte */
+       lbu     T1, 1(IN)
+       addiu   $at, BYTES, 2
+       ROTx    SAVED_X, 8
+       xor     T1, SAVED_X
+       sb      T1, 1(OUT)
+-      beqz    $at, .Lchacha20_mips_xor_done
++      beqz    $at, .Lchacha_mips_xor_done
+       /* Third byte */
+       lbu     T1, 2(IN)
+       ROTx    SAVED_X, 8
+       xor     T1, SAVED_X
+       sb      T1, 2(OUT)
+-      b       .Lchacha20_mips_xor_done
++      b       .Lchacha_mips_xor_done
+ 
+-.Lchacha20_mips_no_full_block_unaligned:
++.Lchacha_mips_no_full_block_unaligned:
+       /* Restore the offset on BYTES */
+       addiu   BYTES, CHACHA20_BLOCK_SIZE
+ 
+@@ -399,7 +399,7 @@ chacha20_mips:
+       andi    $at, BYTES, MASK_U32
+ 
+       /* Load upper half of jump table addr */
+-      lui     T0, %hi(.Lchacha20_mips_jmptbl_unaligned_0)
++      lui     T0, %hi(.Lchacha_mips_jmptbl_unaligned_0)
+ 
+       /* Calculate lower half jump table offset */
+       ins     T0, $at, 1, 6
+@@ -408,7 +408,7 @@ chacha20_mips:
+       addu    T1, STATE, $at
+ 
+       /* Add lower half jump table addr */
+-      addiu   T0, %lo(.Lchacha20_mips_jmptbl_unaligned_0)
++      addiu   T0, %lo(.Lchacha_mips_jmptbl_unaligned_0)
+ 
+       /* Read value from STATE */
+       lw      SAVED_CA, 0(T1)
+@@ -420,5 +420,78 @@ chacha20_mips:
+ 
+       /* Jump table */
+       FOR_EACH_WORD(JMPTBL_UNALIGNED)
+-.end chacha20_mips
++.end chacha_crypt_arch
++.set at
++
++/* Input arguments
++ * STATE      $a0
++ * OUT                $a1
++ * NROUND     $a2
++ */
++
++#undef X12
++#undef X13
++#undef X14
++#undef X15
++
++#define X12   $a3
++#define X13   $at
++#define X14   $v0
++#define X15   STATE
++
++.set noat
++.globl        hchacha_block_arch
++.ent  hchacha_block_arch
++hchacha_block_arch:
++      .frame  $sp, STACK_SIZE, $ra
++
++      addiu   $sp, -STACK_SIZE
++
++      /* Save X11(s6) */
++      sw      X11, 0($sp)
++
++      lw      X0,  0(STATE)
++      lw      X1,  4(STATE)
++      lw      X2,  8(STATE)
++      lw      X3,  12(STATE)
++      lw      X4,  16(STATE)
++      lw      X5,  20(STATE)
++      lw      X6,  24(STATE)
++      lw      X7,  28(STATE)
++      lw      X8,  32(STATE)
++      lw      X9,  36(STATE)
++      lw      X10, 40(STATE)
++      lw      X11, 44(STATE)
++      lw      X12, 48(STATE)
++      lw      X13, 52(STATE)
++      lw      X14, 56(STATE)
++      lw      X15, 60(STATE)
++
++.Loop_hchacha_xor_rounds:
++      addiu   $a2, -2
++      AXR( 0, 1, 2, 3,  4, 5, 6, 7, 12,13,14,15, 16);
++      AXR( 8, 9,10,11, 12,13,14,15,  4, 5, 6, 7, 12);
++      AXR( 0, 1, 2, 3,  4, 5, 6, 7, 12,13,14,15,  8);
++      AXR( 8, 9,10,11, 12,13,14,15,  4, 5, 6, 7,  7);
++      AXR( 0, 1, 2, 3,  5, 6, 7, 4, 15,12,13,14, 16);
++      AXR(10,11, 8, 9, 15,12,13,14,  5, 6, 7, 4, 12);
++      AXR( 0, 1, 2, 3,  5, 6, 7, 4, 15,12,13,14,  8);
++      AXR(10,11, 8, 9, 15,12,13,14,  5, 6, 7, 4,  7);
++      bnez    $a2, .Loop_hchacha_xor_rounds
++
++      /* Restore used register */
++      lw      X11, 0($sp)
++
++      sw      X0,  0(OUT)
++      sw      X1,  4(OUT)
++      sw      X2,  8(OUT)
++      sw      X3,  12(OUT)
++      sw      X12, 16(OUT)
++      sw      X13, 20(OUT)
++      sw      X14, 24(OUT)
++      sw      X15, 28(OUT)
++
++      addiu   $sp, STACK_SIZE
++      jr      $ra
++.end hchacha_block_arch
+ .set at
+--- /dev/null
++++ b/arch/mips/crypto/chacha-glue.c
+@@ -0,0 +1,150 @@
++// SPDX-License-Identifier: GPL-2.0
++/*
++ * MIPS accelerated ChaCha and XChaCha stream ciphers,
++ * including ChaCha20 (RFC7539)
++ *
++ * Copyright (C) 2019 Linaro, Ltd. <ard.biesheuvel@linaro.org>
++ */
++
++#include <asm/byteorder.h>
++#include <crypto/algapi.h>
++#include <crypto/internal/chacha.h>
++#include <crypto/internal/skcipher.h>
++#include <linux/kernel.h>
++#include <linux/module.h>
++
++asmlinkage void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src,
++                                unsigned int bytes, int nrounds);
++EXPORT_SYMBOL(chacha_crypt_arch);
++
++asmlinkage void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds);
++EXPORT_SYMBOL(hchacha_block_arch);
++
++void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv)
++{
++      chacha_init_generic(state, key, iv);
++}
++EXPORT_SYMBOL(chacha_init_arch);
++
++static int chacha_mips_stream_xor(struct skcipher_request *req,
++                                const struct chacha_ctx *ctx, const u8 *iv)
++{
++      struct skcipher_walk walk;
++      u32 state[16];
++      int err;
++
++      err = skcipher_walk_virt(&walk, req, false);
++
++      chacha_init_generic(state, ctx->key, iv);
++
++      while (walk.nbytes > 0) {
++              unsigned int nbytes = walk.nbytes;
++
++              if (nbytes < walk.total)
++                      nbytes = round_down(nbytes, walk.stride);
++
++              chacha_crypt(state, walk.dst.virt.addr, walk.src.virt.addr,
++                           nbytes, ctx->nrounds);
++              err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
++      }
++
++      return err;
++}
++
++static int chacha_mips(struct skcipher_request *req)
++{
++      struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
++      struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
++
++      return chacha_mips_stream_xor(req, ctx, req->iv);
++}
++
++static int xchacha_mips(struct skcipher_request *req)
++{
++      struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
++      struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
++      struct chacha_ctx subctx;
++      u32 state[16];
++      u8 real_iv[16];
++
++      chacha_init_generic(state, ctx->key, req->iv);
++
++      hchacha_block(state, subctx.key, ctx->nrounds);
++      subctx.nrounds = ctx->nrounds;
++
++      memcpy(&real_iv[0], req->iv + 24, 8);
++      memcpy(&real_iv[8], req->iv + 16, 8);
++      return chacha_mips_stream_xor(req, &subctx, real_iv);
++}
++
++static struct skcipher_alg algs[] = {
++      {
++              .base.cra_name          = "chacha20",
++              .base.cra_driver_name   = "chacha20-mips",
++              .base.cra_priority      = 200,
++              .base.cra_blocksize     = 1,
++              .base.cra_ctxsize       = sizeof(struct chacha_ctx),
++              .base.cra_module        = THIS_MODULE,
++
++              .min_keysize            = CHACHA_KEY_SIZE,
++              .max_keysize            = CHACHA_KEY_SIZE,
++              .ivsize                 = CHACHA_IV_SIZE,
++              .chunksize              = CHACHA_BLOCK_SIZE,
++              .setkey                 = chacha20_setkey,
++              .encrypt                = chacha_mips,
++              .decrypt                = chacha_mips,
++      }, {
++              .base.cra_name          = "xchacha20",
++              .base.cra_driver_name   = "xchacha20-mips",
++              .base.cra_priority      = 200,
++              .base.cra_blocksize     = 1,
++              .base.cra_ctxsize       = sizeof(struct chacha_ctx),
++              .base.cra_module        = THIS_MODULE,
++
++              .min_keysize            = CHACHA_KEY_SIZE,
++              .max_keysize            = CHACHA_KEY_SIZE,
++              .ivsize                 = XCHACHA_IV_SIZE,
++              .chunksize              = CHACHA_BLOCK_SIZE,
++              .setkey                 = chacha20_setkey,
++              .encrypt                = xchacha_mips,
++              .decrypt                = xchacha_mips,
++      }, {
++              .base.cra_name          = "xchacha12",
++              .base.cra_driver_name   = "xchacha12-mips",
++              .base.cra_priority      = 200,
++              .base.cra_blocksize     = 1,
++              .base.cra_ctxsize       = sizeof(struct chacha_ctx),
++              .base.cra_module        = THIS_MODULE,
++
++              .min_keysize            = CHACHA_KEY_SIZE,
++              .max_keysize            = CHACHA_KEY_SIZE,
++              .ivsize                 = XCHACHA_IV_SIZE,
++              .chunksize              = CHACHA_BLOCK_SIZE,
++              .setkey                 = chacha12_setkey,
++              .encrypt                = xchacha_mips,
++              .decrypt                = xchacha_mips,
++      }
++};
++
++static int __init chacha_simd_mod_init(void)
++{
++      return crypto_register_skciphers(algs, ARRAY_SIZE(algs));
++}
++
++static void __exit chacha_simd_mod_fini(void)
++{
++      crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
++}
++
++module_init(chacha_simd_mod_init);
++module_exit(chacha_simd_mod_fini);
++
++MODULE_DESCRIPTION("ChaCha and XChaCha stream ciphers (MIPS accelerated)");
++MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
++MODULE_LICENSE("GPL v2");
++MODULE_ALIAS_CRYPTO("chacha20");
++MODULE_ALIAS_CRYPTO("chacha20-mips");
++MODULE_ALIAS_CRYPTO("xchacha20");
++MODULE_ALIAS_CRYPTO("xchacha20-mips");
++MODULE_ALIAS_CRYPTO("xchacha12");
++MODULE_ALIAS_CRYPTO("xchacha12-mips");
+--- a/crypto/Kconfig
++++ b/crypto/Kconfig
+@@ -1423,6 +1423,12 @@ config CRYPTO_CHACHA20_X86_64
+         SSSE3, AVX2, and AVX-512VL optimized implementations of the ChaCha20,
+         XChaCha20, and XChaCha12 stream ciphers.
+ 
++config CRYPTO_CHACHA_MIPS
++      tristate "ChaCha stream cipher algorithms (MIPS 32r2 optimized)"
++      depends on CPU_MIPS32_R2
++      select CRYPTO_BLKCIPHER
++      select CRYPTO_ARCH_HAVE_LIB_CHACHA
++
+ config CRYPTO_SEED
+       tristate "SEED cipher algorithm"
+       select CRYPTO_ALGAPI
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0012-crypto-chacha-unexport-chacha_generic-routines.patch b/target/linux/generic/backport-5.4/080-wireguard-0012-crypto-chacha-unexport-chacha_generic-routines.patch

new file mode 100644 (file)

index 0000000..23a1e41
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0012-crypto-chacha-unexport-chacha_generic-routines.patch
@@ -0,0 +1,115 @@
+From 53b97caa431974880c3ea592be870a62e9ef444a Mon Sep 17 00:00:00 2001
+From: Ard Biesheuvel <ardb@kernel.org>
+Date: Fri, 8 Nov 2019 13:22:18 +0100
+Subject: [PATCH 012/124] crypto: chacha - unexport chacha_generic routines
+
+commit 22cf705360707ced15f9fe5423938f313c7df536 upstream.
+
+Now that all users of generic ChaCha code have moved to the core library,
+there is no longer a need for the generic ChaCha skcipher driver to
+export parts of its implementation for reuse by other drivers. So drop
+the exports, and make the symbols static.
+
+Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
+Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ crypto/chacha_generic.c          | 26 ++++++++------------------
+ include/crypto/internal/chacha.h | 10 ----------
+ 2 files changed, 8 insertions(+), 28 deletions(-)
+
+--- a/crypto/chacha_generic.c
++++ b/crypto/chacha_generic.c
+@@ -21,7 +21,7 @@ static int chacha_stream_xor(struct skci
+ 
+       err = skcipher_walk_virt(&walk, req, false);
+ 
+-      crypto_chacha_init(state, ctx, iv);
++      chacha_init_generic(state, ctx->key, iv);
+ 
+       while (walk.nbytes > 0) {
+               unsigned int nbytes = walk.nbytes;
+@@ -37,36 +37,27 @@ static int chacha_stream_xor(struct skci
+       return err;
+ }
+ 
+-void crypto_chacha_init(u32 *state, const struct chacha_ctx *ctx, const u8 *iv)
+-{
+-      chacha_init_generic(state, ctx->key, iv);
+-}
+-EXPORT_SYMBOL_GPL(crypto_chacha_init);
+-
+-int crypto_chacha20_setkey(struct crypto_skcipher *tfm, const u8 *key,
+-                         unsigned int keysize)
++static int crypto_chacha20_setkey(struct crypto_skcipher *tfm, const u8 *key,
++                                unsigned int keysize)
+ {
+       return chacha_setkey(tfm, key, keysize, 20);
+ }
+-EXPORT_SYMBOL_GPL(crypto_chacha20_setkey);
+ 
+-int crypto_chacha12_setkey(struct crypto_skcipher *tfm, const u8 *key,
+-                         unsigned int keysize)
++static int crypto_chacha12_setkey(struct crypto_skcipher *tfm, const u8 *key,
++                               unsigned int keysize)
+ {
+       return chacha_setkey(tfm, key, keysize, 12);
+ }
+-EXPORT_SYMBOL_GPL(crypto_chacha12_setkey);
+ 
+-int crypto_chacha_crypt(struct skcipher_request *req)
++static int crypto_chacha_crypt(struct skcipher_request *req)
+ {
+       struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+       struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
+ 
+       return chacha_stream_xor(req, ctx, req->iv);
+ }
+-EXPORT_SYMBOL_GPL(crypto_chacha_crypt);
+ 
+-int crypto_xchacha_crypt(struct skcipher_request *req)
++static int crypto_xchacha_crypt(struct skcipher_request *req)
+ {
+       struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+       struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
+@@ -75,7 +66,7 @@ int crypto_xchacha_crypt(struct skcipher
+       u8 real_iv[16];
+ 
+       /* Compute the subkey given the original key and first 128 nonce bits */
+-      crypto_chacha_init(state, ctx, req->iv);
++      chacha_init_generic(state, ctx->key, req->iv);
+       hchacha_block_generic(state, subctx.key, ctx->nrounds);
+       subctx.nrounds = ctx->nrounds;
+ 
+@@ -86,7 +77,6 @@ int crypto_xchacha_crypt(struct skcipher
+       /* Generate the stream and XOR it with the data */
+       return chacha_stream_xor(req, &subctx, real_iv);
+ }
+-EXPORT_SYMBOL_GPL(crypto_xchacha_crypt);
+ 
+ static struct skcipher_alg algs[] = {
+       {
+--- a/include/crypto/internal/chacha.h
++++ b/include/crypto/internal/chacha.h
+@@ -12,8 +12,6 @@ struct chacha_ctx {
+       int nrounds;
+ };
+ 
+-void crypto_chacha_init(u32 *state, const struct chacha_ctx *ctx, const u8 *iv);
+-
+ static inline int chacha_setkey(struct crypto_skcipher *tfm, const u8 *key,
+                               unsigned int keysize, int nrounds)
+ {
+@@ -42,12 +40,4 @@ static int inline chacha12_setkey(struct
+       return chacha_setkey(tfm, key, keysize, 12);
+ }
+ 
+-int crypto_chacha20_setkey(struct crypto_skcipher *tfm, const u8 *key,
+-                         unsigned int keysize);
+-int crypto_chacha12_setkey(struct crypto_skcipher *tfm, const u8 *key,
+-                         unsigned int keysize);
+-
+-int crypto_chacha_crypt(struct skcipher_request *req);
+-int crypto_xchacha_crypt(struct skcipher_request *req);
+-
+ #endif /* _CRYPTO_CHACHA_H */
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0013-crypto-poly1305-move-core-routines-into-a-separate-l.patch b/target/linux/generic/backport-5.4/080-wireguard-0013-crypto-poly1305-move-core-routines-into-a-separate-l.patch

new file mode 100644 (file)

index 0000000..a522704
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0013-crypto-poly1305-move-core-routines-into-a-separate-l.patch
@@ -0,0 +1,650 @@
+From 905432633564215220707ee97f64ffb249a029f2 Mon Sep 17 00:00:00 2001
+From: Ard Biesheuvel <ardb@kernel.org>
+Date: Fri, 8 Nov 2019 13:22:19 +0100
+Subject: [PATCH 013/124] crypto: poly1305 - move core routines into a separate
+ library
+
+commit 48ea8c6ebc96bc0990e12ee1c43d0832c23576bb upstream.
+
+Move the core Poly1305 routines shared between the generic Poly1305
+shash driver and the Adiantum and NHPoly1305 drivers into a separate
+library so that using just these pieces does not pull in the crypto
+API pieces of the generic Poly1305 routine.
+
+In a subsequent patch, we will augment this generic library with
+init/update/final routines so that the Poly1305 algorithm can be used
+directly without the need for using the crypto API's shash abstraction.
+
+Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
+Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ arch/x86/crypto/poly1305_glue.c    |   2 +-
+ crypto/Kconfig                     |   5 +-
+ crypto/adiantum.c                  |   5 +-
+ crypto/nhpoly1305.c                |   3 +-
+ crypto/poly1305_generic.c          | 195 ++---------------------------
+ include/crypto/internal/poly1305.h |  67 ++++++++++
+ include/crypto/poly1305.h          |  23 ----
+ lib/crypto/Kconfig                 |   3 +
+ lib/crypto/Makefile                |   3 +
+ lib/crypto/poly1305.c              | 158 +++++++++++++++++++++++
+ 10 files changed, 248 insertions(+), 216 deletions(-)
+ create mode 100644 include/crypto/internal/poly1305.h
+ create mode 100644 lib/crypto/poly1305.c
+
+--- a/arch/x86/crypto/poly1305_glue.c
++++ b/arch/x86/crypto/poly1305_glue.c
+@@ -7,8 +7,8 @@
+ 
+ #include <crypto/algapi.h>
+ #include <crypto/internal/hash.h>
++#include <crypto/internal/poly1305.h>
+ #include <crypto/internal/simd.h>
+-#include <crypto/poly1305.h>
+ #include <linux/crypto.h>
+ #include <linux/kernel.h>
+ #include <linux/module.h>
+--- a/crypto/Kconfig
++++ b/crypto/Kconfig
+@@ -446,7 +446,7 @@ config CRYPTO_KEYWRAP
+ config CRYPTO_NHPOLY1305
+       tristate
+       select CRYPTO_HASH
+-      select CRYPTO_POLY1305
++      select CRYPTO_LIB_POLY1305_GENERIC
+ 
+ config CRYPTO_NHPOLY1305_SSE2
+       tristate "NHPoly1305 hash function (x86_64 SSE2 implementation)"
+@@ -467,7 +467,7 @@ config CRYPTO_NHPOLY1305_AVX2
+ config CRYPTO_ADIANTUM
+       tristate "Adiantum support"
+       select CRYPTO_CHACHA20
+-      select CRYPTO_POLY1305
++      select CRYPTO_LIB_POLY1305_GENERIC
+       select CRYPTO_NHPOLY1305
+       select CRYPTO_MANAGER
+       help
+@@ -686,6 +686,7 @@ config CRYPTO_GHASH
+ config CRYPTO_POLY1305
+       tristate "Poly1305 authenticator algorithm"
+       select CRYPTO_HASH
++      select CRYPTO_LIB_POLY1305_GENERIC
+       help
+         Poly1305 authenticator algorithm, RFC7539.
+ 
+--- a/crypto/adiantum.c
++++ b/crypto/adiantum.c
+@@ -33,6 +33,7 @@
+ #include <crypto/b128ops.h>
+ #include <crypto/chacha.h>
+ #include <crypto/internal/hash.h>
++#include <crypto/internal/poly1305.h>
+ #include <crypto/internal/skcipher.h>
+ #include <crypto/nhpoly1305.h>
+ #include <crypto/scatterwalk.h>
+@@ -242,11 +243,11 @@ static void adiantum_hash_header(struct
+ 
+       BUILD_BUG_ON(sizeof(header) % POLY1305_BLOCK_SIZE != 0);
+       poly1305_core_blocks(&state, &tctx->header_hash_key,
+-                           &header, sizeof(header) / POLY1305_BLOCK_SIZE);
++                           &header, sizeof(header) / POLY1305_BLOCK_SIZE, 1);
+ 
+       BUILD_BUG_ON(TWEAK_SIZE % POLY1305_BLOCK_SIZE != 0);
+       poly1305_core_blocks(&state, &tctx->header_hash_key, req->iv,
+-                           TWEAK_SIZE / POLY1305_BLOCK_SIZE);
++                           TWEAK_SIZE / POLY1305_BLOCK_SIZE, 1);
+ 
+       poly1305_core_emit(&state, &rctx->header_hash);
+ }
+--- a/crypto/nhpoly1305.c
++++ b/crypto/nhpoly1305.c
+@@ -33,6 +33,7 @@
+ #include <asm/unaligned.h>
+ #include <crypto/algapi.h>
+ #include <crypto/internal/hash.h>
++#include <crypto/internal/poly1305.h>
+ #include <crypto/nhpoly1305.h>
+ #include <linux/crypto.h>
+ #include <linux/kernel.h>
+@@ -78,7 +79,7 @@ static void process_nh_hash_value(struct
+       BUILD_BUG_ON(NH_HASH_BYTES % POLY1305_BLOCK_SIZE != 0);
+ 
+       poly1305_core_blocks(&state->poly_state, &key->poly_key, state->nh_hash,
+-                           NH_HASH_BYTES / POLY1305_BLOCK_SIZE);
++                           NH_HASH_BYTES / POLY1305_BLOCK_SIZE, 1);
+ }
+ 
+ /*
+--- a/crypto/poly1305_generic.c
++++ b/crypto/poly1305_generic.c
+@@ -13,27 +13,12 @@
+ 
+ #include <crypto/algapi.h>
+ #include <crypto/internal/hash.h>
+-#include <crypto/poly1305.h>
++#include <crypto/internal/poly1305.h>
+ #include <linux/crypto.h>
+ #include <linux/kernel.h>
+ #include <linux/module.h>
+ #include <asm/unaligned.h>
+ 
+-static inline u64 mlt(u64 a, u64 b)
+-{
+-      return a * b;
+-}
+-
+-static inline u32 sr(u64 v, u_char n)
+-{
+-      return v >> n;
+-}
+-
+-static inline u32 and(u32 v, u32 mask)
+-{
+-      return v & mask;
+-}
+-
+ int crypto_poly1305_init(struct shash_desc *desc)
+ {
+       struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
+@@ -47,124 +32,8 @@ int crypto_poly1305_init(struct shash_de
+ }
+ EXPORT_SYMBOL_GPL(crypto_poly1305_init);
+ 
+-void poly1305_core_setkey(struct poly1305_key *key, const u8 *raw_key)
+-{
+-      /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */
+-      key->r[0] = (get_unaligned_le32(raw_key +  0) >> 0) & 0x3ffffff;
+-      key->r[1] = (get_unaligned_le32(raw_key +  3) >> 2) & 0x3ffff03;
+-      key->r[2] = (get_unaligned_le32(raw_key +  6) >> 4) & 0x3ffc0ff;
+-      key->r[3] = (get_unaligned_le32(raw_key +  9) >> 6) & 0x3f03fff;
+-      key->r[4] = (get_unaligned_le32(raw_key + 12) >> 8) & 0x00fffff;
+-}
+-EXPORT_SYMBOL_GPL(poly1305_core_setkey);
+-
+-/*
+- * Poly1305 requires a unique key for each tag, which implies that we can't set
+- * it on the tfm that gets accessed by multiple users simultaneously. Instead we
+- * expect the key as the first 32 bytes in the update() call.
+- */
+-unsigned int crypto_poly1305_setdesckey(struct poly1305_desc_ctx *dctx,
+-                                      const u8 *src, unsigned int srclen)
+-{
+-      if (!dctx->sset) {
+-              if (!dctx->rset && srclen >= POLY1305_BLOCK_SIZE) {
+-                      poly1305_core_setkey(&dctx->r, src);
+-                      src += POLY1305_BLOCK_SIZE;
+-                      srclen -= POLY1305_BLOCK_SIZE;
+-                      dctx->rset = true;
+-              }
+-              if (srclen >= POLY1305_BLOCK_SIZE) {
+-                      dctx->s[0] = get_unaligned_le32(src +  0);
+-                      dctx->s[1] = get_unaligned_le32(src +  4);
+-                      dctx->s[2] = get_unaligned_le32(src +  8);
+-                      dctx->s[3] = get_unaligned_le32(src + 12);
+-                      src += POLY1305_BLOCK_SIZE;
+-                      srclen -= POLY1305_BLOCK_SIZE;
+-                      dctx->sset = true;
+-              }
+-      }
+-      return srclen;
+-}
+-EXPORT_SYMBOL_GPL(crypto_poly1305_setdesckey);
+-
+-static void poly1305_blocks_internal(struct poly1305_state *state,
+-                                   const struct poly1305_key *key,
+-                                   const void *src, unsigned int nblocks,
+-                                   u32 hibit)
+-{
+-      u32 r0, r1, r2, r3, r4;
+-      u32 s1, s2, s3, s4;
+-      u32 h0, h1, h2, h3, h4;
+-      u64 d0, d1, d2, d3, d4;
+-
+-      if (!nblocks)
+-              return;
+-
+-      r0 = key->r[0];
+-      r1 = key->r[1];
+-      r2 = key->r[2];
+-      r3 = key->r[3];
+-      r4 = key->r[4];
+-
+-      s1 = r1 * 5;
+-      s2 = r2 * 5;
+-      s3 = r3 * 5;
+-      s4 = r4 * 5;
+-
+-      h0 = state->h[0];
+-      h1 = state->h[1];
+-      h2 = state->h[2];
+-      h3 = state->h[3];
+-      h4 = state->h[4];
+-
+-      do {
+-              /* h += m[i] */
+-              h0 += (get_unaligned_le32(src +  0) >> 0) & 0x3ffffff;
+-              h1 += (get_unaligned_le32(src +  3) >> 2) & 0x3ffffff;
+-              h2 += (get_unaligned_le32(src +  6) >> 4) & 0x3ffffff;
+-              h3 += (get_unaligned_le32(src +  9) >> 6) & 0x3ffffff;
+-              h4 += (get_unaligned_le32(src + 12) >> 8) | hibit;
+-
+-              /* h *= r */
+-              d0 = mlt(h0, r0) + mlt(h1, s4) + mlt(h2, s3) +
+-                   mlt(h3, s2) + mlt(h4, s1);
+-              d1 = mlt(h0, r1) + mlt(h1, r0) + mlt(h2, s4) +
+-                   mlt(h3, s3) + mlt(h4, s2);
+-              d2 = mlt(h0, r2) + mlt(h1, r1) + mlt(h2, r0) +
+-                   mlt(h3, s4) + mlt(h4, s3);
+-              d3 = mlt(h0, r3) + mlt(h1, r2) + mlt(h2, r1) +
+-                   mlt(h3, r0) + mlt(h4, s4);
+-              d4 = mlt(h0, r4) + mlt(h1, r3) + mlt(h2, r2) +
+-                   mlt(h3, r1) + mlt(h4, r0);
+-
+-              /* (partial) h %= p */
+-              d1 += sr(d0, 26);     h0 = and(d0, 0x3ffffff);
+-              d2 += sr(d1, 26);     h1 = and(d1, 0x3ffffff);
+-              d3 += sr(d2, 26);     h2 = and(d2, 0x3ffffff);
+-              d4 += sr(d3, 26);     h3 = and(d3, 0x3ffffff);
+-              h0 += sr(d4, 26) * 5; h4 = and(d4, 0x3ffffff);
+-              h1 += h0 >> 26;       h0 = h0 & 0x3ffffff;
+-
+-              src += POLY1305_BLOCK_SIZE;
+-      } while (--nblocks);
+-
+-      state->h[0] = h0;
+-      state->h[1] = h1;
+-      state->h[2] = h2;
+-      state->h[3] = h3;
+-      state->h[4] = h4;
+-}
+-
+-void poly1305_core_blocks(struct poly1305_state *state,
+-                        const struct poly1305_key *key,
+-                        const void *src, unsigned int nblocks)
+-{
+-      poly1305_blocks_internal(state, key, src, nblocks, 1 << 24);
+-}
+-EXPORT_SYMBOL_GPL(poly1305_core_blocks);
+-
+-static void poly1305_blocks(struct poly1305_desc_ctx *dctx,
+-                          const u8 *src, unsigned int srclen, u32 hibit)
++static void poly1305_blocks(struct poly1305_desc_ctx *dctx, const u8 *src,
++                          unsigned int srclen)
+ {
+       unsigned int datalen;
+ 
+@@ -174,8 +43,8 @@ static void poly1305_blocks(struct poly1
+               srclen = datalen;
+       }
+ 
+-      poly1305_blocks_internal(&dctx->h, &dctx->r,
+-                               src, srclen / POLY1305_BLOCK_SIZE, hibit);
++      poly1305_core_blocks(&dctx->h, &dctx->r, src,
++                           srclen / POLY1305_BLOCK_SIZE, 1);
+ }
+ 
+ int crypto_poly1305_update(struct shash_desc *desc,
+@@ -193,13 +62,13 @@ int crypto_poly1305_update(struct shash_
+ 
+               if (dctx->buflen == POLY1305_BLOCK_SIZE) {
+                       poly1305_blocks(dctx, dctx->buf,
+-                                      POLY1305_BLOCK_SIZE, 1 << 24);
++                                      POLY1305_BLOCK_SIZE);
+                       dctx->buflen = 0;
+               }
+       }
+ 
+       if (likely(srclen >= POLY1305_BLOCK_SIZE)) {
+-              poly1305_blocks(dctx, src, srclen, 1 << 24);
++              poly1305_blocks(dctx, src, srclen);
+               src += srclen - (srclen % POLY1305_BLOCK_SIZE);
+               srclen %= POLY1305_BLOCK_SIZE;
+       }
+@@ -213,54 +82,6 @@ int crypto_poly1305_update(struct shash_
+ }
+ EXPORT_SYMBOL_GPL(crypto_poly1305_update);
+ 
+-void poly1305_core_emit(const struct poly1305_state *state, void *dst)
+-{
+-      u32 h0, h1, h2, h3, h4;
+-      u32 g0, g1, g2, g3, g4;
+-      u32 mask;
+-
+-      /* fully carry h */
+-      h0 = state->h[0];
+-      h1 = state->h[1];
+-      h2 = state->h[2];
+-      h3 = state->h[3];
+-      h4 = state->h[4];
+-
+-      h2 += (h1 >> 26);     h1 = h1 & 0x3ffffff;
+-      h3 += (h2 >> 26);     h2 = h2 & 0x3ffffff;
+-      h4 += (h3 >> 26);     h3 = h3 & 0x3ffffff;
+-      h0 += (h4 >> 26) * 5; h4 = h4 & 0x3ffffff;
+-      h1 += (h0 >> 26);     h0 = h0 & 0x3ffffff;
+-
+-      /* compute h + -p */
+-      g0 = h0 + 5;
+-      g1 = h1 + (g0 >> 26);             g0 &= 0x3ffffff;
+-      g2 = h2 + (g1 >> 26);             g1 &= 0x3ffffff;
+-      g3 = h3 + (g2 >> 26);             g2 &= 0x3ffffff;
+-      g4 = h4 + (g3 >> 26) - (1 << 26); g3 &= 0x3ffffff;
+-
+-      /* select h if h < p, or h + -p if h >= p */
+-      mask = (g4 >> ((sizeof(u32) * 8) - 1)) - 1;
+-      g0 &= mask;
+-      g1 &= mask;
+-      g2 &= mask;
+-      g3 &= mask;
+-      g4 &= mask;
+-      mask = ~mask;
+-      h0 = (h0 & mask) | g0;
+-      h1 = (h1 & mask) | g1;
+-      h2 = (h2 & mask) | g2;
+-      h3 = (h3 & mask) | g3;
+-      h4 = (h4 & mask) | g4;
+-
+-      /* h = h % (2^128) */
+-      put_unaligned_le32((h0 >>  0) | (h1 << 26), dst +  0);
+-      put_unaligned_le32((h1 >>  6) | (h2 << 20), dst +  4);
+-      put_unaligned_le32((h2 >> 12) | (h3 << 14), dst +  8);
+-      put_unaligned_le32((h3 >> 18) | (h4 <<  8), dst + 12);
+-}
+-EXPORT_SYMBOL_GPL(poly1305_core_emit);
+-
+ int crypto_poly1305_final(struct shash_desc *desc, u8 *dst)
+ {
+       struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
+@@ -274,7 +95,7 @@ int crypto_poly1305_final(struct shash_d
+               dctx->buf[dctx->buflen++] = 1;
+               memset(dctx->buf + dctx->buflen, 0,
+                      POLY1305_BLOCK_SIZE - dctx->buflen);
+-              poly1305_blocks(dctx, dctx->buf, POLY1305_BLOCK_SIZE, 0);
++              poly1305_core_blocks(&dctx->h, &dctx->r, dctx->buf, 1, 0);
+       }
+ 
+       poly1305_core_emit(&dctx->h, digest);
+--- /dev/null
++++ b/include/crypto/internal/poly1305.h
+@@ -0,0 +1,67 @@
++/* SPDX-License-Identifier: GPL-2.0 */
++/*
++ * Common values for the Poly1305 algorithm
++ */
++
++#ifndef _CRYPTO_INTERNAL_POLY1305_H
++#define _CRYPTO_INTERNAL_POLY1305_H
++
++#include <asm/unaligned.h>
++#include <linux/types.h>
++#include <crypto/poly1305.h>
++
++struct shash_desc;
++
++/*
++ * Poly1305 core functions.  These implement the ε-almost-∆-universal hash
++ * function underlying the Poly1305 MAC, i.e. they don't add an encrypted nonce
++ * ("s key") at the end.  They also only support block-aligned inputs.
++ */
++void poly1305_core_setkey(struct poly1305_key *key, const u8 *raw_key);
++static inline void poly1305_core_init(struct poly1305_state *state)
++{
++      *state = (struct poly1305_state){};
++}
++
++void poly1305_core_blocks(struct poly1305_state *state,
++                        const struct poly1305_key *key, const void *src,
++                        unsigned int nblocks, u32 hibit);
++void poly1305_core_emit(const struct poly1305_state *state, void *dst);
++
++/* Crypto API helper functions for the Poly1305 MAC */
++int crypto_poly1305_init(struct shash_desc *desc);
++
++int crypto_poly1305_update(struct shash_desc *desc,
++                         const u8 *src, unsigned int srclen);
++int crypto_poly1305_final(struct shash_desc *desc, u8 *dst);
++
++/*
++ * Poly1305 requires a unique key for each tag, which implies that we can't set
++ * it on the tfm that gets accessed by multiple users simultaneously. Instead we
++ * expect the key as the first 32 bytes in the update() call.
++ */
++static inline
++unsigned int crypto_poly1305_setdesckey(struct poly1305_desc_ctx *dctx,
++                                      const u8 *src, unsigned int srclen)
++{
++      if (!dctx->sset) {
++              if (!dctx->rset && srclen >= POLY1305_BLOCK_SIZE) {
++                      poly1305_core_setkey(&dctx->r, src);
++                      src += POLY1305_BLOCK_SIZE;
++                      srclen -= POLY1305_BLOCK_SIZE;
++                      dctx->rset = true;
++              }
++              if (srclen >= POLY1305_BLOCK_SIZE) {
++                      dctx->s[0] = get_unaligned_le32(src +  0);
++                      dctx->s[1] = get_unaligned_le32(src +  4);
++                      dctx->s[2] = get_unaligned_le32(src +  8);
++                      dctx->s[3] = get_unaligned_le32(src + 12);
++                      src += POLY1305_BLOCK_SIZE;
++                      srclen -= POLY1305_BLOCK_SIZE;
++                      dctx->sset = true;
++              }
++      }
++      return srclen;
++}
++
++#endif
+--- a/include/crypto/poly1305.h
++++ b/include/crypto/poly1305.h
+@@ -38,27 +38,4 @@ struct poly1305_desc_ctx {
+       bool sset;
+ };
+ 
+-/*
+- * Poly1305 core functions.  These implement the ε-almost-∆-universal hash
+- * function underlying the Poly1305 MAC, i.e. they don't add an encrypted nonce
+- * ("s key") at the end.  They also only support block-aligned inputs.
+- */
+-void poly1305_core_setkey(struct poly1305_key *key, const u8 *raw_key);
+-static inline void poly1305_core_init(struct poly1305_state *state)
+-{
+-      memset(state->h, 0, sizeof(state->h));
+-}
+-void poly1305_core_blocks(struct poly1305_state *state,
+-                        const struct poly1305_key *key,
+-                        const void *src, unsigned int nblocks);
+-void poly1305_core_emit(const struct poly1305_state *state, void *dst);
+-
+-/* Crypto API helper functions for the Poly1305 MAC */
+-int crypto_poly1305_init(struct shash_desc *desc);
+-unsigned int crypto_poly1305_setdesckey(struct poly1305_desc_ctx *dctx,
+-                                      const u8 *src, unsigned int srclen);
+-int crypto_poly1305_update(struct shash_desc *desc,
+-                         const u8 *src, unsigned int srclen);
+-int crypto_poly1305_final(struct shash_desc *desc, u8 *dst);
+-
+ #endif
+--- a/lib/crypto/Kconfig
++++ b/lib/crypto/Kconfig
+@@ -37,5 +37,8 @@ config CRYPTO_LIB_CHACHA
+ config CRYPTO_LIB_DES
+       tristate
+ 
++config CRYPTO_LIB_POLY1305_GENERIC
++      tristate
++
+ config CRYPTO_LIB_SHA256
+       tristate
+--- a/lib/crypto/Makefile
++++ b/lib/crypto/Makefile
+@@ -13,5 +13,8 @@ libarc4-y                                    := arc4.o
+ obj-$(CONFIG_CRYPTO_LIB_DES)                  += libdes.o
+ libdes-y                                      := des.o
+ 
++obj-$(CONFIG_CRYPTO_LIB_POLY1305_GENERIC)     += libpoly1305.o
++libpoly1305-y                                 := poly1305.o
++
+ obj-$(CONFIG_CRYPTO_LIB_SHA256)                       += libsha256.o
+ libsha256-y                                   := sha256.o
+--- /dev/null
++++ b/lib/crypto/poly1305.c
+@@ -0,0 +1,158 @@
++// SPDX-License-Identifier: GPL-2.0-or-later
++/*
++ * Poly1305 authenticator algorithm, RFC7539
++ *
++ * Copyright (C) 2015 Martin Willi
++ *
++ * Based on public domain code by Andrew Moon and Daniel J. Bernstein.
++ */
++
++#include <crypto/internal/poly1305.h>
++#include <linux/kernel.h>
++#include <linux/module.h>
++#include <asm/unaligned.h>
++
++static inline u64 mlt(u64 a, u64 b)
++{
++      return a * b;
++}
++
++static inline u32 sr(u64 v, u_char n)
++{
++      return v >> n;
++}
++
++static inline u32 and(u32 v, u32 mask)
++{
++      return v & mask;
++}
++
++void poly1305_core_setkey(struct poly1305_key *key, const u8 *raw_key)
++{
++      /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */
++      key->r[0] = (get_unaligned_le32(raw_key +  0) >> 0) & 0x3ffffff;
++      key->r[1] = (get_unaligned_le32(raw_key +  3) >> 2) & 0x3ffff03;
++      key->r[2] = (get_unaligned_le32(raw_key +  6) >> 4) & 0x3ffc0ff;
++      key->r[3] = (get_unaligned_le32(raw_key +  9) >> 6) & 0x3f03fff;
++      key->r[4] = (get_unaligned_le32(raw_key + 12) >> 8) & 0x00fffff;
++}
++EXPORT_SYMBOL_GPL(poly1305_core_setkey);
++
++void poly1305_core_blocks(struct poly1305_state *state,
++                        const struct poly1305_key *key, const void *src,
++                        unsigned int nblocks, u32 hibit)
++{
++      u32 r0, r1, r2, r3, r4;
++      u32 s1, s2, s3, s4;
++      u32 h0, h1, h2, h3, h4;
++      u64 d0, d1, d2, d3, d4;
++
++      if (!nblocks)
++              return;
++
++      r0 = key->r[0];
++      r1 = key->r[1];
++      r2 = key->r[2];
++      r3 = key->r[3];
++      r4 = key->r[4];
++
++      s1 = r1 * 5;
++      s2 = r2 * 5;
++      s3 = r3 * 5;
++      s4 = r4 * 5;
++
++      h0 = state->h[0];
++      h1 = state->h[1];
++      h2 = state->h[2];
++      h3 = state->h[3];
++      h4 = state->h[4];
++
++      do {
++              /* h += m[i] */
++              h0 += (get_unaligned_le32(src +  0) >> 0) & 0x3ffffff;
++              h1 += (get_unaligned_le32(src +  3) >> 2) & 0x3ffffff;
++              h2 += (get_unaligned_le32(src +  6) >> 4) & 0x3ffffff;
++              h3 += (get_unaligned_le32(src +  9) >> 6) & 0x3ffffff;
++              h4 += (get_unaligned_le32(src + 12) >> 8) | (hibit << 24);
++
++              /* h *= r */
++              d0 = mlt(h0, r0) + mlt(h1, s4) + mlt(h2, s3) +
++                   mlt(h3, s2) + mlt(h4, s1);
++              d1 = mlt(h0, r1) + mlt(h1, r0) + mlt(h2, s4) +
++                   mlt(h3, s3) + mlt(h4, s2);
++              d2 = mlt(h0, r2) + mlt(h1, r1) + mlt(h2, r0) +
++                   mlt(h3, s4) + mlt(h4, s3);
++              d3 = mlt(h0, r3) + mlt(h1, r2) + mlt(h2, r1) +
++                   mlt(h3, r0) + mlt(h4, s4);
++              d4 = mlt(h0, r4) + mlt(h1, r3) + mlt(h2, r2) +
++                   mlt(h3, r1) + mlt(h4, r0);
++
++              /* (partial) h %= p */
++              d1 += sr(d0, 26);     h0 = and(d0, 0x3ffffff);
++              d2 += sr(d1, 26);     h1 = and(d1, 0x3ffffff);
++              d3 += sr(d2, 26);     h2 = and(d2, 0x3ffffff);
++              d4 += sr(d3, 26);     h3 = and(d3, 0x3ffffff);
++              h0 += sr(d4, 26) * 5; h4 = and(d4, 0x3ffffff);
++              h1 += h0 >> 26;       h0 = h0 & 0x3ffffff;
++
++              src += POLY1305_BLOCK_SIZE;
++      } while (--nblocks);
++
++      state->h[0] = h0;
++      state->h[1] = h1;
++      state->h[2] = h2;
++      state->h[3] = h3;
++      state->h[4] = h4;
++}
++EXPORT_SYMBOL_GPL(poly1305_core_blocks);
++
++void poly1305_core_emit(const struct poly1305_state *state, void *dst)
++{
++      u32 h0, h1, h2, h3, h4;
++      u32 g0, g1, g2, g3, g4;
++      u32 mask;
++
++      /* fully carry h */
++      h0 = state->h[0];
++      h1 = state->h[1];
++      h2 = state->h[2];
++      h3 = state->h[3];
++      h4 = state->h[4];
++
++      h2 += (h1 >> 26);     h1 = h1 & 0x3ffffff;
++      h3 += (h2 >> 26);     h2 = h2 & 0x3ffffff;
++      h4 += (h3 >> 26);     h3 = h3 & 0x3ffffff;
++      h0 += (h4 >> 26) * 5; h4 = h4 & 0x3ffffff;
++      h1 += (h0 >> 26);     h0 = h0 & 0x3ffffff;
++
++      /* compute h + -p */
++      g0 = h0 + 5;
++      g1 = h1 + (g0 >> 26);             g0 &= 0x3ffffff;
++      g2 = h2 + (g1 >> 26);             g1 &= 0x3ffffff;
++      g3 = h3 + (g2 >> 26);             g2 &= 0x3ffffff;
++      g4 = h4 + (g3 >> 26) - (1 << 26); g3 &= 0x3ffffff;
++
++      /* select h if h < p, or h + -p if h >= p */
++      mask = (g4 >> ((sizeof(u32) * 8) - 1)) - 1;
++      g0 &= mask;
++      g1 &= mask;
++      g2 &= mask;
++      g3 &= mask;
++      g4 &= mask;
++      mask = ~mask;
++      h0 = (h0 & mask) | g0;
++      h1 = (h1 & mask) | g1;
++      h2 = (h2 & mask) | g2;
++      h3 = (h3 & mask) | g3;
++      h4 = (h4 & mask) | g4;
++
++      /* h = h % (2^128) */
++      put_unaligned_le32((h0 >>  0) | (h1 << 26), dst +  0);
++      put_unaligned_le32((h1 >>  6) | (h2 << 20), dst +  4);
++      put_unaligned_le32((h2 >> 12) | (h3 << 14), dst +  8);
++      put_unaligned_le32((h3 >> 18) | (h4 <<  8), dst + 12);
++}
++EXPORT_SYMBOL_GPL(poly1305_core_emit);
++
++MODULE_LICENSE("GPL");
++MODULE_AUTHOR("Martin Willi <martin@strongswan.org>");
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0014-crypto-x86-poly1305-unify-Poly1305-state-struct-with.patch b/target/linux/generic/backport-5.4/080-wireguard-0014-crypto-x86-poly1305-unify-Poly1305-state-struct-with.patch

new file mode 100644 (file)

index 0000000..5a879f0
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0014-crypto-x86-poly1305-unify-Poly1305-state-struct-with.patch
@@ -0,0 +1,251 @@
+From 1017a880df176730e7f8e32f28300eea2a6c27a4 Mon Sep 17 00:00:00 2001
+From: Ard Biesheuvel <ardb@kernel.org>
+Date: Fri, 8 Nov 2019 13:22:20 +0100
+Subject: [PATCH 014/124] crypto: x86/poly1305 - unify Poly1305 state struct
+ with generic code
+
+commit ad8f5b88383ea685f2b8df2a12ee3e08089a1287 upstream.
+
+In preparation of exposing a Poly1305 library interface directly from
+the accelerated x86 driver, align the state descriptor of the x86 code
+with the one used by the generic driver. This is needed to make the
+library interface unified between all implementations.
+
+Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
+Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ arch/x86/crypto/poly1305_glue.c    | 88 ++++++++++--------------------
+ crypto/poly1305_generic.c          |  6 +-
+ include/crypto/internal/poly1305.h |  4 +-
+ include/crypto/poly1305.h          | 18 +++---
+ 4 files changed, 43 insertions(+), 73 deletions(-)
+
+--- a/arch/x86/crypto/poly1305_glue.c
++++ b/arch/x86/crypto/poly1305_glue.c
+@@ -14,40 +14,14 @@
+ #include <linux/module.h>
+ #include <asm/simd.h>
+ 
+-struct poly1305_simd_desc_ctx {
+-      struct poly1305_desc_ctx base;
+-      /* derived key u set? */
+-      bool uset;
+-#ifdef CONFIG_AS_AVX2
+-      /* derived keys r^3, r^4 set? */
+-      bool wset;
+-#endif
+-      /* derived Poly1305 key r^2 */
+-      u32 u[5];
+-      /* ... silently appended r^3 and r^4 when using AVX2 */
+-};
+-
+ asmlinkage void poly1305_block_sse2(u32 *h, const u8 *src,
+                                   const u32 *r, unsigned int blocks);
+ asmlinkage void poly1305_2block_sse2(u32 *h, const u8 *src, const u32 *r,
+                                    unsigned int blocks, const u32 *u);
+-#ifdef CONFIG_AS_AVX2
+ asmlinkage void poly1305_4block_avx2(u32 *h, const u8 *src, const u32 *r,
+                                    unsigned int blocks, const u32 *u);
+-static bool poly1305_use_avx2;
+-#endif
+ 
+-static int poly1305_simd_init(struct shash_desc *desc)
+-{
+-      struct poly1305_simd_desc_ctx *sctx = shash_desc_ctx(desc);
+-
+-      sctx->uset = false;
+-#ifdef CONFIG_AS_AVX2
+-      sctx->wset = false;
+-#endif
+-
+-      return crypto_poly1305_init(desc);
+-}
++static bool poly1305_use_avx2 __ro_after_init;
+ 
+ static void poly1305_simd_mult(u32 *a, const u32 *b)
+ {
+@@ -63,53 +37,49 @@ static void poly1305_simd_mult(u32 *a, c
+ static unsigned int poly1305_simd_blocks(struct poly1305_desc_ctx *dctx,
+                                        const u8 *src, unsigned int srclen)
+ {
+-      struct poly1305_simd_desc_ctx *sctx;
+       unsigned int blocks, datalen;
+ 
+-      BUILD_BUG_ON(offsetof(struct poly1305_simd_desc_ctx, base));
+-      sctx = container_of(dctx, struct poly1305_simd_desc_ctx, base);
+-
+       if (unlikely(!dctx->sset)) {
+               datalen = crypto_poly1305_setdesckey(dctx, src, srclen);
+               src += srclen - datalen;
+               srclen = datalen;
+       }
+ 
+-#ifdef CONFIG_AS_AVX2
+-      if (poly1305_use_avx2 && srclen >= POLY1305_BLOCK_SIZE * 4) {
+-              if (unlikely(!sctx->wset)) {
+-                      if (!sctx->uset) {
+-                              memcpy(sctx->u, dctx->r.r, sizeof(sctx->u));
+-                              poly1305_simd_mult(sctx->u, dctx->r.r);
+-                              sctx->uset = true;
++      if (IS_ENABLED(CONFIG_AS_AVX2) &&
++          poly1305_use_avx2 &&
++          srclen >= POLY1305_BLOCK_SIZE * 4) {
++              if (unlikely(dctx->rset < 4)) {
++                      if (dctx->rset < 2) {
++                              dctx->r[1] = dctx->r[0];
++                              poly1305_simd_mult(dctx->r[1].r, dctx->r[0].r);
+                       }
+-                      memcpy(sctx->u + 5, sctx->u, sizeof(sctx->u));
+-                      poly1305_simd_mult(sctx->u + 5, dctx->r.r);
+-                      memcpy(sctx->u + 10, sctx->u + 5, sizeof(sctx->u));
+-                      poly1305_simd_mult(sctx->u + 10, dctx->r.r);
+-                      sctx->wset = true;
++                      dctx->r[2] = dctx->r[1];
++                      poly1305_simd_mult(dctx->r[2].r, dctx->r[0].r);
++                      dctx->r[3] = dctx->r[2];
++                      poly1305_simd_mult(dctx->r[3].r, dctx->r[0].r);
++                      dctx->rset = 4;
+               }
+               blocks = srclen / (POLY1305_BLOCK_SIZE * 4);
+-              poly1305_4block_avx2(dctx->h.h, src, dctx->r.r, blocks,
+-                                   sctx->u);
++              poly1305_4block_avx2(dctx->h.h, src, dctx->r[0].r, blocks,
++                                   dctx->r[1].r);
+               src += POLY1305_BLOCK_SIZE * 4 * blocks;
+               srclen -= POLY1305_BLOCK_SIZE * 4 * blocks;
+       }
+-#endif
++
+       if (likely(srclen >= POLY1305_BLOCK_SIZE * 2)) {
+-              if (unlikely(!sctx->uset)) {
+-                      memcpy(sctx->u, dctx->r.r, sizeof(sctx->u));
+-                      poly1305_simd_mult(sctx->u, dctx->r.r);
+-                      sctx->uset = true;
++              if (unlikely(dctx->rset < 2)) {
++                      dctx->r[1] = dctx->r[0];
++                      poly1305_simd_mult(dctx->r[1].r, dctx->r[0].r);
++                      dctx->rset = 2;
+               }
+               blocks = srclen / (POLY1305_BLOCK_SIZE * 2);
+-              poly1305_2block_sse2(dctx->h.h, src, dctx->r.r, blocks,
+-                                   sctx->u);
++              poly1305_2block_sse2(dctx->h.h, src, dctx->r[0].r,
++                                   blocks, dctx->r[1].r);
+               src += POLY1305_BLOCK_SIZE * 2 * blocks;
+               srclen -= POLY1305_BLOCK_SIZE * 2 * blocks;
+       }
+       if (srclen >= POLY1305_BLOCK_SIZE) {
+-              poly1305_block_sse2(dctx->h.h, src, dctx->r.r, 1);
++              poly1305_block_sse2(dctx->h.h, src, dctx->r[0].r, 1);
+               srclen -= POLY1305_BLOCK_SIZE;
+       }
+       return srclen;
+@@ -159,10 +129,10 @@ static int poly1305_simd_update(struct s
+ 
+ static struct shash_alg alg = {
+       .digestsize     = POLY1305_DIGEST_SIZE,
+-      .init           = poly1305_simd_init,
++      .init           = crypto_poly1305_init,
+       .update         = poly1305_simd_update,
+       .final          = crypto_poly1305_final,
+-      .descsize       = sizeof(struct poly1305_simd_desc_ctx),
++      .descsize       = sizeof(struct poly1305_desc_ctx),
+       .base           = {
+               .cra_name               = "poly1305",
+               .cra_driver_name        = "poly1305-simd",
+@@ -177,14 +147,14 @@ static int __init poly1305_simd_mod_init
+       if (!boot_cpu_has(X86_FEATURE_XMM2))
+               return -ENODEV;
+ 
+-#ifdef CONFIG_AS_AVX2
+-      poly1305_use_avx2 = boot_cpu_has(X86_FEATURE_AVX) &&
++      poly1305_use_avx2 = IS_ENABLED(CONFIG_AS_AVX2) &&
++                          boot_cpu_has(X86_FEATURE_AVX) &&
+                           boot_cpu_has(X86_FEATURE_AVX2) &&
+                           cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL);
+-      alg.descsize = sizeof(struct poly1305_simd_desc_ctx);
++      alg.descsize = sizeof(struct poly1305_desc_ctx) + 5 * sizeof(u32);
+       if (poly1305_use_avx2)
+               alg.descsize += 10 * sizeof(u32);
+-#endif
++
+       return crypto_register_shash(&alg);
+ }
+ 
+--- a/crypto/poly1305_generic.c
++++ b/crypto/poly1305_generic.c
+@@ -25,7 +25,7 @@ int crypto_poly1305_init(struct shash_de
+ 
+       poly1305_core_init(&dctx->h);
+       dctx->buflen = 0;
+-      dctx->rset = false;
++      dctx->rset = 0;
+       dctx->sset = false;
+ 
+       return 0;
+@@ -43,7 +43,7 @@ static void poly1305_blocks(struct poly1
+               srclen = datalen;
+       }
+ 
+-      poly1305_core_blocks(&dctx->h, &dctx->r, src,
++      poly1305_core_blocks(&dctx->h, dctx->r, src,
+                            srclen / POLY1305_BLOCK_SIZE, 1);
+ }
+ 
+@@ -95,7 +95,7 @@ int crypto_poly1305_final(struct shash_d
+               dctx->buf[dctx->buflen++] = 1;
+               memset(dctx->buf + dctx->buflen, 0,
+                      POLY1305_BLOCK_SIZE - dctx->buflen);
+-              poly1305_core_blocks(&dctx->h, &dctx->r, dctx->buf, 1, 0);
++              poly1305_core_blocks(&dctx->h, dctx->r, dctx->buf, 1, 0);
+       }
+ 
+       poly1305_core_emit(&dctx->h, digest);
+--- a/include/crypto/internal/poly1305.h
++++ b/include/crypto/internal/poly1305.h
+@@ -46,10 +46,10 @@ unsigned int crypto_poly1305_setdesckey(
+ {
+       if (!dctx->sset) {
+               if (!dctx->rset && srclen >= POLY1305_BLOCK_SIZE) {
+-                      poly1305_core_setkey(&dctx->r, src);
++                      poly1305_core_setkey(dctx->r, src);
+                       src += POLY1305_BLOCK_SIZE;
+                       srclen -= POLY1305_BLOCK_SIZE;
+-                      dctx->rset = true;
++                      dctx->rset = 1;
+               }
+               if (srclen >= POLY1305_BLOCK_SIZE) {
+                       dctx->s[0] = get_unaligned_le32(src +  0);
+--- a/include/crypto/poly1305.h
++++ b/include/crypto/poly1305.h
+@@ -22,20 +22,20 @@ struct poly1305_state {
+ };
+ 
+ struct poly1305_desc_ctx {
+-      /* key */
+-      struct poly1305_key r;
+-      /* finalize key */
+-      u32 s[4];
+-      /* accumulator */
+-      struct poly1305_state h;
+       /* partial buffer */
+       u8 buf[POLY1305_BLOCK_SIZE];
+       /* bytes used in partial buffer */
+       unsigned int buflen;
+-      /* r key has been set */
+-      bool rset;
+-      /* s key has been set */
++      /* how many keys have been set in r[] */
++      unsigned short rset;
++      /* whether s[] has been set */
+       bool sset;
++      /* finalize key */
++      u32 s[4];
++      /* accumulator */
++      struct poly1305_state h;
++      /* key */
++      struct poly1305_key r[1];
+ };
+ 
+ #endif
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0015-crypto-poly1305-expose-init-update-final-library-int.patch b/target/linux/generic/backport-5.4/080-wireguard-0015-crypto-poly1305-expose-init-update-final-library-int.patch

new file mode 100644 (file)

index 0000000..66c2762
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0015-crypto-poly1305-expose-init-update-final-library-int.patch
@@ -0,0 +1,225 @@
+From fd966ddf025b8b62aab20d2e4eb242fe51ad5137 Mon Sep 17 00:00:00 2001
+From: Ard Biesheuvel <ardb@kernel.org>
+Date: Fri, 8 Nov 2019 13:22:21 +0100
+Subject: [PATCH 015/124] crypto: poly1305 - expose init/update/final library
+ interface
+
+commit a1d93064094cc5e24d64e35cf093e7191d0c9344 upstream.
+
+Expose the existing generic Poly1305 code via an init/update/final
+library interface so that callers are not required to go through
+the crypto API's shash abstraction to access it. At the same time,
+make some preparations so that the library implementation can be
+superseded by an accelerated arch-specific version in the future.
+
+Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
+Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ crypto/poly1305_generic.c | 22 +-----------
+ include/crypto/poly1305.h | 38 +++++++++++++++++++-
+ lib/crypto/Kconfig        | 26 ++++++++++++++
+ lib/crypto/poly1305.c     | 74 +++++++++++++++++++++++++++++++++++++++
+ 4 files changed, 138 insertions(+), 22 deletions(-)
+
+--- a/crypto/poly1305_generic.c
++++ b/crypto/poly1305_generic.c
+@@ -85,31 +85,11 @@ EXPORT_SYMBOL_GPL(crypto_poly1305_update
+ int crypto_poly1305_final(struct shash_desc *desc, u8 *dst)
+ {
+       struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
+-      __le32 digest[4];
+-      u64 f = 0;
+ 
+       if (unlikely(!dctx->sset))
+               return -ENOKEY;
+ 
+-      if (unlikely(dctx->buflen)) {
+-              dctx->buf[dctx->buflen++] = 1;
+-              memset(dctx->buf + dctx->buflen, 0,
+-                     POLY1305_BLOCK_SIZE - dctx->buflen);
+-              poly1305_core_blocks(&dctx->h, dctx->r, dctx->buf, 1, 0);
+-      }
+-
+-      poly1305_core_emit(&dctx->h, digest);
+-
+-      /* mac = (h + s) % (2^128) */
+-      f = (f >> 32) + le32_to_cpu(digest[0]) + dctx->s[0];
+-      put_unaligned_le32(f, dst + 0);
+-      f = (f >> 32) + le32_to_cpu(digest[1]) + dctx->s[1];
+-      put_unaligned_le32(f, dst + 4);
+-      f = (f >> 32) + le32_to_cpu(digest[2]) + dctx->s[2];
+-      put_unaligned_le32(f, dst + 8);
+-      f = (f >> 32) + le32_to_cpu(digest[3]) + dctx->s[3];
+-      put_unaligned_le32(f, dst + 12);
+-
++      poly1305_final_generic(dctx, dst);
+       return 0;
+ }
+ EXPORT_SYMBOL_GPL(crypto_poly1305_final);
+--- a/include/crypto/poly1305.h
++++ b/include/crypto/poly1305.h
+@@ -35,7 +35,43 @@ struct poly1305_desc_ctx {
+       /* accumulator */
+       struct poly1305_state h;
+       /* key */
+-      struct poly1305_key r[1];
++      struct poly1305_key r[CONFIG_CRYPTO_LIB_POLY1305_RSIZE];
+ };
+ 
++void poly1305_init_arch(struct poly1305_desc_ctx *desc, const u8 *key);
++void poly1305_init_generic(struct poly1305_desc_ctx *desc, const u8 *key);
++
++static inline void poly1305_init(struct poly1305_desc_ctx *desc, const u8 *key)
++{
++      if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_POLY1305))
++              poly1305_init_arch(desc, key);
++      else
++              poly1305_init_generic(desc, key);
++}
++
++void poly1305_update_arch(struct poly1305_desc_ctx *desc, const u8 *src,
++                        unsigned int nbytes);
++void poly1305_update_generic(struct poly1305_desc_ctx *desc, const u8 *src,
++                           unsigned int nbytes);
++
++static inline void poly1305_update(struct poly1305_desc_ctx *desc,
++                                 const u8 *src, unsigned int nbytes)
++{
++      if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_POLY1305))
++              poly1305_update_arch(desc, src, nbytes);
++      else
++              poly1305_update_generic(desc, src, nbytes);
++}
++
++void poly1305_final_arch(struct poly1305_desc_ctx *desc, u8 *digest);
++void poly1305_final_generic(struct poly1305_desc_ctx *desc, u8 *digest);
++
++static inline void poly1305_final(struct poly1305_desc_ctx *desc, u8 *digest)
++{
++      if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_POLY1305))
++              poly1305_final_arch(desc, digest);
++      else
++              poly1305_final_generic(desc, digest);
++}
++
+ #endif
+--- a/lib/crypto/Kconfig
++++ b/lib/crypto/Kconfig
+@@ -37,8 +37,34 @@ config CRYPTO_LIB_CHACHA
+ config CRYPTO_LIB_DES
+       tristate
+ 
++config CRYPTO_LIB_POLY1305_RSIZE
++      int
++      default 1
++
++config CRYPTO_ARCH_HAVE_LIB_POLY1305
++      tristate
++      help
++        Declares whether the architecture provides an arch-specific
++        accelerated implementation of the Poly1305 library interface,
++        either builtin or as a module.
++
+ config CRYPTO_LIB_POLY1305_GENERIC
+       tristate
++      help
++        This symbol can be depended upon by arch implementations of the
++        Poly1305 library interface that require the generic code as a
++        fallback, e.g., for SIMD implementations. If no arch specific
++        implementation is enabled, this implementation serves the users
++        of CRYPTO_LIB_POLY1305.
++
++config CRYPTO_LIB_POLY1305
++      tristate "Poly1305 library interface"
++      depends on CRYPTO_ARCH_HAVE_LIB_POLY1305 || !CRYPTO_ARCH_HAVE_LIB_POLY1305
++      select CRYPTO_LIB_POLY1305_GENERIC if CRYPTO_ARCH_HAVE_LIB_POLY1305=n
++      help
++        Enable the Poly1305 library interface. This interface may be fulfilled
++        by either the generic implementation or an arch-specific one, if one
++        is available and enabled.
+ 
+ config CRYPTO_LIB_SHA256
+       tristate
+--- a/lib/crypto/poly1305.c
++++ b/lib/crypto/poly1305.c
+@@ -154,5 +154,79 @@ void poly1305_core_emit(const struct pol
+ }
+ EXPORT_SYMBOL_GPL(poly1305_core_emit);
+ 
++void poly1305_init_generic(struct poly1305_desc_ctx *desc, const u8 *key)
++{
++      poly1305_core_setkey(desc->r, key);
++      desc->s[0] = get_unaligned_le32(key + 16);
++      desc->s[1] = get_unaligned_le32(key + 20);
++      desc->s[2] = get_unaligned_le32(key + 24);
++      desc->s[3] = get_unaligned_le32(key + 28);
++      poly1305_core_init(&desc->h);
++      desc->buflen = 0;
++      desc->sset = true;
++      desc->rset = 1;
++}
++EXPORT_SYMBOL_GPL(poly1305_init_generic);
++
++void poly1305_update_generic(struct poly1305_desc_ctx *desc, const u8 *src,
++                           unsigned int nbytes)
++{
++      unsigned int bytes;
++
++      if (unlikely(desc->buflen)) {
++              bytes = min(nbytes, POLY1305_BLOCK_SIZE - desc->buflen);
++              memcpy(desc->buf + desc->buflen, src, bytes);
++              src += bytes;
++              nbytes -= bytes;
++              desc->buflen += bytes;
++
++              if (desc->buflen == POLY1305_BLOCK_SIZE) {
++                      poly1305_core_blocks(&desc->h, desc->r, desc->buf, 1, 1);
++                      desc->buflen = 0;
++              }
++      }
++
++      if (likely(nbytes >= POLY1305_BLOCK_SIZE)) {
++              poly1305_core_blocks(&desc->h, desc->r, src,
++                                   nbytes / POLY1305_BLOCK_SIZE, 1);
++              src += nbytes - (nbytes % POLY1305_BLOCK_SIZE);
++              nbytes %= POLY1305_BLOCK_SIZE;
++      }
++
++      if (unlikely(nbytes)) {
++              desc->buflen = nbytes;
++              memcpy(desc->buf, src, nbytes);
++      }
++}
++EXPORT_SYMBOL_GPL(poly1305_update_generic);
++
++void poly1305_final_generic(struct poly1305_desc_ctx *desc, u8 *dst)
++{
++      __le32 digest[4];
++      u64 f = 0;
++
++      if (unlikely(desc->buflen)) {
++              desc->buf[desc->buflen++] = 1;
++              memset(desc->buf + desc->buflen, 0,
++                     POLY1305_BLOCK_SIZE - desc->buflen);
++              poly1305_core_blocks(&desc->h, desc->r, desc->buf, 1, 0);
++      }
++
++      poly1305_core_emit(&desc->h, digest);
++
++      /* mac = (h + s) % (2^128) */
++      f = (f >> 32) + le32_to_cpu(digest[0]) + desc->s[0];
++      put_unaligned_le32(f, dst + 0);
++      f = (f >> 32) + le32_to_cpu(digest[1]) + desc->s[1];
++      put_unaligned_le32(f, dst + 4);
++      f = (f >> 32) + le32_to_cpu(digest[2]) + desc->s[2];
++      put_unaligned_le32(f, dst + 8);
++      f = (f >> 32) + le32_to_cpu(digest[3]) + desc->s[3];
++      put_unaligned_le32(f, dst + 12);
++
++      *desc = (struct poly1305_desc_ctx){};
++}
++EXPORT_SYMBOL_GPL(poly1305_final_generic);
++
+ MODULE_LICENSE("GPL");
+ MODULE_AUTHOR("Martin Willi <martin@strongswan.org>");
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0016-crypto-x86-poly1305-depend-on-generic-library-not-ge.patch b/target/linux/generic/backport-5.4/080-wireguard-0016-crypto-x86-poly1305-depend-on-generic-library-not-ge.patch

new file mode 100644 (file)

index 0000000..a1fe77c
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0016-crypto-x86-poly1305-depend-on-generic-library-not-ge.patch
@@ -0,0 +1,217 @@
+From 0e610172b19b8f7c1ce829247ce5f302b25ad100 Mon Sep 17 00:00:00 2001
+From: Ard Biesheuvel <ardb@kernel.org>
+Date: Fri, 8 Nov 2019 13:22:22 +0100
+Subject: [PATCH 016/124] crypto: x86/poly1305 - depend on generic library not
+ generic shash
+
+commit 1b2c6a5120489d41c8ea3b8dacd0b4586289b158 upstream.
+
+Remove the dependency on the generic Poly1305 driver. Instead, depend
+on the generic library so that we only reuse code without pulling in
+the generic shash implementation as well.
+
+While at it, remove the logic that prefers the non-SIMD path for short
+inputs - this is no longer necessary after recent FPU handling changes
+on x86.
+
+Since this removes the last remaining user of the routines exported
+by the generic shash driver, unexport them and make them static.
+
+Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
+Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ arch/x86/crypto/poly1305_glue.c    | 66 +++++++++++++++++++++++++-----
+ crypto/Kconfig                     |  2 +-
+ crypto/poly1305_generic.c          | 11 ++---
+ include/crypto/internal/poly1305.h |  9 ----
+ 4 files changed, 60 insertions(+), 28 deletions(-)
+
+--- a/arch/x86/crypto/poly1305_glue.c
++++ b/arch/x86/crypto/poly1305_glue.c
+@@ -34,6 +34,24 @@ static void poly1305_simd_mult(u32 *a, c
+       poly1305_block_sse2(a, m, b, 1);
+ }
+ 
++static unsigned int poly1305_scalar_blocks(struct poly1305_desc_ctx *dctx,
++                                         const u8 *src, unsigned int srclen)
++{
++      unsigned int datalen;
++
++      if (unlikely(!dctx->sset)) {
++              datalen = crypto_poly1305_setdesckey(dctx, src, srclen);
++              src += srclen - datalen;
++              srclen = datalen;
++      }
++      if (srclen >= POLY1305_BLOCK_SIZE) {
++              poly1305_core_blocks(&dctx->h, dctx->r, src,
++                                   srclen / POLY1305_BLOCK_SIZE, 1);
++              srclen %= POLY1305_BLOCK_SIZE;
++      }
++      return srclen;
++}
++
+ static unsigned int poly1305_simd_blocks(struct poly1305_desc_ctx *dctx,
+                                        const u8 *src, unsigned int srclen)
+ {
+@@ -91,12 +109,6 @@ static int poly1305_simd_update(struct s
+       struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
+       unsigned int bytes;
+ 
+-      /* kernel_fpu_begin/end is costly, use fallback for small updates */
+-      if (srclen <= 288 || !crypto_simd_usable())
+-              return crypto_poly1305_update(desc, src, srclen);
+-
+-      kernel_fpu_begin();
+-
+       if (unlikely(dctx->buflen)) {
+               bytes = min(srclen, POLY1305_BLOCK_SIZE - dctx->buflen);
+               memcpy(dctx->buf + dctx->buflen, src, bytes);
+@@ -105,25 +117,57 @@ static int poly1305_simd_update(struct s
+               dctx->buflen += bytes;
+ 
+               if (dctx->buflen == POLY1305_BLOCK_SIZE) {
+-                      poly1305_simd_blocks(dctx, dctx->buf,
+-                                           POLY1305_BLOCK_SIZE);
++                      if (likely(crypto_simd_usable())) {
++                              kernel_fpu_begin();
++                              poly1305_simd_blocks(dctx, dctx->buf,
++                                                   POLY1305_BLOCK_SIZE);
++                              kernel_fpu_end();
++                      } else {
++                              poly1305_scalar_blocks(dctx, dctx->buf,
++                                                     POLY1305_BLOCK_SIZE);
++                      }
+                       dctx->buflen = 0;
+               }
+       }
+ 
+       if (likely(srclen >= POLY1305_BLOCK_SIZE)) {
+-              bytes = poly1305_simd_blocks(dctx, src, srclen);
++              if (likely(crypto_simd_usable())) {
++                      kernel_fpu_begin();
++                      bytes = poly1305_simd_blocks(dctx, src, srclen);
++                      kernel_fpu_end();
++              } else {
++                      bytes = poly1305_scalar_blocks(dctx, src, srclen);
++              }
+               src += srclen - bytes;
+               srclen = bytes;
+       }
+ 
+-      kernel_fpu_end();
+-
+       if (unlikely(srclen)) {
+               dctx->buflen = srclen;
+               memcpy(dctx->buf, src, srclen);
+       }
++}
++
++static int crypto_poly1305_init(struct shash_desc *desc)
++{
++      struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
++
++      poly1305_core_init(&dctx->h);
++      dctx->buflen = 0;
++      dctx->rset = 0;
++      dctx->sset = false;
++
++      return 0;
++}
++
++static int crypto_poly1305_final(struct shash_desc *desc, u8 *dst)
++{
++      struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
++
++      if (unlikely(!dctx->sset))
++              return -ENOKEY;
+ 
++      poly1305_final_generic(dctx, dst);
+       return 0;
+ }
+ 
+--- a/crypto/Kconfig
++++ b/crypto/Kconfig
+@@ -697,7 +697,7 @@ config CRYPTO_POLY1305
+ config CRYPTO_POLY1305_X86_64
+       tristate "Poly1305 authenticator algorithm (x86_64/SSE2/AVX2)"
+       depends on X86 && 64BIT
+-      select CRYPTO_POLY1305
++      select CRYPTO_LIB_POLY1305_GENERIC
+       help
+         Poly1305 authenticator algorithm, RFC7539.
+ 
+--- a/crypto/poly1305_generic.c
++++ b/crypto/poly1305_generic.c
+@@ -19,7 +19,7 @@
+ #include <linux/module.h>
+ #include <asm/unaligned.h>
+ 
+-int crypto_poly1305_init(struct shash_desc *desc)
++static int crypto_poly1305_init(struct shash_desc *desc)
+ {
+       struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
+ 
+@@ -30,7 +30,6 @@ int crypto_poly1305_init(struct shash_de
+ 
+       return 0;
+ }
+-EXPORT_SYMBOL_GPL(crypto_poly1305_init);
+ 
+ static void poly1305_blocks(struct poly1305_desc_ctx *dctx, const u8 *src,
+                           unsigned int srclen)
+@@ -47,8 +46,8 @@ static void poly1305_blocks(struct poly1
+                            srclen / POLY1305_BLOCK_SIZE, 1);
+ }
+ 
+-int crypto_poly1305_update(struct shash_desc *desc,
+-                         const u8 *src, unsigned int srclen)
++static int crypto_poly1305_update(struct shash_desc *desc,
++                                const u8 *src, unsigned int srclen)
+ {
+       struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
+       unsigned int bytes;
+@@ -80,9 +79,8 @@ int crypto_poly1305_update(struct shash_
+ 
+       return 0;
+ }
+-EXPORT_SYMBOL_GPL(crypto_poly1305_update);
+ 
+-int crypto_poly1305_final(struct shash_desc *desc, u8 *dst)
++static int crypto_poly1305_final(struct shash_desc *desc, u8 *dst)
+ {
+       struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
+ 
+@@ -92,7 +90,6 @@ int crypto_poly1305_final(struct shash_d
+       poly1305_final_generic(dctx, dst);
+       return 0;
+ }
+-EXPORT_SYMBOL_GPL(crypto_poly1305_final);
+ 
+ static struct shash_alg poly1305_alg = {
+       .digestsize     = POLY1305_DIGEST_SIZE,
+--- a/include/crypto/internal/poly1305.h
++++ b/include/crypto/internal/poly1305.h
+@@ -10,8 +10,6 @@
+ #include <linux/types.h>
+ #include <crypto/poly1305.h>
+ 
+-struct shash_desc;
+-
+ /*
+  * Poly1305 core functions.  These implement the ε-almost-∆-universal hash
+  * function underlying the Poly1305 MAC, i.e. they don't add an encrypted nonce
+@@ -28,13 +26,6 @@ void poly1305_core_blocks(struct poly130
+                         unsigned int nblocks, u32 hibit);
+ void poly1305_core_emit(const struct poly1305_state *state, void *dst);
+ 
+-/* Crypto API helper functions for the Poly1305 MAC */
+-int crypto_poly1305_init(struct shash_desc *desc);
+-
+-int crypto_poly1305_update(struct shash_desc *desc,
+-                         const u8 *src, unsigned int srclen);
+-int crypto_poly1305_final(struct shash_desc *desc, u8 *dst);
+-
+ /*
+  * Poly1305 requires a unique key for each tag, which implies that we can't set
+  * it on the tfm that gets accessed by multiple users simultaneously. Instead we
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0017-crypto-x86-poly1305-expose-existing-driver-as-poly13.patch b/target/linux/generic/backport-5.4/080-wireguard-0017-crypto-x86-poly1305-expose-existing-driver-as-poly13.patch

new file mode 100644 (file)

index 0000000..01037a6
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0017-crypto-x86-poly1305-expose-existing-driver-as-poly13.patch
@@ -0,0 +1,163 @@
+From 2ceb2e26de65cce974875e0487dde20bc5f1826c Mon Sep 17 00:00:00 2001
+From: Ard Biesheuvel <ardb@kernel.org>
+Date: Fri, 8 Nov 2019 13:22:23 +0100
+Subject: [PATCH 017/124] crypto: x86/poly1305 - expose existing driver as
+ poly1305 library
+
+commit f0e89bcfbb894e5844cd1bbf6b3cf7c63cb0f5ac upstream.
+
+Implement the arch init/update/final Poly1305 library routines in the
+accelerated SIMD driver for x86 so they are accessible to users of
+the Poly1305 library interface as well.
+
+Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
+Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ arch/x86/crypto/poly1305_glue.c | 57 ++++++++++++++++++++++++---------
+ crypto/Kconfig                  |  1 +
+ lib/crypto/Kconfig              |  1 +
+ 3 files changed, 43 insertions(+), 16 deletions(-)
+
+--- a/arch/x86/crypto/poly1305_glue.c
++++ b/arch/x86/crypto/poly1305_glue.c
+@@ -10,6 +10,7 @@
+ #include <crypto/internal/poly1305.h>
+ #include <crypto/internal/simd.h>
+ #include <linux/crypto.h>
++#include <linux/jump_label.h>
+ #include <linux/kernel.h>
+ #include <linux/module.h>
+ #include <asm/simd.h>
+@@ -21,7 +22,8 @@ asmlinkage void poly1305_2block_sse2(u32
+ asmlinkage void poly1305_4block_avx2(u32 *h, const u8 *src, const u32 *r,
+                                    unsigned int blocks, const u32 *u);
+ 
+-static bool poly1305_use_avx2 __ro_after_init;
++static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_simd);
++static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx2);
+ 
+ static void poly1305_simd_mult(u32 *a, const u32 *b)
+ {
+@@ -64,7 +66,7 @@ static unsigned int poly1305_simd_blocks
+       }
+ 
+       if (IS_ENABLED(CONFIG_AS_AVX2) &&
+-          poly1305_use_avx2 &&
++          static_branch_likely(&poly1305_use_avx2) &&
+           srclen >= POLY1305_BLOCK_SIZE * 4) {
+               if (unlikely(dctx->rset < 4)) {
+                       if (dctx->rset < 2) {
+@@ -103,10 +105,15 @@ static unsigned int poly1305_simd_blocks
+       return srclen;
+ }
+ 
+-static int poly1305_simd_update(struct shash_desc *desc,
+-                              const u8 *src, unsigned int srclen)
++void poly1305_init_arch(struct poly1305_desc_ctx *desc, const u8 *key)
++{
++      poly1305_init_generic(desc, key);
++}
++EXPORT_SYMBOL(poly1305_init_arch);
++
++void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src,
++                        unsigned int srclen)
+ {
+-      struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
+       unsigned int bytes;
+ 
+       if (unlikely(dctx->buflen)) {
+@@ -117,7 +124,8 @@ static int poly1305_simd_update(struct s
+               dctx->buflen += bytes;
+ 
+               if (dctx->buflen == POLY1305_BLOCK_SIZE) {
+-                      if (likely(crypto_simd_usable())) {
++                      if (static_branch_likely(&poly1305_use_simd) &&
++                          likely(crypto_simd_usable())) {
+                               kernel_fpu_begin();
+                               poly1305_simd_blocks(dctx, dctx->buf,
+                                                    POLY1305_BLOCK_SIZE);
+@@ -131,7 +139,8 @@ static int poly1305_simd_update(struct s
+       }
+ 
+       if (likely(srclen >= POLY1305_BLOCK_SIZE)) {
+-              if (likely(crypto_simd_usable())) {
++              if (static_branch_likely(&poly1305_use_simd) &&
++                  likely(crypto_simd_usable())) {
+                       kernel_fpu_begin();
+                       bytes = poly1305_simd_blocks(dctx, src, srclen);
+                       kernel_fpu_end();
+@@ -147,6 +156,13 @@ static int poly1305_simd_update(struct s
+               memcpy(dctx->buf, src, srclen);
+       }
+ }
++EXPORT_SYMBOL(poly1305_update_arch);
++
++void poly1305_final_arch(struct poly1305_desc_ctx *desc, u8 *digest)
++{
++      poly1305_final_generic(desc, digest);
++}
++EXPORT_SYMBOL(poly1305_final_arch);
+ 
+ static int crypto_poly1305_init(struct shash_desc *desc)
+ {
+@@ -171,6 +187,15 @@ static int crypto_poly1305_final(struct
+       return 0;
+ }
+ 
++static int poly1305_simd_update(struct shash_desc *desc,
++                              const u8 *src, unsigned int srclen)
++{
++      struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
++
++      poly1305_update_arch(dctx, src, srclen);
++      return 0;
++}
++
+ static struct shash_alg alg = {
+       .digestsize     = POLY1305_DIGEST_SIZE,
+       .init           = crypto_poly1305_init,
+@@ -189,15 +214,15 @@ static struct shash_alg alg = {
+ static int __init poly1305_simd_mod_init(void)
+ {
+       if (!boot_cpu_has(X86_FEATURE_XMM2))
+-              return -ENODEV;
++              return 0;
+ 
+-      poly1305_use_avx2 = IS_ENABLED(CONFIG_AS_AVX2) &&
+-                          boot_cpu_has(X86_FEATURE_AVX) &&
+-                          boot_cpu_has(X86_FEATURE_AVX2) &&
+-                          cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL);
+-      alg.descsize = sizeof(struct poly1305_desc_ctx) + 5 * sizeof(u32);
+-      if (poly1305_use_avx2)
+-              alg.descsize += 10 * sizeof(u32);
++      static_branch_enable(&poly1305_use_simd);
++
++      if (IS_ENABLED(CONFIG_AS_AVX2) &&
++          boot_cpu_has(X86_FEATURE_AVX) &&
++          boot_cpu_has(X86_FEATURE_AVX2) &&
++          cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL))
++              static_branch_enable(&poly1305_use_avx2);
+ 
+       return crypto_register_shash(&alg);
+ }
+--- a/crypto/Kconfig
++++ b/crypto/Kconfig
+@@ -698,6 +698,7 @@ config CRYPTO_POLY1305_X86_64
+       tristate "Poly1305 authenticator algorithm (x86_64/SSE2/AVX2)"
+       depends on X86 && 64BIT
+       select CRYPTO_LIB_POLY1305_GENERIC
++      select CRYPTO_ARCH_HAVE_LIB_POLY1305
+       help
+         Poly1305 authenticator algorithm, RFC7539.
+ 
+--- a/lib/crypto/Kconfig
++++ b/lib/crypto/Kconfig
+@@ -39,6 +39,7 @@ config CRYPTO_LIB_DES
+ 
+ config CRYPTO_LIB_POLY1305_RSIZE
+       int
++      default 4 if X86_64
+       default 1
+ 
+ config CRYPTO_ARCH_HAVE_LIB_POLY1305
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0018-crypto-arm64-poly1305-incorporate-OpenSSL-CRYPTOGAMS.patch b/target/linux/generic/backport-5.4/080-wireguard-0018-crypto-arm64-poly1305-incorporate-OpenSSL-CRYPTOGAMS.patch

new file mode 100644 (file)

index 0000000..6596441
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0018-crypto-arm64-poly1305-incorporate-OpenSSL-CRYPTOGAMS.patch
@@ -0,0 +1,2083 @@
+From 335ed336e74d7dcb152025ab65c2ffeceb15c690 Mon Sep 17 00:00:00 2001
+From: Ard Biesheuvel <ardb@kernel.org>
+Date: Fri, 8 Nov 2019 13:22:24 +0100
+Subject: [PATCH 018/124] crypto: arm64/poly1305 - incorporate
+ OpenSSL/CRYPTOGAMS NEON implementation
+
+commit f569ca16475155013525686d0f73bc379c67e635 upstream.
+
+This is a straight import of the OpenSSL/CRYPTOGAMS Poly1305 implementation
+for NEON authored by Andy Polyakov, and contributed by him to the OpenSSL
+project. The file 'poly1305-armv8.pl' is taken straight from this upstream
+GitHub repository [0] at commit ec55a08dc0244ce570c4fc7cade330c60798952f,
+and already contains all the changes required to build it as part of a
+Linux kernel module.
+
+[0] https://github.com/dot-asm/cryptogams
+
+Co-developed-by: Andy Polyakov <appro@cryptogams.org>
+Signed-off-by: Andy Polyakov <appro@cryptogams.org>
+Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
+Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ arch/arm64/crypto/Kconfig                 |   6 +
+ arch/arm64/crypto/Makefile                |  10 +-
+ arch/arm64/crypto/poly1305-armv8.pl       | 913 ++++++++++++++++++++++
+ arch/arm64/crypto/poly1305-core.S_shipped | 835 ++++++++++++++++++++
+ arch/arm64/crypto/poly1305-glue.c         | 237 ++++++
+ lib/crypto/Kconfig                        |   1 +
+ 6 files changed, 2001 insertions(+), 1 deletion(-)
+ create mode 100644 arch/arm64/crypto/poly1305-armv8.pl
+ create mode 100644 arch/arm64/crypto/poly1305-core.S_shipped
+ create mode 100644 arch/arm64/crypto/poly1305-glue.c
+
+--- a/arch/arm64/crypto/Kconfig
++++ b/arch/arm64/crypto/Kconfig
+@@ -106,6 +106,12 @@ config CRYPTO_CHACHA20_NEON
+       select CRYPTO_LIB_CHACHA_GENERIC
+       select CRYPTO_ARCH_HAVE_LIB_CHACHA
+ 
++config CRYPTO_POLY1305_NEON
++      tristate "Poly1305 hash function using scalar or NEON instructions"
++      depends on KERNEL_MODE_NEON
++      select CRYPTO_HASH
++      select CRYPTO_ARCH_HAVE_LIB_POLY1305
++
+ config CRYPTO_NHPOLY1305_NEON
+       tristate "NHPoly1305 hash function using NEON instructions (for Adiantum)"
+       depends on KERNEL_MODE_NEON
+--- a/arch/arm64/crypto/Makefile
++++ b/arch/arm64/crypto/Makefile
+@@ -50,6 +50,10 @@ sha512-arm64-y := sha512-glue.o sha512-c
+ obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o
+ chacha-neon-y := chacha-neon-core.o chacha-neon-glue.o
+ 
++obj-$(CONFIG_CRYPTO_POLY1305_NEON) += poly1305-neon.o
++poly1305-neon-y := poly1305-core.o poly1305-glue.o
++AFLAGS_poly1305-core.o += -Dpoly1305_init=poly1305_init_arm64
++
+ obj-$(CONFIG_CRYPTO_NHPOLY1305_NEON) += nhpoly1305-neon.o
+ nhpoly1305-neon-y := nh-neon-core.o nhpoly1305-neon-glue.o
+ 
+@@ -68,11 +72,15 @@ ifdef REGENERATE_ARM64_CRYPTO
+ quiet_cmd_perlasm = PERLASM $@
+       cmd_perlasm = $(PERL) $(<) void $(@)
+ 
++$(src)/poly1305-core.S_shipped: $(src)/poly1305-armv8.pl
++      $(call cmd,perlasm)
++
+ $(src)/sha256-core.S_shipped: $(src)/sha512-armv8.pl
+       $(call cmd,perlasm)
+ 
+ $(src)/sha512-core.S_shipped: $(src)/sha512-armv8.pl
+       $(call cmd,perlasm)
++
+ endif
+ 
+-clean-files += sha256-core.S sha512-core.S
++clean-files += poly1305-core.S sha256-core.S sha512-core.S
+--- /dev/null
++++ b/arch/arm64/crypto/poly1305-armv8.pl
+@@ -0,0 +1,913 @@
++#!/usr/bin/env perl
++# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
++#
++# ====================================================================
++# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL
++# project.
++# ====================================================================
++#
++# This module implements Poly1305 hash for ARMv8.
++#
++# June 2015
++#
++# Numbers are cycles per processed byte with poly1305_blocks alone.
++#
++#             IALU/gcc-4.9    NEON
++#
++# Apple A7    1.86/+5%        0.72
++# Cortex-A53  2.69/+58%       1.47
++# Cortex-A57  2.70/+7%        1.14
++# Denver      1.64/+50%       1.18(*)
++# X-Gene      2.13/+68%       2.27
++# Mongoose    1.77/+75%       1.12
++# Kryo                2.70/+55%       1.13
++# ThunderX2   1.17/+95%       1.36
++#
++# (*) estimate based on resources availability is less than 1.0,
++#     i.e. measured result is worse than expected, presumably binary
++#     translator is not almighty;
++
++$flavour=shift;
++$output=shift;
++
++if ($flavour && $flavour ne "void") {
++    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
++    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
++    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
++    die "can't locate arm-xlate.pl";
++
++    open STDOUT,"| \"$^X\" $xlate $flavour $output";
++} else {
++    open STDOUT,">$output";
++}
++
++my ($ctx,$inp,$len,$padbit) = map("x$_",(0..3));
++my ($mac,$nonce)=($inp,$len);
++
++my ($h0,$h1,$h2,$r0,$r1,$s1,$t0,$t1,$d0,$d1,$d2) = map("x$_",(4..14));
++
++$code.=<<___;
++#ifndef __KERNEL__
++# include "arm_arch.h"
++.extern       OPENSSL_armcap_P
++#endif
++
++.text
++
++// forward "declarations" are required for Apple
++.globl        poly1305_blocks
++.globl        poly1305_emit
++
++.globl        poly1305_init
++.type poly1305_init,%function
++.align        5
++poly1305_init:
++      cmp     $inp,xzr
++      stp     xzr,xzr,[$ctx]          // zero hash value
++      stp     xzr,xzr,[$ctx,#16]      // [along with is_base2_26]
++
++      csel    x0,xzr,x0,eq
++      b.eq    .Lno_key
++
++#ifndef       __KERNEL__
++      adrp    x17,OPENSSL_armcap_P
++      ldr     w17,[x17,#:lo12:OPENSSL_armcap_P]
++#endif
++
++      ldp     $r0,$r1,[$inp]          // load key
++      mov     $s1,#0xfffffffc0fffffff
++      movk    $s1,#0x0fff,lsl#48
++#ifdef        __AARCH64EB__
++      rev     $r0,$r0                 // flip bytes
++      rev     $r1,$r1
++#endif
++      and     $r0,$r0,$s1             // &=0ffffffc0fffffff
++      and     $s1,$s1,#-4
++      and     $r1,$r1,$s1             // &=0ffffffc0ffffffc
++      mov     w#$s1,#-1
++      stp     $r0,$r1,[$ctx,#32]      // save key value
++      str     w#$s1,[$ctx,#48]        // impossible key power value
++
++#ifndef       __KERNEL__
++      tst     w17,#ARMV7_NEON
++
++      adr     $d0,.Lpoly1305_blocks
++      adr     $r0,.Lpoly1305_blocks_neon
++      adr     $d1,.Lpoly1305_emit
++
++      csel    $d0,$d0,$r0,eq
++
++# ifdef       __ILP32__
++      stp     w#$d0,w#$d1,[$len]
++# else
++      stp     $d0,$d1,[$len]
++# endif
++#endif
++      mov     x0,#1
++.Lno_key:
++      ret
++.size poly1305_init,.-poly1305_init
++
++.type poly1305_blocks,%function
++.align        5
++poly1305_blocks:
++.Lpoly1305_blocks:
++      ands    $len,$len,#-16
++      b.eq    .Lno_data
++
++      ldp     $h0,$h1,[$ctx]          // load hash value
++      ldp     $h2,x17,[$ctx,#16]      // [along with is_base2_26]
++      ldp     $r0,$r1,[$ctx,#32]      // load key value
++
++#ifdef        __AARCH64EB__
++      lsr     $d0,$h0,#32
++      mov     w#$d1,w#$h0
++      lsr     $d2,$h1,#32
++      mov     w15,w#$h1
++      lsr     x16,$h2,#32
++#else
++      mov     w#$d0,w#$h0
++      lsr     $d1,$h0,#32
++      mov     w#$d2,w#$h1
++      lsr     x15,$h1,#32
++      mov     w16,w#$h2
++#endif
++
++      add     $d0,$d0,$d1,lsl#26      // base 2^26 -> base 2^64
++      lsr     $d1,$d2,#12
++      adds    $d0,$d0,$d2,lsl#52
++      add     $d1,$d1,x15,lsl#14
++      adc     $d1,$d1,xzr
++      lsr     $d2,x16,#24
++      adds    $d1,$d1,x16,lsl#40
++      adc     $d2,$d2,xzr
++
++      cmp     x17,#0                  // is_base2_26?
++      add     $s1,$r1,$r1,lsr#2       // s1 = r1 + (r1 >> 2)
++      csel    $h0,$h0,$d0,eq          // choose between radixes
++      csel    $h1,$h1,$d1,eq
++      csel    $h2,$h2,$d2,eq
++
++.Loop:
++      ldp     $t0,$t1,[$inp],#16      // load input
++      sub     $len,$len,#16
++#ifdef        __AARCH64EB__
++      rev     $t0,$t0
++      rev     $t1,$t1
++#endif
++      adds    $h0,$h0,$t0             // accumulate input
++      adcs    $h1,$h1,$t1
++
++      mul     $d0,$h0,$r0             // h0*r0
++      adc     $h2,$h2,$padbit
++      umulh   $d1,$h0,$r0
++
++      mul     $t0,$h1,$s1             // h1*5*r1
++      umulh   $t1,$h1,$s1
++
++      adds    $d0,$d0,$t0
++      mul     $t0,$h0,$r1             // h0*r1
++      adc     $d1,$d1,$t1
++      umulh   $d2,$h0,$r1
++
++      adds    $d1,$d1,$t0
++      mul     $t0,$h1,$r0             // h1*r0
++      adc     $d2,$d2,xzr
++      umulh   $t1,$h1,$r0
++
++      adds    $d1,$d1,$t0
++      mul     $t0,$h2,$s1             // h2*5*r1
++      adc     $d2,$d2,$t1
++      mul     $t1,$h2,$r0             // h2*r0
++
++      adds    $d1,$d1,$t0
++      adc     $d2,$d2,$t1
++
++      and     $t0,$d2,#-4             // final reduction
++      and     $h2,$d2,#3
++      add     $t0,$t0,$d2,lsr#2
++      adds    $h0,$d0,$t0
++      adcs    $h1,$d1,xzr
++      adc     $h2,$h2,xzr
++
++      cbnz    $len,.Loop
++
++      stp     $h0,$h1,[$ctx]          // store hash value
++      stp     $h2,xzr,[$ctx,#16]      // [and clear is_base2_26]
++
++.Lno_data:
++      ret
++.size poly1305_blocks,.-poly1305_blocks
++
++.type poly1305_emit,%function
++.align        5
++poly1305_emit:
++.Lpoly1305_emit:
++      ldp     $h0,$h1,[$ctx]          // load hash base 2^64
++      ldp     $h2,$r0,[$ctx,#16]      // [along with is_base2_26]
++      ldp     $t0,$t1,[$nonce]        // load nonce
++
++#ifdef        __AARCH64EB__
++      lsr     $d0,$h0,#32
++      mov     w#$d1,w#$h0
++      lsr     $d2,$h1,#32
++      mov     w15,w#$h1
++      lsr     x16,$h2,#32
++#else
++      mov     w#$d0,w#$h0
++      lsr     $d1,$h0,#32
++      mov     w#$d2,w#$h1
++      lsr     x15,$h1,#32
++      mov     w16,w#$h2
++#endif
++
++      add     $d0,$d0,$d1,lsl#26      // base 2^26 -> base 2^64
++      lsr     $d1,$d2,#12
++      adds    $d0,$d0,$d2,lsl#52
++      add     $d1,$d1,x15,lsl#14
++      adc     $d1,$d1,xzr
++      lsr     $d2,x16,#24
++      adds    $d1,$d1,x16,lsl#40
++      adc     $d2,$d2,xzr
++
++      cmp     $r0,#0                  // is_base2_26?
++      csel    $h0,$h0,$d0,eq          // choose between radixes
++      csel    $h1,$h1,$d1,eq
++      csel    $h2,$h2,$d2,eq
++
++      adds    $d0,$h0,#5              // compare to modulus
++      adcs    $d1,$h1,xzr
++      adc     $d2,$h2,xzr
++
++      tst     $d2,#-4                 // see if it's carried/borrowed
++
++      csel    $h0,$h0,$d0,eq
++      csel    $h1,$h1,$d1,eq
++
++#ifdef        __AARCH64EB__
++      ror     $t0,$t0,#32             // flip nonce words
++      ror     $t1,$t1,#32
++#endif
++      adds    $h0,$h0,$t0             // accumulate nonce
++      adc     $h1,$h1,$t1
++#ifdef        __AARCH64EB__
++      rev     $h0,$h0                 // flip output bytes
++      rev     $h1,$h1
++#endif
++      stp     $h0,$h1,[$mac]          // write result
++
++      ret
++.size poly1305_emit,.-poly1305_emit
++___
++my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("v$_.4s",(0..8));
++my ($IN01_0,$IN01_1,$IN01_2,$IN01_3,$IN01_4) = map("v$_.2s",(9..13));
++my ($IN23_0,$IN23_1,$IN23_2,$IN23_3,$IN23_4) = map("v$_.2s",(14..18));
++my ($ACC0,$ACC1,$ACC2,$ACC3,$ACC4) = map("v$_.2d",(19..23));
++my ($H0,$H1,$H2,$H3,$H4) = map("v$_.2s",(24..28));
++my ($T0,$T1,$MASK) = map("v$_",(29..31));
++
++my ($in2,$zeros)=("x16","x17");
++my $is_base2_26 = $zeros;             # borrow
++
++$code.=<<___;
++.type poly1305_mult,%function
++.align        5
++poly1305_mult:
++      mul     $d0,$h0,$r0             // h0*r0
++      umulh   $d1,$h0,$r0
++
++      mul     $t0,$h1,$s1             // h1*5*r1
++      umulh   $t1,$h1,$s1
++
++      adds    $d0,$d0,$t0
++      mul     $t0,$h0,$r1             // h0*r1
++      adc     $d1,$d1,$t1
++      umulh   $d2,$h0,$r1
++
++      adds    $d1,$d1,$t0
++      mul     $t0,$h1,$r0             // h1*r0
++      adc     $d2,$d2,xzr
++      umulh   $t1,$h1,$r0
++
++      adds    $d1,$d1,$t0
++      mul     $t0,$h2,$s1             // h2*5*r1
++      adc     $d2,$d2,$t1
++      mul     $t1,$h2,$r0             // h2*r0
++
++      adds    $d1,$d1,$t0
++      adc     $d2,$d2,$t1
++
++      and     $t0,$d2,#-4             // final reduction
++      and     $h2,$d2,#3
++      add     $t0,$t0,$d2,lsr#2
++      adds    $h0,$d0,$t0
++      adcs    $h1,$d1,xzr
++      adc     $h2,$h2,xzr
++
++      ret
++.size poly1305_mult,.-poly1305_mult
++
++.type poly1305_splat,%function
++.align        4
++poly1305_splat:
++      and     x12,$h0,#0x03ffffff     // base 2^64 -> base 2^26
++      ubfx    x13,$h0,#26,#26
++      extr    x14,$h1,$h0,#52
++      and     x14,x14,#0x03ffffff
++      ubfx    x15,$h1,#14,#26
++      extr    x16,$h2,$h1,#40
++
++      str     w12,[$ctx,#16*0]        // r0
++      add     w12,w13,w13,lsl#2       // r1*5
++      str     w13,[$ctx,#16*1]        // r1
++      add     w13,w14,w14,lsl#2       // r2*5
++      str     w12,[$ctx,#16*2]        // s1
++      str     w14,[$ctx,#16*3]        // r2
++      add     w14,w15,w15,lsl#2       // r3*5
++      str     w13,[$ctx,#16*4]        // s2
++      str     w15,[$ctx,#16*5]        // r3
++      add     w15,w16,w16,lsl#2       // r4*5
++      str     w14,[$ctx,#16*6]        // s3
++      str     w16,[$ctx,#16*7]        // r4
++      str     w15,[$ctx,#16*8]        // s4
++
++      ret
++.size poly1305_splat,.-poly1305_splat
++
++#ifdef        __KERNEL__
++.globl        poly1305_blocks_neon
++#endif
++.type poly1305_blocks_neon,%function
++.align        5
++poly1305_blocks_neon:
++.Lpoly1305_blocks_neon:
++      ldr     $is_base2_26,[$ctx,#24]
++      cmp     $len,#128
++      b.lo    .Lpoly1305_blocks
++
++      .inst   0xd503233f              // paciasp
++      stp     x29,x30,[sp,#-80]!
++      add     x29,sp,#0
++
++      stp     d8,d9,[sp,#16]          // meet ABI requirements
++      stp     d10,d11,[sp,#32]
++      stp     d12,d13,[sp,#48]
++      stp     d14,d15,[sp,#64]
++
++      cbz     $is_base2_26,.Lbase2_64_neon
++
++      ldp     w10,w11,[$ctx]          // load hash value base 2^26
++      ldp     w12,w13,[$ctx,#8]
++      ldr     w14,[$ctx,#16]
++
++      tst     $len,#31
++      b.eq    .Leven_neon
++
++      ldp     $r0,$r1,[$ctx,#32]      // load key value
++
++      add     $h0,x10,x11,lsl#26      // base 2^26 -> base 2^64
++      lsr     $h1,x12,#12
++      adds    $h0,$h0,x12,lsl#52
++      add     $h1,$h1,x13,lsl#14
++      adc     $h1,$h1,xzr
++      lsr     $h2,x14,#24
++      adds    $h1,$h1,x14,lsl#40
++      adc     $d2,$h2,xzr             // can be partially reduced...
++
++      ldp     $d0,$d1,[$inp],#16      // load input
++      sub     $len,$len,#16
++      add     $s1,$r1,$r1,lsr#2       // s1 = r1 + (r1 >> 2)
++
++#ifdef        __AARCH64EB__
++      rev     $d0,$d0
++      rev     $d1,$d1
++#endif
++      adds    $h0,$h0,$d0             // accumulate input
++      adcs    $h1,$h1,$d1
++      adc     $h2,$h2,$padbit
++
++      bl      poly1305_mult
++
++      and     x10,$h0,#0x03ffffff     // base 2^64 -> base 2^26
++      ubfx    x11,$h0,#26,#26
++      extr    x12,$h1,$h0,#52
++      and     x12,x12,#0x03ffffff
++      ubfx    x13,$h1,#14,#26
++      extr    x14,$h2,$h1,#40
++
++      b       .Leven_neon
++
++.align        4
++.Lbase2_64_neon:
++      ldp     $r0,$r1,[$ctx,#32]      // load key value
++
++      ldp     $h0,$h1,[$ctx]          // load hash value base 2^64
++      ldr     $h2,[$ctx,#16]
++
++      tst     $len,#31
++      b.eq    .Linit_neon
++
++      ldp     $d0,$d1,[$inp],#16      // load input
++      sub     $len,$len,#16
++      add     $s1,$r1,$r1,lsr#2       // s1 = r1 + (r1 >> 2)
++#ifdef        __AARCH64EB__
++      rev     $d0,$d0
++      rev     $d1,$d1
++#endif
++      adds    $h0,$h0,$d0             // accumulate input
++      adcs    $h1,$h1,$d1
++      adc     $h2,$h2,$padbit
++
++      bl      poly1305_mult
++
++.Linit_neon:
++      ldr     w17,[$ctx,#48]          // first table element
++      and     x10,$h0,#0x03ffffff     // base 2^64 -> base 2^26
++      ubfx    x11,$h0,#26,#26
++      extr    x12,$h1,$h0,#52
++      and     x12,x12,#0x03ffffff
++      ubfx    x13,$h1,#14,#26
++      extr    x14,$h2,$h1,#40
++
++      cmp     w17,#-1                 // is value impossible?
++      b.ne    .Leven_neon
++
++      fmov    ${H0},x10
++      fmov    ${H1},x11
++      fmov    ${H2},x12
++      fmov    ${H3},x13
++      fmov    ${H4},x14
++
++      ////////////////////////////////// initialize r^n table
++      mov     $h0,$r0                 // r^1
++      add     $s1,$r1,$r1,lsr#2       // s1 = r1 + (r1 >> 2)
++      mov     $h1,$r1
++      mov     $h2,xzr
++      add     $ctx,$ctx,#48+12
++      bl      poly1305_splat
++
++      bl      poly1305_mult           // r^2
++      sub     $ctx,$ctx,#4
++      bl      poly1305_splat
++
++      bl      poly1305_mult           // r^3
++      sub     $ctx,$ctx,#4
++      bl      poly1305_splat
++
++      bl      poly1305_mult           // r^4
++      sub     $ctx,$ctx,#4
++      bl      poly1305_splat
++      sub     $ctx,$ctx,#48           // restore original $ctx
++      b       .Ldo_neon
++
++.align        4
++.Leven_neon:
++      fmov    ${H0},x10
++      fmov    ${H1},x11
++      fmov    ${H2},x12
++      fmov    ${H3},x13
++      fmov    ${H4},x14
++
++.Ldo_neon:
++      ldp     x8,x12,[$inp,#32]       // inp[2:3]
++      subs    $len,$len,#64
++      ldp     x9,x13,[$inp,#48]
++      add     $in2,$inp,#96
++      adr     $zeros,.Lzeros
++
++      lsl     $padbit,$padbit,#24
++      add     x15,$ctx,#48
++
++#ifdef        __AARCH64EB__
++      rev     x8,x8
++      rev     x12,x12
++      rev     x9,x9
++      rev     x13,x13
++#endif
++      and     x4,x8,#0x03ffffff       // base 2^64 -> base 2^26
++      and     x5,x9,#0x03ffffff
++      ubfx    x6,x8,#26,#26
++      ubfx    x7,x9,#26,#26
++      add     x4,x4,x5,lsl#32         // bfi  x4,x5,#32,#32
++      extr    x8,x12,x8,#52
++      extr    x9,x13,x9,#52
++      add     x6,x6,x7,lsl#32         // bfi  x6,x7,#32,#32
++      fmov    $IN23_0,x4
++      and     x8,x8,#0x03ffffff
++      and     x9,x9,#0x03ffffff
++      ubfx    x10,x12,#14,#26
++      ubfx    x11,x13,#14,#26
++      add     x12,$padbit,x12,lsr#40
++      add     x13,$padbit,x13,lsr#40
++      add     x8,x8,x9,lsl#32         // bfi  x8,x9,#32,#32
++      fmov    $IN23_1,x6
++      add     x10,x10,x11,lsl#32      // bfi  x10,x11,#32,#32
++      add     x12,x12,x13,lsl#32      // bfi  x12,x13,#32,#32
++      fmov    $IN23_2,x8
++      fmov    $IN23_3,x10
++      fmov    $IN23_4,x12
++
++      ldp     x8,x12,[$inp],#16       // inp[0:1]
++      ldp     x9,x13,[$inp],#48
++
++      ld1     {$R0,$R1,$S1,$R2},[x15],#64
++      ld1     {$S2,$R3,$S3,$R4},[x15],#64
++      ld1     {$S4},[x15]
++
++#ifdef        __AARCH64EB__
++      rev     x8,x8
++      rev     x12,x12
++      rev     x9,x9
++      rev     x13,x13
++#endif
++      and     x4,x8,#0x03ffffff       // base 2^64 -> base 2^26
++      and     x5,x9,#0x03ffffff
++      ubfx    x6,x8,#26,#26
++      ubfx    x7,x9,#26,#26
++      add     x4,x4,x5,lsl#32         // bfi  x4,x5,#32,#32
++      extr    x8,x12,x8,#52
++      extr    x9,x13,x9,#52
++      add     x6,x6,x7,lsl#32         // bfi  x6,x7,#32,#32
++      fmov    $IN01_0,x4
++      and     x8,x8,#0x03ffffff
++      and     x9,x9,#0x03ffffff
++      ubfx    x10,x12,#14,#26
++      ubfx    x11,x13,#14,#26
++      add     x12,$padbit,x12,lsr#40
++      add     x13,$padbit,x13,lsr#40
++      add     x8,x8,x9,lsl#32         // bfi  x8,x9,#32,#32
++      fmov    $IN01_1,x6
++      add     x10,x10,x11,lsl#32      // bfi  x10,x11,#32,#32
++      add     x12,x12,x13,lsl#32      // bfi  x12,x13,#32,#32
++      movi    $MASK.2d,#-1
++      fmov    $IN01_2,x8
++      fmov    $IN01_3,x10
++      fmov    $IN01_4,x12
++      ushr    $MASK.2d,$MASK.2d,#38
++
++      b.ls    .Lskip_loop
++
++.align        4
++.Loop_neon:
++      ////////////////////////////////////////////////////////////////
++      // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
++      // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
++      //   \___________________/
++      // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
++      // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
++      //   \___________________/ \____________________/
++      //
++      // Note that we start with inp[2:3]*r^2. This is because it
++      // doesn't depend on reduction in previous iteration.
++      ////////////////////////////////////////////////////////////////
++      // d4 = h0*r4 + h1*r3   + h2*r2   + h3*r1   + h4*r0
++      // d3 = h0*r3 + h1*r2   + h2*r1   + h3*r0   + h4*5*r4
++      // d2 = h0*r2 + h1*r1   + h2*r0   + h3*5*r4 + h4*5*r3
++      // d1 = h0*r1 + h1*r0   + h2*5*r4 + h3*5*r3 + h4*5*r2
++      // d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1
++
++      subs    $len,$len,#64
++      umull   $ACC4,$IN23_0,${R4}[2]
++      csel    $in2,$zeros,$in2,lo
++      umull   $ACC3,$IN23_0,${R3}[2]
++      umull   $ACC2,$IN23_0,${R2}[2]
++       ldp    x8,x12,[$in2],#16       // inp[2:3] (or zero)
++      umull   $ACC1,$IN23_0,${R1}[2]
++       ldp    x9,x13,[$in2],#48
++      umull   $ACC0,$IN23_0,${R0}[2]
++#ifdef        __AARCH64EB__
++       rev    x8,x8
++       rev    x12,x12
++       rev    x9,x9
++       rev    x13,x13
++#endif
++
++      umlal   $ACC4,$IN23_1,${R3}[2]
++       and    x4,x8,#0x03ffffff       // base 2^64 -> base 2^26
++      umlal   $ACC3,$IN23_1,${R2}[2]
++       and    x5,x9,#0x03ffffff
++      umlal   $ACC2,$IN23_1,${R1}[2]
++       ubfx   x6,x8,#26,#26
++      umlal   $ACC1,$IN23_1,${R0}[2]
++       ubfx   x7,x9,#26,#26
++      umlal   $ACC0,$IN23_1,${S4}[2]
++       add    x4,x4,x5,lsl#32         // bfi  x4,x5,#32,#32
++
++      umlal   $ACC4,$IN23_2,${R2}[2]
++       extr   x8,x12,x8,#52
++      umlal   $ACC3,$IN23_2,${R1}[2]
++       extr   x9,x13,x9,#52
++      umlal   $ACC2,$IN23_2,${R0}[2]
++       add    x6,x6,x7,lsl#32         // bfi  x6,x7,#32,#32
++      umlal   $ACC1,$IN23_2,${S4}[2]
++       fmov   $IN23_0,x4
++      umlal   $ACC0,$IN23_2,${S3}[2]
++       and    x8,x8,#0x03ffffff
++
++      umlal   $ACC4,$IN23_3,${R1}[2]
++       and    x9,x9,#0x03ffffff
++      umlal   $ACC3,$IN23_3,${R0}[2]
++       ubfx   x10,x12,#14,#26
++      umlal   $ACC2,$IN23_3,${S4}[2]
++       ubfx   x11,x13,#14,#26
++      umlal   $ACC1,$IN23_3,${S3}[2]
++       add    x8,x8,x9,lsl#32         // bfi  x8,x9,#32,#32
++      umlal   $ACC0,$IN23_3,${S2}[2]
++       fmov   $IN23_1,x6
++
++      add     $IN01_2,$IN01_2,$H2
++       add    x12,$padbit,x12,lsr#40
++      umlal   $ACC4,$IN23_4,${R0}[2]
++       add    x13,$padbit,x13,lsr#40
++      umlal   $ACC3,$IN23_4,${S4}[2]
++       add    x10,x10,x11,lsl#32      // bfi  x10,x11,#32,#32
++      umlal   $ACC2,$IN23_4,${S3}[2]
++       add    x12,x12,x13,lsl#32      // bfi  x12,x13,#32,#32
++      umlal   $ACC1,$IN23_4,${S2}[2]
++       fmov   $IN23_2,x8
++      umlal   $ACC0,$IN23_4,${S1}[2]
++       fmov   $IN23_3,x10
++
++      ////////////////////////////////////////////////////////////////
++      // (hash+inp[0:1])*r^4 and accumulate
++
++      add     $IN01_0,$IN01_0,$H0
++       fmov   $IN23_4,x12
++      umlal   $ACC3,$IN01_2,${R1}[0]
++       ldp    x8,x12,[$inp],#16       // inp[0:1]
++      umlal   $ACC0,$IN01_2,${S3}[0]
++       ldp    x9,x13,[$inp],#48
++      umlal   $ACC4,$IN01_2,${R2}[0]
++      umlal   $ACC1,$IN01_2,${S4}[0]
++      umlal   $ACC2,$IN01_2,${R0}[0]
++#ifdef        __AARCH64EB__
++       rev    x8,x8
++       rev    x12,x12
++       rev    x9,x9
++       rev    x13,x13
++#endif
++
++      add     $IN01_1,$IN01_1,$H1
++      umlal   $ACC3,$IN01_0,${R3}[0]
++      umlal   $ACC4,$IN01_0,${R4}[0]
++       and    x4,x8,#0x03ffffff       // base 2^64 -> base 2^26
++      umlal   $ACC2,$IN01_0,${R2}[0]
++       and    x5,x9,#0x03ffffff
++      umlal   $ACC0,$IN01_0,${R0}[0]
++       ubfx   x6,x8,#26,#26
++      umlal   $ACC1,$IN01_0,${R1}[0]
++       ubfx   x7,x9,#26,#26
++
++      add     $IN01_3,$IN01_3,$H3
++       add    x4,x4,x5,lsl#32         // bfi  x4,x5,#32,#32
++      umlal   $ACC3,$IN01_1,${R2}[0]
++       extr   x8,x12,x8,#52
++      umlal   $ACC4,$IN01_1,${R3}[0]
++       extr   x9,x13,x9,#52
++      umlal   $ACC0,$IN01_1,${S4}[0]
++       add    x6,x6,x7,lsl#32         // bfi  x6,x7,#32,#32
++      umlal   $ACC2,$IN01_1,${R1}[0]
++       fmov   $IN01_0,x4
++      umlal   $ACC1,$IN01_1,${R0}[0]
++       and    x8,x8,#0x03ffffff
++
++      add     $IN01_4,$IN01_4,$H4
++       and    x9,x9,#0x03ffffff
++      umlal   $ACC3,$IN01_3,${R0}[0]
++       ubfx   x10,x12,#14,#26
++      umlal   $ACC0,$IN01_3,${S2}[0]
++       ubfx   x11,x13,#14,#26
++      umlal   $ACC4,$IN01_3,${R1}[0]
++       add    x8,x8,x9,lsl#32         // bfi  x8,x9,#32,#32
++      umlal   $ACC1,$IN01_3,${S3}[0]
++       fmov   $IN01_1,x6
++      umlal   $ACC2,$IN01_3,${S4}[0]
++       add    x12,$padbit,x12,lsr#40
++
++      umlal   $ACC3,$IN01_4,${S4}[0]
++       add    x13,$padbit,x13,lsr#40
++      umlal   $ACC0,$IN01_4,${S1}[0]
++       add    x10,x10,x11,lsl#32      // bfi  x10,x11,#32,#32
++      umlal   $ACC4,$IN01_4,${R0}[0]
++       add    x12,x12,x13,lsl#32      // bfi  x12,x13,#32,#32
++      umlal   $ACC1,$IN01_4,${S2}[0]
++       fmov   $IN01_2,x8
++      umlal   $ACC2,$IN01_4,${S3}[0]
++       fmov   $IN01_3,x10
++       fmov   $IN01_4,x12
++
++      /////////////////////////////////////////////////////////////////
++      // lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
++      // and P. Schwabe
++      //
++      // [see discussion in poly1305-armv4 module]
++
++      ushr    $T0.2d,$ACC3,#26
++      xtn     $H3,$ACC3
++       ushr   $T1.2d,$ACC0,#26
++       and    $ACC0,$ACC0,$MASK.2d
++      add     $ACC4,$ACC4,$T0.2d      // h3 -> h4
++      bic     $H3,#0xfc,lsl#24        // &=0x03ffffff
++       add    $ACC1,$ACC1,$T1.2d      // h0 -> h1
++
++      ushr    $T0.2d,$ACC4,#26
++      xtn     $H4,$ACC4
++       ushr   $T1.2d,$ACC1,#26
++       xtn    $H1,$ACC1
++      bic     $H4,#0xfc,lsl#24
++       add    $ACC2,$ACC2,$T1.2d      // h1 -> h2
++
++      add     $ACC0,$ACC0,$T0.2d
++      shl     $T0.2d,$T0.2d,#2
++       shrn   $T1.2s,$ACC2,#26
++       xtn    $H2,$ACC2
++      add     $ACC0,$ACC0,$T0.2d      // h4 -> h0
++       bic    $H1,#0xfc,lsl#24
++       add    $H3,$H3,$T1.2s          // h2 -> h3
++       bic    $H2,#0xfc,lsl#24
++
++      shrn    $T0.2s,$ACC0,#26
++      xtn     $H0,$ACC0
++       ushr   $T1.2s,$H3,#26
++       bic    $H3,#0xfc,lsl#24
++       bic    $H0,#0xfc,lsl#24
++      add     $H1,$H1,$T0.2s          // h0 -> h1
++       add    $H4,$H4,$T1.2s          // h3 -> h4
++
++      b.hi    .Loop_neon
++
++.Lskip_loop:
++      dup     $IN23_2,${IN23_2}[0]
++      add     $IN01_2,$IN01_2,$H2
++
++      ////////////////////////////////////////////////////////////////
++      // multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
++
++      adds    $len,$len,#32
++      b.ne    .Long_tail
++
++      dup     $IN23_2,${IN01_2}[0]
++      add     $IN23_0,$IN01_0,$H0
++      add     $IN23_3,$IN01_3,$H3
++      add     $IN23_1,$IN01_1,$H1
++      add     $IN23_4,$IN01_4,$H4
++
++.Long_tail:
++      dup     $IN23_0,${IN23_0}[0]
++      umull2  $ACC0,$IN23_2,${S3}
++      umull2  $ACC3,$IN23_2,${R1}
++      umull2  $ACC4,$IN23_2,${R2}
++      umull2  $ACC2,$IN23_2,${R0}
++      umull2  $ACC1,$IN23_2,${S4}
++
++      dup     $IN23_1,${IN23_1}[0]
++      umlal2  $ACC0,$IN23_0,${R0}
++      umlal2  $ACC2,$IN23_0,${R2}
++      umlal2  $ACC3,$IN23_0,${R3}
++      umlal2  $ACC4,$IN23_0,${R4}
++      umlal2  $ACC1,$IN23_0,${R1}
++
++      dup     $IN23_3,${IN23_3}[0]
++      umlal2  $ACC0,$IN23_1,${S4}
++      umlal2  $ACC3,$IN23_1,${R2}
++      umlal2  $ACC2,$IN23_1,${R1}
++      umlal2  $ACC4,$IN23_1,${R3}
++      umlal2  $ACC1,$IN23_1,${R0}
++
++      dup     $IN23_4,${IN23_4}[0]
++      umlal2  $ACC3,$IN23_3,${R0}
++      umlal2  $ACC4,$IN23_3,${R1}
++      umlal2  $ACC0,$IN23_3,${S2}
++      umlal2  $ACC1,$IN23_3,${S3}
++      umlal2  $ACC2,$IN23_3,${S4}
++
++      umlal2  $ACC3,$IN23_4,${S4}
++      umlal2  $ACC0,$IN23_4,${S1}
++      umlal2  $ACC4,$IN23_4,${R0}
++      umlal2  $ACC1,$IN23_4,${S2}
++      umlal2  $ACC2,$IN23_4,${S3}
++
++      b.eq    .Lshort_tail
++
++      ////////////////////////////////////////////////////////////////
++      // (hash+inp[0:1])*r^4:r^3 and accumulate
++
++      add     $IN01_0,$IN01_0,$H0
++      umlal   $ACC3,$IN01_2,${R1}
++      umlal   $ACC0,$IN01_2,${S3}
++      umlal   $ACC4,$IN01_2,${R2}
++      umlal   $ACC1,$IN01_2,${S4}
++      umlal   $ACC2,$IN01_2,${R0}
++
++      add     $IN01_1,$IN01_1,$H1
++      umlal   $ACC3,$IN01_0,${R3}
++      umlal   $ACC0,$IN01_0,${R0}
++      umlal   $ACC4,$IN01_0,${R4}
++      umlal   $ACC1,$IN01_0,${R1}
++      umlal   $ACC2,$IN01_0,${R2}
++
++      add     $IN01_3,$IN01_3,$H3
++      umlal   $ACC3,$IN01_1,${R2}
++      umlal   $ACC0,$IN01_1,${S4}
++      umlal   $ACC4,$IN01_1,${R3}
++      umlal   $ACC1,$IN01_1,${R0}
++      umlal   $ACC2,$IN01_1,${R1}
++
++      add     $IN01_4,$IN01_4,$H4
++      umlal   $ACC3,$IN01_3,${R0}
++      umlal   $ACC0,$IN01_3,${S2}
++      umlal   $ACC4,$IN01_3,${R1}
++      umlal   $ACC1,$IN01_3,${S3}
++      umlal   $ACC2,$IN01_3,${S4}
++
++      umlal   $ACC3,$IN01_4,${S4}
++      umlal   $ACC0,$IN01_4,${S1}
++      umlal   $ACC4,$IN01_4,${R0}
++      umlal   $ACC1,$IN01_4,${S2}
++      umlal   $ACC2,$IN01_4,${S3}
++
++.Lshort_tail:
++      ////////////////////////////////////////////////////////////////
++      // horizontal add
++
++      addp    $ACC3,$ACC3,$ACC3
++       ldp    d8,d9,[sp,#16]          // meet ABI requirements
++      addp    $ACC0,$ACC0,$ACC0
++       ldp    d10,d11,[sp,#32]
++      addp    $ACC4,$ACC4,$ACC4
++       ldp    d12,d13,[sp,#48]
++      addp    $ACC1,$ACC1,$ACC1
++       ldp    d14,d15,[sp,#64]
++      addp    $ACC2,$ACC2,$ACC2
++       ldr    x30,[sp,#8]
++       .inst  0xd50323bf              // autiasp
++
++      ////////////////////////////////////////////////////////////////
++      // lazy reduction, but without narrowing
++
++      ushr    $T0.2d,$ACC3,#26
++      and     $ACC3,$ACC3,$MASK.2d
++       ushr   $T1.2d,$ACC0,#26
++       and    $ACC0,$ACC0,$MASK.2d
++
++      add     $ACC4,$ACC4,$T0.2d      // h3 -> h4
++       add    $ACC1,$ACC1,$T1.2d      // h0 -> h1
++
++      ushr    $T0.2d,$ACC4,#26
++      and     $ACC4,$ACC4,$MASK.2d
++       ushr   $T1.2d,$ACC1,#26
++       and    $ACC1,$ACC1,$MASK.2d
++       add    $ACC2,$ACC2,$T1.2d      // h1 -> h2
++
++      add     $ACC0,$ACC0,$T0.2d
++      shl     $T0.2d,$T0.2d,#2
++       ushr   $T1.2d,$ACC2,#26
++       and    $ACC2,$ACC2,$MASK.2d
++      add     $ACC0,$ACC0,$T0.2d      // h4 -> h0
++       add    $ACC3,$ACC3,$T1.2d      // h2 -> h3
++
++      ushr    $T0.2d,$ACC0,#26
++      and     $ACC0,$ACC0,$MASK.2d
++       ushr   $T1.2d,$ACC3,#26
++       and    $ACC3,$ACC3,$MASK.2d
++      add     $ACC1,$ACC1,$T0.2d      // h0 -> h1
++       add    $ACC4,$ACC4,$T1.2d      // h3 -> h4
++
++      ////////////////////////////////////////////////////////////////
++      // write the result, can be partially reduced
++
++      st4     {$ACC0,$ACC1,$ACC2,$ACC3}[0],[$ctx],#16
++      mov     x4,#1
++      st1     {$ACC4}[0],[$ctx]
++      str     x4,[$ctx,#8]            // set is_base2_26
++
++      ldr     x29,[sp],#80
++      ret
++.size poly1305_blocks_neon,.-poly1305_blocks_neon
++
++.align        5
++.Lzeros:
++.long 0,0,0,0,0,0,0,0
++.asciz        "Poly1305 for ARMv8, CRYPTOGAMS by \@dot-asm"
++.align        2
++#if !defined(__KERNEL__) && !defined(_WIN64)
++.comm OPENSSL_armcap_P,4,4
++.hidden       OPENSSL_armcap_P
++#endif
++___
++
++foreach (split("\n",$code)) {   # fix up each generated line; the "or" chain stops at the first rule that returns true
++      s/\b(shrn\s+v[0-9]+)\.[24]d/$1.2s/                      or      # shrn: destination arrangement .2d/.4d -> .2s
++      s/\b(fmov\s+)v([0-9]+)[^,]*,\s*x([0-9]+)/$1d$2,x$3/     or      # fmov vN<suffix>,xM -> fmov dN,xM
++      (m/\bdup\b/ and (s/\.[24]s/.2d/g or 1))                 or      # dup: .2s/.4s -> .2d ("or 1": a mnemonic match ends the chain even with no substitution)
++      (m/\b(eor|and)/ and (s/\.[248][sdh]/.16b/g or 1))       or      # lines containing eor/and: .{2,4,8}{s,d,h} -> .16b
++      (m/\bum(ul|la)l\b/ and (s/\.4s/.2s/g or 1))             or      # umull/umlal: .4s -> .2s
++      (m/\bum(ul|la)l2\b/ and (s/\.2s/.4s/g or 1))            or      # umull2/umlal2: .2s -> .4s
++      (m/\bst[1-4]\s+{[^}]+}\[/ and (s/\.[24]d/.s/g or 1));           # st1..st4 lane stores: .2d/.4d -> .s
++
++      s/\.[124]([sd])\[/.$1\[/;       # first indexed operand on the line: drop the lane count, e.g. ".4s[" -> ".s["
++      s/w#x([0-9]+)/w$1/g;            # rewrite every "w#xN" as "wN"
++
++      print $_,"\n";                  # emit the processed line on STDOUT
++}
++close STDOUT;
+--- /dev/null
++++ b/arch/arm64/crypto/poly1305-core.S_shipped
+@@ -0,0 +1,835 @@
++#ifndef __KERNEL__
++# include "arm_arch.h"
++.extern       OPENSSL_armcap_P
++#endif
++
++.text
++
++// forward "declarations" are required for Apple
++.globl        poly1305_blocks
++.globl        poly1305_emit
++
++.globl        poly1305_init
++.type poly1305_init,%function
++.align        5
++poly1305_init:                          // x0 = context, x1 = key pointer (may be NULL), x2 = function-pointer slot (used only when !__KERNEL__); returns 0 in x0 without a key, 1 otherwise
++      cmp     x1,xzr                  // NULL key? (the stp below leave the flags intact)
++      stp     xzr,xzr,[x0]            // zero hash value
++      stp     xzr,xzr,[x0,#16]        // [along with is_base2_26]
++
++      csel    x0,xzr,x0,eq            // no key: return value becomes 0
++      b.eq    .Lno_key
++
++#ifndef       __KERNEL__
++      adrp    x17,OPENSSL_armcap_P
++      ldr     w17,[x17,#:lo12:OPENSSL_armcap_P]       // w17 = capability word, tested against ARMV7_NEON below
++#endif
++
++      ldp     x7,x8,[x1]              // load key
++      mov     x9,#0xfffffffc0fffffff
++      movk    x9,#0x0fff,lsl#48       // x9 = 0x0ffffffc0fffffff, mask for the first key word
++#ifdef        __AARCH64EB__
++      rev     x7,x7                   // flip bytes
++      rev     x8,x8
++#endif
++      and     x7,x7,x9                // &=0ffffffc0fffffff
++      and     x9,x9,#-4               // second-word mask additionally clears the two lowest bits
++      and     x8,x8,x9                // &=0ffffffc0ffffffc
++      mov     w9,#-1                  // marker compared against at .Linit_neon to decide whether the r^n table must be built
++      stp     x7,x8,[x0,#32]  // save key value
++      str     w9,[x0,#48]     // impossible key power value
++
++#ifndef       __KERNEL__
++      tst     w17,#ARMV7_NEON         // NEON capability bit set?
++
++      adr     x12,.Lpoly1305_blocks
++      adr     x7,.Lpoly1305_blocks_neon
++      adr     x13,.Lpoly1305_emit
++
++      csel    x12,x12,x7,eq           // bit clear: scalar blocks routine, otherwise the NEON one
++
++# ifdef       __ILP32__
++      stp     w12,w13,[x2]            // store blocks/emit addresses as 32-bit pointers at [x2]
++# else
++      stp     x12,x13,[x2]            // store blocks/emit addresses as 64-bit pointers at [x2]
++# endif
++#endif
++      mov     x0,#1                   // key processed: return 1
++.Lno_key:
++      ret
++.size poly1305_init,.-poly1305_init
++
++.type poly1305_blocks,%function
++.align        5
++poly1305_blocks:                        // x0 = context, x1 = input, x2 = length in bytes, x3 = value added to the top limb of every block ($padbit in the perlasm source)
++.Lpoly1305_blocks:
++      ands    x2,x2,#-16              // round length down to whole 16-byte blocks
++      b.eq    .Lno_data               // no complete block: return without touching the context
++
++      ldp     x4,x5,[x0]              // load hash value
++      ldp     x6,x17,[x0,#16] // [along with is_base2_26]
++      ldp     x7,x8,[x0,#32]  // load key value
++
++#ifdef        __AARCH64EB__
++      lsr     x12,x4,#32              // big-endian: the first 32-bit word of each pair sits in the upper half
++      mov     w13,w4
++      lsr     x14,x5,#32
++      mov     w15,w5
++      lsr     x16,x6,#32
++#else
++      mov     w12,w4                  // little-endian: the first 32-bit word of each pair sits in the lower half
++      lsr     x13,x4,#32
++      mov     w14,w5
++      lsr     x15,x5,#32
++      mov     w16,w6                  // x12..x16 = the five 32-bit words read as base 2^26 limbs
++#endif
++
++      add     x12,x12,x13,lsl#26      // base 2^26 -> base 2^64
++      lsr     x13,x14,#12
++      adds    x12,x12,x14,lsl#52
++      add     x13,x13,x15,lsl#14
++      adc     x13,x13,xzr
++      lsr     x14,x16,#24
++      adds    x13,x13,x16,lsl#40
++      adc     x14,x14,xzr             // x12:x13:x14 = converted value, computed unconditionally and selected below
++
++      cmp     x17,#0                  // is_base2_26?
++      add     x9,x8,x8,lsr#2  // s1 = r1 + (r1 >> 2)
++      csel    x4,x4,x12,eq            // choose between radixes
++      csel    x5,x5,x13,eq
++      csel    x6,x6,x14,eq
++
++.Loop:
++      ldp     x10,x11,[x1],#16        // load input
++      sub     x2,x2,#16               // one block consumed
++#ifdef        __AARCH64EB__
++      rev     x10,x10
++      rev     x11,x11
++#endif
++      adds    x4,x4,x10               // accumulate input
++      adcs    x5,x5,x11
++
++      mul     x12,x4,x7               // h0*r0
++      adc     x6,x6,x3                // top limb += x3 + carry from the adcs above (mul does not change flags)
++      umulh   x13,x4,x7
++
++      mul     x10,x5,x9               // h1*5*r1
++      umulh   x11,x5,x9
++
++      adds    x12,x12,x10
++      mul     x10,x4,x8               // h0*r1
++      adc     x13,x13,x11
++      umulh   x14,x4,x8
++
++      adds    x13,x13,x10
++      mul     x10,x5,x7               // h1*r0
++      adc     x14,x14,xzr
++      umulh   x11,x5,x7
++
++      adds    x13,x13,x10
++      mul     x10,x6,x9               // h2*5*r1
++      adc     x14,x14,x11
++      mul     x11,x6,x7               // h2*r0
++
++      adds    x13,x13,x10
++      adc     x14,x14,x11             // x12:x13:x14 = product before reduction
++
++      and     x10,x14,#-4             // final reduction
++      and     x6,x14,#3               // low two bits of the top word become the new top limb
++      add     x10,x10,x14,lsr#2       // x10 = (top & ~3) + (top >> 2) = 5 * (top >> 2)
++      adds    x4,x12,x10
++      adcs    x5,x13,xzr
++      adc     x6,x6,xzr
++
++      cbnz    x2,.Loop                // loop while input bytes remain
++
++      stp     x4,x5,[x0]              // store hash value
++      stp     x6,xzr,[x0,#16] // [and clear is_base2_26]
++
++.Lno_data:
++      ret
++.size poly1305_blocks,.-poly1305_blocks
++
++.type poly1305_emit,%function
++.align        5
++poly1305_emit:                          // x0 = context, x1 = output (16 bytes written), x2 = nonce (16 bytes read)
++.Lpoly1305_emit:
++      ldp     x4,x5,[x0]              // load hash base 2^64
++      ldp     x6,x7,[x0,#16]  // [along with is_base2_26]
++      ldp     x10,x11,[x2]    // load nonce
++
++#ifdef        __AARCH64EB__
++      lsr     x12,x4,#32              // big-endian: the first 32-bit word of each pair sits in the upper half
++      mov     w13,w4
++      lsr     x14,x5,#32
++      mov     w15,w5
++      lsr     x16,x6,#32
++#else
++      mov     w12,w4                  // little-endian: the first 32-bit word of each pair sits in the lower half
++      lsr     x13,x4,#32
++      mov     w14,w5
++      lsr     x15,x5,#32
++      mov     w16,w6                  // x12..x16 = the five 32-bit words read as base 2^26 limbs
++#endif
++
++      add     x12,x12,x13,lsl#26      // base 2^26 -> base 2^64
++      lsr     x13,x14,#12
++      adds    x12,x12,x14,lsl#52
++      add     x13,x13,x15,lsl#14
++      adc     x13,x13,xzr
++      lsr     x14,x16,#24
++      adds    x13,x13,x16,lsl#40
++      adc     x14,x14,xzr             // x12:x13:x14 = converted value, computed unconditionally and selected below
++
++      cmp     x7,#0                   // is_base2_26?
++      csel    x4,x4,x12,eq            // choose between radixes
++      csel    x5,x5,x13,eq
++      csel    x6,x6,x14,eq
++
++      adds    x12,x4,#5               // compare to modulus
++      adcs    x13,x5,xzr
++      adc     x14,x6,xzr              // x12:x13:x14 = hash + 5
++
++      tst     x14,#-4                 // see if it's carried/borrowed
++
++      csel    x4,x4,x12,eq            // hash+5 has no bit at or above 2^130: keep hash, otherwise take the low 128 bits of hash+5
++      csel    x5,x5,x13,eq
++
++#ifdef        __AARCH64EB__
++      ror     x10,x10,#32             // flip nonce words
++      ror     x11,x11,#32
++#endif
++      adds    x4,x4,x10               // accumulate nonce
++      adc     x5,x5,x11               // carry out of the upper word is discarded
++#ifdef        __AARCH64EB__
++      rev     x4,x4                   // flip output bytes
++      rev     x5,x5
++#endif
++      stp     x4,x5,[x1]              // write result
++
++      ret
++.size poly1305_emit,.-poly1305_emit
++.type poly1305_mult,%function
++.align        5
++poly1305_mult:                          // in: x4:x5:x6 = h, x7:x8 = r0:r1, x9 = s1 (callers set it to r1 + (r1 >> 2)); out: x4:x5:x6 = reduced product; writes x10-x14 and flags
++      mul     x12,x4,x7               // h0*r0
++      umulh   x13,x4,x7
++
++      mul     x10,x5,x9               // h1*5*r1
++      umulh   x11,x5,x9
++
++      adds    x12,x12,x10
++      mul     x10,x4,x8               // h0*r1
++      adc     x13,x13,x11
++      umulh   x14,x4,x8
++
++      adds    x13,x13,x10
++      mul     x10,x5,x7               // h1*r0
++      adc     x14,x14,xzr
++      umulh   x11,x5,x7
++
++      adds    x13,x13,x10
++      mul     x10,x6,x9               // h2*5*r1
++      adc     x14,x14,x11
++      mul     x11,x6,x7               // h2*r0
++
++      adds    x13,x13,x10
++      adc     x14,x14,x11             // x12:x13:x14 = product before reduction
++
++      and     x10,x14,#-4             // final reduction
++      and     x6,x14,#3               // low two bits of the top word become the new top limb
++      add     x10,x10,x14,lsr#2       // x10 = (top & ~3) + (top >> 2) = 5 * (top >> 2)
++      adds    x4,x12,x10
++      adcs    x5,x13,xzr
++      adc     x6,x6,xzr
++
++      ret
++.size poly1305_mult,.-poly1305_mult
++
++.type poly1305_splat,%function
++.align        4
++poly1305_splat:                         // in: x4:x5:x6 = value in base 2^64, x0 = destination; stores nine 32-bit words at a 16-byte stride; writes x12-x16
++      and     x12,x4,#0x03ffffff      // base 2^64 -> base 2^26
++      ubfx    x13,x4,#26,#26
++      extr    x14,x5,x4,#52
++      and     x14,x14,#0x03ffffff
++      ubfx    x15,x5,#14,#26
++      extr    x16,x6,x5,#40           // x12..x16 = limbs r0..r4
++
++      str     w12,[x0,#16*0]  // r0
++      add     w12,w13,w13,lsl#2       // r1*5
++      str     w13,[x0,#16*1]  // r1
++      add     w13,w14,w14,lsl#2       // r2*5
++      str     w12,[x0,#16*2]  // s1
++      str     w14,[x0,#16*3]  // r2
++      add     w14,w15,w15,lsl#2       // r3*5
++      str     w13,[x0,#16*4]  // s2
++      str     w15,[x0,#16*5]  // r3
++      add     w15,w16,w16,lsl#2       // r4*5
++      str     w14,[x0,#16*6]  // s3
++      str     w16,[x0,#16*7]  // r4
++      str     w15,[x0,#16*8]  // s4
++
++      ret
++.size poly1305_splat,.-poly1305_splat
++
++#ifdef        __KERNEL__
++.globl        poly1305_blocks_neon
++#endif
++.type poly1305_blocks_neon,%function
++.align        5
++poly1305_blocks_neon:
++.Lpoly1305_blocks_neon:
++      ldr     x17,[x0,#24]
++      cmp     x2,#128
++      b.lo    .Lpoly1305_blocks
++
++      .inst   0xd503233f              // paciasp
++      stp     x29,x30,[sp,#-80]!
++      add     x29,sp,#0
++
++      stp     d8,d9,[sp,#16]          // meet ABI requirements
++      stp     d10,d11,[sp,#32]
++      stp     d12,d13,[sp,#48]
++      stp     d14,d15,[sp,#64]
++
++      cbz     x17,.Lbase2_64_neon
++
++      ldp     w10,w11,[x0]            // load hash value base 2^26
++      ldp     w12,w13,[x0,#8]
++      ldr     w14,[x0,#16]
++
++      tst     x2,#31
++      b.eq    .Leven_neon
++
++      ldp     x7,x8,[x0,#32]  // load key value
++
++      add     x4,x10,x11,lsl#26       // base 2^26 -> base 2^64
++      lsr     x5,x12,#12
++      adds    x4,x4,x12,lsl#52
++      add     x5,x5,x13,lsl#14
++      adc     x5,x5,xzr
++      lsr     x6,x14,#24
++      adds    x5,x5,x14,lsl#40
++      adc     x14,x6,xzr              // can be partially reduced...
++
++      ldp     x12,x13,[x1],#16        // load input
++      sub     x2,x2,#16
++      add     x9,x8,x8,lsr#2  // s1 = r1 + (r1 >> 2)
++
++#ifdef        __AARCH64EB__
++      rev     x12,x12
++      rev     x13,x13
++#endif
++      adds    x4,x4,x12               // accumulate input
++      adcs    x5,x5,x13
++      adc     x6,x6,x3
++
++      bl      poly1305_mult
++
++      and     x10,x4,#0x03ffffff      // base 2^64 -> base 2^26
++      ubfx    x11,x4,#26,#26
++      extr    x12,x5,x4,#52
++      and     x12,x12,#0x03ffffff
++      ubfx    x13,x5,#14,#26
++      extr    x14,x6,x5,#40
++
++      b       .Leven_neon
++
++.align        4
++.Lbase2_64_neon:
++      ldp     x7,x8,[x0,#32]  // load key value
++
++      ldp     x4,x5,[x0]              // load hash value base 2^64
++      ldr     x6,[x0,#16]
++
++      tst     x2,#31
++      b.eq    .Linit_neon
++
++      ldp     x12,x13,[x1],#16        // load input
++      sub     x2,x2,#16
++      add     x9,x8,x8,lsr#2  // s1 = r1 + (r1 >> 2)
++#ifdef        __AARCH64EB__
++      rev     x12,x12
++      rev     x13,x13
++#endif
++      adds    x4,x4,x12               // accumulate input
++      adcs    x5,x5,x13
++      adc     x6,x6,x3
++
++      bl      poly1305_mult
++
++.Linit_neon:
++      ldr     w17,[x0,#48]            // first table element
++      and     x10,x4,#0x03ffffff      // base 2^64 -> base 2^26
++      ubfx    x11,x4,#26,#26
++      extr    x12,x5,x4,#52
++      and     x12,x12,#0x03ffffff
++      ubfx    x13,x5,#14,#26
++      extr    x14,x6,x5,#40
++
++      cmp     w17,#-1                 // is value impossible?
++      b.ne    .Leven_neon
++
++      fmov    d24,x10
++      fmov    d25,x11
++      fmov    d26,x12
++      fmov    d27,x13
++      fmov    d28,x14
++
++      ////////////////////////////////// initialize r^n table
++      mov     x4,x7                   // r^1
++      add     x9,x8,x8,lsr#2  // s1 = r1 + (r1 >> 2)
++      mov     x5,x8
++      mov     x6,xzr
++      add     x0,x0,#48+12
++      bl      poly1305_splat
++
++      bl      poly1305_mult           // r^2
++      sub     x0,x0,#4
++      bl      poly1305_splat
++
++      bl      poly1305_mult           // r^3
++      sub     x0,x0,#4
++      bl      poly1305_splat
++
++      bl      poly1305_mult           // r^4
++      sub     x0,x0,#4
++      bl      poly1305_splat
++      sub     x0,x0,#48               // restore original x0
++      b       .Ldo_neon
++
++.align        4
++.Leven_neon:
++      fmov    d24,x10
++      fmov    d25,x11
++      fmov    d26,x12
++      fmov    d27,x13
++      fmov    d28,x14
++
++.Ldo_neon:
++      ldp     x8,x12,[x1,#32] // inp[2:3]
++      subs    x2,x2,#64
++      ldp     x9,x13,[x1,#48]
++      add     x16,x1,#96
++      adr     x17,.Lzeros
++
++      lsl     x3,x3,#24
++      add     x15,x0,#48
++
++#ifdef        __AARCH64EB__
++      rev     x8,x8
++      rev     x12,x12
++      rev     x9,x9
++      rev     x13,x13
++#endif
++      and     x4,x8,#0x03ffffff       // base 2^64 -> base 2^26
++      and     x5,x9,#0x03ffffff
++      ubfx    x6,x8,#26,#26
++      ubfx    x7,x9,#26,#26
++      add     x4,x4,x5,lsl#32         // bfi  x4,x5,#32,#32
++      extr    x8,x12,x8,#52
++      extr    x9,x13,x9,#52
++      add     x6,x6,x7,lsl#32         // bfi  x6,x7,#32,#32
++      fmov    d14,x4
++      and     x8,x8,#0x03ffffff
++      and     x9,x9,#0x03ffffff
++      ubfx    x10,x12,#14,#26
++      ubfx    x11,x13,#14,#26
++      add     x12,x3,x12,lsr#40
++      add     x13,x3,x13,lsr#40
++      add     x8,x8,x9,lsl#32         // bfi  x8,x9,#32,#32
++      fmov    d15,x6
++      add     x10,x10,x11,lsl#32      // bfi  x10,x11,#32,#32
++      add     x12,x12,x13,lsl#32      // bfi  x12,x13,#32,#32
++      fmov    d16,x8
++      fmov    d17,x10
++      fmov    d18,x12
++
++      ldp     x8,x12,[x1],#16 // inp[0:1]
++      ldp     x9,x13,[x1],#48
++
++      ld1     {v0.4s,v1.4s,v2.4s,v3.4s},[x15],#64
++      ld1     {v4.4s,v5.4s,v6.4s,v7.4s},[x15],#64
++      ld1     {v8.4s},[x15]
++
++#ifdef        __AARCH64EB__
++      rev     x8,x8
++      rev     x12,x12
++      rev     x9,x9
++      rev     x13,x13
++#endif
++      and     x4,x8,#0x03ffffff       // base 2^64 -> base 2^26
++      and     x5,x9,#0x03ffffff
++      ubfx    x6,x8,#26,#26
++      ubfx    x7,x9,#26,#26
++      add     x4,x4,x5,lsl#32         // bfi  x4,x5,#32,#32
++      extr    x8,x12,x8,#52
++      extr    x9,x13,x9,#52
++      add     x6,x6,x7,lsl#32         // bfi  x6,x7,#32,#32
++      fmov    d9,x4
++      and     x8,x8,#0x03ffffff
++      and     x9,x9,#0x03ffffff
++      ubfx    x10,x12,#14,#26
++      ubfx    x11,x13,#14,#26
++      add     x12,x3,x12,lsr#40
++      add     x13,x3,x13,lsr#40
++      add     x8,x8,x9,lsl#32         // bfi  x8,x9,#32,#32
++      fmov    d10,x6
++      add     x10,x10,x11,lsl#32      // bfi  x10,x11,#32,#32
++      add     x12,x12,x13,lsl#32      // bfi  x12,x13,#32,#32
++      movi    v31.2d,#-1
++      fmov    d11,x8
++      fmov    d12,x10
++      fmov    d13,x12
++      ushr    v31.2d,v31.2d,#38
++
++      b.ls    .Lskip_loop
++
++.align        4
++.Loop_neon:
++      ////////////////////////////////////////////////////////////////
++      // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
++      // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
++      //   ___________________/
++      // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
++      // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
++      //   ___________________/ ____________________/
++      //
++      // Note that we start with inp[2:3]*r^2. This is because it
++      // doesn't depend on reduction in previous iteration.
++      ////////////////////////////////////////////////////////////////
++      // d4 = h0*r4 + h1*r3   + h2*r2   + h3*r1   + h4*r0
++      // d3 = h0*r3 + h1*r2   + h2*r1   + h3*r0   + h4*5*r4
++      // d2 = h0*r2 + h1*r1   + h2*r0   + h3*5*r4 + h4*5*r3
++      // d1 = h0*r1 + h1*r0   + h2*5*r4 + h3*5*r3 + h4*5*r2
++      // d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1
++
++      subs    x2,x2,#64
++      umull   v23.2d,v14.2s,v7.s[2]
++      csel    x16,x17,x16,lo
++      umull   v22.2d,v14.2s,v5.s[2]
++      umull   v21.2d,v14.2s,v3.s[2]
++       ldp    x8,x12,[x16],#16        // inp[2:3] (or zero)
++      umull   v20.2d,v14.2s,v1.s[2]
++       ldp    x9,x13,[x16],#48
++      umull   v19.2d,v14.2s,v0.s[2]
++#ifdef        __AARCH64EB__
++       rev    x8,x8
++       rev    x12,x12
++       rev    x9,x9
++       rev    x13,x13
++#endif
++
++      umlal   v23.2d,v15.2s,v5.s[2]
++       and    x4,x8,#0x03ffffff       // base 2^64 -> base 2^26
++      umlal   v22.2d,v15.2s,v3.s[2]
++       and    x5,x9,#0x03ffffff
++      umlal   v21.2d,v15.2s,v1.s[2]
++       ubfx   x6,x8,#26,#26
++      umlal   v20.2d,v15.2s,v0.s[2]
++       ubfx   x7,x9,#26,#26
++      umlal   v19.2d,v15.2s,v8.s[2]
++       add    x4,x4,x5,lsl#32         // bfi  x4,x5,#32,#32
++
++      umlal   v23.2d,v16.2s,v3.s[2]
++       extr   x8,x12,x8,#52
++      umlal   v22.2d,v16.2s,v1.s[2]
++       extr   x9,x13,x9,#52
++      umlal   v21.2d,v16.2s,v0.s[2]
++       add    x6,x6,x7,lsl#32         // bfi  x6,x7,#32,#32
++      umlal   v20.2d,v16.2s,v8.s[2]
++       fmov   d14,x4
++      umlal   v19.2d,v16.2s,v6.s[2]
++       and    x8,x8,#0x03ffffff
++
++      umlal   v23.2d,v17.2s,v1.s[2]
++       and    x9,x9,#0x03ffffff
++      umlal   v22.2d,v17.2s,v0.s[2]
++       ubfx   x10,x12,#14,#26
++      umlal   v21.2d,v17.2s,v8.s[2]
++       ubfx   x11,x13,#14,#26
++      umlal   v20.2d,v17.2s,v6.s[2]
++       add    x8,x8,x9,lsl#32         // bfi  x8,x9,#32,#32
++      umlal   v19.2d,v17.2s,v4.s[2]
++       fmov   d15,x6
++
++      add     v11.2s,v11.2s,v26.2s
++       add    x12,x3,x12,lsr#40
++      umlal   v23.2d,v18.2s,v0.s[2]
++       add    x13,x3,x13,lsr#40
++      umlal   v22.2d,v18.2s,v8.s[2]
++       add    x10,x10,x11,lsl#32      // bfi  x10,x11,#32,#32
++      umlal   v21.2d,v18.2s,v6.s[2]
++       add    x12,x12,x13,lsl#32      // bfi  x12,x13,#32,#32
++      umlal   v20.2d,v18.2s,v4.s[2]
++       fmov   d16,x8
++      umlal   v19.2d,v18.2s,v2.s[2]
++       fmov   d17,x10
++
++      ////////////////////////////////////////////////////////////////
++      // (hash+inp[0:1])*r^4 and accumulate
++
++      add     v9.2s,v9.2s,v24.2s
++       fmov   d18,x12
++      umlal   v22.2d,v11.2s,v1.s[0]
++       ldp    x8,x12,[x1],#16 // inp[0:1]
++      umlal   v19.2d,v11.2s,v6.s[0]
++       ldp    x9,x13,[x1],#48
++      umlal   v23.2d,v11.2s,v3.s[0]
++      umlal   v20.2d,v11.2s,v8.s[0]
++      umlal   v21.2d,v11.2s,v0.s[0]
++#ifdef        __AARCH64EB__
++       rev    x8,x8
++       rev    x12,x12
++       rev    x9,x9
++       rev    x13,x13
++#endif
++
++      add     v10.2s,v10.2s,v25.2s
++      umlal   v22.2d,v9.2s,v5.s[0]
++      umlal   v23.2d,v9.2s,v7.s[0]
++       and    x4,x8,#0x03ffffff       // base 2^64 -> base 2^26
++      umlal   v21.2d,v9.2s,v3.s[0]
++       and    x5,x9,#0x03ffffff
++      umlal   v19.2d,v9.2s,v0.s[0]
++       ubfx   x6,x8,#26,#26
++      umlal   v20.2d,v9.2s,v1.s[0]
++       ubfx   x7,x9,#26,#26
++
++      add     v12.2s,v12.2s,v27.2s
++       add    x4,x4,x5,lsl#32         // bfi  x4,x5,#32,#32
++      umlal   v22.2d,v10.2s,v3.s[0]
++       extr   x8,x12,x8,#52
++      umlal   v23.2d,v10.2s,v5.s[0]
++       extr   x9,x13,x9,#52
++      umlal   v19.2d,v10.2s,v8.s[0]
++       add    x6,x6,x7,lsl#32         // bfi  x6,x7,#32,#32
++      umlal   v21.2d,v10.2s,v1.s[0]
++       fmov   d9,x4
++      umlal   v20.2d,v10.2s,v0.s[0]
++       and    x8,x8,#0x03ffffff
++
++      add     v13.2s,v13.2s,v28.2s
++       and    x9,x9,#0x03ffffff
++      umlal   v22.2d,v12.2s,v0.s[0]
++       ubfx   x10,x12,#14,#26
++      umlal   v19.2d,v12.2s,v4.s[0]
++       ubfx   x11,x13,#14,#26
++      umlal   v23.2d,v12.2s,v1.s[0]
++       add    x8,x8,x9,lsl#32         // bfi  x8,x9,#32,#32
++      umlal   v20.2d,v12.2s,v6.s[0]
++       fmov   d10,x6
++      umlal   v21.2d,v12.2s,v8.s[0]
++       add    x12,x3,x12,lsr#40
++
++      umlal   v22.2d,v13.2s,v8.s[0]
++       add    x13,x3,x13,lsr#40
++      umlal   v19.2d,v13.2s,v2.s[0]
++       add    x10,x10,x11,lsl#32      // bfi  x10,x11,#32,#32
++      umlal   v23.2d,v13.2s,v0.s[0]
++       add    x12,x12,x13,lsl#32      // bfi  x12,x13,#32,#32
++      umlal   v20.2d,v13.2s,v4.s[0]
++       fmov   d11,x8
++      umlal   v21.2d,v13.2s,v6.s[0]
++       fmov   d12,x10
++       fmov   d13,x12
++
++      /////////////////////////////////////////////////////////////////
++      // lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
++      // and P. Schwabe
++      //
++      // [see discussion in poly1305-armv4 module]
++
++      ushr    v29.2d,v22.2d,#26
++      xtn     v27.2s,v22.2d
++       ushr   v30.2d,v19.2d,#26
++       and    v19.16b,v19.16b,v31.16b
++      add     v23.2d,v23.2d,v29.2d    // h3 -> h4
++      bic     v27.2s,#0xfc,lsl#24     // &=0x03ffffff
++       add    v20.2d,v20.2d,v30.2d    // h0 -> h1
++
++      ushr    v29.2d,v23.2d,#26
++      xtn     v28.2s,v23.2d
++       ushr   v30.2d,v20.2d,#26
++       xtn    v25.2s,v20.2d
++      bic     v28.2s,#0xfc,lsl#24
++       add    v21.2d,v21.2d,v30.2d    // h1 -> h2
++
++      add     v19.2d,v19.2d,v29.2d
++      shl     v29.2d,v29.2d,#2
++       shrn   v30.2s,v21.2d,#26
++       xtn    v26.2s,v21.2d
++      add     v19.2d,v19.2d,v29.2d    // h4 -> h0
++       bic    v25.2s,#0xfc,lsl#24
++       add    v27.2s,v27.2s,v30.2s            // h2 -> h3
++       bic    v26.2s,#0xfc,lsl#24
++
++      shrn    v29.2s,v19.2d,#26
++      xtn     v24.2s,v19.2d
++       ushr   v30.2s,v27.2s,#26
++       bic    v27.2s,#0xfc,lsl#24
++       bic    v24.2s,#0xfc,lsl#24
++      add     v25.2s,v25.2s,v29.2s            // h0 -> h1
++       add    v28.2s,v28.2s,v30.2s            // h3 -> h4
++
++      b.hi    .Loop_neon
++
++.Lskip_loop:
++      dup     v16.2d,v16.d[0]
++      add     v11.2s,v11.2s,v26.2s
++
++      ////////////////////////////////////////////////////////////////
++      // multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
++
++      adds    x2,x2,#32
++      b.ne    .Long_tail
++
++      dup     v16.2d,v11.d[0]
++      add     v14.2s,v9.2s,v24.2s
++      add     v17.2s,v12.2s,v27.2s
++      add     v15.2s,v10.2s,v25.2s
++      add     v18.2s,v13.2s,v28.2s
++
++.Long_tail:
++      dup     v14.2d,v14.d[0]
++      umull2  v19.2d,v16.4s,v6.4s
++      umull2  v22.2d,v16.4s,v1.4s
++      umull2  v23.2d,v16.4s,v3.4s
++      umull2  v21.2d,v16.4s,v0.4s
++      umull2  v20.2d,v16.4s,v8.4s
++
++      dup     v15.2d,v15.d[0]
++      umlal2  v19.2d,v14.4s,v0.4s
++      umlal2  v21.2d,v14.4s,v3.4s
++      umlal2  v22.2d,v14.4s,v5.4s
++      umlal2  v23.2d,v14.4s,v7.4s
++      umlal2  v20.2d,v14.4s,v1.4s
++
++      dup     v17.2d,v17.d[0]
++      umlal2  v19.2d,v15.4s,v8.4s
++      umlal2  v22.2d,v15.4s,v3.4s
++      umlal2  v21.2d,v15.4s,v1.4s
++      umlal2  v23.2d,v15.4s,v5.4s
++      umlal2  v20.2d,v15.4s,v0.4s
++
++      dup     v18.2d,v18.d[0]
++      umlal2  v22.2d,v17.4s,v0.4s
++      umlal2  v23.2d,v17.4s,v1.4s
++      umlal2  v19.2d,v17.4s,v4.4s
++      umlal2  v20.2d,v17.4s,v6.4s
++      umlal2  v21.2d,v17.4s,v8.4s
++
++      umlal2  v22.2d,v18.4s,v8.4s
++      umlal2  v19.2d,v18.4s,v2.4s
++      umlal2  v23.2d,v18.4s,v0.4s
++      umlal2  v20.2d,v18.4s,v4.4s
++      umlal2  v21.2d,v18.4s,v6.4s
++
++      b.eq    .Lshort_tail
++
++      ////////////////////////////////////////////////////////////////
++      // (hash+inp[0:1])*r^4:r^3 and accumulate
++
++      add     v9.2s,v9.2s,v24.2s
++      umlal   v22.2d,v11.2s,v1.2s
++      umlal   v19.2d,v11.2s,v6.2s
++      umlal   v23.2d,v11.2s,v3.2s
++      umlal   v20.2d,v11.2s,v8.2s
++      umlal   v21.2d,v11.2s,v0.2s
++
++      add     v10.2s,v10.2s,v25.2s
++      umlal   v22.2d,v9.2s,v5.2s
++      umlal   v19.2d,v9.2s,v0.2s
++      umlal   v23.2d,v9.2s,v7.2s
++      umlal   v20.2d,v9.2s,v1.2s
++      umlal   v21.2d,v9.2s,v3.2s
++
++      add     v12.2s,v12.2s,v27.2s
++      umlal   v22.2d,v10.2s,v3.2s
++      umlal   v19.2d,v10.2s,v8.2s
++      umlal   v23.2d,v10.2s,v5.2s
++      umlal   v20.2d,v10.2s,v0.2s
++      umlal   v21.2d,v10.2s,v1.2s
++
++      add     v13.2s,v13.2s,v28.2s
++      umlal   v22.2d,v12.2s,v0.2s
++      umlal   v19.2d,v12.2s,v4.2s
++      umlal   v23.2d,v12.2s,v1.2s
++      umlal   v20.2d,v12.2s,v6.2s
++      umlal   v21.2d,v12.2s,v8.2s
++
++      umlal   v22.2d,v13.2s,v8.2s
++      umlal   v19.2d,v13.2s,v2.2s
++      umlal   v23.2d,v13.2s,v0.2s
++      umlal   v20.2d,v13.2s,v4.2s
++      umlal   v21.2d,v13.2s,v6.2s
++
++.Lshort_tail:
++      ////////////////////////////////////////////////////////////////
++      // horizontal add
++
++      addp    v22.2d,v22.2d,v22.2d
++       ldp    d8,d9,[sp,#16]          // meet ABI requirements
++      addp    v19.2d,v19.2d,v19.2d
++       ldp    d10,d11,[sp,#32]
++      addp    v23.2d,v23.2d,v23.2d
++       ldp    d12,d13,[sp,#48]
++      addp    v20.2d,v20.2d,v20.2d
++       ldp    d14,d15,[sp,#64]
++      addp    v21.2d,v21.2d,v21.2d
++       ldr    x30,[sp,#8]
++       .inst  0xd50323bf              // autiasp
++
++      ////////////////////////////////////////////////////////////////
++      // lazy reduction, but without narrowing
++
++      ushr    v29.2d,v22.2d,#26
++      and     v22.16b,v22.16b,v31.16b
++       ushr   v30.2d,v19.2d,#26
++       and    v19.16b,v19.16b,v31.16b
++
++      add     v23.2d,v23.2d,v29.2d    // h3 -> h4
++       add    v20.2d,v20.2d,v30.2d    // h0 -> h1
++
++      ushr    v29.2d,v23.2d,#26
++      and     v23.16b,v23.16b,v31.16b
++       ushr   v30.2d,v20.2d,#26
++       and    v20.16b,v20.16b,v31.16b
++       add    v21.2d,v21.2d,v30.2d    // h1 -> h2
++
++      add     v19.2d,v19.2d,v29.2d
++      shl     v29.2d,v29.2d,#2
++       ushr   v30.2d,v21.2d,#26
++       and    v21.16b,v21.16b,v31.16b
++      add     v19.2d,v19.2d,v29.2d    // h4 -> h0
++       add    v22.2d,v22.2d,v30.2d    // h2 -> h3
++
++      ushr    v29.2d,v19.2d,#26
++      and     v19.16b,v19.16b,v31.16b
++       ushr   v30.2d,v22.2d,#26
++       and    v22.16b,v22.16b,v31.16b
++      add     v20.2d,v20.2d,v29.2d    // h0 -> h1
++       add    v23.2d,v23.2d,v30.2d    // h3 -> h4
++
++      ////////////////////////////////////////////////////////////////
++      // write the result, can be partially reduced
++
++      st4     {v19.s,v20.s,v21.s,v22.s}[0],[x0],#16
++      mov     x4,#1
++      st1     {v23.s}[0],[x0]
++      str     x4,[x0,#8]              // set is_base2_26
++
++      ldr     x29,[sp],#80
++      ret
++.size poly1305_blocks_neon,.-poly1305_blocks_neon
++
++.align        5
++.Lzeros:
++.long 0,0,0,0,0,0,0,0
++.asciz        "Poly1305 for ARMv8, CRYPTOGAMS by @dot-asm"
++.align        2
++#if !defined(__KERNEL__) && !defined(_WIN64)
++.comm OPENSSL_armcap_P,4,4
++.hidden       OPENSSL_armcap_P
++#endif
+--- /dev/null
++++ b/arch/arm64/crypto/poly1305-glue.c
+@@ -0,0 +1,237 @@
++// SPDX-License-Identifier: GPL-2.0
++/*
++ * OpenSSL/Cryptogams accelerated Poly1305 transform for arm64
++ *
++ * Copyright (C) 2019 Linaro Ltd. <ard.biesheuvel@linaro.org>
++ */
++
++#include <asm/hwcap.h>
++#include <asm/neon.h>
++#include <asm/simd.h>
++#include <asm/unaligned.h>
++#include <crypto/algapi.h>
++#include <crypto/internal/hash.h>
++#include <crypto/internal/poly1305.h>
++#include <crypto/internal/simd.h>
++#include <linux/cpufeature.h>
++#include <linux/crypto.h>
++#include <linux/jump_label.h>
++#include <linux/module.h>
++
++asmlinkage void poly1305_init_arm64(void *state, const u8 *key);
++asmlinkage void poly1305_blocks(void *state, const u8 *src, u32 len, u32 hibit);
++asmlinkage void poly1305_blocks_neon(void *state, const u8 *src, u32 len, u32 hibit);
++asmlinkage void poly1305_emit(void *state, __le32 *digest, const u32 *nonce);
++
++static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
++
++void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 *key)
++{     /* Library-API init: takes one key buffer and reads it through key[31]. */
++      poly1305_init_arm64(&dctx->h, key);     /* asm routine gets the state and the key */
++      dctx->s[0] = get_unaligned_le32(key + 16);      /* s[] = key[16..31] as LE words */
++      dctx->s[1] = get_unaligned_le32(key + 20);
++      dctx->s[2] = get_unaligned_le32(key + 24);
++      dctx->s[3] = get_unaligned_le32(key + 28);
++      dctx->buflen = 0;       /* no partial block pending */
++}
++EXPORT_SYMBOL(poly1305_init_arch);
++
++static int neon_poly1305_init(struct shash_desc *desc)
++{     /* shash .init: start a new message with no key material loaded yet. */
++      struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
++
++      dctx->buflen = 0;       /* nothing buffered */
++      dctx->rset = 0; /* r not yet taken from the data stream */
++      dctx->sset = false;     /* s not yet taken; neon_poly1305_final() gives -ENOKEY until set */
++
++      return 0;
++}
++
++static void neon_poly1305_blocks(struct poly1305_desc_ctx *dctx, const u8 *src,
++                               u32 len, u32 hibit, bool do_neon)
++{     /* shash block step; both callers pass len >= POLY1305_BLOCK_SIZE. */
++      if (unlikely(!dctx->sset)) {    /* key still arrives in-band: r block, then s block */
++              if (!dctx->rset) {
++                      poly1305_init_arm64(&dctx->h, src);     /* r only; s is loaded below */
++                      src += POLY1305_BLOCK_SIZE;
++                      len -= POLY1305_BLOCK_SIZE;
++                      dctx->rset = 1;
++              }
++              if (len >= POLY1305_BLOCK_SIZE) {       /* else s arrives in a later call */
++                      dctx->s[0] = get_unaligned_le32(src +  0);
++                      dctx->s[1] = get_unaligned_le32(src +  4);
++                      dctx->s[2] = get_unaligned_le32(src +  8);
++                      dctx->s[3] = get_unaligned_le32(src + 12);
++                      src += POLY1305_BLOCK_SIZE;
++                      len -= POLY1305_BLOCK_SIZE;
++                      dctx->sset = true;
++              }
++              if (len < POLY1305_BLOCK_SIZE)  /* no whole data block left after the key */
++                      return;
++      }
++
++      len &= ~(POLY1305_BLOCK_SIZE - 1);      /* whole blocks only; caller buffers the tail */
++
++      if (static_branch_likely(&have_neon) && likely(do_neon))
++              poly1305_blocks_neon(&dctx->h, src, len, hibit);
++      else
++              poly1305_blocks(&dctx->h, src, len, hibit);
++}
++
++static void neon_poly1305_do_update(struct poly1305_desc_ctx *dctx,
++                                  const u8 *src, u32 len, bool do_neon)
++{     /* Feed len bytes: top up the partial block, run whole blocks, stash the tail. */
++      if (unlikely(dctx->buflen)) {
++              u32 bytes = min(len, POLY1305_BLOCK_SIZE - dctx->buflen);
++
++              memcpy(dctx->buf + dctx->buflen, src, bytes);
++              src += bytes;
++              len -= bytes;
++              dctx->buflen += bytes;
++
++              if (dctx->buflen == POLY1305_BLOCK_SIZE) {      /* buffered block complete: flush it */
++                      neon_poly1305_blocks(dctx, dctx->buf,
++                                           POLY1305_BLOCK_SIZE, 1, false);
++                      dctx->buflen = 0;
++              }
++      }
++
++      if (likely(len >= POLY1305_BLOCK_SIZE)) {
++              neon_poly1305_blocks(dctx, src, len, 1, do_neon);       /* callee rounds len down */
++              src += round_down(len, POLY1305_BLOCK_SIZE);
++              len %= POLY1305_BLOCK_SIZE;
++      }
++
++      if (unlikely(len)) {    /* keep the sub-block remainder for the next call */
++              dctx->buflen = len;
++              memcpy(dctx->buf, src, len);
++      }
++}
++
++static int neon_poly1305_update(struct shash_desc *desc,
++                              const u8 *src, unsigned int srclen)
++{     /* shash .update: request NEON only if SIMD is usable and srclen > 128. */
++      bool do_neon = crypto_simd_usable() && srclen > 128;
++      struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
++
++      if (static_branch_likely(&have_neon) && do_neon)
++              kernel_neon_begin();
++      neon_poly1305_do_update(dctx, src, srclen, do_neon);
++      if (static_branch_likely(&have_neon) && do_neon)        /* same test as above, so begin/end pair up */
++              kernel_neon_end();
++      return 0;
++}
++
++void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src,
++                        unsigned int nbytes)
++{     /* Library-API update: every byte of src is message data (no in-band key). */
++      if (unlikely(dctx->buflen)) {
++              u32 bytes = min(nbytes, POLY1305_BLOCK_SIZE - dctx->buflen);
++
++              memcpy(dctx->buf + dctx->buflen, src, bytes);
++              src += bytes;
++              nbytes -= bytes;
++              dctx->buflen += bytes;
++
++              if (dctx->buflen == POLY1305_BLOCK_SIZE) {      /* buffered block complete: flush it */
++                      poly1305_blocks(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 1);
++                      dctx->buflen = 0;
++              }
++      }
++
++      if (likely(nbytes >= POLY1305_BLOCK_SIZE)) {
++              unsigned int len = round_down(nbytes, POLY1305_BLOCK_SIZE);
++
++              if (static_branch_likely(&have_neon) && crypto_simd_usable()) {
++                      kernel_neon_begin();    /* NOTE(review): len is unbounded, all of it runs in this one section */
++                      poly1305_blocks_neon(&dctx->h, src, len, 1);
++                      kernel_neon_end();
++              } else {
++                      poly1305_blocks(&dctx->h, src, len, 1);
++              }
++              src += len;
++              nbytes %= POLY1305_BLOCK_SIZE;
++      }
++
++      if (unlikely(nbytes)) { /* keep the sub-block remainder for the next call */
++              dctx->buflen = nbytes;
++              memcpy(dctx->buf, src, nbytes);
++      }
++}
++EXPORT_SYMBOL(poly1305_update_arch);
++
++void poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst)
++{     /* Pad and absorb any tail, write the 16-byte tag to dst, then zero *dctx. */
++      __le32 digest[4];
++      u64 f = 0;
++
++      if (unlikely(dctx->buflen)) {
++              dctx->buf[dctx->buflen++] = 1;  /* 0x01 terminator byte, then zero fill */
++              memset(dctx->buf + dctx->buflen, 0,
++                     POLY1305_BLOCK_SIZE - dctx->buflen);
++              poly1305_blocks(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 0);   /* hibit 0 for the padded block */
++      }
++
++      poly1305_emit(&dctx->h, digest, dctx->s);       /* s goes in as the asm routine's nonce argument */
++
++      /* mac = (h + s) % (2^128) -- NOTE(review): presumably done inside poly1305_emit; confirm */
++      f = (f >> 32) + le32_to_cpu(digest[0]); /* f >> 32 is always 0 here: this chain only copies digest */
++      put_unaligned_le32(f, dst);
++      f = (f >> 32) + le32_to_cpu(digest[1]);
++      put_unaligned_le32(f, dst + 4);
++      f = (f >> 32) + le32_to_cpu(digest[2]);
++      put_unaligned_le32(f, dst + 8);
++      f = (f >> 32) + le32_to_cpu(digest[3]);
++      put_unaligned_le32(f, dst + 12);
++
++      *dctx = (struct poly1305_desc_ctx){};   /* clear key and hash state after use */
++}
++EXPORT_SYMBOL(poly1305_final_arch);
++
++static int neon_poly1305_final(struct shash_desc *desc, u8 *dst)
++{     /* shash .final: emit the tag, or -ENOKEY if s was never taken from the data. */
++      struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
++
++      if (unlikely(!dctx->sset))
++              return -ENOKEY;
++
++      poly1305_final_arch(dctx, dst); /* also zeroes *dctx */
++      return 0;
++}
++
++static struct shash_alg neon_poly1305_alg = {  /* no .setkey: neon_poly1305_blocks() takes the key from the data */
++      .init                   = neon_poly1305_init,
++      .update                 = neon_poly1305_update,
++      .final                  = neon_poly1305_final,
++      .digestsize             = POLY1305_DIGEST_SIZE,
++      .descsize               = sizeof(struct poly1305_desc_ctx),     /* desc ctx is used directly as the dctx */
++
++      .base.cra_name          = "poly1305",
++      .base.cra_driver_name   = "poly1305-neon",
++      .base.cra_priority      = 200,
++      .base.cra_blocksize     = POLY1305_BLOCK_SIZE,
++      .base.cra_module        = THIS_MODULE,
++};
++
++static int __init neon_poly1305_mod_init(void)
++{     /* Without ASIMD: succeed, but register nothing and leave have_neon off. */
++      if (!cpu_have_named_feature(ASIMD))
++              return 0;
++
++      static_branch_enable(&have_neon);       /* turns on the poly1305_blocks_neon() branches */
++
++      return crypto_register_shash(&neon_poly1305_alg);
++}
++
++static void __exit neon_poly1305_mod_exit(void)
++{     /* Mirrors init: the shash was registered only when ASIMD is present. */
++      if (cpu_have_named_feature(ASIMD))
++              crypto_unregister_shash(&neon_poly1305_alg);
++}
++
++module_init(neon_poly1305_mod_init);
++module_exit(neon_poly1305_mod_exit);
++
++MODULE_LICENSE("GPL v2");
++MODULE_ALIAS_CRYPTO("poly1305");
++MODULE_ALIAS_CRYPTO("poly1305-neon");
+--- a/lib/crypto/Kconfig
++++ b/lib/crypto/Kconfig
+@@ -40,6 +40,7 @@ config CRYPTO_LIB_DES
+ config CRYPTO_LIB_POLY1305_RSIZE
+       int
+       default 4 if X86_64
++      default 9 if ARM64
+       default 1
+ 
+ config CRYPTO_ARCH_HAVE_LIB_POLY1305
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0019-crypto-arm-poly1305-incorporate-OpenSSL-CRYPTOGAMS-N.patch b/target/linux/generic/backport-5.4/080-wireguard-0019-crypto-arm-poly1305-incorporate-OpenSSL-CRYPTOGAMS-N.patch

new file mode 100644 (file)

index 0000000..d48235c
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0019-crypto-arm-poly1305-incorporate-OpenSSL-CRYPTOGAMS-N.patch
@@ -0,0 +1,2776 @@
+From 588765ccad76f9f65f09e1dcadc464d22441c889 Mon Sep 17 00:00:00 2001
+From: Ard Biesheuvel <ardb@kernel.org>
+Date: Fri, 8 Nov 2019 13:22:25 +0100
+Subject: [PATCH 019/124] crypto: arm/poly1305 - incorporate OpenSSL/CRYPTOGAMS
+ NEON implementation
+
+commit a6b803b3ddc793d6db0c16f12fc12d30d20fa9cc upstream.
+
+This is a straight import of the OpenSSL/CRYPTOGAMS Poly1305 implementation
+for NEON authored by Andy Polyakov, and contributed by him to the OpenSSL
+project. The file 'poly1305-armv4.pl' is taken straight from this upstream
+GitHub repository [0] at commit ec55a08dc0244ce570c4fc7cade330c60798952f,
+and already contains all the changes required to build it as part of a
+Linux kernel module.
+
+[0] https://github.com/dot-asm/cryptogams
+
+Co-developed-by: Andy Polyakov <appro@cryptogams.org>
+Signed-off-by: Andy Polyakov <appro@cryptogams.org>
+Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
+Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ arch/arm/crypto/Kconfig                 |    5 +
+ arch/arm/crypto/Makefile                |   12 +-
+ arch/arm/crypto/poly1305-armv4.pl       | 1236 +++++++++++++++++++++++
+ arch/arm/crypto/poly1305-core.S_shipped | 1158 +++++++++++++++++++++
+ arch/arm/crypto/poly1305-glue.c         |  276 +++++
+ lib/crypto/Kconfig                      |    2 +-
+ 6 files changed, 2687 insertions(+), 2 deletions(-)
+ create mode 100644 arch/arm/crypto/poly1305-armv4.pl
+ create mode 100644 arch/arm/crypto/poly1305-core.S_shipped
+ create mode 100644 arch/arm/crypto/poly1305-glue.c
+
+--- a/arch/arm/crypto/Kconfig
++++ b/arch/arm/crypto/Kconfig
+@@ -131,6 +131,11 @@ config CRYPTO_CHACHA20_NEON
+       select CRYPTO_BLKCIPHER
+       select CRYPTO_ARCH_HAVE_LIB_CHACHA
+ 
++config CRYPTO_POLY1305_ARM
++      tristate "Accelerated scalar and SIMD Poly1305 hash implementations"
++      select CRYPTO_HASH
++      select CRYPTO_ARCH_HAVE_LIB_POLY1305
++
+ config CRYPTO_NHPOLY1305_NEON
+       tristate "NEON accelerated NHPoly1305 hash function (for Adiantum)"
+       depends on KERNEL_MODE_NEON
+--- a/arch/arm/crypto/Makefile
++++ b/arch/arm/crypto/Makefile
+@@ -10,6 +10,7 @@ obj-$(CONFIG_CRYPTO_SHA1_ARM_NEON) += sh
+ obj-$(CONFIG_CRYPTO_SHA256_ARM) += sha256-arm.o
+ obj-$(CONFIG_CRYPTO_SHA512_ARM) += sha512-arm.o
+ obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o
++obj-$(CONFIG_CRYPTO_POLY1305_ARM) += poly1305-arm.o
+ obj-$(CONFIG_CRYPTO_NHPOLY1305_NEON) += nhpoly1305-neon.o
+ 
+ ce-obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o
+@@ -55,12 +56,16 @@ crct10dif-arm-ce-y := crct10dif-ce-core.
+ crc32-arm-ce-y:= crc32-ce-core.o crc32-ce-glue.o
+ chacha-neon-y := chacha-scalar-core.o chacha-glue.o
+ chacha-neon-$(CONFIG_KERNEL_MODE_NEON) += chacha-neon-core.o
++poly1305-arm-y := poly1305-core.o poly1305-glue.o
+ nhpoly1305-neon-y := nh-neon-core.o nhpoly1305-neon-glue.o
+ 
+ ifdef REGENERATE_ARM_CRYPTO
+ quiet_cmd_perl = PERL    $@
+       cmd_perl = $(PERL) $(<) > $(@)
+ 
++$(src)/poly1305-core.S_shipped: $(src)/poly1305-armv4.pl
++      $(call cmd,perl)
++
+ $(src)/sha256-core.S_shipped: $(src)/sha256-armv4.pl
+       $(call cmd,perl)
+ 
+@@ -68,4 +73,9 @@ $(src)/sha512-core.S_shipped: $(src)/sha
+       $(call cmd,perl)
+ endif
+ 
+-clean-files += sha256-core.S sha512-core.S
++clean-files += poly1305-core.S sha256-core.S sha512-core.S
++
++# massage the perlasm code a bit so we only get the NEON routine if we need it
++poly1305-aflags-$(CONFIG_CPU_V7) := -U__LINUX_ARM_ARCH__ -D__LINUX_ARM_ARCH__=5
++poly1305-aflags-$(CONFIG_KERNEL_MODE_NEON) := -U__LINUX_ARM_ARCH__ -D__LINUX_ARM_ARCH__=7
++AFLAGS_poly1305-core.o += $(poly1305-aflags-y)
+--- /dev/null
++++ b/arch/arm/crypto/poly1305-armv4.pl
+@@ -0,0 +1,1236 @@
++#!/usr/bin/env perl
++# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
++#
++# ====================================================================
++# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL
++# project.
++# ====================================================================
++#
++#                     IALU(*)/gcc-4.4         NEON
++#
++# ARM11xx(ARMv6)      7.78/+100%              -
++# Cortex-A5           6.35/+130%              3.00
++# Cortex-A8           6.25/+115%              2.36
++# Cortex-A9           5.10/+95%               2.55
++# Cortex-A15          3.85/+85%               1.25(**)
++# Snapdragon S4               5.70/+100%              1.48(**)
++#
++# (*) this is for -march=armv6, i.e. with a bunch of ldrb loading data;
++# (**)        these are trade-off results, they can be improved by ~8% but at
++#     the cost of 15/12% regression on Cortex-A5/A7, it's even possible
++#     to improve Cortex-A9 result, but then A5/A7 lose more than 20%;
++
++$flavour = shift;
++if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
++else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
++
++if ($flavour && $flavour ne "void") {
++    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
++    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
++    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
++    die "can't locate arm-xlate.pl";
++
++    open STDOUT,"| \"$^X\" $xlate $flavour $output";
++} else {
++    open STDOUT,">$output";
++}
++
++($ctx,$inp,$len,$padbit)=map("r$_",(0..3));
++
++$code.=<<___;
++#ifndef       __KERNEL__
++# include "arm_arch.h"
++#else
++# define __ARM_ARCH__ __LINUX_ARM_ARCH__
++# define __ARM_MAX_ARCH__ __LINUX_ARM_ARCH__
++# define poly1305_init   poly1305_init_arm
++# define poly1305_blocks poly1305_blocks_arm
++# define poly1305_emit   poly1305_emit_arm
++.globl        poly1305_blocks_neon
++#endif
++
++#if defined(__thumb2__)
++.syntax       unified
++.thumb
++#else
++.code 32
++#endif
++
++.text
++
++.globl        poly1305_emit
++.globl        poly1305_blocks
++.globl        poly1305_init
++.type poly1305_init,%function
++.align        5
++poly1305_init:
++.Lpoly1305_init:
++      stmdb   sp!,{r4-r11}
++
++      eor     r3,r3,r3
++      cmp     $inp,#0
++      str     r3,[$ctx,#0]            @ zero hash value
++      str     r3,[$ctx,#4]
++      str     r3,[$ctx,#8]
++      str     r3,[$ctx,#12]
++      str     r3,[$ctx,#16]
++      str     r3,[$ctx,#36]           @ clear is_base2_26
++      add     $ctx,$ctx,#20
++
++#ifdef        __thumb2__
++      it      eq
++#endif
++      moveq   r0,#0
++      beq     .Lno_key
++
++#if   __ARM_MAX_ARCH__>=7
++      mov     r3,#-1
++      str     r3,[$ctx,#28]           @ impossible key power value
++# ifndef __KERNEL__
++      adr     r11,.Lpoly1305_init
++      ldr     r12,.LOPENSSL_armcap
++# endif
++#endif
++      ldrb    r4,[$inp,#0]
++      mov     r10,#0x0fffffff
++      ldrb    r5,[$inp,#1]
++      and     r3,r10,#-4              @ 0x0ffffffc
++      ldrb    r6,[$inp,#2]
++      ldrb    r7,[$inp,#3]
++      orr     r4,r4,r5,lsl#8
++      ldrb    r5,[$inp,#4]
++      orr     r4,r4,r6,lsl#16
++      ldrb    r6,[$inp,#5]
++      orr     r4,r4,r7,lsl#24
++      ldrb    r7,[$inp,#6]
++      and     r4,r4,r10
++
++#if   __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
++# if !defined(_WIN32)
++      ldr     r12,[r11,r12]           @ OPENSSL_armcap_P
++# endif
++# if defined(__APPLE__) || defined(_WIN32)
++      ldr     r12,[r12]
++# endif
++#endif
++      ldrb    r8,[$inp,#7]
++      orr     r5,r5,r6,lsl#8
++      ldrb    r6,[$inp,#8]
++      orr     r5,r5,r7,lsl#16
++      ldrb    r7,[$inp,#9]
++      orr     r5,r5,r8,lsl#24
++      ldrb    r8,[$inp,#10]
++      and     r5,r5,r3
++
++#if   __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
++      tst     r12,#ARMV7_NEON         @ check for NEON
++# ifdef       __thumb2__
++      adr     r9,.Lpoly1305_blocks_neon
++      adr     r11,.Lpoly1305_blocks
++      it      ne
++      movne   r11,r9
++      adr     r12,.Lpoly1305_emit
++      orr     r11,r11,#1              @ thumb-ify addresses
++      orr     r12,r12,#1
++# else
++      add     r12,r11,#(.Lpoly1305_emit-.Lpoly1305_init)
++      ite     eq
++      addeq   r11,r11,#(.Lpoly1305_blocks-.Lpoly1305_init)
++      addne   r11,r11,#(.Lpoly1305_blocks_neon-.Lpoly1305_init)
++# endif
++#endif
++      ldrb    r9,[$inp,#11]
++      orr     r6,r6,r7,lsl#8
++      ldrb    r7,[$inp,#12]
++      orr     r6,r6,r8,lsl#16
++      ldrb    r8,[$inp,#13]
++      orr     r6,r6,r9,lsl#24
++      ldrb    r9,[$inp,#14]
++      and     r6,r6,r3
++
++      ldrb    r10,[$inp,#15]
++      orr     r7,r7,r8,lsl#8
++      str     r4,[$ctx,#0]
++      orr     r7,r7,r9,lsl#16
++      str     r5,[$ctx,#4]
++      orr     r7,r7,r10,lsl#24
++      str     r6,[$ctx,#8]
++      and     r7,r7,r3
++      str     r7,[$ctx,#12]
++#if   __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
++      stmia   r2,{r11,r12}            @ fill functions table
++      mov     r0,#1
++#else
++      mov     r0,#0
++#endif
++.Lno_key:
++      ldmia   sp!,{r4-r11}
++#if   __ARM_ARCH__>=5
++      ret                             @ bx    lr
++#else
++      tst     lr,#1
++      moveq   pc,lr                   @ be binary compatible with V4, yet
++      bx      lr                      @ interoperable with Thumb ISA:-)
++#endif
++.size poly1305_init,.-poly1305_init
++___
++{
++my ($h0,$h1,$h2,$h3,$h4,$r0,$r1,$r2,$r3)=map("r$_",(4..12));
++my ($s1,$s2,$s3)=($r1,$r2,$r3);
++
++$code.=<<___;
++.type poly1305_blocks,%function
++.align        5
++poly1305_blocks:
++.Lpoly1305_blocks:
++      stmdb   sp!,{r3-r11,lr}
++
++      ands    $len,$len,#-16
++      beq     .Lno_data
++
++      add     $len,$len,$inp          @ end pointer
++      sub     sp,sp,#32
++
++#if __ARM_ARCH__<7
++      ldmia   $ctx,{$h0-$r3}          @ load context
++      add     $ctx,$ctx,#20
++      str     $len,[sp,#16]           @ offload stuff
++      str     $ctx,[sp,#12]
++#else
++      ldr     lr,[$ctx,#36]           @ is_base2_26
++      ldmia   $ctx!,{$h0-$h4}         @ load hash value
++      str     $len,[sp,#16]           @ offload stuff
++      str     $ctx,[sp,#12]
++
++      adds    $r0,$h0,$h1,lsl#26      @ base 2^26 -> base 2^32
++      mov     $r1,$h1,lsr#6
++      adcs    $r1,$r1,$h2,lsl#20
++      mov     $r2,$h2,lsr#12
++      adcs    $r2,$r2,$h3,lsl#14
++      mov     $r3,$h3,lsr#18
++      adcs    $r3,$r3,$h4,lsl#8
++      mov     $len,#0
++      teq     lr,#0
++      str     $len,[$ctx,#16]         @ clear is_base2_26
++      adc     $len,$len,$h4,lsr#24
++
++      itttt   ne
++      movne   $h0,$r0                 @ choose between radixes
++      movne   $h1,$r1
++      movne   $h2,$r2
++      movne   $h3,$r3
++      ldmia   $ctx,{$r0-$r3}          @ load key
++      it      ne
++      movne   $h4,$len
++#endif
++
++      mov     lr,$inp
++      cmp     $padbit,#0
++      str     $r1,[sp,#20]
++      str     $r2,[sp,#24]
++      str     $r3,[sp,#28]
++      b       .Loop
++
++.align        4
++.Loop:
++#if __ARM_ARCH__<7
++      ldrb    r0,[lr],#16             @ load input
++# ifdef       __thumb2__
++      it      hi
++# endif
++      addhi   $h4,$h4,#1              @ 1<<128
++      ldrb    r1,[lr,#-15]
++      ldrb    r2,[lr,#-14]
++      ldrb    r3,[lr,#-13]
++      orr     r1,r0,r1,lsl#8
++      ldrb    r0,[lr,#-12]
++      orr     r2,r1,r2,lsl#16
++      ldrb    r1,[lr,#-11]
++      orr     r3,r2,r3,lsl#24
++      ldrb    r2,[lr,#-10]
++      adds    $h0,$h0,r3              @ accumulate input
++
++      ldrb    r3,[lr,#-9]
++      orr     r1,r0,r1,lsl#8
++      ldrb    r0,[lr,#-8]
++      orr     r2,r1,r2,lsl#16
++      ldrb    r1,[lr,#-7]
++      orr     r3,r2,r3,lsl#24
++      ldrb    r2,[lr,#-6]
++      adcs    $h1,$h1,r3
++
++      ldrb    r3,[lr,#-5]
++      orr     r1,r0,r1,lsl#8
++      ldrb    r0,[lr,#-4]
++      orr     r2,r1,r2,lsl#16
++      ldrb    r1,[lr,#-3]
++      orr     r3,r2,r3,lsl#24
++      ldrb    r2,[lr,#-2]
++      adcs    $h2,$h2,r3
++
++      ldrb    r3,[lr,#-1]
++      orr     r1,r0,r1,lsl#8
++      str     lr,[sp,#8]              @ offload input pointer
++      orr     r2,r1,r2,lsl#16
++      add     $s1,$r1,$r1,lsr#2
++      orr     r3,r2,r3,lsl#24
++#else
++      ldr     r0,[lr],#16             @ load input
++      it      hi
++      addhi   $h4,$h4,#1              @ padbit
++      ldr     r1,[lr,#-12]
++      ldr     r2,[lr,#-8]
++      ldr     r3,[lr,#-4]
++# ifdef       __ARMEB__
++      rev     r0,r0
++      rev     r1,r1
++      rev     r2,r2
++      rev     r3,r3
++# endif
++      adds    $h0,$h0,r0              @ accumulate input
++      str     lr,[sp,#8]              @ offload input pointer
++      adcs    $h1,$h1,r1
++      add     $s1,$r1,$r1,lsr#2
++      adcs    $h2,$h2,r2
++#endif
++      add     $s2,$r2,$r2,lsr#2
++      adcs    $h3,$h3,r3
++      add     $s3,$r3,$r3,lsr#2
++
++      umull   r2,r3,$h1,$r0
++       adc    $h4,$h4,#0
++      umull   r0,r1,$h0,$r0
++      umlal   r2,r3,$h4,$s1
++      umlal   r0,r1,$h3,$s1
++      ldr     $r1,[sp,#20]            @ reload $r1
++      umlal   r2,r3,$h2,$s3
++      umlal   r0,r1,$h1,$s3
++      umlal   r2,r3,$h3,$s2
++      umlal   r0,r1,$h2,$s2
++      umlal   r2,r3,$h0,$r1
++      str     r0,[sp,#0]              @ future $h0
++       mul    r0,$s2,$h4
++      ldr     $r2,[sp,#24]            @ reload $r2
++      adds    r2,r2,r1                @ d1+=d0>>32
++       eor    r1,r1,r1
++      adc     lr,r3,#0                @ future $h2
++      str     r2,[sp,#4]              @ future $h1
++
++      mul     r2,$s3,$h4
++      eor     r3,r3,r3
++      umlal   r0,r1,$h3,$s3
++      ldr     $r3,[sp,#28]            @ reload $r3
++      umlal   r2,r3,$h3,$r0
++      umlal   r0,r1,$h2,$r0
++      umlal   r2,r3,$h2,$r1
++      umlal   r0,r1,$h1,$r1
++      umlal   r2,r3,$h1,$r2
++      umlal   r0,r1,$h0,$r2
++      umlal   r2,r3,$h0,$r3
++      ldr     $h0,[sp,#0]
++      mul     $h4,$r0,$h4
++      ldr     $h1,[sp,#4]
++
++      adds    $h2,lr,r0               @ d2+=d1>>32
++      ldr     lr,[sp,#8]              @ reload input pointer
++      adc     r1,r1,#0
++      adds    $h3,r2,r1               @ d3+=d2>>32
++      ldr     r0,[sp,#16]             @ reload end pointer
++      adc     r3,r3,#0
++      add     $h4,$h4,r3              @ h4+=d3>>32
++
++      and     r1,$h4,#-4
++      and     $h4,$h4,#3
++      add     r1,r1,r1,lsr#2          @ *=5
++      adds    $h0,$h0,r1
++      adcs    $h1,$h1,#0
++      adcs    $h2,$h2,#0
++      adcs    $h3,$h3,#0
++      adc     $h4,$h4,#0
++
++      cmp     r0,lr                   @ done yet?
++      bhi     .Loop
++
++      ldr     $ctx,[sp,#12]
++      add     sp,sp,#32
++      stmdb   $ctx,{$h0-$h4}          @ store the result
++
++.Lno_data:
++#if   __ARM_ARCH__>=5
++      ldmia   sp!,{r3-r11,pc}
++#else
++      ldmia   sp!,{r3-r11,lr}
++      tst     lr,#1
++      moveq   pc,lr                   @ be binary compatible with V4, yet
++      bx      lr                      @ interoperable with Thumb ISA:-)
++#endif
++.size poly1305_blocks,.-poly1305_blocks
++___
++}
++{
++my ($ctx,$mac,$nonce)=map("r$_",(0..2));      # r0-r2
++my ($h0,$h1,$h2,$h3,$h4,$g0,$g1,$g2,$g3)=map("r$_",(3..11));  # r3-r11
++my $g4=$ctx;  # g4 reuses the register of $ctx (r0)
++
++$code.=<<___;
++.type poly1305_emit,%function
++.align        5
++poly1305_emit:
++.Lpoly1305_emit:
++      stmdb   sp!,{r4-r11}            @ saved here, reloaded at the end
++
++      ldmia   $ctx,{$h0-$h4}          @ load the five hash words
++
++#if __ARM_ARCH__>=7
++      ldr     ip,[$ctx,#36]           @ is_base2_26
++
++      adds    $g0,$h0,$h1,lsl#26      @ base 2^26 -> base 2^32
++      mov     $g1,$h1,lsr#6
++      adcs    $g1,$g1,$h2,lsl#20
++      mov     $g2,$h2,lsr#12
++      adcs    $g2,$g2,$h3,lsl#14
++      mov     $g3,$h3,lsr#18
++      adcs    $g3,$g3,$h4,lsl#8
++      mov     $g4,#0                  @ g4 shares its register with ctx, ctx is not read below
++      adc     $g4,$g4,$h4,lsr#24
++
++      tst     ip,ip                   @ is_base2_26?
++      itttt   ne
++      movne   $h0,$g0                 @ yes: use the base 2^32 form computed above
++      movne   $h1,$g1
++      movne   $h2,$g2
++      movne   $h3,$g3
++      it      ne
++      movne   $h4,$g4
++#endif
++
++      adds    $g0,$h0,#5              @ compare to modulus
++      adcs    $g1,$h1,#0
++      adcs    $g2,$h2,#0
++      adcs    $g3,$h3,#0
++      adc     $g4,$h4,#0
++      tst     $g4,#4                  @ did it carry/borrow?
++
++#ifdef        __thumb2__
++      it      ne
++#endif
++      movne   $h0,$g0                 @ bit 2 of the top word set: take h+5 in place of h
++      ldr     $g0,[$nonce,#0]         @ g0-g3 are reused for the nonce words
++#ifdef        __thumb2__
++      it      ne
++#endif
++      movne   $h1,$g1
++      ldr     $g1,[$nonce,#4]
++#ifdef        __thumb2__
++      it      ne
++#endif
++      movne   $h2,$g2
++      ldr     $g2,[$nonce,#8]
++#ifdef        __thumb2__
++      it      ne
++#endif
++      movne   $h3,$g3
++      ldr     $g3,[$nonce,#12]
++
++      adds    $h0,$h0,$g0             @ accumulate nonce
++      adcs    $h1,$h1,$g1
++      adcs    $h2,$h2,$g2
++      adc     $h3,$h3,$g3             @ carry out of the top word is dropped
++
++#if __ARM_ARCH__>=7
++# ifdef __ARMEB__
++      rev     $h0,$h0
++      rev     $h1,$h1
++      rev     $h2,$h2
++      rev     $h3,$h3
++# endif
++      str     $h0,[$mac,#0]
++      str     $h1,[$mac,#4]
++      str     $h2,[$mac,#8]
++      str     $h3,[$mac,#12]
++#else
++      strb    $h0,[$mac,#0]           @ byte stores, least significant byte first
++      mov     $h0,$h0,lsr#8
++      strb    $h1,[$mac,#4]
++      mov     $h1,$h1,lsr#8
++      strb    $h2,[$mac,#8]
++      mov     $h2,$h2,lsr#8
++      strb    $h3,[$mac,#12]
++      mov     $h3,$h3,lsr#8
++
++      strb    $h0,[$mac,#1]
++      mov     $h0,$h0,lsr#8
++      strb    $h1,[$mac,#5]
++      mov     $h1,$h1,lsr#8
++      strb    $h2,[$mac,#9]
++      mov     $h2,$h2,lsr#8
++      strb    $h3,[$mac,#13]
++      mov     $h3,$h3,lsr#8
++
++      strb    $h0,[$mac,#2]
++      mov     $h0,$h0,lsr#8
++      strb    $h1,[$mac,#6]
++      mov     $h1,$h1,lsr#8
++      strb    $h2,[$mac,#10]
++      mov     $h2,$h2,lsr#8
++      strb    $h3,[$mac,#14]
++      mov     $h3,$h3,lsr#8
++
++      strb    $h0,[$mac,#3]
++      strb    $h1,[$mac,#7]
++      strb    $h2,[$mac,#11]
++      strb    $h3,[$mac,#15]
++#endif
++      ldmia   sp!,{r4-r11}
++#if   __ARM_ARCH__>=5
++      ret                             @ bx    lr
++#else
++      tst     lr,#1
++      moveq   pc,lr                   @ be binary compatible with V4, yet
++      bx      lr                      @ interoperable with Thumb ISA:-)
++#endif
++.size poly1305_emit,.-poly1305_emit
++___
++{
++my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("d$_",(0..9));        # d0-d8; the tenth value, d9, has no name bound to it
++my ($D0,$D1,$D2,$D3,$D4, $H0,$H1,$H2,$H3,$H4) = map("q$_",(5..14));   # D0-D4 = q5-q9, H0-H4 = q10-q14
++my ($T0,$T1,$MASK) = map("q$_",(15,4,0));     # q4 = d8:d9 overlaps $S4 (d8); q0 = d0:d1 overlaps $R0/$R1
++
++my ($in2,$zeros,$tbl0,$tbl1) = map("r$_",(4..7));     # r4-r7
++
++$code.=<<___;
++#if   __ARM_MAX_ARCH__>=7
++.fpu  neon
++
++.type poly1305_init_neon,%function
++.align        5
++poly1305_init_neon:
++.Lpoly1305_init_neon:
++      ldr     r3,[$ctx,#48]           @ first table element
++      cmp     r3,#-1                  @ is value impossible?
++      bne     .Lno_init_neon          @ no: skip the table computation
++
++      ldr     r4,[$ctx,#20]           @ load key base 2^32
++      ldr     r5,[$ctx,#24]
++      ldr     r6,[$ctx,#28]
++      ldr     r7,[$ctx,#32]
++
++      and     r2,r4,#0x03ffffff       @ base 2^32 -> base 2^26
++      mov     r3,r4,lsr#26
++      mov     r4,r5,lsr#20
++      orr     r3,r3,r5,lsl#6
++      mov     r5,r6,lsr#14
++      orr     r4,r4,r6,lsl#12
++      mov     r6,r7,lsr#8
++      orr     r5,r5,r7,lsl#18
++      and     r3,r3,#0x03ffffff
++      and     r4,r4,#0x03ffffff
++      and     r5,r5,#0x03ffffff
++
++      vdup.32 $R0,r2                  @ r^1 in both lanes
++      add     r2,r3,r3,lsl#2          @ *5
++      vdup.32 $R1,r3
++      add     r3,r4,r4,lsl#2
++      vdup.32 $S1,r2
++      vdup.32 $R2,r4
++      add     r4,r5,r5,lsl#2
++      vdup.32 $S2,r3
++      vdup.32 $R3,r5
++      add     r5,r6,r6,lsl#2
++      vdup.32 $S3,r4
++      vdup.32 $R4,r6
++      vdup.32 $S4,r5
++
++      mov     $zeros,#2               @ counter
++
++.Lsquare_neon:
++      @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
++      @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
++      @ d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
++      @ d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
++      @ d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
++      @ d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
++
++      vmull.u32       $D0,$R0,${R0}[1]
++      vmull.u32       $D1,$R1,${R0}[1]
++      vmull.u32       $D2,$R2,${R0}[1]
++      vmull.u32       $D3,$R3,${R0}[1]
++      vmull.u32       $D4,$R4,${R0}[1]
++
++      vmlal.u32       $D0,$R4,${S1}[1]
++      vmlal.u32       $D1,$R0,${R1}[1]
++      vmlal.u32       $D2,$R1,${R1}[1]
++      vmlal.u32       $D3,$R2,${R1}[1]
++      vmlal.u32       $D4,$R3,${R1}[1]
++
++      vmlal.u32       $D0,$R3,${S2}[1]
++      vmlal.u32       $D1,$R4,${S2}[1]
++      vmlal.u32       $D3,$R1,${R2}[1]
++      vmlal.u32       $D2,$R0,${R2}[1]
++      vmlal.u32       $D4,$R2,${R2}[1]
++
++      vmlal.u32       $D0,$R2,${S3}[1]
++      vmlal.u32       $D3,$R0,${R3}[1]
++      vmlal.u32       $D1,$R3,${S3}[1]
++      vmlal.u32       $D2,$R4,${S3}[1]
++      vmlal.u32       $D4,$R1,${R3}[1]
++
++      vmlal.u32       $D3,$R4,${S4}[1]
++      vmlal.u32       $D0,$R1,${S4}[1]
++      vmlal.u32       $D1,$R2,${S4}[1]
++      vmlal.u32       $D2,$R3,${S4}[1]
++      vmlal.u32       $D4,$R0,${R4}[1]
++
++      @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
++      @ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
++      @ and P. Schwabe
++      @
++      @ H0>>+H1>>+H2>>+H3>>+H4
++      @ H3>>+H4>>*5+H0>>+H1
++      @
++      @ Trivia.
++      @
++      @ Result of multiplication of n-bit number by m-bit number is
++      @ n+m bits wide. However! Even though 2^n is a n+1-bit number,
++      @ m-bit number multiplied by 2^n is still n+m bits wide.
++      @
++      @ Sum of two n-bit numbers is n+1 bits wide, sum of three - n+2,
++      @ and so is sum of four. Sum of 2^m n-m-bit numbers and n-bit
++      @ one is n+1 bits wide.
++      @
++      @ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that
++      @ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4
++      @ can be 27. However! In cases when their width exceeds 26 bits
++      @ they are limited by 2^26+2^6. This in turn means that *sum*
++      @ of the products with these values can still be viewed as sum
++      @ of 52-bit numbers as long as the amount of addends is not a
++      @ power of 2. For example,
++      @
++      @ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0 * R4,
++      @
++      @ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or
++      @ 5 * (2^52 + 2*2^32 + 2^12), which in turn is smaller than
++      @ 8 * (2^52) or 2^55. However, the value is then multiplied by
++      @ 5, so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12),
++      @ which is less than 32 * (2^52) or 2^57. And when processing
++      @ data we are looking at triple as many addends...
++      @
++      @ In key setup procedure pre-reduced H0 is limited by 5*4+1 and
++      @ 5*H4 - by 5*5 52-bit addends, or 57 bits. But when hashing the
++      @ input H0 is limited by (5*4+1)*3 addends, or 58 bits, while
++      @ 5*H4 by 5*5*3, or 59[!] bits. How is this relevant? vmlal.u32
++      @ instruction accepts 2x32-bit input and writes 2x64-bit result.
++      @ This means that the result of reduction has to be compressed upon
++      @ loop wrap-around. This can be done in the process of reduction
++      @ to minimize amount of instructions [as well as amount of
++      @ 128-bit instructions, which benefits low-end processors], but
++      @ one has to watch for H2 (which is narrower than H0) and 5*H4
++      @ not being wider than 58 bits, so that result of right shift
++      @ by 26 bits fits in 32 bits. This is also useful on x86,
++      @ because it allows using paddd in place of paddq, which
++      @ benefits Atom, where paddq is ridiculously slow.
++
++      vshr.u64        $T0,$D3,#26
++      vmovn.i64       $D3#lo,$D3
++       vshr.u64       $T1,$D0,#26
++       vmovn.i64      $D0#lo,$D0
++      vadd.i64        $D4,$D4,$T0             @ h3 -> h4
++      vbic.i32        $D3#lo,#0xfc000000      @ &=0x03ffffff
++       vadd.i64       $D1,$D1,$T1             @ h0 -> h1
++       vbic.i32       $D0#lo,#0xfc000000
++
++      vshrn.u64       $T0#lo,$D4,#26
++      vmovn.i64       $D4#lo,$D4
++       vshr.u64       $T1,$D1,#26
++       vmovn.i64      $D1#lo,$D1
++       vadd.i64       $D2,$D2,$T1             @ h1 -> h2
++      vbic.i32        $D4#lo,#0xfc000000
++       vbic.i32       $D1#lo,#0xfc000000
++
++      vadd.i32        $D0#lo,$D0#lo,$T0#lo
++      vshl.u32        $T0#lo,$T0#lo,#2
++       vshrn.u64      $T1#lo,$D2,#26
++       vmovn.i64      $D2#lo,$D2
++      vadd.i32        $D0#lo,$D0#lo,$T0#lo    @ h4 -> h0
++       vadd.i32       $D3#lo,$D3#lo,$T1#lo    @ h2 -> h3
++       vbic.i32       $D2#lo,#0xfc000000
++
++      vshr.u32        $T0#lo,$D0#lo,#26
++      vbic.i32        $D0#lo,#0xfc000000
++       vshr.u32       $T1#lo,$D3#lo,#26
++       vbic.i32       $D3#lo,#0xfc000000
++      vadd.i32        $D1#lo,$D1#lo,$T0#lo    @ h0 -> h1
++       vadd.i32       $D4#lo,$D4#lo,$T1#lo    @ h3 -> h4
++
++      subs            $zeros,$zeros,#1        @ counter started at 2: two multiplication passes
++      beq             .Lsquare_break_neon     @ second pass done
++
++      add             $tbl0,$ctx,#(48+0*9*4)  @ table slots are 9 words each, starting at ctx+48
++      add             $tbl1,$ctx,#(48+1*9*4)
++
++      vtrn.32         $R0,$D0#lo              @ r^2:r^1
++      vtrn.32         $R2,$D2#lo
++      vtrn.32         $R3,$D3#lo
++      vtrn.32         $R1,$D1#lo
++      vtrn.32         $R4,$D4#lo
++
++      vshl.u32        $S2,$R2,#2              @ *5
++      vshl.u32        $S3,$R3,#2
++      vshl.u32        $S1,$R1,#2
++      vshl.u32        $S4,$R4,#2
++      vadd.i32        $S2,$S2,$R2
++      vadd.i32        $S1,$S1,$R1
++      vadd.i32        $S3,$S3,$R3
++      vadd.i32        $S4,$S4,$R4
++
++      vst4.32         {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!
++      vst4.32         {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!
++      vst4.32         {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
++      vst4.32         {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
++      vst1.32         {${S4}[0]},[$tbl0,:32]
++      vst1.32         {${S4}[1]},[$tbl1,:32]
++
++      b               .Lsquare_neon
++
++.align        4
++.Lsquare_break_neon:
++      add             $tbl0,$ctx,#(48+2*4*9)  @ third and fourth table slots
++      add             $tbl1,$ctx,#(48+3*4*9)
++
++      vmov            $R0,$D0#lo              @ r^4:r^3
++      vshl.u32        $S1,$D1#lo,#2           @ *5
++      vmov            $R1,$D1#lo
++      vshl.u32        $S2,$D2#lo,#2
++      vmov            $R2,$D2#lo
++      vshl.u32        $S3,$D3#lo,#2
++      vmov            $R3,$D3#lo
++      vshl.u32        $S4,$D4#lo,#2
++      vmov            $R4,$D4#lo
++      vadd.i32        $S1,$S1,$D1#lo
++      vadd.i32        $S2,$S2,$D2#lo
++      vadd.i32        $S3,$S3,$D3#lo
++      vadd.i32        $S4,$S4,$D4#lo
++
++      vst4.32         {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!
++      vst4.32         {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!
++      vst4.32         {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
++      vst4.32         {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
++      vst1.32         {${S4}[0]},[$tbl0]
++      vst1.32         {${S4}[1]},[$tbl1]
++
++.Lno_init_neon:
++      ret                             @ bx    lr
++.size poly1305_init_neon,.-poly1305_init_neon
++
++.type poly1305_blocks_neon,%function
++.align        5
++poly1305_blocks_neon:
++.Lpoly1305_blocks_neon:
++      ldr     ip,[$ctx,#36]           @ is_base2_26
++
++      cmp     $len,#64
++      blo     .Lpoly1305_blocks       @ fewer than 64 bytes: handled at .Lpoly1305_blocks
++
++      stmdb   sp!,{r4-r7}
++      vstmdb  sp!,{d8-d15}            @ ABI specification says so
++
++      tst     ip,ip                   @ is_base2_26?
++      bne     .Lbase2_26_neon
++
++      stmdb   sp!,{r1-r3,lr}          @ kept across the call, reloaded below
++      bl      .Lpoly1305_init_neon
++
++      ldr     r4,[$ctx,#0]            @ load hash value base 2^32
++      ldr     r5,[$ctx,#4]
++      ldr     r6,[$ctx,#8]
++      ldr     r7,[$ctx,#12]
++      ldr     ip,[$ctx,#16]
++
++      and     r2,r4,#0x03ffffff       @ base 2^32 -> base 2^26
++      mov     r3,r4,lsr#26
++       veor   $D0#lo,$D0#lo,$D0#lo
++      mov     r4,r5,lsr#20
++      orr     r3,r3,r5,lsl#6
++       veor   $D1#lo,$D1#lo,$D1#lo
++      mov     r5,r6,lsr#14
++      orr     r4,r4,r6,lsl#12
++       veor   $D2#lo,$D2#lo,$D2#lo
++      mov     r6,r7,lsr#8
++      orr     r5,r5,r7,lsl#18
++       veor   $D3#lo,$D3#lo,$D3#lo
++      and     r3,r3,#0x03ffffff
++      orr     r6,r6,ip,lsl#24
++       veor   $D4#lo,$D4#lo,$D4#lo
++      and     r4,r4,#0x03ffffff
++      mov     r1,#1
++      and     r5,r5,#0x03ffffff
++      str     r1,[$ctx,#36]           @ set is_base2_26
++
++      vmov.32 $D0#lo[0],r2
++      vmov.32 $D1#lo[0],r3
++      vmov.32 $D2#lo[0],r4
++      vmov.32 $D3#lo[0],r5
++      vmov.32 $D4#lo[0],r6
++      adr     $zeros,.Lzeros
++
++      ldmia   sp!,{r1-r3,lr}
++      b       .Lhash_loaded
++
++.align        4
++.Lbase2_26_neon:
++      @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
++      @ load hash value
++
++      veor            $D0#lo,$D0#lo,$D0#lo
++      veor            $D1#lo,$D1#lo,$D1#lo
++      veor            $D2#lo,$D2#lo,$D2#lo
++      veor            $D3#lo,$D3#lo,$D3#lo
++      veor            $D4#lo,$D4#lo,$D4#lo
++      vld4.32         {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]!
++      adr             $zeros,.Lzeros
++      vld1.32         {$D4#lo[0]},[$ctx]
++      sub             $ctx,$ctx,#16           @ rewind
++
++.Lhash_loaded:
++      add             $in2,$inp,#32
++      mov             $padbit,$padbit,lsl#24
++      tst             $len,#31
++      beq             .Leven                  @ length is a multiple of 32
++
++      vld4.32         {$H0#lo[0],$H1#lo[0],$H2#lo[0],$H3#lo[0]},[$inp]!       @ one 16-byte block
++      vmov.32         $H4#lo[0],$padbit
++      sub             $len,$len,#16
++      add             $in2,$inp,#32
++
++# ifdef       __ARMEB__
++      vrev32.8        $H0,$H0
++      vrev32.8        $H3,$H3
++      vrev32.8        $H1,$H1
++      vrev32.8        $H2,$H2
++# endif
++      vsri.u32        $H4#lo,$H3#lo,#8        @ base 2^32 -> base 2^26
++      vshl.u32        $H3#lo,$H3#lo,#18
++
++      vsri.u32        $H3#lo,$H2#lo,#14
++      vshl.u32        $H2#lo,$H2#lo,#12
++      vadd.i32        $H4#hi,$H4#lo,$D4#lo    @ add hash value and move to #hi
++
++      vbic.i32        $H3#lo,#0xfc000000
++      vsri.u32        $H2#lo,$H1#lo,#20
++      vshl.u32        $H1#lo,$H1#lo,#6
++
++      vbic.i32        $H2#lo,#0xfc000000
++      vsri.u32        $H1#lo,$H0#lo,#26
++      vadd.i32        $H3#hi,$H3#lo,$D3#lo
++
++      vbic.i32        $H0#lo,#0xfc000000
++      vbic.i32        $H1#lo,#0xfc000000
++      vadd.i32        $H2#hi,$H2#lo,$D2#lo
++
++      vadd.i32        $H0#hi,$H0#lo,$D0#lo
++      vadd.i32        $H1#hi,$H1#lo,$D1#lo
++
++      mov             $tbl1,$zeros
++      add             $tbl0,$ctx,#48
++
++      cmp             $len,$len               @ force Z so that .Long_tail takes its eq branch
++      b               .Long_tail
++
++.align        4
++.Leven:
++      subs            $len,$len,#64
++      it              lo
++      movlo           $in2,$zeros             @ fewer than 64 bytes were left: read inp[2:3] from .Lzeros
++
++      vmov.i32        $H4,#1<<24              @ padbit, yes, always
++      vld4.32         {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp]    @ inp[0:1]
++      add             $inp,$inp,#64
++      vld4.32         {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2]    @ inp[2:3] (or 0)
++      add             $in2,$in2,#64
++      itt             hi                      @ flags are still those of the subs above
++      addhi           $tbl1,$ctx,#(48+1*9*4)
++      addhi           $tbl0,$ctx,#(48+3*9*4)
++
++# ifdef       __ARMEB__
++      vrev32.8        $H0,$H0
++      vrev32.8        $H3,$H3
++      vrev32.8        $H1,$H1
++      vrev32.8        $H2,$H2
++# endif
++      vsri.u32        $H4,$H3,#8              @ base 2^32 -> base 2^26
++      vshl.u32        $H3,$H3,#18
++
++      vsri.u32        $H3,$H2,#14
++      vshl.u32        $H2,$H2,#12
++
++      vbic.i32        $H3,#0xfc000000
++      vsri.u32        $H2,$H1,#20
++      vshl.u32        $H1,$H1,#6
++
++      vbic.i32        $H2,#0xfc000000
++      vsri.u32        $H1,$H0,#26
++
++      vbic.i32        $H0,#0xfc000000
++      vbic.i32        $H1,#0xfc000000
++
++      bls             .Lskip_loop             @ 64 bytes or fewer were left at the subs above
++
++      vld4.32         {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!  @ load r^2
++      vld4.32         {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!  @ load r^4
++      vld4.32         {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
++      vld4.32         {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
++      b               .Loop_neon
++
++.align        5
++.Loop_neon:
++      @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
++      @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
++      @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
++      @   \___________________/
++      @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
++      @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
++      @   \___________________/ \____________________/
++      @
++      @ Note that we start with inp[2:3]*r^2. This is because it
++      @ doesn't depend on reduction in previous iteration.
++      @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
++      @ d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
++      @ d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
++      @ d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
++      @ d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
++      @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
++
++      @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
++      @ inp[2:3]*r^2
++
++      vadd.i32        $H2#lo,$H2#lo,$D2#lo    @ accumulate inp[0:1]
++      vmull.u32       $D2,$H2#hi,${R0}[1]
++      vadd.i32        $H0#lo,$H0#lo,$D0#lo
++      vmull.u32       $D0,$H0#hi,${R0}[1]
++      vadd.i32        $H3#lo,$H3#lo,$D3#lo
++      vmull.u32       $D3,$H3#hi,${R0}[1]
++      vmlal.u32       $D2,$H1#hi,${R1}[1]
++      vadd.i32        $H1#lo,$H1#lo,$D1#lo
++      vmull.u32       $D1,$H1#hi,${R0}[1]
++
++      vadd.i32        $H4#lo,$H4#lo,$D4#lo
++      vmull.u32       $D4,$H4#hi,${R0}[1]
++      subs            $len,$len,#64           @ flags are consumed here and by the bhi closing the loop
++      vmlal.u32       $D0,$H4#hi,${S1}[1]
++      it              lo
++      movlo           $in2,$zeros             @ input exhausted: next inp[2:3] comes from .Lzeros
++      vmlal.u32       $D3,$H2#hi,${R1}[1]
++      vld1.32         ${S4}[1],[$tbl1,:32]
++      vmlal.u32       $D1,$H0#hi,${R1}[1]
++      vmlal.u32       $D4,$H3#hi,${R1}[1]
++
++      vmlal.u32       $D0,$H3#hi,${S2}[1]
++      vmlal.u32       $D3,$H1#hi,${R2}[1]
++      vmlal.u32       $D4,$H2#hi,${R2}[1]
++      vmlal.u32       $D1,$H4#hi,${S2}[1]
++      vmlal.u32       $D2,$H0#hi,${R2}[1]
++
++      vmlal.u32       $D3,$H0#hi,${R3}[1]
++      vmlal.u32       $D0,$H2#hi,${S3}[1]
++      vmlal.u32       $D4,$H1#hi,${R3}[1]
++      vmlal.u32       $D1,$H3#hi,${S3}[1]
++      vmlal.u32       $D2,$H4#hi,${S3}[1]
++
++      vmlal.u32       $D3,$H4#hi,${S4}[1]
++      vmlal.u32       $D0,$H1#hi,${S4}[1]
++      vmlal.u32       $D4,$H0#hi,${R4}[1]
++      vmlal.u32       $D1,$H2#hi,${S4}[1]
++      vmlal.u32       $D2,$H3#hi,${S4}[1]
++
++      vld4.32         {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2]    @ inp[2:3] (or 0)
++      add             $in2,$in2,#64
++
++      @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
++      @ (hash+inp[0:1])*r^4 and accumulate
++
++      vmlal.u32       $D3,$H3#lo,${R0}[0]
++      vmlal.u32       $D0,$H0#lo,${R0}[0]
++      vmlal.u32       $D4,$H4#lo,${R0}[0]
++      vmlal.u32       $D1,$H1#lo,${R0}[0]
++      vmlal.u32       $D2,$H2#lo,${R0}[0]
++      vld1.32         ${S4}[0],[$tbl0,:32]
++
++      vmlal.u32       $D3,$H2#lo,${R1}[0]
++      vmlal.u32       $D0,$H4#lo,${S1}[0]
++      vmlal.u32       $D4,$H3#lo,${R1}[0]
++      vmlal.u32       $D1,$H0#lo,${R1}[0]
++      vmlal.u32       $D2,$H1#lo,${R1}[0]
++
++      vmlal.u32       $D3,$H1#lo,${R2}[0]
++      vmlal.u32       $D0,$H3#lo,${S2}[0]
++      vmlal.u32       $D4,$H2#lo,${R2}[0]
++      vmlal.u32       $D1,$H4#lo,${S2}[0]
++      vmlal.u32       $D2,$H0#lo,${R2}[0]
++
++      vmlal.u32       $D3,$H0#lo,${R3}[0]
++      vmlal.u32       $D0,$H2#lo,${S3}[0]
++      vmlal.u32       $D4,$H1#lo,${R3}[0]
++      vmlal.u32       $D1,$H3#lo,${S3}[0]
++      vmlal.u32       $D3,$H4#lo,${S4}[0]
++
++      vmlal.u32       $D2,$H4#lo,${S3}[0]
++      vmlal.u32       $D0,$H1#lo,${S4}[0]
++      vmlal.u32       $D4,$H0#lo,${R4}[0]
++      vmov.i32        $H4,#1<<24              @ padbit, yes, always
++      vmlal.u32       $D1,$H2#lo,${S4}[0]
++      vmlal.u32       $D2,$H3#lo,${S4}[0]
++
++      vld4.32         {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp]    @ inp[0:1]
++      add             $inp,$inp,#64
++# ifdef       __ARMEB__
++      vrev32.8        $H0,$H0
++      vrev32.8        $H1,$H1
++      vrev32.8        $H2,$H2
++      vrev32.8        $H3,$H3
++# endif
++
++      @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
++      @ lazy reduction interleaved with base 2^32 -> base 2^26 of
++      @ inp[0:3] previously loaded to $H0-$H3 and smashed to $H0-$H4.
++
++      vshr.u64        $T0,$D3,#26
++      vmovn.i64       $D3#lo,$D3
++       vshr.u64       $T1,$D0,#26
++       vmovn.i64      $D0#lo,$D0
++      vadd.i64        $D4,$D4,$T0             @ h3 -> h4
++      vbic.i32        $D3#lo,#0xfc000000
++        vsri.u32      $H4,$H3,#8              @ base 2^32 -> base 2^26
++       vadd.i64       $D1,$D1,$T1             @ h0 -> h1
++        vshl.u32      $H3,$H3,#18
++       vbic.i32       $D0#lo,#0xfc000000
++
++      vshrn.u64       $T0#lo,$D4,#26
++      vmovn.i64       $D4#lo,$D4
++       vshr.u64       $T1,$D1,#26
++       vmovn.i64      $D1#lo,$D1
++       vadd.i64       $D2,$D2,$T1             @ h1 -> h2
++        vsri.u32      $H3,$H2,#14
++      vbic.i32        $D4#lo,#0xfc000000
++        vshl.u32      $H2,$H2,#12
++       vbic.i32       $D1#lo,#0xfc000000
++
++      vadd.i32        $D0#lo,$D0#lo,$T0#lo
++      vshl.u32        $T0#lo,$T0#lo,#2
++        vbic.i32      $H3,#0xfc000000
++       vshrn.u64      $T1#lo,$D2,#26
++       vmovn.i64      $D2#lo,$D2
++      vaddl.u32       $D0,$D0#lo,$T0#lo       @ h4 -> h0 [widen for a sec]
++        vsri.u32      $H2,$H1,#20
++       vadd.i32       $D3#lo,$D3#lo,$T1#lo    @ h2 -> h3
++        vshl.u32      $H1,$H1,#6
++       vbic.i32       $D2#lo,#0xfc000000
++        vbic.i32      $H2,#0xfc000000
++
++      vshrn.u64       $T0#lo,$D0,#26          @ re-narrow
++      vmovn.i64       $D0#lo,$D0
++        vsri.u32      $H1,$H0,#26
++        vbic.i32      $H0,#0xfc000000
++       vshr.u32       $T1#lo,$D3#lo,#26
++       vbic.i32       $D3#lo,#0xfc000000
++      vbic.i32        $D0#lo,#0xfc000000
++      vadd.i32        $D1#lo,$D1#lo,$T0#lo    @ h0 -> h1
++       vadd.i32       $D4#lo,$D4#lo,$T1#lo    @ h3 -> h4
++        vbic.i32      $H1,#0xfc000000
++
++      bhi             .Loop_neon              @ flags are still those of the subs in this iteration
++
++.Lskip_loop:
++      @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
++      @ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
++
++      add             $tbl1,$ctx,#(48+0*9*4)
++      add             $tbl0,$ctx,#(48+1*9*4)
++      adds            $len,$len,#32           @ sets Z for the conditionals below and in .Long_tail
++      it              ne
++      movne           $len,#0
++      bne             .Long_tail
++
++      vadd.i32        $H2#hi,$H2#lo,$D2#lo    @ add hash value and move to #hi
++      vadd.i32        $H0#hi,$H0#lo,$D0#lo
++      vadd.i32        $H3#hi,$H3#lo,$D3#lo
++      vadd.i32        $H1#hi,$H1#lo,$D1#lo
++      vadd.i32        $H4#hi,$H4#lo,$D4#lo
++
++.Long_tail:
++      vld4.32         {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!  @ load r^1
++      vld4.32         {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!  @ load r^2
++
++      vadd.i32        $H2#lo,$H2#lo,$D2#lo    @ can be redundant
++      vmull.u32       $D2,$H2#hi,$R0
++      vadd.i32        $H0#lo,$H0#lo,$D0#lo
++      vmull.u32       $D0,$H0#hi,$R0
++      vadd.i32        $H3#lo,$H3#lo,$D3#lo
++      vmull.u32       $D3,$H3#hi,$R0
++      vadd.i32        $H1#lo,$H1#lo,$D1#lo
++      vmull.u32       $D1,$H1#hi,$R0
++      vadd.i32        $H4#lo,$H4#lo,$D4#lo
++      vmull.u32       $D4,$H4#hi,$R0
++
++      vmlal.u32       $D0,$H4#hi,$S1
++      vld4.32         {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
++      vmlal.u32       $D3,$H2#hi,$R1
++      vld4.32         {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
++      vmlal.u32       $D1,$H0#hi,$R1
++      vmlal.u32       $D4,$H3#hi,$R1
++      vmlal.u32       $D2,$H1#hi,$R1
++
++      vmlal.u32       $D3,$H1#hi,$R2
++      vld1.32         ${S4}[1],[$tbl1,:32]
++      vmlal.u32       $D0,$H3#hi,$S2
++      vld1.32         ${S4}[0],[$tbl0,:32]
++      vmlal.u32       $D4,$H2#hi,$R2
++      vmlal.u32       $D1,$H4#hi,$S2
++      vmlal.u32       $D2,$H0#hi,$R2
++
++      vmlal.u32       $D3,$H0#hi,$R3
++       it             ne
++       addne          $tbl1,$ctx,#(48+2*9*4)  @ ne: point at the slots loaded as r^3 and r^4 below
++      vmlal.u32       $D0,$H2#hi,$S3
++       it             ne
++       addne          $tbl0,$ctx,#(48+3*9*4)
++      vmlal.u32       $D4,$H1#hi,$R3
++      vmlal.u32       $D1,$H3#hi,$S3
++      vmlal.u32       $D2,$H4#hi,$S3
++
++      vmlal.u32       $D3,$H4#hi,$S4
++       vorn           $MASK,$MASK,$MASK       @ all-ones, can be redundant
++      vmlal.u32       $D0,$H1#hi,$S4
++       vshr.u64       $MASK,$MASK,#38
++      vmlal.u32       $D4,$H0#hi,$R4
++      vmlal.u32       $D1,$H2#hi,$S4
++      vmlal.u32       $D2,$H3#hi,$S4
++
++      beq             .Lshort_tail            @ Z as set before entering .Long_tail
++
++      @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
++      @ (hash+inp[0:1])*r^4:r^3 and accumulate
++
++      vld4.32         {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!  @ load r^3
++      vld4.32         {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!  @ load r^4
++
++      vmlal.u32       $D2,$H2#lo,$R0
++      vmlal.u32       $D0,$H0#lo,$R0
++      vmlal.u32       $D3,$H3#lo,$R0
++      vmlal.u32       $D1,$H1#lo,$R0
++      vmlal.u32       $D4,$H4#lo,$R0
++
++      vmlal.u32       $D0,$H4#lo,$S1
++      vld4.32         {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
++      vmlal.u32       $D3,$H2#lo,$R1
++      vld4.32         {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
++      vmlal.u32       $D1,$H0#lo,$R1
++      vmlal.u32       $D4,$H3#lo,$R1
++      vmlal.u32       $D2,$H1#lo,$R1
++
++      vmlal.u32       $D3,$H1#lo,$R2
++      vld1.32         ${S4}[1],[$tbl1,:32]
++      vmlal.u32       $D0,$H3#lo,$S2
++      vld1.32         ${S4}[0],[$tbl0,:32]
++      vmlal.u32       $D4,$H2#lo,$R2
++      vmlal.u32       $D1,$H4#lo,$S2
++      vmlal.u32       $D2,$H0#lo,$R2
++
++      vmlal.u32       $D3,$H0#lo,$R3
++      vmlal.u32       $D0,$H2#lo,$S3
++      vmlal.u32       $D4,$H1#lo,$R3
++      vmlal.u32       $D1,$H3#lo,$S3
++      vmlal.u32       $D2,$H4#lo,$S3
++
++      vmlal.u32       $D3,$H4#lo,$S4
++       vorn           $MASK,$MASK,$MASK       @ all-ones
++      vmlal.u32       $D0,$H1#lo,$S4
++       vshr.u64       $MASK,$MASK,#38
++      vmlal.u32       $D4,$H0#lo,$R4
++      vmlal.u32       $D1,$H2#lo,$S4
++      vmlal.u32       $D2,$H3#lo,$S4
++
++.Lshort_tail:
++      @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
++      @ horizontal addition
++
++      vadd.i64        $D3#lo,$D3#lo,$D3#hi
++      vadd.i64        $D0#lo,$D0#lo,$D0#hi
++      vadd.i64        $D4#lo,$D4#lo,$D4#hi
++      vadd.i64        $D1#lo,$D1#lo,$D1#hi
++      vadd.i64        $D2#lo,$D2#lo,$D2#hi
++
++      @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
++      @ lazy reduction, but without narrowing
++
++      vshr.u64        $T0,$D3,#26
++      vand.i64        $D3,$D3,$MASK
++       vshr.u64       $T1,$D0,#26
++       vand.i64       $D0,$D0,$MASK
++      vadd.i64        $D4,$D4,$T0             @ h3 -> h4
++       vadd.i64       $D1,$D1,$T1             @ h0 -> h1
++
++      vshr.u64        $T0,$D4,#26
++      vand.i64        $D4,$D4,$MASK
++       vshr.u64       $T1,$D1,#26
++       vand.i64       $D1,$D1,$MASK
++       vadd.i64       $D2,$D2,$T1             @ h1 -> h2
++
++      vadd.i64        $D0,$D0,$T0
++      vshl.u64        $T0,$T0,#2
++       vshr.u64       $T1,$D2,#26
++       vand.i64       $D2,$D2,$MASK
++      vadd.i64        $D0,$D0,$T0             @ h4 -> h0
++       vadd.i64       $D3,$D3,$T1             @ h2 -> h3
++
++      vshr.u64        $T0,$D0,#26
++      vand.i64        $D0,$D0,$MASK
++       vshr.u64       $T1,$D3,#26
++       vand.i64       $D3,$D3,$MASK
++      vadd.i64        $D1,$D1,$T0             @ h0 -> h1
++       vadd.i64       $D4,$D4,$T1             @ h3 -> h4
++
++      cmp             $len,#0                 @ nonzero only when coming from the 16-byte block path
++      bne             .Leven
++
++      @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
++      @ store hash value
++
++      vst4.32         {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]!
++      vst1.32         {$D4#lo[0]},[$ctx]
++
++      vldmia  sp!,{d8-d15}                    @ epilogue
++      ldmia   sp!,{r4-r7}
++      ret                                     @ bx    lr
++.size poly1305_blocks_neon,.-poly1305_blocks_neon
++
++.align        5
++.Lzeros:
++.long 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
++#ifndef       __KERNEL__
++.LOPENSSL_armcap:
++# ifdef       _WIN32
++.word OPENSSL_armcap_P
++# else
++.word OPENSSL_armcap_P-.Lpoly1305_init
++# endif
++.comm OPENSSL_armcap_P,4,4
++.hidden       OPENSSL_armcap_P
++#endif
++#endif
++___
++}     }
++$code.=<<___;
++.asciz        "Poly1305 for ARMv4/NEON, CRYPTOGAMS by \@dot-asm"
++.align        2
++___
++
++foreach (split("\n",$code)) { # post-process the accumulated assembly text line by line
++      s/\`([^\`]*)\`/eval $1/geo;     # replace backtick-quoted text with the result of eval
++      # the substitutions below are chained with "or": the first one that matches ends the chain for this line
++      s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo       or      # qN#lo -> d(2N), qN#hi -> d(2N+1)
++      s/\bret\b/bx    lr/go                                           or      # "ret" is written out as "bx lr"
++      s/\bbx\s+lr\b/.word\t0xe12fff1e/go;     # make it possible to compile with -march=armv4
++
++      print $_,"\n";
++}
++close STDOUT; # enforce flush
+--- /dev/null
++++ b/arch/arm/crypto/poly1305-core.S_shipped
+@@ -0,0 +1,1158 @@
++#ifndef       __KERNEL__
++# include "arm_arch.h"
++#else
++# define __ARM_ARCH__ __LINUX_ARM_ARCH__
++# define __ARM_MAX_ARCH__ __LINUX_ARM_ARCH__
++# define poly1305_init   poly1305_init_arm
++# define poly1305_blocks poly1305_blocks_arm
++# define poly1305_emit   poly1305_emit_arm
++.globl        poly1305_blocks_neon
++#endif
++
++#if defined(__thumb2__)
++.syntax       unified
++.thumb
++#else
++.code 32
++#endif
++
++.text
++
++.globl        poly1305_emit
++.globl        poly1305_blocks
++.globl        poly1305_init
++.type poly1305_init,%function
++.align        5
++poly1305_init:                        @ in: r0 = context, r1 = 16-byte key or 0, r2 = function table; out: r0
++.Lpoly1305_init:
++      stmdb   sp!,{r4-r11}            @ save r4-r11, restored at .Lno_key
++
++      eor     r3,r3,r3                @ r3 = 0
++      cmp     r1,#0                   @ null key pointer? (str and add below leave the flags intact)
++      str     r3,[r0,#0]              @ zero hash value
++      str     r3,[r0,#4]
++      str     r3,[r0,#8]
++      str     r3,[r0,#12]
++      str     r3,[r0,#16]
++      str     r3,[r0,#36]             @ clear is_base2_26
++      add     r0,r0,#20               @ r0 -> key area at context+20
++
++#ifdef        __thumb2__
++      it      eq
++#endif
++      moveq   r0,#0                   @ no key: return value 0
++      beq     .Lno_key                @ state was cleared, key area is left untouched
++
++#if   __ARM_MAX_ARCH__>=7
++      mov     r3,#-1
++      str     r3,[r0,#28]             @ impossible key power value (lands at context+48)
++# ifndef __KERNEL__
++      adr     r11,.Lpoly1305_init     @ r11 = run-time address of this function
++      ldr     r12,.LOPENSSL_armcap    @ r12 = word stored at .LOPENSSL_armcap
++# endif
++#endif
++      ldrb    r4,[r1,#0]              @ key is read byte by byte and merged with orr, low byte first
++      mov     r10,#0x0fffffff         @ clamp mask for key word 0
++      ldrb    r5,[r1,#1]
++      and     r3,r10,#-4              @ 0x0ffffffc
++      ldrb    r6,[r1,#2]
++      ldrb    r7,[r1,#3]
++      orr     r4,r4,r5,lsl#8
++      ldrb    r5,[r1,#4]
++      orr     r4,r4,r6,lsl#16
++      ldrb    r6,[r1,#5]
++      orr     r4,r4,r7,lsl#24         @ r4 = key bytes 0..3
++      ldrb    r7,[r1,#6]
++      and     r4,r4,r10               @ clamp key word 0 with 0x0fffffff
++
++#if   __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
++# if !defined(_WIN32)
++      ldr     r12,[r11,r12]           @ OPENSSL_armcap_P
++# endif
++# if defined(__APPLE__) || defined(_WIN32)
++      ldr     r12,[r12]
++# endif
++#endif
++      ldrb    r8,[r1,#7]
++      orr     r5,r5,r6,lsl#8
++      ldrb    r6,[r1,#8]
++      orr     r5,r5,r7,lsl#16
++      ldrb    r7,[r1,#9]
++      orr     r5,r5,r8,lsl#24         @ r5 = key bytes 4..7
++      ldrb    r8,[r1,#10]
++      and     r5,r5,r3                @ clamp key word 1 with 0x0ffffffc
++
++#if   __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
++      tst     r12,#ARMV7_NEON         @ check for NEON
++# ifdef       __thumb2__
++      adr     r9,.Lpoly1305_blocks_neon
++      adr     r11,.Lpoly1305_blocks
++      it      ne
++      movne   r11,r9                  @ NEON present: pick the NEON blocks routine
++      adr     r12,.Lpoly1305_emit
++      orr     r11,r11,#1              @ thumb-ify addresses
++      orr     r12,r12,#1
++# else
++      add     r12,r11,#(.Lpoly1305_emit-.Lpoly1305_init)
++      ite     eq
++      addeq   r11,r11,#(.Lpoly1305_blocks-.Lpoly1305_init)
++      addne   r11,r11,#(.Lpoly1305_blocks_neon-.Lpoly1305_init)
++# endif
++#endif
++      ldrb    r9,[r1,#11]
++      orr     r6,r6,r7,lsl#8
++      ldrb    r7,[r1,#12]
++      orr     r6,r6,r8,lsl#16
++      ldrb    r8,[r1,#13]
++      orr     r6,r6,r9,lsl#24         @ r6 = key bytes 8..11
++      ldrb    r9,[r1,#14]
++      and     r6,r6,r3                @ clamp key word 2 with 0x0ffffffc
++
++      ldrb    r10,[r1,#15]
++      orr     r7,r7,r8,lsl#8
++      str     r4,[r0,#0]              @ store clamped key word 0 (context+20)
++      orr     r7,r7,r9,lsl#16
++      str     r5,[r0,#4]              @ key word 1 (context+24)
++      orr     r7,r7,r10,lsl#24        @ r7 = key bytes 12..15
++      str     r6,[r0,#8]              @ key word 2 (context+28)
++      and     r7,r7,r3                @ clamp key word 3 with 0x0ffffffc
++      str     r7,[r0,#12]             @ key word 3 (context+32)
++#if   __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
++      stmia   r2,{r11,r12}            @ fill functions table
++      mov     r0,#1                   @ return 1: blocks and emit pointers were written to [r2]
++#else
++      mov     r0,#0                   @ return 0: function table at r2 is not touched
++#endif
++.Lno_key:
++      ldmia   sp!,{r4-r11}            @ restore r4-r11
++#if   __ARM_ARCH__>=5
++      bx      lr                              @ bx    lr
++#else
++      tst     lr,#1
++      moveq   pc,lr                   @ be binary compatible with V4, yet
++      .word   0xe12fff1e                      @ interoperable with Thumb ISA:-)
++#endif
++.size poly1305_init,.-poly1305_init
++.type poly1305_blocks,%function
++.align        5
++poly1305_blocks:                      @ in: r0 = context, r1 = input, r2 = length in bytes, r3 = padbit
++.Lpoly1305_blocks:
++      stmdb   sp!,{r3-r11,lr}         @ save r4-r11,lr (r3 presumably pushed only to keep sp 8-byte aligned - TODO confirm)
++
++      ands    r2,r2,#-16              @ round length down to whole 16-byte blocks
++      beq     .Lno_data               @ nothing to process
++
++      add     r2,r2,r1                @ end pointer
++      sub     sp,sp,#32               @ scratch frame, slots [sp,#0] .. [sp,#28]
++
++#if __ARM_ARCH__<7
++      ldmia   r0,{r4-r12}             @ load context: r4-r8 = hash, r9-r12 = key
++      add     r0,r0,#20               @ r0 = context+20
++      str     r2,[sp,#16]             @ offload stuff
++      str     r0,[sp,#12]             @ context+20, reloaded for the final stmdb
++#else
++      ldr     lr,[r0,#36]             @ is_base2_26
++      ldmia   r0!,{r4-r8}             @ load hash value
++      str     r2,[sp,#16]             @ offload stuff
++      str     r0,[sp,#12]             @ context+20 (after writeback), reloaded for the final stmdb
++
++      adds    r9,r4,r5,lsl#26 @ base 2^26 -> base 2^32
++      mov     r10,r5,lsr#6
++      adcs    r10,r10,r6,lsl#20
++      mov     r11,r6,lsr#12
++      adcs    r11,r11,r7,lsl#14
++      mov     r12,r7,lsr#18
++      adcs    r12,r12,r8,lsl#8
++      mov     r2,#0
++      teq     lr,#0                   @ is_base2_26 set? (ne = yes); adc below still consumes carry from adcs above
++      str     r2,[r0,#16]             @ clear is_base2_26
++      adc     r2,r2,r8,lsr#24         @ r2 = top word of the converted value
++
++      itttt   ne
++      movne   r4,r9                   @ choose between radixes
++      movne   r5,r10
++      movne   r6,r11
++      movne   r7,r12
++      ldmia   r0,{r9-r12}             @ load key
++      it      ne
++      movne   r8,r2
++#endif
++
++      mov     lr,r1                   @ lr = input pointer
++      cmp     r3,#0                   @ padbit test: hi (r3 nonzero) is consumed by addhi at .Loop entry
++      str     r10,[sp,#20]            @ stash key words 1..3; the loop overwrites r10-r12
++      str     r11,[sp,#24]
++      str     r12,[sp,#28]
++      b       .Loop
++
++.align        4
++.Loop:
++#if __ARM_ARCH__<7
++      ldrb    r0,[lr],#16             @ load input
++# ifdef       __thumb2__
++      it      hi
++# endif
++      addhi   r8,r8,#1                @ 1<<128
++      ldrb    r1,[lr,#-15]
++      ldrb    r2,[lr,#-14]
++      ldrb    r3,[lr,#-13]
++      orr     r1,r0,r1,lsl#8
++      ldrb    r0,[lr,#-12]
++      orr     r2,r1,r2,lsl#16
++      ldrb    r1,[lr,#-11]
++      orr     r3,r2,r3,lsl#24         @ r3 = input bytes 0..3
++      ldrb    r2,[lr,#-10]
++      adds    r4,r4,r3                @ accumulate input
++
++      ldrb    r3,[lr,#-9]
++      orr     r1,r0,r1,lsl#8
++      ldrb    r0,[lr,#-8]
++      orr     r2,r1,r2,lsl#16
++      ldrb    r1,[lr,#-7]
++      orr     r3,r2,r3,lsl#24         @ r3 = input bytes 4..7
++      ldrb    r2,[lr,#-6]
++      adcs    r5,r5,r3
++
++      ldrb    r3,[lr,#-5]
++      orr     r1,r0,r1,lsl#8
++      ldrb    r0,[lr,#-4]
++      orr     r2,r1,r2,lsl#16
++      ldrb    r1,[lr,#-3]
++      orr     r3,r2,r3,lsl#24         @ r3 = input bytes 8..11
++      ldrb    r2,[lr,#-2]
++      adcs    r6,r6,r3
++
++      ldrb    r3,[lr,#-1]
++      orr     r1,r0,r1,lsl#8
++      str     lr,[sp,#8]              @ offload input pointer
++      orr     r2,r1,r2,lsl#16
++      add     r10,r10,r10,lsr#2       @ r10 = key[1] + (key[1]>>2)
++      orr     r3,r2,r3,lsl#24         @ r3 = input bytes 12..15, added by adcs r7 below
++#else
++      ldr     r0,[lr],#16             @ load input
++      it      hi
++      addhi   r8,r8,#1                @ padbit
++      ldr     r1,[lr,#-12]
++      ldr     r2,[lr,#-8]
++      ldr     r3,[lr,#-4]
++# ifdef       __ARMEB__
++      rev     r0,r0
++      rev     r1,r1
++      rev     r2,r2
++      rev     r3,r3
++# endif
++      adds    r4,r4,r0                @ accumulate input
++      str     lr,[sp,#8]              @ offload input pointer
++      adcs    r5,r5,r1
++      add     r10,r10,r10,lsr#2       @ r10 = key[1] + (key[1]>>2)
++      adcs    r6,r6,r2
++#endif
++      add     r11,r11,r11,lsr#2       @ r11 = key[2] + (key[2]>>2)
++      adcs    r7,r7,r3
++      add     r12,r12,r12,lsr#2       @ r12 = key[3] + (key[3]>>2)
++
++      umull   r2,r3,r5,r9             @ r3:r2 starts d1
++       adc    r8,r8,#0                @ fold carry of the input accumulation into h4
++      umull   r0,r1,r4,r9             @ r1:r0 starts d0
++      umlal   r2,r3,r8,r10
++      umlal   r0,r1,r7,r10
++      ldr     r10,[sp,#20]            @ reload r10
++      umlal   r2,r3,r6,r12
++      umlal   r0,r1,r5,r12
++      umlal   r2,r3,r7,r11
++      umlal   r0,r1,r6,r11
++      umlal   r2,r3,r4,r10
++      str     r0,[sp,#0]              @ future r4
++       mul    r0,r11,r8
++      ldr     r11,[sp,#24]            @ reload r11
++      adds    r2,r2,r1                @ d1+=d0>>32
++       eor    r1,r1,r1
++      adc     lr,r3,#0                @ future r6
++      str     r2,[sp,#4]              @ future r5
++
++      mul     r2,r12,r8
++      eor     r3,r3,r3
++      umlal   r0,r1,r7,r12
++      ldr     r12,[sp,#28]            @ reload r12
++      umlal   r2,r3,r7,r9
++      umlal   r0,r1,r6,r9
++      umlal   r2,r3,r6,r10
++      umlal   r0,r1,r5,r10
++      umlal   r2,r3,r5,r11
++      umlal   r0,r1,r4,r11
++      umlal   r2,r3,r4,r12
++      ldr     r4,[sp,#0]              @ r4 = stored d0 low word
++      mul     r8,r9,r8
++      ldr     r5,[sp,#4]              @ r5 = stored d1 low word
++
++      adds    r6,lr,r0                @ d2+=d1>>32
++      ldr     lr,[sp,#8]              @ reload input pointer
++      adc     r1,r1,#0
++      adds    r7,r2,r1                @ d3+=d2>>32
++      ldr     r0,[sp,#16]             @ reload end pointer
++      adc     r3,r3,#0
++      add     r8,r8,r3                @ h4+=d3>>32
++
++      and     r1,r8,#-4               @ r1 = h4 with its low two bits cleared
++      and     r8,r8,#3                @ h4 keeps only its low two bits
++      add     r1,r1,r1,lsr#2          @ *=5
++      adds    r4,r4,r1                @ fold 5*(h4>>2) into h0 and ripple the carry upwards
++      adcs    r5,r5,#0
++      adcs    r6,r6,#0
++      adcs    r7,r7,#0
++      adc     r8,r8,#0
++
++      cmp     r0,lr                   @ done yet?
++      bhi     .Loop                   @ NOTE(review): hi holds on every loop-back, so addhi fires for all blocks after the first
++
++      ldr     r0,[sp,#12]             @ r0 = context+20
++      add     sp,sp,#32               @ drop scratch frame
++      stmdb   r0,{r4-r8}              @ store the result
++
++.Lno_data:
++#if   __ARM_ARCH__>=5
++      ldmia   sp!,{r3-r11,pc}         @ restore registers and return
++#else
++      ldmia   sp!,{r3-r11,lr}
++      tst     lr,#1
++      moveq   pc,lr                   @ be binary compatible with V4, yet
++      .word   0xe12fff1e                      @ interoperable with Thumb ISA:-)
++#endif
++.size poly1305_blocks,.-poly1305_blocks
++.type poly1305_emit,%function
++.align        5
++poly1305_emit:                        @ in: r0 = context, r1 = 16-byte output, r2 = four words added to the result (presumably the nonce)
++.Lpoly1305_emit:
++      stmdb   sp!,{r4-r11}            @ save r4-r11
++
++      ldmia   r0,{r3-r7}              @ r3-r7 = five hash words from context+0
++
++#if __ARM_ARCH__>=7
++      ldr     ip,[r0,#36]             @ is_base2_26
++
++      adds    r8,r3,r4,lsl#26 @ base 2^26 -> base 2^32
++      mov     r9,r4,lsr#6
++      adcs    r9,r9,r5,lsl#20
++      mov     r10,r5,lsr#12
++      adcs    r10,r10,r6,lsl#14
++      mov     r11,r6,lsr#18
++      adcs    r11,r11,r7,lsl#8
++      mov     r0,#0
++      adc     r0,r0,r7,lsr#24         @ r0 = top word of the converted value
++
++      tst     ip,ip                   @ is_base2_26 set?
++      itttt   ne
++      movne   r3,r8                   @ yes: continue with the converted base 2^32 value
++      movne   r4,r9
++      movne   r5,r10
++      movne   r6,r11
++      it      ne
++      movne   r7,r0
++#endif
++
++      adds    r8,r3,#5                @ compare to modulus
++      adcs    r9,r4,#0
++      adcs    r10,r5,#0
++      adcs    r11,r6,#0
++      adc     r0,r7,#0
++      tst     r0,#4                   @ did it carry/borrow?
++
++#ifdef        __thumb2__
++      it      ne
++#endif
++      movne   r3,r8                   @ bit set: take the h+5 words in place of h
++      ldr     r8,[r2,#0]              @ addend word 0 (loads are interleaved with the selects)
++#ifdef        __thumb2__
++      it      ne
++#endif
++      movne   r4,r9
++      ldr     r9,[r2,#4]              @ addend word 1
++#ifdef        __thumb2__
++      it      ne
++#endif
++      movne   r5,r10
++      ldr     r10,[r2,#8]             @ addend word 2
++#ifdef        __thumb2__
++      it      ne
++#endif
++      movne   r6,r11
++      ldr     r11,[r2,#12]            @ addend word 3
++
++      adds    r3,r3,r8                @ 128-bit add of the words loaded from r2
++      adcs    r4,r4,r9
++      adcs    r5,r5,r10
++      adc     r6,r6,r11               @ plain adc: the final carry is dropped
++
++#if __ARM_ARCH__>=7
++# ifdef __ARMEB__
++      rev     r3,r3
++      rev     r4,r4
++      rev     r5,r5
++      rev     r6,r6
++# endif
++      str     r3,[r1,#0]              @ write the 16-byte result with word stores
++      str     r4,[r1,#4]
++      str     r5,[r1,#8]
++      str     r6,[r1,#12]
++#else
++      strb    r3,[r1,#0]              @ pre-v7: write each word byte by byte, least significant byte first
++      mov     r3,r3,lsr#8
++      strb    r4,[r1,#4]
++      mov     r4,r4,lsr#8
++      strb    r5,[r1,#8]
++      mov     r5,r5,lsr#8
++      strb    r6,[r1,#12]
++      mov     r6,r6,lsr#8
++
++      strb    r3,[r1,#1]
++      mov     r3,r3,lsr#8
++      strb    r4,[r1,#5]
++      mov     r4,r4,lsr#8
++      strb    r5,[r1,#9]
++      mov     r5,r5,lsr#8
++      strb    r6,[r1,#13]
++      mov     r6,r6,lsr#8
++
++      strb    r3,[r1,#2]
++      mov     r3,r3,lsr#8
++      strb    r4,[r1,#6]
++      mov     r4,r4,lsr#8
++      strb    r5,[r1,#10]
++      mov     r5,r5,lsr#8
++      strb    r6,[r1,#14]
++      mov     r6,r6,lsr#8
++
++      strb    r3,[r1,#3]
++      strb    r4,[r1,#7]
++      strb    r5,[r1,#11]
++      strb    r6,[r1,#15]
++#endif
++      ldmia   sp!,{r4-r11}            @ restore r4-r11
++#if   __ARM_ARCH__>=5
++      bx      lr                              @ bx    lr
++#else
++      tst     lr,#1
++      moveq   pc,lr                   @ be binary compatible with V4, yet
++      .word   0xe12fff1e                      @ interoperable with Thumb ISA:-)
++#endif
++.size poly1305_emit,.-poly1305_emit
++#if   __ARM_MAX_ARCH__>=7
++.fpu  neon
++
++.type poly1305_init_neon,%function
++.align        5
++poly1305_init_neon:                   @ in: r0 = context; fills the key power table at context+48 unless already done
++.Lpoly1305_init_neon:                 @ uses r2-r7 and NEON d0-d18, q15 without saving them here
++      ldr     r3,[r0,#48]             @ first table element
++      cmp     r3,#-1                  @ is value impossible?
++      bne     .Lno_init_neon          @ no: table was computed earlier
++
++      ldr     r4,[r0,#20]             @ load key base 2^32
++      ldr     r5,[r0,#24]
++      ldr     r6,[r0,#28]
++      ldr     r7,[r0,#32]
++
++      and     r2,r4,#0x03ffffff       @ base 2^32 -> base 2^26
++      mov     r3,r4,lsr#26
++      mov     r4,r5,lsr#20
++      orr     r3,r3,r5,lsl#6
++      mov     r5,r6,lsr#14
++      orr     r4,r4,r6,lsl#12
++      mov     r6,r7,lsr#8             @ r6 = limb 4
++      orr     r5,r5,r7,lsl#18
++      and     r3,r3,#0x03ffffff       @ r2..r5 = limbs 0..3, masked to 26 bits
++      and     r4,r4,#0x03ffffff
++      and     r5,r5,#0x03ffffff
++
++      vdup.32 d0,r2                   @ r^1 in both lanes
++      add     r2,r3,r3,lsl#2          @ *5
++      vdup.32 d1,r3                   @ d1 = limb 1
++      add     r3,r4,r4,lsl#2
++      vdup.32 d2,r2                   @ d2 = 5*limb 1
++      vdup.32 d3,r4                   @ d3 = limb 2
++      add     r4,r5,r5,lsl#2
++      vdup.32 d4,r3                   @ d4 = 5*limb 2
++      vdup.32 d5,r5                   @ d5 = limb 3
++      add     r5,r6,r6,lsl#2
++      vdup.32 d6,r4                   @ d6 = 5*limb 3
++      vdup.32 d7,r6                   @ d7 = limb 4
++      vdup.32 d8,r5                   @ d8 = 5*limb 4
++
++      mov     r5,#2           @ counter
++
++.Lsquare_neon:
++      @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
++      @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
++      @ d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
++      @ d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
++      @ d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
++      @ d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
++
++      vmull.u32       q5,d0,d0[1]     @ both lanes are multiplied by the lane 1 value; q5..q9 = d0..d4
++      vmull.u32       q6,d1,d0[1]
++      vmull.u32       q7,d3,d0[1]
++      vmull.u32       q8,d5,d0[1]
++      vmull.u32       q9,d7,d0[1]
++
++      vmlal.u32       q5,d7,d2[1]
++      vmlal.u32       q6,d0,d1[1]
++      vmlal.u32       q7,d1,d1[1]
++      vmlal.u32       q8,d3,d1[1]
++      vmlal.u32       q9,d5,d1[1]
++
++      vmlal.u32       q5,d5,d4[1]
++      vmlal.u32       q6,d7,d4[1]
++      vmlal.u32       q8,d1,d3[1]
++      vmlal.u32       q7,d0,d3[1]
++      vmlal.u32       q9,d3,d3[1]
++
++      vmlal.u32       q5,d3,d6[1]
++      vmlal.u32       q8,d0,d5[1]
++      vmlal.u32       q6,d5,d6[1]
++      vmlal.u32       q7,d7,d6[1]
++      vmlal.u32       q9,d1,d5[1]
++
++      vmlal.u32       q8,d7,d8[1]
++      vmlal.u32       q5,d1,d8[1]
++      vmlal.u32       q6,d3,d8[1]
++      vmlal.u32       q7,d5,d8[1]
++      vmlal.u32       q9,d0,d7[1]
++
++      @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
++      @ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
++      @ and P. Schwabe
++      @
++      @ H0>>+H1>>+H2>>+H3>>+H4
++      @ H3>>+H4>>*5+H0>>+H1
++      @
++      @ Trivia.
++      @
++      @ Result of multiplication of n-bit number by m-bit number is
++      @ n+m bits wide. However! Even though 2^n is a n+1-bit number,
++      @ m-bit number multiplied by 2^n is still n+m bits wide.
++      @
++      @ Sum of two n-bit numbers is n+1 bits wide, sum of three - n+2,
++      @ and so is sum of four. Sum of 2^m n-m-bit numbers and n-bit
++      @ one is n+1 bits wide.
++      @
++      @ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that
++      @ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4
++      @ can be 27. However! In cases when their width exceeds 26 bits
++      @ they are limited by 2^26+2^6. This in turn means that *sum*
++      @ of the products with these values can still be viewed as sum
++      @ of 52-bit numbers as long as the amount of addends is not a
++      @ power of 2. For example,
++      @
++      @ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0 * R4,
++      @
++      @ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or
++      @ 5 * (2^52 + 2*2^32 + 2^12), which in turn is smaller than
++      @ 8 * (2^52) or 2^55. However, the value is then multiplied by
++      @ 5, so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12),
++      @ which is less than 32 * (2^52) or 2^57. And when processing
++      @ data we are looking at triple as many addends...
++      @
++      @ In key setup procedure pre-reduced H0 is limited by 5*4+1 and
++      @ 5*H4 - by 5*5 52-bit addends, or 57 bits. But when hashing the
++      @ input H0 is limited by (5*4+1)*3 addends, or 58 bits, while
++      @ 5*H4 by 5*5*3, or 59[!] bits. How is this relevant? vmlal.u32
++      @ instruction accepts 2x32-bit input and writes 2x64-bit result.
++      @ This means that result of reduction has to be compressed upon
++      @ loop wrap-around. This can be done in the process of reduction
++      @ to minimize amount of instructions [as well as amount of
++      @ 128-bit instructions, which benefits low-end processors], but
++      @ one has to watch for H2 (which is narrower than H0) and 5*H4
++      @ not being wider than 58 bits, so that result of right shift
++      @ by 26 bits fits in 32 bits. This is also useful on x86,
++      @ because it allows using paddd in place of paddq, which
++      @ benefits Atom, where paddq is ridiculously slow.
++
++      vshr.u64        q15,q8,#26
++      vmovn.i64       d16,q8
++       vshr.u64       q4,q5,#26
++       vmovn.i64      d10,q5
++      vadd.i64        q9,q9,q15               @ h3 -> h4
++      vbic.i32        d16,#0xfc000000 @ &=0x03ffffff
++       vadd.i64       q6,q6,q4                @ h0 -> h1
++       vbic.i32       d10,#0xfc000000
++
++      vshrn.u64       d30,q9,#26
++      vmovn.i64       d18,q9
++       vshr.u64       q4,q6,#26
++       vmovn.i64      d12,q6
++       vadd.i64       q7,q7,q4                @ h1 -> h2
++      vbic.i32        d18,#0xfc000000
++       vbic.i32       d12,#0xfc000000
++
++      vadd.i32        d10,d10,d30     @ add the h4 carry once ...
++      vshl.u32        d30,d30,#2      @ ... then four more times, i.e. carry*5 in total
++       vshrn.u64      d8,q7,#26
++       vmovn.i64      d14,q7
++      vadd.i32        d10,d10,d30     @ h4 -> h0
++       vadd.i32       d16,d16,d8      @ h2 -> h3
++       vbic.i32       d14,#0xfc000000
++
++      vshr.u32        d30,d10,#26
++      vbic.i32        d10,#0xfc000000
++       vshr.u32       d8,d16,#26
++       vbic.i32       d16,#0xfc000000
++      vadd.i32        d12,d12,d30     @ h0 -> h1
++       vadd.i32       d18,d18,d8      @ h3 -> h4
++
++      subs            r5,r5,#1        @ one pass done
++      beq             .Lsquare_break_neon     @ second pass finished: store entries 2 and 3
++
++      add             r6,r0,#(48+0*9*4)       @ r6 -> table entry 0 (9 words per entry)
++      add             r7,r0,#(48+1*9*4)       @ r7 -> table entry 1
++
++      vtrn.32         d0,d10          @ r^2:r^1
++      vtrn.32         d3,d14
++      vtrn.32         d5,d16
++      vtrn.32         d1,d12
++      vtrn.32         d7,d18
++
++      vshl.u32        d4,d3,#2                @ *5
++      vshl.u32        d6,d5,#2
++      vshl.u32        d2,d1,#2
++      vshl.u32        d8,d7,#2
++      vadd.i32        d4,d4,d3
++      vadd.i32        d2,d2,d1
++      vadd.i32        d6,d6,d5
++      vadd.i32        d8,d8,d7
++
++      vst4.32         {d0[0],d1[0],d2[0],d3[0]},[r6]! @ entry 0 <- lane 0 (r^1)
++      vst4.32         {d0[1],d1[1],d2[1],d3[1]},[r7]! @ entry 1 <- lane 1 (r^2)
++      vst4.32         {d4[0],d5[0],d6[0],d7[0]},[r6]!
++      vst4.32         {d4[1],d5[1],d6[1],d7[1]},[r7]!
++      vst1.32         {d8[0]},[r6,:32]
++      vst1.32         {d8[1]},[r7,:32]
++
++      b               .Lsquare_neon   @ second pass multiplies r^2:r^1 by the lane 1 value
++
++.align        4
++.Lsquare_break_neon:
++      add             r6,r0,#(48+2*4*9)       @ r6 -> table entry 2
++      add             r7,r0,#(48+3*4*9)       @ r7 -> table entry 3
++
++      vmov            d0,d10          @ r^4:r^3
++      vshl.u32        d2,d12,#2               @ *5
++      vmov            d1,d12
++      vshl.u32        d4,d14,#2
++      vmov            d3,d14
++      vshl.u32        d6,d16,#2
++      vmov            d5,d16
++      vshl.u32        d8,d18,#2
++      vmov            d7,d18
++      vadd.i32        d2,d2,d12
++      vadd.i32        d4,d4,d14
++      vadd.i32        d6,d6,d16
++      vadd.i32        d8,d8,d18
++
++      vst4.32         {d0[0],d1[0],d2[0],d3[0]},[r6]! @ entry 2 <- lane 0 (r^3)
++      vst4.32         {d0[1],d1[1],d2[1],d3[1]},[r7]! @ entry 3 <- lane 1 (r^4)
++      vst4.32         {d4[0],d5[0],d6[0],d7[0]},[r6]!
++      vst4.32         {d4[1],d5[1],d6[1],d7[1]},[r7]!
++      vst1.32         {d8[0]},[r6]
++      vst1.32         {d8[1]},[r7]
++
++.Lno_init_neon:
++      bx      lr                              @ bx    lr
++.size poly1305_init_neon,.-poly1305_init_neon
++
++.type poly1305_blocks_neon,%function
++.align        5
++poly1305_blocks_neon:
++.Lpoly1305_blocks_neon:
++      ldr     ip,[r0,#36]             @ is_base2_26
++
++      cmp     r2,#64
++      blo     .Lpoly1305_blocks
++
++      stmdb   sp!,{r4-r7}
++      vstmdb  sp!,{d8-d15}            @ ABI specification says so
++
++      tst     ip,ip                   @ is_base2_26?
++      bne     .Lbase2_26_neon
++
++      stmdb   sp!,{r1-r3,lr}
++      bl      .Lpoly1305_init_neon
++
++      ldr     r4,[r0,#0]              @ load hash value base 2^32
++      ldr     r5,[r0,#4]
++      ldr     r6,[r0,#8]
++      ldr     r7,[r0,#12]
++      ldr     ip,[r0,#16]
++
++      and     r2,r4,#0x03ffffff       @ base 2^32 -> base 2^26
++      mov     r3,r4,lsr#26
++       veor   d10,d10,d10
++      mov     r4,r5,lsr#20
++      orr     r3,r3,r5,lsl#6
++       veor   d12,d12,d12
++      mov     r5,r6,lsr#14
++      orr     r4,r4,r6,lsl#12
++       veor   d14,d14,d14
++      mov     r6,r7,lsr#8
++      orr     r5,r5,r7,lsl#18
++       veor   d16,d16,d16
++      and     r3,r3,#0x03ffffff
++      orr     r6,r6,ip,lsl#24
++       veor   d18,d18,d18
++      and     r4,r4,#0x03ffffff
++      mov     r1,#1
++      and     r5,r5,#0x03ffffff
++      str     r1,[r0,#36]             @ set is_base2_26
++
++      vmov.32 d10[0],r2
++      vmov.32 d12[0],r3
++      vmov.32 d14[0],r4
++      vmov.32 d16[0],r5
++      vmov.32 d18[0],r6
++      adr     r5,.Lzeros
++
++      ldmia   sp!,{r1-r3,lr}
++      b       .Lhash_loaded
++
++.align        4
++.Lbase2_26_neon:
++      @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
++      @ load hash value
++
++      veor            d10,d10,d10
++      veor            d12,d12,d12
++      veor            d14,d14,d14
++      veor            d16,d16,d16
++      veor            d18,d18,d18
++      vld4.32         {d10[0],d12[0],d14[0],d16[0]},[r0]!
++      adr             r5,.Lzeros
++      vld1.32         {d18[0]},[r0]
++      sub             r0,r0,#16               @ rewind
++
++.Lhash_loaded:
++      add             r4,r1,#32
++      mov             r3,r3,lsl#24
++      tst             r2,#31
++      beq             .Leven
++
++      vld4.32         {d20[0],d22[0],d24[0],d26[0]},[r1]!
++      vmov.32         d28[0],r3
++      sub             r2,r2,#16
++      add             r4,r1,#32
++
++# ifdef       __ARMEB__
++      vrev32.8        q10,q10
++      vrev32.8        q13,q13
++      vrev32.8        q11,q11
++      vrev32.8        q12,q12
++# endif
++      vsri.u32        d28,d26,#8      @ base 2^32 -> base 2^26
++      vshl.u32        d26,d26,#18
++
++      vsri.u32        d26,d24,#14
++      vshl.u32        d24,d24,#12
++      vadd.i32        d29,d28,d18     @ add hash value and move to #hi
++
++      vbic.i32        d26,#0xfc000000
++      vsri.u32        d24,d22,#20
++      vshl.u32        d22,d22,#6
++
++      vbic.i32        d24,#0xfc000000
++      vsri.u32        d22,d20,#26
++      vadd.i32        d27,d26,d16
++
++      vbic.i32        d20,#0xfc000000
++      vbic.i32        d22,#0xfc000000
++      vadd.i32        d25,d24,d14
++
++      vadd.i32        d21,d20,d10
++      vadd.i32        d23,d22,d12
++
++      mov             r7,r5
++      add             r6,r0,#48
++
++      cmp             r2,r2
++      b               .Long_tail
++
++.align        4
++.Leven:
++      subs            r2,r2,#64
++      it              lo
++      movlo           r4,r5
++
++      vmov.i32        q14,#1<<24              @ padbit, yes, always
++      vld4.32         {d20,d22,d24,d26},[r1]  @ inp[0:1]
++      add             r1,r1,#64
++      vld4.32         {d21,d23,d25,d27},[r4]  @ inp[2:3] (or 0)
++      add             r4,r4,#64
++      itt             hi
++      addhi           r7,r0,#(48+1*9*4)
++      addhi           r6,r0,#(48+3*9*4)
++
++# ifdef       __ARMEB__
++      vrev32.8        q10,q10
++      vrev32.8        q13,q13
++      vrev32.8        q11,q11
++      vrev32.8        q12,q12
++# endif
++      vsri.u32        q14,q13,#8              @ base 2^32 -> base 2^26
++      vshl.u32        q13,q13,#18
++
++      vsri.u32        q13,q12,#14
++      vshl.u32        q12,q12,#12
++
++      vbic.i32        q13,#0xfc000000
++      vsri.u32        q12,q11,#20
++      vshl.u32        q11,q11,#6
++
++      vbic.i32        q12,#0xfc000000
++      vsri.u32        q11,q10,#26
++
++      vbic.i32        q10,#0xfc000000
++      vbic.i32        q11,#0xfc000000
++
++      bls             .Lskip_loop
++
++      vld4.32         {d0[1],d1[1],d2[1],d3[1]},[r7]! @ load r^2
++      vld4.32         {d0[0],d1[0],d2[0],d3[0]},[r6]! @ load r^4
++      vld4.32         {d4[1],d5[1],d6[1],d7[1]},[r7]!
++      vld4.32         {d4[0],d5[0],d6[0],d7[0]},[r6]!
++      b               .Loop_neon
++
++.align        5
++.Loop_neon:
++      @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
++      @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
++      @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
++      @   ___________________/
++      @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
++      @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
++      @   ___________________/ ____________________/
++      @
++      @ Note that we start with inp[2:3]*r^2. This is because it
++      @ doesn't depend on reduction in previous iteration.
++      @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
++      @ d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
++      @ d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
++      @ d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
++      @ d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
++      @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
++
++      @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
++      @ inp[2:3]*r^2
++
++      vadd.i32        d24,d24,d14     @ accumulate inp[0:1]
++      vmull.u32       q7,d25,d0[1]
++      vadd.i32        d20,d20,d10
++      vmull.u32       q5,d21,d0[1]
++      vadd.i32        d26,d26,d16
++      vmull.u32       q8,d27,d0[1]
++      vmlal.u32       q7,d23,d1[1]
++      vadd.i32        d22,d22,d12
++      vmull.u32       q6,d23,d0[1]
++
++      vadd.i32        d28,d28,d18
++      vmull.u32       q9,d29,d0[1]
++      subs            r2,r2,#64
++      vmlal.u32       q5,d29,d2[1]
++      it              lo
++      movlo           r4,r5
++      vmlal.u32       q8,d25,d1[1]
++      vld1.32         d8[1],[r7,:32]
++      vmlal.u32       q6,d21,d1[1]
++      vmlal.u32       q9,d27,d1[1]
++
++      vmlal.u32       q5,d27,d4[1]
++      vmlal.u32       q8,d23,d3[1]
++      vmlal.u32       q9,d25,d3[1]
++      vmlal.u32       q6,d29,d4[1]
++      vmlal.u32       q7,d21,d3[1]
++
++      vmlal.u32       q8,d21,d5[1]
++      vmlal.u32       q5,d25,d6[1]
++      vmlal.u32       q9,d23,d5[1]
++      vmlal.u32       q6,d27,d6[1]
++      vmlal.u32       q7,d29,d6[1]
++
++      vmlal.u32       q8,d29,d8[1]
++      vmlal.u32       q5,d23,d8[1]
++      vmlal.u32       q9,d21,d7[1]
++      vmlal.u32       q6,d25,d8[1]
++      vmlal.u32       q7,d27,d8[1]
++
++      vld4.32         {d21,d23,d25,d27},[r4]  @ inp[2:3] (or 0)
++      add             r4,r4,#64
++
++      @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
++      @ (hash+inp[0:1])*r^4 and accumulate
++
++      vmlal.u32       q8,d26,d0[0]
++      vmlal.u32       q5,d20,d0[0]
++      vmlal.u32       q9,d28,d0[0]
++      vmlal.u32       q6,d22,d0[0]
++      vmlal.u32       q7,d24,d0[0]
++      vld1.32         d8[0],[r6,:32]
++
++      vmlal.u32       q8,d24,d1[0]
++      vmlal.u32       q5,d28,d2[0]
++      vmlal.u32       q9,d26,d1[0]
++      vmlal.u32       q6,d20,d1[0]
++      vmlal.u32       q7,d22,d1[0]
++
++      vmlal.u32       q8,d22,d3[0]
++      vmlal.u32       q5,d26,d4[0]
++      vmlal.u32       q9,d24,d3[0]
++      vmlal.u32       q6,d28,d4[0]
++      vmlal.u32       q7,d20,d3[0]
++
++      vmlal.u32       q8,d20,d5[0]
++      vmlal.u32       q5,d24,d6[0]
++      vmlal.u32       q9,d22,d5[0]
++      vmlal.u32       q6,d26,d6[0]
++      vmlal.u32       q8,d28,d8[0]
++
++      vmlal.u32       q7,d28,d6[0]
++      vmlal.u32       q5,d22,d8[0]
++      vmlal.u32       q9,d20,d7[0]
++      vmov.i32        q14,#1<<24              @ padbit, yes, always
++      vmlal.u32       q6,d24,d8[0]
++      vmlal.u32       q7,d26,d8[0]
++
++      vld4.32         {d20,d22,d24,d26},[r1]  @ inp[0:1]
++      add             r1,r1,#64
++# ifdef       __ARMEB__
++      vrev32.8        q10,q10
++      vrev32.8        q11,q11
++      vrev32.8        q12,q12
++      vrev32.8        q13,q13
++# endif
++
++      @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
++      @ lazy reduction interleaved with base 2^32 -> base 2^26 of
++      @ inp[0:3] previously loaded to q10-q13 and smashed to q10-q14.
++
++      vshr.u64        q15,q8,#26
++      vmovn.i64       d16,q8
++       vshr.u64       q4,q5,#26
++       vmovn.i64      d10,q5
++      vadd.i64        q9,q9,q15               @ h3 -> h4
++      vbic.i32        d16,#0xfc000000
++        vsri.u32      q14,q13,#8              @ base 2^32 -> base 2^26
++       vadd.i64       q6,q6,q4                @ h0 -> h1
++        vshl.u32      q13,q13,#18
++       vbic.i32       d10,#0xfc000000
++
++      vshrn.u64       d30,q9,#26
++      vmovn.i64       d18,q9
++       vshr.u64       q4,q6,#26
++       vmovn.i64      d12,q6
++       vadd.i64       q7,q7,q4                @ h1 -> h2
++        vsri.u32      q13,q12,#14
++      vbic.i32        d18,#0xfc000000
++        vshl.u32      q12,q12,#12
++       vbic.i32       d12,#0xfc000000
++
++      vadd.i32        d10,d10,d30
++      vshl.u32        d30,d30,#2
++        vbic.i32      q13,#0xfc000000
++       vshrn.u64      d8,q7,#26
++       vmovn.i64      d14,q7
++      vaddl.u32       q5,d10,d30      @ h4 -> h0 [widen for a sec]
++        vsri.u32      q12,q11,#20
++       vadd.i32       d16,d16,d8      @ h2 -> h3
++        vshl.u32      q11,q11,#6
++       vbic.i32       d14,#0xfc000000
++        vbic.i32      q12,#0xfc000000
++
++      vshrn.u64       d30,q5,#26              @ re-narrow
++      vmovn.i64       d10,q5
++        vsri.u32      q11,q10,#26
++        vbic.i32      q10,#0xfc000000
++       vshr.u32       d8,d16,#26
++       vbic.i32       d16,#0xfc000000
++      vbic.i32        d10,#0xfc000000
++      vadd.i32        d12,d12,d30     @ h0 -> h1
++       vadd.i32       d18,d18,d8      @ h3 -> h4
++        vbic.i32      q11,#0xfc000000
++
++      bhi             .Loop_neon
++
++.Lskip_loop:
++      @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
++      @ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
++
++      add             r7,r0,#(48+0*9*4)
++      add             r6,r0,#(48+1*9*4)
++      adds            r2,r2,#32
++      it              ne
++      movne           r2,#0
++      bne             .Long_tail
++
++      vadd.i32        d25,d24,d14     @ add hash value and move to #hi
++      vadd.i32        d21,d20,d10
++      vadd.i32        d27,d26,d16
++      vadd.i32        d23,d22,d12
++      vadd.i32        d29,d28,d18
++
++.Long_tail:
++      vld4.32         {d0[1],d1[1],d2[1],d3[1]},[r7]! @ load r^1
++      vld4.32         {d0[0],d1[0],d2[0],d3[0]},[r6]! @ load r^2
++
++      vadd.i32        d24,d24,d14     @ can be redundant
++      vmull.u32       q7,d25,d0
++      vadd.i32        d20,d20,d10
++      vmull.u32       q5,d21,d0
++      vadd.i32        d26,d26,d16
++      vmull.u32       q8,d27,d0
++      vadd.i32        d22,d22,d12
++      vmull.u32       q6,d23,d0
++      vadd.i32        d28,d28,d18
++      vmull.u32       q9,d29,d0
++
++      vmlal.u32       q5,d29,d2
++      vld4.32         {d4[1],d5[1],d6[1],d7[1]},[r7]!
++      vmlal.u32       q8,d25,d1
++      vld4.32         {d4[0],d5[0],d6[0],d7[0]},[r6]!
++      vmlal.u32       q6,d21,d1
++      vmlal.u32       q9,d27,d1
++      vmlal.u32       q7,d23,d1
++
++      vmlal.u32       q8,d23,d3
++      vld1.32         d8[1],[r7,:32]
++      vmlal.u32       q5,d27,d4
++      vld1.32         d8[0],[r6,:32]
++      vmlal.u32       q9,d25,d3
++      vmlal.u32       q6,d29,d4
++      vmlal.u32       q7,d21,d3
++
++      vmlal.u32       q8,d21,d5
++       it             ne
++       addne          r7,r0,#(48+2*9*4)
++      vmlal.u32       q5,d25,d6
++       it             ne
++       addne          r6,r0,#(48+3*9*4)
++      vmlal.u32       q9,d23,d5
++      vmlal.u32       q6,d27,d6
++      vmlal.u32       q7,d29,d6
++
++      vmlal.u32       q8,d29,d8
++       vorn           q0,q0,q0        @ all-ones, can be redundant
++      vmlal.u32       q5,d23,d8
++       vshr.u64       q0,q0,#38
++      vmlal.u32       q9,d21,d7
++      vmlal.u32       q6,d25,d8
++      vmlal.u32       q7,d27,d8
++
++      beq             .Lshort_tail
++
++      @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
++      @ (hash+inp[0:1])*r^4:r^3 and accumulate
++
++      vld4.32         {d0[1],d1[1],d2[1],d3[1]},[r7]! @ load r^3
++      vld4.32         {d0[0],d1[0],d2[0],d3[0]},[r6]! @ load r^4
++
++      vmlal.u32       q7,d24,d0
++      vmlal.u32       q5,d20,d0
++      vmlal.u32       q8,d26,d0
++      vmlal.u32       q6,d22,d0
++      vmlal.u32       q9,d28,d0
++
++      vmlal.u32       q5,d28,d2
++      vld4.32         {d4[1],d5[1],d6[1],d7[1]},[r7]!
++      vmlal.u32       q8,d24,d1
++      vld4.32         {d4[0],d5[0],d6[0],d7[0]},[r6]!
++      vmlal.u32       q6,d20,d1
++      vmlal.u32       q9,d26,d1
++      vmlal.u32       q7,d22,d1
++
++      vmlal.u32       q8,d22,d3
++      vld1.32         d8[1],[r7,:32]
++      vmlal.u32       q5,d26,d4
++      vld1.32         d8[0],[r6,:32]
++      vmlal.u32       q9,d24,d3
++      vmlal.u32       q6,d28,d4
++      vmlal.u32       q7,d20,d3
++
++      vmlal.u32       q8,d20,d5
++      vmlal.u32       q5,d24,d6
++      vmlal.u32       q9,d22,d5
++      vmlal.u32       q6,d26,d6
++      vmlal.u32       q7,d28,d6
++
++      vmlal.u32       q8,d28,d8
++       vorn           q0,q0,q0        @ all-ones
++      vmlal.u32       q5,d22,d8
++       vshr.u64       q0,q0,#38
++      vmlal.u32       q9,d20,d7
++      vmlal.u32       q6,d24,d8
++      vmlal.u32       q7,d26,d8
++
++.Lshort_tail:
++      @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
++      @ horizontal addition
++
++      vadd.i64        d16,d16,d17
++      vadd.i64        d10,d10,d11
++      vadd.i64        d18,d18,d19
++      vadd.i64        d12,d12,d13
++      vadd.i64        d14,d14,d15
++
++      @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
++      @ lazy reduction, but without narrowing
++
++      vshr.u64        q15,q8,#26
++      vand.i64        q8,q8,q0
++       vshr.u64       q4,q5,#26
++       vand.i64       q5,q5,q0
++      vadd.i64        q9,q9,q15               @ h3 -> h4
++       vadd.i64       q6,q6,q4                @ h0 -> h1
++
++      vshr.u64        q15,q9,#26
++      vand.i64        q9,q9,q0
++       vshr.u64       q4,q6,#26
++       vand.i64       q6,q6,q0
++       vadd.i64       q7,q7,q4                @ h1 -> h2
++
++      vadd.i64        q5,q5,q15
++      vshl.u64        q15,q15,#2
++       vshr.u64       q4,q7,#26
++       vand.i64       q7,q7,q0
++      vadd.i64        q5,q5,q15               @ h4 -> h0
++       vadd.i64       q8,q8,q4                @ h2 -> h3
++
++      vshr.u64        q15,q5,#26
++      vand.i64        q5,q5,q0
++       vshr.u64       q4,q8,#26
++       vand.i64       q8,q8,q0
++      vadd.i64        q6,q6,q15               @ h0 -> h1
++       vadd.i64       q9,q9,q4                @ h3 -> h4
++
++      cmp             r2,#0
++      bne             .Leven
++
++      @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
++      @ store hash value
++
++      vst4.32         {d10[0],d12[0],d14[0],d16[0]},[r0]!
++      vst1.32         {d18[0]},[r0]
++
++      vldmia  sp!,{d8-d15}                    @ epilogue
++      ldmia   sp!,{r4-r7}
++      bx      lr                                      @ bx    lr
++.size poly1305_blocks_neon,.-poly1305_blocks_neon
++
++.align        5
++.Lzeros:
++.long 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
++#ifndef       __KERNEL__
++.LOPENSSL_armcap:
++# ifdef       _WIN32
++.word OPENSSL_armcap_P
++# else
++.word OPENSSL_armcap_P-.Lpoly1305_init
++# endif
++.comm OPENSSL_armcap_P,4,4
++.hidden       OPENSSL_armcap_P
++#endif
++#endif
++.asciz        "Poly1305 for ARMv4/NEON, CRYPTOGAMS by @dot-asm"
++.align        2
+--- /dev/null
++++ b/arch/arm/crypto/poly1305-glue.c
+@@ -0,0 +1,276 @@
++// SPDX-License-Identifier: GPL-2.0
++/*
++ * OpenSSL/Cryptogams accelerated Poly1305 transform for ARM
++ *
++ * Copyright (C) 2019 Linaro Ltd. <ard.biesheuvel@linaro.org>
++ */
++
++#include <asm/hwcap.h>
++#include <asm/neon.h>
++#include <asm/simd.h>
++#include <asm/unaligned.h>
++#include <crypto/algapi.h>
++#include <crypto/internal/hash.h>
++#include <crypto/internal/poly1305.h>
++#include <crypto/internal/simd.h>
++#include <linux/cpufeature.h>
++#include <linux/crypto.h>
++#include <linux/jump_label.h>
++#include <linux/module.h>
++
++void poly1305_init_arm(void *state, const u8 *key);
++void poly1305_blocks_arm(void *state, const u8 *src, u32 len, u32 hibit);
++void poly1305_emit_arm(void *state, __le32 *digest, const u32 *nonce);
++
++void __weak poly1305_blocks_neon(void *state, const u8 *src, u32 len, u32 hibit)
++{
++}
++
++static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
++
++void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 *key)
++{
++      poly1305_init_arm(&dctx->h, key);
++      dctx->s[0] = get_unaligned_le32(key + 16);
++      dctx->s[1] = get_unaligned_le32(key + 20);
++      dctx->s[2] = get_unaligned_le32(key + 24);
++      dctx->s[3] = get_unaligned_le32(key + 28);
++      dctx->buflen = 0;
++}
++EXPORT_SYMBOL(poly1305_init_arch);
++
++static int arm_poly1305_init(struct shash_desc *desc)
++{
++      struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
++
++      dctx->buflen = 0;
++      dctx->rset = 0;
++      dctx->sset = false;
++
++      return 0;
++}
++
++static void arm_poly1305_blocks(struct poly1305_desc_ctx *dctx, const u8 *src,
++                               u32 len, u32 hibit, bool do_neon)
++{
++      if (unlikely(!dctx->sset)) {
++              if (!dctx->rset) {
++                      poly1305_init_arm(&dctx->h, src);
++                      src += POLY1305_BLOCK_SIZE;
++                      len -= POLY1305_BLOCK_SIZE;
++                      dctx->rset = 1;
++              }
++              if (len >= POLY1305_BLOCK_SIZE) {
++                      dctx->s[0] = get_unaligned_le32(src +  0);
++                      dctx->s[1] = get_unaligned_le32(src +  4);
++                      dctx->s[2] = get_unaligned_le32(src +  8);
++                      dctx->s[3] = get_unaligned_le32(src + 12);
++                      src += POLY1305_BLOCK_SIZE;
++                      len -= POLY1305_BLOCK_SIZE;
++                      dctx->sset = true;
++              }
++              if (len < POLY1305_BLOCK_SIZE)
++                      return;
++      }
++
++      len &= ~(POLY1305_BLOCK_SIZE - 1);
++
++      if (static_branch_likely(&have_neon) && likely(do_neon))
++              poly1305_blocks_neon(&dctx->h, src, len, hibit);
++      else
++              poly1305_blocks_arm(&dctx->h, src, len, hibit);
++}
++
++static void arm_poly1305_do_update(struct poly1305_desc_ctx *dctx,
++                                  const u8 *src, u32 len, bool do_neon)
++{
++      if (unlikely(dctx->buflen)) {
++              u32 bytes = min(len, POLY1305_BLOCK_SIZE - dctx->buflen);
++
++              memcpy(dctx->buf + dctx->buflen, src, bytes);
++              src += bytes;
++              len -= bytes;
++              dctx->buflen += bytes;
++
++              if (dctx->buflen == POLY1305_BLOCK_SIZE) {
++                      arm_poly1305_blocks(dctx, dctx->buf,
++                                          POLY1305_BLOCK_SIZE, 1, false);
++                      dctx->buflen = 0;
++              }
++      }
++
++      if (likely(len >= POLY1305_BLOCK_SIZE)) {
++              arm_poly1305_blocks(dctx, src, len, 1, do_neon);
++              src += round_down(len, POLY1305_BLOCK_SIZE);
++              len %= POLY1305_BLOCK_SIZE;
++      }
++
++      if (unlikely(len)) {
++              dctx->buflen = len;
++              memcpy(dctx->buf, src, len);
++      }
++}
++
++static int arm_poly1305_update(struct shash_desc *desc,
++                             const u8 *src, unsigned int srclen)
++{
++      struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
++
++      arm_poly1305_do_update(dctx, src, srclen, false);
++      return 0;
++}
++
++static int __maybe_unused arm_poly1305_update_neon(struct shash_desc *desc,
++                                                 const u8 *src,
++                                                 unsigned int srclen)
++{
++      struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
++      bool do_neon = crypto_simd_usable() && srclen > 128;
++
++      if (static_branch_likely(&have_neon) && do_neon)
++              kernel_neon_begin();
++      arm_poly1305_do_update(dctx, src, srclen, do_neon);
++      if (static_branch_likely(&have_neon) && do_neon)
++              kernel_neon_end();
++      return 0;
++}
++
++void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src,
++                        unsigned int nbytes)
++{
++      bool do_neon = IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
++                     crypto_simd_usable();
++
++      if (unlikely(dctx->buflen)) {
++              u32 bytes = min(nbytes, POLY1305_BLOCK_SIZE - dctx->buflen);
++
++              memcpy(dctx->buf + dctx->buflen, src, bytes);
++              src += bytes;
++              nbytes -= bytes;
++              dctx->buflen += bytes;
++
++              if (dctx->buflen == POLY1305_BLOCK_SIZE) {
++                      poly1305_blocks_arm(&dctx->h, dctx->buf,
++                                          POLY1305_BLOCK_SIZE, 1);
++                      dctx->buflen = 0;
++              }
++      }
++
++      if (likely(nbytes >= POLY1305_BLOCK_SIZE)) {
++              unsigned int len = round_down(nbytes, POLY1305_BLOCK_SIZE);
++
++              if (static_branch_likely(&have_neon) && do_neon) {
++                      kernel_neon_begin();
++                      poly1305_blocks_neon(&dctx->h, src, len, 1);
++                      kernel_neon_end();
++              } else {
++                      poly1305_blocks_arm(&dctx->h, src, len, 1);
++              }
++              src += len;
++              nbytes %= POLY1305_BLOCK_SIZE;
++      }
++
++      if (unlikely(nbytes)) {
++              dctx->buflen = nbytes;
++              memcpy(dctx->buf, src, nbytes);
++      }
++}
++EXPORT_SYMBOL(poly1305_update_arch);
++
++void poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst)
++{
++      __le32 digest[4];
++      u64 f = 0;
++
++      if (unlikely(dctx->buflen)) {
++              dctx->buf[dctx->buflen++] = 1;
++              memset(dctx->buf + dctx->buflen, 0,
++                     POLY1305_BLOCK_SIZE - dctx->buflen);
++              poly1305_blocks_arm(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 0);
++      }
++
++      poly1305_emit_arm(&dctx->h, digest, dctx->s);
++
++      /* mac = (h + s) % (2^128) */
++      f = (f >> 32) + le32_to_cpu(digest[0]);
++      put_unaligned_le32(f, dst);
++      f = (f >> 32) + le32_to_cpu(digest[1]);
++      put_unaligned_le32(f, dst + 4);
++      f = (f >> 32) + le32_to_cpu(digest[2]);
++      put_unaligned_le32(f, dst + 8);
++      f = (f >> 32) + le32_to_cpu(digest[3]);
++      put_unaligned_le32(f, dst + 12);
++
++      *dctx = (struct poly1305_desc_ctx){};
++}
++EXPORT_SYMBOL(poly1305_final_arch);
++
++static int arm_poly1305_final(struct shash_desc *desc, u8 *dst)
++{
++      struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
++
++      if (unlikely(!dctx->sset))
++              return -ENOKEY;
++
++      poly1305_final_arch(dctx, dst);
++      return 0;
++}
++
++static struct shash_alg arm_poly1305_algs[] = {{
++      .init                   = arm_poly1305_init,
++      .update                 = arm_poly1305_update,
++      .final                  = arm_poly1305_final,
++      .digestsize             = POLY1305_DIGEST_SIZE,
++      .descsize               = sizeof(struct poly1305_desc_ctx),
++
++      .base.cra_name          = "poly1305",
++      .base.cra_driver_name   = "poly1305-arm",
++      .base.cra_priority      = 150,
++      .base.cra_blocksize     = POLY1305_BLOCK_SIZE,
++      .base.cra_module        = THIS_MODULE,
++#ifdef CONFIG_KERNEL_MODE_NEON
++}, {
++      .init                   = arm_poly1305_init,
++      .update                 = arm_poly1305_update_neon,
++      .final                  = arm_poly1305_final,
++      .digestsize             = POLY1305_DIGEST_SIZE,
++      .descsize               = sizeof(struct poly1305_desc_ctx),
++
++      .base.cra_name          = "poly1305",
++      .base.cra_driver_name   = "poly1305-neon",
++      .base.cra_priority      = 200,
++      .base.cra_blocksize     = POLY1305_BLOCK_SIZE,
++      .base.cra_module        = THIS_MODULE,
++#endif
++}};
++
++static int __init arm_poly1305_mod_init(void)
++{
++      if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
++          (elf_hwcap & HWCAP_NEON))
++              static_branch_enable(&have_neon);
++      else
++              /* register only the first entry */
++              return crypto_register_shash(&arm_poly1305_algs[0]);
++
++      return crypto_register_shashes(arm_poly1305_algs,
++                                     ARRAY_SIZE(arm_poly1305_algs));
++}
++
++static void __exit arm_poly1305_mod_exit(void)
++{
++      if (!static_branch_likely(&have_neon)) {
++              crypto_unregister_shash(&arm_poly1305_algs[0]);
++              return;
++      }
++      crypto_unregister_shashes(arm_poly1305_algs,
++                                ARRAY_SIZE(arm_poly1305_algs));
++}
++
++module_init(arm_poly1305_mod_init);
++module_exit(arm_poly1305_mod_exit);
++
++MODULE_LICENSE("GPL v2");
++MODULE_ALIAS_CRYPTO("poly1305");
++MODULE_ALIAS_CRYPTO("poly1305-arm");
++MODULE_ALIAS_CRYPTO("poly1305-neon");
+--- a/lib/crypto/Kconfig
++++ b/lib/crypto/Kconfig
+@@ -40,7 +40,7 @@ config CRYPTO_LIB_DES
+ config CRYPTO_LIB_POLY1305_RSIZE
+       int
+       default 4 if X86_64
+-      default 9 if ARM64
++      default 9 if ARM || ARM64
+       default 1
+ 
+ config CRYPTO_ARCH_HAVE_LIB_POLY1305
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0020-crypto-mips-poly1305-incorporate-OpenSSL-CRYPTOGAMS-.patch b/target/linux/generic/backport-5.4/080-wireguard-0020-crypto-mips-poly1305-incorporate-OpenSSL-CRYPTOGAMS-.patch

new file mode 100644 (file)

index 0000000..68cac9c
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0020-crypto-mips-poly1305-incorporate-OpenSSL-CRYPTOGAMS-.patch
@@ -0,0 +1,1563 @@
+From a338793df36990e97ab0b824fad6fbf6ef171f94 Mon Sep 17 00:00:00 2001
+From: Ard Biesheuvel <ardb@kernel.org>
+Date: Fri, 8 Nov 2019 13:22:26 +0100
+Subject: [PATCH 020/124] crypto: mips/poly1305 - incorporate
+ OpenSSL/CRYPTOGAMS optimized implementation
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+commit a11d055e7a64ac34a5e99b6fe731299449cbcd58 upstream.
+
+This is a straight import of the OpenSSL/CRYPTOGAMS Poly1305 implementation for
+MIPS authored by Andy Polyakov, a prior 64-bit only version of which has been
+contributed by him to the OpenSSL project. The file 'poly1305-mips.pl' is taken
+straight from this upstream GitHub repository [0] at commit
+d22ade312a7af958ec955620b0d241cf42c37feb, and already contains all the changes
+required to build it as part of a Linux kernel module.
+
+[0] https://github.com/dot-asm/cryptogams
+
+Co-developed-by: Andy Polyakov <appro@cryptogams.org>
+Signed-off-by: Andy Polyakov <appro@cryptogams.org>
+Co-developed-by: René van Dorst <opensource@vdorst.com>
+Signed-off-by: René van Dorst <opensource@vdorst.com>
+Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
+Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ arch/mips/crypto/Makefile         |   14 +
+ arch/mips/crypto/poly1305-glue.c  |  203 +++++
+ arch/mips/crypto/poly1305-mips.pl | 1273 +++++++++++++++++++++++++++++
+ crypto/Kconfig                    |    5 +
+ lib/crypto/Kconfig                |    1 +
+ 5 files changed, 1496 insertions(+)
+ create mode 100644 arch/mips/crypto/poly1305-glue.c
+ create mode 100644 arch/mips/crypto/poly1305-mips.pl
+
+--- a/arch/mips/crypto/Makefile
++++ b/arch/mips/crypto/Makefile
+@@ -8,3 +8,17 @@ obj-$(CONFIG_CRYPTO_CRC32_MIPS) += crc32
+ obj-$(CONFIG_CRYPTO_CHACHA_MIPS) += chacha-mips.o
+ chacha-mips-y := chacha-core.o chacha-glue.o
+ AFLAGS_chacha-core.o += -O2 # needed to fill branch delay slots
++
++obj-$(CONFIG_CRYPTO_POLY1305_MIPS) += poly1305-mips.o
++poly1305-mips-y := poly1305-core.o poly1305-glue.o
++
++perlasm-flavour-$(CONFIG_CPU_MIPS32) := o32
++perlasm-flavour-$(CONFIG_CPU_MIPS64) := 64
++
++quiet_cmd_perlasm = PERLASM $@
++      cmd_perlasm = $(PERL) $(<) $(perlasm-flavour-y) $(@)
++
++$(obj)/poly1305-core.S: $(src)/poly1305-mips.pl FORCE
++      $(call if_changed,perlasm)
++
++targets += poly1305-core.S
+--- /dev/null
++++ b/arch/mips/crypto/poly1305-glue.c
+@@ -0,0 +1,203 @@
++// SPDX-License-Identifier: GPL-2.0
++/*
++ * OpenSSL/Cryptogams accelerated Poly1305 transform for MIPS
++ *
++ * Copyright (C) 2019 Linaro Ltd. <ard.biesheuvel@linaro.org>
++ */
++
++#include <asm/unaligned.h>
++#include <crypto/algapi.h>
++#include <crypto/internal/hash.h>
++#include <crypto/internal/poly1305.h>
++#include <linux/cpufeature.h>
++#include <linux/crypto.h>
++#include <linux/module.h>
++
++asmlinkage void poly1305_init_mips(void *state, const u8 *key);
++asmlinkage void poly1305_blocks_mips(void *state, const u8 *src, u32 len, u32 hibit);
++asmlinkage void poly1305_emit_mips(void *state, __le32 *digest, const u32 *nonce);
++
++void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 *key)
++{
++      poly1305_init_mips(&dctx->h, key);
++      dctx->s[0] = get_unaligned_le32(key + 16);
++      dctx->s[1] = get_unaligned_le32(key + 20);
++      dctx->s[2] = get_unaligned_le32(key + 24);
++      dctx->s[3] = get_unaligned_le32(key + 28);
++      dctx->buflen = 0;
++}
++EXPORT_SYMBOL(poly1305_init_arch);
++
++static int mips_poly1305_init(struct shash_desc *desc)
++{
++      struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
++
++      dctx->buflen = 0;
++      dctx->rset = 0;
++      dctx->sset = false;
++
++      return 0;
++}
++
++static void mips_poly1305_blocks(struct poly1305_desc_ctx *dctx, const u8 *src,
++                               u32 len, u32 hibit)
++{
++      if (unlikely(!dctx->sset)) {
++              if (!dctx->rset) {
++                      poly1305_init_mips(&dctx->h, src);
++                      src += POLY1305_BLOCK_SIZE;
++                      len -= POLY1305_BLOCK_SIZE;
++                      dctx->rset = 1;
++              }
++              if (len >= POLY1305_BLOCK_SIZE) {
++                      dctx->s[0] = get_unaligned_le32(src +  0);
++                      dctx->s[1] = get_unaligned_le32(src +  4);
++                      dctx->s[2] = get_unaligned_le32(src +  8);
++                      dctx->s[3] = get_unaligned_le32(src + 12);
++                      src += POLY1305_BLOCK_SIZE;
++                      len -= POLY1305_BLOCK_SIZE;
++                      dctx->sset = true;
++              }
++              if (len < POLY1305_BLOCK_SIZE)
++                      return;
++      }
++
++      len &= ~(POLY1305_BLOCK_SIZE - 1);
++
++      poly1305_blocks_mips(&dctx->h, src, len, hibit);
++}
++
++static int mips_poly1305_update(struct shash_desc *desc, const u8 *src,
++                              unsigned int len)
++{
++      struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
++
++      if (unlikely(dctx->buflen)) {
++              u32 bytes = min(len, POLY1305_BLOCK_SIZE - dctx->buflen);
++
++              memcpy(dctx->buf + dctx->buflen, src, bytes);
++              src += bytes;
++              len -= bytes;
++              dctx->buflen += bytes;
++
++              if (dctx->buflen == POLY1305_BLOCK_SIZE) {
++                      mips_poly1305_blocks(dctx, dctx->buf, POLY1305_BLOCK_SIZE, 1);
++                      dctx->buflen = 0;
++              }
++      }
++
++      if (likely(len >= POLY1305_BLOCK_SIZE)) {
++              mips_poly1305_blocks(dctx, src, len, 1);
++              src += round_down(len, POLY1305_BLOCK_SIZE);
++              len %= POLY1305_BLOCK_SIZE;
++      }
++
++      if (unlikely(len)) {
++              dctx->buflen = len;
++              memcpy(dctx->buf, src, len);
++      }
++      return 0;
++}
++
++void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src,
++                        unsigned int nbytes)
++{
++      if (unlikely(dctx->buflen)) {
++              u32 bytes = min(nbytes, POLY1305_BLOCK_SIZE - dctx->buflen);
++
++              memcpy(dctx->buf + dctx->buflen, src, bytes);
++              src += bytes;
++              nbytes -= bytes;
++              dctx->buflen += bytes;
++
++              if (dctx->buflen == POLY1305_BLOCK_SIZE) {
++                      poly1305_blocks_mips(&dctx->h, dctx->buf,
++                                           POLY1305_BLOCK_SIZE, 1);
++                      dctx->buflen = 0;
++              }
++      }
++
++      if (likely(nbytes >= POLY1305_BLOCK_SIZE)) {
++              unsigned int len = round_down(nbytes, POLY1305_BLOCK_SIZE);
++
++              poly1305_blocks_mips(&dctx->h, src, len, 1);
++              src += len;
++              nbytes %= POLY1305_BLOCK_SIZE;
++      }
++
++      if (unlikely(nbytes)) {
++              dctx->buflen = nbytes;
++              memcpy(dctx->buf, src, nbytes);
++      }
++}
++EXPORT_SYMBOL(poly1305_update_arch);
++
++void poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst)
++{
++      __le32 digest[4];
++      u64 f = 0;
++
++      if (unlikely(dctx->buflen)) {
++              dctx->buf[dctx->buflen++] = 1;
++              memset(dctx->buf + dctx->buflen, 0,
++                     POLY1305_BLOCK_SIZE - dctx->buflen);
++              poly1305_blocks_mips(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 0);
++      }
++
++      poly1305_emit_mips(&dctx->h, digest, dctx->s);
++
++      /* mac = (h + s) % (2^128) */
++      f = (f >> 32) + le32_to_cpu(digest[0]);
++      put_unaligned_le32(f, dst);
++      f = (f >> 32) + le32_to_cpu(digest[1]);
++      put_unaligned_le32(f, dst + 4);
++      f = (f >> 32) + le32_to_cpu(digest[2]);
++      put_unaligned_le32(f, dst + 8);
++      f = (f >> 32) + le32_to_cpu(digest[3]);
++      put_unaligned_le32(f, dst + 12);
++
++      *dctx = (struct poly1305_desc_ctx){};
++}
++EXPORT_SYMBOL(poly1305_final_arch);
++
++static int mips_poly1305_final(struct shash_desc *desc, u8 *dst)
++{
++      struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
++
++      if (unlikely(!dctx->sset))
++              return -ENOKEY;
++
++      poly1305_final_arch(dctx, dst);
++      return 0;
++}
++
++static struct shash_alg mips_poly1305_alg = {
++      .init                   = mips_poly1305_init,
++      .update                 = mips_poly1305_update,
++      .final                  = mips_poly1305_final,
++      .digestsize             = POLY1305_DIGEST_SIZE,
++      .descsize               = sizeof(struct poly1305_desc_ctx),
++
++      .base.cra_name          = "poly1305",
++      .base.cra_driver_name   = "poly1305-mips",
++      .base.cra_priority      = 200,
++      .base.cra_blocksize     = POLY1305_BLOCK_SIZE,
++      .base.cra_module        = THIS_MODULE,
++};
++
++static int __init mips_poly1305_mod_init(void)
++{
++      return crypto_register_shash(&mips_poly1305_alg);
++}
++
++static void __exit mips_poly1305_mod_exit(void)
++{
++      crypto_unregister_shash(&mips_poly1305_alg);
++}
++
++module_init(mips_poly1305_mod_init);
++module_exit(mips_poly1305_mod_exit);
++
++MODULE_LICENSE("GPL v2");
++MODULE_ALIAS_CRYPTO("poly1305");
++MODULE_ALIAS_CRYPTO("poly1305-mips");
+--- /dev/null
++++ b/arch/mips/crypto/poly1305-mips.pl
+@@ -0,0 +1,1273 @@
++#!/usr/bin/env perl
++# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
++#
++# ====================================================================
++# Written by Andy Polyakov, @dot-asm, originally for the OpenSSL
++# project.
++# ====================================================================
++
++# Poly1305 hash for MIPS.
++#
++# May 2016
++#
++# Numbers are cycles per processed byte with poly1305_blocks alone.
++#
++#             IALU/gcc
++# R1x000      ~5.5/+130%      (big-endian)
++# Octeon II   2.50/+70%       (little-endian)
++#
++# March 2019
++#
++# Add 32-bit code path.
++#
++# October 2019
++#
++# Modulo-scheduling the reduction allows omitting the dependency chain
++# at the end of the inner loop, improving performance. Also optimize MIPS32R2
++# code path for MIPS 1004K core. Per René van Dorst's suggestions.
++#
++#             IALU/gcc
++# R1x000      ~9.8/?          (big-endian)
++# Octeon II   3.65/+140%      (little-endian)
++# MT7621/1004K        4.75/?          (little-endian)
++#
++######################################################################
++# There are a number of MIPS ABIs in use; O32 and N32/64 are the most
++# widely used. Then there is a new contender: NUBI. It appears that if
++# one picks the latter, it's possible to arrange code in an ABI-neutral
++# manner. Therefore let's stick to the NUBI register layout:
++#
++($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
++($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
++($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
++($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
++#
++# The return value is placed in $a0. The following coding rules facilitate
++# interoperability:
++#
++# - never ever touch $tp, "thread pointer", former $gp [o32 can be
++#   excluded from the rule, because it's specified volatile];
++# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
++#   old code];
++# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
++#
++# For reference here is register layout for N32/64 MIPS ABIs:
++#
++# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
++# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
++# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
++# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
++# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
++#
++# <appro@openssl.org>
++#
++######################################################################
++
++$flavour = shift || "64"; # supported flavours are o32,n32,64,nubi32,nubi64
++
++$v0 = ($flavour =~ /nubi/i) ? $a0 : $t0;
++
++if ($flavour =~ /64|n32/i) {{{
++######################################################################
++# 64-bit code path
++#
++
++my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
++my ($in0,$in1,$tmp0,$tmp1,$tmp2,$tmp3,$tmp4) = ($a4,$a5,$a6,$a7,$at,$t0,$t1);
++
++$code.=<<___;
++#if (defined(_MIPS_ARCH_MIPS64R3) || defined(_MIPS_ARCH_MIPS64R5) || \\
++     defined(_MIPS_ARCH_MIPS64R6)) \\
++     && !defined(_MIPS_ARCH_MIPS64R2)
++# define _MIPS_ARCH_MIPS64R2
++#endif
++
++#if defined(_MIPS_ARCH_MIPS64R6)
++# define dmultu(rs,rt)
++# define mflo(rd,rs,rt)       dmulu   rd,rs,rt
++# define mfhi(rd,rs,rt)       dmuhu   rd,rs,rt
++#else
++# define dmultu(rs,rt)                dmultu  rs,rt
++# define mflo(rd,rs,rt)       mflo    rd
++# define mfhi(rd,rs,rt)       mfhi    rd
++#endif
++
++#ifdef        __KERNEL__
++# define poly1305_init   poly1305_init_mips
++# define poly1305_blocks poly1305_blocks_mips
++# define poly1305_emit   poly1305_emit_mips
++#endif
++
++#if defined(__MIPSEB__) && !defined(MIPSEB)
++# define MIPSEB
++#endif
++
++#ifdef MIPSEB
++# define MSB 0
++# define LSB 7
++#else
++# define MSB 7
++# define LSB 0
++#endif
++
++.text
++.set  noat
++.set  noreorder
++
++.align        5
++.globl        poly1305_init
++.ent  poly1305_init
++poly1305_init:
++      .frame  $sp,0,$ra
++      .set    reorder
++
++      sd      $zero,0($ctx)
++      sd      $zero,8($ctx)
++      sd      $zero,16($ctx)
++
++      beqz    $inp,.Lno_key
++
++#if defined(_MIPS_ARCH_MIPS64R6)
++      andi    $tmp0,$inp,7            # $inp % 8
++      dsubu   $inp,$inp,$tmp0         # align $inp
++      sll     $tmp0,$tmp0,3           # byte to bit offset
++      ld      $in0,0($inp)
++      ld      $in1,8($inp)
++      beqz    $tmp0,.Laligned_key
++      ld      $tmp2,16($inp)
++
++      subu    $tmp1,$zero,$tmp0
++# ifdef       MIPSEB
++      dsllv   $in0,$in0,$tmp0
++      dsrlv   $tmp3,$in1,$tmp1
++      dsllv   $in1,$in1,$tmp0
++      dsrlv   $tmp2,$tmp2,$tmp1
++# else
++      dsrlv   $in0,$in0,$tmp0
++      dsllv   $tmp3,$in1,$tmp1
++      dsrlv   $in1,$in1,$tmp0
++      dsllv   $tmp2,$tmp2,$tmp1
++# endif
++      or      $in0,$in0,$tmp3
++      or      $in1,$in1,$tmp2
++.Laligned_key:
++#else
++      ldl     $in0,0+MSB($inp)
++      ldl     $in1,8+MSB($inp)
++      ldr     $in0,0+LSB($inp)
++      ldr     $in1,8+LSB($inp)
++#endif
++#ifdef        MIPSEB
++# if defined(_MIPS_ARCH_MIPS64R2)
++      dsbh    $in0,$in0               # byte swap
++       dsbh   $in1,$in1
++      dshd    $in0,$in0
++       dshd   $in1,$in1
++# else
++      ori     $tmp0,$zero,0xFF
++      dsll    $tmp2,$tmp0,32
++      or      $tmp0,$tmp2             # 0x000000FF000000FF
++
++      and     $tmp1,$in0,$tmp0        # byte swap
++       and    $tmp3,$in1,$tmp0
++      dsrl    $tmp2,$in0,24
++       dsrl   $tmp4,$in1,24
++      dsll    $tmp1,24
++       dsll   $tmp3,24
++      and     $tmp2,$tmp0
++       and    $tmp4,$tmp0
++      dsll    $tmp0,8                 # 0x0000FF000000FF00
++      or      $tmp1,$tmp2
++       or     $tmp3,$tmp4
++      and     $tmp2,$in0,$tmp0
++       and    $tmp4,$in1,$tmp0
++      dsrl    $in0,8
++       dsrl   $in1,8
++      dsll    $tmp2,8
++       dsll   $tmp4,8
++      and     $in0,$tmp0
++       and    $in1,$tmp0
++      or      $tmp1,$tmp2
++       or     $tmp3,$tmp4
++      or      $in0,$tmp1
++       or     $in1,$tmp3
++      dsrl    $tmp1,$in0,32
++       dsrl   $tmp3,$in1,32
++      dsll    $in0,32
++       dsll   $in1,32
++      or      $in0,$tmp1
++       or     $in1,$tmp3
++# endif
++#endif
++      li      $tmp0,1
++      dsll    $tmp0,32                # 0x0000000100000000
++      daddiu  $tmp0,-63               # 0x00000000ffffffc1
++      dsll    $tmp0,28                # 0x0ffffffc10000000
++      daddiu  $tmp0,-1                # 0x0ffffffc0fffffff
++
++      and     $in0,$tmp0
++      daddiu  $tmp0,-3                # 0x0ffffffc0ffffffc
++      and     $in1,$tmp0
++
++      sd      $in0,24($ctx)
++      dsrl    $tmp0,$in1,2
++      sd      $in1,32($ctx)
++      daddu   $tmp0,$in1              # s1 = r1 + (r1 >> 2)
++      sd      $tmp0,40($ctx)
++
++.Lno_key:
++      li      $v0,0                   # return 0
++      jr      $ra
++.end  poly1305_init
++___
++{
++my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x0003f000" : "0x00030000";
++
++my ($h0,$h1,$h2,$r0,$r1,$rs1,$d0,$d1,$d2) =
++   ($s0,$s1,$s2,$s3,$s4,$s5,$in0,$in1,$t2);
++my ($shr,$shl) = ($s6,$s7);           # used on R6
++
++$code.=<<___;
++.align        5
++.globl        poly1305_blocks
++.ent  poly1305_blocks
++poly1305_blocks:                      # public entry: filters out calls with no complete block
++      .set    noreorder
++      dsrl    $len,4                  # number of complete blocks
++      bnez    $len,poly1305_blocks_internal   # at least one block: go do the work
++      nop                             # branch delay slot
++      jr      $ra                     # fewer than 16 bytes: nothing to do
++      nop                             # jump delay slot
++.end  poly1305_blocks
++
++.align        5
++.ent  poly1305_blocks_internal
++poly1305_blocks_internal:             # worker: the length register holds a block count, not bytes
++      .set    noreorder
++#if defined(_MIPS_ARCH_MIPS64R6)
++      .frame  $sp,8*8,$ra
++      .mask   $SAVED_REGS_MASK|0x000c0000,-8
++      dsubu   $sp,8*8                 # R6 frame has two extra slots
++      sd      $s7,56($sp)             # s6/s7 carry the realignment shift counts
++      sd      $s6,48($sp)
++#else
++      .frame  $sp,6*8,$ra
++      .mask   $SAVED_REGS_MASK,-8
++      dsubu   $sp,6*8
++#endif
++      sd      $s5,40($sp)
++      sd      $s4,32($sp)
++___
++$code.=<<___ if ($flavour =~ /nubi/i);        # optimize non-nubi prologue
++      sd      $s3,24($sp)
++      sd      $s2,16($sp)
++      sd      $s1,8($sp)
++      sd      $s0,0($sp)
++___
++$code.=<<___;
++      .set    reorder
++
++#if defined(_MIPS_ARCH_MIPS64R6)
++      andi    $shr,$inp,7
++      dsubu   $inp,$inp,$shr          # align $inp
++      sll     $shr,$shr,3             # byte to bit offset
++      subu    $shl,$zero,$shr         # negated count: variable shifts use it modulo 64
++#endif
++
++      ld      $h0,0($ctx)             # load hash value
++      ld      $h1,8($ctx)
++      ld      $h2,16($ctx)
++
++      ld      $r0,24($ctx)            # load key
++      ld      $r1,32($ctx)
++      ld      $rs1,40($ctx)           # s1 = r1 + (r1 >> 2), precomputed by init
++
++      dsll    $len,4                  # block count back to bytes
++      daddu   $len,$inp               # end of buffer
++      b       .Loop
++
++.align        4
++.Loop:
++#if defined(_MIPS_ARCH_MIPS64R6)
++      ld      $in0,0($inp)            # load input
++      ld      $in1,8($inp)
++      beqz    $shr,.Laligned_inp      # input was already 8-byte aligned
++
++      ld      $tmp2,16($inp)          # third dword supplies the remaining bytes
++# ifdef       MIPSEB
++      dsllv   $in0,$in0,$shr
++      dsrlv   $tmp3,$in1,$shl
++      dsllv   $in1,$in1,$shr
++      dsrlv   $tmp2,$tmp2,$shl
++# else
++      dsrlv   $in0,$in0,$shr
++      dsllv   $tmp3,$in1,$shl
++      dsrlv   $in1,$in1,$shr
++      dsllv   $tmp2,$tmp2,$shl
++# endif
++      or      $in0,$in0,$tmp3         # splice the shifted pieces together
++      or      $in1,$in1,$tmp2
++.Laligned_inp:
++#else
++      ldl     $in0,0+MSB($inp)        # load input
++      ldl     $in1,8+MSB($inp)
++      ldr     $in0,0+LSB($inp)
++      ldr     $in1,8+LSB($inp)
++#endif
++      daddiu  $inp,16                 # advance to the next block
++#ifdef        MIPSEB
++# if defined(_MIPS_ARCH_MIPS64R2)
++      dsbh    $in0,$in0               # byte swap
++       dsbh   $in1,$in1
++      dshd    $in0,$in0
++       dshd   $in1,$in1
++# else
++      ori     $tmp0,$zero,0xFF
++      dsll    $tmp2,$tmp0,32
++      or      $tmp0,$tmp2             # 0x000000FF000000FF
++
++      and     $tmp1,$in0,$tmp0        # byte swap
++       and    $tmp3,$in1,$tmp0
++      dsrl    $tmp2,$in0,24
++       dsrl   $tmp4,$in1,24
++      dsll    $tmp1,24
++       dsll   $tmp3,24
++      and     $tmp2,$tmp0
++       and    $tmp4,$tmp0
++      dsll    $tmp0,8                 # 0x0000FF000000FF00
++      or      $tmp1,$tmp2
++       or     $tmp3,$tmp4
++      and     $tmp2,$in0,$tmp0
++       and    $tmp4,$in1,$tmp0
++      dsrl    $in0,8
++       dsrl   $in1,8
++      dsll    $tmp2,8
++       dsll   $tmp4,8
++      and     $in0,$tmp0
++       and    $in1,$tmp0
++      or      $tmp1,$tmp2
++       or     $tmp3,$tmp4
++      or      $in0,$tmp1
++       or     $in1,$tmp3
++      dsrl    $tmp1,$in0,32           # finally exchange the two 32-bit halves
++       dsrl   $tmp3,$in1,32
++      dsll    $in0,32
++       dsll   $in1,32
++      or      $in0,$tmp1
++       or     $in1,$tmp3
++# endif
++#endif
++      dsrl    $tmp1,$h2,2             # modulo-scheduled reduction
++      andi    $h2,$h2,3               # keep only the low two bits of h2
++      dsll    $tmp0,$tmp1,2
++
++      daddu   $d0,$h0,$in0            # accumulate input
++       daddu  $tmp1,$tmp0             # residue = 5 * (h2 >> 2)
++      sltu    $tmp0,$d0,$h0           # carry out of h0 + in0
++      daddu   $d0,$d0,$tmp1           # ... and residue
++      sltu    $tmp1,$d0,$tmp1
++      daddu   $d1,$h1,$in1
++      daddu   $tmp0,$tmp1
++      sltu    $tmp1,$d1,$h1
++      daddu   $d1,$tmp0
++
++      dmultu  ($r0,$d0)               # h0*r0
++       daddu  $d2,$h2,$padbit         # pad bit enters the top limb
++       sltu   $tmp0,$d1,$tmp0
++      mflo    ($h0,$r0,$d0)
++      mfhi    ($h1,$r0,$d0)
++
++      dmultu  ($rs1,$d1)              # h1*5*r1
++       daddu  $d2,$tmp1
++       daddu  $d2,$tmp0
++      mflo    ($tmp0,$rs1,$d1)
++      mfhi    ($tmp1,$rs1,$d1)
++
++      dmultu  ($r1,$d0)               # h0*r1
++      mflo    ($tmp2,$r1,$d0)
++      mfhi    ($h2,$r1,$d0)
++       daddu  $h0,$tmp0
++       daddu  $h1,$tmp1
++       sltu   $tmp0,$h0,$tmp0
++
++      dmultu  ($r0,$d1)               # h1*r0
++       daddu  $h1,$tmp0
++       daddu  $h1,$tmp2
++      mflo    ($tmp0,$r0,$d1)
++      mfhi    ($tmp1,$r0,$d1)
++
++      dmultu  ($rs1,$d2)              # h2*5*r1
++       sltu   $tmp2,$h1,$tmp2
++       daddu  $h2,$tmp2
++      mflo    ($tmp2,$rs1,$d2)
++
++      dmultu  ($r0,$d2)               # h2*r0
++       daddu  $h1,$tmp0
++       daddu  $h2,$tmp1
++      mflo    ($tmp3,$r0,$d2)
++       sltu   $tmp0,$h1,$tmp0
++       daddu  $h2,$tmp0
++
++      daddu   $h1,$tmp2
++      sltu    $tmp2,$h1,$tmp2
++      daddu   $h2,$tmp2
++      daddu   $h2,$tmp3
++
++      bne     $inp,$len,.Loop         # more blocks left
++
++      sd      $h0,0($ctx)             # store hash value
++      sd      $h1,8($ctx)
++      sd      $h2,16($ctx)
++
++      .set    noreorder
++#if defined(_MIPS_ARCH_MIPS64R6)
++      ld      $s7,56($sp)
++      ld      $s6,48($sp)
++#endif
++      ld      $s5,40($sp)             # epilogue
++      ld      $s4,32($sp)
++___
++$code.=<<___ if ($flavour =~ /nubi/i);        # optimize non-nubi epilogue
++      ld      $s3,24($sp)
++      ld      $s2,16($sp)
++      ld      $s1,8($sp)
++      ld      $s0,0($sp)
++___
++$code.=<<___;
++      jr      $ra
++#if defined(_MIPS_ARCH_MIPS64R6)
++      daddu   $sp,8*8                 # delay slot: release the frame
++#else
++      daddu   $sp,6*8                 # delay slot: release the frame
++#endif
++.end  poly1305_blocks_internal
++___
++}
++{
++my ($ctx,$mac,$nonce) = ($a0,$a1,$a2);
++
++$code.=<<___;
++.align        5
++.globl        poly1305_emit
++.ent  poly1305_emit
++poly1305_emit:                        # finish the hash, add the nonce, store 16 bytes
++      .frame  $sp,0,$ra
++      .set    reorder
++
++      ld      $tmp2,16($ctx)          # h2 (top limb)
++      ld      $tmp0,0($ctx)
++      ld      $tmp1,8($ctx)
++
++      li      $in0,-4                 # final reduction
++      dsrl    $in1,$tmp2,2
++      and     $in0,$tmp2
++      andi    $tmp2,$tmp2,3
++      daddu   $in0,$in1               # in0 = 5 * (h2 >> 2)
++
++      daddu   $tmp0,$tmp0,$in0        # fold it into the low limb
++      sltu    $in1,$tmp0,$in0
++       daddiu $in0,$tmp0,5            # compare to modulus
++      daddu   $tmp1,$tmp1,$in1
++       sltiu  $tmp3,$in0,5            # carry out of h0 + 5
++      sltu    $tmp4,$tmp1,$in1
++       daddu  $in1,$tmp1,$tmp3
++      daddu   $tmp2,$tmp2,$tmp4
++       sltu   $tmp3,$in1,$tmp3
++       daddu  $tmp2,$tmp2,$tmp3
++
++      dsrl    $tmp2,2                 # see if it carried/borrowed
++      dsubu   $tmp2,$zero,$tmp2       # 0 -> 0, 1 -> all ones: select mask
++
++      xor     $in0,$tmp0              # branch-free select: h + 5 where the mask is set, else h
++      xor     $in1,$tmp1
++      and     $in0,$tmp2
++      and     $in1,$tmp2
++      xor     $in0,$tmp0
++      xor     $in1,$tmp1
++
++      lwu     $tmp0,0($nonce)         # load nonce
++      lwu     $tmp1,4($nonce)
++      lwu     $tmp2,8($nonce)
++      lwu     $tmp3,12($nonce)
++      dsll    $tmp1,32                # pair the 32-bit words into two 64-bit values
++      dsll    $tmp3,32
++      or      $tmp0,$tmp1
++      or      $tmp2,$tmp3
++
++      daddu   $in0,$tmp0              # accumulate nonce
++      daddu   $in1,$tmp2
++      sltu    $tmp0,$in0,$tmp0        # carry from the low half
++      daddu   $in1,$tmp0
++
++      dsrl    $tmp0,$in0,8            # write mac value
++      dsrl    $tmp1,$in0,16
++      dsrl    $tmp2,$in0,24
++      sb      $in0,0($mac)            # bytes go out least significant first
++      dsrl    $tmp3,$in0,32
++      sb      $tmp0,1($mac)
++      dsrl    $tmp0,$in0,40
++      sb      $tmp1,2($mac)
++      dsrl    $tmp1,$in0,48
++      sb      $tmp2,3($mac)
++      dsrl    $tmp2,$in0,56
++      sb      $tmp3,4($mac)
++      dsrl    $tmp3,$in1,8
++      sb      $tmp0,5($mac)
++      dsrl    $tmp0,$in1,16
++      sb      $tmp1,6($mac)
++      dsrl    $tmp1,$in1,24
++      sb      $tmp2,7($mac)
++
++      sb      $in1,8($mac)
++      dsrl    $tmp2,$in1,32
++      sb      $tmp3,9($mac)
++      dsrl    $tmp3,$in1,40
++      sb      $tmp0,10($mac)
++      dsrl    $tmp0,$in1,48
++      sb      $tmp1,11($mac)
++      dsrl    $tmp1,$in1,56
++      sb      $tmp2,12($mac)
++      sb      $tmp3,13($mac)
++      sb      $tmp0,14($mac)
++      sb      $tmp1,15($mac)
++
++      jr      $ra
++.end  poly1305_emit
++.rdata
++.asciiz       "Poly1305 for MIPS64, CRYPTOGAMS by \@dot-asm"
++.align        2
++___
++}
++}}} else {{{
++######################################################################
++# 32-bit code path
++#
++
++my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
++my ($in0,$in1,$in2,$in3,$tmp0,$tmp1,$tmp2,$tmp3) =
++   ($a4,$a5,$a6,$a7,$at,$t0,$t1,$t2);
++
++$code.=<<___;
++#if (defined(_MIPS_ARCH_MIPS32R3) || defined(_MIPS_ARCH_MIPS32R5) || \\
++     defined(_MIPS_ARCH_MIPS32R6)) \\
++     && !defined(_MIPS_ARCH_MIPS32R2)
++# define _MIPS_ARCH_MIPS32R2
++#endif
++
++#if defined(_MIPS_ARCH_MIPS32R6)
++# define multu(rs,rt)
++# define mflo(rd,rs,rt)       mulu    rd,rs,rt
++# define mfhi(rd,rs,rt)       muhu    rd,rs,rt
++#else
++# define multu(rs,rt) multu   rs,rt
++# define mflo(rd,rs,rt)       mflo    rd
++# define mfhi(rd,rs,rt)       mfhi    rd
++#endif
++
++#ifdef        __KERNEL__
++# define poly1305_init   poly1305_init_mips
++# define poly1305_blocks poly1305_blocks_mips
++# define poly1305_emit   poly1305_emit_mips
++#endif
++
++#if defined(__MIPSEB__) && !defined(MIPSEB)
++# define MIPSEB
++#endif
++
++#ifdef MIPSEB
++# define MSB 0
++# define LSB 3
++#else
++# define MSB 3
++# define LSB 0
++#endif
++
++.text
++.set  noat
++.set  noreorder
++
++.align        5
++.globl        poly1305_init
++.ent  poly1305_init
++poly1305_init:                        # zero the hash; if a key is given, store clamped r0..r3 and s1..s3
++      .frame  $sp,0,$ra
++      .set    reorder
++
++      sw      $zero,0($ctx)           # clear the five hash words
++      sw      $zero,4($ctx)
++      sw      $zero,8($ctx)
++      sw      $zero,12($ctx)
++      sw      $zero,16($ctx)
++
++      beqz    $inp,.Lno_key           # null key pointer: leave the key words untouched
++
++#if defined(_MIPS_ARCH_MIPS32R6)
++      andi    $tmp0,$inp,3            # $inp % 4
++      subu    $inp,$inp,$tmp0         # align $inp
++      sll     $tmp0,$tmp0,3           # byte to bit offset
++      lw      $in0,0($inp)
++      lw      $in1,4($inp)
++      lw      $in2,8($inp)
++      lw      $in3,12($inp)
++      beqz    $tmp0,.Laligned_key     # key was already word aligned
++
++      lw      $tmp2,16($inp)          # fifth word supplies the remaining bytes
++      subu    $tmp1,$zero,$tmp0       # negated count for the complementary shift
++# ifdef       MIPSEB
++      sllv    $in0,$in0,$tmp0
++      srlv    $tmp3,$in1,$tmp1
++      sllv    $in1,$in1,$tmp0
++      or      $in0,$in0,$tmp3
++      srlv    $tmp3,$in2,$tmp1
++      sllv    $in2,$in2,$tmp0
++      or      $in1,$in1,$tmp3
++      srlv    $tmp3,$in3,$tmp1
++      sllv    $in3,$in3,$tmp0
++      or      $in2,$in2,$tmp3
++      srlv    $tmp2,$tmp2,$tmp1
++      or      $in3,$in3,$tmp2
++# else
++      srlv    $in0,$in0,$tmp0
++      sllv    $tmp3,$in1,$tmp1
++      srlv    $in1,$in1,$tmp0
++      or      $in0,$in0,$tmp3
++      sllv    $tmp3,$in2,$tmp1
++      srlv    $in2,$in2,$tmp0
++      or      $in1,$in1,$tmp3
++      sllv    $tmp3,$in3,$tmp1
++      srlv    $in3,$in3,$tmp0
++      or      $in2,$in2,$tmp3
++      sllv    $tmp2,$tmp2,$tmp1
++      or      $in3,$in3,$tmp2
++# endif
++.Laligned_key:
++#else
++      lwl     $in0,0+MSB($inp)        # unaligned word loads
++      lwl     $in1,4+MSB($inp)
++      lwl     $in2,8+MSB($inp)
++      lwl     $in3,12+MSB($inp)
++      lwr     $in0,0+LSB($inp)
++      lwr     $in1,4+LSB($inp)
++      lwr     $in2,8+LSB($inp)
++      lwr     $in3,12+LSB($inp)
++#endif
++#ifdef        MIPSEB
++# if defined(_MIPS_ARCH_MIPS32R2)
++      wsbh    $in0,$in0               # byte swap
++      wsbh    $in1,$in1
++      wsbh    $in2,$in2
++      wsbh    $in3,$in3
++      rotr    $in0,$in0,16
++      rotr    $in1,$in1,16
++      rotr    $in2,$in2,16
++      rotr    $in3,$in3,16
++# else
++      srl     $tmp0,$in0,24           # byte swap
++      srl     $tmp1,$in0,8
++      andi    $tmp2,$in0,0xFF00
++      sll     $in0,$in0,24
++      andi    $tmp1,0xFF00
++      sll     $tmp2,$tmp2,8
++      or      $in0,$tmp0
++       srl    $tmp0,$in1,24
++      or      $tmp1,$tmp2
++       srl    $tmp2,$in1,8
++      or      $in0,$tmp1
++       andi   $tmp1,$in1,0xFF00
++       sll    $in1,$in1,24
++       andi   $tmp2,0xFF00
++       sll    $tmp1,$tmp1,8
++       or     $in1,$tmp0
++      srl     $tmp0,$in2,24
++       or     $tmp2,$tmp1
++      srl     $tmp1,$in2,8
++       or     $in1,$tmp2
++      andi    $tmp2,$in2,0xFF00
++      sll     $in2,$in2,24
++      andi    $tmp1,0xFF00
++      sll     $tmp2,$tmp2,8
++      or      $in2,$tmp0
++       srl    $tmp0,$in3,24
++      or      $tmp1,$tmp2
++       srl    $tmp2,$in3,8
++      or      $in2,$tmp1
++       andi   $tmp1,$in3,0xFF00
++       sll    $in3,$in3,24
++       andi   $tmp2,0xFF00
++       sll    $tmp1,$tmp1,8
++       or     $in3,$tmp0
++       or     $tmp2,$tmp1
++       or     $in3,$tmp2
++# endif
++#endif
++      lui     $tmp0,0x0fff
++      ori     $tmp0,0xffff            # 0x0fffffff
++      and     $in0,$in0,$tmp0         # clamp r0
++      subu    $tmp0,3                 # 0x0ffffffc
++      and     $in1,$in1,$tmp0         # clamp r1..r3
++      and     $in2,$in2,$tmp0
++      and     $in3,$in3,$tmp0
++
++      sw      $in0,20($ctx)           # store r0..r3
++      sw      $in1,24($ctx)
++      sw      $in2,28($ctx)
++      sw      $in3,32($ctx)
++
++      srl     $tmp1,$in1,2
++      srl     $tmp2,$in2,2
++      srl     $tmp3,$in3,2
++      addu    $in1,$in1,$tmp1         # s1 = r1 + (r1 >> 2)
++      addu    $in2,$in2,$tmp2
++      addu    $in3,$in3,$tmp3
++      sw      $in1,36($ctx)           # store s1..s3
++      sw      $in2,40($ctx)
++      sw      $in3,44($ctx)
++.Lno_key:
++      li      $v0,0                   # return 0
++      jr      $ra
++.end  poly1305_init
++___
++{
++my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x00fff000" : "0x00ff0000";
++
++my ($h0,$h1,$h2,$h3,$h4, $r0,$r1,$r2,$r3, $rs1,$rs2,$rs3) =
++   ($s0,$s1,$s2,$s3,$s4, $s5,$s6,$s7,$s8, $s9,$s10,$s11);
++my ($d0,$d1,$d2,$d3) =
++   ($a4,$a5,$a6,$a7);
++my $shr = $t2;                # used on R6
++my $one = $t2;                # used on R2
++
++$code.=<<___;
++.globl        poly1305_blocks
++.align        5
++.ent  poly1305_blocks
++poly1305_blocks:                      # absorb every complete 16-byte block of the input into the hash
++      .frame  $sp,16*4,$ra            # NOTE(review): declares 16*4 bytes, but 4*12 are allocated below - confirm
++      .mask   $SAVED_REGS_MASK,-4
++      .set    noreorder
++      subu    $sp, $sp,4*12           # 12 word slots; slot n belongs to register sn
++      sw      $s11,4*11($sp)
++      sw      $s10,4*10($sp)
++      sw      $s9, 4*9($sp)
++      sw      $s8, 4*8($sp)
++      sw      $s7, 4*7($sp)
++      sw      $s6, 4*6($sp)
++      sw      $s5, 4*5($sp)
++      sw      $s4, 4*4($sp)
++___
++$code.=<<___ if ($flavour =~ /nubi/i);        # optimize non-nubi prologue
++      sw      $s3, 4*3($sp)
++      sw      $s2, 4*2($sp)
++      sw      $s1, 4*1($sp)
++      sw      $s0, 4*0($sp)
++___
++$code.=<<___;
++      .set    reorder
++
++      srl     $len,4                  # number of complete blocks
++      li      $one,1                  # constant 1 for the hi*1 accumulate steps (R2 path)
++      beqz    $len,.Labort            # no complete block: restore and return
++
++#if defined(_MIPS_ARCH_MIPS32R6)
++      andi    $shr,$inp,3
++      subu    $inp,$inp,$shr          # align $inp
++      sll     $shr,$shr,3             # byte to bit offset
++#endif
++
++      lw      $h0,0($ctx)             # load hash value
++      lw      $h1,4($ctx)
++      lw      $h2,8($ctx)
++      lw      $h3,12($ctx)
++      lw      $h4,16($ctx)
++
++      lw      $r0,20($ctx)            # load key
++      lw      $r1,24($ctx)
++      lw      $r2,28($ctx)
++      lw      $r3,32($ctx)
++      lw      $rs1,36($ctx)           # s1..s3 = r + (r >> 2), precomputed by init
++      lw      $rs2,40($ctx)
++      lw      $rs3,44($ctx)
++
++      sll     $len,4                  # block count back to bytes
++      addu    $len,$len,$inp          # end of buffer
++      b       .Loop
++
++.align        4
++.Loop:
++#if defined(_MIPS_ARCH_MIPS32R6)
++      lw      $d0,0($inp)             # load input
++      lw      $d1,4($inp)
++      lw      $d2,8($inp)
++      lw      $d3,12($inp)
++      beqz    $shr,.Laligned_inp      # input was already word aligned
++
++      lw      $t0,16($inp)            # fifth word supplies the remaining bytes
++      subu    $t1,$zero,$shr          # negated count for the complementary shift
++# ifdef       MIPSEB
++      sllv    $d0,$d0,$shr
++      srlv    $at,$d1,$t1
++      sllv    $d1,$d1,$shr
++      or      $d0,$d0,$at
++      srlv    $at,$d2,$t1
++      sllv    $d2,$d2,$shr
++      or      $d1,$d1,$at
++      srlv    $at,$d3,$t1
++      sllv    $d3,$d3,$shr
++      or      $d2,$d2,$at
++      srlv    $t0,$t0,$t1
++      or      $d3,$d3,$t0
++# else
++      srlv    $d0,$d0,$shr
++      sllv    $at,$d1,$t1
++      srlv    $d1,$d1,$shr
++      or      $d0,$d0,$at
++      sllv    $at,$d2,$t1
++      srlv    $d2,$d2,$shr
++      or      $d1,$d1,$at
++      sllv    $at,$d3,$t1
++      srlv    $d3,$d3,$shr
++      or      $d2,$d2,$at
++      sllv    $t0,$t0,$t1
++      or      $d3,$d3,$t0
++# endif
++.Laligned_inp:
++#else
++      lwl     $d0,0+MSB($inp)         # load input
++      lwl     $d1,4+MSB($inp)
++      lwl     $d2,8+MSB($inp)
++      lwl     $d3,12+MSB($inp)
++      lwr     $d0,0+LSB($inp)
++      lwr     $d1,4+LSB($inp)
++      lwr     $d2,8+LSB($inp)
++      lwr     $d3,12+LSB($inp)
++#endif
++#ifdef        MIPSEB
++# if defined(_MIPS_ARCH_MIPS32R2)
++      wsbh    $d0,$d0                 # byte swap
++      wsbh    $d1,$d1
++      wsbh    $d2,$d2
++      wsbh    $d3,$d3
++      rotr    $d0,$d0,16
++      rotr    $d1,$d1,16
++      rotr    $d2,$d2,16
++      rotr    $d3,$d3,16
++# else
++      srl     $at,$d0,24              # byte swap
++      srl     $t0,$d0,8
++      andi    $t1,$d0,0xFF00
++      sll     $d0,$d0,24
++      andi    $t0,0xFF00
++      sll     $t1,$t1,8
++      or      $d0,$at
++       srl    $at,$d1,24
++      or      $t0,$t1
++       srl    $t1,$d1,8
++      or      $d0,$t0
++       andi   $t0,$d1,0xFF00
++       sll    $d1,$d1,24
++       andi   $t1,0xFF00
++       sll    $t0,$t0,8
++       or     $d1,$at
++      srl     $at,$d2,24
++       or     $t1,$t0
++      srl     $t0,$d2,8
++       or     $d1,$t1
++      andi    $t1,$d2,0xFF00
++      sll     $d2,$d2,24
++      andi    $t0,0xFF00
++      sll     $t1,$t1,8
++      or      $d2,$at
++       srl    $at,$d3,24
++      or      $t0,$t1
++       srl    $t1,$d3,8
++      or      $d2,$t0
++       andi   $t0,$d3,0xFF00
++       sll    $d3,$d3,24
++       andi   $t1,0xFF00
++       sll    $t0,$t0,8
++       or     $d3,$at
++       or     $t1,$t0
++       or     $d3,$t1
++# endif
++#endif
++      srl     $t0,$h4,2               # modulo-scheduled reduction
++      andi    $h4,$h4,3               # keep only the low two bits of h4
++      sll     $at,$t0,2
++
++      addu    $d0,$d0,$h0             # accumulate input
++       addu   $t0,$t0,$at             # residue = 5 * (h4 >> 2)
++      sltu    $h0,$d0,$h0             # h0..h3 are reused as carry flags from here
++      addu    $d0,$d0,$t0             # ... and residue
++      sltu    $at,$d0,$t0
++
++      addu    $d1,$d1,$h1
++       addu   $h0,$h0,$at             # carry
++      sltu    $h1,$d1,$h1
++      addu    $d1,$d1,$h0
++      sltu    $h0,$d1,$h0
++
++      addu    $d2,$d2,$h2
++       addu   $h1,$h1,$h0             # carry
++      sltu    $h2,$d2,$h2
++      addu    $d2,$d2,$h1
++      sltu    $h1,$d2,$h1
++
++      addu    $d3,$d3,$h3
++       addu   $h2,$h2,$h1             # carry
++      sltu    $h3,$d3,$h3
++      addu    $d3,$d3,$h2
++
++#if defined(_MIPS_ARCH_MIPS32R2) && !defined(_MIPS_ARCH_MIPS32R6)
++      multu   $r0,$d0                 # d0*r0
++       sltu   $h2,$d3,$h2
++      maddu   $rs3,$d1                # d1*s3
++       addu   $h3,$h3,$h2             # carry
++      maddu   $rs2,$d2                # d2*s2
++       addu   $h4,$h4,$padbit
++      maddu   $rs1,$d3                # d3*s1
++       addu   $h4,$h4,$h3
++      mfhi    $at                     # high word is carried into the next column
++      mflo    $h0
++
++      multu   $r1,$d0                 # d0*r1
++      maddu   $r0,$d1                 # d1*r0
++      maddu   $rs3,$d2                # d2*s3
++      maddu   $rs2,$d3                # d3*s2
++      maddu   $rs1,$h4                # h4*s1
++      maddu   $at,$one                # hi*1
++      mfhi    $at
++      mflo    $h1
++
++      multu   $r2,$d0                 # d0*r2
++      maddu   $r1,$d1                 # d1*r1
++      maddu   $r0,$d2                 # d2*r0
++      maddu   $rs3,$d3                # d3*s3
++      maddu   $rs2,$h4                # h4*s2
++      maddu   $at,$one                # hi*1
++      mfhi    $at
++      mflo    $h2
++
++      mul     $t0,$r0,$h4             # h4*r0
++
++      multu   $r3,$d0                 # d0*r3
++      maddu   $r2,$d1                 # d1*r2
++      maddu   $r1,$d2                 # d2*r1
++      maddu   $r0,$d3                 # d3*r0
++      maddu   $rs3,$h4                # h4*s3
++      maddu   $at,$one                # hi*1
++      mfhi    $at
++      mflo    $h3
++
++       addiu  $inp,$inp,16            # advance to the next block
++
++      addu    $h4,$t0,$at             # top limb = h4*r0 + carried high word
++#else
++      multu   ($r0,$d0)               # d0*r0
++      mflo    ($h0,$r0,$d0)
++      mfhi    ($h1,$r0,$d0)
++
++       sltu   $h2,$d3,$h2
++       addu   $h3,$h3,$h2             # carry
++
++      multu   ($rs3,$d1)              # d1*s3
++      mflo    ($at,$rs3,$d1)
++      mfhi    ($t0,$rs3,$d1)
++
++       addu   $h4,$h4,$padbit
++       addiu  $inp,$inp,16            # advance to the next block
++       addu   $h4,$h4,$h3
++
++      multu   ($rs2,$d2)              # d2*s2
++      mflo    ($a3,$rs2,$d2)          # a3 (the pad bit) is scratch from here on
++      mfhi    ($t1,$rs2,$d2)
++       addu   $h0,$h0,$at
++       addu   $h1,$h1,$t0
++      multu   ($rs1,$d3)              # d3*s1
++       sltu   $at,$h0,$at
++       addu   $h1,$h1,$at
++
++      mflo    ($at,$rs1,$d3)
++      mfhi    ($t0,$rs1,$d3)
++       addu   $h0,$h0,$a3
++       addu   $h1,$h1,$t1
++      multu   ($r1,$d0)               # d0*r1
++       sltu   $a3,$h0,$a3
++       addu   $h1,$h1,$a3
++
++
++      mflo    ($a3,$r1,$d0)
++      mfhi    ($h2,$r1,$d0)
++       addu   $h0,$h0,$at
++       addu   $h1,$h1,$t0
++      multu   ($r0,$d1)               # d1*r0
++       sltu   $at,$h0,$at
++       addu   $h1,$h1,$at
++
++      mflo    ($at,$r0,$d1)
++      mfhi    ($t0,$r0,$d1)
++       addu   $h1,$h1,$a3
++       sltu   $a3,$h1,$a3
++      multu   ($rs3,$d2)              # d2*s3
++       addu   $h2,$h2,$a3
++
++      mflo    ($a3,$rs3,$d2)
++      mfhi    ($t1,$rs3,$d2)
++       addu   $h1,$h1,$at
++       addu   $h2,$h2,$t0
++      multu   ($rs2,$d3)              # d3*s2
++       sltu   $at,$h1,$at
++       addu   $h2,$h2,$at
++
++      mflo    ($at,$rs2,$d3)
++      mfhi    ($t0,$rs2,$d3)
++       addu   $h1,$h1,$a3
++       addu   $h2,$h2,$t1
++      multu   ($rs1,$h4)              # h4*s1
++       sltu   $a3,$h1,$a3
++       addu   $h2,$h2,$a3
++
++      mflo    ($a3,$rs1,$h4)
++       addu   $h1,$h1,$at
++       addu   $h2,$h2,$t0
++      multu   ($r2,$d0)               # d0*r2
++       sltu   $at,$h1,$at
++       addu   $h2,$h2,$at
++
++
++      mflo    ($at,$r2,$d0)
++      mfhi    ($h3,$r2,$d0)
++       addu   $h1,$h1,$a3
++       sltu   $a3,$h1,$a3
++      multu   ($r1,$d1)               # d1*r1
++       addu   $h2,$h2,$a3
++
++      mflo    ($a3,$r1,$d1)
++      mfhi    ($t1,$r1,$d1)
++       addu   $h2,$h2,$at
++       sltu   $at,$h2,$at
++      multu   ($r0,$d2)               # d2*r0
++       addu   $h3,$h3,$at
++
++      mflo    ($at,$r0,$d2)
++      mfhi    ($t0,$r0,$d2)
++       addu   $h2,$h2,$a3
++       addu   $h3,$h3,$t1
++      multu   ($rs3,$d3)              # d3*s3
++       sltu   $a3,$h2,$a3
++       addu   $h3,$h3,$a3
++
++      mflo    ($a3,$rs3,$d3)
++      mfhi    ($t1,$rs3,$d3)
++       addu   $h2,$h2,$at
++       addu   $h3,$h3,$t0
++      multu   ($rs2,$h4)              # h4*s2
++       sltu   $at,$h2,$at
++       addu   $h3,$h3,$at
++
++      mflo    ($at,$rs2,$h4)
++       addu   $h2,$h2,$a3
++       addu   $h3,$h3,$t1
++      multu   ($r3,$d0)               # d0*r3
++       sltu   $a3,$h2,$a3
++       addu   $h3,$h3,$a3
++
++
++      mflo    ($a3,$r3,$d0)
++      mfhi    ($t1,$r3,$d0)
++       addu   $h2,$h2,$at
++       sltu   $at,$h2,$at
++      multu   ($r2,$d1)               # d1*r2
++       addu   $h3,$h3,$at
++
++      mflo    ($at,$r2,$d1)
++      mfhi    ($t0,$r2,$d1)
++       addu   $h3,$h3,$a3
++       sltu   $a3,$h3,$a3
++      multu   ($r0,$d3)               # d3*r0
++       addu   $t1,$t1,$a3
++
++      mflo    ($a3,$r0,$d3)
++      mfhi    ($d3,$r0,$d3)
++       addu   $h3,$h3,$at
++       addu   $t1,$t1,$t0
++      multu   ($r1,$d2)               # d2*r1
++       sltu   $at,$h3,$at
++       addu   $t1,$t1,$at
++
++      mflo    ($at,$r1,$d2)
++      mfhi    ($t0,$r1,$d2)
++       addu   $h3,$h3,$a3
++       addu   $t1,$t1,$d3
++      multu   ($rs3,$h4)              # h4*s3
++       sltu   $a3,$h3,$a3
++       addu   $t1,$t1,$a3
++
++      mflo    ($a3,$rs3,$h4)
++       addu   $h3,$h3,$at
++       addu   $t1,$t1,$t0
++      multu   ($r0,$h4)               # h4*r0
++       sltu   $at,$h3,$at
++       addu   $t1,$t1,$at
++
++
++      mflo    ($h4,$r0,$h4)
++       addu   $h3,$h3,$a3
++       sltu   $a3,$h3,$a3
++       addu   $t1,$t1,$a3
++      addu    $h4,$h4,$t1
++
++      li      $padbit,1               # if we loop, padbit is 1
++#endif
++      bne     $inp,$len,.Loop         # more blocks left
++
++      sw      $h0,0($ctx)             # store hash value
++      sw      $h1,4($ctx)
++      sw      $h2,8($ctx)
++      sw      $h3,12($ctx)
++      sw      $h4,16($ctx)
++
++      .set    noreorder
++.Labort:
++      lw      $s11,4*11($sp)
++      lw      $s10,4*10($sp)
++      lw      $s9, 4*9($sp)
++      lw      $s8, 4*8($sp)
++      lw      $s7, 4*7($sp)
++      lw      $s6, 4*6($sp)
++      lw      $s5, 4*5($sp)
++      lw      $s4, 4*4($sp)
++___
++$code.=<<___ if ($flavour =~ /nubi/i);        # optimize non-nubi epilogue
++      lw      $s3, 4*3($sp)
++      lw      $s2, 4*2($sp)
++      lw      $s1, 4*1($sp)
++      lw      $s0, 4*0($sp)
++___
++$code.=<<___;
++      jr      $ra
++      addu    $sp,$sp,4*12            # delay slot: release the frame
++.end  poly1305_blocks
++___
++}
++{
++my ($ctx,$mac,$nonce,$tmp4) = ($a0,$a1,$a2,$a3);
++
++$code.=<<___;
++.align        5
++.globl        poly1305_emit
++.ent  poly1305_emit
++poly1305_emit:                        # finish the hash, add the nonce, store 16 bytes
++      .frame  $sp,0,$ra
++      .set    reorder
++
++      lw      $tmp4,16($ctx)          # h4 (top limb)
++      lw      $tmp0,0($ctx)
++      lw      $tmp1,4($ctx)
++      lw      $tmp2,8($ctx)
++      lw      $tmp3,12($ctx)
++
++      li      $in0,-4                 # final reduction
++      srl     $ctx,$tmp4,2            # context pointer is dead now; reused as scratch
++      and     $in0,$in0,$tmp4
++      andi    $tmp4,$tmp4,3
++      addu    $ctx,$ctx,$in0          # 5 * (h4 >> 2)
++
++      addu    $tmp0,$tmp0,$ctx        # fold it into the low limb
++      sltu    $ctx,$tmp0,$ctx
++       addiu  $in0,$tmp0,5            # compare to modulus
++      addu    $tmp1,$tmp1,$ctx
++       sltiu  $in1,$in0,5             # carry out of h0 + 5
++      sltu    $ctx,$tmp1,$ctx
++       addu   $in1,$in1,$tmp1
++      addu    $tmp2,$tmp2,$ctx
++       sltu   $in2,$in1,$tmp1
++      sltu    $ctx,$tmp2,$ctx
++       addu   $in2,$in2,$tmp2
++      addu    $tmp3,$tmp3,$ctx
++       sltu   $in3,$in2,$tmp2
++      sltu    $ctx,$tmp3,$ctx
++       addu   $in3,$in3,$tmp3
++      addu    $tmp4,$tmp4,$ctx
++       sltu   $ctx,$in3,$tmp3
++       addu   $ctx,$tmp4
++
++      srl     $ctx,2                  # see if it carried/borrowed
++      subu    $ctx,$zero,$ctx         # 0 -> 0, 1 -> all ones: select mask
++
++      xor     $in0,$tmp0              # branch-free select: h + 5 where the mask is set, else h
++      xor     $in1,$tmp1
++      xor     $in2,$tmp2
++      xor     $in3,$tmp3
++      and     $in0,$ctx
++      and     $in1,$ctx
++      and     $in2,$ctx
++      and     $in3,$ctx
++      xor     $in0,$tmp0
++      xor     $in1,$tmp1
++      xor     $in2,$tmp2
++      xor     $in3,$tmp3
++
++      lw      $tmp0,0($nonce)         # load nonce
++      lw      $tmp1,4($nonce)
++      lw      $tmp2,8($nonce)
++      lw      $tmp3,12($nonce)
++
++      addu    $in0,$tmp0              # accumulate nonce
++      sltu    $ctx,$in0,$tmp0         # carry out of word 0
++
++      addu    $in1,$tmp1
++      sltu    $tmp1,$in1,$tmp1
++      addu    $in1,$ctx
++      sltu    $ctx,$in1,$ctx
++      addu    $ctx,$tmp1
++
++      addu    $in2,$tmp2
++      sltu    $tmp2,$in2,$tmp2
++      addu    $in2,$ctx
++      sltu    $ctx,$in2,$ctx
++      addu    $ctx,$tmp2
++
++      addu    $in3,$tmp3
++      addu    $in3,$ctx               # carry out of the top word is discarded
++
++      srl     $tmp0,$in0,8            # write mac value
++      srl     $tmp1,$in0,16
++      srl     $tmp2,$in0,24
++      sb      $in0, 0($mac)           # bytes go out least significant first
++      sb      $tmp0,1($mac)
++      srl     $tmp0,$in1,8
++      sb      $tmp1,2($mac)
++      srl     $tmp1,$in1,16
++      sb      $tmp2,3($mac)
++      srl     $tmp2,$in1,24
++      sb      $in1, 4($mac)
++      sb      $tmp0,5($mac)
++      srl     $tmp0,$in2,8
++      sb      $tmp1,6($mac)
++      srl     $tmp1,$in2,16
++      sb      $tmp2,7($mac)
++      srl     $tmp2,$in2,24
++      sb      $in2, 8($mac)
++      sb      $tmp0,9($mac)
++      srl     $tmp0,$in3,8
++      sb      $tmp1,10($mac)
++      srl     $tmp1,$in3,16
++      sb      $tmp2,11($mac)
++      srl     $tmp2,$in3,24
++      sb      $in3, 12($mac)
++      sb      $tmp0,13($mac)
++      sb      $tmp1,14($mac)
++      sb      $tmp2,15($mac)
++
++      jr      $ra
++.end  poly1305_emit
++.rdata
++.asciiz       "Poly1305 for MIPS32, CRYPTOGAMS by \@dot-asm"
++.align        2
++___
++}
++}}}
++
++$output=pop and open STDOUT,">$output";       # last command-line argument, if any, names the output file
++print $code;
++close STDOUT;
+--- a/crypto/Kconfig
++++ b/crypto/Kconfig
+@@ -707,6 +707,11 @@ config CRYPTO_POLY1305_X86_64
+         in IETF protocols. This is the x86_64 assembler implementation using SIMD
+         instructions.
+ 
++config CRYPTO_POLY1305_MIPS
++      tristate "Poly1305 authenticator algorithm (MIPS optimized)"
++      depends on CPU_MIPS32 || (CPU_MIPS64 && 64BIT)
++      select CRYPTO_ARCH_HAVE_LIB_POLY1305
++
+ config CRYPTO_MD4
+       tristate "MD4 digest algorithm"
+       select CRYPTO_HASH
+--- a/lib/crypto/Kconfig
++++ b/lib/crypto/Kconfig
+@@ -39,6 +39,7 @@ config CRYPTO_LIB_DES
+ 
+ config CRYPTO_LIB_POLY1305_RSIZE
+       int
++      default 2 if MIPS
+       default 4 if X86_64
+       default 9 if ARM || ARM64
+       default 1
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0021-crypto-blake2s-generic-C-library-implementation-and-.patch b/target/linux/generic/backport-5.4/080-wireguard-0021-crypto-blake2s-generic-C-library-implementation-and-.patch

new file mode 100644 (file)

index 0000000..a78a964
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0021-crypto-blake2s-generic-C-library-implementation-and-.patch
@@ -0,0 +1,1097 @@
+From 41138d5e49eedc77ff1c4985891b78baba02a874 Mon Sep 17 00:00:00 2001
+From: "Jason A. Donenfeld" <Jason@zx2c4.com>
+Date: Fri, 8 Nov 2019 13:22:28 +0100
+Subject: [PATCH 021/124] crypto: blake2s - generic C library implementation
+ and selftest
+
+commit 66d7fb94e4ffe5acc589e0b2b4710aecc1f07a28 upstream.
+
+The C implementation was originally based on Samuel Neves' public
+domain reference implementation but has since been heavily modified
+for the kernel. We're able to do compile-time optimizations by moving
+some scaffolding around the final function into the header file.
+
+Information: https://blake2.net/
+
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Signed-off-by: Samuel Neves <sneves@dei.uc.pt>
+Co-developed-by: Samuel Neves <sneves@dei.uc.pt>
+[ardb: - move from lib/zinc to lib/crypto
+       - remove simd handling
+       - rewrote selftest for better coverage
+       - use fixed digest length for blake2s_hmac() and rename to
+         blake2s256_hmac() ]
+Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
+Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ include/crypto/blake2s.h          | 106 +++++
+ include/crypto/internal/blake2s.h |  19 +
+ lib/crypto/Kconfig                |  25 ++
+ lib/crypto/Makefile               |  10 +
+ lib/crypto/blake2s-generic.c      | 111 ++++++
+ lib/crypto/blake2s-selftest.c     | 622 ++++++++++++++++++++++++++++++
+ lib/crypto/blake2s.c              | 126 ++++++
+ 7 files changed, 1019 insertions(+)
+ create mode 100644 include/crypto/blake2s.h
+ create mode 100644 include/crypto/internal/blake2s.h
+ create mode 100644 lib/crypto/blake2s-generic.c
+ create mode 100644 lib/crypto/blake2s-selftest.c
+ create mode 100644 lib/crypto/blake2s.c
+
+--- /dev/null
++++ b/include/crypto/blake2s.h
+@@ -0,0 +1,106 @@
++/* SPDX-License-Identifier: GPL-2.0 OR MIT */
++/*
++ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
++ */
++
++#ifndef BLAKE2S_H
++#define BLAKE2S_H
++
++#include <linux/types.h>
++#include <linux/kernel.h>
++#include <linux/string.h>
++
++#include <asm/bug.h>
++
++enum blake2s_lengths {
++      BLAKE2S_BLOCK_SIZE = 64,        /* bytes per block; size of state buf[] */
++      BLAKE2S_HASH_SIZE = 32,         /* upper bound on outlen, in bytes */
++      BLAKE2S_KEY_SIZE = 32,          /* upper bound on keylen, in bytes */
++
++      BLAKE2S_128_HASH_SIZE = 16,     /* 128 bits / 8 */
++      BLAKE2S_160_HASH_SIZE = 20,     /* 160 bits / 8 */
++      BLAKE2S_224_HASH_SIZE = 28,     /* 224 bits / 8 */
++      BLAKE2S_256_HASH_SIZE = 32,     /* 256 bits / 8 */
++};
++
++struct blake2s_state {
++      u32 h[8];                       /* chaining words, seeded from BLAKE2S_IV0..7 */
++      u32 t[2];                       /* counter: t[0] low word, t[1] carry/high word */
++      u32 f[2];                       /* flag words; f[0] = -1 marks the last block */
++      u8 buf[BLAKE2S_BLOCK_SIZE];     /* holds the zero-padded key after init_key */
++      unsigned int buflen;            /* presumably bytes pending in buf[] - confirm */
++      unsigned int outlen;            /* digest length passed to the init helper */
++};
++
++enum blake2s_iv {                     /* seed words for h[0..7] */
++      BLAKE2S_IV0 = 0x6A09E667UL,
++      BLAKE2S_IV1 = 0xBB67AE85UL,
++      BLAKE2S_IV2 = 0x3C6EF372UL,
++      BLAKE2S_IV3 = 0xA54FF53AUL,
++      BLAKE2S_IV4 = 0x510E527FUL,
++      BLAKE2S_IV5 = 0x9B05688CUL,
++      BLAKE2S_IV6 = 0x1F83D9ABUL,
++      BLAKE2S_IV7 = 0x5BE0CD19UL,
++};
++
++void blake2s_update(struct blake2s_state *state, const u8 *in, size_t inlen);
++void blake2s_final(struct blake2s_state *state, u8 *out);
++
++static inline void blake2s_init_param(struct blake2s_state *state,
++                                    const u32 param)
++{
++      *state = (struct blake2s_state){{       /* fills h[]; other members become 0 */
++              BLAKE2S_IV0 ^ param,            /* param is mixed into h[0] only */
++              BLAKE2S_IV1,
++              BLAKE2S_IV2,
++              BLAKE2S_IV3,
++              BLAKE2S_IV4,
++              BLAKE2S_IV5,
++              BLAKE2S_IV6,
++              BLAKE2S_IV7,
++      }};
++}
++
++static inline void blake2s_init(struct blake2s_state *state,
++                              const size_t outlen)
++{
++      blake2s_init_param(state, 0x01010000 | outlen); /* bits 8..15 (keylen) stay 0 */
++      state->outlen = outlen;                         /* not range-checked here */
++}
++
++static inline void blake2s_init_key(struct blake2s_state *state,
++                                  const size_t outlen, const void *key,
++                                  const size_t keylen)
++{
++      WARN_ON(IS_ENABLED(DEBUG) && (!outlen || outlen > BLAKE2S_HASH_SIZE ||
++              !key || !keylen || keylen > BLAKE2S_KEY_SIZE)); /* warns only; no early return */
++
++      blake2s_init_param(state, 0x01010000 | keylen << 8 | outlen); /* keylen in bits 8..15 */
++      memcpy(state->buf, key, keylen);        /* buf[] was just zeroed, so key is 0-padded */
++      state->buflen = BLAKE2S_BLOCK_SIZE;     /* the key counts as one full block */
++      state->outlen = outlen;
++}
++
++static inline void blake2s(u8 *out, const u8 *in, const u8 *key,
++                         const size_t outlen, const size_t inlen,
++                         const size_t keylen)
++{
++      struct blake2s_state state;
++
++      WARN_ON(IS_ENABLED(DEBUG) && ((!in && inlen > 0) || !out || !outlen ||
++              outlen > BLAKE2S_HASH_SIZE || keylen > BLAKE2S_KEY_SIZE ||
++              (!key && keylen)));     /* warns only; execution continues regardless */
++
++      if (keylen)                     /* keylen == 0 selects the unkeyed init below */
++              blake2s_init_key(&state, outlen, key, keylen);
++      else
++              blake2s_init(&state, outlen);
++
++      blake2s_update(&state, in, inlen);      /* whole input in a single call */
++      blake2s_final(&state, out);             /* presumably writes outlen bytes - confirm */
++}
++
++void blake2s256_hmac(u8 *out, const u8 *in, const u8 *key, const size_t inlen,
++                   const size_t keylen);
++
++#endif /* BLAKE2S_H */
+--- /dev/null
++++ b/include/crypto/internal/blake2s.h
+@@ -0,0 +1,19 @@
++/* SPDX-License-Identifier: GPL-2.0 OR MIT */
++
++#ifndef BLAKE2S_INTERNAL_H
++#define BLAKE2S_INTERNAL_H
++
++#include <crypto/blake2s.h>
++
++void blake2s_compress_generic(struct blake2s_state *state,const u8 *block,
++                            size_t nblocks, const u32 inc);
++
++void blake2s_compress_arch(struct blake2s_state *state,const u8 *block,
++                         size_t nblocks, const u32 inc);
++
++static inline void blake2s_set_lastblock(struct blake2s_state *state)
++{
++      state->f[0] = -1;       /* f[0] is u32, so this stores 0xffffffff */
++}
++
++#endif /* BLAKE2S_INTERNAL_H */
+--- a/lib/crypto/Kconfig
++++ b/lib/crypto/Kconfig
+@@ -8,6 +8,31 @@ config CRYPTO_LIB_AES
+ config CRYPTO_LIB_ARC4
+       tristate
+ 
++config CRYPTO_ARCH_HAVE_LIB_BLAKE2S
++      tristate
++      help
++        Declares whether the architecture provides an arch-specific
++        accelerated implementation of the Blake2s library interface,
++        either builtin or as a module.
++
++config CRYPTO_LIB_BLAKE2S_GENERIC
++      tristate
++      help
++        This symbol can be depended upon by arch implementations of the
++        Blake2s library interface that require the generic code as a
++        fallback, e.g., for SIMD implementations. If no arch specific
++        implementation is enabled, this implementation serves the users
++        of CRYPTO_LIB_BLAKE2S.
++
++config CRYPTO_LIB_BLAKE2S
++      tristate "BLAKE2s hash function library"
++      depends on CRYPTO_ARCH_HAVE_LIB_BLAKE2S || !CRYPTO_ARCH_HAVE_LIB_BLAKE2S
++      select CRYPTO_LIB_BLAKE2S_GENERIC if CRYPTO_ARCH_HAVE_LIB_BLAKE2S=n
++      help
++        Enable the Blake2s library interface. This interface may be fulfilled
++        by either the generic implementation or an arch-specific one, if one
++        is available and enabled.
++
+ config CRYPTO_ARCH_HAVE_LIB_CHACHA
+       tristate
+       help
+--- a/lib/crypto/Makefile
++++ b/lib/crypto/Makefile
+@@ -10,6 +10,12 @@ libaes-y                                    := aes.o
+ obj-$(CONFIG_CRYPTO_LIB_ARC4)                 += libarc4.o
+ libarc4-y                                     := arc4.o
+ 
++obj-$(CONFIG_CRYPTO_LIB_BLAKE2S_GENERIC)      += libblake2s-generic.o
++libblake2s-generic-y                          += blake2s-generic.o
++
++obj-$(CONFIG_CRYPTO_LIB_BLAKE2S)              += libblake2s.o
++libblake2s-y                                  += blake2s.o
++
+ obj-$(CONFIG_CRYPTO_LIB_DES)                  += libdes.o
+ libdes-y                                      := des.o
+ 
+@@ -18,3 +24,7 @@ libpoly1305-y                                        := poly1305.o
+ 
+ obj-$(CONFIG_CRYPTO_LIB_SHA256)                       += libsha256.o
+ libsha256-y                                   := sha256.o
++
++ifneq ($(CONFIG_CRYPTO_MANAGER_DISABLE_TESTS),y)
++libblake2s-y                                  += blake2s-selftest.o
++endif
+--- /dev/null
++++ b/lib/crypto/blake2s-generic.c
+@@ -0,0 +1,111 @@
++// SPDX-License-Identifier: GPL-2.0 OR MIT
++/*
++ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
++ *
++ * This is an implementation of the BLAKE2s hash and PRF functions.
++ *
++ * Information: https://blake2.net/
++ *
++ */
++
++#include <crypto/internal/blake2s.h>
++#include <linux/types.h>
++#include <linux/string.h>
++#include <linux/kernel.h>
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/bug.h>
++#include <asm/unaligned.h>
++
++static const u8 blake2s_sigma[10][16] = {     /* row r: m[] index order used by ROUND(r) */
++      { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
++      { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
++      { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
++      { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
++      { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
++      { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
++      { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
++      { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
++      { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
++      { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
++};
++
++static inline void blake2s_increment_counter(struct blake2s_state *state,
++                                           const u32 inc)
++{
++      state->t[0] += inc;                     /* u32 add; may wrap around */
++      state->t[1] += (state->t[0] < inc);     /* carry: t[0] wrapped iff it is now < inc */
++}
++
++void blake2s_compress_generic(struct blake2s_state *state,const u8 *block,
++                            size_t nblocks, const u32 inc)
++{
++      u32 m[16];      /* current block as 16 CPU-endian words */
++      u32 v[16];      /* working vector for the rounds */
++      int i;
++
++      WARN_ON(IS_ENABLED(DEBUG) &&
++              (nblocks > 1 && inc != BLAKE2S_BLOCK_SIZE));    /* multi-block calls need inc == 64 */
++
++      while (nblocks > 0) {
++              blake2s_increment_counter(state, inc);  /* t[] is advanced before mixing */
++              memcpy(m, block, BLAKE2S_BLOCK_SIZE);   /* byte copy: block need not be aligned */
++              le32_to_cpu_array(m, ARRAY_SIZE(m));    /* input words are little-endian */
++              memcpy(v, state->h, 32);                /* v[0..7] = h[0..7] (8 * 4 bytes) */
++              v[ 8] = BLAKE2S_IV0;
++              v[ 9] = BLAKE2S_IV1;
++              v[10] = BLAKE2S_IV2;
++              v[11] = BLAKE2S_IV3;
++              v[12] = BLAKE2S_IV4 ^ state->t[0];
++              v[13] = BLAKE2S_IV5 ^ state->t[1];
++              v[14] = BLAKE2S_IV6 ^ state->f[0];
++              v[15] = BLAKE2S_IV7 ^ state->f[1];
++
++#define G(r, i, a, b, c, d) do { \
++      a += b + m[blake2s_sigma[r][2 * i + 0]]; \
++      d = ror32(d ^ a, 16); \
++      c += d; \
++      b = ror32(b ^ c, 12); \
++      a += b + m[blake2s_sigma[r][2 * i + 1]]; \
++      d = ror32(d ^ a, 8); \
++      c += d; \
++      b = ror32(b ^ c, 7); \
++} while (0)
++
++#define ROUND(r) do { \
++      G(r, 0, v[0], v[ 4], v[ 8], v[12]); \
++      G(r, 1, v[1], v[ 5], v[ 9], v[13]); \
++      G(r, 2, v[2], v[ 6], v[10], v[14]); \
++      G(r, 3, v[3], v[ 7], v[11], v[15]); \
++      G(r, 4, v[0], v[ 5], v[10], v[15]); \
++      G(r, 5, v[1], v[ 6], v[11], v[12]); \
++      G(r, 6, v[2], v[ 7], v[ 8], v[13]); \
++      G(r, 7, v[3], v[ 4], v[ 9], v[14]); \
++} while (0)
++              ROUND(0);       /* ten rounds, one per blake2s_sigma row */
++              ROUND(1);
++              ROUND(2);
++              ROUND(3);
++              ROUND(4);
++              ROUND(5);
++              ROUND(6);
++              ROUND(7);
++              ROUND(8);
++              ROUND(9);
++
++#undef G
++#undef ROUND
++
++              for (i = 0; i < 8; ++i)
++                      state->h[i] ^= v[i] ^ v[i + 8]; /* fold both halves of v back into h */
++
++              block += BLAKE2S_BLOCK_SIZE;    /* input always steps a full block, whatever inc is */
++              --nblocks;
++      }
++}
++
++EXPORT_SYMBOL(blake2s_compress_generic);
++
++MODULE_LICENSE("GPL v2");
++MODULE_DESCRIPTION("BLAKE2s hash function");
++MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>");
+--- /dev/null
++++ b/lib/crypto/blake2s-selftest.c
+@@ -0,0 +1,622 @@
++// SPDX-License-Identifier: GPL-2.0 OR MIT
++/*
++ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
++ */
++
++#include <crypto/blake2s.h>
++#include <linux/string.h>
++
++/*
++ * blake2s_testvecs[] generated with the program below (using libb2-dev and
++ * libssl-dev [OpenSSL])
++ *
++ * #include <blake2.h>
++ * #include <stdint.h>
++ * #include <stdio.h>
++ *
++ * #include <openssl/evp.h>
++ * #include <openssl/hmac.h>
++ *
++ * #define BLAKE2S_TESTVEC_COUNT      256
++ *
++ * static void print_vec(const uint8_t vec[], int len)
++ * {
++ *    int i;
++ *
++ *    printf("  { ");
++ *    for (i = 0; i < len; i++) {
++ *            if (i && (i % 12) == 0)
++ *                    printf("\n    ");
++ *            printf("0x%02x, ", vec[i]);
++ *    }
++ *    printf("},\n");
++ * }
++ *
++ * int main(void)
++ * {
++ *    uint8_t key[BLAKE2S_KEYBYTES];
++ *    uint8_t buf[BLAKE2S_TESTVEC_COUNT];
++ *    uint8_t hash[BLAKE2S_OUTBYTES];
++ *    int i, j;
++ *
++ *    key[0] = key[1] = 1;
++ *    for (i = 2; i < BLAKE2S_KEYBYTES; ++i)
++ *            key[i] = key[i - 2] + key[i - 1];
++ *
++ *    for (i = 0; i < BLAKE2S_TESTVEC_COUNT; ++i)
++ *            buf[i] = (uint8_t)i;
++ *
++ *    printf("static const u8 blake2s_testvecs[][BLAKE2S_HASH_SIZE] __initconst = {\n");
++ *
++ *    for (i = 0; i < BLAKE2S_TESTVEC_COUNT; ++i) {
++ *            int outlen = 1 + i % BLAKE2S_OUTBYTES;
++ *            int keylen = (13 * i) % (BLAKE2S_KEYBYTES + 1);
++ *
++ *            blake2s(hash, buf, key + BLAKE2S_KEYBYTES - keylen, outlen, i,
++ *                    keylen);
++ *            print_vec(hash, outlen);
++ *    }
++ *    printf("};\n\n");
++ *
++ *    printf("static const u8 blake2s_hmac_testvecs[][BLAKE2S_HASH_SIZE] __initconst = {\n");
++ *
++ *    HMAC(EVP_blake2s256(), key, sizeof(key), buf, sizeof(buf), hash, NULL);
++ *    print_vec(hash, BLAKE2S_OUTBYTES);
++ *
++ *    HMAC(EVP_blake2s256(), buf, sizeof(buf), key, sizeof(key), hash, NULL);
++ *    print_vec(hash, BLAKE2S_OUTBYTES);
++ *
++ *    printf("};\n");
++ *
++ *    return 0;
++ *}
++ */
++static const u8 blake2s_testvecs[][BLAKE2S_HASH_SIZE] __initconst = {
++  { 0xa1, },
++  { 0x7c, 0x89, },
++  { 0x74, 0x0e, 0xd4, },
++  { 0x47, 0x0c, 0x21, 0x15, },
++  { 0x18, 0xd6, 0x9c, 0xa6, 0xc4, },
++  { 0x13, 0x5d, 0x16, 0x63, 0x2e, 0xf9, },
++  { 0x2c, 0xb5, 0x04, 0xb7, 0x99, 0xe2, 0x73, },
++  { 0x9a, 0x0f, 0xd2, 0x39, 0xd6, 0x68, 0x1b, 0x92, },
++  { 0xc8, 0xde, 0x7a, 0xea, 0x2f, 0xf4, 0xd2, 0xe3, 0x2b, },
++  { 0x5b, 0xf9, 0x43, 0x52, 0x0c, 0x12, 0xba, 0xb5, 0x93, 0x9f, },
++  { 0xc6, 0x2c, 0x4e, 0x80, 0xfc, 0x32, 0x5b, 0x33, 0xb8, 0xb8, 0x0a, },
++  { 0xa7, 0x5c, 0xfd, 0x3a, 0xcc, 0xbf, 0x90, 0xca, 0xb7, 0x97, 0xde, 0xd8, },
++  { 0x66, 0xca, 0x3c, 0xc4, 0x19, 0xef, 0x92, 0x66, 0x3f, 0x21, 0x8f, 0xda,
++    0xb7, },
++  { 0xba, 0xe5, 0xbb, 0x30, 0x25, 0x94, 0x6d, 0xc3, 0x89, 0x09, 0xc4, 0x25,
++    0x52, 0x3e, },
++  { 0xa2, 0xef, 0x0e, 0x52, 0x0b, 0x5f, 0xa2, 0x01, 0x6d, 0x0a, 0x25, 0xbc,
++    0x57, 0xe2, 0x27, },
++  { 0x4f, 0xe0, 0xf9, 0x52, 0x12, 0xda, 0x84, 0xb7, 0xab, 0xae, 0xb0, 0xa6,
++    0x47, 0x2a, 0xc7, 0xf5, },
++  { 0x56, 0xe7, 0xa8, 0x1c, 0x4c, 0xca, 0xed, 0x90, 0x31, 0xec, 0x87, 0x43,
++    0xe7, 0x72, 0x08, 0xec, 0xbe, },
++  { 0x7e, 0xdf, 0x80, 0x1c, 0x93, 0x33, 0xfd, 0x53, 0x44, 0xba, 0xfd, 0x96,
++    0xe1, 0xbb, 0xb5, 0x65, 0xa5, 0x00, },
++  { 0xec, 0x6b, 0xed, 0xf7, 0x7b, 0x62, 0x1d, 0x7d, 0xf4, 0x82, 0xf3, 0x1e,
++    0x18, 0xff, 0x2b, 0xc4, 0x06, 0x20, 0x2a, },
++  { 0x74, 0x98, 0xd7, 0x68, 0x63, 0xed, 0x87, 0xe4, 0x5d, 0x8d, 0x9e, 0x1d,
++    0xfd, 0x2a, 0xbb, 0x86, 0xac, 0xe9, 0x2a, 0x89, },
++  { 0x89, 0xc3, 0x88, 0xce, 0x2b, 0x33, 0x1e, 0x10, 0xd1, 0x37, 0x20, 0x86,
++    0x28, 0x43, 0x70, 0xd9, 0xfb, 0x96, 0xd9, 0xb5, 0xd3, },
++  { 0xcb, 0x56, 0x74, 0x41, 0x8d, 0x80, 0x01, 0x9a, 0x6b, 0x38, 0xe1, 0x41,
++    0xad, 0x9c, 0x62, 0x74, 0xce, 0x35, 0xd5, 0x6c, 0x89, 0x6e, },
++  { 0x79, 0xaf, 0x94, 0x59, 0x99, 0x26, 0xe1, 0xc9, 0x34, 0xfe, 0x7c, 0x22,
++    0xf7, 0x43, 0xd7, 0x65, 0xd4, 0x48, 0x18, 0xac, 0x3d, 0xfd, 0x93, },
++  { 0x85, 0x0d, 0xff, 0xb8, 0x3e, 0x87, 0x41, 0xb0, 0x95, 0xd3, 0x3d, 0x00,
++    0x47, 0x55, 0x9e, 0xd2, 0x69, 0xea, 0xbf, 0xe9, 0x7a, 0x2d, 0x61, 0x45, },
++  { 0x03, 0xe0, 0x85, 0xec, 0x54, 0xb5, 0x16, 0x53, 0xa8, 0xc4, 0x71, 0xe9,
++    0x6a, 0xe7, 0xcb, 0xc4, 0x15, 0x02, 0xfc, 0x34, 0xa4, 0xa4, 0x28, 0x13,
++    0xd1, },
++  { 0xe3, 0x34, 0x4b, 0xe1, 0xd0, 0x4b, 0x55, 0x61, 0x8f, 0xc0, 0x24, 0x05,
++    0xe6, 0xe0, 0x3d, 0x70, 0x24, 0x4d, 0xda, 0xb8, 0x91, 0x05, 0x29, 0x07,
++    0x01, 0x3e, },
++  { 0x61, 0xff, 0x01, 0x72, 0xb1, 0x4d, 0xf6, 0xfe, 0xd1, 0xd1, 0x08, 0x74,
++    0xe6, 0x91, 0x44, 0xeb, 0x61, 0xda, 0x40, 0xaf, 0xfc, 0x8c, 0x91, 0x6b,
++    0xec, 0x13, 0xed, },
++  { 0xd4, 0x40, 0xd2, 0xa0, 0x7f, 0xc1, 0x58, 0x0c, 0x85, 0xa0, 0x86, 0xc7,
++    0x86, 0xb9, 0x61, 0xc9, 0xea, 0x19, 0x86, 0x1f, 0xab, 0x07, 0xce, 0x37,
++    0x72, 0x67, 0x09, 0xfc, },
++  { 0x9e, 0xf8, 0x18, 0x67, 0x93, 0x10, 0x9b, 0x39, 0x75, 0xe8, 0x8b, 0x38,
++    0x82, 0x7d, 0xb8, 0xb7, 0xa5, 0xaf, 0xe6, 0x6a, 0x22, 0x5e, 0x1f, 0x9c,
++    0x95, 0x29, 0x19, 0xf2, 0x4b, },
++  { 0xc8, 0x62, 0x25, 0xf5, 0x98, 0xc9, 0xea, 0xe5, 0x29, 0x3a, 0xd3, 0x22,
++    0xeb, 0xeb, 0x07, 0x7c, 0x15, 0x07, 0xee, 0x15, 0x61, 0xbb, 0x05, 0x30,
++    0x99, 0x7f, 0x11, 0xf6, 0x0a, 0x1d, },
++  { 0x68, 0x70, 0xf7, 0x90, 0xa1, 0x8b, 0x1f, 0x0f, 0xbb, 0xce, 0xd2, 0x0e,
++    0x33, 0x1f, 0x7f, 0xa9, 0x78, 0xa8, 0xa6, 0x81, 0x66, 0xab, 0x8d, 0xcd,
++    0x58, 0x55, 0x3a, 0x0b, 0x7a, 0xdb, 0xb5, },
++  { 0xdd, 0x35, 0xd2, 0xb4, 0xf6, 0xc7, 0xea, 0xab, 0x64, 0x24, 0x4e, 0xfe,
++    0xe5, 0x3d, 0x4e, 0x95, 0x8b, 0x6d, 0x6c, 0xbc, 0xb0, 0xf8, 0x88, 0x61,
++    0x09, 0xb7, 0x78, 0xa3, 0x31, 0xfe, 0xd9, 0x2f, },
++  { 0x0a, },
++  { 0x6e, 0xd4, },
++  { 0x64, 0xe9, 0xd1, },
++  { 0x30, 0xdd, 0x71, 0xef, },
++  { 0x11, 0xb5, 0x0c, 0x87, 0xc9, },
++  { 0x06, 0x1c, 0x6d, 0x04, 0x82, 0xd0, },
++  { 0x5c, 0x42, 0x0b, 0xee, 0xc5, 0x9c, 0xb2, },
++  { 0xe8, 0x29, 0xd6, 0xb4, 0x5d, 0xf7, 0x2b, 0x93, },
++  { 0x18, 0xca, 0x27, 0x72, 0x43, 0x39, 0x16, 0xbc, 0x6a, },
++  { 0x39, 0x8f, 0xfd, 0x64, 0xf5, 0x57, 0x23, 0xb0, 0x45, 0xf8, },
++  { 0xbb, 0x3a, 0x78, 0x6b, 0x02, 0x1d, 0x0b, 0x16, 0xe3, 0xb2, 0x9a, },
++  { 0xb8, 0xb4, 0x0b, 0xe5, 0xd4, 0x1d, 0x0d, 0x85, 0x49, 0x91, 0x35, 0xfa, },
++  { 0x6d, 0x48, 0x2a, 0x0c, 0x42, 0x08, 0xbd, 0xa9, 0x78, 0x6f, 0x18, 0xaf,
++    0xe2, },
++  { 0x10, 0x45, 0xd4, 0x58, 0x88, 0xec, 0x4e, 0x1e, 0xf6, 0x14, 0x92, 0x64,
++    0x7e, 0xb0, },
++  { 0x8b, 0x0b, 0x95, 0xee, 0x92, 0xc6, 0x3b, 0x91, 0xf1, 0x1e, 0xeb, 0x51,
++    0x98, 0x0a, 0x8d, },
++  { 0xa3, 0x50, 0x4d, 0xa5, 0x1d, 0x03, 0x68, 0xe9, 0x57, 0x78, 0xd6, 0x04,
++    0xf1, 0xc3, 0x94, 0xd8, },
++  { 0xb8, 0x66, 0x6e, 0xdd, 0x46, 0x15, 0xae, 0x3d, 0x83, 0x7e, 0xcf, 0xe7,
++    0x2c, 0xe8, 0x8f, 0xc7, 0x34, },
++  { 0x2e, 0xc0, 0x1f, 0x29, 0xea, 0xf6, 0xb9, 0xe2, 0xc2, 0x93, 0xeb, 0x41,
++    0x0d, 0xf0, 0x0a, 0x13, 0x0e, 0xa2, },
++  { 0x71, 0xb8, 0x33, 0xa9, 0x1b, 0xac, 0xf1, 0xb5, 0x42, 0x8f, 0x5e, 0x81,
++    0x34, 0x43, 0xb7, 0xa4, 0x18, 0x5c, 0x47, },
++  { 0xda, 0x45, 0xb8, 0x2e, 0x82, 0x1e, 0xc0, 0x59, 0x77, 0x9d, 0xfa, 0xb4,
++    0x1c, 0x5e, 0xa0, 0x2b, 0x33, 0x96, 0x5a, 0x58, },
++  { 0xe3, 0x09, 0x05, 0xa9, 0xeb, 0x48, 0x13, 0xad, 0x71, 0x88, 0x81, 0x9a,
++    0x3e, 0x2c, 0xe1, 0x23, 0x99, 0x13, 0x35, 0x9f, 0xb5, },
++  { 0xb7, 0x86, 0x2d, 0x16, 0xe1, 0x04, 0x00, 0x47, 0x47, 0x61, 0x31, 0xfb,
++    0x14, 0xac, 0xd8, 0xe9, 0xe3, 0x49, 0xbd, 0xf7, 0x9c, 0x3f, },
++  { 0x7f, 0xd9, 0x95, 0xa8, 0xa7, 0xa0, 0xcc, 0xba, 0xef, 0xb1, 0x0a, 0xa9,
++    0x21, 0x62, 0x08, 0x0f, 0x1b, 0xff, 0x7b, 0x9d, 0xae, 0xb2, 0x95, },
++  { 0x85, 0x99, 0xea, 0x33, 0xe0, 0x56, 0xff, 0x13, 0xc6, 0x61, 0x8c, 0xf9,
++    0x57, 0x05, 0x03, 0x11, 0xf9, 0xfb, 0x3a, 0xf7, 0xce, 0xbb, 0x52, 0x30, },
++  { 0xb2, 0x72, 0x9c, 0xf8, 0x77, 0x4e, 0x8f, 0x6b, 0x01, 0x6c, 0xff, 0x4e,
++    0x4f, 0x02, 0xd2, 0xbc, 0xeb, 0x51, 0x28, 0x99, 0x50, 0xab, 0xc4, 0x42,
++    0xe3, },
++  { 0x8b, 0x0a, 0xb5, 0x90, 0x8f, 0xf5, 0x7b, 0xdd, 0xba, 0x47, 0x37, 0xc9,
++    0x2a, 0xd5, 0x4b, 0x25, 0x08, 0x8b, 0x02, 0x17, 0xa7, 0x9e, 0x6b, 0x6e,
++    0xe3, 0x90, },
++  { 0x90, 0xdd, 0xf7, 0x75, 0xa7, 0xa3, 0x99, 0x5e, 0x5b, 0x7d, 0x75, 0xc3,
++    0x39, 0x6b, 0xa0, 0xe2, 0x44, 0x53, 0xb1, 0x9e, 0xc8, 0xf1, 0x77, 0x10,
++    0x58, 0x06, 0x9a, },
++  { 0x99, 0x52, 0xf0, 0x49, 0xa8, 0x8c, 0xec, 0xa6, 0x97, 0x32, 0x13, 0xb5,
++    0xf7, 0xa3, 0x8e, 0xfb, 0x4b, 0x59, 0x31, 0x3d, 0x01, 0x59, 0x98, 0x5d,
++    0x53, 0x03, 0x1a, 0x39, },
++  { 0x9f, 0xe0, 0xc2, 0xe5, 0x5d, 0x93, 0xd6, 0x9b, 0x47, 0x8f, 0x9b, 0xe0,
++    0x26, 0x35, 0x84, 0x20, 0x1d, 0xc5, 0x53, 0x10, 0x0f, 0x22, 0xb9, 0xb5,
++    0xd4, 0x36, 0xb1, 0xac, 0x73, },
++  { 0x30, 0x32, 0x20, 0x3b, 0x10, 0x28, 0xec, 0x1f, 0x4f, 0x9b, 0x47, 0x59,
++    0xeb, 0x7b, 0xee, 0x45, 0xfb, 0x0c, 0x49, 0xd8, 0x3d, 0x69, 0xbd, 0x90,
++    0x2c, 0xf0, 0x9e, 0x8d, 0xbf, 0xd5, },
++  { 0x2a, 0x37, 0x73, 0x7f, 0xf9, 0x96, 0x19, 0xaa, 0x25, 0xd8, 0x13, 0x28,
++    0x01, 0x29, 0x89, 0xdf, 0x6e, 0x0c, 0x9b, 0x43, 0x44, 0x51, 0xe9, 0x75,
++    0x26, 0x0c, 0xb7, 0x87, 0x66, 0x0b, 0x5f, },
++  { 0x23, 0xdf, 0x96, 0x68, 0x91, 0x86, 0xd0, 0x93, 0x55, 0x33, 0x24, 0xf6,
++    0xba, 0x08, 0x75, 0x5b, 0x59, 0x11, 0x69, 0xb8, 0xb9, 0xe5, 0x2c, 0x77,
++    0x02, 0xf6, 0x47, 0xee, 0x81, 0xdd, 0xb9, 0x06, },
++  { 0x9d, },
++  { 0x9d, 0x7d, },
++  { 0xfd, 0xc3, 0xda, },
++  { 0xe8, 0x82, 0xcd, 0x21, },
++  { 0xc3, 0x1d, 0x42, 0x4c, 0x74, },
++  { 0xe9, 0xda, 0xf1, 0xa2, 0xe5, 0x7c, },
++  { 0x52, 0xb8, 0x6f, 0x81, 0x5c, 0x3a, 0x4c, },
++  { 0x5b, 0x39, 0x26, 0xfc, 0x92, 0x5e, 0xe0, 0x49, },
++  { 0x59, 0xe4, 0x7c, 0x93, 0x1c, 0xf9, 0x28, 0x93, 0xde, },
++  { 0xde, 0xdf, 0xb2, 0x43, 0x61, 0x0b, 0x86, 0x16, 0x4c, 0x2e, },
++  { 0x14, 0x8f, 0x75, 0x51, 0xaf, 0xb9, 0xee, 0x51, 0x5a, 0xae, 0x23, },
++  { 0x43, 0x5f, 0x50, 0xd5, 0x70, 0xb0, 0x5b, 0x87, 0xf5, 0xd9, 0xb3, 0x6d, },
++  { 0x66, 0x0a, 0x64, 0x93, 0x79, 0x71, 0x94, 0x40, 0xb7, 0x68, 0x2d, 0xd3,
++    0x63, },
++  { 0x15, 0x00, 0xc4, 0x0c, 0x7d, 0x1b, 0x10, 0xa9, 0x73, 0x1b, 0x90, 0x6f,
++    0xe6, 0xa9, },
++  { 0x34, 0x75, 0xf3, 0x86, 0x8f, 0x56, 0xcf, 0x2a, 0x0a, 0xf2, 0x62, 0x0a,
++    0xf6, 0x0e, 0x20, },
++  { 0xb1, 0xde, 0xc9, 0xf5, 0xdb, 0xf3, 0x2f, 0x4c, 0xd6, 0x41, 0x7d, 0x39,
++    0x18, 0x3e, 0xc7, 0xc3, },
++  { 0xc5, 0x89, 0xb2, 0xf8, 0xb8, 0xc0, 0xa3, 0xb9, 0x3b, 0x10, 0x6d, 0x7c,
++    0x92, 0xfc, 0x7f, 0x34, 0x41, },
++  { 0xc4, 0xd8, 0xef, 0xba, 0xef, 0xd2, 0xaa, 0xc5, 0x6c, 0x8e, 0x3e, 0xbb,
++    0x12, 0xfc, 0x0f, 0x72, 0xbf, 0x0f, },
++  { 0xdd, 0x91, 0xd1, 0x15, 0x9e, 0x7d, 0xf8, 0xc1, 0xb9, 0x14, 0x63, 0x96,
++    0xb5, 0xcb, 0x83, 0x1d, 0x35, 0x1c, 0xec, },
++  { 0xa9, 0xf8, 0x52, 0xc9, 0x67, 0x76, 0x2b, 0xad, 0xfb, 0xd8, 0x3a, 0xa6,
++    0x74, 0x02, 0xae, 0xb8, 0x25, 0x2c, 0x63, 0x49, },
++  { 0x77, 0x1f, 0x66, 0x70, 0xfd, 0x50, 0x29, 0xaa, 0xeb, 0xdc, 0xee, 0xba,
++    0x75, 0x98, 0xdc, 0x93, 0x12, 0x3f, 0xdc, 0x7c, 0x38, },
++  { 0xe2, 0xe1, 0x89, 0x5c, 0x37, 0x38, 0x6a, 0xa3, 0x40, 0xac, 0x3f, 0xb0,
++    0xca, 0xfc, 0xa7, 0xf3, 0xea, 0xf9, 0x0f, 0x5d, 0x8e, 0x39, },
++  { 0x0f, 0x67, 0xc8, 0x38, 0x01, 0xb1, 0xb7, 0xb8, 0xa2, 0xe7, 0x0a, 0x6d,
++    0xd2, 0x63, 0x69, 0x9e, 0xcc, 0xf0, 0xf2, 0xbe, 0x9b, 0x98, 0xdd, },
++  { 0x13, 0xe1, 0x36, 0x30, 0xfe, 0xc6, 0x01, 0x8a, 0xa1, 0x63, 0x96, 0x59,
++    0xc2, 0xa9, 0x68, 0x3f, 0x58, 0xd4, 0x19, 0x0c, 0x40, 0xf3, 0xde, 0x02, },
++  { 0xa3, 0x9e, 0xce, 0xda, 0x42, 0xee, 0x8c, 0x6c, 0x5a, 0x7d, 0xdc, 0x89,
++    0x02, 0x77, 0xdd, 0xe7, 0x95, 0xbb, 0xff, 0x0d, 0xa4, 0xb5, 0x38, 0x1e,
++    0xaf, },
++  { 0x9a, 0xf6, 0xb5, 0x9a, 0x4f, 0xa9, 0x4f, 0x2c, 0x35, 0x3c, 0x24, 0xdc,
++    0x97, 0x6f, 0xd9, 0xa1, 0x7d, 0x1a, 0x85, 0x0b, 0xf5, 0xda, 0x2e, 0xe7,
++    0xb1, 0x1d, },
++  { 0x84, 0x1e, 0x8e, 0x3d, 0x45, 0xa5, 0xf2, 0x27, 0xf3, 0x31, 0xfe, 0xb9,
++    0xfb, 0xc5, 0x45, 0x99, 0x99, 0xdd, 0x93, 0x43, 0x02, 0xee, 0x58, 0xaf,
++    0xee, 0x6a, 0xbe, },
++  { 0x07, 0x2f, 0xc0, 0xa2, 0x04, 0xc4, 0xab, 0x7c, 0x26, 0xbb, 0xa8, 0xd8,
++    0xe3, 0x1c, 0x75, 0x15, 0x64, 0x5d, 0x02, 0x6a, 0xf0, 0x86, 0xe9, 0xcd,
++    0x5c, 0xef, 0xa3, 0x25, },
++  { 0x2f, 0x3b, 0x1f, 0xb5, 0x91, 0x8f, 0x86, 0xe0, 0xdc, 0x31, 0x48, 0xb6,
++    0xa1, 0x8c, 0xfd, 0x75, 0xbb, 0x7d, 0x3d, 0xc1, 0xf0, 0x10, 0x9a, 0xd8,
++    0x4b, 0x0e, 0xe3, 0x94, 0x9f, },
++  { 0x29, 0xbb, 0x8f, 0x6c, 0xd1, 0xf2, 0xb6, 0xaf, 0xe5, 0xe3, 0x2d, 0xdc,
++    0x6f, 0xa4, 0x53, 0x88, 0xd8, 0xcf, 0x4d, 0x45, 0x42, 0x62, 0xdb, 0xdf,
++    0xf8, 0x45, 0xc2, 0x13, 0xec, 0x35, },
++  { 0x06, 0x3c, 0xe3, 0x2c, 0x15, 0xc6, 0x43, 0x03, 0x81, 0xfb, 0x08, 0x76,
++    0x33, 0xcb, 0x02, 0xc1, 0xba, 0x33, 0xe5, 0xe0, 0xd1, 0x92, 0xa8, 0x46,
++    0x28, 0x3f, 0x3e, 0x9d, 0x2c, 0x44, 0x54, },
++  { 0xea, 0xbb, 0x96, 0xf8, 0xd1, 0x8b, 0x04, 0x11, 0x40, 0x78, 0x42, 0x02,
++    0x19, 0xd1, 0xbc, 0x65, 0x92, 0xd3, 0xc3, 0xd6, 0xd9, 0x19, 0xe7, 0xc3,
++    0x40, 0x97, 0xbd, 0xd4, 0xed, 0xfa, 0x5e, 0x28, },
++  { 0x02, },
++  { 0x52, 0xa8, },
++  { 0x38, 0x25, 0x0d, },
++  { 0xe3, 0x04, 0xd4, 0x92, },
++  { 0x97, 0xdb, 0xf7, 0x81, 0xca, },
++  { 0x8a, 0x56, 0x9d, 0x62, 0x56, 0xcc, },
++  { 0xa1, 0x8e, 0x3c, 0x72, 0x8f, 0x63, 0x03, },
++  { 0xf7, 0xf3, 0x39, 0x09, 0x0a, 0xa1, 0xbb, 0x23, },
++  { 0x6b, 0x03, 0xc0, 0xe9, 0xd9, 0x83, 0x05, 0x22, 0x01, },
++  { 0x1b, 0x4b, 0xf5, 0xd6, 0x4f, 0x05, 0x75, 0x91, 0x4c, 0x7f, },
++  { 0x4c, 0x8c, 0x25, 0x20, 0x21, 0xcb, 0xc2, 0x4b, 0x3a, 0x5b, 0x8d, },
++  { 0x56, 0xe2, 0x77, 0xa0, 0xb6, 0x9f, 0x81, 0xec, 0x83, 0x75, 0xc4, 0xf9, },
++  { 0x71, 0x70, 0x0f, 0xad, 0x4d, 0x35, 0x81, 0x9d, 0x88, 0x69, 0xf9, 0xaa,
++    0xd3, },
++  { 0x50, 0x6e, 0x86, 0x6e, 0x43, 0xc0, 0xc2, 0x44, 0xc2, 0xe2, 0xa0, 0x1c,
++    0xb7, 0x9a, },
++  { 0xe4, 0x7e, 0x72, 0xc6, 0x12, 0x8e, 0x7c, 0xfc, 0xbd, 0xe2, 0x08, 0x31,
++    0x3d, 0x47, 0x3d, },
++  { 0x08, 0x97, 0x5b, 0x80, 0xae, 0xc4, 0x1d, 0x50, 0x77, 0xdf, 0x1f, 0xd0,
++    0x24, 0xf0, 0x17, 0xc0, },
++  { 0x01, 0xb6, 0x29, 0xf4, 0xaf, 0x78, 0x5f, 0xb6, 0x91, 0xdd, 0x76, 0x76,
++    0xd2, 0xfd, 0x0c, 0x47, 0x40, },
++  { 0xa1, 0xd8, 0x09, 0x97, 0x7a, 0xa6, 0xc8, 0x94, 0xf6, 0x91, 0x7b, 0xae,
++    0x2b, 0x9f, 0x0d, 0x83, 0x48, 0xf7, },
++  { 0x12, 0xd5, 0x53, 0x7d, 0x9a, 0xb0, 0xbe, 0xd9, 0xed, 0xe9, 0x9e, 0xee,
++    0x61, 0x5b, 0x42, 0xf2, 0xc0, 0x73, 0xc0, },
++  { 0xd5, 0x77, 0xd6, 0x5c, 0x6e, 0xa5, 0x69, 0x2b, 0x3b, 0x8c, 0xd6, 0x7d,
++    0x1d, 0xbe, 0x2c, 0xa1, 0x02, 0x21, 0xcd, 0x29, },
++  { 0xa4, 0x98, 0x80, 0xca, 0x22, 0xcf, 0x6a, 0xab, 0x5e, 0x40, 0x0d, 0x61,
++    0x08, 0x21, 0xef, 0xc0, 0x6c, 0x52, 0xb4, 0xb0, 0x53, },
++  { 0xbf, 0xaf, 0x8f, 0x3b, 0x7a, 0x97, 0x33, 0xe5, 0xca, 0x07, 0x37, 0xfd,
++    0x15, 0xdf, 0xce, 0x26, 0x2a, 0xb1, 0xa7, 0x0b, 0xb3, 0xac, },
++  { 0x16, 0x22, 0xe1, 0xbc, 0x99, 0x4e, 0x01, 0xf0, 0xfa, 0xff, 0x8f, 0xa5,
++    0x0c, 0x61, 0xb0, 0xad, 0xcc, 0xb1, 0xe1, 0x21, 0x46, 0xfa, 0x2e, },
++  { 0x11, 0x5b, 0x0b, 0x2b, 0xe6, 0x14, 0xc1, 0xd5, 0x4d, 0x71, 0x5e, 0x17,
++    0xea, 0x23, 0xdd, 0x6c, 0xbd, 0x1d, 0xbe, 0x12, 0x1b, 0xee, 0x4c, 0x1a, },
++  { 0x40, 0x88, 0x22, 0xf3, 0x20, 0x6c, 0xed, 0xe1, 0x36, 0x34, 0x62, 0x2c,
++    0x98, 0x83, 0x52, 0xe2, 0x25, 0xee, 0xe9, 0xf5, 0xe1, 0x17, 0xf0, 0x5c,
++    0xae, },
++  { 0xc3, 0x76, 0x37, 0xde, 0x95, 0x8c, 0xca, 0x2b, 0x0c, 0x23, 0xe7, 0xb5,
++    0x38, 0x70, 0x61, 0xcc, 0xff, 0xd3, 0x95, 0x7b, 0xf3, 0xff, 0x1f, 0x9d,
++    0x59, 0x00, },
++  { 0x0c, 0x19, 0x52, 0x05, 0x22, 0x53, 0xcb, 0x48, 0xd7, 0x10, 0x0e, 0x7e,
++    0x14, 0x69, 0xb5, 0xa2, 0x92, 0x43, 0xa3, 0x9e, 0x4b, 0x8f, 0x51, 0x2c,
++    0x5a, 0x2c, 0x3b, },
++  { 0xe1, 0x9d, 0x70, 0x70, 0x28, 0xec, 0x86, 0x40, 0x55, 0x33, 0x56, 0xda,
++    0x88, 0xca, 0xee, 0xc8, 0x6a, 0x20, 0xb1, 0xe5, 0x3d, 0x57, 0xf8, 0x3c,
++    0x10, 0x07, 0x2a, 0xc4, },
++  { 0x0b, 0xae, 0xf1, 0xc4, 0x79, 0xee, 0x1b, 0x3d, 0x27, 0x35, 0x8d, 0x14,
++    0xd6, 0xae, 0x4e, 0x3c, 0xe9, 0x53, 0x50, 0xb5, 0xcc, 0x0c, 0xf7, 0xdf,
++    0xee, 0xa1, 0x74, 0xd6, 0x71, },
++  { 0xe6, 0xa4, 0xf4, 0x99, 0x98, 0xb9, 0x80, 0xea, 0x96, 0x7f, 0x4f, 0x33,
++    0xcf, 0x74, 0x25, 0x6f, 0x17, 0x6c, 0xbf, 0xf5, 0x5c, 0x38, 0xd0, 0xff,
++    0x96, 0xcb, 0x13, 0xf9, 0xdf, 0xfd, },
++  { 0xbe, 0x92, 0xeb, 0xba, 0x44, 0x2c, 0x24, 0x74, 0xd4, 0x03, 0x27, 0x3c,
++    0x5d, 0x5b, 0x03, 0x30, 0x87, 0x63, 0x69, 0xe0, 0xb8, 0x94, 0xf4, 0x44,
++    0x7e, 0xad, 0xcd, 0x20, 0x12, 0x16, 0x79, },
++  { 0x30, 0xf1, 0xc4, 0x8e, 0x05, 0x90, 0x2a, 0x97, 0x63, 0x94, 0x46, 0xff,
++    0xce, 0xd8, 0x67, 0xa7, 0xac, 0x33, 0x8c, 0x95, 0xb7, 0xcd, 0xa3, 0x23,
++    0x98, 0x9d, 0x76, 0x6c, 0x9d, 0xa8, 0xd6, 0x8a, },
++  { 0xbe, },
++  { 0x17, 0x6c, },
++  { 0x1a, 0x42, 0x4f, },
++  { 0xba, 0xaf, 0xb7, 0x65, },
++  { 0xc2, 0x63, 0x43, 0x6a, 0xea, },
++  { 0xe4, 0x4d, 0xad, 0xf2, 0x0b, 0x02, },
++  { 0x04, 0xc7, 0xc4, 0x7f, 0xa9, 0x2b, 0xce, },
++  { 0x66, 0xf6, 0x67, 0xcb, 0x03, 0x53, 0xc8, 0xf1, },
++  { 0x56, 0xa3, 0x60, 0x78, 0xc9, 0x5f, 0x70, 0x1b, 0x5e, },
++  { 0x99, 0xff, 0x81, 0x7c, 0x13, 0x3c, 0x29, 0x79, 0x4b, 0x65, },
++  { 0x51, 0x10, 0x50, 0x93, 0x01, 0x93, 0xb7, 0x01, 0xc9, 0x18, 0xb7, },
++  { 0x8e, 0x3c, 0x42, 0x1e, 0x5e, 0x7d, 0xc1, 0x50, 0x70, 0x1f, 0x00, 0x98, },
++  { 0x5f, 0xd9, 0x9b, 0xc8, 0xd7, 0xb2, 0x72, 0x62, 0x1a, 0x1e, 0xba, 0x92,
++    0xe9, },
++  { 0x70, 0x2b, 0xba, 0xfe, 0xad, 0x5d, 0x96, 0x3f, 0x27, 0xc2, 0x41, 0x6d,
++    0xc4, 0xb3, },
++  { 0xae, 0xe0, 0xd5, 0xd4, 0xc7, 0xae, 0x15, 0x5e, 0xdc, 0xdd, 0x33, 0x60,
++    0xd7, 0xd3, 0x5e, },
++  { 0x79, 0x8e, 0xbc, 0x9e, 0x20, 0xb9, 0x19, 0x4b, 0x63, 0x80, 0xf3, 0x16,
++    0xaf, 0x39, 0xbd, 0x92, },
++  { 0xc2, 0x0e, 0x85, 0xa0, 0x0b, 0x9a, 0xb0, 0xec, 0xde, 0x38, 0xd3, 0x10,
++    0xd9, 0xa7, 0x66, 0x27, 0xcf, },
++  { 0x0e, 0x3b, 0x75, 0x80, 0x67, 0x14, 0x0c, 0x02, 0x90, 0xd6, 0xb3, 0x02,
++    0x81, 0xf6, 0xa6, 0x87, 0xce, 0x58, },
++  { 0x79, 0xb5, 0xe9, 0x5d, 0x52, 0x4d, 0xf7, 0x59, 0xf4, 0x2e, 0x27, 0xdd,
++    0xb3, 0xed, 0x57, 0x5b, 0x82, 0xea, 0x6f, },
++  { 0xa2, 0x97, 0xf5, 0x80, 0x02, 0x3d, 0xde, 0xa3, 0xf9, 0xf6, 0xab, 0xe3,
++    0x57, 0x63, 0x7b, 0x9b, 0x10, 0x42, 0x6f, 0xf2, },
++  { 0x12, 0x7a, 0xfc, 0xb7, 0x67, 0x06, 0x0c, 0x78, 0x1a, 0xfe, 0x88, 0x4f,
++    0xc6, 0xac, 0x52, 0x96, 0x64, 0x28, 0x97, 0x84, 0x06, },
++  { 0xc5, 0x04, 0x44, 0x6b, 0xb2, 0xa5, 0xa4, 0x66, 0xe1, 0x76, 0xa2, 0x51,
++    0xf9, 0x59, 0x69, 0x97, 0x56, 0x0b, 0xbf, 0x50, 0xb3, 0x34, },
++  { 0x21, 0x32, 0x6b, 0x42, 0xb5, 0xed, 0x71, 0x8d, 0xf7, 0x5a, 0x35, 0xe3,
++    0x90, 0xe2, 0xee, 0xaa, 0x89, 0xf6, 0xc9, 0x9c, 0x4d, 0x73, 0xf4, },
++  { 0x4c, 0xa6, 0x09, 0xf4, 0x48, 0xe7, 0x46, 0xbc, 0x49, 0xfc, 0xe5, 0xda,
++    0xd1, 0x87, 0x13, 0x17, 0x4c, 0x59, 0x71, 0x26, 0x5b, 0x2c, 0x42, 0xb7, },
++  { 0x13, 0x63, 0xf3, 0x40, 0x02, 0xe5, 0xa3, 0x3a, 0x5e, 0x8e, 0xf8, 0xb6,
++    0x8a, 0x49, 0x60, 0x76, 0x34, 0x72, 0x94, 0x73, 0xf6, 0xd9, 0x21, 0x6a,
++    0x26, },
++  { 0xdf, 0x75, 0x16, 0x10, 0x1b, 0x5e, 0x81, 0xc3, 0xc8, 0xde, 0x34, 0x24,
++    0xb0, 0x98, 0xeb, 0x1b, 0x8f, 0xa1, 0x9b, 0x05, 0xee, 0xa5, 0xe9, 0x35,
++    0xf4, 0x1d, },
++  { 0xcd, 0x21, 0x93, 0x6e, 0x5b, 0xa0, 0x26, 0x2b, 0x21, 0x0e, 0xa0, 0xb9,
++    0x1c, 0xb5, 0xbb, 0xb8, 0xf8, 0x1e, 0xff, 0x5c, 0xa8, 0xf9, 0x39, 0x46,
++    0x4e, 0x29, 0x26, },
++  { 0x73, 0x7f, 0x0e, 0x3b, 0x0b, 0x5c, 0xf9, 0x60, 0xaa, 0x88, 0xa1, 0x09,
++    0xb1, 0x5d, 0x38, 0x7b, 0x86, 0x8f, 0x13, 0x7a, 0x8d, 0x72, 0x7a, 0x98,
++    0x1a, 0x5b, 0xff, 0xc9, },
++  { 0xd3, 0x3c, 0x61, 0x71, 0x44, 0x7e, 0x31, 0x74, 0x98, 0x9d, 0x9a, 0xd2,
++    0x27, 0xf3, 0x46, 0x43, 0x42, 0x51, 0xd0, 0x5f, 0xe9, 0x1c, 0x5c, 0x69,
++    0xbf, 0xf6, 0xbe, 0x3c, 0x40, },
++  { 0x31, 0x99, 0x31, 0x9f, 0xaa, 0x43, 0x2e, 0x77, 0x3e, 0x74, 0x26, 0x31,
++    0x5e, 0x61, 0xf1, 0x87, 0xe2, 0xeb, 0x9b, 0xcd, 0xd0, 0x3a, 0xee, 0x20,
++    0x7e, 0x10, 0x0a, 0x0b, 0x7e, 0xfa, },
++  { 0xa4, 0x27, 0x80, 0x67, 0x81, 0x2a, 0xa7, 0x62, 0xf7, 0x6e, 0xda, 0xd4,
++    0x5c, 0x39, 0x74, 0xad, 0x7e, 0xbe, 0xad, 0xa5, 0x84, 0x7f, 0xa9, 0x30,
++    0x5d, 0xdb, 0xe2, 0x05, 0x43, 0xf7, 0x1b, },
++  { 0x0b, 0x37, 0xd8, 0x02, 0xe1, 0x83, 0xd6, 0x80, 0xf2, 0x35, 0xc2, 0xb0,
++    0x37, 0xef, 0xef, 0x5e, 0x43, 0x93, 0xf0, 0x49, 0x45, 0x0a, 0xef, 0xb5,
++    0x76, 0x70, 0x12, 0x44, 0xc4, 0xdb, 0xf5, 0x7a, },
++  { 0x1f, },
++  { 0x82, 0x60, },
++  { 0xcc, 0xe3, 0x08, },
++  { 0x56, 0x17, 0xe4, 0x59, },
++  { 0xe2, 0xd7, 0x9e, 0xc4, 0x4c, },
++  { 0xb2, 0xad, 0xd3, 0x78, 0x58, 0x5a, },
++  { 0xce, 0x43, 0xb4, 0x02, 0x96, 0xab, 0x3c, },
++  { 0xe6, 0x05, 0x1a, 0x73, 0x22, 0x32, 0xbb, 0x77, },
++  { 0x23, 0xe7, 0xda, 0xfe, 0x2c, 0xef, 0x8c, 0x22, 0xec, },
++  { 0xe9, 0x8e, 0x55, 0x38, 0xd1, 0xd7, 0x35, 0x23, 0x98, 0xc7, },
++  { 0xb5, 0x81, 0x1a, 0xe5, 0xb5, 0xa5, 0xd9, 0x4d, 0xca, 0x41, 0xe7, },
++  { 0x41, 0x16, 0x16, 0x95, 0x8d, 0x9e, 0x0c, 0xea, 0x8c, 0x71, 0x9a, 0xc1, },
++  { 0x7c, 0x33, 0xc0, 0xa4, 0x00, 0x62, 0xea, 0x60, 0x67, 0xe4, 0x20, 0xbc,
++    0x5b, },
++  { 0xdb, 0xb1, 0xdc, 0xfd, 0x08, 0xc0, 0xde, 0x82, 0xd1, 0xde, 0x38, 0xc0,
++    0x90, 0x48, },
++  { 0x37, 0x18, 0x2e, 0x0d, 0x61, 0xaa, 0x61, 0xd7, 0x86, 0x20, 0x16, 0x60,
++    0x04, 0xd9, 0xd5, },
++  { 0xb0, 0xcf, 0x2c, 0x4c, 0x5e, 0x5b, 0x4f, 0x2a, 0x23, 0x25, 0x58, 0x47,
++    0xe5, 0x31, 0x06, 0x70, },
++  { 0x91, 0xa0, 0xa3, 0x86, 0x4e, 0xe0, 0x72, 0x38, 0x06, 0x67, 0x59, 0x5c,
++    0x70, 0x25, 0xdb, 0x33, 0x27, },
++  { 0x44, 0x58, 0x66, 0xb8, 0x58, 0xc7, 0x13, 0xed, 0x4c, 0xc0, 0xf4, 0x9a,
++    0x1e, 0x67, 0x75, 0x33, 0xb6, 0xb8, },
++  { 0x7f, 0x98, 0x4a, 0x8e, 0x50, 0xa2, 0x5c, 0xcd, 0x59, 0xde, 0x72, 0xb3,
++    0x9d, 0xc3, 0x09, 0x8a, 0xab, 0x56, 0xf1, },
++  { 0x80, 0x96, 0x49, 0x1a, 0x59, 0xa2, 0xc5, 0xd5, 0xa7, 0x20, 0x8a, 0xb7,
++    0x27, 0x62, 0x84, 0x43, 0xc6, 0xe1, 0x1b, 0x5d, },
++  { 0x6b, 0xb7, 0x2b, 0x26, 0x62, 0x14, 0x70, 0x19, 0x3d, 0x4d, 0xac, 0xac,
++    0x63, 0x58, 0x5e, 0x94, 0xb5, 0xb7, 0xe8, 0xe8, 0xa2, },
++  { 0x20, 0xa8, 0xc0, 0xfd, 0x63, 0x3d, 0x6e, 0x98, 0xcf, 0x0c, 0x49, 0x98,
++    0xe4, 0x5a, 0xfe, 0x8c, 0xaa, 0x70, 0x82, 0x1c, 0x7b, 0x74, },
++  { 0xc8, 0xe8, 0xdd, 0xdf, 0x69, 0x30, 0x01, 0xc2, 0x0f, 0x7e, 0x2f, 0x11,
++    0xcc, 0x3e, 0x17, 0xa5, 0x69, 0x40, 0x3f, 0x0e, 0x79, 0x7f, 0xcf, },
++  { 0xdb, 0x61, 0xc0, 0xe2, 0x2e, 0x49, 0x07, 0x31, 0x1d, 0x91, 0x42, 0x8a,
++    0xfc, 0x5e, 0xd3, 0xf8, 0x56, 0x1f, 0x2b, 0x73, 0xfd, 0x9f, 0xb2, 0x8e, },
++  { 0x0c, 0x89, 0x55, 0x0c, 0x1f, 0x59, 0x2c, 0x9d, 0x1b, 0x29, 0x1d, 0x41,
++    0x1d, 0xe6, 0x47, 0x8f, 0x8c, 0x2b, 0xea, 0x8f, 0xf0, 0xff, 0x21, 0x70,
++    0x88, },
++  { 0x12, 0x18, 0x95, 0xa6, 0x59, 0xb1, 0x31, 0x24, 0x45, 0x67, 0x55, 0xa4,
++    0x1a, 0x2d, 0x48, 0x67, 0x1b, 0x43, 0x88, 0x2d, 0x8e, 0xa0, 0x70, 0xb3,
++    0xc6, 0xbb, },
++  { 0xe7, 0xb1, 0x1d, 0xb2, 0x76, 0x4d, 0x68, 0x68, 0x68, 0x23, 0x02, 0x55,
++    0x3a, 0xe2, 0xe5, 0xd5, 0x4b, 0x43, 0xf9, 0x34, 0x77, 0x5c, 0xa1, 0xf5,
++    0x55, 0xfd, 0x4f, },
++  { 0x8c, 0x87, 0x5a, 0x08, 0x3a, 0x73, 0xad, 0x61, 0xe1, 0xe7, 0x99, 0x7e,
++    0xf0, 0x5d, 0xe9, 0x5d, 0x16, 0x43, 0x80, 0x2f, 0xd0, 0x66, 0x34, 0xe2,
++    0x42, 0x64, 0x3b, 0x1a, },
++  { 0x39, 0xc1, 0x99, 0xcf, 0x22, 0xbf, 0x16, 0x8f, 0x9f, 0x80, 0x7f, 0x95,
++    0x0a, 0x05, 0x67, 0x27, 0xe7, 0x15, 0xdf, 0x9d, 0xb2, 0xfe, 0x1c, 0xb5,
++    0x1d, 0x60, 0x8f, 0x8a, 0x1d, },
++  { 0x9b, 0x6e, 0x08, 0x09, 0x06, 0x73, 0xab, 0x68, 0x02, 0x62, 0x1a, 0xe4,
++    0xd4, 0xdf, 0xc7, 0x02, 0x4c, 0x6a, 0x5f, 0xfd, 0x23, 0xac, 0xae, 0x6d,
++    0x43, 0xa4, 0x7a, 0x50, 0x60, 0x3c, },
++  { 0x1d, 0xb4, 0xc6, 0xe1, 0xb1, 0x4b, 0xe3, 0xf2, 0xe2, 0x1a, 0x73, 0x1b,
++    0xa0, 0x92, 0xa7, 0xf5, 0xff, 0x8f, 0x8b, 0x5d, 0xdf, 0xa8, 0x04, 0xb3,
++    0xb0, 0xf7, 0xcc, 0x12, 0xfa, 0x35, 0x46, },
++  { 0x49, 0x45, 0x97, 0x11, 0x0f, 0x1c, 0x60, 0x8e, 0xe8, 0x47, 0x30, 0xcf,
++    0x60, 0xa8, 0x71, 0xc5, 0x1b, 0xe9, 0x39, 0x4d, 0x49, 0xb6, 0x12, 0x1f,
++    0x24, 0xab, 0x37, 0xff, 0x83, 0xc2, 0xe1, 0x3a, },
++  { 0x60, },
++  { 0x24, 0x26, },
++  { 0x47, 0xeb, 0xc9, },
++  { 0x4a, 0xd0, 0xbc, 0xf0, },
++  { 0x8e, 0x2b, 0xc9, 0x85, 0x3c, },
++  { 0xa2, 0x07, 0x15, 0xb8, 0x12, 0x74, },
++  { 0x0f, 0xdb, 0x5b, 0x33, 0x69, 0xfe, 0x4b, },
++  { 0xa2, 0x86, 0x54, 0xf4, 0xfd, 0xb2, 0xd4, 0xe6, },
++  { 0xbb, 0x84, 0x78, 0x49, 0x27, 0x8e, 0x61, 0xda, 0x60, },
++  { 0x04, 0xc3, 0xcd, 0xaa, 0x8f, 0xa7, 0x03, 0xc9, 0xf9, 0xb6, },
++  { 0xf8, 0x27, 0x1d, 0x61, 0xdc, 0x21, 0x42, 0xdd, 0xad, 0x92, 0x40, },
++  { 0x12, 0x87, 0xdf, 0xc2, 0x41, 0x45, 0x5a, 0x36, 0x48, 0x5b, 0x51, 0x2b, },
++  { 0xbb, 0x37, 0x5d, 0x1f, 0xf1, 0x68, 0x7a, 0xc4, 0xa5, 0xd2, 0xa4, 0x91,
++    0x8d, },
++  { 0x5b, 0x27, 0xd1, 0x04, 0x54, 0x52, 0x9f, 0xa3, 0x47, 0x86, 0x33, 0x33,
++    0xbf, 0xa0, },
++  { 0xcf, 0x04, 0xea, 0xf8, 0x03, 0x2a, 0x43, 0xff, 0xa6, 0x68, 0x21, 0x4c,
++    0xd5, 0x4b, 0xed, },
++  { 0xaf, 0xb8, 0xbc, 0x63, 0x0f, 0x18, 0x4d, 0xe2, 0x7a, 0xdd, 0x46, 0x44,
++    0xc8, 0x24, 0x0a, 0xb7, },
++  { 0x3e, 0xdc, 0x36, 0xe4, 0x89, 0xb1, 0xfa, 0xc6, 0x40, 0x93, 0x2e, 0x75,
++    0xb2, 0x15, 0xd1, 0xb1, 0x10, },
++  { 0x6c, 0xd8, 0x20, 0x3b, 0x82, 0x79, 0xf9, 0xc8, 0xbc, 0x9d, 0xe0, 0x35,
++    0xbe, 0x1b, 0x49, 0x1a, 0xbc, 0x3a, },
++  { 0x78, 0x65, 0x2c, 0xbe, 0x35, 0x67, 0xdc, 0x78, 0xd4, 0x41, 0xf6, 0xc9,
++    0xde, 0xde, 0x1f, 0x18, 0x13, 0x31, 0x11, },
++  { 0x8a, 0x7f, 0xb1, 0x33, 0x8f, 0x0c, 0x3c, 0x0a, 0x06, 0x61, 0xf0, 0x47,
++    0x29, 0x1b, 0x29, 0xbc, 0x1c, 0x47, 0xef, 0x7a, },
++  { 0x65, 0x91, 0xf1, 0xe6, 0xb3, 0x96, 0xd3, 0x8c, 0xc2, 0x4a, 0x59, 0x35,
++    0x72, 0x8e, 0x0b, 0x9a, 0x87, 0xca, 0x34, 0x7b, 0x63, },
++  { 0x5f, 0x08, 0x87, 0x80, 0x56, 0x25, 0x89, 0x77, 0x61, 0x8c, 0x64, 0xa1,
++    0x59, 0x6d, 0x59, 0x62, 0xe8, 0x4a, 0xc8, 0x58, 0x99, 0xd1, },
++  { 0x23, 0x87, 0x1d, 0xed, 0x6f, 0xf2, 0x91, 0x90, 0xe2, 0xfe, 0x43, 0x21,
++    0xaf, 0x97, 0xc6, 0xbc, 0xd7, 0x15, 0xc7, 0x2d, 0x08, 0x77, 0x91, },
++  { 0x90, 0x47, 0x9a, 0x9e, 0x3a, 0xdf, 0xf3, 0xc9, 0x4c, 0x1e, 0xa7, 0xd4,
++    0x6a, 0x32, 0x90, 0xfe, 0xb7, 0xb6, 0x7b, 0xfa, 0x96, 0x61, 0xfb, 0xa4, },
++  { 0xb1, 0x67, 0x60, 0x45, 0xb0, 0x96, 0xc5, 0x15, 0x9f, 0x4d, 0x26, 0xd7,
++    0x9d, 0xf1, 0xf5, 0x6d, 0x21, 0x00, 0x94, 0x31, 0x64, 0x94, 0xd3, 0xa7,
++    0xd3, },
++  { 0x02, 0x3e, 0xaf, 0xf3, 0x79, 0x73, 0xa5, 0xf5, 0xcc, 0x7a, 0x7f, 0xfb,
++    0x79, 0x2b, 0x85, 0x8c, 0x88, 0x72, 0x06, 0xbe, 0xfe, 0xaf, 0xc1, 0x16,
++    0xa6, 0xd6, },
++  { 0x2a, 0xb0, 0x1a, 0xe5, 0xaa, 0x6e, 0xb3, 0xae, 0x53, 0x85, 0x33, 0x80,
++    0x75, 0xae, 0x30, 0xe6, 0xb8, 0x72, 0x42, 0xf6, 0x25, 0x4f, 0x38, 0x88,
++    0x55, 0xd1, 0xa9, },
++  { 0x90, 0xd8, 0x0c, 0xc0, 0x93, 0x4b, 0x4f, 0x9e, 0x65, 0x6c, 0xa1, 0x54,
++    0xa6, 0xf6, 0x6e, 0xca, 0xd2, 0xbb, 0x7e, 0x6a, 0x1c, 0xd3, 0xce, 0x46,
++    0xef, 0xb0, 0x00, 0x8d, },
++  { 0xed, 0x9c, 0x49, 0xcd, 0xc2, 0xde, 0x38, 0x0e, 0xe9, 0x98, 0x6c, 0xc8,
++    0x90, 0x9e, 0x3c, 0xd4, 0xd3, 0xeb, 0x88, 0x32, 0xc7, 0x28, 0xe3, 0x94,
++    0x1c, 0x9f, 0x8b, 0xf3, 0xcb, },
++  { 0xac, 0xe7, 0x92, 0x16, 0xb4, 0x14, 0xa0, 0xe4, 0x04, 0x79, 0xa2, 0xf4,
++    0x31, 0xe6, 0x0c, 0x26, 0xdc, 0xbf, 0x2f, 0x69, 0x1b, 0x55, 0x94, 0x67,
++    0xda, 0x0c, 0xd7, 0x32, 0x1f, 0xef, },
++  { 0x68, 0x63, 0x85, 0x57, 0x95, 0x9e, 0x42, 0x27, 0x41, 0x43, 0x42, 0x02,
++    0xa5, 0x78, 0xa7, 0xc6, 0x43, 0xc1, 0x6a, 0xba, 0x70, 0x80, 0xcd, 0x04,
++    0xb6, 0x78, 0x76, 0x29, 0xf3, 0xe8, 0xa0, },
++  { 0xe6, 0xac, 0x8d, 0x9d, 0xf0, 0xc0, 0xf7, 0xf7, 0xe3, 0x3e, 0x4e, 0x28,
++    0x0f, 0x59, 0xb2, 0x67, 0x9e, 0x84, 0x34, 0x42, 0x96, 0x30, 0x2b, 0xca,
++    0x49, 0xb6, 0xc5, 0x9a, 0x84, 0x59, 0xa7, 0x81, },
++  { 0x7e, },
++  { 0x1e, 0x21, },
++  { 0x26, 0xd3, 0xdd, },
++  { 0x2c, 0xd4, 0xb3, 0x3d, },
++  { 0x86, 0x7b, 0x76, 0x3c, 0xf0, },
++  { 0x12, 0xc3, 0x70, 0x1d, 0x55, 0x18, },
++  { 0x96, 0xc2, 0xbd, 0x61, 0x55, 0xf4, 0x24, },
++  { 0x20, 0x51, 0xf7, 0x86, 0x58, 0x8f, 0x07, 0x2a, },
++  { 0x93, 0x15, 0xa8, 0x1d, 0xda, 0x97, 0xee, 0x0e, 0x6c, },
++  { 0x39, 0x93, 0xdf, 0xd5, 0x0e, 0xca, 0xdc, 0x7a, 0x92, 0xce, },
++  { 0x60, 0xd5, 0xfd, 0xf5, 0x1b, 0x26, 0x82, 0x26, 0x73, 0x02, 0xbc, },
++  { 0x98, 0xf2, 0x34, 0xe1, 0xf5, 0xfb, 0x00, 0xac, 0x10, 0x4a, 0x38, 0x9f, },
++  { 0xda, 0x3a, 0x92, 0x8a, 0xd0, 0xcd, 0x12, 0xcd, 0x15, 0xbb, 0xab, 0x77,
++    0x66, },
++  { 0xa2, 0x92, 0x1a, 0xe5, 0xca, 0x0c, 0x30, 0x75, 0xeb, 0xaf, 0x00, 0x31,
++    0x55, 0x66, },
++  { 0x06, 0xea, 0xfd, 0x3e, 0x86, 0x38, 0x62, 0x4e, 0xa9, 0x12, 0xa4, 0x12,
++    0x43, 0xbf, 0xa1, },
++  { 0xe4, 0x71, 0x7b, 0x94, 0xdb, 0xa0, 0xd2, 0xff, 0x9b, 0xeb, 0xad, 0x8e,
++    0x95, 0x8a, 0xc5, 0xed, },
++  { 0x25, 0x5a, 0x77, 0x71, 0x41, 0x0e, 0x7a, 0xe9, 0xed, 0x0c, 0x10, 0xef,
++    0xf6, 0x2b, 0x3a, 0xba, 0x60, },
++  { 0xee, 0xe2, 0xa3, 0x67, 0x64, 0x1d, 0xc6, 0x04, 0xc4, 0xe1, 0x68, 0xd2,
++    0x6e, 0xd2, 0x91, 0x75, 0x53, 0x07, },
++  { 0xe0, 0xf6, 0x4d, 0x8f, 0x68, 0xfc, 0x06, 0x7e, 0x18, 0x79, 0x7f, 0x2b,
++    0x6d, 0xef, 0x46, 0x7f, 0xab, 0xb2, 0xad, },
++  { 0x3d, 0x35, 0x88, 0x9f, 0x2e, 0xcf, 0x96, 0x45, 0x07, 0x60, 0x71, 0x94,
++    0x00, 0x8d, 0xbf, 0xf4, 0xef, 0x46, 0x2e, 0x3c, },
++  { 0x43, 0xcf, 0x98, 0xf7, 0x2d, 0xf4, 0x17, 0xe7, 0x8c, 0x05, 0x2d, 0x9b,
++    0x24, 0xfb, 0x4d, 0xea, 0x4a, 0xec, 0x01, 0x25, 0x29, },
++  { 0x8e, 0x73, 0x9a, 0x78, 0x11, 0xfe, 0x48, 0xa0, 0x3b, 0x1a, 0x26, 0xdf,
++    0x25, 0xe9, 0x59, 0x1c, 0x70, 0x07, 0x9f, 0xdc, 0xa0, 0xa6, },
++  { 0xe8, 0x47, 0x71, 0xc7, 0x3e, 0xdf, 0xb5, 0x13, 0xb9, 0x85, 0x13, 0xa8,
++    0x54, 0x47, 0x6e, 0x59, 0x96, 0x09, 0x13, 0x5f, 0x82, 0x16, 0x0b, },
++  { 0xfb, 0xc0, 0x8c, 0x03, 0x21, 0xb3, 0xc4, 0xb5, 0x43, 0x32, 0x6c, 0xea,
++    0x7f, 0xa8, 0x43, 0x91, 0xe8, 0x4e, 0x3f, 0xbf, 0x45, 0x58, 0x6a, 0xa3, },
++  { 0x55, 0xf8, 0xf3, 0x00, 0x76, 0x09, 0xef, 0x69, 0x5d, 0xd2, 0x8a, 0xf2,
++    0x65, 0xc3, 0xcb, 0x9b, 0x43, 0xfd, 0xb1, 0x7e, 0x7f, 0xa1, 0x94, 0xb0,
++    0xd7, },
++  { 0xaa, 0x13, 0xc1, 0x51, 0x40, 0x6d, 0x8d, 0x4c, 0x0a, 0x95, 0x64, 0x7b,
++    0xd1, 0x96, 0xb6, 0x56, 0xb4, 0x5b, 0xcf, 0xd6, 0xd9, 0x15, 0x97, 0xdd,
++    0xb6, 0xef, },
++  { 0xaf, 0xb7, 0x36, 0xb0, 0x04, 0xdb, 0xd7, 0x9c, 0x9a, 0x44, 0xc4, 0xf6,
++    0x1f, 0x12, 0x21, 0x2d, 0x59, 0x30, 0x54, 0xab, 0x27, 0x61, 0xa3, 0x57,
++    0xef, 0xf8, 0x53, },
++  { 0x97, 0x34, 0x45, 0x3e, 0xce, 0x7c, 0x35, 0xa2, 0xda, 0x9f, 0x4b, 0x46,
++    0x6c, 0x11, 0x67, 0xff, 0x2f, 0x76, 0x58, 0x15, 0x71, 0xfa, 0x44, 0x89,
++    0x89, 0xfd, 0xf7, 0x99, },
++  { 0x1f, 0xb1, 0x62, 0xeb, 0x83, 0xc5, 0x9c, 0x89, 0xf9, 0x2c, 0xd2, 0x03,
++    0x61, 0xbc, 0xbb, 0xa5, 0x74, 0x0e, 0x9b, 0x7e, 0x82, 0x3e, 0x70, 0x0a,
++    0xa9, 0x8f, 0x2b, 0x59, 0xfb, },
++  { 0xf8, 0xca, 0x5e, 0x3a, 0x4f, 0x9e, 0x10, 0x69, 0x10, 0xd5, 0x4c, 0xeb,
++    0x1a, 0x0f, 0x3c, 0x6a, 0x98, 0xf5, 0xb0, 0x97, 0x5b, 0x37, 0x2f, 0x0d,
++    0xbd, 0x42, 0x4b, 0x69, 0xa1, 0x82, },
++  { 0x12, 0x8c, 0x6d, 0x52, 0x08, 0xef, 0x74, 0xb2, 0xe6, 0xaa, 0xd3, 0xb0,
++    0x26, 0xb0, 0xd9, 0x94, 0xb6, 0x11, 0x45, 0x0e, 0x36, 0x71, 0x14, 0x2d,
++    0x41, 0x8c, 0x21, 0x53, 0x31, 0xe9, 0x68, },
++  { 0xee, 0xea, 0x0d, 0x89, 0x47, 0x7e, 0x72, 0xd1, 0xd8, 0xce, 0x58, 0x4c,
++    0x94, 0x1f, 0x0d, 0x51, 0x08, 0xa3, 0xb6, 0x3d, 0xe7, 0x82, 0x46, 0x92,
++    0xd6, 0x98, 0x6b, 0x07, 0x10, 0x65, 0x52, 0x65, },
++};
++
++static const u8 blake2s_hmac_testvecs[][BLAKE2S_HASH_SIZE] __initconst = {
++  { 0xce, 0xe1, 0x57, 0x69, 0x82, 0xdc, 0xbf, 0x43, 0xad, 0x56, 0x4c, 0x70,
++    0xed, 0x68, 0x16, 0x96, 0xcf, 0xa4, 0x73, 0xe8, 0xe8, 0xfc, 0x32, 0x79,
++    0x08, 0x0a, 0x75, 0x82, 0xda, 0x3f, 0x05, 0x11, }, /* [0]: in=buf, key=key */
++  { 0x77, 0x2f, 0x0c, 0x71, 0x41, 0xf4, 0x4b, 0x2b, 0xb3, 0xc6, 0xb6, 0xf9,
++    0x60, 0xde, 0xe4, 0x52, 0x38, 0x66, 0xe8, 0xbf, 0x9b, 0x96, 0xc4, 0x9f,
++    0x60, 0xd9, 0x24, 0x37, 0x99, 0xd6, 0xec, 0x31, }, /* [1]: in=key, key=buf */
++};
++
++bool __init blake2s_selftest(void) /* true iff every vector below matches */
++{
++      u8 key[BLAKE2S_KEY_SIZE];
++      u8 buf[ARRAY_SIZE(blake2s_testvecs)];
++      u8 hash[BLAKE2S_HASH_SIZE];
++      struct blake2s_state state;
++      bool success = true;
++      int i, l;
++
++      key[0] = key[1] = 1;
++      for (i = 2; i < sizeof(key); ++i)
++              key[i] = key[i - 2] + key[i - 1]; /* Fibonacci, wrapping in u8 */
++      /* Message bytes simply count upwards: buf[n] == (u8)n. */
++      for (i = 0; i < sizeof(buf); ++i)
++              buf[i] = (u8)i;
++      /* Vector i covers the first i bytes of buf; l (always <= i) splits them. */
++      for (i = l = 0; i < ARRAY_SIZE(blake2s_testvecs); l = (l + 37) % ++i) {
++              int outlen = 1 + i % BLAKE2S_HASH_SIZE; /* 1..BLAKE2S_HASH_SIZE */
++              int keylen = (13 * i) % (BLAKE2S_KEY_SIZE + 1); /* 0 = unkeyed */
++
++              blake2s(hash, buf, key + BLAKE2S_KEY_SIZE - keylen, outlen, i,
++                      keylen); /* one-shot API; key = last keylen bytes of key[] */
++              if (memcmp(hash, blake2s_testvecs[i], outlen)) {
++                      pr_err("blake2s self-test %d: FAIL\n", i + 1);
++                      success = false;
++              }
++              /* Same vector again via init/update/final, input split at l. */
++              if (!keylen)
++                      blake2s_init(&state, outlen);
++              else
++                      blake2s_init_key(&state, outlen,
++                                       key + BLAKE2S_KEY_SIZE - keylen,
++                                       keylen);
++
++              blake2s_update(&state, buf, l);
++              blake2s_update(&state, buf + l, i - l);
++              blake2s_final(&state, hash);
++              if (memcmp(hash, blake2s_testvecs[i], outlen)) {
++                      pr_err("blake2s init/update/final self-test %d: FAIL\n",
++                             i + 1);
++                      success = false;
++              }
++      }
++      /* The HMAC vectors are only checked if everything above passed. */
++      if (success) {
++              blake2s256_hmac(hash, buf, key, sizeof(buf), sizeof(key));
++              success &= !memcmp(hash, blake2s_hmac_testvecs[0], BLAKE2S_HASH_SIZE);
++
++              blake2s256_hmac(hash, key, buf, sizeof(key), sizeof(buf));
++              success &= !memcmp(hash, blake2s_hmac_testvecs[1], BLAKE2S_HASH_SIZE);
++
++              if (!success)
++                      pr_err("blake2s256_hmac self-test: FAIL\n");
++      }
++
++      return success;
++}
+--- /dev/null
++++ b/lib/crypto/blake2s.c
+@@ -0,0 +1,126 @@
++// SPDX-License-Identifier: GPL-2.0 OR MIT
++/*
++ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
++ *
++ * This is an implementation of the BLAKE2s hash and PRF functions.
++ *
++ * Information: https://blake2.net/
++ *
++ */
++
++#include <crypto/internal/blake2s.h>
++#include <linux/types.h>
++#include <linux/string.h>
++#include <linux/kernel.h>
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/bug.h>
++#include <asm/unaligned.h>
++
++bool blake2s_selftest(void);
++
++void blake2s_update(struct blake2s_state *state, const u8 *in, size_t inlen)
++{
++      const size_t fill = BLAKE2S_BLOCK_SIZE - state->buflen; /* room left in buf */
++      /* Absorbs inlen bytes of in, always leaving the newest data buffered. */
++      if (unlikely(!inlen))
++              return;
++      if (inlen > fill) { /* strictly more: an exactly-full buf is not flushed yet */
++              memcpy(state->buf + state->buflen, in, fill);
++              if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_BLAKE2S))
++                      blake2s_compress_arch(state, state->buf, 1,
++                                            BLAKE2S_BLOCK_SIZE);
++              else
++                      blake2s_compress_generic(state, state->buf, 1,
++                                               BLAKE2S_BLOCK_SIZE);
++              state->buflen = 0;
++              in += fill;
++              inlen -= fill;
++      }
++      if (inlen > BLAKE2S_BLOCK_SIZE) {
++              const size_t nblocks = DIV_ROUND_UP(inlen, BLAKE2S_BLOCK_SIZE);
++              /* Hash all but the last block, which stays buffered even if full */
++              if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_BLAKE2S))
++                      blake2s_compress_arch(state, in, nblocks - 1,
++                                            BLAKE2S_BLOCK_SIZE);
++              else
++                      blake2s_compress_generic(state, in, nblocks - 1,
++                                               BLAKE2S_BLOCK_SIZE);
++              in += BLAKE2S_BLOCK_SIZE * (nblocks - 1);
++              inlen -= BLAKE2S_BLOCK_SIZE * (nblocks - 1);
++      }
++      memcpy(state->buf + state->buflen, in, inlen); /* tail always fits in buf */
++      state->buflen += inlen;
++}
++EXPORT_SYMBOL(blake2s_update);
++
++void blake2s_final(struct blake2s_state *state, u8 *out) /* emit digest, wipe state */
++{
++      WARN_ON(IS_ENABLED(DEBUG) && !out); /* NULL check is active only with DEBUG */
++      blake2s_set_lastblock(state);
++      memset(state->buf + state->buflen, 0,
++             BLAKE2S_BLOCK_SIZE - state->buflen); /* Zero-pad the final block */
++      if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_BLAKE2S))
++              blake2s_compress_arch(state, state->buf, 1, state->buflen);
++      else
++              blake2s_compress_generic(state, state->buf, 1, state->buflen);
++      cpu_to_le32_array(state->h, ARRAY_SIZE(state->h)); /* digest is read out as LE */
++      memcpy(out, state->h, state->outlen); /* copies state->outlen bytes only */
++      memzero_explicit(state, sizeof(*state)); /* state must be re-initialised to reuse */
++}
++EXPORT_SYMBOL(blake2s_final);
++
++void blake2s256_hmac(u8 *out, const u8 *in, const u8 *key, const size_t inlen,
++                   const size_t keylen)
++{
++      struct blake2s_state state;
++      u8 x_key[BLAKE2S_BLOCK_SIZE] __aligned(__alignof__(u32)) = { 0 };
++      u8 i_hash[BLAKE2S_HASH_SIZE] __aligned(__alignof__(u32));
++      int i;
++      /* HMAC over BLAKE2s-256: out = H((K ^ opad) || H((K ^ ipad) || in)). */
++      if (keylen > BLAKE2S_BLOCK_SIZE) {
++              blake2s_init(&state, BLAKE2S_HASH_SIZE);
++              blake2s_update(&state, key, keylen);
++              blake2s_final(&state, x_key);
++      } else
++              memcpy(x_key, key, keylen);
++      /* x_key = K ^ ipad, K being the zero-padded (or pre-hashed) key. */
++      for (i = 0; i < BLAKE2S_BLOCK_SIZE; ++i)
++              x_key[i] ^= 0x36;
++
++      blake2s_init(&state, BLAKE2S_HASH_SIZE);
++      blake2s_update(&state, x_key, BLAKE2S_BLOCK_SIZE);
++      blake2s_update(&state, in, inlen);
++      blake2s_final(&state, i_hash);
++      /* XORing with (0x5c ^ 0x36) cancels ipad and applies opad in one pass. */
++      for (i = 0; i < BLAKE2S_BLOCK_SIZE; ++i)
++              x_key[i] ^= 0x5c ^ 0x36;
++
++      blake2s_init(&state, BLAKE2S_HASH_SIZE);
++      blake2s_update(&state, x_key, BLAKE2S_BLOCK_SIZE);
++      blake2s_update(&state, i_hash, BLAKE2S_HASH_SIZE);
++      blake2s_final(&state, i_hash);
++
++      memcpy(out, i_hash, BLAKE2S_HASH_SIZE);
++      memzero_explicit(x_key, BLAKE2S_BLOCK_SIZE); /* scrub key-derived bytes */
++      memzero_explicit(i_hash, BLAKE2S_HASH_SIZE); /* scrub the local digest copy */
++}
++EXPORT_SYMBOL(blake2s256_hmac);
++
++static int __init mod_init(void) /* registered below via module_init() */
++{
++      if (!IS_ENABLED(CONFIG_CRYPTO_MANAGER_DISABLE_TESTS) &&
++          WARN_ON(!blake2s_selftest())) /* self-test is skipped if tests are disabled */
++              return -ENODEV; /* self-test failed: warn and fail initialisation */
++      return 0;
++}
++
++static void __exit mod_exit(void) /* no-op: mod_init() only runs the self-test */
++{
++}
++
++module_init(mod_init);
++module_exit(mod_exit);
++MODULE_LICENSE("GPL v2");
++MODULE_DESCRIPTION("BLAKE2s hash function");
++MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>");
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0022-crypto-testmgr-add-test-cases-for-Blake2s.patch b/target/linux/generic/backport-5.4/080-wireguard-0022-crypto-testmgr-add-test-cases-for-Blake2s.patch

new file mode 100644 (file)

index 0000000..95ace4b
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0022-crypto-testmgr-add-test-cases-for-Blake2s.patch
@@ -0,0 +1,322 @@
+From 4852555d88528a86fc20ac63da7aca29f9071193 Mon Sep 17 00:00:00 2001
+From: Ard Biesheuvel <ardb@kernel.org>
+Date: Fri, 8 Nov 2019 13:22:29 +0100
+Subject: [PATCH 022/124] crypto: testmgr - add test cases for Blake2s
+
+commit 17e1df67023a5c9ccaeb5de8bf5b88f63127ecf7 upstream.
+
+As suggested by Eric for the Blake2b implementation contributed by
+David, introduce a set of test vectors for Blake2s covering different
+digest and key sizes.
+
+          blake2s-128  blake2s-160  blake2s-224  blake2s-256
+         ---------------------------------------------------
+len=0   | klen=0       klen=1       klen=16      klen=32
+len=1   | klen=16      klen=32      klen=0       klen=1
+len=7   | klen=32      klen=0       klen=1       klen=16
+len=15  | klen=1       klen=16      klen=32      klen=0
+len=64  | klen=0       klen=1       klen=16      klen=32
+len=247 | klen=16      klen=32      klen=0       klen=1
+len=256 | klen=32      klen=0       klen=1       klen=16
+
+Cc: David Sterba <dsterba@suse.com>
+Cc: Eric Biggers <ebiggers@google.com>
+Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
+Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ crypto/testmgr.c |  24 +++++
+ crypto/testmgr.h | 251 +++++++++++++++++++++++++++++++++++++++++++++++
+ 2 files changed, 275 insertions(+)
+
+--- a/crypto/testmgr.c
++++ b/crypto/testmgr.c
+@@ -4035,6 +4035,30 @@ static const struct alg_test_desc alg_te
+               .test = alg_test_null,
+               .fips_allowed = 1,
+       }, {
++              .alg = "blake2s-128",
++              .test = alg_test_hash,
++              .suite = {
++                      .hash = __VECS(blakes2s_128_tv_template)
++              }
++      }, {
++              .alg = "blake2s-160",
++              .test = alg_test_hash,
++              .suite = {
++                      .hash = __VECS(blakes2s_160_tv_template)
++              }
++      }, {
++              .alg = "blake2s-224",
++              .test = alg_test_hash,
++              .suite = {
++                      .hash = __VECS(blakes2s_224_tv_template)
++              }
++      }, {
++              .alg = "blake2s-256",
++              .test = alg_test_hash,
++              .suite = {
++                      .hash = __VECS(blakes2s_256_tv_template)
++              }
++      }, {
+               .alg = "cbc(aes)",
+               .test = alg_test_skcipher,
+               .fips_allowed = 1,
+--- a/crypto/testmgr.h
++++ b/crypto/testmgr.h
+@@ -31567,4 +31567,255 @@ static const struct aead_testvec essiv_h
+       },
+ };
+ 
++static const char blake2_ordered_sequence[] =
++      "\x00\x01\x02\x03\x04\x05\x06\x07"
++      "\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f"
++      "\x10\x11\x12\x13\x14\x15\x16\x17"
++      "\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"
++      "\x20\x21\x22\x23\x24\x25\x26\x27"
++      "\x28\x29\x2a\x2b\x2c\x2d\x2e\x2f"
++      "\x30\x31\x32\x33\x34\x35\x36\x37"
++      "\x38\x39\x3a\x3b\x3c\x3d\x3e\x3f"
++      "\x40\x41\x42\x43\x44\x45\x46\x47"
++      "\x48\x49\x4a\x4b\x4c\x4d\x4e\x4f"
++      "\x50\x51\x52\x53\x54\x55\x56\x57"
++      "\x58\x59\x5a\x5b\x5c\x5d\x5e\x5f"
++      "\x60\x61\x62\x63\x64\x65\x66\x67"
++      "\x68\x69\x6a\x6b\x6c\x6d\x6e\x6f"
++      "\x70\x71\x72\x73\x74\x75\x76\x77"
++      "\x78\x79\x7a\x7b\x7c\x7d\x7e\x7f"
++      "\x80\x81\x82\x83\x84\x85\x86\x87"
++      "\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f"
++      "\x90\x91\x92\x93\x94\x95\x96\x97"
++      "\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f"
++      "\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7"
++      "\xa8\xa9\xaa\xab\xac\xad\xae\xaf"
++      "\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7"
++      "\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf"
++      "\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7"
++      "\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf"
++      "\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7"
++      "\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf"
++      "\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7"
++      "\xe8\xe9\xea\xeb\xec\xed\xee\xef"
++      "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7"
++      "\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff";
++
++static const struct hash_testvec blakes2s_128_tv_template[] = {{
++      .digest = (u8[]){ 0x64, 0x55, 0x0d, 0x6f, 0xfe, 0x2c, 0x0a, 0x01,
++                        0xa1, 0x4a, 0xba, 0x1e, 0xad, 0xe0, 0x20, 0x0c, },
++}, {
++      .plaintext = blake2_ordered_sequence,
++      .psize = 64,
++      .digest = (u8[]){ 0xdc, 0x66, 0xca, 0x8f, 0x03, 0x86, 0x58, 0x01,
++                        0xb0, 0xff, 0xe0, 0x6e, 0xd8, 0xa1, 0xa9, 0x0e, },
++}, {
++      .ksize = 16,
++      .key = blake2_ordered_sequence,
++      .plaintext = blake2_ordered_sequence,
++      .psize = 1,
++      .digest = (u8[]){ 0x88, 0x1e, 0x42, 0xe7, 0xbb, 0x35, 0x80, 0x82,
++                        0x63, 0x7c, 0x0a, 0x0f, 0xd7, 0xec, 0x6c, 0x2f, },
++}, {
++      .ksize = 32,
++      .key = blake2_ordered_sequence,
++      .plaintext = blake2_ordered_sequence,
++      .psize = 7,
++      .digest = (u8[]){ 0xcf, 0x9e, 0x07, 0x2a, 0xd5, 0x22, 0xf2, 0xcd,
++                        0xa2, 0xd8, 0x25, 0x21, 0x80, 0x86, 0x73, 0x1c, },
++}, {
++      .ksize = 1,
++      .key = "B",
++      .plaintext = blake2_ordered_sequence,
++      .psize = 15,
++      .digest = (u8[]){ 0xf6, 0x33, 0x5a, 0x2c, 0x22, 0xa0, 0x64, 0xb2,
++                        0xb6, 0x3f, 0xeb, 0xbc, 0xd1, 0xc3, 0xe5, 0xb2, },
++}, {
++      .ksize = 16,
++      .key = blake2_ordered_sequence,
++      .plaintext = blake2_ordered_sequence,
++      .psize = 247,
++      .digest = (u8[]){ 0x72, 0x66, 0x49, 0x60, 0xf9, 0x4a, 0xea, 0xbe,
++                        0x1f, 0xf4, 0x60, 0xce, 0xb7, 0x81, 0xcb, 0x09, },
++}, {
++      .ksize = 32,
++      .key = blake2_ordered_sequence,
++      .plaintext = blake2_ordered_sequence,
++      .psize = 256,
++      .digest = (u8[]){ 0xd5, 0xa4, 0x0e, 0xc3, 0x16, 0xc7, 0x51, 0xa6,
++                        0x3c, 0xd0, 0xd9, 0x11, 0x57, 0xfa, 0x1e, 0xbb, },
++}};
++
++static const struct hash_testvec blakes2s_160_tv_template[] = {{
++      .plaintext = blake2_ordered_sequence,
++      .psize = 7,
++      .digest = (u8[]){ 0xb4, 0xf2, 0x03, 0x49, 0x37, 0xed, 0xb1, 0x3e,
++                        0x5b, 0x2a, 0xca, 0x64, 0x82, 0x74, 0xf6, 0x62,
++                        0xe3, 0xf2, 0x84, 0xff, },
++}, {
++      .plaintext = blake2_ordered_sequence,
++      .psize = 256,
++      .digest = (u8[]){ 0xaa, 0x56, 0x9b, 0xdc, 0x98, 0x17, 0x75, 0xf2,
++                        0xb3, 0x68, 0x83, 0xb7, 0x9b, 0x8d, 0x48, 0xb1,
++                        0x9b, 0x2d, 0x35, 0x05, },
++}, {
++      .ksize = 1,
++      .key = "B",
++      .digest = (u8[]){ 0x50, 0x16, 0xe7, 0x0c, 0x01, 0xd0, 0xd3, 0xc3,
++                        0xf4, 0x3e, 0xb1, 0x6e, 0x97, 0xa9, 0x4e, 0xd1,
++                        0x79, 0x65, 0x32, 0x93, },
++}, {
++      .ksize = 32,
++      .key = blake2_ordered_sequence,
++      .plaintext = blake2_ordered_sequence,
++      .psize = 1,
++      .digest = (u8[]){ 0x1c, 0x2b, 0xcd, 0x9a, 0x68, 0xca, 0x8c, 0x71,
++                        0x90, 0x29, 0x6c, 0x54, 0xfa, 0x56, 0x4a, 0xef,
++                        0xa2, 0x3a, 0x56, 0x9c, },
++}, {
++      .ksize = 16,
++      .key = blake2_ordered_sequence,
++      .plaintext = blake2_ordered_sequence,
++      .psize = 15,
++      .digest = (u8[]){ 0x36, 0xc3, 0x5f, 0x9a, 0xdc, 0x7e, 0xbf, 0x19,
++                        0x68, 0xaa, 0xca, 0xd8, 0x81, 0xbf, 0x09, 0x34,
++                        0x83, 0x39, 0x0f, 0x30, },
++}, {
++      .ksize = 1,
++      .key = "B",
++      .plaintext = blake2_ordered_sequence,
++      .psize = 64,
++      .digest = (u8[]){ 0x86, 0x80, 0x78, 0xa4, 0x14, 0xec, 0x03, 0xe5,
++                        0xb6, 0x9a, 0x52, 0x0e, 0x42, 0xee, 0x39, 0x9d,
++                        0xac, 0xa6, 0x81, 0x63, },
++}, {
++      .ksize = 32,
++      .key = blake2_ordered_sequence,
++      .plaintext = blake2_ordered_sequence,
++      .psize = 247,
++      .digest = (u8[]){ 0x2d, 0xd8, 0xd2, 0x53, 0x66, 0xfa, 0xa9, 0x01,
++                        0x1c, 0x9c, 0xaf, 0xa3, 0xe2, 0x9d, 0x9b, 0x10,
++                        0x0a, 0xf6, 0x73, 0xe8, },
++}};
++
++static const struct hash_testvec blakes2s_224_tv_template[] = {{
++      .plaintext = blake2_ordered_sequence,
++      .psize = 1,
++      .digest = (u8[]){ 0x61, 0xb9, 0x4e, 0xc9, 0x46, 0x22, 0xa3, 0x91,
++                        0xd2, 0xae, 0x42, 0xe6, 0x45, 0x6c, 0x90, 0x12,
++                        0xd5, 0x80, 0x07, 0x97, 0xb8, 0x86, 0x5a, 0xfc,
++                        0x48, 0x21, 0x97, 0xbb, },
++}, {
++      .plaintext = blake2_ordered_sequence,
++      .psize = 247,
++      .digest = (u8[]){ 0x9e, 0xda, 0xc7, 0x20, 0x2c, 0xd8, 0x48, 0x2e,
++                        0x31, 0x94, 0xab, 0x46, 0x6d, 0x94, 0xd8, 0xb4,
++                        0x69, 0xcd, 0xae, 0x19, 0x6d, 0x9e, 0x41, 0xcc,
++                        0x2b, 0xa4, 0xd5, 0xf6, },
++}, {
++      .ksize = 16,
++      .key = blake2_ordered_sequence,
++      .digest = (u8[]){ 0x32, 0xc0, 0xac, 0xf4, 0x3b, 0xd3, 0x07, 0x9f,
++                        0xbe, 0xfb, 0xfa, 0x4d, 0x6b, 0x4e, 0x56, 0xb3,
++                        0xaa, 0xd3, 0x27, 0xf6, 0x14, 0xbf, 0xb9, 0x32,
++                        0xa7, 0x19, 0xfc, 0xb8, },
++}, {
++      .ksize = 1,
++      .key = "B",
++      .plaintext = blake2_ordered_sequence,
++      .psize = 7,
++      .digest = (u8[]){ 0x73, 0xad, 0x5e, 0x6d, 0xb9, 0x02, 0x8e, 0x76,
++                        0xf2, 0x66, 0x42, 0x4b, 0x4c, 0xfa, 0x1f, 0xe6,
++                        0x2e, 0x56, 0x40, 0xe5, 0xa2, 0xb0, 0x3c, 0xe8,
++                        0x7b, 0x45, 0xfe, 0x05, },
++}, {
++      .ksize = 32,
++      .key = blake2_ordered_sequence,
++      .plaintext = blake2_ordered_sequence,
++      .psize = 15,
++      .digest = (u8[]){ 0x16, 0x60, 0xfb, 0x92, 0x54, 0xb3, 0x6e, 0x36,
++                        0x81, 0xf4, 0x16, 0x41, 0xc3, 0x3d, 0xd3, 0x43,
++                        0x84, 0xed, 0x10, 0x6f, 0x65, 0x80, 0x7a, 0x3e,
++                        0x25, 0xab, 0xc5, 0x02, },
++}, {
++      .ksize = 16,
++      .key = blake2_ordered_sequence,
++      .plaintext = blake2_ordered_sequence,
++      .psize = 64,
++      .digest = (u8[]){ 0xca, 0xaa, 0x39, 0x67, 0x9c, 0xf7, 0x6b, 0xc7,
++                        0xb6, 0x82, 0xca, 0x0e, 0x65, 0x36, 0x5b, 0x7c,
++                        0x24, 0x00, 0xfa, 0x5f, 0xda, 0x06, 0x91, 0x93,
++                        0x6a, 0x31, 0x83, 0xb5, },
++}, {
++      .ksize = 1,
++      .key = "B",
++      .plaintext = blake2_ordered_sequence,
++      .psize = 256,
++      .digest = (u8[]){ 0x90, 0x02, 0x26, 0xb5, 0x06, 0x9c, 0x36, 0x86,
++                        0x94, 0x91, 0x90, 0x1e, 0x7d, 0x2a, 0x71, 0xb2,
++                        0x48, 0xb5, 0xe8, 0x16, 0xfd, 0x64, 0x33, 0x45,
++                        0xb3, 0xd7, 0xec, 0xcc, },
++}};
++
++static const struct hash_testvec blakes2s_256_tv_template[] = {{
++      .plaintext = blake2_ordered_sequence,
++      .psize = 15,
++      .digest = (u8[]){ 0xd9, 0x7c, 0x82, 0x8d, 0x81, 0x82, 0xa7, 0x21,
++                        0x80, 0xa0, 0x6a, 0x78, 0x26, 0x83, 0x30, 0x67,
++                        0x3f, 0x7c, 0x4e, 0x06, 0x35, 0x94, 0x7c, 0x04,
++                        0xc0, 0x23, 0x23, 0xfd, 0x45, 0xc0, 0xa5, 0x2d, },
++}, {
++      .ksize = 32,
++      .key = blake2_ordered_sequence,
++      .digest = (u8[]){ 0x48, 0xa8, 0x99, 0x7d, 0xa4, 0x07, 0x87, 0x6b,
++                        0x3d, 0x79, 0xc0, 0xd9, 0x23, 0x25, 0xad, 0x3b,
++                        0x89, 0xcb, 0xb7, 0x54, 0xd8, 0x6a, 0xb7, 0x1a,
++                        0xee, 0x04, 0x7a, 0xd3, 0x45, 0xfd, 0x2c, 0x49, },
++}, {
++      .ksize = 1,
++      .key = "B",
++      .plaintext = blake2_ordered_sequence,
++      .psize = 1,
++      .digest = (u8[]){ 0x22, 0x27, 0xae, 0xaa, 0x6e, 0x81, 0x56, 0x03,
++                        0xa7, 0xe3, 0xa1, 0x18, 0xa5, 0x9a, 0x2c, 0x18,
++                        0xf4, 0x63, 0xbc, 0x16, 0x70, 0xf1, 0xe7, 0x4b,
++                        0x00, 0x6d, 0x66, 0x16, 0xae, 0x9e, 0x74, 0x4e, },
++}, {
++      .ksize = 16,
++      .key = blake2_ordered_sequence,
++      .plaintext = blake2_ordered_sequence,
++      .psize = 7,
++      .digest = (u8[]){ 0x58, 0x5d, 0xa8, 0x60, 0x1c, 0xa4, 0xd8, 0x03,
++                        0x86, 0x86, 0x84, 0x64, 0xd7, 0xa0, 0x8e, 0x15,
++                        0x2f, 0x05, 0xa2, 0x1b, 0xbc, 0xef, 0x7a, 0x34,
++                        0xb3, 0xc5, 0xbc, 0x4b, 0xf0, 0x32, 0xeb, 0x12, },
++}, {
++      .ksize = 32,
++      .key = blake2_ordered_sequence,
++      .plaintext = blake2_ordered_sequence,
++      .psize = 64,
++      .digest = (u8[]){ 0x89, 0x75, 0xb0, 0x57, 0x7f, 0xd3, 0x55, 0x66,
++                        0xd7, 0x50, 0xb3, 0x62, 0xb0, 0x89, 0x7a, 0x26,
++                        0xc3, 0x99, 0x13, 0x6d, 0xf0, 0x7b, 0xab, 0xab,
++                        0xbd, 0xe6, 0x20, 0x3f, 0xf2, 0x95, 0x4e, 0xd4, },
++}, {
++      .ksize = 1,
++      .key = "B",
++      .plaintext = blake2_ordered_sequence,
++      .psize = 247,
++      .digest = (u8[]){ 0x2e, 0x74, 0x1c, 0x1d, 0x03, 0xf4, 0x9d, 0x84,
++                        0x6f, 0xfc, 0x86, 0x32, 0x92, 0x49, 0x7e, 0x66,
++                        0xd7, 0xc3, 0x10, 0x88, 0xfe, 0x28, 0xb3, 0xe0,
++                        0xbf, 0x50, 0x75, 0xad, 0x8e, 0xa4, 0xe6, 0xb2, },
++}, {
++      .ksize = 16,
++      .key = blake2_ordered_sequence,
++      .plaintext = blake2_ordered_sequence,
++      .psize = 256,
++      .digest = (u8[]){ 0xb9, 0xd2, 0x81, 0x0e, 0x3a, 0xb1, 0x62, 0x9b,
++                        0xad, 0x44, 0x05, 0xf4, 0x92, 0x2e, 0x99, 0xc1,
++                        0x4a, 0x47, 0xbb, 0x5b, 0x6f, 0xb2, 0x96, 0xed,
++                        0xd5, 0x06, 0xb5, 0x3a, 0x7c, 0x7a, 0x65, 0x1d, },
++}};
++
+ #endif        /* _CRYPTO_TESTMGR_H */
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0023-crypto-blake2s-implement-generic-shash-driver.patch b/target/linux/generic/backport-5.4/080-wireguard-0023-crypto-blake2s-implement-generic-shash-driver.patch

new file mode 100644 (file)

index 0000000..4116973
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0023-crypto-blake2s-implement-generic-shash-driver.patch
@@ -0,0 +1,245 @@
+From af5b936f5e17306da571f703bdef1f011a602b57 Mon Sep 17 00:00:00 2001
+From: Ard Biesheuvel <ardb@kernel.org>
+Date: Fri, 8 Nov 2019 13:22:30 +0100
+Subject: [PATCH 023/124] crypto: blake2s - implement generic shash driver
+
+commit 7f9b0880925f1f9d7d59504ea0892d2ae9cfc233 upstream.
+
+Wire up our newly added Blake2s implementation via the shash API.
+
+Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
+Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ crypto/Kconfig                    |  18 ++++
+ crypto/Makefile                   |   1 +
+ crypto/blake2s_generic.c          | 171 ++++++++++++++++++++++++++++++
+ include/crypto/internal/blake2s.h |   5 +
+ 4 files changed, 195 insertions(+)
+ create mode 100644 crypto/blake2s_generic.c
+
+--- a/crypto/Kconfig
++++ b/crypto/Kconfig
+@@ -639,6 +639,24 @@ config CRYPTO_XXHASH
+         xxHash non-cryptographic hash algorithm. Extremely fast, working at
+         speeds close to RAM limits.
+ 
++config CRYPTO_BLAKE2S
++      tristate "BLAKE2s digest algorithm"
++      select CRYPTO_LIB_BLAKE2S_GENERIC
++      select CRYPTO_HASH
++      help
++        Implementation of cryptographic hash function BLAKE2s
++        optimized for 8-32bit platforms and can produce digests of any size
++        between 1 to 32.  The keyed hash is also implemented.
++
++        This module provides the following algorithms:
++
++        - blake2s-128
++        - blake2s-160
++        - blake2s-224
++        - blake2s-256
++
++        See https://blake2.net for further information.
++
+ config CRYPTO_CRCT10DIF
+       tristate "CRCT10DIF algorithm"
+       select CRYPTO_HASH
+--- a/crypto/Makefile
++++ b/crypto/Makefile
+@@ -74,6 +74,7 @@ obj-$(CONFIG_CRYPTO_STREEBOG) += streebo
+ obj-$(CONFIG_CRYPTO_WP512) += wp512.o
+ CFLAGS_wp512.o := $(call cc-option,-fno-schedule-insns)  # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=79149
+ obj-$(CONFIG_CRYPTO_TGR192) += tgr192.o
++obj-$(CONFIG_CRYPTO_BLAKE2S) += blake2s_generic.o
+ obj-$(CONFIG_CRYPTO_GF128MUL) += gf128mul.o
+ obj-$(CONFIG_CRYPTO_ECB) += ecb.o
+ obj-$(CONFIG_CRYPTO_CBC) += cbc.o
+--- /dev/null
++++ b/crypto/blake2s_generic.c
+@@ -0,0 +1,171 @@
++// SPDX-License-Identifier: GPL-2.0 OR MIT
++/*
++ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
++ */
++
++#include <crypto/internal/blake2s.h>
++#include <crypto/internal/simd.h>
++#include <crypto/internal/hash.h>
++
++#include <linux/types.h>
++#include <linux/jump_label.h>
++#include <linux/kernel.h>
++#include <linux/module.h>
++
++static int crypto_blake2s_setkey(struct crypto_shash *tfm, const u8 *key,
++                               unsigned int keylen)
++{
++      struct blake2s_tfm_ctx *tctx = crypto_shash_ctx(tfm);
++
++      if (keylen == 0 || keylen > BLAKE2S_KEY_SIZE) { /* only 1..BLAKE2S_KEY_SIZE bytes are accepted */
++              crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
++              return -EINVAL;
++      }
++
++      memcpy(tctx->key, key, keylen); /* kept in the tfm context; read by crypto_blake2s_init() */
++      tctx->keylen = keylen;
++
++      return 0;
++}
++
++static int crypto_blake2s_init(struct shash_desc *desc)
++{
++      struct blake2s_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm);
++      struct blake2s_state *state = shash_desc_ctx(desc);
++      const int outlen = crypto_shash_digestsize(desc->tfm); /* digest size of the selected blake2s-* algorithm */
++
++      if (tctx->keylen) /* a key was stored by crypto_blake2s_setkey() */
++              blake2s_init_key(state, outlen, tctx->key, tctx->keylen);
++      else /* unkeyed hash */
++              blake2s_init(state, outlen);
++
++      return 0;
++}
++
++static int crypto_blake2s_update(struct shash_desc *desc, const u8 *in,
++                               unsigned int inlen)
++{
++      struct blake2s_state *state = shash_desc_ctx(desc);
++      const size_t fill = BLAKE2S_BLOCK_SIZE - state->buflen; /* room left in state->buf */
++
++      if (unlikely(!inlen))
++              return 0;
++      if (inlen > fill) { /* strictly more than fits: top up the buffer and hash it */
++              memcpy(state->buf + state->buflen, in, fill);
++              blake2s_compress_generic(state, state->buf, 1, BLAKE2S_BLOCK_SIZE);
++              state->buflen = 0;
++              in += fill;
++              inlen -= fill;
++      }
++      if (inlen > BLAKE2S_BLOCK_SIZE) {
++              const size_t nblocks = DIV_ROUND_UP(inlen, BLAKE2S_BLOCK_SIZE);
++              /* Hash all but the final (full or partial) block; it is buffered for final() */
++              blake2s_compress_generic(state, in, nblocks - 1, BLAKE2S_BLOCK_SIZE);
++              in += BLAKE2S_BLOCK_SIZE * (nblocks - 1);
++              inlen -= BLAKE2S_BLOCK_SIZE * (nblocks - 1);
++      }
++      memcpy(state->buf + state->buflen, in, inlen); /* remainder is at most BLAKE2S_BLOCK_SIZE - buflen bytes */
++      state->buflen += inlen;
++
++      return 0;
++}
++
++static int crypto_blake2s_final(struct shash_desc *desc, u8 *out)
++{
++      struct blake2s_state *state = shash_desc_ctx(desc);
++
++      blake2s_set_lastblock(state);
++      memset(state->buf + state->buflen, 0,
++             BLAKE2S_BLOCK_SIZE - state->buflen); /* zero-pad the buffered partial block */
++      blake2s_compress_generic(state, state->buf, 1, state->buflen); /* inc = bytes actually buffered */
++      cpu_to_le32_array(state->h, ARRAY_SIZE(state->h)); /* convert h[] to little-endian in place */
++      memcpy(out, state->h, state->outlen);
++      memzero_explicit(state, sizeof(*state)); /* wipe the descriptor state after use */
++
++      return 0;
++}
++
++static struct shash_alg blake2s_algs[] = {{
++      .base.cra_name          = "blake2s-128",
++      .base.cra_driver_name   = "blake2s-128-generic",
++      .base.cra_flags         = CRYPTO_ALG_OPTIONAL_KEY,
++      .base.cra_ctxsize       = sizeof(struct blake2s_tfm_ctx),
++      .base.cra_priority      = 200,
++      .base.cra_blocksize     = BLAKE2S_BLOCK_SIZE,
++      .base.cra_module        = THIS_MODULE,
++
++      .digestsize             = BLAKE2S_128_HASH_SIZE,
++      .setkey                 = crypto_blake2s_setkey,
++      .init                   = crypto_blake2s_init,
++      .update                 = crypto_blake2s_update,
++      .final                  = crypto_blake2s_final,
++      .descsize               = sizeof(struct blake2s_state),
++}, {
++      .base.cra_name          = "blake2s-160",
++      .base.cra_driver_name   = "blake2s-160-generic",
++      .base.cra_flags         = CRYPTO_ALG_OPTIONAL_KEY,
++      .base.cra_ctxsize       = sizeof(struct blake2s_tfm_ctx),
++      .base.cra_priority      = 200,
++      .base.cra_blocksize     = BLAKE2S_BLOCK_SIZE,
++      .base.cra_module        = THIS_MODULE,
++
++      .digestsize             = BLAKE2S_160_HASH_SIZE,
++      .setkey                 = crypto_blake2s_setkey,
++      .init                   = crypto_blake2s_init,
++      .update                 = crypto_blake2s_update,
++      .final                  = crypto_blake2s_final,
++      .descsize               = sizeof(struct blake2s_state),
++}, {
++      .base.cra_name          = "blake2s-224",
++      .base.cra_driver_name   = "blake2s-224-generic",
++      .base.cra_flags         = CRYPTO_ALG_OPTIONAL_KEY,
++      .base.cra_ctxsize       = sizeof(struct blake2s_tfm_ctx),
++      .base.cra_priority      = 200,
++      .base.cra_blocksize     = BLAKE2S_BLOCK_SIZE,
++      .base.cra_module        = THIS_MODULE,
++
++      .digestsize             = BLAKE2S_224_HASH_SIZE,
++      .setkey                 = crypto_blake2s_setkey,
++      .init                   = crypto_blake2s_init,
++      .update                 = crypto_blake2s_update,
++      .final                  = crypto_blake2s_final,
++      .descsize               = sizeof(struct blake2s_state),
++}, {
++      .base.cra_name          = "blake2s-256",
++      .base.cra_driver_name   = "blake2s-256-generic",
++      .base.cra_flags         = CRYPTO_ALG_OPTIONAL_KEY,
++      .base.cra_ctxsize       = sizeof(struct blake2s_tfm_ctx),
++      .base.cra_priority      = 200,
++      .base.cra_blocksize     = BLAKE2S_BLOCK_SIZE,
++      .base.cra_module        = THIS_MODULE,
++
++      .digestsize             = BLAKE2S_256_HASH_SIZE,
++      .setkey                 = crypto_blake2s_setkey,
++      .init                   = crypto_blake2s_init,
++      .update                 = crypto_blake2s_update,
++      .final                  = crypto_blake2s_final,
++      .descsize               = sizeof(struct blake2s_state),
++}};
++
++static int __init blake2s_mod_init(void) /* registers every entry of blake2s_algs[] */
++{
++      return crypto_register_shashes(blake2s_algs, ARRAY_SIZE(blake2s_algs));
++}
++
++static void __exit blake2s_mod_exit(void) /* undoes blake2s_mod_init() */
++{
++      crypto_unregister_shashes(blake2s_algs, ARRAY_SIZE(blake2s_algs));
++}
++
++subsys_initcall(blake2s_mod_init);
++module_exit(blake2s_mod_exit);
++
++MODULE_ALIAS_CRYPTO("blake2s-128");
++MODULE_ALIAS_CRYPTO("blake2s-128-generic");
++MODULE_ALIAS_CRYPTO("blake2s-160");
++MODULE_ALIAS_CRYPTO("blake2s-160-generic");
++MODULE_ALIAS_CRYPTO("blake2s-224");
++MODULE_ALIAS_CRYPTO("blake2s-224-generic");
++MODULE_ALIAS_CRYPTO("blake2s-256");
++MODULE_ALIAS_CRYPTO("blake2s-256-generic");
++MODULE_LICENSE("GPL v2");
+--- a/include/crypto/internal/blake2s.h
++++ b/include/crypto/internal/blake2s.h
+@@ -5,6 +5,11 @@
+ 
+ #include <crypto/blake2s.h>
+ 
++struct blake2s_tfm_ctx { /* per-transform key storage for the blake2s shash drivers */
++      u8 key[BLAKE2S_KEY_SIZE]; /* only the first keylen bytes are written by setkey */
++      unsigned int keylen; /* 0 selects the unkeyed init path */
++};
++
+ void blake2s_compress_generic(struct blake2s_state *state,const u8 *block,
+                             size_t nblocks, const u32 inc);
+ 
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0024-crypto-blake2s-x86_64-SIMD-implementation.patch b/target/linux/generic/backport-5.4/080-wireguard-0024-crypto-blake2s-x86_64-SIMD-implementation.patch

new file mode 100644 (file)

index 0000000..80bf831
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0024-crypto-blake2s-x86_64-SIMD-implementation.patch
@@ -0,0 +1,557 @@
+From 7960239adcaf7b56b081426ea3aa0ebf17398375 Mon Sep 17 00:00:00 2001
+From: "Jason A. Donenfeld" <Jason@zx2c4.com>
+Date: Fri, 8 Nov 2019 13:22:31 +0100
+Subject: [PATCH 024/124] crypto: blake2s - x86_64 SIMD implementation
+
+commit ed0356eda153f6a95649e11feb7b07083caf9e20 upstream.
+
+These implementations from Samuel Neves support AVX and AVX-512VL.
+Originally this used AVX-512F, but Skylake thermal throttling made
+AVX-512VL more attractive and possible to do with negligible difference.
+
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Signed-off-by: Samuel Neves <sneves@dei.uc.pt>
+Co-developed-by: Samuel Neves <sneves@dei.uc.pt>
+[ardb: move to arch/x86/crypto, wire into lib/crypto framework]
+Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
+Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ arch/x86/crypto/Makefile       |   2 +
+ arch/x86/crypto/blake2s-core.S | 258 +++++++++++++++++++++++++++++++++
+ arch/x86/crypto/blake2s-glue.c | 233 +++++++++++++++++++++++++++++
+ crypto/Kconfig                 |   6 +
+ 4 files changed, 499 insertions(+)
+ create mode 100644 arch/x86/crypto/blake2s-core.S
+ create mode 100644 arch/x86/crypto/blake2s-glue.c
+
+--- a/arch/x86/crypto/Makefile
++++ b/arch/x86/crypto/Makefile
+@@ -48,6 +48,7 @@ ifeq ($(avx_supported),yes)
+       obj-$(CONFIG_CRYPTO_CAST6_AVX_X86_64) += cast6-avx-x86_64.o
+       obj-$(CONFIG_CRYPTO_TWOFISH_AVX_X86_64) += twofish-avx-x86_64.o
+       obj-$(CONFIG_CRYPTO_SERPENT_AVX_X86_64) += serpent-avx-x86_64.o
++      obj-$(CONFIG_CRYPTO_BLAKE2S_X86) += blake2s-x86_64.o
+ endif
+ 
+ # These modules require assembler to support AVX2.
+@@ -70,6 +71,7 @@ serpent-sse2-x86_64-y := serpent-sse2-x8
+ aegis128-aesni-y := aegis128-aesni-asm.o aegis128-aesni-glue.o
+ 
+ nhpoly1305-sse2-y := nh-sse2-x86_64.o nhpoly1305-sse2-glue.o
++blake2s-x86_64-y := blake2s-core.o blake2s-glue.o
+ 
+ ifeq ($(avx_supported),yes)
+       camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o \
+--- /dev/null
++++ b/arch/x86/crypto/blake2s-core.S
+@@ -0,0 +1,258 @@
++/* SPDX-License-Identifier: GPL-2.0 OR MIT */
++/*
++ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
++ * Copyright (C) 2017-2019 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
++ */
++
++#include <linux/linkage.h>
++
++.section .rodata.cst32.BLAKE2S_IV, "aM", @progbits, 32
++.align 32
++IV:   .octa 0xA54FF53A3C6EF372BB67AE856A09E667 /* 32-bit IV words 0-3, lowest word first */
++      .octa 0x5BE0CD191F83D9AB9B05688C510E527F /* 32-bit IV words 4-7 */
++.section .rodata.cst16.ROT16, "aM", @progbits, 16
++.align 16
++ROT16:        .octa 0x0D0C0F0E09080B0A0504070601000302 /* pshufb mask: rotate every 32-bit lane by 16 bits */
++.section .rodata.cst16.ROR328, "aM", @progbits, 16
++.align 16
++ROR328:       .octa 0x0C0F0E0D080B0A090407060500030201 /* pshufb mask: rotate every 32-bit lane right by 8 bits */
++.section .rodata.cst64.BLAKE2S_SIGMA, "aM", @progbits, 160
++.align 64
++SIGMA: /* 10 rows of 16 byte-sized message-word indices; the SSSE3 round loop consumes one row per round */
++.byte  0,  2,  4,  6,  1,  3,  5,  7, 14,  8, 10, 12, 15,  9, 11, 13
++.byte 14,  4,  9, 13, 10,  8, 15,  6,  5,  1,  0, 11,  3, 12,  2,  7
++.byte 11, 12,  5, 15,  8,  0,  2, 13,  9, 10,  3,  7,  4, 14,  6,  1
++.byte  7,  3, 13, 11,  9,  1, 12, 14, 15,  2,  5,  4,  8,  6, 10,  0
++.byte  9,  5,  2, 10,  0,  7,  4, 15,  3, 14, 11,  6, 13,  1, 12,  8
++.byte  2,  6,  0,  8, 12, 10, 11,  3,  1,  4,  7, 15,  9, 13,  5, 14
++.byte 12,  1, 14,  4,  5, 15, 13, 10,  8,  0,  6,  9, 11,  7,  3,  2
++.byte 13,  7, 12,  3, 11, 14,  1,  9,  2,  5, 15,  8, 10,  0,  4,  6
++.byte  6, 14, 11,  0, 15,  9,  3,  8, 10, 12, 13,  1,  5,  2,  7,  4
++.byte 10,  8,  7,  1,  2,  4,  6,  5, 13, 15,  9,  3,  0, 11, 14, 12
++#ifdef CONFIG_AS_AVX512
++.section .rodata.cst64.BLAKE2S_SIGMA2, "aM", @progbits, 640
++.align 64
++SIGMA2: /* 10 rows of 16 dword vpermi2d indices; each row is applied to the previous round's already-permuted words */
++.long  0,  2,  4,  6,  1,  3,  5,  7, 14,  8, 10, 12, 15,  9, 11, 13
++.long  8,  2, 13, 15, 10,  9, 12,  3,  6,  4,  0, 14,  5, 11,  1,  7
++.long 11, 13,  8,  6,  5, 10, 14,  3,  2,  4, 12, 15,  1,  0,  7,  9
++.long 11, 10,  7,  0,  8, 15,  1, 13,  3,  6,  2, 12,  4, 14,  9,  5
++.long  4, 10,  9, 14, 15,  0, 11,  8,  1,  7,  3, 13,  2,  5,  6, 12
++.long  2, 11,  4, 15, 14,  3, 10,  8, 13,  6,  5,  7,  0, 12,  1,  9
++.long  4,  8, 15,  9, 14, 11, 13,  5,  3,  2,  1, 12,  6, 10,  7,  0
++.long  6, 13,  0, 14, 12,  2,  1, 11, 15,  4,  5,  8,  7,  9,  3, 10
++.long 15,  5,  4, 13, 10,  7,  3, 11, 12,  2,  0,  6,  9,  8,  1, 14
++.long  8,  7, 14, 11, 13, 15,  0, 12, 10,  4,  5,  6,  3,  2,  1,  9
++#endif /* CONFIG_AS_AVX512 */
++
++.text
++#ifdef CONFIG_AS_SSSE3
++ENTRY(blake2s_compress_ssse3) /* per the C prototype in blake2s-glue.c: rdi = state, rsi = block, rdx = nblocks, rcx = inc */
++      testq           %rdx,%rdx
++      je              .Lendofloop /* nblocks == 0: return without touching *state */
++      movdqu          (%rdi),%xmm0 /* xmm0 = state bytes 0x00-0x0f */
++      movdqu          0x10(%rdi),%xmm1 /* xmm1 = state bytes 0x10-0x1f */
++      movdqa          ROT16(%rip),%xmm12 /* pshufb mask: rotate 32-bit lanes by 16 */
++      movdqa          ROR328(%rip),%xmm13 /* pshufb mask: rotate 32-bit lanes right by 8 */
++      movdqu          0x20(%rdi),%xmm14 /* xmm14 = state bytes 0x20-0x2f (presumably counter/flag words - confirm against struct blake2s_state) */
++      movq            %rcx,%xmm15 /* xmm15 = inc in the low quadword, upper quadword zero */
++      leaq            SIGMA+0xa0(%rip),%r8 /* r8 = end of the 10 x 16-byte SIGMA rows */
++      jmp             .Lbeginofloop
++      .align          32
++.Lbeginofloop:
++      movdqa          %xmm0,%xmm10 /* keep incoming xmm0/xmm1 for the xor after the rounds */
++      movdqa          %xmm1,%xmm11
++      paddq           %xmm15,%xmm14 /* add inc to the low 64 bits of xmm14 */
++      movdqa          IV(%rip),%xmm2 /* xmm2 = IV words 0-3 */
++      movdqa          %xmm14,%xmm3
++      pxor            IV+0x10(%rip),%xmm3 /* xmm3 = xmm14 ^ IV words 4-7 */
++      leaq            SIGMA(%rip),%rcx /* rcx = current SIGMA row; inc already lives in xmm15 */
++.Lroundloop:
++      movzbl          (%rcx),%eax /* eax = message-word index taken from SIGMA */
++      movd            (%rsi,%rax,4),%xmm4 /* load that 32-bit word of the block */
++      movzbl          0x1(%rcx),%eax
++      movd            (%rsi,%rax,4),%xmm5
++      movzbl          0x2(%rcx),%eax
++      movd            (%rsi,%rax,4),%xmm6
++      movzbl          0x3(%rcx),%eax
++      movd            (%rsi,%rax,4),%xmm7
++      punpckldq       %xmm5,%xmm4
++      punpckldq       %xmm7,%xmm6
++      punpcklqdq      %xmm6,%xmm4 /* xmm4 = the four gathered words, first index in lane 0 */
++      paddd           %xmm4,%xmm0
++      paddd           %xmm1,%xmm0
++      pxor            %xmm0,%xmm3
++      pshufb          %xmm12,%xmm3 /* rotate xmm3 lanes by 16 */
++      paddd           %xmm3,%xmm2
++      pxor            %xmm2,%xmm1
++      movdqa          %xmm1,%xmm8
++      psrld           $0xc,%xmm1
++      pslld           $0x14,%xmm8
++      por             %xmm8,%xmm1 /* xmm1 lanes rotated right by 12 (shift pair + or) */
++      movzbl          0x4(%rcx),%eax
++      movd            (%rsi,%rax,4),%xmm5
++      movzbl          0x5(%rcx),%eax
++      movd            (%rsi,%rax,4),%xmm6
++      movzbl          0x6(%rcx),%eax
++      movd            (%rsi,%rax,4),%xmm7
++      movzbl          0x7(%rcx),%eax
++      movd            (%rsi,%rax,4),%xmm4
++      punpckldq       %xmm6,%xmm5
++      punpckldq       %xmm4,%xmm7
++      punpcklqdq      %xmm7,%xmm5
++      paddd           %xmm5,%xmm0
++      paddd           %xmm1,%xmm0
++      pxor            %xmm0,%xmm3
++      pshufb          %xmm13,%xmm3 /* rotate xmm3 lanes right by 8 */
++      paddd           %xmm3,%xmm2
++      pxor            %xmm2,%xmm1
++      movdqa          %xmm1,%xmm8
++      psrld           $0x7,%xmm1
++      pslld           $0x19,%xmm8
++      por             %xmm8,%xmm1 /* xmm1 lanes rotated right by 7 */
++      pshufd          $0x93,%xmm0,%xmm0 /* rotate the 32-bit lanes of xmm0, xmm3 and xmm2 by different amounts */
++      pshufd          $0x4e,%xmm3,%xmm3
++      pshufd          $0x39,%xmm2,%xmm2
++      movzbl          0x8(%rcx),%eax
++      movd            (%rsi,%rax,4),%xmm6
++      movzbl          0x9(%rcx),%eax
++      movd            (%rsi,%rax,4),%xmm7
++      movzbl          0xa(%rcx),%eax
++      movd            (%rsi,%rax,4),%xmm4
++      movzbl          0xb(%rcx),%eax
++      movd            (%rsi,%rax,4),%xmm5
++      punpckldq       %xmm7,%xmm6
++      punpckldq       %xmm5,%xmm4
++      punpcklqdq      %xmm4,%xmm6
++      paddd           %xmm6,%xmm0
++      paddd           %xmm1,%xmm0
++      pxor            %xmm0,%xmm3
++      pshufb          %xmm12,%xmm3
++      paddd           %xmm3,%xmm2
++      pxor            %xmm2,%xmm1
++      movdqa          %xmm1,%xmm8
++      psrld           $0xc,%xmm1
++      pslld           $0x14,%xmm8
++      por             %xmm8,%xmm1
++      movzbl          0xc(%rcx),%eax
++      movd            (%rsi,%rax,4),%xmm7
++      movzbl          0xd(%rcx),%eax
++      movd            (%rsi,%rax,4),%xmm4
++      movzbl          0xe(%rcx),%eax
++      movd            (%rsi,%rax,4),%xmm5
++      movzbl          0xf(%rcx),%eax
++      movd            (%rsi,%rax,4),%xmm6
++      punpckldq       %xmm4,%xmm7
++      punpckldq       %xmm6,%xmm5
++      punpcklqdq      %xmm5,%xmm7
++      paddd           %xmm7,%xmm0
++      paddd           %xmm1,%xmm0
++      pxor            %xmm0,%xmm3
++      pshufb          %xmm13,%xmm3
++      paddd           %xmm3,%xmm2
++      pxor            %xmm2,%xmm1
++      movdqa          %xmm1,%xmm8
++      psrld           $0x7,%xmm1
++      pslld           $0x19,%xmm8
++      por             %xmm8,%xmm1
++      pshufd          $0x39,%xmm0,%xmm0 /* inverse shuffles: put the lanes back in their original positions */
++      pshufd          $0x4e,%xmm3,%xmm3
++      pshufd          $0x93,%xmm2,%xmm2
++      addq            $0x10,%rcx /* advance to the next 16-byte SIGMA row */
++      cmpq            %r8,%rcx
++      jnz             .Lroundloop /* 10 rounds per block */
++      pxor            %xmm2,%xmm0
++      pxor            %xmm3,%xmm1
++      pxor            %xmm10,%xmm0 /* fold in the values saved at .Lbeginofloop */
++      pxor            %xmm11,%xmm1
++      addq            $0x40,%rsi /* advance to the next 64-byte input block */
++      decq            %rdx
++      jnz             .Lbeginofloop /* loop while blocks remain */
++      movdqu          %xmm0,(%rdi) /* write the updated 48 bytes back to *state */
++      movdqu          %xmm1,0x10(%rdi)
++      movdqu          %xmm14,0x20(%rdi)
++.Lendofloop:
++      ret
++ENDPROC(blake2s_compress_ssse3)
++#endif /* CONFIG_AS_SSSE3 */
++
++#ifdef CONFIG_AS_AVX512
++ENTRY(blake2s_compress_avx512) /* rdi = state, rsi = block, rdx = nblocks, rcx = inc; NOTE(review): no nblocks == 0 guard here, so rdx must be >= 1 */
++      vmovdqu         (%rdi),%xmm0 /* xmm0 = state bytes 0x00-0x0f */
++      vmovdqu         0x10(%rdi),%xmm1 /* xmm1 = state bytes 0x10-0x1f */
++      vmovdqu         0x20(%rdi),%xmm4 /* xmm4 = state bytes 0x20-0x2f */
++      vmovq           %rcx,%xmm5 /* xmm5 = inc */
++      vmovdqa         IV(%rip),%xmm14 /* IV is kept in xmm14/xmm15 across all blocks */
++      vmovdqa         IV+16(%rip),%xmm15
++      jmp             .Lblake2s_compress_avx512_mainloop
++.align 32
++.Lblake2s_compress_avx512_mainloop:
++      vmovdqa         %xmm0,%xmm10 /* keep incoming xmm0/xmm1 for the xor after the rounds */
++      vmovdqa         %xmm1,%xmm11
++      vpaddq          %xmm5,%xmm4,%xmm4 /* add inc to the low 64 bits of xmm4 */
++      vmovdqa         %xmm14,%xmm2
++      vpxor           %xmm15,%xmm4,%xmm3
++      vmovdqu         (%rsi),%ymm6 /* ymm6:ymm7 = the whole 64-byte input block */
++      vmovdqu         0x20(%rsi),%ymm7
++      addq            $0x40,%rsi /* advance to the next block for the following iteration */
++      leaq            SIGMA2(%rip),%rax
++      movb            $0xa,%cl /* cl = round counter (10); inc was already copied to xmm5 */
++.Lblake2s_compress_avx512_roundloop:
++      addq            $0x40,%rax /* step past this round's 64-byte SIGMA2 row; it is read via negative offsets */
++      vmovdqa         -0x40(%rax),%ymm8
++      vmovdqa         -0x20(%rax),%ymm9
++      vpermi2d        %ymm7,%ymm6,%ymm8 /* pick message words from ymm6:ymm7 using the indices in ymm8 */
++      vpermi2d        %ymm7,%ymm6,%ymm9
++      vmovdqa         %ymm8,%ymm6 /* the permuted words become the source for the next round */
++      vmovdqa         %ymm9,%ymm7
++      vpaddd          %xmm8,%xmm0,%xmm0
++      vpaddd          %xmm1,%xmm0,%xmm0
++      vpxor           %xmm0,%xmm3,%xmm3
++      vprord          $0x10,%xmm3,%xmm3 /* rotate right by 16 */
++      vpaddd          %xmm3,%xmm2,%xmm2
++      vpxor           %xmm2,%xmm1,%xmm1
++      vprord          $0xc,%xmm1,%xmm1 /* rotate right by 12 */
++      vextracti128    $0x1,%ymm8,%xmm8 /* switch to the upper four words of ymm8 */
++      vpaddd          %xmm8,%xmm0,%xmm0
++      vpaddd          %xmm1,%xmm0,%xmm0
++      vpxor           %xmm0,%xmm3,%xmm3
++      vprord          $0x8,%xmm3,%xmm3 /* rotate right by 8 */
++      vpaddd          %xmm3,%xmm2,%xmm2
++      vpxor           %xmm2,%xmm1,%xmm1
++      vprord          $0x7,%xmm1,%xmm1 /* rotate right by 7 */
++      vpshufd         $0x93,%xmm0,%xmm0 /* rotate the 32-bit lanes of xmm0, xmm3 and xmm2 by different amounts */
++      vpshufd         $0x4e,%xmm3,%xmm3
++      vpshufd         $0x39,%xmm2,%xmm2
++      vpaddd          %xmm9,%xmm0,%xmm0
++      vpaddd          %xmm1,%xmm0,%xmm0
++      vpxor           %xmm0,%xmm3,%xmm3
++      vprord          $0x10,%xmm3,%xmm3
++      vpaddd          %xmm3,%xmm2,%xmm2
++      vpxor           %xmm2,%xmm1,%xmm1
++      vprord          $0xc,%xmm1,%xmm1
++      vextracti128    $0x1,%ymm9,%xmm9 /* switch to the upper four words of ymm9 */
++      vpaddd          %xmm9,%xmm0,%xmm0
++      vpaddd          %xmm1,%xmm0,%xmm0
++      vpxor           %xmm0,%xmm3,%xmm3
++      vprord          $0x8,%xmm3,%xmm3
++      vpaddd          %xmm3,%xmm2,%xmm2
++      vpxor           %xmm2,%xmm1,%xmm1
++      vprord          $0x7,%xmm1,%xmm1
++      vpshufd         $0x39,%xmm0,%xmm0 /* inverse shuffles: put the lanes back in their original positions */
++      vpshufd         $0x4e,%xmm3,%xmm3
++      vpshufd         $0x93,%xmm2,%xmm2
++      decb            %cl
++      jne             .Lblake2s_compress_avx512_roundloop /* until all 10 rounds are done */
++      vpxor           %xmm10,%xmm0,%xmm0 /* fold in the values saved at the top of the main loop */
++      vpxor           %xmm11,%xmm1,%xmm1
++      vpxor           %xmm2,%xmm0,%xmm0
++      vpxor           %xmm3,%xmm1,%xmm1
++      decq            %rdx /* wraps instead of terminating if entered with nblocks == 0 */
++      jne             .Lblake2s_compress_avx512_mainloop
++      vmovdqu         %xmm0,(%rdi) /* write the updated 48 bytes back to *state */
++      vmovdqu         %xmm1,0x10(%rdi)
++      vmovdqu         %xmm4,0x20(%rdi)
++      vzeroupper /* ymm registers were used above; clear their upper halves before returning */
++      retq
++ENDPROC(blake2s_compress_avx512)
++#endif /* CONFIG_AS_AVX512 */
+--- /dev/null
++++ b/arch/x86/crypto/blake2s-glue.c
+@@ -0,0 +1,233 @@
++// SPDX-License-Identifier: GPL-2.0 OR MIT
++/*
++ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
++ */
++
++#include <crypto/internal/blake2s.h>
++#include <crypto/internal/simd.h>
++#include <crypto/internal/hash.h>
++
++#include <linux/types.h>
++#include <linux/jump_label.h>
++#include <linux/kernel.h>
++#include <linux/module.h>
++
++#include <asm/cpufeature.h>
++#include <asm/fpu/api.h>
++#include <asm/processor.h>
++#include <asm/simd.h>
++
++asmlinkage void blake2s_compress_ssse3(struct blake2s_state *state,
++                                     const u8 *block, const size_t nblocks,
++                                     const u32 inc);
++asmlinkage void blake2s_compress_avx512(struct blake2s_state *state,
++                                      const u8 *block, const size_t nblocks,
++                                      const u32 inc);
++
++static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_ssse3);
++static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_avx512);
++
++void blake2s_compress_arch(struct blake2s_state *state,
++                         const u8 *block, size_t nblocks,
++                         const u32 inc)
++{
++      /* SIMD disables preemption, so each kernel_fpu_begin/end section handles at most one page of input. */
++      BUILD_BUG_ON(PAGE_SIZE / BLAKE2S_BLOCK_SIZE < 8);
++
++      if (!static_branch_likely(&blake2s_use_ssse3) || !crypto_simd_usable()) {
++              blake2s_compress_generic(state, block, nblocks, inc); /* C fallback */
++              return;
++      }
++
++      for (;;) { /* NOTE(review): assumes nblocks >= 1; blake2s_compress_avx512 has no zero-count guard */
++              const size_t blocks = min_t(size_t, nblocks,
++                                          PAGE_SIZE / BLAKE2S_BLOCK_SIZE);
++
++              kernel_fpu_begin();
++              if (IS_ENABLED(CONFIG_AS_AVX512) &&
++                  static_branch_likely(&blake2s_use_avx512))
++                      blake2s_compress_avx512(state, block, blocks, inc);
++              else
++                      blake2s_compress_ssse3(state, block, blocks, inc);
++              kernel_fpu_end();
++
++              nblocks -= blocks;
++              if (!nblocks) /* everything consumed */
++                      break;
++              block += blocks * BLAKE2S_BLOCK_SIZE; /* skip the input just hashed */
++      }
++}
++EXPORT_SYMBOL(blake2s_compress_arch);
++
++static int crypto_blake2s_setkey(struct crypto_shash *tfm, const u8 *key,
++                               unsigned int keylen)
++{
++      struct blake2s_tfm_ctx *tctx = crypto_shash_ctx(tfm);
++
++      if (keylen == 0 || keylen > BLAKE2S_KEY_SIZE) { /* only 1..BLAKE2S_KEY_SIZE bytes are accepted */
++              crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
++              return -EINVAL;
++      }
++
++      memcpy(tctx->key, key, keylen); /* kept in the tfm context; read by crypto_blake2s_init() */
++      tctx->keylen = keylen;
++
++      return 0;
++}
++
++static int crypto_blake2s_init(struct shash_desc *desc)
++{
++      struct blake2s_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm);
++      struct blake2s_state *state = shash_desc_ctx(desc);
++      const int outlen = crypto_shash_digestsize(desc->tfm); /* digest size of the selected blake2s-* algorithm */
++
++      if (tctx->keylen) /* a key was stored by crypto_blake2s_setkey() */
++              blake2s_init_key(state, outlen, tctx->key, tctx->keylen);
++      else /* unkeyed hash */
++              blake2s_init(state, outlen);
++
++      return 0;
++}
++
++static int crypto_blake2s_update(struct shash_desc *desc, const u8 *in,
++                               unsigned int inlen)
++{
++      struct blake2s_state *state = shash_desc_ctx(desc);
++      const size_t fill = BLAKE2S_BLOCK_SIZE - state->buflen; /* room left in state->buf */
++
++      if (unlikely(!inlen))
++              return 0;
++      if (inlen > fill) { /* strictly more than fits: top up the buffer and hash it */
++              memcpy(state->buf + state->buflen, in, fill);
++              blake2s_compress_arch(state, state->buf, 1, BLAKE2S_BLOCK_SIZE);
++              state->buflen = 0;
++              in += fill;
++              inlen -= fill;
++      }
++      if (inlen > BLAKE2S_BLOCK_SIZE) {
++              const size_t nblocks = DIV_ROUND_UP(inlen, BLAKE2S_BLOCK_SIZE);
++              /* Hash all but the final (full or partial) block; it is buffered for final() */
++              blake2s_compress_arch(state, in, nblocks - 1, BLAKE2S_BLOCK_SIZE);
++              in += BLAKE2S_BLOCK_SIZE * (nblocks - 1);
++              inlen -= BLAKE2S_BLOCK_SIZE * (nblocks - 1);
++      }
++      memcpy(state->buf + state->buflen, in, inlen); /* remainder is at most BLAKE2S_BLOCK_SIZE - buflen bytes */
++      state->buflen += inlen;
++
++      return 0;
++}
++
++static int crypto_blake2s_final(struct shash_desc *desc, u8 *out)
++{
++      struct blake2s_state *state = shash_desc_ctx(desc);
++
++      blake2s_set_lastblock(state);
++      memset(state->buf + state->buflen, 0,
++             BLAKE2S_BLOCK_SIZE - state->buflen); /* zero-pad the buffered partial block */
++      blake2s_compress_arch(state, state->buf, 1, state->buflen); /* inc = bytes actually buffered */
++      cpu_to_le32_array(state->h, ARRAY_SIZE(state->h)); /* convert h[] to little-endian in place */
++      memcpy(out, state->h, state->outlen);
++      memzero_explicit(state, sizeof(*state)); /* wipe the descriptor state after use */
++
++      return 0;
++}
++
++static struct shash_alg blake2s_algs[] = {{
++      .base.cra_name          = "blake2s-128",
++      .base.cra_driver_name   = "blake2s-128-x86",
++      .base.cra_flags         = CRYPTO_ALG_OPTIONAL_KEY,
++      .base.cra_ctxsize       = sizeof(struct blake2s_tfm_ctx),
++      .base.cra_priority      = 200,
++      .base.cra_blocksize     = BLAKE2S_BLOCK_SIZE,
++      .base.cra_module        = THIS_MODULE,
++
++      .digestsize             = BLAKE2S_128_HASH_SIZE,
++      .setkey                 = crypto_blake2s_setkey,
++      .init                   = crypto_blake2s_init,
++      .update                 = crypto_blake2s_update,
++      .final                  = crypto_blake2s_final,
++      .descsize               = sizeof(struct blake2s_state),
++}, {
++      .base.cra_name          = "blake2s-160",
++      .base.cra_driver_name   = "blake2s-160-x86",
++      .base.cra_flags         = CRYPTO_ALG_OPTIONAL_KEY,
++      .base.cra_ctxsize       = sizeof(struct blake2s_tfm_ctx),
++      .base.cra_priority      = 200,
++      .base.cra_blocksize     = BLAKE2S_BLOCK_SIZE,
++      .base.cra_module        = THIS_MODULE,
++
++      .digestsize             = BLAKE2S_160_HASH_SIZE,
++      .setkey                 = crypto_blake2s_setkey,
++      .init                   = crypto_blake2s_init,
++      .update                 = crypto_blake2s_update,
++      .final                  = crypto_blake2s_final,
++      .descsize               = sizeof(struct blake2s_state),
++}, {
++      .base.cra_name          = "blake2s-224",
++      .base.cra_driver_name   = "blake2s-224-x86",
++      .base.cra_flags         = CRYPTO_ALG_OPTIONAL_KEY,
++      .base.cra_ctxsize       = sizeof(struct blake2s_tfm_ctx),
++      .base.cra_priority      = 200,
++      .base.cra_blocksize     = BLAKE2S_BLOCK_SIZE,
++      .base.cra_module        = THIS_MODULE,
++
++      .digestsize             = BLAKE2S_224_HASH_SIZE,
++      .setkey                 = crypto_blake2s_setkey,
++      .init                   = crypto_blake2s_init,
++      .update                 = crypto_blake2s_update,
++      .final                  = crypto_blake2s_final,
++      .descsize               = sizeof(struct blake2s_state),
++}, {
++      .base.cra_name          = "blake2s-256",
++      .base.cra_driver_name   = "blake2s-256-x86",
++      .base.cra_flags         = CRYPTO_ALG_OPTIONAL_KEY,
++      .base.cra_ctxsize       = sizeof(struct blake2s_tfm_ctx),
++      .base.cra_priority      = 200,
++      .base.cra_blocksize     = BLAKE2S_BLOCK_SIZE,
++      .base.cra_module        = THIS_MODULE,
++
++      .digestsize             = BLAKE2S_256_HASH_SIZE,
++      .setkey                 = crypto_blake2s_setkey,
++      .init                   = crypto_blake2s_init,
++      .update                 = crypto_blake2s_update,
++      .final                  = crypto_blake2s_final,
++      .descsize               = sizeof(struct blake2s_state),
++}};
++
++static int __init blake2s_mod_init(void)
++{
++      if (!boot_cpu_has(X86_FEATURE_SSSE3))
++              return 0; /* init still succeeds, but no static key is enabled and nothing is registered */
++
++      static_branch_enable(&blake2s_use_ssse3);
++
++      if (IS_ENABLED(CONFIG_AS_AVX512) && /* the AVX-512 routine is only assembled under CONFIG_AS_AVX512 */
++          boot_cpu_has(X86_FEATURE_AVX) &&
++          boot_cpu_has(X86_FEATURE_AVX2) &&
++          boot_cpu_has(X86_FEATURE_AVX512F) &&
++          boot_cpu_has(X86_FEATURE_AVX512VL) &&
++          cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM |
++                            XFEATURE_MASK_AVX512, NULL)) /* presumably checks that these register states are enabled for save/restore - confirm */
++              static_branch_enable(&blake2s_use_avx512);
++
++      return crypto_register_shashes(blake2s_algs, ARRAY_SIZE(blake2s_algs));
++}
++
++static void __exit blake2s_mod_exit(void)
++{
++      if (boot_cpu_has(X86_FEATURE_SSSE3)) /* same condition under which blake2s_mod_init() registered the algorithms */
++              crypto_unregister_shashes(blake2s_algs, ARRAY_SIZE(blake2s_algs));
++}
++
++module_init(blake2s_mod_init);
++module_exit(blake2s_mod_exit);
++
++MODULE_ALIAS_CRYPTO("blake2s-128");
++MODULE_ALIAS_CRYPTO("blake2s-128-x86");
++MODULE_ALIAS_CRYPTO("blake2s-160");
++MODULE_ALIAS_CRYPTO("blake2s-160-x86");
++MODULE_ALIAS_CRYPTO("blake2s-224");
++MODULE_ALIAS_CRYPTO("blake2s-224-x86");
++MODULE_ALIAS_CRYPTO("blake2s-256");
++MODULE_ALIAS_CRYPTO("blake2s-256-x86");
++MODULE_LICENSE("GPL v2");
+--- a/crypto/Kconfig
++++ b/crypto/Kconfig
+@@ -657,6 +657,12 @@ config CRYPTO_BLAKE2S
+ 
+         See https://blake2.net for further information.
+ 
++config CRYPTO_BLAKE2S_X86
++      tristate "BLAKE2s digest algorithm (x86 accelerated version)"
++      depends on X86 && 64BIT
++      select CRYPTO_LIB_BLAKE2S_GENERIC
++      select CRYPTO_ARCH_HAVE_LIB_BLAKE2S
++
+ config CRYPTO_CRCT10DIF
+       tristate "CRCT10DIF algorithm"
+       select CRYPTO_HASH
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0025-crypto-curve25519-generic-C-library-implementations.patch b/target/linux/generic/backport-5.4/080-wireguard-0025-crypto-curve25519-generic-C-library-implementations.patch

new file mode 100644 (file)

index 0000000..87d4d41
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0025-crypto-curve25519-generic-C-library-implementations.patch
@@ -0,0 +1,1850 @@
+From feadb4076186623fb4ca14d8f70759637c4df1f2 Mon Sep 17 00:00:00 2001
+From: "Jason A. Donenfeld" <Jason@zx2c4.com>
+Date: Fri, 8 Nov 2019 13:22:32 +0100
+Subject: [PATCH 025/124] crypto: curve25519 - generic C library
+ implementations
+
+commit 0ed42a6f431e930b2e8fae21955406e09fe75d70 upstream.
+
+This contains two formally verified C implementations of the Curve25519
+scalar multiplication function, one for 32-bit systems, and one for
+64-bit systems whose compiler supports efficient 128-bit integer types.
+Not only are these implementations formally verified, but they are also
+the fastest available C implementations. They have been modified to be
+friendly to kernel space and to be generally less horrendous looking,
+but still an effort has been made to retain their formally verified
+characteristic, and so the C might look slightly unidiomatic.
+
+The 64-bit version comes from HACL*: https://github.com/project-everest/hacl-star
+The 32-bit version comes from Fiat: https://github.com/mit-plv/fiat-crypto
+
+Information: https://cr.yp.to/ecdh.html
+
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+[ardb: - move from lib/zinc to lib/crypto
+       - replace .c #includes with Kconfig based object selection
+       - drop simd handling and simplify support for per-arch versions ]
+Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
+Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ include/crypto/curve25519.h    |  71 +++
+ lib/crypto/Kconfig             |  25 +
+ lib/crypto/Makefile            |   5 +
+ lib/crypto/curve25519-fiat32.c | 864 +++++++++++++++++++++++++++++++++
+ lib/crypto/curve25519-hacl64.c | 788 ++++++++++++++++++++++++++++++
+ lib/crypto/curve25519.c        |  25 +
+ 6 files changed, 1778 insertions(+)
+ create mode 100644 include/crypto/curve25519.h
+ create mode 100644 lib/crypto/curve25519-fiat32.c
+ create mode 100644 lib/crypto/curve25519-hacl64.c
+ create mode 100644 lib/crypto/curve25519.c
+
+--- /dev/null
++++ b/include/crypto/curve25519.h
+@@ -0,0 +1,71 @@
++/* SPDX-License-Identifier: GPL-2.0 OR MIT */
++/*
++ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
++ */
++
++#ifndef CURVE25519_H
++#define CURVE25519_H
++
++#include <crypto/algapi.h> // For crypto_memneq.
++#include <linux/types.h>
++#include <linux/random.h>
++
++enum curve25519_lengths {
++      CURVE25519_KEY_SIZE = 32 /* byte length of every key/point array taken by this header's functions */
++};
++
++extern const u8 curve25519_null_point[];
++extern const u8 curve25519_base_point[];
++
++void curve25519_generic(u8 out[CURVE25519_KEY_SIZE],
++                      const u8 scalar[CURVE25519_KEY_SIZE],
++                      const u8 point[CURVE25519_KEY_SIZE]);
++
++void curve25519_arch(u8 out[CURVE25519_KEY_SIZE],
++                   const u8 scalar[CURVE25519_KEY_SIZE],
++                   const u8 point[CURVE25519_KEY_SIZE]);
++
++void curve25519_base_arch(u8 pub[CURVE25519_KEY_SIZE],
++                        const u8 secret[CURVE25519_KEY_SIZE]);
++
++static inline /* fills mypublic from secret and basepoint via the arch or generic backend */
++bool __must_check curve25519(u8 mypublic[CURVE25519_KEY_SIZE],
++                           const u8 secret[CURVE25519_KEY_SIZE],
++                           const u8 basepoint[CURVE25519_KEY_SIZE])
++{
++      if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_CURVE25519)) /* backend is selected by Kconfig */
++              curve25519_arch(mypublic, secret, basepoint);
++      else
++              curve25519_generic(mypublic, secret, basepoint);
++      return crypto_memneq(mypublic, curve25519_null_point,
++                           CURVE25519_KEY_SIZE); /* false when the result equals curve25519_null_point */
++}
++
++static inline bool
++__must_check curve25519_generate_public(u8 pub[CURVE25519_KEY_SIZE],
++                                      const u8 secret[CURVE25519_KEY_SIZE])
++{
++      if (unlikely(!crypto_memneq(secret, curve25519_null_point,
++                                  CURVE25519_KEY_SIZE)))
++              return false; /* a secret equal to curve25519_null_point is rejected before any computation */
++
++      if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_CURVE25519)) /* backend is selected by Kconfig */
++              curve25519_base_arch(pub, secret);
++      else
++              curve25519_generic(pub, secret, curve25519_base_point); /* generic path uses the exported base point */
++      return crypto_memneq(pub, curve25519_null_point, CURVE25519_KEY_SIZE); /* false when pub equals the null point */
++}
++
++static inline void curve25519_clamp_secret(u8 secret[CURVE25519_KEY_SIZE])
++{
++      secret[0] &= 248; /* 248 = 0xf8: clear the three lowest bits of the first byte */
++      secret[31] = (secret[31] & 127) | 64; /* last byte: clear bit 7, force bit 6 to 1 */
++}
++
++static inline void curve25519_generate_secret(u8 secret[CURVE25519_KEY_SIZE])
++{
++      get_random_bytes_wait(secret, CURVE25519_KEY_SIZE); /* NOTE(review): any status returned here is discarded - confirm that is intended */
++      curve25519_clamp_secret(secret); /* clamp the freshly filled buffer in place */
++}
++
++#endif /* CURVE25519_H */
+--- a/lib/crypto/Kconfig
++++ b/lib/crypto/Kconfig
+@@ -59,6 +59,31 @@ config CRYPTO_LIB_CHACHA
+         by either the generic implementation or an arch-specific one, if one
+         is available and enabled.
+ 
++config CRYPTO_ARCH_HAVE_LIB_CURVE25519
++      tristate
++      help
++        Declares whether the architecture provides an arch-specific
++        accelerated implementation of the Curve25519 library interface,
++        either builtin or as a module.
++
++config CRYPTO_LIB_CURVE25519_GENERIC
++      tristate
++      help
++        This symbol can be depended upon by arch implementations of the
++        Curve25519 library interface that require the generic code as a
++        fallback, e.g., for SIMD implementations. If no arch specific
++        implementation is enabled, this implementation serves the users
++        of CRYPTO_LIB_CURVE25519.
++
++config CRYPTO_LIB_CURVE25519
++      tristate "Curve25519 scalar multiplication library"
++      depends on CRYPTO_ARCH_HAVE_LIB_CURVE25519 || !CRYPTO_ARCH_HAVE_LIB_CURVE25519
++      select CRYPTO_LIB_CURVE25519_GENERIC if CRYPTO_ARCH_HAVE_LIB_CURVE25519=n
++      help
++        Enable the Curve25519 library interface. This interface may be
++        fulfilled by either the generic implementation or an arch-specific
++        one, if one is available and enabled.
++
+ config CRYPTO_LIB_DES
+       tristate
+ 
+--- a/lib/crypto/Makefile
++++ b/lib/crypto/Makefile
+@@ -16,6 +16,11 @@ libblake2s-generic-y                                += blake2s-gener
+ obj-$(CONFIG_CRYPTO_LIB_BLAKE2S)              += libblake2s.o
+ libblake2s-y                                  += blake2s.o
+ 
++obj-$(CONFIG_CRYPTO_LIB_CURVE25519_GENERIC)   += libcurve25519.o
++libcurve25519-y                                       := curve25519-fiat32.o
++libcurve25519-$(CONFIG_ARCH_SUPPORTS_INT128)  := curve25519-hacl64.o
++libcurve25519-y                                       += curve25519.o
++
+ obj-$(CONFIG_CRYPTO_LIB_DES)                  += libdes.o
+ libdes-y                                      := des.o
+ 
+--- /dev/null
++++ b/lib/crypto/curve25519-fiat32.c
+@@ -0,0 +1,864 @@
++// SPDX-License-Identifier: GPL-2.0 OR MIT
++/*
++ * Copyright (C) 2015-2016 The fiat-crypto Authors.
++ * Copyright (C) 2018-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
++ *
++ * This is a machine-generated formally verified implementation of Curve25519
++ * ECDH from: <https://github.com/mit-plv/fiat-crypto>. Though originally
++ * machine generated, it has been tweaked to be suitable for use in the kernel.
++ * It is optimized for 32-bit machines and machines that cannot work efficiently
++ * with 128-bit integer types.
++ */
++
++#include <asm/unaligned.h>
++#include <crypto/curve25519.h>
++#include <linux/string.h>
++
++/* fe means field element. Here the field is \Z/(2^255-19). An element t,
++ * entries t[0]...t[9], represents the integer t[0]+2^26 t[1]+2^51 t[2]+2^77
++ * t[3]+2^102 t[4]+...+2^230 t[9].
++ * fe limbs are bounded by 1.125*2^26,1.125*2^25,1.125*2^26,1.125*2^25,etc.
++ * Multiplication and carrying produce fe from fe_loose.
++ */
++typedef struct fe { u32 v[10]; } fe;
++
++/* fe_loose limbs are bounded by 3.375*2^26,3.375*2^25,3.375*2^26,3.375*2^25,etc
++ * Addition and subtraction produce fe_loose from (fe, fe).
++ */
++typedef struct fe_loose { u32 v[10]; } fe_loose;
++
++static __always_inline void fe_frombytes_impl(u32 h[10], const u8 *s)
++{
++      /* Unpack 32 little-endian bytes into ten limbs of alternating 26/25 bits. Ignores top bit of s. */
++      u32 a0 = get_unaligned_le32(s);
++      u32 a1 = get_unaligned_le32(s+4);
++      u32 a2 = get_unaligned_le32(s+8);
++      u32 a3 = get_unaligned_le32(s+12);
++      u32 a4 = get_unaligned_le32(s+16);
++      u32 a5 = get_unaligned_le32(s+20);
++      u32 a6 = get_unaligned_le32(s+24);
++      u32 a7 = get_unaligned_le32(s+28);
++      h[0] = a0&((1<<26)-1);                    /* 26 used, 32-26 left.   26 */
++      h[1] = (a0>>26) | ((a1&((1<<19)-1))<< 6); /* (32-26) + 19 =  6+19 = 25 */
++      h[2] = (a1>>19) | ((a2&((1<<13)-1))<<13); /* (32-19) + 13 = 13+13 = 26 */
++      h[3] = (a2>>13) | ((a3&((1<< 6)-1))<<19); /* (32-13) +  6 = 19+ 6 = 25 */
++      h[4] = (a3>> 6);                          /* (32- 6)              = 26 */
++      h[5] = a4&((1<<25)-1);                    /*                        25 */
++      h[6] = (a4>>25) | ((a5&((1<<19)-1))<< 7); /* (32-25) + 19 =  7+19 = 26 */
++      h[7] = (a5>>19) | ((a6&((1<<12)-1))<<13); /* (32-19) + 12 = 13+12 = 25 */
++      h[8] = (a6>>12) | ((a7&((1<< 6)-1))<<20); /* (32-12) +  6 = 20+ 6 = 26 */
++      h[9] = (a7>> 6)&((1<<25)-1); /* 25; the mask drops bit 31 of a7, i.e. the top bit of s */
++}
++
++static __always_inline void fe_frombytes(fe *h, const u8 *s)
++{
++      fe_frombytes_impl(h->v, s); /* typed wrapper: unpack s into the limb array of h */
++}
++
++static __always_inline u8 /*bool*/
++addcarryx_u25(u8 /*bool*/ c, u32 a, u32 b, u32 *low)
++{
++      /* This function extracts 25 bits of result and 1 bit of carry
++       * (26 total), so a 32-bit intermediate is sufficient.
++       */
++      u32 x = a + b + c;
++      *low = x & ((1 << 25) - 1); /* low 25 bits of the sum */
++      return (x >> 25) & 1; /* carry out is bit 25 */
++}
++
++static __always_inline u8 /*bool*/
++addcarryx_u26(u8 /*bool*/ c, u32 a, u32 b, u32 *low)
++{
++      /* This function extracts 26 bits of result and 1 bit of carry
++       * (27 total), so a 32-bit intermediate is sufficient.
++       */
++      u32 x = a + b + c;
++      *low = x & ((1 << 26) - 1); /* low 26 bits of the sum */
++      return (x >> 26) & 1; /* carry out is bit 26 */
++}
++
++static __always_inline u8 /*bool*/
++subborrow_u25(u8 /*bool*/ c, u32 a, u32 b, u32 *low)
++{
++      /* This function extracts 25 bits of result and 1 bit of borrow
++       * (26 total), so a 32-bit intermediate is sufficient.
++       */
++      u32 x = a - b - c;
++      *low = x & ((1 << 25) - 1); /* low 25 bits of the wrapped difference */
++      return x >> 31; /* borrow is read from the top bit of the u32 difference */
++}
++
++static __always_inline u8 /*bool*/
++subborrow_u26(u8 /*bool*/ c, u32 a, u32 b, u32 *low)
++{
++      /* This function extracts 26 bits of result and 1 bit of borrow
++       * (27 total), so a 32-bit intermediate is sufficient.
++       */
++      u32 x = a - b - c;
++      *low = x & ((1 << 26) - 1); /* low 26 bits of the wrapped difference */
++      return x >> 31; /* borrow is read from the top bit of the u32 difference */
++}
++
++static __always_inline u32 cmovznz32(u32 t, u32 z, u32 nz)
++{
++      t = -!!t; /* all set if nonzero, 0 if 0 */
++      return (t&nz) | ((~t)&z); /* mask-based select: nz when t was nonzero, otherwise z */
++}
++
++static __always_inline void fe_freeze(u32 out[10], const u32 in1[10])
++{
++      { const u32 x17 = in1[9];
++      { const u32 x18 = in1[8];
++      { const u32 x16 = in1[7];
++      { const u32 x14 = in1[6];
++      { const u32 x12 = in1[5];
++      { const u32 x10 = in1[4];
++      { const u32 x8 = in1[3];
++      { const u32 x6 = in1[2];
++      { const u32 x4 = in1[1];
++      { const u32 x2 = in1[0];
++      { u32 x20; u8/*bool*/ x21 = subborrow_u26(0x0, x2, 0x3ffffed, &x20); /* subtract p = 2^255-19 limb by limb; 0x3ffffed = 2^26-19 */
++      { u32 x23; u8/*bool*/ x24 = subborrow_u25(x21, x4, 0x1ffffff, &x23); /* remaining limbs of p are all ones (2^25-1 / 2^26-1) */
++      { u32 x26; u8/*bool*/ x27 = subborrow_u26(x24, x6, 0x3ffffff, &x26);
++      { u32 x29; u8/*bool*/ x30 = subborrow_u25(x27, x8, 0x1ffffff, &x29);
++      { u32 x32; u8/*bool*/ x33 = subborrow_u26(x30, x10, 0x3ffffff, &x32);
++      { u32 x35; u8/*bool*/ x36 = subborrow_u25(x33, x12, 0x1ffffff, &x35);
++      { u32 x38; u8/*bool*/ x39 = subborrow_u26(x36, x14, 0x3ffffff, &x38);
++      { u32 x41; u8/*bool*/ x42 = subborrow_u25(x39, x16, 0x1ffffff, &x41);
++      { u32 x44; u8/*bool*/ x45 = subborrow_u26(x42, x18, 0x3ffffff, &x44);
++      { u32 x47; u8/*bool*/ x48 = subborrow_u25(x45, x17, 0x1ffffff, &x47);
++      { u32 x49 = cmovznz32(x48, 0x0, 0xffffffff); /* all-ones mask if the subtraction borrowed, else 0 */
++      { u32 x50 = (x49 & 0x3ffffed); /* masked limbs of p: added back only when the mask is set */
++      { u32 x52; u8/*bool*/ x53 = addcarryx_u26(0x0, x20, x50, &x52);
++      { u32 x54 = (x49 & 0x1ffffff);
++      { u32 x56; u8/*bool*/ x57 = addcarryx_u25(x53, x23, x54, &x56);
++      { u32 x58 = (x49 & 0x3ffffff);
++      { u32 x60; u8/*bool*/ x61 = addcarryx_u26(x57, x26, x58, &x60);
++      { u32 x62 = (x49 & 0x1ffffff);
++      { u32 x64; u8/*bool*/ x65 = addcarryx_u25(x61, x29, x62, &x64);
++      { u32 x66 = (x49 & 0x3ffffff);
++      { u32 x68; u8/*bool*/ x69 = addcarryx_u26(x65, x32, x66, &x68);
++      { u32 x70 = (x49 & 0x1ffffff);
++      { u32 x72; u8/*bool*/ x73 = addcarryx_u25(x69, x35, x70, &x72);
++      { u32 x74 = (x49 & 0x3ffffff);
++      { u32 x76; u8/*bool*/ x77 = addcarryx_u26(x73, x38, x74, &x76);
++      { u32 x78 = (x49 & 0x1ffffff);
++      { u32 x80; u8/*bool*/ x81 = addcarryx_u25(x77, x41, x78, &x80);
++      { u32 x82 = (x49 & 0x3ffffff);
++      { u32 x84; u8/*bool*/ x85 = addcarryx_u26(x81, x44, x82, &x84);
++      { u32 x86 = (x49 & 0x1ffffff);
++      { u32 x88; addcarryx_u25(x85, x47, x86, &x88); /* final carry out is intentionally not captured */
++      out[0] = x52;
++      out[1] = x56;
++      out[2] = x60;
++      out[3] = x64;
++      out[4] = x68;
++      out[5] = x72;
++      out[6] = x76;
++      out[7] = x80;
++      out[8] = x84;
++      out[9] = x88;
++      }}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}
++}
++
++static __always_inline void fe_tobytes(u8 s[32], const fe *f)
++{
++      u32 h[10];
++      fe_freeze(h, f->v); /* pack the frozen limbs, not f->v directly */
++      s[0] = h[0] >> 0; /* each store keeps only the low 8 bits of the u32 expression */
++      s[1] = h[0] >> 8;
++      s[2] = h[0] >> 16;
++      s[3] = (h[0] >> 24) | (h[1] << 2); /* byte straddles two limbs: bits 24+ of h[0], then h[1] */
++      s[4] = h[1] >> 6;
++      s[5] = h[1] >> 14;
++      s[6] = (h[1] >> 22) | (h[2] << 3);
++      s[7] = h[2] >> 5;
++      s[8] = h[2] >> 13;
++      s[9] = (h[2] >> 21) | (h[3] << 5);
++      s[10] = h[3] >> 3;
++      s[11] = h[3] >> 11;
++      s[12] = (h[3] >> 19) | (h[4] << 6);
++      s[13] = h[4] >> 2;
++      s[14] = h[4] >> 10;
++      s[15] = h[4] >> 18;
++      s[16] = h[5] >> 0; /* h[5] starts on a byte boundary (byte 16) */
++      s[17] = h[5] >> 8;
++      s[18] = h[5] >> 16;
++      s[19] = (h[5] >> 24) | (h[6] << 1);
++      s[20] = h[6] >> 7;
++      s[21] = h[6] >> 15;
++      s[22] = (h[6] >> 23) | (h[7] << 3);
++      s[23] = h[7] >> 5;
++      s[24] = h[7] >> 13;
++      s[25] = (h[7] >> 21) | (h[8] << 4);
++      s[26] = h[8] >> 4;
++      s[27] = h[8] >> 12;
++      s[28] = (h[8] >> 20) | (h[9] << 6);
++      s[29] = h[9] >> 2;
++      s[30] = h[9] >> 10;
++      s[31] = h[9] >> 18;
++}
++
++/* h = f (copies all ten limbs) */
++static __always_inline void fe_copy(fe *h, const fe *f)
++{
++      memmove(h, f, sizeof(u32) * 10); /* memmove, so h and f may be the same object */
++}
++
++static __always_inline void fe_copy_lt(fe_loose *h, const fe *f)
++{
++      memmove(h, f, sizeof(u32) * 10); /* same ten-limb copy as fe_copy, but into an fe_loose destination */
++}
++
++/* h = 0 (all ten limbs cleared) */
++static __always_inline void fe_0(fe *h)
++{
++      memset(h, 0, sizeof(u32) * 10);
++}
++
++/* h = 1 (limb 0 is 1, the other nine limbs are 0) */
++static __always_inline void fe_1(fe *h)
++{
++      memset(h, 0, sizeof(u32) * 10);
++      h->v[0] = 1;
++}
++
++static void fe_add_impl(u32 out[10], const u32 in1[10], const u32 in2[10])
++{
++      { const u32 x20 = in1[9]; /* every input limb is read before any output limb is written */
++      { const u32 x21 = in1[8];
++      { const u32 x19 = in1[7];
++      { const u32 x17 = in1[6];
++      { const u32 x15 = in1[5];
++      { const u32 x13 = in1[4];
++      { const u32 x11 = in1[3];
++      { const u32 x9 = in1[2];
++      { const u32 x7 = in1[1];
++      { const u32 x5 = in1[0];
++      { const u32 x38 = in2[9];
++      { const u32 x39 = in2[8];
++      { const u32 x37 = in2[7];
++      { const u32 x35 = in2[6];
++      { const u32 x33 = in2[5];
++      { const u32 x31 = in2[4];
++      { const u32 x29 = in2[3];
++      { const u32 x27 = in2[2];
++      { const u32 x25 = in2[1];
++      { const u32 x23 = in2[0];
++      out[0] = (x5 + x23); /* plain limb-wise sums; no carry is propagated between limbs */
++      out[1] = (x7 + x25);
++      out[2] = (x9 + x27);
++      out[3] = (x11 + x29);
++      out[4] = (x13 + x31);
++      out[5] = (x15 + x33);
++      out[6] = (x17 + x35);
++      out[7] = (x19 + x37);
++      out[8] = (x21 + x39);
++      out[9] = (x20 + x38);
++      }}}}}}}}}}}}}}}}}}}}
++}
++
++/* h = f + g (limb-wise, without carrying; the result type is fe_loose)
++ * Can overlap h with f or g.
++ */
++static __always_inline void fe_add(fe_loose *h, const fe *f, const fe *g)
++{
++      fe_add_impl(h->v, f->v, g->v);
++}
++
++static void fe_sub_impl(u32 out[10], const u32 in1[10], const u32 in2[10])
++{
++      { const u32 x20 = in1[9]; /* every input limb is read before any output limb is written */
++      { const u32 x21 = in1[8];
++      { const u32 x19 = in1[7];
++      { const u32 x17 = in1[6];
++      { const u32 x15 = in1[5];
++      { const u32 x13 = in1[4];
++      { const u32 x11 = in1[3];
++      { const u32 x9 = in1[2];
++      { const u32 x7 = in1[1];
++      { const u32 x5 = in1[0];
++      { const u32 x38 = in2[9];
++      { const u32 x39 = in2[8];
++      { const u32 x37 = in2[7];
++      { const u32 x35 = in2[6];
++      { const u32 x33 = in2[5];
++      { const u32 x31 = in2[4];
++      { const u32 x29 = in2[3];
++      { const u32 x27 = in2[2];
++      { const u32 x25 = in2[1];
++      { const u32 x23 = in2[0];
++      out[0] = ((0x7ffffda + x5) - x23); /* 0x7ffffda = 2*(2^26-19): limb 0 of 2*p is added before subtracting */
++      out[1] = ((0x3fffffe + x7) - x25); /* 0x3fffffe = 2*(2^25-1) */
++      out[2] = ((0x7fffffe + x9) - x27); /* 0x7fffffe = 2*(2^26-1) */
++      out[3] = ((0x3fffffe + x11) - x29);
++      out[4] = ((0x7fffffe + x13) - x31);
++      out[5] = ((0x3fffffe + x15) - x33);
++      out[6] = ((0x7fffffe + x17) - x35);
++      out[7] = ((0x3fffffe + x19) - x37);
++      out[8] = ((0x7fffffe + x21) - x39);
++      out[9] = ((0x3fffffe + x20) - x38);
++      }}}}}}}}}}}}}}}}}}}}
++}
++
++/* h = f - g (limb-wise with a 2*p bias, without carrying; the result type is fe_loose)
++ * Can overlap h with f or g.
++ */
++static __always_inline void fe_sub(fe_loose *h, const fe *f, const fe *g)
++{
++      fe_sub_impl(h->v, f->v, g->v);
++}
++
++static void fe_mul_impl(u32 out[10], const u32 in1[10], const u32 in2[10])
++{
++      { const u32 x20 = in1[9]; /* every input limb is read before any output limb is written */
++      { const u32 x21 = in1[8];
++      { const u32 x19 = in1[7];
++      { const u32 x17 = in1[6];
++      { const u32 x15 = in1[5];
++      { const u32 x13 = in1[4];
++      { const u32 x11 = in1[3];
++      { const u32 x9 = in1[2];
++      { const u32 x7 = in1[1];
++      { const u32 x5 = in1[0];
++      { const u32 x38 = in2[9];
++      { const u32 x39 = in2[8];
++      { const u32 x37 = in2[7];
++      { const u32 x35 = in2[6];
++      { const u32 x33 = in2[5];
++      { const u32 x31 = in2[4];
++      { const u32 x29 = in2[3];
++      { const u32 x27 = in2[2];
++      { const u32 x25 = in2[1];
++      { const u32 x23 = in2[0];
++      { u64 x40 = ((u64)x23 * x5); /* x40..x58: 64-bit sums of limb products, one per product position 0..18 */
++      { u64 x41 = (((u64)x23 * x7) + ((u64)x25 * x5));
++      { u64 x42 = ((((u64)(0x2 * x25) * x7) + ((u64)x23 * x9)) + ((u64)x27 * x5)); /* odd-index * odd-index limb products are doubled */
++      { u64 x43 = (((((u64)x25 * x9) + ((u64)x27 * x7)) + ((u64)x23 * x11)) + ((u64)x29 * x5));
++      { u64 x44 = (((((u64)x27 * x9) + (0x2 * (((u64)x25 * x11) + ((u64)x29 * x7)))) + ((u64)x23 * x13)) + ((u64)x31 * x5));
++      { u64 x45 = (((((((u64)x27 * x11) + ((u64)x29 * x9)) + ((u64)x25 * x13)) + ((u64)x31 * x7)) + ((u64)x23 * x15)) + ((u64)x33 * x5));
++      { u64 x46 = (((((0x2 * ((((u64)x29 * x11) + ((u64)x25 * x15)) + ((u64)x33 * x7))) + ((u64)x27 * x13)) + ((u64)x31 * x9)) + ((u64)x23 * x17)) + ((u64)x35 * x5));
++      { u64 x47 = (((((((((u64)x29 * x13) + ((u64)x31 * x11)) + ((u64)x27 * x15)) + ((u64)x33 * x9)) + ((u64)x25 * x17)) + ((u64)x35 * x7)) + ((u64)x23 * x19)) + ((u64)x37 * x5));
++      { u64 x48 = (((((((u64)x31 * x13) + (0x2 * (((((u64)x29 * x15) + ((u64)x33 * x11)) + ((u64)x25 * x19)) + ((u64)x37 * x7)))) + ((u64)x27 * x17)) + ((u64)x35 * x9)) + ((u64)x23 * x21)) + ((u64)x39 * x5));
++      { u64 x49 = (((((((((((u64)x31 * x15) + ((u64)x33 * x13)) + ((u64)x29 * x17)) + ((u64)x35 * x11)) + ((u64)x27 * x19)) + ((u64)x37 * x9)) + ((u64)x25 * x21)) + ((u64)x39 * x7)) + ((u64)x23 * x20)) + ((u64)x38 * x5));
++      { u64 x50 = (((((0x2 * ((((((u64)x33 * x15) + ((u64)x29 * x19)) + ((u64)x37 * x11)) + ((u64)x25 * x20)) + ((u64)x38 * x7))) + ((u64)x31 * x17)) + ((u64)x35 * x13)) + ((u64)x27 * x21)) + ((u64)x39 * x9));
++      { u64 x51 = (((((((((u64)x33 * x17) + ((u64)x35 * x15)) + ((u64)x31 * x19)) + ((u64)x37 * x13)) + ((u64)x29 * x21)) + ((u64)x39 * x11)) + ((u64)x27 * x20)) + ((u64)x38 * x9));
++      { u64 x52 = (((((u64)x35 * x17) + (0x2 * (((((u64)x33 * x19) + ((u64)x37 * x15)) + ((u64)x29 * x20)) + ((u64)x38 * x11)))) + ((u64)x31 * x21)) + ((u64)x39 * x13));
++      { u64 x53 = (((((((u64)x35 * x19) + ((u64)x37 * x17)) + ((u64)x33 * x21)) + ((u64)x39 * x15)) + ((u64)x31 * x20)) + ((u64)x38 * x13));
++      { u64 x54 = (((0x2 * ((((u64)x37 * x19) + ((u64)x33 * x20)) + ((u64)x38 * x15))) + ((u64)x35 * x21)) + ((u64)x39 * x17));
++      { u64 x55 = (((((u64)x37 * x21) + ((u64)x39 * x19)) + ((u64)x35 * x20)) + ((u64)x38 * x17));
++      { u64 x56 = (((u64)x39 * x21) + (0x2 * (((u64)x37 * x20) + ((u64)x38 * x19))));
++      { u64 x57 = (((u64)x39 * x20) + ((u64)x38 * x21));
++      { u64 x58 = ((u64)(0x2 * x38) * x20);
++      { u64 x59 = (x48 + (x58 << 0x4)); /* x59..x61 = x48 + 19*x58 (16 + 2 + 1): fold position 18 onto 8 modulo 2^255-19 */
++      { u64 x60 = (x59 + (x58 << 0x1));
++      { u64 x61 = (x60 + x58);
++      { u64 x62 = (x47 + (x57 << 0x4)); /* same 19x fold for each remaining high position (17 -> 7, ..., 10 -> 0) */
++      { u64 x63 = (x62 + (x57 << 0x1));
++      { u64 x64 = (x63 + x57);
++      { u64 x65 = (x46 + (x56 << 0x4));
++      { u64 x66 = (x65 + (x56 << 0x1));
++      { u64 x67 = (x66 + x56);
++      { u64 x68 = (x45 + (x55 << 0x4));
++      { u64 x69 = (x68 + (x55 << 0x1));
++      { u64 x70 = (x69 + x55);
++      { u64 x71 = (x44 + (x54 << 0x4));
++      { u64 x72 = (x71 + (x54 << 0x1));
++      { u64 x73 = (x72 + x54);
++      { u64 x74 = (x43 + (x53 << 0x4));
++      { u64 x75 = (x74 + (x53 << 0x1));
++      { u64 x76 = (x75 + x53);
++      { u64 x77 = (x42 + (x52 << 0x4));
++      { u64 x78 = (x77 + (x52 << 0x1));
++      { u64 x79 = (x78 + x52);
++      { u64 x80 = (x41 + (x51 << 0x4));
++      { u64 x81 = (x80 + (x51 << 0x1));
++      { u64 x82 = (x81 + x51);
++      { u64 x83 = (x40 + (x50 << 0x4));
++      { u64 x84 = (x83 + (x50 << 0x1));
++      { u64 x85 = (x84 + x50);
++      { u64 x86 = (x85 >> 0x1a); /* carry chain: shift out 26 (0x1a) or 25 (0x19) bits alternately, keep the masked low part */
++      { u32 x87 = ((u32)x85 & 0x3ffffff);
++      { u64 x88 = (x86 + x82);
++      { u64 x89 = (x88 >> 0x19);
++      { u32 x90 = ((u32)x88 & 0x1ffffff);
++      { u64 x91 = (x89 + x79);
++      { u64 x92 = (x91 >> 0x1a);
++      { u32 x93 = ((u32)x91 & 0x3ffffff);
++      { u64 x94 = (x92 + x76);
++      { u64 x95 = (x94 >> 0x19);
++      { u32 x96 = ((u32)x94 & 0x1ffffff);
++      { u64 x97 = (x95 + x73);
++      { u64 x98 = (x97 >> 0x1a);
++      { u32 x99 = ((u32)x97 & 0x3ffffff);
++      { u64 x100 = (x98 + x70);
++      { u64 x101 = (x100 >> 0x19);
++      { u32 x102 = ((u32)x100 & 0x1ffffff);
++      { u64 x103 = (x101 + x67);
++      { u64 x104 = (x103 >> 0x1a);
++      { u32 x105 = ((u32)x103 & 0x3ffffff);
++      { u64 x106 = (x104 + x64);
++      { u64 x107 = (x106 >> 0x19);
++      { u32 x108 = ((u32)x106 & 0x1ffffff);
++      { u64 x109 = (x107 + x61);
++      { u64 x110 = (x109 >> 0x1a);
++      { u32 x111 = ((u32)x109 & 0x3ffffff);
++      { u64 x112 = (x110 + x49);
++      { u64 x113 = (x112 >> 0x19);
++      { u32 x114 = ((u32)x112 & 0x1ffffff);
++      { u64 x115 = (x87 + (0x13 * x113)); /* carry out of limb 9 re-enters limb 0 multiplied by 19 (0x13) */
++      { u32 x116 = (u32) (x115 >> 0x1a);
++      { u32 x117 = ((u32)x115 & 0x3ffffff);
++      { u32 x118 = (x116 + x90);
++      { u32 x119 = (x118 >> 0x19);
++      { u32 x120 = (x118 & 0x1ffffff);
++      out[0] = x117;
++      out[1] = x120;
++      out[2] = (x119 + x93); /* last carry is added into limb 2 without further masking */
++      out[3] = x96;
++      out[4] = x99;
++      out[5] = x102;
++      out[6] = x105;
++      out[7] = x108;
++      out[8] = x111;
++      out[9] = x114;
++      }}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}
++}
++
++static __always_inline void fe_mul_ttt(fe *h, const fe *f, const fe *g)
++{
++      fe_mul_impl(h->v, f->v, g->v); /* typed wrapper: fe * fe -> fe */
++}
++
++static __always_inline void fe_mul_tlt(fe *h, const fe_loose *f, const fe *g)
++{
++      fe_mul_impl(h->v, f->v, g->v); /* typed wrapper: fe_loose * fe -> fe */
++}
++
++static __always_inline void
++fe_mul_tll(fe *h, const fe_loose *f, const fe_loose *g)
++{
++      fe_mul_impl(h->v, f->v, g->v); /* typed wrapper: fe_loose * fe_loose -> fe */
++}
++
++static void fe_sqr_impl(u32 out[10], const u32 in1[10])
++{
++      { const u32 x17 = in1[9]; /* every input limb is read before any output limb is written */
++      { const u32 x18 = in1[8];
++      { const u32 x16 = in1[7];
++      { const u32 x14 = in1[6];
++      { const u32 x12 = in1[5];
++      { const u32 x10 = in1[4];
++      { const u32 x8 = in1[3];
++      { const u32 x6 = in1[2];
++      { const u32 x4 = in1[1];
++      { const u32 x2 = in1[0];
++      { u64 x19 = ((u64)x2 * x2); /* x19..x37: 64-bit sums of limb products, one per product position 0..18 */
++      { u64 x20 = ((u64)(0x2 * x2) * x4); /* cross terms appear once with a factor of 2 */
++      { u64 x21 = (0x2 * (((u64)x4 * x4) + ((u64)x2 * x6)));
++      { u64 x22 = (0x2 * (((u64)x4 * x6) + ((u64)x2 * x8)));
++      { u64 x23 = ((((u64)x6 * x6) + ((u64)(0x4 * x4) * x8)) + ((u64)(0x2 * x2) * x10));
++      { u64 x24 = (0x2 * ((((u64)x6 * x8) + ((u64)x4 * x10)) + ((u64)x2 * x12)));
++      { u64 x25 = (0x2 * (((((u64)x8 * x8) + ((u64)x6 * x10)) + ((u64)x2 * x14)) + ((u64)(0x2 * x4) * x12)));
++      { u64 x26 = (0x2 * (((((u64)x8 * x10) + ((u64)x6 * x12)) + ((u64)x4 * x14)) + ((u64)x2 * x16)));
++      { u64 x27 = (((u64)x10 * x10) + (0x2 * ((((u64)x6 * x14) + ((u64)x2 * x18)) + (0x2 * (((u64)x4 * x16) + ((u64)x8 * x12))))));
++      { u64 x28 = (0x2 * ((((((u64)x10 * x12) + ((u64)x8 * x14)) + ((u64)x6 * x16)) + ((u64)x4 * x18)) + ((u64)x2 * x17)));
++      { u64 x29 = (0x2 * (((((u64)x12 * x12) + ((u64)x10 * x14)) + ((u64)x6 * x18)) + (0x2 * (((u64)x8 * x16) + ((u64)x4 * x17)))));
++      { u64 x30 = (0x2 * (((((u64)x12 * x14) + ((u64)x10 * x16)) + ((u64)x8 * x18)) + ((u64)x6 * x17)));
++      { u64 x31 = (((u64)x14 * x14) + (0x2 * (((u64)x10 * x18) + (0x2 * (((u64)x12 * x16) + ((u64)x8 * x17))))));
++      { u64 x32 = (0x2 * ((((u64)x14 * x16) + ((u64)x12 * x18)) + ((u64)x10 * x17)));
++      { u64 x33 = (0x2 * ((((u64)x16 * x16) + ((u64)x14 * x18)) + ((u64)(0x2 * x12) * x17)));
++      { u64 x34 = (0x2 * (((u64)x16 * x18) + ((u64)x14 * x17)));
++      { u64 x35 = (((u64)x18 * x18) + ((u64)(0x4 * x16) * x17));
++      { u64 x36 = ((u64)(0x2 * x18) * x17);
++      { u64 x37 = ((u64)(0x2 * x17) * x17);
++      { u64 x38 = (x27 + (x37 << 0x4)); /* x38..x40 = x27 + 19*x37 (16 + 2 + 1): fold position 18 onto 8 modulo 2^255-19 */
++      { u64 x39 = (x38 + (x37 << 0x1));
++      { u64 x40 = (x39 + x37);
++      { u64 x41 = (x26 + (x36 << 0x4)); /* same 19x fold for each remaining high position (17 -> 7, ..., 10 -> 0) */
++      { u64 x42 = (x41 + (x36 << 0x1));
++      { u64 x43 = (x42 + x36);
++      { u64 x44 = (x25 + (x35 << 0x4));
++      { u64 x45 = (x44 + (x35 << 0x1));
++      { u64 x46 = (x45 + x35);
++      { u64 x47 = (x24 + (x34 << 0x4));
++      { u64 x48 = (x47 + (x34 << 0x1));
++      { u64 x49 = (x48 + x34);
++      { u64 x50 = (x23 + (x33 << 0x4));
++      { u64 x51 = (x50 + (x33 << 0x1));
++      { u64 x52 = (x51 + x33);
++      { u64 x53 = (x22 + (x32 << 0x4));
++      { u64 x54 = (x53 + (x32 << 0x1));
++      { u64 x55 = (x54 + x32);
++      { u64 x56 = (x21 + (x31 << 0x4));
++      { u64 x57 = (x56 + (x31 << 0x1));
++      { u64 x58 = (x57 + x31);
++      { u64 x59 = (x20 + (x30 << 0x4));
++      { u64 x60 = (x59 + (x30 << 0x1));
++      { u64 x61 = (x60 + x30);
++      { u64 x62 = (x19 + (x29 << 0x4));
++      { u64 x63 = (x62 + (x29 << 0x1));
++      { u64 x64 = (x63 + x29);
++      { u64 x65 = (x64 >> 0x1a); /* carry chain: shift out 26 (0x1a) or 25 (0x19) bits alternately, keep the masked low part */
++      { u32 x66 = ((u32)x64 & 0x3ffffff);
++      { u64 x67 = (x65 + x61);
++      { u64 x68 = (x67 >> 0x19);
++      { u32 x69 = ((u32)x67 & 0x1ffffff);
++      { u64 x70 = (x68 + x58);
++      { u64 x71 = (x70 >> 0x1a);
++      { u32 x72 = ((u32)x70 & 0x3ffffff);
++      { u64 x73 = (x71 + x55);
++      { u64 x74 = (x73 >> 0x19);
++      { u32 x75 = ((u32)x73 & 0x1ffffff);
++      { u64 x76 = (x74 + x52);
++      { u64 x77 = (x76 >> 0x1a);
++      { u32 x78 = ((u32)x76 & 0x3ffffff);
++      { u64 x79 = (x77 + x49);
++      { u64 x80 = (x79 >> 0x19);
++      { u32 x81 = ((u32)x79 & 0x1ffffff);
++      { u64 x82 = (x80 + x46);
++      { u64 x83 = (x82 >> 0x1a);
++      { u32 x84 = ((u32)x82 & 0x3ffffff);
++      { u64 x85 = (x83 + x43);
++      { u64 x86 = (x85 >> 0x19);
++      { u32 x87 = ((u32)x85 & 0x1ffffff);
++      { u64 x88 = (x86 + x40);
++      { u64 x89 = (x88 >> 0x1a);
++      { u32 x90 = ((u32)x88 & 0x3ffffff);
++      { u64 x91 = (x89 + x28);
++      { u64 x92 = (x91 >> 0x19);
++      { u32 x93 = ((u32)x91 & 0x1ffffff);
++      { u64 x94 = (x66 + (0x13 * x92)); /* carry out of limb 9 re-enters limb 0 multiplied by 19 (0x13) */
++      { u32 x95 = (u32) (x94 >> 0x1a);
++      { u32 x96 = ((u32)x94 & 0x3ffffff);
++      { u32 x97 = (x95 + x69);
++      { u32 x98 = (x97 >> 0x19);
++      { u32 x99 = (x97 & 0x1ffffff);
++      out[0] = x96;
++      out[1] = x99;
++      out[2] = (x98 + x72); /* last carry is added into limb 2 without further masking */
++      out[3] = x75;
++      out[4] = x78;
++      out[5] = x81;
++      out[6] = x84;
++      out[7] = x87;
++      out[8] = x90;
++      out[9] = x93;
++      }}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}
++}
++
++static __always_inline void fe_sq_tl(fe *h, const fe_loose *f)
++{
++      fe_sqr_impl(h->v, f->v); /* typed wrapper: square of an fe_loose -> fe */
++}
++
++static __always_inline void fe_sq_tt(fe *h, const fe *f)
++{
++      fe_sqr_impl(h->v, f->v); /* typed wrapper: square of an fe -> fe */
++}
++
++static __always_inline void fe_loose_invert(fe *out, const fe_loose *z)
++{
++      fe t0;
++      fe t1;
++      fe t2;
++      fe t3;
++      int i;
++
++      fe_sq_tl(&t0, z); /* t0 = z^2 */
++      fe_sq_tt(&t1, &t0);
++      for (i = 1; i < 2; ++i)
++              fe_sq_tt(&t1, &t1); /* t1 = z^8 */
++      fe_mul_tlt(&t1, z, &t1); /* t1 = z^9 */
++      fe_mul_ttt(&t0, &t0, &t1); /* t0 = z^11 */
++      fe_sq_tt(&t2, &t0); /* t2 = z^22 */
++      fe_mul_ttt(&t1, &t1, &t2); /* t1 = z^31 = z^(2^5 - 1) */
++      fe_sq_tt(&t2, &t1);
++      for (i = 1; i < 5; ++i)
++              fe_sq_tt(&t2, &t2); /* 5 squarings in total */
++      fe_mul_ttt(&t1, &t2, &t1); /* t1 = z^(2^10 - 1) */
++      fe_sq_tt(&t2, &t1);
++      for (i = 1; i < 10; ++i)
++              fe_sq_tt(&t2, &t2); /* 10 squarings in total */
++      fe_mul_ttt(&t2, &t2, &t1); /* t2 = z^(2^20 - 1) */
++      fe_sq_tt(&t3, &t2);
++      for (i = 1; i < 20; ++i)
++              fe_sq_tt(&t3, &t3); /* 20 squarings in total */
++      fe_mul_ttt(&t2, &t3, &t2); /* t2 = z^(2^40 - 1) */
++      fe_sq_tt(&t2, &t2);
++      for (i = 1; i < 10; ++i)
++              fe_sq_tt(&t2, &t2); /* 10 squarings in total */
++      fe_mul_ttt(&t1, &t2, &t1); /* t1 = z^(2^50 - 1) */
++      fe_sq_tt(&t2, &t1);
++      for (i = 1; i < 50; ++i)
++              fe_sq_tt(&t2, &t2); /* 50 squarings in total */
++      fe_mul_ttt(&t2, &t2, &t1); /* t2 = z^(2^100 - 1) */
++      fe_sq_tt(&t3, &t2);
++      for (i = 1; i < 100; ++i)
++              fe_sq_tt(&t3, &t3); /* 100 squarings in total */
++      fe_mul_ttt(&t2, &t3, &t2); /* t2 = z^(2^200 - 1) */
++      fe_sq_tt(&t2, &t2);
++      for (i = 1; i < 50; ++i)
++              fe_sq_tt(&t2, &t2); /* 50 squarings in total */
++      fe_mul_ttt(&t1, &t2, &t1); /* t1 = z^(2^250 - 1) */
++      fe_sq_tt(&t1, &t1);
++      for (i = 1; i < 5; ++i)
++              fe_sq_tt(&t1, &t1); /* t1 = z^(2^255 - 32) */
++      fe_mul_ttt(out, &t1, &t0); /* out = z^(2^255 - 21) = z^(p - 2) for p = 2^255 - 19 */
++}
++
++static __always_inline void fe_invert(fe *out, const fe *z)
++{
++      fe_loose l;
++      fe_copy_lt(&l, z); /* copy z into an fe_loose so the loose-input routine can be reused */
++      fe_loose_invert(out, &l);
++}
++
++/* Replace (f,g) with (g,f) if b == 1;
++ * replace (f,g) with (f,g) if b == 0.
++ * The swap is applied through a bit mask; no statement branches on b.
++ * Preconditions: b in {0,1}
++ */
++static __always_inline void fe_cswap(fe *f, fe *g, unsigned int b)
++{
++      unsigned i;
++      b = 0 - b; /* b == 1 becomes an all-ones mask, b == 0 stays zero */
++      for (i = 0; i < 10; i++) {
++              u32 x = f->v[i] ^ g->v[i];
++              x &= b; /* x keeps the XOR difference when swapping, otherwise 0 */
++              f->v[i] ^= x;
++              g->v[i] ^= x;
++      }
++}
++
++/* NOTE: based on fiat-crypto fe_mul, specialized for in2 = {121666, 0, ..., 0}. */
++static __always_inline void fe_mul_121666_impl(u32 out[10], const u32 in1[10])
++{
++      { const u32 x20 = in1[9];
++      { const u32 x21 = in1[8];
++      { const u32 x19 = in1[7];
++      { const u32 x17 = in1[6];
++      { const u32 x15 = in1[5];
++      { const u32 x13 = in1[4];
++      { const u32 x11 = in1[3];
++      { const u32 x9 = in1[2];
++      { const u32 x7 = in1[1];
++      { const u32 x5 = in1[0];
++      { const u32 x38 = 0;
++      { const u32 x39 = 0;
++      { const u32 x37 = 0;
++      { const u32 x35 = 0;
++      { const u32 x33 = 0;
++      { const u32 x31 = 0;
++      { const u32 x29 = 0;
++      { const u32 x27 = 0;
++      { const u32 x25 = 0;
++      { const u32 x23 = 121666;
++      { u64 x40 = ((u64)x23 * x5);
++      { u64 x41 = (((u64)x23 * x7) + ((u64)x25 * x5));
++      { u64 x42 = ((((u64)(0x2 * x25) * x7) + ((u64)x23 * x9)) + ((u64)x27 * x5));
++      { u64 x43 = (((((u64)x25 * x9) + ((u64)x27 * x7)) + ((u64)x23 * x11)) + ((u64)x29 * x5));
++      { u64 x44 = (((((u64)x27 * x9) + (0x2 * (((u64)x25 * x11) + ((u64)x29 * x7)))) + ((u64)x23 * x13)) + ((u64)x31 * x5));
++      { u64 x45 = (((((((u64)x27 * x11) + ((u64)x29 * x9)) + ((u64)x25 * x13)) + ((u64)x31 * x7)) + ((u64)x23 * x15)) + ((u64)x33 * x5));
++      { u64 x46 = (((((0x2 * ((((u64)x29 * x11) + ((u64)x25 * x15)) + ((u64)x33 * x7))) + ((u64)x27 * x13)) + ((u64)x31 * x9)) + ((u64)x23 * x17)) + ((u64)x35 * x5));
++      { u64 x47 = (((((((((u64)x29 * x13) + ((u64)x31 * x11)) + ((u64)x27 * x15)) + ((u64)x33 * x9)) + ((u64)x25 * x17)) + ((u64)x35 * x7)) + ((u64)x23 * x19)) + ((u64)x37 * x5));
++      { u64 x48 = (((((((u64)x31 * x13) + (0x2 * (((((u64)x29 * x15) + ((u64)x33 * x11)) + ((u64)x25 * x19)) + ((u64)x37 * x7)))) + ((u64)x27 * x17)) + ((u64)x35 * x9)) + ((u64)x23 * x21)) + ((u64)x39 * x5));
++      { u64 x49 = (((((((((((u64)x31 * x15) + ((u64)x33 * x13)) + ((u64)x29 * x17)) + ((u64)x35 * x11)) + ((u64)x27 * x19)) + ((u64)x37 * x9)) + ((u64)x25 * x21)) + ((u64)x39 * x7)) + ((u64)x23 * x20)) + ((u64)x38 * x5));
++      { u64 x50 = (((((0x2 * ((((((u64)x33 * x15) + ((u64)x29 * x19)) + ((u64)x37 * x11)) + ((u64)x25 * x20)) + ((u64)x38 * x7))) + ((u64)x31 * x17)) + ((u64)x35 * x13)) + ((u64)x27 * x21)) + ((u64)x39 * x9));
++      { u64 x51 = (((((((((u64)x33 * x17) + ((u64)x35 * x15)) + ((u64)x31 * x19)) + ((u64)x37 * x13)) + ((u64)x29 * x21)) + ((u64)x39 * x11)) + ((u64)x27 * x20)) + ((u64)x38 * x9));
++      { u64 x52 = (((((u64)x35 * x17) + (0x2 * (((((u64)x33 * x19) + ((u64)x37 * x15)) + ((u64)x29 * x20)) + ((u64)x38 * x11)))) + ((u64)x31 * x21)) + ((u64)x39 * x13));
++      { u64 x53 = (((((((u64)x35 * x19) + ((u64)x37 * x17)) + ((u64)x33 * x21)) + ((u64)x39 * x15)) + ((u64)x31 * x20)) + ((u64)x38 * x13));
++      { u64 x54 = (((0x2 * ((((u64)x37 * x19) + ((u64)x33 * x20)) + ((u64)x38 * x15))) + ((u64)x35 * x21)) + ((u64)x39 * x17));
++      { u64 x55 = (((((u64)x37 * x21) + ((u64)x39 * x19)) + ((u64)x35 * x20)) + ((u64)x38 * x17));
++      { u64 x56 = (((u64)x39 * x21) + (0x2 * (((u64)x37 * x20) + ((u64)x38 * x19))));
++      { u64 x57 = (((u64)x39 * x20) + ((u64)x38 * x21));
++      { u64 x58 = ((u64)(0x2 * x38) * x20);
++      { u64 x59 = (x48 + (x58 << 0x4));
++      { u64 x60 = (x59 + (x58 << 0x1));
++      { u64 x61 = (x60 + x58);
++      { u64 x62 = (x47 + (x57 << 0x4));
++      { u64 x63 = (x62 + (x57 << 0x1));
++      { u64 x64 = (x63 + x57);
++      { u64 x65 = (x46 + (x56 << 0x4));
++      { u64 x66 = (x65 + (x56 << 0x1));
++      { u64 x67 = (x66 + x56);
++      { u64 x68 = (x45 + (x55 << 0x4));
++      { u64 x69 = (x68 + (x55 << 0x1));
++      { u64 x70 = (x69 + x55);
++      { u64 x71 = (x44 + (x54 << 0x4));
++      { u64 x72 = (x71 + (x54 << 0x1));
++      { u64 x73 = (x72 + x54);
++      { u64 x74 = (x43 + (x53 << 0x4));
++      { u64 x75 = (x74 + (x53 << 0x1));
++      { u64 x76 = (x75 + x53);
++      { u64 x77 = (x42 + (x52 << 0x4));
++      { u64 x78 = (x77 + (x52 << 0x1));
++      { u64 x79 = (x78 + x52);
++      { u64 x80 = (x41 + (x51 << 0x4));
++      { u64 x81 = (x80 + (x51 << 0x1));
++      { u64 x82 = (x81 + x51);
++      { u64 x83 = (x40 + (x50 << 0x4));
++      { u64 x84 = (x83 + (x50 << 0x1));
++      { u64 x85 = (x84 + x50);
++      { u64 x86 = (x85 >> 0x1a);
++      { u32 x87 = ((u32)x85 & 0x3ffffff);
++      { u64 x88 = (x86 + x82);
++      { u64 x89 = (x88 >> 0x19);
++      { u32 x90 = ((u32)x88 & 0x1ffffff);
++      { u64 x91 = (x89 + x79);
++      { u64 x92 = (x91 >> 0x1a);
++      { u32 x93 = ((u32)x91 & 0x3ffffff);
++      { u64 x94 = (x92 + x76);
++      { u64 x95 = (x94 >> 0x19);
++      { u32 x96 = ((u32)x94 & 0x1ffffff);
++      { u64 x97 = (x95 + x73);
++      { u64 x98 = (x97 >> 0x1a);
++      { u32 x99 = ((u32)x97 & 0x3ffffff);
++      { u64 x100 = (x98 + x70);
++      { u64 x101 = (x100 >> 0x19);
++      { u32 x102 = ((u32)x100 & 0x1ffffff);
++      { u64 x103 = (x101 + x67);
++      { u64 x104 = (x103 >> 0x1a);
++      { u32 x105 = ((u32)x103 & 0x3ffffff);
++      { u64 x106 = (x104 + x64);
++      { u64 x107 = (x106 >> 0x19);
++      { u32 x108 = ((u32)x106 & 0x1ffffff);
++      { u64 x109 = (x107 + x61);
++      { u64 x110 = (x109 >> 0x1a);
++      { u32 x111 = ((u32)x109 & 0x3ffffff);
++      { u64 x112 = (x110 + x49);
++      { u64 x113 = (x112 >> 0x19);
++      { u32 x114 = ((u32)x112 & 0x1ffffff);
++      { u64 x115 = (x87 + (0x13 * x113));
++      { u32 x116 = (u32) (x115 >> 0x1a);
++      { u32 x117 = ((u32)x115 & 0x3ffffff);
++      { u32 x118 = (x116 + x90);
++      { u32 x119 = (x118 >> 0x19);
++      { u32 x120 = (x118 & 0x1ffffff);
++      out[0] = x117;
++      out[1] = x120;
++      out[2] = (x119 + x93);
++      out[3] = x96;
++      out[4] = x99;
++      out[5] = x102;
++      out[6] = x105;
++      out[7] = x108;
++      out[8] = x111;
++      out[9] = x114;
++      }}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}
++}
++
++static __always_inline void fe_mul121666(fe *h, const fe_loose *f)
++{ /* Typed wrapper: hands the limb arrays of f and h to fe_mul_121666_impl(). */
++      fe_mul_121666_impl(h->v, f->v);
++}
++
++void curve25519_generic(u8 out[CURVE25519_KEY_SIZE],
++                      const u8 scalar[CURVE25519_KEY_SIZE],
++                      const u8 point[CURVE25519_KEY_SIZE])
++{
++      fe x1, x2, z2, x3, z3;
++      fe_loose x2l, z2l, x3l;
++      unsigned swap = 0; /* conditional swap still owed from the previous ladder iteration */
++      int pos;
++      u8 e[32];
++
++      memcpy(e, scalar, 32); /* work on a local copy; the caller's scalar is const */
++      curve25519_clamp_secret(e);
++
++      /* The following implementation was transcribed to Coq and proven to
++       * correspond to unary scalar multiplication in affine coordinates given
++       * that x1 != 0 is the x coordinate of some point on the curve. It was
++       * also checked in Coq that doing a ladderstep with x1 = x3 = 0 gives
++       * z2' = z3' = 0, and z2 = z3 = 0 gives z2' = z3' = 0. The statement was
++       * quantified over the underlying field, so it applies to Curve25519
++       * itself and the quadratic twist of Curve25519. It was not proven in
++       * Coq that prime-field arithmetic correctly simulates extension-field
++       * arithmetic on prime-field values. The decoding of the byte array
++       * representation of e was not considered.
++       *
++       * Specification of Montgomery curves in affine coordinates:
++       * <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Spec/MontgomeryCurve.v#L27>
++       *
++       * Proof that these form a group that is isomorphic to a Weierstrass
++       * curve:
++       * <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/AffineProofs.v#L35>
++       *
++       * Coq transcription and correctness proof of the loop
++       * (where scalarbits=255):
++       * <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZ.v#L118>
++       * <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZProofs.v#L278>
++       * preconditions: 0 <= e < 2^255 (not necessarily e < order),
++       * fe_invert(0) = 0
++       */
++      fe_frombytes(&x1, point);
++      fe_1(&x2); /* presumably (x2:z2) = (1:0) and (x3:z3) = (x1:1), going by the fe_1/fe_0 names */
++      fe_0(&z2);
++      fe_copy(&x3, &x1);
++      fe_1(&z3);
++
++      for (pos = 254; pos >= 0; --pos) { /* bits 254 .. 0 of e, most significant first */
++              fe tmp0, tmp1; /* NOTE(review): the loop-local temporaries are not passed to memzero_explicit() below */
++              fe_loose tmp0l, tmp1l;
++              /* loop invariant as of right before the test, for the case
++               * where x1 != 0:
++               *   pos >= -1; if z2 = 0 then x2 is nonzero; if z3 = 0 then x3
++               *   is nonzero
++               *   let r := e >> (pos+1) in the following equalities of
++               *   projective points:
++               *   to_xz (r*P)     === if swap then (x3, z3) else (x2, z2)
++               *   to_xz ((r+1)*P) === if swap then (x2, z2) else (x3, z3)
++               *   x1 is the nonzero x coordinate of the nonzero
++               *   point (r*P-(r+1)*P)
++               */
++              unsigned b = 1 & (e[pos / 8] >> (pos & 7)); /* bit number pos of e (byte pos / 8, bit pos % 8) */
++              swap ^= b; /* 1 iff this bit differs from the previous one */
++              fe_cswap(&x2, &x3, swap);
++              fe_cswap(&z2, &z3, swap);
++              swap = b; /* remember this bit for the next iteration / the final swap */
++              /* Coq transcription of ladderstep formula (called from
++               * transcribed loop):
++               * <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZ.v#L89>
++               * <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZProofs.v#L131>
++               * x1 != 0 <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZProofs.v#L217>
++               * x1  = 0 <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZProofs.v#L147>
++               */
++              fe_sub(&tmp0l, &x3, &z3);
++              fe_sub(&tmp1l, &x2, &z2);
++              fe_add(&x2l, &x2, &z2);
++              fe_add(&z2l, &x3, &z3);
++              fe_mul_tll(&z3, &tmp0l, &x2l);
++              fe_mul_tll(&z2, &z2l, &tmp1l);
++              fe_sq_tl(&tmp0, &tmp1l);
++              fe_sq_tl(&tmp1, &x2l);
++              fe_add(&x3l, &z3, &z2);
++              fe_sub(&z2l, &z3, &z2);
++              fe_mul_ttt(&x2, &tmp1, &tmp0);
++              fe_sub(&tmp1l, &tmp1, &tmp0);
++              fe_sq_tl(&z2, &z2l);
++              fe_mul121666(&z3, &tmp1l);
++              fe_sq_tl(&x3, &x3l);
++              fe_add(&tmp0l, &tmp0, &z3);
++              fe_mul_ttt(&z3, &x1, &z2);
++              fe_mul_tll(&z2, &tmp1l, &tmp0l);
++      }
++      /* here pos=-1, so r=e, so to_xz (e*P) === if swap then (x3, z3)
++       * else (x2, z2)
++       */
++      fe_cswap(&x2, &x3, swap); /* apply the swap left pending by the last iteration */
++      fe_cswap(&z2, &z3, swap);
++
++      fe_invert(&z2, &z2);
++      fe_mul_ttt(&x2, &x2, &z2); /* x2 = x2 * (inverted z2) */
++      fe_tobytes(out, &x2);
++
++      memzero_explicit(&x1, sizeof(x1)); /* wipe the function-scope working state before returning */
++      memzero_explicit(&x2, sizeof(x2));
++      memzero_explicit(&z2, sizeof(z2));
++      memzero_explicit(&x3, sizeof(x3));
++      memzero_explicit(&z3, sizeof(z3));
++      memzero_explicit(&x2l, sizeof(x2l));
++      memzero_explicit(&z2l, sizeof(z2l));
++      memzero_explicit(&x3l, sizeof(x3l));
++      memzero_explicit(&e, sizeof(e)); /* e is the clamped copy of the secret scalar */
++}
+--- /dev/null
++++ b/lib/crypto/curve25519-hacl64.c
+@@ -0,0 +1,788 @@
++// SPDX-License-Identifier: GPL-2.0 OR MIT
++/*
++ * Copyright (C) 2016-2017 INRIA and Microsoft Corporation.
++ * Copyright (C) 2018-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
++ *
++ * This is a machine-generated formally verified implementation of Curve25519
++ * ECDH from: <https://github.com/mitls/hacl-star>. Though originally machine
++ * generated, it has been tweaked to be suitable for use in the kernel. It is
++ * optimized for 64-bit machines that can efficiently work with 128-bit
++ * integer types.
++ */
++
++#include <asm/unaligned.h>
++#include <crypto/curve25519.h>
++#include <linux/string.h>
++
++typedef __uint128_t u128;
++
++static __always_inline u64 u64_eq_mask(u64 a, u64 b)
++{ /* Returns all-ones when a == b and 0 otherwise, using only bit operations. */
++      u64 x = a ^ b; /* zero iff a == b */
++      u64 minus_x = ~x + (u64)1U; /* two's-complement negation of x */
++      u64 x_or_minus_x = x | minus_x; /* top bit is set iff x != 0 */
++      u64 xnx = x_or_minus_x >> (u32)63U; /* 1 if a != b, 0 if a == b */
++      u64 c = xnx - (u64)1U; /* 1 -> 0, 0 -> 0xffffffffffffffff */
++      return c;
++}
++
++static __always_inline u64 u64_gte_mask(u64 a, u64 b)
++{ /* Returns all-ones when a >= b (unsigned) and 0 otherwise, without a comparison operator. */
++      u64 x = a;
++      u64 y = b;
++      u64 x_xor_y = x ^ y;
++      u64 x_sub_y = x - y;
++      u64 x_sub_y_xor_y = x_sub_y ^ y;
++      u64 q = x_xor_y | x_sub_y_xor_y;
++      u64 x_xor_q = x ^ q; /* top bit is the borrow of x - y, i.e. set iff x < y */
++      u64 x_xor_q_ = x_xor_q >> (u32)63U; /* 1 if a < b, 0 if a >= b */
++      u64 c = x_xor_q_ - (u64)1U; /* 1 -> 0, 0 -> 0xffffffffffffffff */
++      return c;
++}
++
++static __always_inline void modulo_carry_top(u64 *b)
++{ /* Keeps the low 51 bits of b[4] and adds 19 times the bits above them to b[0]. */
++      u64 b4 = b[4];
++      u64 b0 = b[0];
++      u64 b4_ = b4 & 0x7ffffffffffffLLU; /* 51-bit mask */
++      u64 b0_ = b0 + 19 * (b4 >> 51); /* overflow of the top limb wraps to limb 0 scaled by 19 */
++      b[4] = b4_;
++      b[0] = b0_;
++}
++
++static __always_inline void fproduct_copy_from_wide_(u64 *output, u128 *input)
++{ /* Copies input[0..4] to output[0..4], keeping only the low 64 bits of each u128. */
++      {
++              u128 xi = input[0];
++              output[0] = ((u64)(xi)); /* the cast truncates; callers have to carry beforehand */
++      }
++      {
++              u128 xi = input[1];
++              output[1] = ((u64)(xi));
++      }
++      {
++              u128 xi = input[2];
++              output[2] = ((u64)(xi));
++      }
++      {
++              u128 xi = input[3];
++              output[3] = ((u64)(xi));
++      }
++      {
++              u128 xi = input[4];
++              output[4] = ((u64)(xi));
++      }
++}
++
++static __always_inline void
++fproduct_sum_scalar_multiplication_(u128 *output, u64 *input, u64 s)
++{ /* output[i] += input[i] * s for i = 0..4; no carrying is done here. */
++      output[0] += (u128)input[0] * s; /* the cast makes the product 128 bits wide */
++      output[1] += (u128)input[1] * s;
++      output[2] += (u128)input[2] * s;
++      output[3] += (u128)input[3] * s;
++      output[4] += (u128)input[4] * s;
++}
++
++static __always_inline void fproduct_carry_wide_(u128 *tmp)
++{ /* One carry pass over tmp[0..3]; tmp[4] only receives a carry and is not masked here. */
++      {
++              u32 ctr = 0;
++              u128 tctr = tmp[ctr];
++              u128 tctrp1 = tmp[ctr + 1];
++              u64 r0 = ((u64)(tctr)) & 0x7ffffffffffffLLU; /* low 51 bits stay in this limb */
++              u128 c = ((tctr) >> (51)); /* everything above bit 50 moves to the next limb */
++              tmp[ctr] = ((u128)(r0));
++              tmp[ctr + 1] = ((tctrp1) + (c));
++      }
++      {
++              u32 ctr = 1;
++              u128 tctr = tmp[ctr];
++              u128 tctrp1 = tmp[ctr + 1];
++              u64 r0 = ((u64)(tctr)) & 0x7ffffffffffffLLU;
++              u128 c = ((tctr) >> (51));
++              tmp[ctr] = ((u128)(r0));
++              tmp[ctr + 1] = ((tctrp1) + (c));
++      }
++
++      {
++              u32 ctr = 2;
++              u128 tctr = tmp[ctr];
++              u128 tctrp1 = tmp[ctr + 1];
++              u64 r0 = ((u64)(tctr)) & 0x7ffffffffffffLLU;
++              u128 c = ((tctr) >> (51));
++              tmp[ctr] = ((u128)(r0));
++              tmp[ctr + 1] = ((tctrp1) + (c));
++      }
++      {
++              u32 ctr = 3;
++              u128 tctr = tmp[ctr];
++              u128 tctrp1 = tmp[ctr + 1];
++              u64 r0 = ((u64)(tctr)) & 0x7ffffffffffffLLU;
++              u128 c = ((tctr) >> (51));
++              tmp[ctr] = ((u128)(r0));
++              tmp[ctr + 1] = ((tctrp1) + (c));
++      }
++}
++
++static __always_inline void fmul_shift_reduce(u64 *output)
++{ /* Moves limb i to limb i + 1 (i = 3..0) and sets limb 0 to 19 times the old limb 4. */
++      u64 tmp = output[4]; /* saved before it is overwritten by the shift */
++      u64 b0;
++      {
++              u32 ctr = 5 - 0 - 1; /* ctr = 4: output[4] = output[3] */
++              u64 z = output[ctr - 1];
++              output[ctr] = z;
++      }
++      {
++              u32 ctr = 5 - 1 - 1; /* ctr = 3 */
++              u64 z = output[ctr - 1];
++              output[ctr] = z;
++      }
++      {
++              u32 ctr = 5 - 2 - 1; /* ctr = 2 */
++              u64 z = output[ctr - 1];
++              output[ctr] = z;
++      }
++      {
++              u32 ctr = 5 - 3 - 1; /* ctr = 1 */
++              u64 z = output[ctr - 1];
++              output[ctr] = z;
++      }
++      output[0] = tmp;
++      b0 = output[0];
++      output[0] = 19 * b0; /* the wrapped-around limb is scaled by 19 */
++}
++
++static __always_inline void fmul_mul_shift_reduce_(u128 *output, u64 *input,
++                                                 u64 *input21)
++{ /* Accumulates input * input21[i] into output for i = 0..4; input is overwritten in the process. */
++      u32 i;
++      u64 input2i; /* used only for the final step; the blocks below declare their own input2i */
++      {
++              u64 input2i = input21[0];
++              fproduct_sum_scalar_multiplication_(output, input, input2i);
++              fmul_shift_reduce(input); /* shift input up one limb before the next multiplier limb */
++      }
++      {
++              u64 input2i = input21[1];
++              fproduct_sum_scalar_multiplication_(output, input, input2i);
++              fmul_shift_reduce(input);
++      }
++      {
++              u64 input2i = input21[2];
++              fproduct_sum_scalar_multiplication_(output, input, input2i);
++              fmul_shift_reduce(input);
++      }
++      {
++              u64 input2i = input21[3];
++              fproduct_sum_scalar_multiplication_(output, input, input2i);
++              fmul_shift_reduce(input);
++      }
++      i = 4;
++      input2i = input21[i];
++      fproduct_sum_scalar_multiplication_(output, input, input2i); /* last limb: no shift afterwards */
++}
++
++static __always_inline void fmul_fmul(u64 *output, u64 *input, u64 *input21)
++{ /* output = input * input21 as 5-limb values, followed by carry and top-limb folding. */
++      u64 tmp[5] = { input[0], input[1], input[2], input[3], input[4] }; /* private copy: the helper below overwrites its input */
++      {
++              u128 b4;
++              u128 b0;
++              u128 b4_;
++              u128 b0_;
++              u64 i0;
++              u64 i1;
++              u64 i0_;
++              u64 i1_;
++              u128 t[5] = { 0 }; /* accumulator must start at zero */
++              fmul_mul_shift_reduce_(t, tmp, input21);
++              fproduct_carry_wide_(t);
++              b4 = t[4];
++              b0 = t[0];
++              b4_ = ((b4) & (((u128)(0x7ffffffffffffLLU))));
++              b0_ = ((b0) + (((u128)(19) * (((u64)(((b4) >> (51)))))))); /* fold overflow of limb 4 into limb 0, times 19 */
++              t[4] = b4_;
++              t[0] = b0_;
++              fproduct_copy_from_wide_(output, t); /* output is first written here, after all reads of the inputs */
++              i0 = output[0];
++              i1 = output[1];
++              i0_ = i0 & 0x7ffffffffffffLLU;
++              i1_ = i1 + (i0 >> 51); /* one more carry from limb 0 into limb 1 */
++              output[0] = i0_;
++              output[1] = i1_;
++      }
++}
++
++static __always_inline void fsquare_fsquare__(u128 *tmp, u64 *output)
++{ /* tmp[0..4] = the five 128-bit column sums of output squared; output is only read. */
++      u64 r0 = output[0];
++      u64 r1 = output[1];
++      u64 r2 = output[2];
++      u64 r3 = output[3];
++      u64 r4 = output[4];
++      u64 d0 = r0 * 2; /* doubled limbs cover the two symmetric cross terms at once */
++      u64 d1 = r1 * 2;
++      u64 d2 = r2 * 2 * 19; /* factor 19: this cross term lands in a wrapped-around column */
++      u64 d419 = r4 * 19;
++      u64 d4 = d419 * 2;
++      u128 s0 = ((((((u128)(r0) * (r0))) + (((u128)(d4) * (r1))))) +
++                 (((u128)(d2) * (r3))));
++      u128 s1 = ((((((u128)(d0) * (r1))) + (((u128)(d4) * (r2))))) +
++                 (((u128)(r3 * 19) * (r3))));
++      u128 s2 = ((((((u128)(d0) * (r2))) + (((u128)(r1) * (r1))))) +
++                 (((u128)(d4) * (r3))));
++      u128 s3 = ((((((u128)(d0) * (r3))) + (((u128)(d1) * (r2))))) +
++                 (((u128)(r4) * (d419))));
++      u128 s4 = ((((((u128)(d0) * (r4))) + (((u128)(d1) * (r3))))) +
++                 (((u128)(r2) * (r2))));
++      tmp[0] = s0;
++      tmp[1] = s1;
++      tmp[2] = s2;
++      tmp[3] = s3;
++      tmp[4] = s4;
++}
++
++static __always_inline void fsquare_fsquare_(u128 *tmp, u64 *output)
++{ /* Squares output in place; tmp is 5-element scratch that is fully written before it is read. */
++      u128 b4;
++      u128 b0;
++      u128 b4_;
++      u128 b0_;
++      u64 i0;
++      u64 i1;
++      u64 i0_;
++      u64 i1_;
++      fsquare_fsquare__(tmp, output);
++      fproduct_carry_wide_(tmp);
++      b4 = tmp[4];
++      b0 = tmp[0];
++      b4_ = ((b4) & (((u128)(0x7ffffffffffffLLU))));
++      b0_ = ((b0) + (((u128)(19) * (((u64)(((b4) >> (51)))))))); /* fold overflow of limb 4 into limb 0, times 19 */
++      tmp[4] = b4_;
++      tmp[0] = b0_;
++      fproduct_copy_from_wide_(output, tmp);
++      i0 = output[0];
++      i1 = output[1];
++      i0_ = i0 & 0x7ffffffffffffLLU;
++      i1_ = i1 + (i0 >> 51); /* one more carry from limb 0 into limb 1 */
++      output[0] = i0_;
++      output[1] = i1_;
++}
++
++static __always_inline void fsquare_fsquare_times_(u64 *output, u128 *tmp,
++                                                 u32 count1)
++{ /* Squares output in place count1 times, using tmp as scratch. */
++      u32 i;
++      fsquare_fsquare_(tmp, output); /* unconditional: count1 == 0 still squares once */
++      for (i = 1; i < count1; ++i)
++              fsquare_fsquare_(tmp, output);
++}
++
++static __always_inline void fsquare_fsquare_times(u64 *output, u64 *input,
++                                                u32 count1)
++{ /* output = input squared count1 times, i.e. input^(2^count1); input is left untouched. */
++      u128 t[5]; /* scratch; written by the squaring helper before any read */
++      memcpy(output, input, 5 * sizeof(*input));
++      fsquare_fsquare_times_(output, t, count1);
++}
++
++static __always_inline void fsquare_fsquare_times_inplace(u64 *output,
++                                                        u32 count1)
++{ /* Squares output in place count1 times, providing the scratch buffer locally. */
++      u128 t[5]; /* scratch; written by the squaring helper before any read */
++      fsquare_fsquare_times_(output, t, count1);
++}
++
++static __always_inline void crecip_crecip(u64 *out, u64 *z)
++{ /* out = z^(2^255 - 21) via a fixed square-and-multiply chain; 2^255 - 21 == (2^255 - 19) - 2. */
++      u64 buf[20] = { 0 }; /* four 5-limb temporaries at offsets 0, 5, 10 and 15 */
++      u64 *a0 = buf;
++      u64 *t00 = buf + 5;
++      u64 *b0 = buf + 10;
++      u64 *t01;
++      u64 *b1;
++      u64 *c0;
++      u64 *a;
++      u64 *t0;
++      u64 *b;
++      u64 *c;
++      fsquare_fsquare_times(a0, z, 1); /* a0 = z^2 */
++      fsquare_fsquare_times(t00, a0, 2); /* t00 = z^8 */
++      fmul_fmul(b0, t00, z); /* b0 = z^9 */
++      fmul_fmul(a0, b0, a0); /* a0 = z^11 */
++      fsquare_fsquare_times(t00, a0, 1); /* t00 = z^22 */
++      fmul_fmul(b0, t00, b0); /* b0 = z^31 = z^(2^5 - 1) */
++      fsquare_fsquare_times(t00, b0, 5); /* t00 = z^(2^10 - 2^5) */
++      t01 = buf + 5; /* same storage as t00 */
++      b1 = buf + 10; /* same storage as b0 */
++      c0 = buf + 15;
++      fmul_fmul(b1, t01, b1); /* b1 = z^(2^10 - 1) */
++      fsquare_fsquare_times(t01, b1, 10); /* t01 = z^(2^20 - 2^10) */
++      fmul_fmul(c0, t01, b1); /* c0 = z^(2^20 - 1) */
++      fsquare_fsquare_times(t01, c0, 20); /* t01 = z^(2^40 - 2^20) */
++      fmul_fmul(t01, t01, c0); /* t01 = z^(2^40 - 1) */
++      fsquare_fsquare_times_inplace(t01, 10); /* t01 = z^(2^50 - 2^10) */
++      fmul_fmul(b1, t01, b1); /* b1 = z^(2^50 - 1) */
++      fsquare_fsquare_times(t01, b1, 50); /* t01 = z^(2^100 - 2^50) */
++      a = buf; /* same storage as a0 (z^11) */
++      t0 = buf + 5; /* same storage as t01 */
++      b = buf + 10; /* same storage as b1 (z^(2^50 - 1)) */
++      c = buf + 15;
++      fmul_fmul(c, t0, b); /* c = z^(2^100 - 1) */
++      fsquare_fsquare_times(t0, c, 100); /* t0 = z^(2^200 - 2^100) */
++      fmul_fmul(t0, t0, c); /* t0 = z^(2^200 - 1) */
++      fsquare_fsquare_times_inplace(t0, 50); /* t0 = z^(2^250 - 2^50) */
++      fmul_fmul(t0, t0, b); /* t0 = z^(2^250 - 1) */
++      fsquare_fsquare_times_inplace(t0, 5); /* t0 = z^(2^255 - 2^5) */
++      fmul_fmul(out, t0, a); /* out = z^(2^255 - 32 + 11) = z^(2^255 - 21) */
++}
++
++static __always_inline void fsum(u64 *a, u64 *b)
++{ /* a[i] += b[i] for i = 0..4; no carry propagation or reduction is done here. */
++      a[0] += b[0];
++      a[1] += b[1];
++      a[2] += b[2];
++      a[3] += b[3];
++      a[4] += b[4];
++}
++
++static __always_inline void fdifference(u64 *a, u64 *b)
++{ /* a = b - a (note the operand order), limb by limb; b is not modified. */
++      u64 tmp[5] = { 0 };
++      u64 b0;
++      u64 b1;
++      u64 b2;
++      u64 b3;
++      u64 b4;
++      memcpy(tmp, b, 5 * sizeof(*b));
++      b0 = tmp[0];
++      b1 = tmp[1];
++      b2 = tmp[2];
++      b3 = tmp[3];
++      b4 = tmp[4];
++      tmp[0] = b0 + 0x3fffffffffff68LLU; /* 8 * (2^51 - 19): bias so the limb subtraction below need not wrap */
++      tmp[1] = b1 + 0x3ffffffffffff8LLU; /* 8 * (2^51 - 1); together the five biases are the limbs of 8 * (2^255 - 19) */
++      tmp[2] = b2 + 0x3ffffffffffff8LLU;
++      tmp[3] = b3 + 0x3ffffffffffff8LLU;
++      tmp[4] = b4 + 0x3ffffffffffff8LLU;
++      {
++              u64 xi = a[0];
++              u64 yi = tmp[0];
++              a[0] = yi - xi; /* assumes a's limb does not exceed the biased b limb — bound not checked here */
++      }
++      {
++              u64 xi = a[1];
++              u64 yi = tmp[1];
++              a[1] = yi - xi;
++      }
++      {
++              u64 xi = a[2];
++              u64 yi = tmp[2];
++              a[2] = yi - xi;
++      }
++      {
++              u64 xi = a[3];
++              u64 yi = tmp[3];
++              a[3] = yi - xi;
++      }
++      {
++              u64 xi = a[4];
++              u64 yi = tmp[4];
++              a[4] = yi - xi;
++      }
++}
++
++static __always_inline void fscalar(u64 *output, u64 *b, u64 s)
++{ /* output = b * s for a 64-bit scalar s, followed by one carry pass and top-limb folding. */
++      u128 tmp[5];
++      u128 b4;
++      u128 b0;
++      u128 b4_;
++      u128 b0_;
++      {
++              u64 xi = b[0];
++              tmp[0] = ((u128)(xi) * (s)); /* 128-bit product of one limb and s */
++      }
++      {
++              u64 xi = b[1];
++              tmp[1] = ((u128)(xi) * (s));
++      }
++      {
++              u64 xi = b[2];
++              tmp[2] = ((u128)(xi) * (s));
++      }
++      {
++              u64 xi = b[3];
++              tmp[3] = ((u128)(xi) * (s));
++      }
++      {
++              u64 xi = b[4];
++              tmp[4] = ((u128)(xi) * (s));
++      }
++      fproduct_carry_wide_(tmp);
++      b4 = tmp[4];
++      b0 = tmp[0];
++      b4_ = ((b4) & (((u128)(0x7ffffffffffffLLU))));
++      b0_ = ((b0) + (((u128)(19) * (((u64)(((b4) >> (51)))))))); /* fold overflow of limb 4 into limb 0, times 19 */
++      tmp[4] = b4_;
++      tmp[0] = b0_;
++      fproduct_copy_from_wide_(output, tmp); /* unlike fmul_fmul, no extra limb 0 -> limb 1 carry follows */
++}
++
++static __always_inline void fmul(u64 *output, u64 *a, u64 *b)
++{ /* Thin alias for fmul_fmul(): output = a * b. */
++      fmul_fmul(output, a, b);
++}
++
++static __always_inline void crecip(u64 *output, u64 *input)
++{ /* Thin alias for crecip_crecip(). */
++      crecip_crecip(output, input);
++}
++
++static __always_inline void point_swap_conditional_step(u64 *a, u64 *b,
++                                                      u64 swap1, u32 ctr)
++{ /* Swaps a[ctr - 1] and b[ctr - 1] when swap1 is all-ones; leaves them as-is when swap1 is 0. */
++      u32 i = ctr - 1; /* ctr is 1-based */
++      u64 ai = a[i];
++      u64 bi = b[i];
++      u64 x = swap1 & (ai ^ bi); /* ai ^ bi when swapping, 0 otherwise */
++      u64 ai1 = ai ^ x;
++      u64 bi1 = bi ^ x;
++      a[i] = ai1;
++      b[i] = bi1;
++}
++
++static __always_inline void point_swap_conditional5(u64 *a, u64 *b, u64 swap1)
++{ /* Applies the masked swap to limbs 4 down to 0 of a and b. */
++      point_swap_conditional_step(a, b, swap1, 5);
++      point_swap_conditional_step(a, b, swap1, 4);
++      point_swap_conditional_step(a, b, swap1, 3);
++      point_swap_conditional_step(a, b, swap1, 2);
++      point_swap_conditional_step(a, b, swap1, 1);
++}
++
++static __always_inline void point_swap_conditional(u64 *a, u64 *b, u64 iswap)
++{ /* Swaps the 10-limb points a and b when iswap is 1; no change when iswap is 0. */
++      u64 swap1 = 0 - iswap; /* 0 -> 0, 1 -> all-ones mask */
++      point_swap_conditional5(a, b, swap1); /* first 5 limbs */
++      point_swap_conditional5(a + 5, b + 5, swap1); /* second 5 limbs */
++}
++
++static __always_inline void point_copy(u64 *output, u64 *input)
++{ /* Copies a 10-limb point as two 5-limb halves. */
++      memcpy(output, input, 5 * sizeof(*input));
++      memcpy(output + 5, input + 5, 5 * sizeof(*input));
++}
++
++static __always_inline void addanddouble_fmonty(u64 *pp, u64 *ppq, u64 *p,
++                                              u64 *pq, u64 *qmqp)
++{ /* One ladder step: writes (x2,z2) to pp and (x3,z3) to ppq; overwrites p and pq; reads 5 limbs of qmqp. */
++      u64 *qx = qmqp;
++      u64 *x2 = pp;
++      u64 *z2 = pp + 5;
++      u64 *x3 = ppq;
++      u64 *z3 = ppq + 5;
++      u64 *x = p;
++      u64 *z = p + 5;
++      u64 *xprime = pq;
++      u64 *zprime = pq + 5;
++      u64 buf[40] = { 0 }; /* eight 5-limb temporaries, addressed by the offsets below */
++      u64 *origx = buf;
++      u64 *origxprime0 = buf + 5;
++      u64 *xxprime0;
++      u64 *zzprime0;
++      u64 *origxprime;
++      xxprime0 = buf + 25;
++      zzprime0 = buf + 30;
++      memcpy(origx, x, 5 * sizeof(*x));
++      fsum(x, z); /* x = x + z */
++      fdifference(z, origx); /* z = (old x) - z */
++      memcpy(origxprime0, xprime, 5 * sizeof(*xprime));
++      fsum(xprime, zprime); /* xprime = xprime + zprime */
++      fdifference(zprime, origxprime0); /* zprime = (old xprime) - zprime */
++      fmul(xxprime0, xprime, z);
++      fmul(zzprime0, x, zprime);
++      origxprime = buf + 5; /* reuses the slot of origxprime0 */
++      {
++              u64 *xx0;
++              u64 *zz0;
++              u64 *xxprime;
++              u64 *zzprime;
++              u64 *zzzprime;
++              xx0 = buf + 15;
++              zz0 = buf + 20;
++              xxprime = buf + 25; /* same storage as xxprime0 */
++              zzprime = buf + 30; /* same storage as zzprime0 */
++              zzzprime = buf + 35;
++              memcpy(origxprime, xxprime, 5 * sizeof(*xxprime));
++              fsum(xxprime, zzprime); /* xxprime = xxprime + zzprime */
++              fdifference(zzprime, origxprime); /* zzprime = (old xxprime) - zzprime */
++              fsquare_fsquare_times(x3, xxprime, 1); /* x3 = xxprime^2 */
++              fsquare_fsquare_times(zzzprime, zzprime, 1);
++              fmul(z3, zzzprime, qx); /* z3 = zzprime^2 * qx */
++              fsquare_fsquare_times(xx0, x, 1); /* xx0 = (x + z)^2 in terms of the input p */
++              fsquare_fsquare_times(zz0, z, 1); /* zz0 = (x - z)^2 in terms of the input p */
++              {
++                      u64 *zzz;
++                      u64 *xx;
++                      u64 *zz;
++                      u64 scalar;
++                      zzz = buf + 10;
++                      xx = buf + 15; /* same storage as xx0 */
++                      zz = buf + 20; /* same storage as zz0 */
++                      fmul(x2, xx, zz); /* x2 = xx * zz */
++                      fdifference(zz, xx); /* zz = xx - zz */
++                      scalar = 121665;
++                      fscalar(zzz, zz, scalar); /* zzz = 121665 * zz */
++                      fsum(zzz, xx); /* zzz = xx + 121665 * zz */
++                      fmul(z2, zzz, zz); /* z2 = zz * (xx + 121665 * zz) */
++              }
++      }
++}
++
++static __always_inline void
++ladder_smallloop_cmult_small_loop_step(u64 *nq, u64 *nqpq, u64 *nq2, u64 *nqpq2,
++                                     u64 *q, u8 byt)
++{ /* One ladder step driven by the top bit of byt: swap inputs, step into nq2/nqpq2, swap outputs. */
++      u64 bit0 = (u64)(byt >> 7); /* most significant bit of byt, 0 or 1 */
++      u64 bit;
++      point_swap_conditional(nq, nqpq, bit0);
++      addanddouble_fmonty(nq2, nqpq2, nq, nqpq, q); /* nq and nqpq are overwritten by this call */
++      bit = (u64)(byt >> 7); /* same value as bit0 */
++      point_swap_conditional(nq2, nqpq2, bit);
++}
++
++static __always_inline void
++ladder_smallloop_cmult_small_loop_double_step(u64 *nq, u64 *nqpq, u64 *nq2,
++                                            u64 *nqpq2, u64 *q, u8 byt)
++{ /* Two ladder steps for the top two bits of byt; the result ends up back in nq/nqpq. */
++      u8 byt1;
++      ladder_smallloop_cmult_small_loop_step(nq, nqpq, nq2, nqpq2, q, byt);
++      byt1 = byt << 1; /* bring the next bit into the top position */
++      ladder_smallloop_cmult_small_loop_step(nq2, nqpq2, nq, nqpq, q, byt1); /* buffers swapped: output goes to nq/nqpq */
++}
++
++static __always_inline void
++ladder_smallloop_cmult_small_loop(u64 *nq, u64 *nqpq, u64 *nq2, u64 *nqpq2,
++                                u64 *q, u8 byt, u32 i)
++{ /* Runs i double steps, consuming 2 * i bits of byt from the top down. */
++      while (i--) {
++              ladder_smallloop_cmult_small_loop_double_step(nq, nqpq, nq2,
++                                                            nqpq2, q, byt);
++              byt <<= 2; /* drop the two bits just processed */
++      }
++}
++
++static __always_inline void ladder_bigloop_cmult_big_loop(u8 *n1, u64 *nq,
++                                                        u64 *nqpq, u64 *nq2,
++                                                        u64 *nqpq2, u64 *q,
++                                                        u32 i)
++{ /* Processes bytes n1[i - 1] down to n1[0], eight ladder steps per byte. */
++      while (i--) {
++              u8 byte = n1[i]; /* i was already decremented by the loop test */
++              ladder_smallloop_cmult_small_loop(nq, nqpq, nq2, nqpq2, q,
++                                                byte, 4); /* 4 double steps = 8 bits */
++      }
++}
++
++static void ladder_cmult(u64 *result, u8 *n1, u64 *q)
++{ /* Runs the ladder over the 32 bytes of n1 starting from point q; writes the 10-limb result point. */
++      u64 point_buf[40] = { 0 }; /* four 10-limb points */
++      u64 *nq = point_buf;
++      u64 *nqpq = point_buf + 10;
++      u64 *nq2 = point_buf + 20;
++      u64 *nqpq2 = point_buf + 30;
++      point_copy(nqpq, q);
++      nq[0] = 1; /* with the zero-initialised buffer this makes nq = (x = 1, z = 0) */
++      ladder_bigloop_cmult_big_loop(n1, nq, nqpq, nq2, nqpq2, q, 32);
++      point_copy(result, nq);
++}
++
++static __always_inline void format_fexpand(u64 *output, const u8 *input)
++{ /* Unpacks bits 0..254 of the little-endian input into five 51-bit limbs; bit 255 is dropped. */
++      const u8 *x00 = input + 6; /* limb i starts at bit 51 * i: byte offsets 0, 6, 12, 19, 24 */
++      const u8 *x01 = input + 12;
++      const u8 *x02 = input + 19;
++      const u8 *x0 = input + 24; /* 24 + 8 = 32: the last load ends exactly at the end of the input */
++      u64 i0, i1, i2, i3, i4, output0, output1, output2, output3, output4;
++      i0 = get_unaligned_le64(input);
++      i1 = get_unaligned_le64(x00);
++      i2 = get_unaligned_le64(x01);
++      i3 = get_unaligned_le64(x02);
++      i4 = get_unaligned_le64(x0);
++      output0 = i0 & 0x7ffffffffffffLLU;
++      output1 = i1 >> 3 & 0x7ffffffffffffLLU; /* 51 = 6 * 8 + 3 */
++      output2 = i2 >> 6 & 0x7ffffffffffffLLU; /* 102 = 12 * 8 + 6 */
++      output3 = i3 >> 1 & 0x7ffffffffffffLLU; /* 153 = 19 * 8 + 1 */
++      output4 = i4 >> 12 & 0x7ffffffffffffLLU; /* 204 = 24 * 8 + 12 */
++      output[0] = output0;
++      output[1] = output1;
++      output[2] = output2;
++      output[3] = output3;
++      output[4] = output4;
++}
++
++static __always_inline void format_fcontract_first_carry_pass(u64 *input)
++{ /* Ripples carries from limb 0 up to limb 4; limbs 0..3 end below 2^51, limb 4 is not masked. */
++      u64 t0 = input[0];
++      u64 t1 = input[1];
++      u64 t2 = input[2];
++      u64 t3 = input[3];
++      u64 t4 = input[4];
++      u64 t1_ = t1 + (t0 >> 51); /* each limb takes the overflow of the one below it */
++      u64 t0_ = t0 & 0x7ffffffffffffLLU;
++      u64 t2_ = t2 + (t1_ >> 51);
++      u64 t1__ = t1_ & 0x7ffffffffffffLLU;
++      u64 t3_ = t3 + (t2_ >> 51);
++      u64 t2__ = t2_ & 0x7ffffffffffffLLU;
++      u64 t4_ = t4 + (t3_ >> 51);
++      u64 t3__ = t3_ & 0x7ffffffffffffLLU;
++      input[0] = t0_;
++      input[1] = t1__;
++      input[2] = t2__;
++      input[3] = t3__;
++      input[4] = t4_; /* overflow of limb 4 is left for the caller to fold */
++}
++
++static __always_inline void format_fcontract_first_carry_full(u64 *input)
++{ /* Carry pass over limbs 0..4, then fold the overflow of limb 4 into limb 0. */
++      format_fcontract_first_carry_pass(input);
++      modulo_carry_top(input);
++}
++
++static __always_inline void format_fcontract_second_carry_pass(u64 *input)
++{ /* Same statements as format_fcontract_first_carry_pass(): ripple carries from limb 0 to limb 4. */
++      u64 t0 = input[0];
++      u64 t1 = input[1];
++      u64 t2 = input[2];
++      u64 t3 = input[3];
++      u64 t4 = input[4];
++      u64 t1_ = t1 + (t0 >> 51); /* each limb takes the overflow of the one below it */
++      u64 t0_ = t0 & 0x7ffffffffffffLLU;
++      u64 t2_ = t2 + (t1_ >> 51);
++      u64 t1__ = t1_ & 0x7ffffffffffffLLU;
++      u64 t3_ = t3 + (t2_ >> 51);
++      u64 t2__ = t2_ & 0x7ffffffffffffLLU;
++      u64 t4_ = t4 + (t3_ >> 51);
++      u64 t3__ = t3_ & 0x7ffffffffffffLLU;
++      input[0] = t0_;
++      input[1] = t1__;
++      input[2] = t2__;
++      input[3] = t3__;
++      input[4] = t4_; /* overflow of limb 4 is left for the caller to fold */
++}
++
++static __always_inline void format_fcontract_second_carry_full(u64 *input)
++{ /* Second carry pass plus top fold, then one extra carry from limb 0 into limb 1. */
++      u64 i0;
++      u64 i1;
++      u64 i0_;
++      u64 i1_;
++      format_fcontract_second_carry_pass(input);
++      modulo_carry_top(input);
++      i0 = input[0];
++      i1 = input[1];
++      i0_ = i0 & 0x7ffffffffffffLLU;
++      i1_ = i1 + (i0 >> 51); /* the fold above may have pushed limb 0 past 51 bits */
++      input[0] = i0_;
++      input[1] = i1_;
++}
++
++static __always_inline void format_fcontract_trim(u64 *input)
++{ /* Subtracts the limbs of 2^255 - 19 when the limb test below passes, selected by mask rather than by branch. */
++      u64 a0 = input[0];
++      u64 a1 = input[1];
++      u64 a2 = input[2];
++      u64 a3 = input[3];
++      u64 a4 = input[4];
++      u64 mask0 = u64_gte_mask(a0, 0x7ffffffffffedLLU); /* a0 >= 2^51 - 19 */
++      u64 mask1 = u64_eq_mask(a1, 0x7ffffffffffffLLU); /* a1 == 2^51 - 1 */
++      u64 mask2 = u64_eq_mask(a2, 0x7ffffffffffffLLU);
++      u64 mask3 = u64_eq_mask(a3, 0x7ffffffffffffLLU);
++      u64 mask4 = u64_eq_mask(a4, 0x7ffffffffffffLLU);
++      u64 mask = (((mask0 & mask1) & mask2) & mask3) & mask4; /* all-ones only if every limb test passed */
++      u64 a0_ = a0 - (0x7ffffffffffedLLU & mask); /* subtracts 0 when mask is 0 */
++      u64 a1_ = a1 - (0x7ffffffffffffLLU & mask);
++      u64 a2_ = a2 - (0x7ffffffffffffLLU & mask);
++      u64 a3_ = a3 - (0x7ffffffffffffLLU & mask);
++      u64 a4_ = a4 - (0x7ffffffffffffLLU & mask);
++      input[0] = a0_;
++      input[1] = a1_;
++      input[2] = a2_;
++      input[3] = a3_;
++      input[4] = a4_;
++}
++
++static __always_inline void format_fcontract_store(u8 *output, u64 *input)
++{ /* Pack limbs input[0..4] into four 64-bit words and write them to output[0..31]. */
++      u64 t0 = input[0];
++      u64 t1 = input[1];
++      u64 t2 = input[2];
++      u64 t3 = input[3];
++      u64 t4 = input[4]; /* NOTE(review): the ORs below assume every limb fits in 51 bits (as after format_fcontract_trim) -- confirm */
++      u64 o0 = t1 << 51 | t0; /* bits 0..50 from t0, low 13 bits of t1 above them */
++      u64 o1 = t2 << 38 | t1 >> 13; /* remaining 38 bits of t1, then low 26 bits of t2 */
++      u64 o2 = t3 << 25 | t2 >> 26; /* remaining 25 bits of t2, then low 39 bits of t3 */
++      u64 o3 = t4 << 12 | t3 >> 39; /* remaining 12 bits of t3, then t4 in the top 52 bits */
++      u8 *b0 = output;
++      u8 *b1 = output + 8;
++      u8 *b2 = output + 16;
++      u8 *b3 = output + 24;
++      put_unaligned_le64(o0, b0); /* little-endian store; output needs no particular alignment */
++      put_unaligned_le64(o1, b1);
++      put_unaligned_le64(o2, b2);
++      put_unaligned_le64(o3, b3);
++}
++
++static __always_inline void format_fcontract(u8 *output, u64 *input)
++{ /* Normalise the five limbs of input (modified in place), then write 32 bytes to output. */
++      format_fcontract_first_carry_full(input); /* carry round 1 */
++      format_fcontract_second_carry_full(input); /* carry round 2 plus extra limb 0 -> 1 carry */
++      format_fcontract_trim(input); /* conditional subtraction of the 2^255 - 19 limb pattern */
++      format_fcontract_store(output, input); /* pack 5 x 51-bit limbs as 4 little-endian u64 */
++}
++
++static __always_inline void format_scalar_of_point(u8 *scalar, u64 *point)
++{ /* Turn a 10-limb (x, z) pair into 32 output bytes written to scalar. */
++      u64 *x = point; /* limbs 0..4 */
++      u64 *z = point + 5; /* limbs 5..9 */
++      u64 buf[10] __aligned(32) = { 0 }; /* NOTE(review): not wiped before return, unlike the buffers in curve25519_generic() -- confirm intended */
++      u64 *zmone = buf; /* first half of buf: result of crecip() */
++      u64 *sc = buf + 5; /* second half of buf: product handed to format_fcontract() */
++      crecip(zmone, z); /* defined outside this view; presumably the field inverse of z -- confirm */
++      fmul(sc, x, zmone); /* defined outside this view; presumably sc = x * zmone in the field -- confirm */
++      format_fcontract(scalar, sc); /* reduces sc in place and serialises it */
++}
++
++void curve25519_generic(u8 mypublic[CURVE25519_KEY_SIZE],
++                      const u8 secret[CURVE25519_KEY_SIZE],
++                      const u8 basepoint[CURVE25519_KEY_SIZE])
++{ /* Writes to mypublic the result of the ladder on (clamped secret, basepoint); wipes all local buffers. */
++      u64 buf0[10] __aligned(32) = { 0 }; /* input point: x in limbs 0..4, z in limbs 5..9 */
++      u64 *x0 = buf0;
++      u64 *z = buf0 + 5;
++      u64 *q;
++      format_fexpand(x0, basepoint); /* defined outside this view; presumably unpacks the bytes into limbs -- confirm */
++      z[0] = 1; /* remaining z limbs stay 0 from the initialiser, so z = 1 */
++      q = buf0; /* q aliases the whole (x0, z) pair */
++      {
++              u8 e[32] __aligned(32) = { 0 }; /* private copy, so the const secret is never modified */
++              u8 *scalar;
++              memcpy(e, secret, 32);
++              curve25519_clamp_secret(e); /* declared outside this view; adjusts the copy only */
++              scalar = e;
++              {
++                      u64 buf[15] = { 0 }; /* limbs 0..9 are read back below; 10..14 presumably ladder scratch -- confirm */
++                      u64 *nq = buf;
++                      u64 *x = nq;
++                      x[0] = 1; /* nq starts as x = 1 with every other limb 0 */
++                      ladder_cmult(nq, scalar, q); /* defined outside this view; leaves its result in nq */
++                      format_scalar_of_point(mypublic, nq); /* serialise nq's (x, z) into the 32 output bytes */
++                      memzero_explicit(buf, sizeof(buf)); /* wipe ladder state */
++              }
++              memzero_explicit(e, sizeof(e)); /* wipe the clamped secret copy */
++      }
++      memzero_explicit(buf0, sizeof(buf0)); /* wipe the expanded input point */
++}
+--- /dev/null
++++ b/lib/crypto/curve25519.c
+@@ -0,0 +1,25 @@
++// SPDX-License-Identifier: GPL-2.0 OR MIT
++/*
++ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
++ *
++ * This is an implementation of the Curve25519 ECDH algorithm, using either
++ * a 32-bit implementation or a 64-bit implementation with 128-bit integers,
++ * depending on what is supported by the target compiler.
++ *
++ * Information: https://cr.yp.to/ecdh.html
++ */
++
++#include <crypto/curve25519.h>
++#include <linux/module.h>
++#include <linux/init.h>
++
++const u8 curve25519_null_point[CURVE25519_KEY_SIZE] __aligned(32) = { 0 }; /* every byte zero */
++const u8 curve25519_base_point[CURVE25519_KEY_SIZE] __aligned(32) = { 9 }; /* first byte 9, remaining bytes zero */
++
++EXPORT_SYMBOL(curve25519_null_point);
++EXPORT_SYMBOL(curve25519_base_point);
++EXPORT_SYMBOL(curve25519_generic); /* not defined in this file; the implementation lives in a separate source file */
++
++MODULE_LICENSE("GPL v2");
++MODULE_DESCRIPTION("Curve25519 scalar multiplication");
++MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>");
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0026-crypto-curve25519-add-kpp-selftest.patch b/target/linux/generic/backport-5.4/080-wireguard-0026-crypto-curve25519-add-kpp-selftest.patch

new file mode 100644 (file)

index 0000000..66c144e
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0026-crypto-curve25519-add-kpp-selftest.patch
@@ -0,0 +1,1268 @@
+From c8ff08024112b37805ab5b1edbd7e451de35a17d Mon Sep 17 00:00:00 2001
+From: Ard Biesheuvel <ardb@kernel.org>
+Date: Fri, 8 Nov 2019 13:22:33 +0100
+Subject: [PATCH 026/124] crypto: curve25519 - add kpp selftest
+
+commit f613457a7af085728297bef71233c37faf3c01b1 upstream.
+
+In preparation of introducing KPP implementations of Curve25519, import
+the set of test cases proposed by the Zinc patch set, but converted to
+the KPP format.
+
+Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
+Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ crypto/testmgr.c |    6 +
+ crypto/testmgr.h | 1225 ++++++++++++++++++++++++++++++++++++++++++++++
+ 2 files changed, 1231 insertions(+)
+
+--- a/crypto/testmgr.c
++++ b/crypto/testmgr.c
+@@ -4296,6 +4296,12 @@ static const struct alg_test_desc alg_te
+               .test = alg_test_null,
+               .fips_allowed = 1,
+       }, {
++              .alg = "curve25519",
++              .test = alg_test_kpp,
++              .suite = {
++                      .kpp = __VECS(curve25519_tv_template)
++              }
++      }, {
+               .alg = "deflate",
+               .test = alg_test_comp,
+               .fips_allowed = 1,
+--- a/crypto/testmgr.h
++++ b/crypto/testmgr.h
+@@ -1030,6 +1030,1231 @@ static const struct kpp_testvec dh_tv_te
+       }
+ };
+ 
++static const struct kpp_testvec curve25519_tv_template[] = {
++{
++      .secret = (u8[32]){ 0x77, 0x07, 0x6d, 0x0a, 0x73, 0x18, 0xa5, 0x7d,
++                   0x3c, 0x16, 0xc1, 0x72, 0x51, 0xb2, 0x66, 0x45,
++                   0xdf, 0x4c, 0x2f, 0x87, 0xeb, 0xc0, 0x99, 0x2a,
++                   0xb1, 0x77, 0xfb, 0xa5, 0x1d, 0xb9, 0x2c, 0x2a },
++      .b_public = (u8[32]){ 0xde, 0x9e, 0xdb, 0x7d, 0x7b, 0x7d, 0xc1, 0xb4,
++                  0xd3, 0x5b, 0x61, 0xc2, 0xec, 0xe4, 0x35, 0x37,
++                  0x3f, 0x83, 0x43, 0xc8, 0x5b, 0x78, 0x67, 0x4d,
++                  0xad, 0xfc, 0x7e, 0x14, 0x6f, 0x88, 0x2b, 0x4f },
++      .expected_ss = (u8[32]){ 0x4a, 0x5d, 0x9d, 0x5b, 0xa4, 0xce, 0x2d, 0xe1,
++                  0x72, 0x8e, 0x3b, 0xf4, 0x80, 0x35, 0x0f, 0x25,
++                  0xe0, 0x7e, 0x21, 0xc9, 0x47, 0xd1, 0x9e, 0x33,
++                  0x76, 0xf0, 0x9b, 0x3c, 0x1e, 0x16, 0x17, 0x42 },
++      .secret_size = 32,
++      .b_public_size = 32,
++      .expected_ss_size = 32,
++
++},
++{
++      .secret = (u8[32]){ 0x5d, 0xab, 0x08, 0x7e, 0x62, 0x4a, 0x8a, 0x4b,
++                   0x79, 0xe1, 0x7f, 0x8b, 0x83, 0x80, 0x0e, 0xe6,
++                   0x6f, 0x3b, 0xb1, 0x29, 0x26, 0x18, 0xb6, 0xfd,
++                   0x1c, 0x2f, 0x8b, 0x27, 0xff, 0x88, 0xe0, 0xeb },
++      .b_public = (u8[32]){ 0x85, 0x20, 0xf0, 0x09, 0x89, 0x30, 0xa7, 0x54,
++                  0x74, 0x8b, 0x7d, 0xdc, 0xb4, 0x3e, 0xf7, 0x5a,
++                  0x0d, 0xbf, 0x3a, 0x0d, 0x26, 0x38, 0x1a, 0xf4,
++                  0xeb, 0xa4, 0xa9, 0x8e, 0xaa, 0x9b, 0x4e, 0x6a },
++      .expected_ss = (u8[32]){ 0x4a, 0x5d, 0x9d, 0x5b, 0xa4, 0xce, 0x2d, 0xe1,
++                  0x72, 0x8e, 0x3b, 0xf4, 0x80, 0x35, 0x0f, 0x25,
++                  0xe0, 0x7e, 0x21, 0xc9, 0x47, 0xd1, 0x9e, 0x33,
++                  0x76, 0xf0, 0x9b, 0x3c, 0x1e, 0x16, 0x17, 0x42 },
++      .secret_size = 32,
++      .b_public_size = 32,
++      .expected_ss_size = 32,
++
++},
++{
++      .secret = (u8[32]){ 1 },
++      .b_public = (u8[32]){ 0x25, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
++      .expected_ss = (u8[32]){ 0x3c, 0x77, 0x77, 0xca, 0xf9, 0x97, 0xb2, 0x64,
++                  0x41, 0x60, 0x77, 0x66, 0x5b, 0x4e, 0x22, 0x9d,
++                  0x0b, 0x95, 0x48, 0xdc, 0x0c, 0xd8, 0x19, 0x98,
++                  0xdd, 0xcd, 0xc5, 0xc8, 0x53, 0x3c, 0x79, 0x7f },
++      .secret_size = 32,
++      .b_public_size = 32,
++      .expected_ss_size = 32,
++
++},
++{
++      .secret = (u8[32]){ 1 },
++      .b_public = (u8[32]){ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
++      .expected_ss = (u8[32]){ 0xb3, 0x2d, 0x13, 0x62, 0xc2, 0x48, 0xd6, 0x2f,
++                  0xe6, 0x26, 0x19, 0xcf, 0xf0, 0x4d, 0xd4, 0x3d,
++                  0xb7, 0x3f, 0xfc, 0x1b, 0x63, 0x08, 0xed, 0xe3,
++                  0x0b, 0x78, 0xd8, 0x73, 0x80, 0xf1, 0xe8, 0x34 },
++      .secret_size = 32,
++      .b_public_size = 32,
++      .expected_ss_size = 32,
++
++},
++{
++      .secret = (u8[32]){ 0xa5, 0x46, 0xe3, 0x6b, 0xf0, 0x52, 0x7c, 0x9d,
++                   0x3b, 0x16, 0x15, 0x4b, 0x82, 0x46, 0x5e, 0xdd,
++                   0x62, 0x14, 0x4c, 0x0a, 0xc1, 0xfc, 0x5a, 0x18,
++                   0x50, 0x6a, 0x22, 0x44, 0xba, 0x44, 0x9a, 0xc4 },
++      .b_public = (u8[32]){ 0xe6, 0xdb, 0x68, 0x67, 0x58, 0x30, 0x30, 0xdb,
++                  0x35, 0x94, 0xc1, 0xa4, 0x24, 0xb1, 0x5f, 0x7c,
++                  0x72, 0x66, 0x24, 0xec, 0x26, 0xb3, 0x35, 0x3b,
++                  0x10, 0xa9, 0x03, 0xa6, 0xd0, 0xab, 0x1c, 0x4c },
++      .expected_ss = (u8[32]){ 0xc3, 0xda, 0x55, 0x37, 0x9d, 0xe9, 0xc6, 0x90,
++                  0x8e, 0x94, 0xea, 0x4d, 0xf2, 0x8d, 0x08, 0x4f,
++                  0x32, 0xec, 0xcf, 0x03, 0x49, 0x1c, 0x71, 0xf7,
++                  0x54, 0xb4, 0x07, 0x55, 0x77, 0xa2, 0x85, 0x52 },
++      .secret_size = 32,
++      .b_public_size = 32,
++      .expected_ss_size = 32,
++
++},
++{
++      .secret = (u8[32]){ 0xff, 0xff, 0xff, 0xff, 0x0a, 0xff, 0xff, 0xff,
++                   0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                   0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                   0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
++      .b_public = (u8[32]){ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                  0xff, 0xff, 0xff, 0xff, 0x0a, 0x00, 0xfb, 0x9f },
++      .expected_ss = (u8[32]){ 0x77, 0x52, 0xb6, 0x18, 0xc1, 0x2d, 0x48, 0xd2,
++                  0xc6, 0x93, 0x46, 0x83, 0x81, 0x7c, 0xc6, 0x57,
++                  0xf3, 0x31, 0x03, 0x19, 0x49, 0x48, 0x20, 0x05,
++                  0x42, 0x2b, 0x4e, 0xae, 0x8d, 0x1d, 0x43, 0x23 },
++      .secret_size = 32,
++      .b_public_size = 32,
++      .expected_ss_size = 32,
++
++},
++{
++      .secret = (u8[32]){ 0x8e, 0x0a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
++      .b_public = (u8[32]){ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8e, 0x06 },
++      .expected_ss = (u8[32]){ 0x5a, 0xdf, 0xaa, 0x25, 0x86, 0x8e, 0x32, 0x3d,
++                  0xae, 0x49, 0x62, 0xc1, 0x01, 0x5c, 0xb3, 0x12,
++                  0xe1, 0xc5, 0xc7, 0x9e, 0x95, 0x3f, 0x03, 0x99,
++                  0xb0, 0xba, 0x16, 0x22, 0xf3, 0xb6, 0xf7, 0x0c },
++      .secret_size = 32,
++      .b_public_size = 32,
++      .expected_ss_size = 32,
++
++},
++/* wycheproof - normal case */
++{
++      .secret = (u8[32]){ 0x48, 0x52, 0x83, 0x4d, 0x9d, 0x6b, 0x77, 0xda,
++                   0xde, 0xab, 0xaa, 0xf2, 0xe1, 0x1d, 0xca, 0x66,
++                   0xd1, 0x9f, 0xe7, 0x49, 0x93, 0xa7, 0xbe, 0xc3,
++                   0x6c, 0x6e, 0x16, 0xa0, 0x98, 0x3f, 0xea, 0xba },
++      .b_public = (u8[32]){ 0x9c, 0x64, 0x7d, 0x9a, 0xe5, 0x89, 0xb9, 0xf5,
++                  0x8f, 0xdc, 0x3c, 0xa4, 0x94, 0x7e, 0xfb, 0xc9,
++                  0x15, 0xc4, 0xb2, 0xe0, 0x8e, 0x74, 0x4a, 0x0e,
++                  0xdf, 0x46, 0x9d, 0xac, 0x59, 0xc8, 0xf8, 0x5a },
++      .expected_ss = (u8[32]){ 0x87, 0xb7, 0xf2, 0x12, 0xb6, 0x27, 0xf7, 0xa5,
++                  0x4c, 0xa5, 0xe0, 0xbc, 0xda, 0xdd, 0xd5, 0x38,
++                  0x9d, 0x9d, 0xe6, 0x15, 0x6c, 0xdb, 0xcf, 0x8e,
++                  0xbe, 0x14, 0xff, 0xbc, 0xfb, 0x43, 0x65, 0x51 },
++      .secret_size = 32,
++      .b_public_size = 32,
++      .expected_ss_size = 32,
++
++},
++/* wycheproof - public key on twist */
++{
++      .secret = (u8[32]){ 0x58, 0x8c, 0x06, 0x1a, 0x50, 0x80, 0x4a, 0xc4,
++                   0x88, 0xad, 0x77, 0x4a, 0xc7, 0x16, 0xc3, 0xf5,
++                   0xba, 0x71, 0x4b, 0x27, 0x12, 0xe0, 0x48, 0x49,
++                   0x13, 0x79, 0xa5, 0x00, 0x21, 0x19, 0x98, 0xa8 },
++      .b_public = (u8[32]){ 0x63, 0xaa, 0x40, 0xc6, 0xe3, 0x83, 0x46, 0xc5,
++                  0xca, 0xf2, 0x3a, 0x6d, 0xf0, 0xa5, 0xe6, 0xc8,
++                  0x08, 0x89, 0xa0, 0x86, 0x47, 0xe5, 0x51, 0xb3,
++                  0x56, 0x34, 0x49, 0xbe, 0xfc, 0xfc, 0x97, 0x33 },
++      .expected_ss = (u8[32]){ 0xb1, 0xa7, 0x07, 0x51, 0x94, 0x95, 0xff, 0xff,
++                  0xb2, 0x98, 0xff, 0x94, 0x17, 0x16, 0xb0, 0x6d,
++                  0xfa, 0xb8, 0x7c, 0xf8, 0xd9, 0x11, 0x23, 0xfe,
++                  0x2b, 0xe9, 0xa2, 0x33, 0xdd, 0xa2, 0x22, 0x12 },
++      .secret_size = 32,
++      .b_public_size = 32,
++      .expected_ss_size = 32,
++
++},
++/* wycheproof - public key on twist */
++{
++      .secret = (u8[32]){ 0xb0, 0x5b, 0xfd, 0x32, 0xe5, 0x53, 0x25, 0xd9,
++                   0xfd, 0x64, 0x8c, 0xb3, 0x02, 0x84, 0x80, 0x39,
++                   0x00, 0x0b, 0x39, 0x0e, 0x44, 0xd5, 0x21, 0xe5,
++                   0x8a, 0xab, 0x3b, 0x29, 0xa6, 0x96, 0x0b, 0xa8 },
++      .b_public = (u8[32]){ 0x0f, 0x83, 0xc3, 0x6f, 0xde, 0xd9, 0xd3, 0x2f,
++                  0xad, 0xf4, 0xef, 0xa3, 0xae, 0x93, 0xa9, 0x0b,
++                  0xb5, 0xcf, 0xa6, 0x68, 0x93, 0xbc, 0x41, 0x2c,
++                  0x43, 0xfa, 0x72, 0x87, 0xdb, 0xb9, 0x97, 0x79 },
++      .expected_ss = (u8[32]){ 0x67, 0xdd, 0x4a, 0x6e, 0x16, 0x55, 0x33, 0x53,
++                  0x4c, 0x0e, 0x3f, 0x17, 0x2e, 0x4a, 0xb8, 0x57,
++                  0x6b, 0xca, 0x92, 0x3a, 0x5f, 0x07, 0xb2, 0xc0,
++                  0x69, 0xb4, 0xc3, 0x10, 0xff, 0x2e, 0x93, 0x5b },
++      .secret_size = 32,
++      .b_public_size = 32,
++      .expected_ss_size = 32,
++
++},
++/* wycheproof - public key on twist */
++{
++      .secret = (u8[32]){ 0x70, 0xe3, 0x4b, 0xcb, 0xe1, 0xf4, 0x7f, 0xbc,
++                   0x0f, 0xdd, 0xfd, 0x7c, 0x1e, 0x1a, 0xa5, 0x3d,
++                   0x57, 0xbf, 0xe0, 0xf6, 0x6d, 0x24, 0x30, 0x67,
++                   0xb4, 0x24, 0xbb, 0x62, 0x10, 0xbe, 0xd1, 0x9c },
++      .b_public = (u8[32]){ 0x0b, 0x82, 0x11, 0xa2, 0xb6, 0x04, 0x90, 0x97,
++                  0xf6, 0x87, 0x1c, 0x6c, 0x05, 0x2d, 0x3c, 0x5f,
++                  0xc1, 0xba, 0x17, 0xda, 0x9e, 0x32, 0xae, 0x45,
++                  0x84, 0x03, 0xb0, 0x5b, 0xb2, 0x83, 0x09, 0x2a },
++      .expected_ss = (u8[32]){ 0x4a, 0x06, 0x38, 0xcf, 0xaa, 0x9e, 0xf1, 0x93,
++                  0x3b, 0x47, 0xf8, 0x93, 0x92, 0x96, 0xa6, 0xb2,
++                  0x5b, 0xe5, 0x41, 0xef, 0x7f, 0x70, 0xe8, 0x44,
++                  0xc0, 0xbc, 0xc0, 0x0b, 0x13, 0x4d, 0xe6, 0x4a },
++      .secret_size = 32,
++      .b_public_size = 32,
++      .expected_ss_size = 32,
++
++},
++/* wycheproof - public key on twist */
++{
++      .secret = (u8[32]){ 0x68, 0xc1, 0xf3, 0xa6, 0x53, 0xa4, 0xcd, 0xb1,
++                   0xd3, 0x7b, 0xba, 0x94, 0x73, 0x8f, 0x8b, 0x95,
++                   0x7a, 0x57, 0xbe, 0xb2, 0x4d, 0x64, 0x6e, 0x99,
++                   0x4d, 0xc2, 0x9a, 0x27, 0x6a, 0xad, 0x45, 0x8d },
++      .b_public = (u8[32]){ 0x34, 0x3a, 0xc2, 0x0a, 0x3b, 0x9c, 0x6a, 0x27,
++                  0xb1, 0x00, 0x81, 0x76, 0x50, 0x9a, 0xd3, 0x07,
++                  0x35, 0x85, 0x6e, 0xc1, 0xc8, 0xd8, 0xfc, 0xae,
++                  0x13, 0x91, 0x2d, 0x08, 0xd1, 0x52, 0xf4, 0x6c },
++      .expected_ss = (u8[32]){ 0x39, 0x94, 0x91, 0xfc, 0xe8, 0xdf, 0xab, 0x73,
++                  0xb4, 0xf9, 0xf6, 0x11, 0xde, 0x8e, 0xa0, 0xb2,
++                  0x7b, 0x28, 0xf8, 0x59, 0x94, 0x25, 0x0b, 0x0f,
++                  0x47, 0x5d, 0x58, 0x5d, 0x04, 0x2a, 0xc2, 0x07 },
++      .secret_size = 32,
++      .b_public_size = 32,
++      .expected_ss_size = 32,
++
++},
++/* wycheproof - public key on twist */
++{
++      .secret = (u8[32]){ 0xd8, 0x77, 0xb2, 0x6d, 0x06, 0xdf, 0xf9, 0xd9,
++                   0xf7, 0xfd, 0x4c, 0x5b, 0x37, 0x69, 0xf8, 0xcd,
++                   0xd5, 0xb3, 0x05, 0x16, 0xa5, 0xab, 0x80, 0x6b,
++                   0xe3, 0x24, 0xff, 0x3e, 0xb6, 0x9e, 0xa0, 0xb2 },
++      .b_public = (u8[32]){ 0xfa, 0x69, 0x5f, 0xc7, 0xbe, 0x8d, 0x1b, 0xe5,
++                  0xbf, 0x70, 0x48, 0x98, 0xf3, 0x88, 0xc4, 0x52,
++                  0xba, 0xfd, 0xd3, 0xb8, 0xea, 0xe8, 0x05, 0xf8,
++                  0x68, 0x1a, 0x8d, 0x15, 0xc2, 0xd4, 0xe1, 0x42 },
++      .expected_ss = (u8[32]){ 0x2c, 0x4f, 0xe1, 0x1d, 0x49, 0x0a, 0x53, 0x86,
++                  0x17, 0x76, 0xb1, 0x3b, 0x43, 0x54, 0xab, 0xd4,
++                  0xcf, 0x5a, 0x97, 0x69, 0x9d, 0xb6, 0xe6, 0xc6,
++                  0x8c, 0x16, 0x26, 0xd0, 0x76, 0x62, 0xf7, 0x58 },
++      .secret_size = 32,
++      .b_public_size = 32,
++      .expected_ss_size = 32,
++
++},
++/* wycheproof - edge case on twist */
++{
++      .secret = (u8[32]){ 0x38, 0xdd, 0xe9, 0xf3, 0xe7, 0xb7, 0x99, 0x04,
++                   0x5f, 0x9a, 0xc3, 0x79, 0x3d, 0x4a, 0x92, 0x77,
++                   0xda, 0xde, 0xad, 0xc4, 0x1b, 0xec, 0x02, 0x90,
++                   0xf8, 0x1f, 0x74, 0x4f, 0x73, 0x77, 0x5f, 0x84 },
++      .b_public = (u8[32]){ 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
++      .expected_ss = (u8[32]){ 0x9a, 0x2c, 0xfe, 0x84, 0xff, 0x9c, 0x4a, 0x97,
++                  0x39, 0x62, 0x5c, 0xae, 0x4a, 0x3b, 0x82, 0xa9,
++                  0x06, 0x87, 0x7a, 0x44, 0x19, 0x46, 0xf8, 0xd7,
++                  0xb3, 0xd7, 0x95, 0xfe, 0x8f, 0x5d, 0x16, 0x39 },
++      .secret_size = 32,
++      .b_public_size = 32,
++      .expected_ss_size = 32,
++
++},
++/* wycheproof - edge case on twist */
++{
++      .secret = (u8[32]){ 0x98, 0x57, 0xa9, 0x14, 0xe3, 0xc2, 0x90, 0x36,
++                   0xfd, 0x9a, 0x44, 0x2b, 0xa5, 0x26, 0xb5, 0xcd,
++                   0xcd, 0xf2, 0x82, 0x16, 0x15, 0x3e, 0x63, 0x6c,
++                   0x10, 0x67, 0x7a, 0xca, 0xb6, 0xbd, 0x6a, 0xa5 },
++      .b_public = (u8[32]){ 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
++      .expected_ss = (u8[32]){ 0x4d, 0xa4, 0xe0, 0xaa, 0x07, 0x2c, 0x23, 0x2e,
++                  0xe2, 0xf0, 0xfa, 0x4e, 0x51, 0x9a, 0xe5, 0x0b,
++                  0x52, 0xc1, 0xed, 0xd0, 0x8a, 0x53, 0x4d, 0x4e,
++                  0xf3, 0x46, 0xc2, 0xe1, 0x06, 0xd2, 0x1d, 0x60 },
++      .secret_size = 32,
++      .b_public_size = 32,
++      .expected_ss_size = 32,
++
++},
++/* wycheproof - edge case on twist */
++{
++      .secret = (u8[32]){ 0x48, 0xe2, 0x13, 0x0d, 0x72, 0x33, 0x05, 0xed,
++                   0x05, 0xe6, 0xe5, 0x89, 0x4d, 0x39, 0x8a, 0x5e,
++                   0x33, 0x36, 0x7a, 0x8c, 0x6a, 0xac, 0x8f, 0xcd,
++                   0xf0, 0xa8, 0x8e, 0x4b, 0x42, 0x82, 0x0d, 0xb7 },
++      .b_public = (u8[32]){ 0xff, 0xff, 0xff, 0x03, 0x00, 0x00, 0xf8, 0xff,
++                  0xff, 0x1f, 0x00, 0x00, 0xc0, 0xff, 0xff, 0xff,
++                  0x00, 0x00, 0x00, 0xfe, 0xff, 0xff, 0x07, 0x00,
++                  0x00, 0xf0, 0xff, 0xff, 0x3f, 0x00, 0x00, 0x00 },
++      .expected_ss = (u8[32]){ 0x9e, 0xd1, 0x0c, 0x53, 0x74, 0x7f, 0x64, 0x7f,
++                  0x82, 0xf4, 0x51, 0x25, 0xd3, 0xde, 0x15, 0xa1,
++                  0xe6, 0xb8, 0x24, 0x49, 0x6a, 0xb4, 0x04, 0x10,
++                  0xff, 0xcc, 0x3c, 0xfe, 0x95, 0x76, 0x0f, 0x3b },
++      .secret_size = 32,
++      .b_public_size = 32,
++      .expected_ss_size = 32,
++
++},
++/* wycheproof - edge case on twist */
++{
++      .secret = (u8[32]){ 0x28, 0xf4, 0x10, 0x11, 0x69, 0x18, 0x51, 0xb3,
++                   0xa6, 0x2b, 0x64, 0x15, 0x53, 0xb3, 0x0d, 0x0d,
++                   0xfd, 0xdc, 0xb8, 0xff, 0xfc, 0xf5, 0x37, 0x00,
++                   0xa7, 0xbe, 0x2f, 0x6a, 0x87, 0x2e, 0x9f, 0xb0 },
++      .b_public = (u8[32]){ 0x00, 0x00, 0x00, 0xfc, 0xff, 0xff, 0x07, 0x00,
++                  0x00, 0xe0, 0xff, 0xff, 0x3f, 0x00, 0x00, 0x00,
++                  0xff, 0xff, 0xff, 0x01, 0x00, 0x00, 0xf8, 0xff,
++                  0xff, 0x0f, 0x00, 0x00, 0xc0, 0xff, 0xff, 0x7f },
++      .expected_ss = (u8[32]){ 0xcf, 0x72, 0xb4, 0xaa, 0x6a, 0xa1, 0xc9, 0xf8,
++                  0x94, 0xf4, 0x16, 0x5b, 0x86, 0x10, 0x9a, 0xa4,
++                  0x68, 0x51, 0x76, 0x48, 0xe1, 0xf0, 0xcc, 0x70,
++                  0xe1, 0xab, 0x08, 0x46, 0x01, 0x76, 0x50, 0x6b },
++      .secret_size = 32,
++      .b_public_size = 32,
++      .expected_ss_size = 32,
++
++},
++/* wycheproof - edge case on twist */
++{
++      .secret = (u8[32]){ 0x18, 0xa9, 0x3b, 0x64, 0x99, 0xb9, 0xf6, 0xb3,
++                   0x22, 0x5c, 0xa0, 0x2f, 0xef, 0x41, 0x0e, 0x0a,
++                   0xde, 0xc2, 0x35, 0x32, 0x32, 0x1d, 0x2d, 0x8e,
++                   0xf1, 0xa6, 0xd6, 0x02, 0xa8, 0xc6, 0x5b, 0x83 },
++      .b_public = (u8[32]){ 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
++                  0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
++                  0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
++                  0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0x7f },
++      .expected_ss = (u8[32]){ 0x5d, 0x50, 0xb6, 0x28, 0x36, 0xbb, 0x69, 0x57,
++                  0x94, 0x10, 0x38, 0x6c, 0xf7, 0xbb, 0x81, 0x1c,
++                  0x14, 0xbf, 0x85, 0xb1, 0xc7, 0xb1, 0x7e, 0x59,
++                  0x24, 0xc7, 0xff, 0xea, 0x91, 0xef, 0x9e, 0x12 },
++      .secret_size = 32,
++      .b_public_size = 32,
++      .expected_ss_size = 32,
++
++},
++/* wycheproof - edge case on twist */
++{
++      .secret = (u8[32]){ 0xc0, 0x1d, 0x13, 0x05, 0xa1, 0x33, 0x8a, 0x1f,
++                   0xca, 0xc2, 0xba, 0x7e, 0x2e, 0x03, 0x2b, 0x42,
++                   0x7e, 0x0b, 0x04, 0x90, 0x31, 0x65, 0xac, 0xa9,
++                   0x57, 0xd8, 0xd0, 0x55, 0x3d, 0x87, 0x17, 0xb0 },
++      .b_public = (u8[32]){ 0xea, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f },
++      .expected_ss = (u8[32]){ 0x19, 0x23, 0x0e, 0xb1, 0x48, 0xd5, 0xd6, 0x7c,
++                  0x3c, 0x22, 0xab, 0x1d, 0xae, 0xff, 0x80, 0xa5,
++                  0x7e, 0xae, 0x42, 0x65, 0xce, 0x28, 0x72, 0x65,
++                  0x7b, 0x2c, 0x80, 0x99, 0xfc, 0x69, 0x8e, 0x50 },
++      .secret_size = 32,
++      .b_public_size = 32,
++      .expected_ss_size = 32,
++
++},
++/* wycheproof - edge case for public key */
++{
++      .secret = (u8[32]){ 0x38, 0x6f, 0x7f, 0x16, 0xc5, 0x07, 0x31, 0xd6,
++                   0x4f, 0x82, 0xe6, 0xa1, 0x70, 0xb1, 0x42, 0xa4,
++                   0xe3, 0x4f, 0x31, 0xfd, 0x77, 0x68, 0xfc, 0xb8,
++                   0x90, 0x29, 0x25, 0xe7, 0xd1, 0xe2, 0x1a, 0xbe },
++      .b_public = (u8[32]){ 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
++      .expected_ss = (u8[32]){ 0x0f, 0xca, 0xb5, 0xd8, 0x42, 0xa0, 0x78, 0xd7,
++                  0xa7, 0x1f, 0xc5, 0x9b, 0x57, 0xbf, 0xb4, 0xca,
++                  0x0b, 0xe6, 0x87, 0x3b, 0x49, 0xdc, 0xdb, 0x9f,
++                  0x44, 0xe1, 0x4a, 0xe8, 0xfb, 0xdf, 0xa5, 0x42 },
++      .secret_size = 32,
++      .b_public_size = 32,
++      .expected_ss_size = 32,
++
++},
++/* wycheproof - edge case for public key */
++{
++      .secret = (u8[32]){ 0xe0, 0x23, 0xa2, 0x89, 0xbd, 0x5e, 0x90, 0xfa,
++                   0x28, 0x04, 0xdd, 0xc0, 0x19, 0xa0, 0x5e, 0xf3,
++                   0xe7, 0x9d, 0x43, 0x4b, 0xb6, 0xea, 0x2f, 0x52,
++                   0x2e, 0xcb, 0x64, 0x3a, 0x75, 0x29, 0x6e, 0x95 },
++      .b_public = (u8[32]){ 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00,
++                  0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00,
++                  0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00,
++                  0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00 },
++      .expected_ss = (u8[32]){ 0x54, 0xce, 0x8f, 0x22, 0x75, 0xc0, 0x77, 0xe3,
++                  0xb1, 0x30, 0x6a, 0x39, 0x39, 0xc5, 0xe0, 0x3e,
++                  0xef, 0x6b, 0xbb, 0x88, 0x06, 0x05, 0x44, 0x75,
++                  0x8d, 0x9f, 0xef, 0x59, 0xb0, 0xbc, 0x3e, 0x4f },
++      .secret_size = 32,
++      .b_public_size = 32,
++      .expected_ss_size = 32,
++
++},
++/* wycheproof - edge case for public key */
++{
++      .secret = (u8[32]){ 0x68, 0xf0, 0x10, 0xd6, 0x2e, 0xe8, 0xd9, 0x26,
++                   0x05, 0x3a, 0x36, 0x1c, 0x3a, 0x75, 0xc6, 0xea,
++                   0x4e, 0xbd, 0xc8, 0x60, 0x6a, 0xb2, 0x85, 0x00,
++                   0x3a, 0x6f, 0x8f, 0x40, 0x76, 0xb0, 0x1e, 0x83 },
++      .b_public = (u8[32]){ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x03 },
++      .expected_ss = (u8[32]){ 0xf1, 0x36, 0x77, 0x5c, 0x5b, 0xeb, 0x0a, 0xf8,
++                  0x11, 0x0a, 0xf1, 0x0b, 0x20, 0x37, 0x23, 0x32,
++                  0x04, 0x3c, 0xab, 0x75, 0x24, 0x19, 0x67, 0x87,
++                  0x75, 0xa2, 0x23, 0xdf, 0x57, 0xc9, 0xd3, 0x0d },
++      .secret_size = 32,
++      .b_public_size = 32,
++      .expected_ss_size = 32,
++
++},
++/* wycheproof - edge case for public key */
++{
++      .secret = (u8[32]){ 0x58, 0xeb, 0xcb, 0x35, 0xb0, 0xf8, 0x84, 0x5c,
++                   0xaf, 0x1e, 0xc6, 0x30, 0xf9, 0x65, 0x76, 0xb6,
++                   0x2c, 0x4b, 0x7b, 0x6c, 0x36, 0xb2, 0x9d, 0xeb,
++                   0x2c, 0xb0, 0x08, 0x46, 0x51, 0x75, 0x5c, 0x96 },
++      .b_public = (u8[32]){ 0xff, 0xff, 0xff, 0xfb, 0xff, 0xff, 0xfb, 0xff,
++                  0xff, 0xdf, 0xff, 0xff, 0xdf, 0xff, 0xff, 0xff,
++                  0xfe, 0xff, 0xff, 0xfe, 0xff, 0xff, 0xf7, 0xff,
++                  0xff, 0xf7, 0xff, 0xff, 0xbf, 0xff, 0xff, 0x3f },
++      .expected_ss = (u8[32]){ 0xbf, 0x9a, 0xff, 0xd0, 0x6b, 0x84, 0x40, 0x85,
++                  0x58, 0x64, 0x60, 0x96, 0x2e, 0xf2, 0x14, 0x6f,
++                  0xf3, 0xd4, 0x53, 0x3d, 0x94, 0x44, 0xaa, 0xb0,
++                  0x06, 0xeb, 0x88, 0xcc, 0x30, 0x54, 0x40, 0x7d },
++      .secret_size = 32,
++      .b_public_size = 32,
++      .expected_ss_size = 32,
++
++},
++/* wycheproof - edge case for public key */
++{
++      .secret = (u8[32]){ 0x18, 0x8c, 0x4b, 0xc5, 0xb9, 0xc4, 0x4b, 0x38,
++                   0xbb, 0x65, 0x8b, 0x9b, 0x2a, 0xe8, 0x2d, 0x5b,
++                   0x01, 0x01, 0x5e, 0x09, 0x31, 0x84, 0xb1, 0x7c,
++                   0xb7, 0x86, 0x35, 0x03, 0xa7, 0x83, 0xe1, 0xbb },
++      .b_public = (u8[32]){ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x3f },
++      .expected_ss = (u8[32]){ 0xd4, 0x80, 0xde, 0x04, 0xf6, 0x99, 0xcb, 0x3b,
++                  0xe0, 0x68, 0x4a, 0x9c, 0xc2, 0xe3, 0x12, 0x81,
++                  0xea, 0x0b, 0xc5, 0xa9, 0xdc, 0xc1, 0x57, 0xd3,
++                  0xd2, 0x01, 0x58, 0xd4, 0x6c, 0xa5, 0x24, 0x6d },
++      .secret_size = 32,
++      .b_public_size = 32,
++      .expected_ss_size = 32,
++
++},
++/* wycheproof - edge case for public key */
++{
++      .secret = (u8[32]){ 0xe0, 0x6c, 0x11, 0xbb, 0x2e, 0x13, 0xce, 0x3d,
++                   0xc7, 0x67, 0x3f, 0x67, 0xf5, 0x48, 0x22, 0x42,
++                   0x90, 0x94, 0x23, 0xa9, 0xae, 0x95, 0xee, 0x98,
++                   0x6a, 0x98, 0x8d, 0x98, 0xfa, 0xee, 0x23, 0xa2 },
++      .b_public = (u8[32]){ 0xff, 0xff, 0xff, 0xff, 0xfe, 0xff, 0xff, 0x7f,
++                  0xff, 0xff, 0xff, 0xff, 0xfe, 0xff, 0xff, 0x7f,
++                  0xff, 0xff, 0xff, 0xff, 0xfe, 0xff, 0xff, 0x7f,
++                  0xff, 0xff, 0xff, 0xff, 0xfe, 0xff, 0xff, 0x7f },
++      .expected_ss = (u8[32]){ 0x4c, 0x44, 0x01, 0xcc, 0xe6, 0xb5, 0x1e, 0x4c,
++                  0xb1, 0x8f, 0x27, 0x90, 0x24, 0x6c, 0x9b, 0xf9,
++                  0x14, 0xdb, 0x66, 0x77, 0x50, 0xa1, 0xcb, 0x89,
++                  0x06, 0x90, 0x92, 0xaf, 0x07, 0x29, 0x22, 0x76 },
++      .secret_size = 32,
++      .b_public_size = 32,
++      .expected_ss_size = 32,
++
++},
++/* wycheproof - edge case for public key */
++{
++      .secret = (u8[32]){ 0xc0, 0x65, 0x8c, 0x46, 0xdd, 0xe1, 0x81, 0x29,
++                   0x29, 0x38, 0x77, 0x53, 0x5b, 0x11, 0x62, 0xb6,
++                   0xf9, 0xf5, 0x41, 0x4a, 0x23, 0xcf, 0x4d, 0x2c,
++                   0xbc, 0x14, 0x0a, 0x4d, 0x99, 0xda, 0x2b, 0x8f },
++      .b_public = (u8[32]){ 0xeb, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f },
++      .expected_ss = (u8[32]){ 0x57, 0x8b, 0xa8, 0xcc, 0x2d, 0xbd, 0xc5, 0x75,
++                  0xaf, 0xcf, 0x9d, 0xf2, 0xb3, 0xee, 0x61, 0x89,
++                  0xf5, 0x33, 0x7d, 0x68, 0x54, 0xc7, 0x9b, 0x4c,
++                  0xe1, 0x65, 0xea, 0x12, 0x29, 0x3b, 0x3a, 0x0f },
++      .secret_size = 32,
++      .b_public_size = 32,
++      .expected_ss_size = 32,
++
++},
++/* wycheproof - public key >= p */
++{
++      .secret = (u8[32]){ 0xf0, 0x1e, 0x48, 0xda, 0xfa, 0xc9, 0xd7, 0xbc,
++                   0xf5, 0x89, 0xcb, 0xc3, 0x82, 0xc8, 0x78, 0xd1,
++                   0x8b, 0xda, 0x35, 0x50, 0x58, 0x9f, 0xfb, 0x5d,
++                   0x50, 0xb5, 0x23, 0xbe, 0xbe, 0x32, 0x9d, 0xae },
++      .b_public = (u8[32]){ 0xef, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f },
++      .expected_ss = (u8[32]){ 0xbd, 0x36, 0xa0, 0x79, 0x0e, 0xb8, 0x83, 0x09,
++                  0x8c, 0x98, 0x8b, 0x21, 0x78, 0x67, 0x73, 0xde,
++                  0x0b, 0x3a, 0x4d, 0xf1, 0x62, 0x28, 0x2c, 0xf1,
++                  0x10, 0xde, 0x18, 0xdd, 0x48, 0x4c, 0xe7, 0x4b },
++      .secret_size = 32,
++      .b_public_size = 32,
++      .expected_ss_size = 32,
++
++},
++/* wycheproof - public key >= p */
++{
++      .secret = (u8[32]){ 0x28, 0x87, 0x96, 0xbc, 0x5a, 0xff, 0x4b, 0x81,
++                   0xa3, 0x75, 0x01, 0x75, 0x7b, 0xc0, 0x75, 0x3a,
++                   0x3c, 0x21, 0x96, 0x47, 0x90, 0xd3, 0x86, 0x99,
++                   0x30, 0x8d, 0xeb, 0xc1, 0x7a, 0x6e, 0xaf, 0x8d },
++      .b_public = (u8[32]){ 0xf0, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f },
++      .expected_ss = (u8[32]){ 0xb4, 0xe0, 0xdd, 0x76, 0xda, 0x7b, 0x07, 0x17,
++                  0x28, 0xb6, 0x1f, 0x85, 0x67, 0x71, 0xaa, 0x35,
++                  0x6e, 0x57, 0xed, 0xa7, 0x8a, 0x5b, 0x16, 0x55,
++                  0xcc, 0x38, 0x20, 0xfb, 0x5f, 0x85, 0x4c, 0x5c },
++      .secret_size = 32,
++      .b_public_size = 32,
++      .expected_ss_size = 32,
++
++},
++/* wycheproof - public key >= p */
++{
++      .secret = (u8[32]){ 0x98, 0xdf, 0x84, 0x5f, 0x66, 0x51, 0xbf, 0x11,
++                   0x38, 0x22, 0x1f, 0x11, 0x90, 0x41, 0xf7, 0x2b,
++                   0x6d, 0xbc, 0x3c, 0x4a, 0xce, 0x71, 0x43, 0xd9,
++                   0x9f, 0xd5, 0x5a, 0xd8, 0x67, 0x48, 0x0d, 0xa8 },
++      .b_public = (u8[32]){ 0xf1, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f },
++      .expected_ss = (u8[32]){ 0x6f, 0xdf, 0x6c, 0x37, 0x61, 0x1d, 0xbd, 0x53,
++                  0x04, 0xdc, 0x0f, 0x2e, 0xb7, 0xc9, 0x51, 0x7e,
++                  0xb3, 0xc5, 0x0e, 0x12, 0xfd, 0x05, 0x0a, 0xc6,
++                  0xde, 0xc2, 0x70, 0x71, 0xd4, 0xbf, 0xc0, 0x34 },
++      .secret_size = 32,
++      .b_public_size = 32,
++      .expected_ss_size = 32,
++
++},
++/* wycheproof - public key >= p */
++{
++      .secret = (u8[32]){ 0xf0, 0x94, 0x98, 0xe4, 0x6f, 0x02, 0xf8, 0x78,
++                   0x82, 0x9e, 0x78, 0xb8, 0x03, 0xd3, 0x16, 0xa2,
++                   0xed, 0x69, 0x5d, 0x04, 0x98, 0xa0, 0x8a, 0xbd,
++                   0xf8, 0x27, 0x69, 0x30, 0xe2, 0x4e, 0xdc, 0xb0 },
++      .b_public = (u8[32]){ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f },
++      .expected_ss = (u8[32]){ 0x4c, 0x8f, 0xc4, 0xb1, 0xc6, 0xab, 0x88, 0xfb,
++                  0x21, 0xf1, 0x8f, 0x6d, 0x4c, 0x81, 0x02, 0x40,
++                  0xd4, 0xe9, 0x46, 0x51, 0xba, 0x44, 0xf7, 0xa2,
++                  0xc8, 0x63, 0xce, 0xc7, 0xdc, 0x56, 0x60, 0x2d },
++      .secret_size = 32,
++      .b_public_size = 32,
++      .expected_ss_size = 32,
++
++},
++/* wycheproof - public key >= p */
++{
++      .secret = (u8[32]){ 0x18, 0x13, 0xc1, 0x0a, 0x5c, 0x7f, 0x21, 0xf9,
++                   0x6e, 0x17, 0xf2, 0x88, 0xc0, 0xcc, 0x37, 0x60,
++                   0x7c, 0x04, 0xc5, 0xf5, 0xae, 0xa2, 0xdb, 0x13,
++                   0x4f, 0x9e, 0x2f, 0xfc, 0x66, 0xbd, 0x9d, 0xb8 },
++      .b_public = (u8[32]){ 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80 },
++      .expected_ss = (u8[32]){ 0x1c, 0xd0, 0xb2, 0x82, 0x67, 0xdc, 0x54, 0x1c,
++                  0x64, 0x2d, 0x6d, 0x7d, 0xca, 0x44, 0xa8, 0xb3,
++                  0x8a, 0x63, 0x73, 0x6e, 0xef, 0x5c, 0x4e, 0x65,
++                  0x01, 0xff, 0xbb, 0xb1, 0x78, 0x0c, 0x03, 0x3c },
++      .secret_size = 32,
++      .b_public_size = 32,
++      .expected_ss_size = 32,
++
++},
++/* wycheproof - public key >= p */
++{
++      .secret = (u8[32]){ 0x78, 0x57, 0xfb, 0x80, 0x86, 0x53, 0x64, 0x5a,
++                   0x0b, 0xeb, 0x13, 0x8a, 0x64, 0xf5, 0xf4, 0xd7,
++                   0x33, 0xa4, 0x5e, 0xa8, 0x4c, 0x3c, 0xda, 0x11,
++                   0xa9, 0xc0, 0x6f, 0x7e, 0x71, 0x39, 0x14, 0x9e },
++      .b_public = (u8[32]){ 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80 },
++      .expected_ss = (u8[32]){ 0x87, 0x55, 0xbe, 0x01, 0xc6, 0x0a, 0x7e, 0x82,
++                  0x5c, 0xff, 0x3e, 0x0e, 0x78, 0xcb, 0x3a, 0xa4,
++                  0x33, 0x38, 0x61, 0x51, 0x6a, 0xa5, 0x9b, 0x1c,
++                  0x51, 0xa8, 0xb2, 0xa5, 0x43, 0xdf, 0xa8, 0x22 },
++      .secret_size = 32,
++      .b_public_size = 32,
++      .expected_ss_size = 32,
++
++},
++/* wycheproof - public key >= p */
++{
++      .secret = (u8[32]){ 0xe0, 0x3a, 0xa8, 0x42, 0xe2, 0xab, 0xc5, 0x6e,
++                   0x81, 0xe8, 0x7b, 0x8b, 0x9f, 0x41, 0x7b, 0x2a,
++                   0x1e, 0x59, 0x13, 0xc7, 0x23, 0xee, 0xd2, 0x8d,
++                   0x75, 0x2f, 0x8d, 0x47, 0xa5, 0x9f, 0x49, 0x8f },
++      .b_public = (u8[32]){ 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80 },
++      .expected_ss = (u8[32]){ 0x54, 0xc9, 0xa1, 0xed, 0x95, 0xe5, 0x46, 0xd2,
++                  0x78, 0x22, 0xa3, 0x60, 0x93, 0x1d, 0xda, 0x60,
++                  0xa1, 0xdf, 0x04, 0x9d, 0xa6, 0xf9, 0x04, 0x25,
++                  0x3c, 0x06, 0x12, 0xbb, 0xdc, 0x08, 0x74, 0x76 },
++      .secret_size = 32,
++      .b_public_size = 32,
++      .expected_ss_size = 32,
++
++},
++/* wycheproof - public key >= p */
++{
++      .secret = (u8[32]){ 0xf8, 0xf7, 0x07, 0xb7, 0x99, 0x9b, 0x18, 0xcb,
++                   0x0d, 0x6b, 0x96, 0x12, 0x4f, 0x20, 0x45, 0x97,
++                   0x2c, 0xa2, 0x74, 0xbf, 0xc1, 0x54, 0xad, 0x0c,
++                   0x87, 0x03, 0x8c, 0x24, 0xc6, 0xd0, 0xd4, 0xb2 },
++      .b_public = (u8[32]){ 0xda, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
++      .expected_ss = (u8[32]){ 0xcc, 0x1f, 0x40, 0xd7, 0x43, 0xcd, 0xc2, 0x23,
++                  0x0e, 0x10, 0x43, 0xda, 0xba, 0x8b, 0x75, 0xe8,
++                  0x10, 0xf1, 0xfb, 0xab, 0x7f, 0x25, 0x52, 0x69,
++                  0xbd, 0x9e, 0xbb, 0x29, 0xe6, 0xbf, 0x49, 0x4f },
++      .secret_size = 32,
++      .b_public_size = 32,
++      .expected_ss_size = 32,
++
++},
++/* wycheproof - public key >= p */
++{
++      .secret = (u8[32]){ 0xa0, 0x34, 0xf6, 0x84, 0xfa, 0x63, 0x1e, 0x1a,
++                   0x34, 0x81, 0x18, 0xc1, 0xce, 0x4c, 0x98, 0x23,
++                   0x1f, 0x2d, 0x9e, 0xec, 0x9b, 0xa5, 0x36, 0x5b,
++                   0x4a, 0x05, 0xd6, 0x9a, 0x78, 0x5b, 0x07, 0x96 },
++      .b_public = (u8[32]){ 0xdb, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
++      .expected_ss = (u8[32]){ 0x54, 0x99, 0x8e, 0xe4, 0x3a, 0x5b, 0x00, 0x7b,
++                  0xf4, 0x99, 0xf0, 0x78, 0xe7, 0x36, 0x52, 0x44,
++                  0x00, 0xa8, 0xb5, 0xc7, 0xe9, 0xb9, 0xb4, 0x37,
++                  0x71, 0x74, 0x8c, 0x7c, 0xdf, 0x88, 0x04, 0x12 },
++      .secret_size = 32,
++      .b_public_size = 32,
++      .expected_ss_size = 32,
++
++},
++/* wycheproof - public key >= p */
++{
++      .secret = (u8[32]){ 0x30, 0xb6, 0xc6, 0xa0, 0xf2, 0xff, 0xa6, 0x80,
++                   0x76, 0x8f, 0x99, 0x2b, 0xa8, 0x9e, 0x15, 0x2d,
++                   0x5b, 0xc9, 0x89, 0x3d, 0x38, 0xc9, 0x11, 0x9b,
++                   0xe4, 0xf7, 0x67, 0xbf, 0xab, 0x6e, 0x0c, 0xa5 },
++      .b_public = (u8[32]){ 0xdc, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
++      .expected_ss = (u8[32]){ 0xea, 0xd9, 0xb3, 0x8e, 0xfd, 0xd7, 0x23, 0x63,
++                  0x79, 0x34, 0xe5, 0x5a, 0xb7, 0x17, 0xa7, 0xae,
++                  0x09, 0xeb, 0x86, 0xa2, 0x1d, 0xc3, 0x6a, 0x3f,
++                  0xee, 0xb8, 0x8b, 0x75, 0x9e, 0x39, 0x1e, 0x09 },
++      .secret_size = 32,
++      .b_public_size = 32,
++      .expected_ss_size = 32,
++
++},
++/* wycheproof - public key >= p */
++{
++      .secret = (u8[32]){ 0x90, 0x1b, 0x9d, 0xcf, 0x88, 0x1e, 0x01, 0xe0,
++                   0x27, 0x57, 0x50, 0x35, 0xd4, 0x0b, 0x43, 0xbd,
++                   0xc1, 0xc5, 0x24, 0x2e, 0x03, 0x08, 0x47, 0x49,
++                   0x5b, 0x0c, 0x72, 0x86, 0x46, 0x9b, 0x65, 0x91 },
++      .b_public = (u8[32]){ 0xea, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
++      .expected_ss = (u8[32]){ 0x60, 0x2f, 0xf4, 0x07, 0x89, 0xb5, 0x4b, 0x41,
++                  0x80, 0x59, 0x15, 0xfe, 0x2a, 0x62, 0x21, 0xf0,
++                  0x7a, 0x50, 0xff, 0xc2, 0xc3, 0xfc, 0x94, 0xcf,
++                  0x61, 0xf1, 0x3d, 0x79, 0x04, 0xe8, 0x8e, 0x0e },
++      .secret_size = 32,
++      .b_public_size = 32,
++      .expected_ss_size = 32,
++
++},
++/* wycheproof - public key >= p */
++{
++      .secret = (u8[32]){ 0x80, 0x46, 0x67, 0x7c, 0x28, 0xfd, 0x82, 0xc9,
++                   0xa1, 0xbd, 0xb7, 0x1a, 0x1a, 0x1a, 0x34, 0xfa,
++                   0xba, 0x12, 0x25, 0xe2, 0x50, 0x7f, 0xe3, 0xf5,
++                   0x4d, 0x10, 0xbd, 0x5b, 0x0d, 0x86, 0x5f, 0x8e },
++      .b_public = (u8[32]){ 0xeb, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
++      .expected_ss = (u8[32]){ 0xe0, 0x0a, 0xe8, 0xb1, 0x43, 0x47, 0x12, 0x47,
++                  0xba, 0x24, 0xf1, 0x2c, 0x88, 0x55, 0x36, 0xc3,
++                  0xcb, 0x98, 0x1b, 0x58, 0xe1, 0xe5, 0x6b, 0x2b,
++                  0xaf, 0x35, 0xc1, 0x2a, 0xe1, 0xf7, 0x9c, 0x26 },
++      .secret_size = 32,
++      .b_public_size = 32,
++      .expected_ss_size = 32,
++
++},
++/* wycheproof - public key >= p */
++{
++      .secret = (u8[32]){ 0x60, 0x2f, 0x7e, 0x2f, 0x68, 0xa8, 0x46, 0xb8,
++                   0x2c, 0xc2, 0x69, 0xb1, 0xd4, 0x8e, 0x93, 0x98,
++                   0x86, 0xae, 0x54, 0xfd, 0x63, 0x6c, 0x1f, 0xe0,
++                   0x74, 0xd7, 0x10, 0x12, 0x7d, 0x47, 0x24, 0x91 },
++      .b_public = (u8[32]){ 0xef, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
++      .expected_ss = (u8[32]){ 0x98, 0xcb, 0x9b, 0x50, 0xdd, 0x3f, 0xc2, 0xb0,
++                  0xd4, 0xf2, 0xd2, 0xbf, 0x7c, 0x5c, 0xfd, 0xd1,
++                  0x0c, 0x8f, 0xcd, 0x31, 0xfc, 0x40, 0xaf, 0x1a,
++                  0xd4, 0x4f, 0x47, 0xc1, 0x31, 0x37, 0x63, 0x62 },
++      .secret_size = 32,
++      .b_public_size = 32,
++      .expected_ss_size = 32,
++
++},
++/* wycheproof - public key >= p */
++{
++      .secret = (u8[32]){ 0x60, 0x88, 0x7b, 0x3d, 0xc7, 0x24, 0x43, 0x02,
++                   0x6e, 0xbe, 0xdb, 0xbb, 0xb7, 0x06, 0x65, 0xf4,
++                   0x2b, 0x87, 0xad, 0xd1, 0x44, 0x0e, 0x77, 0x68,
++                   0xfb, 0xd7, 0xe8, 0xe2, 0xce, 0x5f, 0x63, 0x9d },
++      .b_public = (u8[32]){ 0xf0, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
++      .expected_ss = (u8[32]){ 0x38, 0xd6, 0x30, 0x4c, 0x4a, 0x7e, 0x6d, 0x9f,
++                  0x79, 0x59, 0x33, 0x4f, 0xb5, 0x24, 0x5b, 0xd2,
++                  0xc7, 0x54, 0x52, 0x5d, 0x4c, 0x91, 0xdb, 0x95,
++                  0x02, 0x06, 0x92, 0x62, 0x34, 0xc1, 0xf6, 0x33 },
++      .secret_size = 32,
++      .b_public_size = 32,
++      .expected_ss_size = 32,
++
++},
++/* wycheproof - public key >= p */
++{
++      .secret = (u8[32]){ 0x78, 0xd3, 0x1d, 0xfa, 0x85, 0x44, 0x97, 0xd7,
++                   0x2d, 0x8d, 0xef, 0x8a, 0x1b, 0x7f, 0xb0, 0x06,
++                   0xce, 0xc2, 0xd8, 0xc4, 0x92, 0x46, 0x47, 0xc9,
++                   0x38, 0x14, 0xae, 0x56, 0xfa, 0xed, 0xa4, 0x95 },
++      .b_public = (u8[32]){ 0xf1, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
++      .expected_ss = (u8[32]){ 0x78, 0x6c, 0xd5, 0x49, 0x96, 0xf0, 0x14, 0xa5,
++                  0xa0, 0x31, 0xec, 0x14, 0xdb, 0x81, 0x2e, 0xd0,
++                  0x83, 0x55, 0x06, 0x1f, 0xdb, 0x5d, 0xe6, 0x80,
++                  0xa8, 0x00, 0xac, 0x52, 0x1f, 0x31, 0x8e, 0x23 },
++      .secret_size = 32,
++      .b_public_size = 32,
++      .expected_ss_size = 32,
++
++},
++/* wycheproof - public key >= p */
++{
++      .secret = (u8[32]){ 0xc0, 0x4c, 0x5b, 0xae, 0xfa, 0x83, 0x02, 0xdd,
++                   0xde, 0xd6, 0xa4, 0xbb, 0x95, 0x77, 0x61, 0xb4,
++                   0xeb, 0x97, 0xae, 0xfa, 0x4f, 0xc3, 0xb8, 0x04,
++                   0x30, 0x85, 0xf9, 0x6a, 0x56, 0x59, 0xb3, 0xa5 },
++      .b_public = (u8[32]){ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
++      .expected_ss = (u8[32]){ 0x29, 0xae, 0x8b, 0xc7, 0x3e, 0x9b, 0x10, 0xa0,
++                  0x8b, 0x4f, 0x68, 0x1c, 0x43, 0xc3, 0xe0, 0xac,
++                  0x1a, 0x17, 0x1d, 0x31, 0xb3, 0x8f, 0x1a, 0x48,
++                  0xef, 0xba, 0x29, 0xae, 0x63, 0x9e, 0xa1, 0x34 },
++      .secret_size = 32,
++      .b_public_size = 32,
++      .expected_ss_size = 32,
++
++},
++/* wycheproof - RFC 7748 */
++{
++      .secret = (u8[32]){ 0xa0, 0x46, 0xe3, 0x6b, 0xf0, 0x52, 0x7c, 0x9d,
++                   0x3b, 0x16, 0x15, 0x4b, 0x82, 0x46, 0x5e, 0xdd,
++                   0x62, 0x14, 0x4c, 0x0a, 0xc1, 0xfc, 0x5a, 0x18,
++                   0x50, 0x6a, 0x22, 0x44, 0xba, 0x44, 0x9a, 0x44 },
++      .b_public = (u8[32]){ 0xe6, 0xdb, 0x68, 0x67, 0x58, 0x30, 0x30, 0xdb,
++                  0x35, 0x94, 0xc1, 0xa4, 0x24, 0xb1, 0x5f, 0x7c,
++                  0x72, 0x66, 0x24, 0xec, 0x26, 0xb3, 0x35, 0x3b,
++                  0x10, 0xa9, 0x03, 0xa6, 0xd0, 0xab, 0x1c, 0x4c },
++      .expected_ss = (u8[32]){ 0xc3, 0xda, 0x55, 0x37, 0x9d, 0xe9, 0xc6, 0x90,
++                  0x8e, 0x94, 0xea, 0x4d, 0xf2, 0x8d, 0x08, 0x4f,
++                  0x32, 0xec, 0xcf, 0x03, 0x49, 0x1c, 0x71, 0xf7,
++                  0x54, 0xb4, 0x07, 0x55, 0x77, 0xa2, 0x85, 0x52 },
++      .secret_size = 32,
++      .b_public_size = 32,
++      .expected_ss_size = 32,
++
++},
++/* wycheproof - RFC 7748 */
++{
++      .secret = (u8[32]){ 0x48, 0x66, 0xe9, 0xd4, 0xd1, 0xb4, 0x67, 0x3c,
++                   0x5a, 0xd2, 0x26, 0x91, 0x95, 0x7d, 0x6a, 0xf5,
++                   0xc1, 0x1b, 0x64, 0x21, 0xe0, 0xea, 0x01, 0xd4,
++                   0x2c, 0xa4, 0x16, 0x9e, 0x79, 0x18, 0xba, 0x4d },
++      .b_public = (u8[32]){ 0xe5, 0x21, 0x0f, 0x12, 0x78, 0x68, 0x11, 0xd3,
++                  0xf4, 0xb7, 0x95, 0x9d, 0x05, 0x38, 0xae, 0x2c,
++                  0x31, 0xdb, 0xe7, 0x10, 0x6f, 0xc0, 0x3c, 0x3e,
++                  0xfc, 0x4c, 0xd5, 0x49, 0xc7, 0x15, 0xa4, 0x13 },
++      .expected_ss = (u8[32]){ 0x95, 0xcb, 0xde, 0x94, 0x76, 0xe8, 0x90, 0x7d,
++                  0x7a, 0xad, 0xe4, 0x5c, 0xb4, 0xb8, 0x73, 0xf8,
++                  0x8b, 0x59, 0x5a, 0x68, 0x79, 0x9f, 0xa1, 0x52,
++                  0xe6, 0xf8, 0xf7, 0x64, 0x7a, 0xac, 0x79, 0x57 },
++      .secret_size = 32,
++      .b_public_size = 32,
++      .expected_ss_size = 32,
++
++},
++/* wycheproof - edge case for shared secret */
++{
++      .secret = (u8[32]){ 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4,
++                   0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3,
++                   0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc,
++                   0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 },
++      .b_public = (u8[32]){ 0x0a, 0xb4, 0xe7, 0x63, 0x80, 0xd8, 0x4d, 0xde,
++                  0x4f, 0x68, 0x33, 0xc5, 0x8f, 0x2a, 0x9f, 0xb8,
++                  0xf8, 0x3b, 0xb0, 0x16, 0x9b, 0x17, 0x2b, 0xe4,
++                  0xb6, 0xe0, 0x59, 0x28, 0x87, 0x74, 0x1a, 0x36 },
++      .expected_ss = (u8[32]){ 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
++      .secret_size = 32,
++      .b_public_size = 32,
++      .expected_ss_size = 32,
++
++},
++/* wycheproof - edge case for shared secret */
++{
++      .secret = (u8[32]){ 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4,
++                   0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3,
++                   0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc,
++                   0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 },
++      .b_public = (u8[32]){ 0x89, 0xe1, 0x0d, 0x57, 0x01, 0xb4, 0x33, 0x7d,
++                  0x2d, 0x03, 0x21, 0x81, 0x53, 0x8b, 0x10, 0x64,
++                  0xbd, 0x40, 0x84, 0x40, 0x1c, 0xec, 0xa1, 0xfd,
++                  0x12, 0x66, 0x3a, 0x19, 0x59, 0x38, 0x80, 0x00 },
++      .expected_ss = (u8[32]){ 0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
++      .secret_size = 32,
++      .b_public_size = 32,
++      .expected_ss_size = 32,
++
++},
++/* wycheproof - edge case for shared secret */
++{
++      .secret = (u8[32]){ 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4,
++                   0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3,
++                   0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc,
++                   0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 },
++      .b_public = (u8[32]){ 0x2b, 0x55, 0xd3, 0xaa, 0x4a, 0x8f, 0x80, 0xc8,
++                  0xc0, 0xb2, 0xae, 0x5f, 0x93, 0x3e, 0x85, 0xaf,
++                  0x49, 0xbe, 0xac, 0x36, 0xc2, 0xfa, 0x73, 0x94,
++                  0xba, 0xb7, 0x6c, 0x89, 0x33, 0xf8, 0xf8, 0x1d },
++      .expected_ss = (u8[32]){ 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
++      .secret_size = 32,
++      .b_public_size = 32,
++      .expected_ss_size = 32,
++
++},
++/* wycheproof - edge case for shared secret */
++{
++      .secret = (u8[32]){ 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4,
++                   0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3,
++                   0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc,
++                   0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 },
++      .b_public = (u8[32]){ 0x63, 0xe5, 0xb1, 0xfe, 0x96, 0x01, 0xfe, 0x84,
++                  0x38, 0x5d, 0x88, 0x66, 0xb0, 0x42, 0x12, 0x62,
++                  0xf7, 0x8f, 0xbf, 0xa5, 0xaf, 0xf9, 0x58, 0x5e,
++                  0x62, 0x66, 0x79, 0xb1, 0x85, 0x47, 0xd9, 0x59 },
++      .expected_ss = (u8[32]){ 0xfe, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x3f },
++      .secret_size = 32,
++      .b_public_size = 32,
++      .expected_ss_size = 32,
++
++},
++/* wycheproof - edge case for shared secret */
++{
++      .secret = (u8[32]){ 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4,
++                   0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3,
++                   0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc,
++                   0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 },
++      .b_public = (u8[32]){ 0xe4, 0x28, 0xf3, 0xda, 0xc1, 0x78, 0x09, 0xf8,
++                  0x27, 0xa5, 0x22, 0xce, 0x32, 0x35, 0x50, 0x58,
++                  0xd0, 0x73, 0x69, 0x36, 0x4a, 0xa7, 0x89, 0x02,
++                  0xee, 0x10, 0x13, 0x9b, 0x9f, 0x9d, 0xd6, 0x53 },
++      .expected_ss = (u8[32]){ 0xfc, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x3f },
++      .secret_size = 32,
++      .b_public_size = 32,
++      .expected_ss_size = 32,
++
++},
++/* wycheproof - edge case for shared secret */
++{
++      .secret = (u8[32]){ 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4,
++                   0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3,
++                   0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc,
++                   0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 },
++      .b_public = (u8[32]){ 0xb3, 0xb5, 0x0e, 0x3e, 0xd3, 0xa4, 0x07, 0xb9,
++                  0x5d, 0xe9, 0x42, 0xef, 0x74, 0x57, 0x5b, 0x5a,
++                  0xb8, 0xa1, 0x0c, 0x09, 0xee, 0x10, 0x35, 0x44,
++                  0xd6, 0x0b, 0xdf, 0xed, 0x81, 0x38, 0xab, 0x2b },
++      .expected_ss = (u8[32]){ 0xf9, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x3f },
++      .secret_size = 32,
++      .b_public_size = 32,
++      .expected_ss_size = 32,
++
++},
++/* wycheproof - edge case for shared secret */
++{
++      .secret = (u8[32]){ 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4,
++                   0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3,
++                   0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc,
++                   0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 },
++      .b_public = (u8[32]){ 0x21, 0x3f, 0xff, 0xe9, 0x3d, 0x5e, 0xa8, 0xcd,
++                  0x24, 0x2e, 0x46, 0x28, 0x44, 0x02, 0x99, 0x22,
++                  0xc4, 0x3c, 0x77, 0xc9, 0xe3, 0xe4, 0x2f, 0x56,
++                  0x2f, 0x48, 0x5d, 0x24, 0xc5, 0x01, 0xa2, 0x0b },
++      .expected_ss = (u8[32]){ 0xf3, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x3f },
++      .secret_size = 32,
++      .b_public_size = 32,
++      .expected_ss_size = 32,
++
++},
++/* wycheproof - edge case for shared secret */
++{
++      .secret = (u8[32]){ 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4,
++                   0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3,
++                   0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc,
++                   0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 },
++      .b_public = (u8[32]){ 0x91, 0xb2, 0x32, 0xa1, 0x78, 0xb3, 0xcd, 0x53,
++                  0x09, 0x32, 0x44, 0x1e, 0x61, 0x39, 0x41, 0x8f,
++                  0x72, 0x17, 0x22, 0x92, 0xf1, 0xda, 0x4c, 0x18,
++                  0x34, 0xfc, 0x5e, 0xbf, 0xef, 0xb5, 0x1e, 0x3f },
++      .expected_ss = (u8[32]){ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x03 },
++      .secret_size = 32,
++      .b_public_size = 32,
++      .expected_ss_size = 32,
++
++},
++/* wycheproof - edge case for shared secret */
++{
++      .secret = (u8[32]){ 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4,
++                   0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3,
++                   0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc,
++                   0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 },
++      .b_public = (u8[32]){ 0x04, 0x5c, 0x6e, 0x11, 0xc5, 0xd3, 0x32, 0x55,
++                  0x6c, 0x78, 0x22, 0xfe, 0x94, 0xeb, 0xf8, 0x9b,
++                  0x56, 0xa3, 0x87, 0x8d, 0xc2, 0x7c, 0xa0, 0x79,
++                  0x10, 0x30, 0x58, 0x84, 0x9f, 0xab, 0xcb, 0x4f },
++      .expected_ss = (u8[32]){ 0xe5, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f },
++      .secret_size = 32,
++      .b_public_size = 32,
++      .expected_ss_size = 32,
++
++},
++/* wycheproof - edge case for shared secret */
++{
++      .secret = (u8[32]){ 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4,
++                   0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3,
++                   0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc,
++                   0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 },
++      .b_public = (u8[32]){ 0x1c, 0xa2, 0x19, 0x0b, 0x71, 0x16, 0x35, 0x39,
++                  0x06, 0x3c, 0x35, 0x77, 0x3b, 0xda, 0x0c, 0x9c,
++                  0x92, 0x8e, 0x91, 0x36, 0xf0, 0x62, 0x0a, 0xeb,
++                  0x09, 0x3f, 0x09, 0x91, 0x97, 0xb7, 0xf7, 0x4e },
++      .expected_ss = (u8[32]){ 0xe3, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f },
++      .secret_size = 32,
++      .b_public_size = 32,
++      .expected_ss_size = 32,
++
++},
++/* wycheproof - edge case for shared secret */
++{
++      .secret = (u8[32]){ 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4,
++                   0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3,
++                   0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc,
++                   0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 },
++      .b_public = (u8[32]){ 0xf7, 0x6e, 0x90, 0x10, 0xac, 0x33, 0xc5, 0x04,
++                  0x3b, 0x2d, 0x3b, 0x76, 0xa8, 0x42, 0x17, 0x10,
++                  0x00, 0xc4, 0x91, 0x62, 0x22, 0xe9, 0xe8, 0x58,
++                  0x97, 0xa0, 0xae, 0xc7, 0xf6, 0x35, 0x0b, 0x3c },
++      .expected_ss = (u8[32]){ 0xdd, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f },
++      .secret_size = 32,
++      .b_public_size = 32,
++      .expected_ss_size = 32,
++
++},
++/* wycheproof - edge case for shared secret */
++{
++      .secret = (u8[32]){ 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4,
++                   0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3,
++                   0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc,
++                   0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 },
++      .b_public = (u8[32]){ 0xbb, 0x72, 0x68, 0x8d, 0x8f, 0x8a, 0xa7, 0xa3,
++                  0x9c, 0xd6, 0x06, 0x0c, 0xd5, 0xc8, 0x09, 0x3c,
++                  0xde, 0xc6, 0xfe, 0x34, 0x19, 0x37, 0xc3, 0x88,
++                  0x6a, 0x99, 0x34, 0x6c, 0xd0, 0x7f, 0xaa, 0x55 },
++      .expected_ss = (u8[32]){ 0xdb, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f },
++      .secret_size = 32,
++      .b_public_size = 32,
++      .expected_ss_size = 32,
++
++},
++/* wycheproof - edge case for shared secret */
++{
++      .secret = (u8[32]){ 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4,
++                   0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3,
++                   0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc,
++                   0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 },
++      .b_public = (u8[32]){ 0x88, 0xfd, 0xde, 0xa1, 0x93, 0x39, 0x1c, 0x6a,
++                  0x59, 0x33, 0xef, 0x9b, 0x71, 0x90, 0x15, 0x49,
++                  0x44, 0x72, 0x05, 0xaa, 0xe9, 0xda, 0x92, 0x8a,
++                  0x6b, 0x91, 0xa3, 0x52, 0xba, 0x10, 0xf4, 0x1f },
++      .expected_ss = (u8[32]){ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02 },
++      .secret_size = 32,
++      .b_public_size = 32,
++      .expected_ss_size = 32,
++
++},
++/* wycheproof - edge case for shared secret */
++{
++      .secret = (u8[32]){ 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4,
++                   0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3,
++                   0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc,
++                   0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 },
++      .b_public = (u8[32]){ 0x30, 0x3b, 0x39, 0x2f, 0x15, 0x31, 0x16, 0xca,
++                  0xd9, 0xcc, 0x68, 0x2a, 0x00, 0xcc, 0xc4, 0x4c,
++                  0x95, 0xff, 0x0d, 0x3b, 0xbe, 0x56, 0x8b, 0xeb,
++                  0x6c, 0x4e, 0x73, 0x9b, 0xaf, 0xdc, 0x2c, 0x68 },
++      .expected_ss = (u8[32]){ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00 },
++      .secret_size = 32,
++      .b_public_size = 32,
++      .expected_ss_size = 32,
++
++},
++/* wycheproof - checking for overflow */
++{
++      .secret = (u8[32]){ 0xc8, 0x17, 0x24, 0x70, 0x40, 0x00, 0xb2, 0x6d,
++                   0x31, 0x70, 0x3c, 0xc9, 0x7e, 0x3a, 0x37, 0x8d,
++                   0x56, 0xfa, 0xd8, 0x21, 0x93, 0x61, 0xc8, 0x8c,
++                   0xca, 0x8b, 0xd7, 0xc5, 0x71, 0x9b, 0x12, 0xb2 },
++      .b_public = (u8[32]){ 0xfd, 0x30, 0x0a, 0xeb, 0x40, 0xe1, 0xfa, 0x58,
++                  0x25, 0x18, 0x41, 0x2b, 0x49, 0xb2, 0x08, 0xa7,
++                  0x84, 0x2b, 0x1e, 0x1f, 0x05, 0x6a, 0x04, 0x01,
++                  0x78, 0xea, 0x41, 0x41, 0x53, 0x4f, 0x65, 0x2d },
++      .expected_ss = (u8[32]){ 0xb7, 0x34, 0x10, 0x5d, 0xc2, 0x57, 0x58, 0x5d,
++                  0x73, 0xb5, 0x66, 0xcc, 0xb7, 0x6f, 0x06, 0x27,
++                  0x95, 0xcc, 0xbe, 0xc8, 0x91, 0x28, 0xe5, 0x2b,
++                  0x02, 0xf3, 0xe5, 0x96, 0x39, 0xf1, 0x3c, 0x46 },
++      .secret_size = 32,
++      .b_public_size = 32,
++      .expected_ss_size = 32,
++
++},
++/* wycheproof - checking for overflow */
++{
++      .secret = (u8[32]){ 0xc8, 0x17, 0x24, 0x70, 0x40, 0x00, 0xb2, 0x6d,
++                   0x31, 0x70, 0x3c, 0xc9, 0x7e, 0x3a, 0x37, 0x8d,
++                   0x56, 0xfa, 0xd8, 0x21, 0x93, 0x61, 0xc8, 0x8c,
++                   0xca, 0x8b, 0xd7, 0xc5, 0x71, 0x9b, 0x12, 0xb2 },
++      .b_public = (u8[32]){ 0xc8, 0xef, 0x79, 0xb5, 0x14, 0xd7, 0x68, 0x26,
++                  0x77, 0xbc, 0x79, 0x31, 0xe0, 0x6e, 0xe5, 0xc2,
++                  0x7c, 0x9b, 0x39, 0x2b, 0x4a, 0xe9, 0x48, 0x44,
++                  0x73, 0xf5, 0x54, 0xe6, 0x67, 0x8e, 0xcc, 0x2e },
++      .expected_ss = (u8[32]){ 0x64, 0x7a, 0x46, 0xb6, 0xfc, 0x3f, 0x40, 0xd6,
++                  0x21, 0x41, 0xee, 0x3c, 0xee, 0x70, 0x6b, 0x4d,
++                  0x7a, 0x92, 0x71, 0x59, 0x3a, 0x7b, 0x14, 0x3e,
++                  0x8e, 0x2e, 0x22, 0x79, 0x88, 0x3e, 0x45, 0x50 },
++      .secret_size = 32,
++      .b_public_size = 32,
++      .expected_ss_size = 32,
++
++},
++/* wycheproof - checking for overflow */
++{
++      .secret = (u8[32]){ 0xc8, 0x17, 0x24, 0x70, 0x40, 0x00, 0xb2, 0x6d,
++                   0x31, 0x70, 0x3c, 0xc9, 0x7e, 0x3a, 0x37, 0x8d,
++                   0x56, 0xfa, 0xd8, 0x21, 0x93, 0x61, 0xc8, 0x8c,
++                   0xca, 0x8b, 0xd7, 0xc5, 0x71, 0x9b, 0x12, 0xb2 },
++      .b_public = (u8[32]){ 0x64, 0xae, 0xac, 0x25, 0x04, 0x14, 0x48, 0x61,
++                  0x53, 0x2b, 0x7b, 0xbc, 0xb6, 0xc8, 0x7d, 0x67,
++                  0xdd, 0x4c, 0x1f, 0x07, 0xeb, 0xc2, 0xe0, 0x6e,
++                  0xff, 0xb9, 0x5a, 0xec, 0xc6, 0x17, 0x0b, 0x2c },
++      .expected_ss = (u8[32]){ 0x4f, 0xf0, 0x3d, 0x5f, 0xb4, 0x3c, 0xd8, 0x65,
++                  0x7a, 0x3c, 0xf3, 0x7c, 0x13, 0x8c, 0xad, 0xce,
++                  0xcc, 0xe5, 0x09, 0xe4, 0xeb, 0xa0, 0x89, 0xd0,
++                  0xef, 0x40, 0xb4, 0xe4, 0xfb, 0x94, 0x61, 0x55 },
++      .secret_size = 32,
++      .b_public_size = 32,
++      .expected_ss_size = 32,
++
++},
++/* wycheproof - checking for overflow */
++{
++      .secret = (u8[32]){ 0xc8, 0x17, 0x24, 0x70, 0x40, 0x00, 0xb2, 0x6d,
++                   0x31, 0x70, 0x3c, 0xc9, 0x7e, 0x3a, 0x37, 0x8d,
++                   0x56, 0xfa, 0xd8, 0x21, 0x93, 0x61, 0xc8, 0x8c,
++                   0xca, 0x8b, 0xd7, 0xc5, 0x71, 0x9b, 0x12, 0xb2 },
++      .b_public = (u8[32]){ 0xbf, 0x68, 0xe3, 0x5e, 0x9b, 0xdb, 0x7e, 0xee,
++                  0x1b, 0x50, 0x57, 0x02, 0x21, 0x86, 0x0f, 0x5d,
++                  0xcd, 0xad, 0x8a, 0xcb, 0xab, 0x03, 0x1b, 0x14,
++                  0x97, 0x4c, 0xc4, 0x90, 0x13, 0xc4, 0x98, 0x31 },
++      .expected_ss = (u8[32]){ 0x21, 0xce, 0xe5, 0x2e, 0xfd, 0xbc, 0x81, 0x2e,
++                  0x1d, 0x02, 0x1a, 0x4a, 0xf1, 0xe1, 0xd8, 0xbc,
++                  0x4d, 0xb3, 0xc4, 0x00, 0xe4, 0xd2, 0xa2, 0xc5,
++                  0x6a, 0x39, 0x26, 0xdb, 0x4d, 0x99, 0xc6, 0x5b },
++      .secret_size = 32,
++      .b_public_size = 32,
++      .expected_ss_size = 32,
++
++},
++/* wycheproof - checking for overflow */
++{
++      .secret = (u8[32]){ 0xc8, 0x17, 0x24, 0x70, 0x40, 0x00, 0xb2, 0x6d,
++                   0x31, 0x70, 0x3c, 0xc9, 0x7e, 0x3a, 0x37, 0x8d,
++                   0x56, 0xfa, 0xd8, 0x21, 0x93, 0x61, 0xc8, 0x8c,
++                   0xca, 0x8b, 0xd7, 0xc5, 0x71, 0x9b, 0x12, 0xb2 },
++      .b_public = (u8[32]){ 0x53, 0x47, 0xc4, 0x91, 0x33, 0x1a, 0x64, 0xb4,
++                  0x3d, 0xdc, 0x68, 0x30, 0x34, 0xe6, 0x77, 0xf5,
++                  0x3d, 0xc3, 0x2b, 0x52, 0xa5, 0x2a, 0x57, 0x7c,
++                  0x15, 0xa8, 0x3b, 0xf2, 0x98, 0xe9, 0x9f, 0x19 },
++      .expected_ss = (u8[32]){ 0x18, 0xcb, 0x89, 0xe4, 0xe2, 0x0c, 0x0c, 0x2b,
++                  0xd3, 0x24, 0x30, 0x52, 0x45, 0x26, 0x6c, 0x93,
++                  0x27, 0x69, 0x0b, 0xbe, 0x79, 0xac, 0xb8, 0x8f,
++                  0x5b, 0x8f, 0xb3, 0xf7, 0x4e, 0xca, 0x3e, 0x52 },
++      .secret_size = 32,
++      .b_public_size = 32,
++      .expected_ss_size = 32,
++
++},
++/* wycheproof - private key == -1 (mod order) */
++{
++      .secret = (u8[32]){ 0xa0, 0x23, 0xcd, 0xd0, 0x83, 0xef, 0x5b, 0xb8,
++                   0x2f, 0x10, 0xd6, 0x2e, 0x59, 0xe1, 0x5a, 0x68,
++                   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x50 },
++      .b_public = (u8[32]){ 0x25, 0x8e, 0x04, 0x52, 0x3b, 0x8d, 0x25, 0x3e,
++                  0xe6, 0x57, 0x19, 0xfc, 0x69, 0x06, 0xc6, 0x57,
++                  0x19, 0x2d, 0x80, 0x71, 0x7e, 0xdc, 0x82, 0x8f,
++                  0xa0, 0xaf, 0x21, 0x68, 0x6e, 0x2f, 0xaa, 0x75 },
++      .expected_ss = (u8[32]){ 0x25, 0x8e, 0x04, 0x52, 0x3b, 0x8d, 0x25, 0x3e,
++                  0xe6, 0x57, 0x19, 0xfc, 0x69, 0x06, 0xc6, 0x57,
++                  0x19, 0x2d, 0x80, 0x71, 0x7e, 0xdc, 0x82, 0x8f,
++                  0xa0, 0xaf, 0x21, 0x68, 0x6e, 0x2f, 0xaa, 0x75 },
++      .secret_size = 32,
++      .b_public_size = 32,
++      .expected_ss_size = 32,
++
++},
++/* wycheproof - private key == 1 (mod order) on twist */
++{
++      .secret = (u8[32]){ 0x58, 0x08, 0x3d, 0xd2, 0x61, 0xad, 0x91, 0xef,
++                   0xf9, 0x52, 0x32, 0x2e, 0xc8, 0x24, 0xc6, 0x82,
++                   0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                   0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x5f },
++      .b_public = (u8[32]){ 0x2e, 0xae, 0x5e, 0xc3, 0xdd, 0x49, 0x4e, 0x9f,
++                  0x2d, 0x37, 0xd2, 0x58, 0xf8, 0x73, 0xa8, 0xe6,
++                  0xe9, 0xd0, 0xdb, 0xd1, 0xe3, 0x83, 0xef, 0x64,
++                  0xd9, 0x8b, 0xb9, 0x1b, 0x3e, 0x0b, 0xe0, 0x35 },
++      .expected_ss = (u8[32]){ 0x2e, 0xae, 0x5e, 0xc3, 0xdd, 0x49, 0x4e, 0x9f,
++                  0x2d, 0x37, 0xd2, 0x58, 0xf8, 0x73, 0xa8, 0xe6,
++                  0xe9, 0xd0, 0xdb, 0xd1, 0xe3, 0x83, 0xef, 0x64,
++                  0xd9, 0x8b, 0xb9, 0x1b, 0x3e, 0x0b, 0xe0, 0x35 },
++      .secret_size = 32,
++      .b_public_size = 32,
++      .expected_ss_size = 32,
++
++}
++};
++
+ static const struct kpp_testvec ecdh_tv_template[] = {
+       {
+ #ifndef CONFIG_CRYPTO_FIPS
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0027-crypto-curve25519-implement-generic-KPP-driver.patch b/target/linux/generic/backport-5.4/080-wireguard-0027-crypto-curve25519-implement-generic-KPP-driver.patch

new file mode 100644 (file)

index 0000000..2d6de58
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0027-crypto-curve25519-implement-generic-KPP-driver.patch
@@ -0,0 +1,136 @@
+From 54bdc995d525de6ae20f74af36d079f8b79e52fa Mon Sep 17 00:00:00 2001
+From: Ard Biesheuvel <ardb@kernel.org>
+Date: Fri, 8 Nov 2019 13:22:34 +0100
+Subject: [PATCH 027/124] crypto: curve25519 - implement generic KPP driver
+
+commit ee772cb641135739c1530647391d5a04c39db192 upstream.
+
+Expose the generic Curve25519 library via the crypto API KPP interface.
+
+Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
+Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ crypto/Kconfig              |  5 +++
+ crypto/Makefile             |  1 +
+ crypto/curve25519-generic.c | 90 +++++++++++++++++++++++++++++++++++++
+ 3 files changed, 96 insertions(+)
+ create mode 100644 crypto/curve25519-generic.c
+
+--- a/crypto/Kconfig
++++ b/crypto/Kconfig
+@@ -264,6 +264,11 @@ config CRYPTO_ECRDSA
+         standard algorithms (called GOST algorithms). Only signature verification
+         is implemented.
+ 
++config CRYPTO_CURVE25519
++      tristate "Curve25519 algorithm"
++      select CRYPTO_KPP
++      select CRYPTO_LIB_CURVE25519_GENERIC
++
+ comment "Authenticated Encryption with Associated Data"
+ 
+ config CRYPTO_CCM
+--- a/crypto/Makefile
++++ b/crypto/Makefile
+@@ -167,6 +167,7 @@ obj-$(CONFIG_CRYPTO_ZSTD) += zstd.o
+ obj-$(CONFIG_CRYPTO_OFB) += ofb.o
+ obj-$(CONFIG_CRYPTO_ECC) += ecc.o
+ obj-$(CONFIG_CRYPTO_ESSIV) += essiv.o
++obj-$(CONFIG_CRYPTO_CURVE25519) += curve25519-generic.o
+ 
+ ecdh_generic-y += ecdh.o
+ ecdh_generic-y += ecdh_helper.o
+--- /dev/null
++++ b/crypto/curve25519-generic.c
+@@ -0,0 +1,90 @@
++// SPDX-License-Identifier: GPL-2.0-or-later
++
++#include <crypto/curve25519.h>
++#include <crypto/internal/kpp.h>
++#include <crypto/kpp.h>
++#include <linux/module.h>
++#include <linux/scatterlist.h>
++
++static int curve25519_set_secret(struct crypto_kpp *tfm, const void *buf,
++                               unsigned int len)
++{
++      u8 *secret = kpp_tfm_ctx(tfm);
++
++      if (!len) /* no key supplied: generate one into the tfm context */
++              curve25519_generate_secret(secret);
++      else if (len == CURVE25519_KEY_SIZE &&
++               crypto_memneq(buf, curve25519_null_point, CURVE25519_KEY_SIZE))
++              memcpy(secret, buf, CURVE25519_KEY_SIZE);
++      else /* wrong length, or buf equals curve25519_null_point */
++              return -EINVAL;
++      return 0;
++}
++
++static int curve25519_compute_value(struct kpp_request *req)
++{
++      struct crypto_kpp *tfm = crypto_kpp_reqtfm(req);
++      const u8 *secret = kpp_tfm_ctx(tfm);
++      u8 public_key[CURVE25519_KEY_SIZE];
++      u8 buf[CURVE25519_KEY_SIZE];
++      int copied, nbytes;
++      u8 const *bp;
++
++      if (req->src) { /* input point supplied by the caller */
++              copied = sg_copy_to_buffer(req->src,
++                                         sg_nents_for_len(req->src,
++                                                          CURVE25519_KEY_SIZE),
++                                         public_key, CURVE25519_KEY_SIZE);
++              if (copied != CURVE25519_KEY_SIZE) /* short read from src */
++                      return -EINVAL;
++              bp = public_key;
++      } else { /* no input: fall back to curve25519_base_point */
++              bp = curve25519_base_point;
++      }
++
++      curve25519_generic(buf, secret, bp);
++
++      /* might want less than we've got: copy out at most req->dst_len bytes */
++      nbytes = min_t(size_t, CURVE25519_KEY_SIZE, req->dst_len);
++      copied = sg_copy_from_buffer(req->dst, sg_nents_for_len(req->dst,
++                                                              nbytes),
++                                   buf, nbytes);
++      if (copied != nbytes)
++              return -EINVAL;
++      return 0;
++}
++
++static unsigned int curve25519_max_size(struct crypto_kpp *tfm)
++{
++      return CURVE25519_KEY_SIZE; /* constant; tfm is not consulted */
++}
++
++static struct kpp_alg curve25519_alg = {
++      .base.cra_name          = "curve25519",
++      .base.cra_driver_name   = "curve25519-generic",
++      .base.cra_priority      = 100,
++      .base.cra_module        = THIS_MODULE,
++      .base.cra_ctxsize       = CURVE25519_KEY_SIZE,
++      /* one handler serves both public-key and shared-secret requests */
++      .set_secret             = curve25519_set_secret,
++      .generate_public_key    = curve25519_compute_value,
++      .compute_shared_secret  = curve25519_compute_value,
++      .max_size               = curve25519_max_size,
++};
++
++static int curve25519_init(void)
++{
++      return crypto_register_kpp(&curve25519_alg); /* result passed through */
++}
++
++static void curve25519_exit(void)
++{
++      crypto_unregister_kpp(&curve25519_alg);
++}
++
++subsys_initcall(curve25519_init); /* hooked at subsys initcall level */
++module_exit(curve25519_exit);
++
++MODULE_ALIAS_CRYPTO("curve25519");
++MODULE_ALIAS_CRYPTO("curve25519-generic");
++MODULE_LICENSE("GPL");
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0028-crypto-lib-curve25519-work-around-Clang-stack-spilli.patch b/target/linux/generic/backport-5.4/080-wireguard-0028-crypto-lib-curve25519-work-around-Clang-stack-spilli.patch

new file mode 100644 (file)

index 0000000..b38f3f7
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0028-crypto-lib-curve25519-work-around-Clang-stack-spilli.patch
@@ -0,0 +1,75 @@
+From 3c710fa0cdbf9362df4e3b36be338779662b30a6 Mon Sep 17 00:00:00 2001
+From: Ard Biesheuvel <ardb@kernel.org>
+Date: Fri, 8 Nov 2019 13:22:35 +0100
+Subject: [PATCH 028/124] crypto: lib/curve25519 - work around Clang stack
+ spilling issue
+
+commit 660bb8e1f833ea63185fe80fde847e3e42f18e3b upstream.
+
+Arnd reports that the 32-bit generic library code for Curve25519 ends
+up using an excessive amount of stack space when built with Clang:
+
+  lib/crypto/curve25519-fiat32.c:756:6: error: stack frame size
+      of 1384 bytes in function 'curve25519_generic'
+      [-Werror,-Wframe-larger-than=]
+
+Let's give some hints to the compiler regarding which routines should
+not be inlined, to prevent it from running out of registers and spilling
+to the stack. The resulting code performs identically under both GCC
+and Clang, and makes the warning go away.
+
+Suggested-by: Arnd Bergmann <arnd@arndb.de>
+Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
+Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ lib/crypto/curve25519-fiat32.c | 10 +++++-----
+ 1 file changed, 5 insertions(+), 5 deletions(-)
+
+--- a/lib/crypto/curve25519-fiat32.c
++++ b/lib/crypto/curve25519-fiat32.c
+@@ -223,7 +223,7 @@ static __always_inline void fe_1(fe *h)
+       h->v[0] = 1;
+ }
+ 
+-static void fe_add_impl(u32 out[10], const u32 in1[10], const u32 in2[10])
++static noinline void fe_add_impl(u32 out[10], const u32 in1[10], const u32 in2[10])
+ {
+       { const u32 x20 = in1[9];
+       { const u32 x21 = in1[8];
+@@ -266,7 +266,7 @@ static __always_inline void fe_add(fe_lo
+       fe_add_impl(h->v, f->v, g->v);
+ }
+ 
+-static void fe_sub_impl(u32 out[10], const u32 in1[10], const u32 in2[10])
++static noinline void fe_sub_impl(u32 out[10], const u32 in1[10], const u32 in2[10])
+ {
+       { const u32 x20 = in1[9];
+       { const u32 x21 = in1[8];
+@@ -309,7 +309,7 @@ static __always_inline void fe_sub(fe_lo
+       fe_sub_impl(h->v, f->v, g->v);
+ }
+ 
+-static void fe_mul_impl(u32 out[10], const u32 in1[10], const u32 in2[10])
++static noinline void fe_mul_impl(u32 out[10], const u32 in1[10], const u32 in2[10])
+ {
+       { const u32 x20 = in1[9];
+       { const u32 x21 = in1[8];
+@@ -441,7 +441,7 @@ fe_mul_tll(fe *h, const fe_loose *f, con
+       fe_mul_impl(h->v, f->v, g->v);
+ }
+ 
+-static void fe_sqr_impl(u32 out[10], const u32 in1[10])
++static noinline void fe_sqr_impl(u32 out[10], const u32 in1[10])
+ {
+       { const u32 x17 = in1[9];
+       { const u32 x18 = in1[8];
+@@ -619,7 +619,7 @@ static __always_inline void fe_invert(fe
+  *
+  * Preconditions: b in {0,1}
+  */
+-static __always_inline void fe_cswap(fe *f, fe *g, unsigned int b)
++static noinline void fe_cswap(fe *f, fe *g, unsigned int b)
+ {
+       unsigned i;
+       b = 0 - b;
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0029-crypto-curve25519-x86_64-library-and-KPP-implementat.patch b/target/linux/generic/backport-5.4/080-wireguard-0029-crypto-curve25519-x86_64-library-and-KPP-implementat.patch

new file mode 100644 (file)

index 0000000..fd06cb1
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0029-crypto-curve25519-x86_64-library-and-KPP-implementat.patch
@@ -0,0 +1,2537 @@
+From 0195e7650ebe0fdb5e1d5891274c203cb6cee0b6 Mon Sep 17 00:00:00 2001
+From: "Jason A. Donenfeld" <Jason@zx2c4.com>
+Date: Fri, 8 Nov 2019 13:22:36 +0100
+Subject: [PATCH 029/124] crypto: curve25519 - x86_64 library and KPP
+ implementations
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+commit bb611bdfd6be34d9f822c73305fcc83720499d38 upstream.
+
+This implementation is the fastest available x86_64 implementation, and
+unlike Sandy2x, it doesn't require use of the floating point registers at
+all. Instead it makes use of BMI2 and ADX, available on recent
+microarchitectures. The implementation was written by Armando
+Faz-Hernández with contributions (upstream) from Samuel Neves and me,
+in addition to further changes in the kernel implementation from us.
+
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Signed-off-by: Samuel Neves <sneves@dei.uc.pt>
+Co-developed-by: Samuel Neves <sneves@dei.uc.pt>
+[ardb: - move to arch/x86/crypto
+       - wire into lib/crypto framework
+       - implement crypto API KPP hooks ]
+Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
+Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ arch/x86/crypto/Makefile            |    1 +
+ arch/x86/crypto/curve25519-x86_64.c | 2475 +++++++++++++++++++++++++++
+ crypto/Kconfig                      |    6 +
+ 3 files changed, 2482 insertions(+)
+ create mode 100644 arch/x86/crypto/curve25519-x86_64.c
+
+--- a/arch/x86/crypto/Makefile
++++ b/arch/x86/crypto/Makefile
+@@ -39,6 +39,7 @@ obj-$(CONFIG_CRYPTO_AEGIS128_AESNI_SSE2)
+ 
+ obj-$(CONFIG_CRYPTO_NHPOLY1305_SSE2) += nhpoly1305-sse2.o
+ obj-$(CONFIG_CRYPTO_NHPOLY1305_AVX2) += nhpoly1305-avx2.o
++obj-$(CONFIG_CRYPTO_CURVE25519_X86) += curve25519-x86_64.o
+ 
+ # These modules require assembler to support AVX.
+ ifeq ($(avx_supported),yes)
+--- /dev/null
++++ b/arch/x86/crypto/curve25519-x86_64.c
+@@ -0,0 +1,2475 @@
++// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
++/*
++ * Copyright (c) 2017 Armando Faz <armfazh@ic.unicamp.br>. All Rights Reserved.
++ * Copyright (C) 2018-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
++ * Copyright (C) 2018 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
++ */
++
++#include <crypto/curve25519.h>
++#include <crypto/internal/kpp.h>
++
++#include <linux/types.h>
++#include <linux/jump_label.h>
++#include <linux/kernel.h>
++#include <linux/module.h>
++
++#include <asm/cpufeature.h>
++#include <asm/processor.h>
++
++static __ro_after_init DEFINE_STATIC_KEY_FALSE(curve25519_use_bmi2);
++static __ro_after_init DEFINE_STATIC_KEY_FALSE(curve25519_use_adx);
++
++enum { NUM_WORDS_ELTFP25519 = 4 };
++typedef __aligned(32) u64 eltfp25519_1w[NUM_WORDS_ELTFP25519];
++typedef __aligned(32) u64 eltfp25519_1w_buffer[2 * NUM_WORDS_ELTFP25519];
++/* NB: the mul_/sqr_ macros below use 'm.buffer' from the enclosing scope. */
++#define mul_eltfp25519_1w_adx(c, a, b) do { \
++      mul_256x256_integer_adx(m.buffer, a, b); \
++      red_eltfp25519_1w_adx(c, m.buffer); \
++} while (0)
++
++#define mul_eltfp25519_1w_bmi2(c, a, b) do { \
++      mul_256x256_integer_bmi2(m.buffer, a, b); \
++      red_eltfp25519_1w_bmi2(c, m.buffer); \
++} while (0)
++
++#define sqr_eltfp25519_1w_adx(a) do { \
++      sqr_256x256_integer_adx(m.buffer, a); \
++      red_eltfp25519_1w_adx(a, m.buffer); \
++} while (0)
++
++#define sqr_eltfp25519_1w_bmi2(a) do { \
++      sqr_256x256_integer_bmi2(m.buffer, a); \
++      red_eltfp25519_1w_bmi2(a, m.buffer); \
++} while (0)
++
++#define mul_eltfp25519_2w_adx(c, a, b) do { \
++      mul2_256x256_integer_adx(m.buffer, a, b); \
++      red_eltfp25519_2w_adx(c, m.buffer); \
++} while (0)
++
++#define mul_eltfp25519_2w_bmi2(c, a, b) do { \
++      mul2_256x256_integer_bmi2(m.buffer, a, b); \
++      red_eltfp25519_2w_bmi2(c, m.buffer); \
++} while (0)
++
++#define sqr_eltfp25519_2w_adx(a) do { \
++      sqr2_256x256_integer_adx(m.buffer, a); \
++      red_eltfp25519_2w_adx(a, m.buffer); \
++} while (0)
++
++#define sqr_eltfp25519_2w_bmi2(a) do { \
++      sqr2_256x256_integer_bmi2(m.buffer, a); \
++      red_eltfp25519_2w_bmi2(a, m.buffer); \
++} while (0)
++
++#define sqrn_eltfp25519_1w_adx(a, times) do { \
++      int ____counter = (times); /* 'times' is evaluated once */ \
++      while (____counter-- > 0) \
++              sqr_eltfp25519_1w_adx(a); \
++} while (0)
++
++#define sqrn_eltfp25519_1w_bmi2(a, times) do { \
++      int ____counter = (times); /* 'times' is evaluated once */ \
++      while (____counter-- > 0) \
++              sqr_eltfp25519_1w_bmi2(a); \
++} while (0)
++
++#define copy_eltfp25519_1w(C, A) do { \
++      (C)[0] = (A)[0]; \
++      (C)[1] = (A)[1]; \
++      (C)[2] = (A)[2]; \
++      (C)[3] = (A)[3]; \
++} while (0)
++
++#define setzero_eltfp25519_1w(C) do { \
++      (C)[0] = 0; \
++      (C)[1] = 0; \
++      (C)[2] = 0; \
++      (C)[3] = 0; \
++} while (0)
++
++__aligned(32) static const u64 table_ladder_8k[252 * NUM_WORDS_ELTFP25519] = {
++      /*   1 */ 0xfffffffffffffff3UL, 0xffffffffffffffffUL,
++                0xffffffffffffffffUL, 0x5fffffffffffffffUL,
++      /*   2 */ 0x6b8220f416aafe96UL, 0x82ebeb2b4f566a34UL,
++                0xd5a9a5b075a5950fUL, 0x5142b2cf4b2488f4UL,
++      /*   3 */ 0x6aaebc750069680cUL, 0x89cf7820a0f99c41UL,
++                0x2a58d9183b56d0f4UL, 0x4b5aca80e36011a4UL,
++      /*   4 */ 0x329132348c29745dUL, 0xf4a2e616e1642fd7UL,
++                0x1e45bb03ff67bc34UL, 0x306912d0f42a9b4aUL,
++      /*   5 */ 0xff886507e6af7154UL, 0x04f50e13dfeec82fUL,
++                0xaa512fe82abab5ceUL, 0x174e251a68d5f222UL,
++      /*   6 */ 0xcf96700d82028898UL, 0x1743e3370a2c02c5UL,
++                0x379eec98b4e86eaaUL, 0x0c59888a51e0482eUL,
++      /*   7 */ 0xfbcbf1d699b5d189UL, 0xacaef0d58e9fdc84UL,
++                0xc1c20d06231f7614UL, 0x2938218da274f972UL,
++      /*   8 */ 0xf6af49beff1d7f18UL, 0xcc541c22387ac9c2UL,
++                0x96fcc9ef4015c56bUL, 0x69c1627c690913a9UL,
++      /*   9 */ 0x7a86fd2f4733db0eUL, 0xfdb8c4f29e087de9UL,
++                0x095e4b1a8ea2a229UL, 0x1ad7a7c829b37a79UL,
++      /*  10 */ 0x342d89cad17ea0c0UL, 0x67bedda6cced2051UL,
++                0x19ca31bf2bb42f74UL, 0x3df7b4c84980acbbUL,
++      /*  11 */ 0xa8c6444dc80ad883UL, 0xb91e440366e3ab85UL,
++                0xc215cda00164f6d8UL, 0x3d867c6ef247e668UL,
++      /*  12 */ 0xc7dd582bcc3e658cUL, 0xfd2c4748ee0e5528UL,
++                0xa0fd9b95cc9f4f71UL, 0x7529d871b0675ddfUL,
++      /*  13 */ 0xb8f568b42d3cbd78UL, 0x1233011b91f3da82UL,
++                0x2dce6ccd4a7c3b62UL, 0x75e7fc8e9e498603UL,
++      /*  14 */ 0x2f4f13f1fcd0b6ecUL, 0xf1a8ca1f29ff7a45UL,
++                0xc249c1a72981e29bUL, 0x6ebe0dbb8c83b56aUL,
++      /*  15 */ 0x7114fa8d170bb222UL, 0x65a2dcd5bf93935fUL,
++                0xbdc41f68b59c979aUL, 0x2f0eef79a2ce9289UL,
++      /*  16 */ 0x42ecbf0c083c37ceUL, 0x2930bc09ec496322UL,
++                0xf294b0c19cfeac0dUL, 0x3780aa4bedfabb80UL,
++      /*  17 */ 0x56c17d3e7cead929UL, 0xe7cb4beb2e5722c5UL,
++                0x0ce931732dbfe15aUL, 0x41b883c7621052f8UL,
++      /*  18 */ 0xdbf75ca0c3d25350UL, 0x2936be086eb1e351UL,
++                0xc936e03cb4a9b212UL, 0x1d45bf82322225aaUL,
++      /*  19 */ 0xe81ab1036a024cc5UL, 0xe212201c304c9a72UL,
++                0xc5d73fba6832b1fcUL, 0x20ffdb5a4d839581UL,
++      /*  20 */ 0xa283d367be5d0fadUL, 0x6c2b25ca8b164475UL,
++                0x9d4935467caaf22eUL, 0x5166408eee85ff49UL,
++      /*  21 */ 0x3c67baa2fab4e361UL, 0xb3e433c67ef35cefUL,
++                0x5259729241159b1cUL, 0x6a621892d5b0ab33UL,
++      /*  22 */ 0x20b74a387555cdcbUL, 0x532aa10e1208923fUL,
++                0xeaa17b7762281dd1UL, 0x61ab3443f05c44bfUL,
++      /*  23 */ 0x257a6c422324def8UL, 0x131c6c1017e3cf7fUL,
++                0x23758739f630a257UL, 0x295a407a01a78580UL,
++      /*  24 */ 0xf8c443246d5da8d9UL, 0x19d775450c52fa5dUL,
++                0x2afcfc92731bf83dUL, 0x7d10c8e81b2b4700UL,
++      /*  25 */ 0xc8e0271f70baa20bUL, 0x993748867ca63957UL,
++                0x5412efb3cb7ed4bbUL, 0x3196d36173e62975UL,
++      /*  26 */ 0xde5bcad141c7dffcUL, 0x47cc8cd2b395c848UL,
++                0xa34cd942e11af3cbUL, 0x0256dbf2d04ecec2UL,
++      /*  27 */ 0x875ab7e94b0e667fUL, 0xcad4dd83c0850d10UL,
++                0x47f12e8f4e72c79fUL, 0x5f1a87bb8c85b19bUL,
++      /*  28 */ 0x7ae9d0b6437f51b8UL, 0x12c7ce5518879065UL,
++                0x2ade09fe5cf77aeeUL, 0x23a05a2f7d2c5627UL,
++      /*  29 */ 0x5908e128f17c169aUL, 0xf77498dd8ad0852dUL,
++                0x74b4c4ceab102f64UL, 0x183abadd10139845UL,
++      /*  30 */ 0xb165ba8daa92aaacUL, 0xd5c5ef9599386705UL,
++                0xbe2f8f0cf8fc40d1UL, 0x2701e635ee204514UL,
++      /*  31 */ 0x629fa80020156514UL, 0xf223868764a8c1ceUL,
++                0x5b894fff0b3f060eUL, 0x60d9944cf708a3faUL,
++      /*  32 */ 0xaeea001a1c7a201fUL, 0xebf16a633ee2ce63UL,
++                0x6f7709594c7a07e1UL, 0x79b958150d0208cbUL,
++      /*  33 */ 0x24b55e5301d410e7UL, 0xe3a34edff3fdc84dUL,
++                0xd88768e4904032d8UL, 0x131384427b3aaeecUL,
++      /*  34 */ 0x8405e51286234f14UL, 0x14dc4739adb4c529UL,
++                0xb8a2b5b250634ffdUL, 0x2fe2a94ad8a7ff93UL,
++      /*  35 */ 0xec5c57efe843faddUL, 0x2843ce40f0bb9918UL,
++                0xa4b561d6cf3d6305UL, 0x743629bde8fb777eUL,
++      /*  36 */ 0x343edd46bbaf738fUL, 0xed981828b101a651UL,
++                0xa401760b882c797aUL, 0x1fc223e28dc88730UL,
++      /*  37 */ 0x48604e91fc0fba0eUL, 0xb637f78f052c6fa4UL,
++                0x91ccac3d09e9239cUL, 0x23f7eed4437a687cUL,
++      /*  38 */ 0x5173b1118d9bd800UL, 0x29d641b63189d4a7UL,
++                0xfdbf177988bbc586UL, 0x2959894fcad81df5UL,
++      /*  39 */ 0xaebc8ef3b4bbc899UL, 0x4148995ab26992b9UL,
++                0x24e20b0134f92cfbUL, 0x40d158894a05dee8UL,
++      /*  40 */ 0x46b00b1185af76f6UL, 0x26bac77873187a79UL,
++                0x3dc0bf95ab8fff5fUL, 0x2a608bd8945524d7UL,
++      /*  41 */ 0x26449588bd446302UL, 0x7c4bc21c0388439cUL,
++                0x8e98a4f383bd11b2UL, 0x26218d7bc9d876b9UL,
++      /*  42 */ 0xe3081542997c178aUL, 0x3c2d29a86fb6606fUL,
++                0x5c217736fa279374UL, 0x7dde05734afeb1faUL,
++      /*  43 */ 0x3bf10e3906d42babUL, 0xe4f7803e1980649cUL,
++                0xe6053bf89595bf7aUL, 0x394faf38da245530UL,
++      /*  44 */ 0x7a8efb58896928f4UL, 0xfbc778e9cc6a113cUL,
++                0x72670ce330af596fUL, 0x48f222a81d3d6cf7UL,
++      /*  45 */ 0xf01fce410d72caa7UL, 0x5a20ecc7213b5595UL,
++                0x7bc21165c1fa1483UL, 0x07f89ae31da8a741UL,
++      /*  46 */ 0x05d2c2b4c6830ff9UL, 0xd43e330fc6316293UL,
++                0xa5a5590a96d3a904UL, 0x705edb91a65333b6UL,
++      /*  47 */ 0x048ee15e0bb9a5f7UL, 0x3240cfca9e0aaf5dUL,
++                0x8f4b71ceedc4a40bUL, 0x621c0da3de544a6dUL,
++      /*  48 */ 0x92872836a08c4091UL, 0xce8375b010c91445UL,
++                0x8a72eb524f276394UL, 0x2667fcfa7ec83635UL,
++      /*  49 */ 0x7f4c173345e8752aUL, 0x061b47feee7079a5UL,
++                0x25dd9afa9f86ff34UL, 0x3780cef5425dc89cUL,
++      /*  50 */ 0x1a46035a513bb4e9UL, 0x3e1ef379ac575adaUL,
++                0xc78c5f1c5fa24b50UL, 0x321a967634fd9f22UL,
++      /*  51 */ 0x946707b8826e27faUL, 0x3dca84d64c506fd0UL,
++                0xc189218075e91436UL, 0x6d9284169b3b8484UL,
++      /*  52 */ 0x3a67e840383f2ddfUL, 0x33eec9a30c4f9b75UL,
++                0x3ec7c86fa783ef47UL, 0x26ec449fbac9fbc4UL,
++      /*  53 */ 0x5c0f38cba09b9e7dUL, 0x81168cc762a3478cUL,
++                0x3e23b0d306fc121cUL, 0x5a238aa0a5efdcddUL,
++      /*  54 */ 0x1ba26121c4ea43ffUL, 0x36f8c77f7c8832b5UL,
++                0x88fbea0b0adcf99aUL, 0x5ca9938ec25bebf9UL,
++      /*  55 */ 0xd5436a5e51fccda0UL, 0x1dbc4797c2cd893bUL,
++                0x19346a65d3224a08UL, 0x0f5034e49b9af466UL,
++      /*  56 */ 0xf23c3967a1e0b96eUL, 0xe58b08fa867a4d88UL,
++                0xfb2fabc6a7341679UL, 0x2a75381eb6026946UL,
++      /*  57 */ 0xc80a3be4c19420acUL, 0x66b1f6c681f2b6dcUL,
++                0x7cf7036761e93388UL, 0x25abbbd8a660a4c4UL,
++      /*  58 */ 0x91ea12ba14fd5198UL, 0x684950fc4a3cffa9UL,
++                0xf826842130f5ad28UL, 0x3ea988f75301a441UL,
++      /*  59 */ 0xc978109a695f8c6fUL, 0x1746eb4a0530c3f3UL,
++                0x444d6d77b4459995UL, 0x75952b8c054e5cc7UL,
++      /*  60 */ 0xa3703f7915f4d6aaUL, 0x66c346202f2647d8UL,
++                0xd01469df811d644bUL, 0x77fea47d81a5d71fUL,
++      /*  61 */ 0xc5e9529ef57ca381UL, 0x6eeeb4b9ce2f881aUL,
++                0xb6e91a28e8009bd6UL, 0x4b80be3e9afc3fecUL,
++      /*  62 */ 0x7e3773c526aed2c5UL, 0x1b4afcb453c9a49dUL,
++                0xa920bdd7baffb24dUL, 0x7c54699f122d400eUL,
++      /*  63 */ 0xef46c8e14fa94bc8UL, 0xe0b074ce2952ed5eUL,
++                0xbea450e1dbd885d5UL, 0x61b68649320f712cUL,
++      /*  64 */ 0x8a485f7309ccbdd1UL, 0xbd06320d7d4d1a2dUL,
++                0x25232973322dbef4UL, 0x445dc4758c17f770UL,
++      /*  65 */ 0xdb0434177cc8933cUL, 0xed6fe82175ea059fUL,
++                0x1efebefdc053db34UL, 0x4adbe867c65daf99UL,
++      /*  66 */ 0x3acd71a2a90609dfUL, 0xe5e991856dd04050UL,
++                0x1ec69b688157c23cUL, 0x697427f6885cfe4dUL,
++      /*  67 */ 0xd7be7b9b65e1a851UL, 0xa03d28d522c536ddUL,
++                0x28399d658fd2b645UL, 0x49e5b7e17c2641e1UL,
++      /*  68 */ 0x6f8c3a98700457a4UL, 0x5078f0a25ebb6778UL,
++                0xd13c3ccbc382960fUL, 0x2e003258a7df84b1UL,
++      /*  69 */ 0x8ad1f39be6296a1cUL, 0xc1eeaa652a5fbfb2UL,
++                0x33ee0673fd26f3cbUL, 0x59256173a69d2cccUL,
++      /*  70 */ 0x41ea07aa4e18fc41UL, 0xd9fc19527c87a51eUL,
++                0xbdaacb805831ca6fUL, 0x445b652dc916694fUL,
++      /*  71 */ 0xce92a3a7f2172315UL, 0x1edc282de11b9964UL,
++                0xa1823aafe04c314aUL, 0x790a2d94437cf586UL,
++      /*  72 */ 0x71c447fb93f6e009UL, 0x8922a56722845276UL,
++                0xbf70903b204f5169UL, 0x2f7a89891ba319feUL,
++      /*  73 */ 0x02a08eb577e2140cUL, 0xed9a4ed4427bdcf4UL,
++                0x5253ec44e4323cd1UL, 0x3e88363c14e9355bUL,
++      /*  74 */ 0xaa66c14277110b8cUL, 0x1ae0391610a23390UL,
++                0x2030bd12c93fc2a2UL, 0x3ee141579555c7abUL,
++      /*  75 */ 0x9214de3a6d6e7d41UL, 0x3ccdd88607f17efeUL,
++                0x674f1288f8e11217UL, 0x5682250f329f93d0UL,
++      /*  76 */ 0x6cf00b136d2e396eUL, 0x6e4cf86f1014debfUL,
++                0x5930b1b5bfcc4e83UL, 0x047069b48aba16b6UL,
++      /*  77 */ 0x0d4ce4ab69b20793UL, 0xb24db91a97d0fb9eUL,
++                0xcdfa50f54e00d01dUL, 0x221b1085368bddb5UL,
++      /*  78 */ 0xe7e59468b1e3d8d2UL, 0x53c56563bd122f93UL,
++                0xeee8a903e0663f09UL, 0x61efa662cbbe3d42UL,
++      /*  79 */ 0x2cf8ddddde6eab2aUL, 0x9bf80ad51435f231UL,
++                0x5deadacec9f04973UL, 0x29275b5d41d29b27UL,
++      /*  80 */ 0xcfde0f0895ebf14fUL, 0xb9aab96b054905a7UL,
++                0xcae80dd9a1c420fdUL, 0x0a63bf2f1673bbc7UL,
++      /*  81 */ 0x092f6e11958fbc8cUL, 0x672a81e804822fadUL,
++                0xcac8351560d52517UL, 0x6f3f7722c8f192f8UL,
++      /*  82 */ 0xf8ba90ccc2e894b7UL, 0x2c7557a438ff9f0dUL,
++                0x894d1d855ae52359UL, 0x68e122157b743d69UL,
++      /*  83 */ 0xd87e5570cfb919f3UL, 0x3f2cdecd95798db9UL,
++                0x2121154710c0a2ceUL, 0x3c66a115246dc5b2UL,
++      /*  84 */ 0xcbedc562294ecb72UL, 0xba7143c36a280b16UL,
++                0x9610c2efd4078b67UL, 0x6144735d946a4b1eUL,
++      /*  85 */ 0x536f111ed75b3350UL, 0x0211db8c2041d81bUL,
++                0xf93cb1000e10413cUL, 0x149dfd3c039e8876UL,
++      /*  86 */ 0xd479dde46b63155bUL, 0xb66e15e93c837976UL,
++                0xdafde43b1f13e038UL, 0x5fafda1a2e4b0b35UL,
++      /*  87 */ 0x3600bbdf17197581UL, 0x3972050bbe3cd2c2UL,
++                0x5938906dbdd5be86UL, 0x34fce5e43f9b860fUL,
++      /*  88 */ 0x75a8a4cd42d14d02UL, 0x828dabc53441df65UL,
++                0x33dcabedd2e131d3UL, 0x3ebad76fb814d25fUL,
++      /*  89 */ 0xd4906f566f70e10fUL, 0x5d12f7aa51690f5aUL,
++                0x45adb16e76cefcf2UL, 0x01f768aead232999UL,
++      /*  90 */ 0x2b6cc77b6248febdUL, 0x3cd30628ec3aaffdUL,
++                0xce1c0b80d4ef486aUL, 0x4c3bff2ea6f66c23UL,
++      /*  91 */ 0x3f2ec4094aeaeb5fUL, 0x61b19b286e372ca7UL,
++                0x5eefa966de2a701dUL, 0x23b20565de55e3efUL,
++      /*  92 */ 0xe301ca5279d58557UL, 0x07b2d4ce27c2874fUL,
++                0xa532cd8a9dcf1d67UL, 0x2a52fee23f2bff56UL,
++      /*  93 */ 0x8624efb37cd8663dUL, 0xbbc7ac20ffbd7594UL,
++                0x57b85e9c82d37445UL, 0x7b3052cb86a6ec66UL,
++      /*  94 */ 0x3482f0ad2525e91eUL, 0x2cb68043d28edca0UL,
++                0xaf4f6d052e1b003aUL, 0x185f8c2529781b0aUL,
++      /*  95 */ 0xaa41de5bd80ce0d6UL, 0x9407b2416853e9d6UL,
++                0x563ec36e357f4c3aUL, 0x4cc4b8dd0e297bceUL,
++      /*  96 */ 0xa2fc1a52ffb8730eUL, 0x1811f16e67058e37UL,
++                0x10f9a366cddf4ee1UL, 0x72f4a0c4a0b9f099UL,
++      /*  97 */ 0x8c16c06f663f4ea7UL, 0x693b3af74e970fbaUL,
++                0x2102e7f1d69ec345UL, 0x0ba53cbc968a8089UL,
++      /*  98 */ 0xca3d9dc7fea15537UL, 0x4c6824bb51536493UL,
++                0xb9886314844006b1UL, 0x40d2a72ab454cc60UL,
++      /*  99 */ 0x5936a1b712570975UL, 0x91b9d648debda657UL,
++                0x3344094bb64330eaUL, 0x006ba10d12ee51d0UL,
++      /* 100 */ 0x19228468f5de5d58UL, 0x0eb12f4c38cc05b0UL,
++                0xa1039f9dd5601990UL, 0x4502d4ce4fff0e0bUL,
++      /* 101 */ 0xeb2054106837c189UL, 0xd0f6544c6dd3b93cUL,
++                0x40727064c416d74fUL, 0x6e15c6114b502ef0UL,
++      /* 102 */ 0x4df2a398cfb1a76bUL, 0x11256c7419f2f6b1UL,
++                0x4a497962066e6043UL, 0x705b3aab41355b44UL,
++      /* 103 */ 0x365ef536d797b1d8UL, 0x00076bd622ddf0dbUL,
++                0x3bbf33b0e0575a88UL, 0x3777aa05c8e4ca4dUL,
++      /* 104 */ 0x392745c85578db5fUL, 0x6fda4149dbae5ae2UL,
++                0xb1f0b00b8adc9867UL, 0x09963437d36f1da3UL,
++      /* 105 */ 0x7e824e90a5dc3853UL, 0xccb5f6641f135cbdUL,
++                0x6736d86c87ce8fccUL, 0x625f3ce26604249fUL,
++      /* 106 */ 0xaf8ac8059502f63fUL, 0x0c05e70a2e351469UL,
++                0x35292e9c764b6305UL, 0x1a394360c7e23ac3UL,
++      /* 107 */ 0xd5c6d53251183264UL, 0x62065abd43c2b74fUL,
++                0xb5fbf5d03b973f9bUL, 0x13a3da3661206e5eUL,
++      /* 108 */ 0xc6bd5837725d94e5UL, 0x18e30912205016c5UL,
++                0x2088ce1570033c68UL, 0x7fba1f495c837987UL,
++      /* 109 */ 0x5a8c7423f2f9079dUL, 0x1735157b34023fc5UL,
++                0xe4f9b49ad2fab351UL, 0x6691ff72c878e33cUL,
++      /* 110 */ 0x122c2adedc5eff3eUL, 0xf8dd4bf1d8956cf4UL,
++                0xeb86205d9e9e5bdaUL, 0x049b92b9d975c743UL,
++      /* 111 */ 0xa5379730b0f6c05aUL, 0x72a0ffacc6f3a553UL,
++                0xb0032c34b20dcd6dUL, 0x470e9dbc88d5164aUL,
++      /* 112 */ 0xb19cf10ca237c047UL, 0xb65466711f6c81a2UL,
++                0xb3321bd16dd80b43UL, 0x48c14f600c5fbe8eUL,
++      /* 113 */ 0x66451c264aa6c803UL, 0xb66e3904a4fa7da6UL,
++                0xd45f19b0b3128395UL, 0x31602627c3c9bc10UL,
++      /* 114 */ 0x3120dc4832e4e10dUL, 0xeb20c46756c717f7UL,
++                0x00f52e3f67280294UL, 0x566d4fc14730c509UL,
++      /* 115 */ 0x7e3a5d40fd837206UL, 0xc1e926dc7159547aUL,
++                0x216730fba68d6095UL, 0x22e8c3843f69cea7UL,
++      /* 116 */ 0x33d074e8930e4b2bUL, 0xb6e4350e84d15816UL,
++                0x5534c26ad6ba2365UL, 0x7773c12f89f1f3f3UL,
++      /* 117 */ 0x8cba404da57962aaUL, 0x5b9897a81999ce56UL,
++                0x508e862f121692fcUL, 0x3a81907fa093c291UL,
++      /* 118 */ 0x0dded0ff4725a510UL, 0x10d8cc10673fc503UL,
++                0x5b9d151c9f1f4e89UL, 0x32a5c1d5cb09a44cUL,
++      /* 119 */ 0x1e0aa442b90541fbUL, 0x5f85eb7cc1b485dbUL,
++                0xbee595ce8a9df2e5UL, 0x25e496c722422236UL,
++      /* 120 */ 0x5edf3c46cd0fe5b9UL, 0x34e75a7ed2a43388UL,
++                0xe488de11d761e352UL, 0x0e878a01a085545cUL,
++      /* 121 */ 0xba493c77e021bb04UL, 0x2b4d1843c7df899aUL,
++                0x9ea37a487ae80d67UL, 0x67a9958011e41794UL,
++      /* 122 */ 0x4b58051a6697b065UL, 0x47e33f7d8d6ba6d4UL,
++                0xbb4da8d483ca46c1UL, 0x68becaa181c2db0dUL,
++      /* 123 */ 0x8d8980e90b989aa5UL, 0xf95eb14a2c93c99bUL,
++                0x51c6c7c4796e73a2UL, 0x6e228363b5efb569UL,
++      /* 124 */ 0xc6bbc0b02dd624c8UL, 0x777eb47dec8170eeUL,
++                0x3cde15a004cfafa9UL, 0x1dc6bc087160bf9bUL,
++      /* 125 */ 0x2e07e043eec34002UL, 0x18e9fc677a68dc7fUL,
++                0xd8da03188bd15b9aUL, 0x48fbc3bb00568253UL,
++      /* 126 */ 0x57547d4cfb654ce1UL, 0xd3565b82a058e2adUL,
++                0xf63eaf0bbf154478UL, 0x47531ef114dfbb18UL,
++      /* 127 */ 0xe1ec630a4278c587UL, 0x5507d546ca8e83f3UL,
++                0x85e135c63adc0c2bUL, 0x0aa7efa85682844eUL,
++      /* 128 */ 0x72691ba8b3e1f615UL, 0x32b4e9701fbe3ffaUL,
++                0x97b6d92e39bb7868UL, 0x2cfe53dea02e39e8UL,
++      /* 129 */ 0x687392cd85cd52b0UL, 0x27ff66c910e29831UL,
++                0x97134556a9832d06UL, 0x269bb0360a84f8a0UL,
++      /* 130 */ 0x706e55457643f85cUL, 0x3734a48c9b597d1bUL,
++                0x7aee91e8c6efa472UL, 0x5cd6abc198a9d9e0UL,
++      /* 131 */ 0x0e04de06cb3ce41aUL, 0xd8c6eb893402e138UL,
++                0x904659bb686e3772UL, 0x7215c371746ba8c8UL,
++      /* 132 */ 0xfd12a97eeae4a2d9UL, 0x9514b7516394f2c5UL,
++                0x266fd5809208f294UL, 0x5c847085619a26b9UL,
++      /* 133 */ 0x52985410fed694eaUL, 0x3c905b934a2ed254UL,
++                0x10bb47692d3be467UL, 0x063b3d2d69e5e9e1UL,
++      /* 134 */ 0x472726eedda57debUL, 0xefb6c4ae10f41891UL,
++                0x2b1641917b307614UL, 0x117c554fc4f45b7cUL,
++      /* 135 */ 0xc07cf3118f9d8812UL, 0x01dbd82050017939UL,
++                0xd7e803f4171b2827UL, 0x1015e87487d225eaUL,
++      /* 136 */ 0xc58de3fed23acc4dUL, 0x50db91c294a7be2dUL,
++                0x0b94d43d1c9cf457UL, 0x6b1640fa6e37524aUL,
++      /* 137 */ 0x692f346c5fda0d09UL, 0x200b1c59fa4d3151UL,
++                0xb8c46f760777a296UL, 0x4b38395f3ffdfbcfUL,
++      /* 138 */ 0x18d25e00be54d671UL, 0x60d50582bec8aba6UL,
++                0x87ad8f263b78b982UL, 0x50fdf64e9cda0432UL,
++      /* 139 */ 0x90f567aac578dcf0UL, 0xef1e9b0ef2a3133bUL,
++                0x0eebba9242d9de71UL, 0x15473c9bf03101c7UL,
++      /* 140 */ 0x7c77e8ae56b78095UL, 0xb678e7666e6f078eUL,
++                0x2da0b9615348ba1fUL, 0x7cf931c1ff733f0bUL,
++      /* 141 */ 0x26b357f50a0a366cUL, 0xe9708cf42b87d732UL,
++                0xc13aeea5f91cb2c0UL, 0x35d90c991143bb4cUL,
++      /* 142 */ 0x47c1c404a9a0d9dcUL, 0x659e58451972d251UL,
++                0x3875a8c473b38c31UL, 0x1fbd9ed379561f24UL,
++      /* 143 */ 0x11fabc6fd41ec28dUL, 0x7ef8dfe3cd2a2dcaUL,
++                0x72e73b5d8c404595UL, 0x6135fa4954b72f27UL,
++      /* 144 */ 0xccfc32a2de24b69cUL, 0x3f55698c1f095d88UL,
++                0xbe3350ed5ac3f929UL, 0x5e9bf806ca477eebUL,
++      /* 145 */ 0xe9ce8fb63c309f68UL, 0x5376f63565e1f9f4UL,
++                0xd1afcfb35a6393f1UL, 0x6632a1ede5623506UL,
++      /* 146 */ 0x0b7d6c390c2ded4cUL, 0x56cb3281df04cb1fUL,
++                0x66305a1249ecc3c7UL, 0x5d588b60a38ca72aUL,
++      /* 147 */ 0xa6ecbf78e8e5f42dUL, 0x86eeb44b3c8a3eecUL,
++                0xec219c48fbd21604UL, 0x1aaf1af517c36731UL,
++      /* 148 */ 0xc306a2836769bde7UL, 0x208280622b1e2adbUL,
++                0x8027f51ffbff94a6UL, 0x76cfa1ce1124f26bUL,
++      /* 149 */ 0x18eb00562422abb6UL, 0xf377c4d58f8c29c3UL,
++                0x4dbbc207f531561aUL, 0x0253b7f082128a27UL,
++      /* 150 */ 0x3d1f091cb62c17e0UL, 0x4860e1abd64628a9UL,
++                0x52d17436309d4253UL, 0x356f97e13efae576UL,
++      /* 151 */ 0xd351e11aa150535bUL, 0x3e6b45bb1dd878ccUL,
++                0x0c776128bed92c98UL, 0x1d34ae93032885b8UL,
++      /* 152 */ 0x4ba0488ca85ba4c3UL, 0x985348c33c9ce6ceUL,
++                0x66124c6f97bda770UL, 0x0f81a0290654124aUL,
++      /* 153 */ 0x9ed09ca6569b86fdUL, 0x811009fd18af9a2dUL,
++                0xff08d03f93d8c20aUL, 0x52a148199faef26bUL,
++      /* 154 */ 0x3e03f9dc2d8d1b73UL, 0x4205801873961a70UL,
++                0xc0d987f041a35970UL, 0x07aa1f15a1c0d549UL,
++      /* 155 */ 0xdfd46ce08cd27224UL, 0x6d0a024f934e4239UL,
++                0x808a7a6399897b59UL, 0x0a4556e9e13d95a2UL,
++      /* 156 */ 0xd21a991fe9c13045UL, 0x9b0e8548fe7751b8UL,
++                0x5da643cb4bf30035UL, 0x77db28d63940f721UL,
++      /* 157 */ 0xfc5eeb614adc9011UL, 0x5229419ae8c411ebUL,
++                0x9ec3e7787d1dcf74UL, 0x340d053e216e4cb5UL,
++      /* 158 */ 0xcac7af39b48df2b4UL, 0xc0faec2871a10a94UL,
++                0x140a69245ca575edUL, 0x0cf1c37134273a4cUL,
++      /* 159 */ 0xc8ee306ac224b8a5UL, 0x57eaee7ccb4930b0UL,
++                0xa1e806bdaacbe74fUL, 0x7d9a62742eeb657dUL,
++      /* 160 */ 0x9eb6b6ef546c4830UL, 0x885cca1fddb36e2eUL,
++                0xe6b9f383ef0d7105UL, 0x58654fef9d2e0412UL,
++      /* 161 */ 0xa905c4ffbe0e8e26UL, 0x942de5df9b31816eUL,
++                0x497d723f802e88e1UL, 0x30684dea602f408dUL,
++      /* 162 */ 0x21e5a278a3e6cb34UL, 0xaefb6e6f5b151dc4UL,
++                0xb30b8e049d77ca15UL, 0x28c3c9cf53b98981UL,
++      /* 163 */ 0x287fb721556cdd2aUL, 0x0d317ca897022274UL,
++                0x7468c7423a543258UL, 0x4a7f11464eb5642fUL,
++      /* 164 */ 0xa237a4774d193aa6UL, 0xd865986ea92129a1UL,
++                0x24c515ecf87c1a88UL, 0x604003575f39f5ebUL,
++      /* 165 */ 0x47b9f189570a9b27UL, 0x2b98cede465e4b78UL,
++                0x026df551dbb85c20UL, 0x74fcd91047e21901UL,
++      /* 166 */ 0x13e2a90a23c1bfa3UL, 0x0cb0074e478519f6UL,
++                0x5ff1cbbe3af6cf44UL, 0x67fe5438be812dbeUL,
++      /* 167 */ 0xd13cf64fa40f05b0UL, 0x054dfb2f32283787UL,
++                0x4173915b7f0d2aeaUL, 0x482f144f1f610d4eUL,
++      /* 168 */ 0xf6210201b47f8234UL, 0x5d0ae1929e70b990UL,
++                0xdcd7f455b049567cUL, 0x7e93d0f1f0916f01UL,
++      /* 169 */ 0xdd79cbf18a7db4faUL, 0xbe8391bf6f74c62fUL,
++                0x027145d14b8291bdUL, 0x585a73ea2cbf1705UL,
++      /* 170 */ 0x485ca03e928a0db2UL, 0x10fc01a5742857e7UL,
++                0x2f482edbd6d551a7UL, 0x0f0433b5048fdb8aUL,
++      /* 171 */ 0x60da2e8dd7dc6247UL, 0x88b4c9d38cd4819aUL,
++                0x13033ac001f66697UL, 0x273b24fe3b367d75UL,
++      /* 172 */ 0xc6e8f66a31b3b9d4UL, 0x281514a494df49d5UL,
++                0xd1726fdfc8b23da7UL, 0x4b3ae7d103dee548UL,
++      /* 173 */ 0xc6256e19ce4b9d7eUL, 0xff5c5cf186e3c61cUL,
++                0xacc63ca34b8ec145UL, 0x74621888fee66574UL,
++      /* 174 */ 0x956f409645290a1eUL, 0xef0bf8e3263a962eUL,
++                0xed6a50eb5ec2647bUL, 0x0694283a9dca7502UL,
++      /* 175 */ 0x769b963643a2dcd1UL, 0x42b7c8ea09fc5353UL,
++                0x4f002aee13397eabUL, 0x63005e2c19b7d63aUL,
++      /* 176 */ 0xca6736da63023beaUL, 0x966c7f6db12a99b7UL,
++                0xace09390c537c5e1UL, 0x0b696063a1aa89eeUL,
++      /* 177 */ 0xebb03e97288c56e5UL, 0x432a9f9f938c8be8UL,
++                0xa6a5a93d5b717f71UL, 0x1a5fb4c3e18f9d97UL,
++      /* 178 */ 0x1c94e7ad1c60cdceUL, 0xee202a43fc02c4a0UL,
++                0x8dafe4d867c46a20UL, 0x0a10263c8ac27b58UL,
++      /* 179 */ 0xd0dea9dfe4432a4aUL, 0x856af87bbe9277c5UL,
++                0xce8472acc212c71aUL, 0x6f151b6d9bbb1e91UL,
++      /* 180 */ 0x26776c527ceed56aUL, 0x7d211cb7fbf8faecUL,
++                0x37ae66a6fd4609ccUL, 0x1f81b702d2770c42UL,
++      /* 181 */ 0x2fb0b057eac58392UL, 0xe1dd89fe29744e9dUL,
++                0xc964f8eb17beb4f8UL, 0x29571073c9a2d41eUL,
++      /* 182 */ 0xa948a18981c0e254UL, 0x2df6369b65b22830UL,
++                0xa33eb2d75fcfd3c6UL, 0x078cd6ec4199a01fUL,
++      /* 183 */ 0x4a584a41ad900d2fUL, 0x32142b78e2c74c52UL,
++                0x68c4e8338431c978UL, 0x7f69ea9008689fc2UL,
++      /* 184 */ 0x52f2c81e46a38265UL, 0xfd78072d04a832fdUL,
++                0x8cd7d5fa25359e94UL, 0x4de71b7454cc29d2UL,
++      /* 185 */ 0x42eb60ad1eda6ac9UL, 0x0aad37dfdbc09c3aUL,
++                0x81004b71e33cc191UL, 0x44e6be345122803cUL,
++      /* 186 */ 0x03fe8388ba1920dbUL, 0xf5d57c32150db008UL,
++                0x49c8c4281af60c29UL, 0x21edb518de701aeeUL,
++      /* 187 */ 0x7fb63e418f06dc99UL, 0xa4460d99c166d7b8UL,
++                0x24dd5248ce520a83UL, 0x5ec3ad712b928358UL,
++      /* 188 */ 0x15022a5fbd17930fUL, 0xa4f64a77d82570e3UL,
++                0x12bc8d6915783712UL, 0x498194c0fc620abbUL,
++      /* 189 */ 0x38a2d9d255686c82UL, 0x785c6bd9193e21f0UL,
++                0xe4d5c81ab24a5484UL, 0x56307860b2e20989UL,
++      /* 190 */ 0x429d55f78b4d74c4UL, 0x22f1834643350131UL,
++                0x1e60c24598c71fffUL, 0x59f2f014979983efUL,
++      /* 191 */ 0x46a47d56eb494a44UL, 0x3e22a854d636a18eUL,
++                0xb346e15274491c3bUL, 0x2ceafd4e5390cde7UL,
++      /* 192 */ 0xba8a8538be0d6675UL, 0x4b9074bb50818e23UL,
++                0xcbdab89085d304c3UL, 0x61a24fe0e56192c4UL,
++      /* 193 */ 0xcb7615e6db525bcbUL, 0xdd7d8c35a567e4caUL,
++                0xe6b4153acafcdd69UL, 0x2d668e097f3c9766UL,
++      /* 194 */ 0xa57e7e265ce55ef0UL, 0x5d9f4e527cd4b967UL,
++                0xfbc83606492fd1e5UL, 0x090d52beb7c3f7aeUL,
++      /* 195 */ 0x09b9515a1e7b4d7cUL, 0x1f266a2599da44c0UL,
++                0xa1c49548e2c55504UL, 0x7ef04287126f15ccUL,
++      /* 196 */ 0xfed1659dbd30ef15UL, 0x8b4ab9eec4e0277bUL,
++                0x884d6236a5df3291UL, 0x1fd96ea6bf5cf788UL,
++      /* 197 */ 0x42a161981f190d9aUL, 0x61d849507e6052c1UL,
++                0x9fe113bf285a2cd5UL, 0x7c22d676dbad85d8UL,
++      /* 198 */ 0x82e770ed2bfbd27dUL, 0x4c05b2ece996f5a5UL,
++                0xcd40a9c2b0900150UL, 0x5895319213d9bf64UL,
++      /* 199 */ 0xe7cc5d703fea2e08UL, 0xb50c491258e2188cUL,
++                0xcce30baa48205bf0UL, 0x537c659ccfa32d62UL,
++      /* 200 */ 0x37b6623a98cfc088UL, 0xfe9bed1fa4d6aca4UL,
++                0x04d29b8e56a8d1b0UL, 0x725f71c40b519575UL,
++      /* 201 */ 0x28c7f89cd0339ce6UL, 0x8367b14469ddc18bUL,
++                0x883ada83a6a1652cUL, 0x585f1974034d6c17UL,
++      /* 202 */ 0x89cfb266f1b19188UL, 0xe63b4863e7c35217UL,
++                0xd88c9da6b4c0526aUL, 0x3e035c9df0954635UL,
++      /* 203 */ 0xdd9d5412fb45de9dUL, 0xdd684532e4cff40dUL,
++                0x4b5c999b151d671cUL, 0x2d8c2cc811e7f690UL,
++      /* 204 */ 0x7f54be1d90055d40UL, 0xa464c5df464aaf40UL,
++                0x33979624f0e917beUL, 0x2c018dc527356b30UL,
++      /* 205 */ 0xa5415024e330b3d4UL, 0x73ff3d96691652d3UL,
++                0x94ec42c4ef9b59f1UL, 0x0747201618d08e5aUL,
++      /* 206 */ 0x4d6ca48aca411c53UL, 0x66415f2fcfa66119UL,
++                0x9c4dd40051e227ffUL, 0x59810bc09a02f7ebUL,
++      /* 207 */ 0x2a7eb171b3dc101dUL, 0x441c5ab99ffef68eUL,
++                0x32025c9b93b359eaUL, 0x5e8ce0a71e9d112fUL,
++      /* 208 */ 0xbfcccb92429503fdUL, 0xd271ba752f095d55UL,
++                0x345ead5e972d091eUL, 0x18c8df11a83103baUL,
++      /* 209 */ 0x90cd949a9aed0f4cUL, 0xc5d1f4cb6660e37eUL,
++                0xb8cac52d56c52e0bUL, 0x6e42e400c5808e0dUL,
++      /* 210 */ 0xa3b46966eeaefd23UL, 0x0c4f1f0be39ecdcaUL,
++                0x189dc8c9d683a51dUL, 0x51f27f054c09351bUL,
++      /* 211 */ 0x4c487ccd2a320682UL, 0x587ea95bb3df1c96UL,
++                0xc8ccf79e555cb8e8UL, 0x547dc829a206d73dUL,
++      /* 212 */ 0xb822a6cd80c39b06UL, 0xe96d54732000d4c6UL,
++                0x28535b6f91463b4dUL, 0x228f4660e2486e1dUL,
++      /* 213 */ 0x98799538de8d3abfUL, 0x8cd8330045ebca6eUL,
++                0x79952a008221e738UL, 0x4322e1a7535cd2bbUL,
++      /* 214 */ 0xb114c11819d1801cUL, 0x2016e4d84f3f5ec7UL,
++                0xdd0e2df409260f4cUL, 0x5ec362c0ae5f7266UL,
++      /* 215 */ 0xc0462b18b8b2b4eeUL, 0x7cc8d950274d1afbUL,
++                0xf25f7105436b02d2UL, 0x43bbf8dcbff9ccd3UL,
++      /* 216 */ 0xb6ad1767a039e9dfUL, 0xb0714da8f69d3583UL,
++                0x5e55fa18b42931f5UL, 0x4ed5558f33c60961UL,
++      /* 217 */ 0x1fe37901c647a5ddUL, 0x593ddf1f8081d357UL,
++                0x0249a4fd813fd7a6UL, 0x69acca274e9caf61UL,
++      /* 218 */ 0x047ba3ea330721c9UL, 0x83423fc20e7e1ea0UL,
++                0x1df4c0af01314a60UL, 0x09a62dab89289527UL,
++      /* 219 */ 0xa5b325a49cc6cb00UL, 0xe94b5dc654b56cb6UL,
++                0x3be28779adc994a0UL, 0x4296e8f8ba3a4aadUL,
++      /* 220 */ 0x328689761e451eabUL, 0x2e4d598bff59594aUL,
++                0x49b96853d7a7084aUL, 0x4980a319601420a8UL,
++      /* 221 */ 0x9565b9e12f552c42UL, 0x8a5318db7100fe96UL,
++                0x05c90b4d43add0d7UL, 0x538b4cd66a5d4edaUL,
++      /* 222 */ 0xf4e94fc3e89f039fUL, 0x592c9af26f618045UL,
++                0x08a36eb5fd4b9550UL, 0x25fffaf6c2ed1419UL,
++      /* 223 */ 0x34434459cc79d354UL, 0xeeecbfb4b1d5476bUL,
++                0xddeb34a061615d99UL, 0x5129cecceb64b773UL,
++      /* 224 */ 0xee43215894993520UL, 0x772f9c7cf14c0b3bUL,
++                0xd2e2fce306bedad5UL, 0x715f42b546f06a97UL,
++      /* 225 */ 0x434ecdceda5b5f1aUL, 0x0da17115a49741a9UL,
++                0x680bd77c73edad2eUL, 0x487c02354edd9041UL,
++      /* 226 */ 0xb8efeff3a70ed9c4UL, 0x56a32aa3e857e302UL,
++                0xdf3a68bd48a2a5a0UL, 0x07f650b73176c444UL,
++      /* 227 */ 0xe38b9b1626e0ccb1UL, 0x79e053c18b09fb36UL,
++                0x56d90319c9f94964UL, 0x1ca941e7ac9ff5c4UL,
++      /* 228 */ 0x49c4df29162fa0bbUL, 0x8488cf3282b33305UL,
++                0x95dfda14cabb437dUL, 0x3391f78264d5ad86UL,
++      /* 229 */ 0x729ae06ae2b5095dUL, 0xd58a58d73259a946UL,
++                0xe9834262d13921edUL, 0x27fedafaa54bb592UL,
++      /* 230 */ 0xa99dc5b829ad48bbUL, 0x5f025742499ee260UL,
++                0x802c8ecd5d7513fdUL, 0x78ceb3ef3f6dd938UL,
++      /* 231 */ 0xc342f44f8a135d94UL, 0x7b9edb44828cdda3UL,
++                0x9436d11a0537cfe7UL, 0x5064b164ec1ab4c8UL,
++      /* 232 */ 0x7020eccfd37eb2fcUL, 0x1f31ea3ed90d25fcUL,
++                0x1b930d7bdfa1bb34UL, 0x5344467a48113044UL,
++      /* 233 */ 0x70073170f25e6dfbUL, 0xe385dc1a50114cc8UL,
++                0x2348698ac8fc4f00UL, 0x2a77a55284dd40d8UL,
++      /* 234 */ 0xfe06afe0c98c6ce4UL, 0xc235df96dddfd6e4UL,
++                0x1428d01e33bf1ed3UL, 0x785768ec9300bdafUL,
++      /* 235 */ 0x9702e57a91deb63bUL, 0x61bdb8bfe5ce8b80UL,
++                0x645b426f3d1d58acUL, 0x4804a82227a557bcUL,
++      /* 236 */ 0x8e57048ab44d2601UL, 0x68d6501a4b3a6935UL,
++                0xc39c9ec3f9e1c293UL, 0x4172f257d4de63e2UL,
++      /* 237 */ 0xd368b450330c6401UL, 0x040d3017418f2391UL,
++                0x2c34bb6090b7d90dUL, 0x16f649228fdfd51fUL,
++      /* 238 */ 0xbea6818e2b928ef5UL, 0xe28ccf91cdc11e72UL,
++                0x594aaa68e77a36cdUL, 0x313034806c7ffd0fUL,
++      /* 239 */ 0x8a9d27ac2249bd65UL, 0x19a3b464018e9512UL,
++                0xc26ccff352b37ec7UL, 0x056f68341d797b21UL,
++      /* 240 */ 0x5e79d6757efd2327UL, 0xfabdbcb6553afe15UL,
++                0xd3e7222c6eaf5a60UL, 0x7046c76d4dae743bUL,
++      /* 241 */ 0x660be872b18d4a55UL, 0x19992518574e1496UL,
++                0xc103053a302bdcbbUL, 0x3ed8e9800b218e8eUL,
++      /* 242 */ 0x7b0b9239fa75e03eUL, 0xefe9fb684633c083UL,
++                0x98a35fbe391a7793UL, 0x6065510fe2d0fe34UL,
++      /* 243 */ 0x55cb668548abad0cUL, 0xb4584548da87e527UL,
++                0x2c43ecea0107c1ddUL, 0x526028809372de35UL,
++      /* 244 */ 0x3415c56af9213b1fUL, 0x5bee1a4d017e98dbUL,
++                0x13f6b105b5cf709bUL, 0x5ff20e3482b29ab6UL,
++      /* 245 */ 0x0aa29c75cc2e6c90UL, 0xfc7d73ca3a70e206UL,
++                0x899fc38fc4b5c515UL, 0x250386b124ffc207UL,
++      /* 246 */ 0x54ea28d5ae3d2b56UL, 0x9913149dd6de60ceUL,
++                0x16694fc58f06d6c1UL, 0x46b23975eb018fc7UL,
++      /* 247 */ 0x470a6a0fb4b7b4e2UL, 0x5d92475a8f7253deUL,
++                0xabeee5b52fbd3adbUL, 0x7fa20801a0806968UL,
++      /* 248 */ 0x76f3faf19f7714d2UL, 0xb3e840c12f4660c3UL,
++                0x0fb4cd8df212744eUL, 0x4b065a251d3a2dd2UL,
++      /* 249 */ 0x5cebde383d77cd4aUL, 0x6adf39df882c9cb1UL,
++                0xa2dd242eb09af759UL, 0x3147c0e50e5f6422UL,
++      /* 250 */ 0x164ca5101d1350dbUL, 0xf8d13479c33fc962UL,
++                0xe640ce4d13e5da08UL, 0x4bdee0c45061f8baUL,
++      /* 251 */ 0xd7c46dc1a4edb1c9UL, 0x5514d7b6437fd98aUL,
++                0x58942f6bb2a1c00bUL, 0x2dffb2ab1d70710eUL,
++      /* 252 */ 0xccdfcf2fc18b6d68UL, 0xa8ebcba8b7806167UL,
++                0x980697f95e2937e3UL, 0x02fbba1cd0126e8cUL
++};
++
++/* Two independent 256x256 -> 512-bit products: c[0:7] = a[0:3] * b[0:3] and
++ * c[8:15] = a[4:7] * b[4:7], as 64-bit limbs, least significant limb first.
++ * c is stored while a and b are still being read, so it must not alias them.
++ */
++static void mul2_256x256_integer_adx(u64 *const c, const u64 *const a,
++                                   const u64 *const b)
++{
++      asm volatile(
++              "xorl %%r14d, %%r14d ;" /* r14 = 0, used for the carry folds */
++              "movq   (%1), %%rdx; "  /* A[0]; A[i] is a[i], B[i] is b[i] */
++              "mulx   (%2),  %%r8, %%r15; " /* A[0]*B[0] */
++              "xorl %%r10d, %%r10d ;" /* clear CF and OF before the chains */
++              "movq %%r8, (%0) ;"
++              "mulx  8(%2), %%r10, %%rax; " /* A[0]*B[1] */
++              "adox %%r10, %%r15 ;"
++              "mulx 16(%2),  %%r8, %%rbx; " /* A[0]*B[2] */
++              "adox  %%r8, %%rax ;"
++              "mulx 24(%2), %%r10, %%rcx; " /* A[0]*B[3] */
++              "adox %%r10, %%rbx ;"
++              /* r14 is 0: fold the pending OF carry into the top limb */
++              "adox %%r14, %%rcx ;"
++
++              "movq  8(%1), %%rdx; "  /* A[1]; NOTE(review): no xorl, unlike rows below */
++              "mulx   (%2),  %%r8,  %%r9; " /* A[1]*B[0] */
++              "adox %%r15,  %%r8 ;"
++              "movq  %%r8, 8(%0) ;"
++              "mulx  8(%2), %%r10, %%r11; " /* A[1]*B[1] */
++              "adox %%r10,  %%r9 ;"
++              "adcx  %%r9, %%rax ;"
++              "mulx 16(%2),  %%r8, %%r13; " /* A[1]*B[2] */
++              "adox  %%r8, %%r11 ;"
++              "adcx %%r11, %%rbx ;"
++              "mulx 24(%2), %%r10, %%r15; " /* A[1]*B[3] */
++              "adox %%r10, %%r13 ;"
++              "adcx %%r13, %%rcx ;"
++              /* r14 is 0: fold the OF and CF carries into the top limb */
++              "adox %%r14, %%r15 ;"
++              "adcx %%r14, %%r15 ;"
++
++              "movq 16(%1), %%rdx; " /* A[2] */
++              "xorl %%r10d, %%r10d ;"
++              "mulx   (%2),  %%r8,  %%r9; " /* A[2]*B[0] */
++              "adox %%rax,  %%r8 ;"
++              "movq %%r8, 16(%0) ;"
++              "mulx  8(%2), %%r10, %%r11; " /* A[2]*B[1] */
++              "adox %%r10,  %%r9 ;"
++              "adcx  %%r9, %%rbx ;"
++              "mulx 16(%2),  %%r8, %%r13; " /* A[2]*B[2] */
++              "adox  %%r8, %%r11 ;"
++              "adcx %%r11, %%rcx ;"
++              "mulx 24(%2), %%r10, %%rax; " /* A[2]*B[3] */
++              "adox %%r10, %%r13 ;"
++              "adcx %%r13, %%r15 ;"
++              /* r14 is 0: fold the OF and CF carries into the top limb */
++              "adox %%r14, %%rax ;"
++              "adcx %%r14, %%rax ;"
++
++              "movq 24(%1), %%rdx; " /* A[3] */
++              "xorl %%r10d, %%r10d ;"
++              "mulx   (%2),  %%r8,  %%r9; " /* A[3]*B[0] */
++              "adox %%rbx,  %%r8 ;"
++              "movq %%r8, 24(%0) ;"
++              "mulx  8(%2), %%r10, %%r11; " /* A[3]*B[1] */
++              "adox %%r10,  %%r9 ;"
++              "adcx  %%r9, %%rcx ;"
++              "movq %%rcx, 32(%0) ;"
++              "mulx 16(%2),  %%r8, %%r13; " /* A[3]*B[2] */
++              "adox  %%r8, %%r11 ;"
++              "adcx %%r11, %%r15 ;"
++              "movq %%r15, 40(%0) ;"
++              "mulx 24(%2), %%r10, %%rbx; " /* A[3]*B[3] */
++              "adox %%r10, %%r13 ;"
++              "adcx %%r13, %%rax ;"
++              "movq %%rax, 48(%0) ;"
++              /* r14 is 0: fold the OF and CF carries into the top limb */
++              "adox %%r14, %%rbx ;"
++              "adcx %%r14, %%rbx ;"
++              "movq %%rbx, 56(%0) ;"
++
++              "movq 32(%1), %%rdx; "  /* C[0]; C[i] is a[4 + i], D[i] is b[4 + i] */
++              "mulx 32(%2),  %%r8, %%r15; " /* C[0]*D[0] */
++              "xorl %%r10d, %%r10d ;"
++              "movq %%r8, 64(%0);"
++              "mulx 40(%2), %%r10, %%rax; " /* C[0]*D[1] */
++              "adox %%r10, %%r15 ;"
++              "mulx 48(%2),  %%r8, %%rbx; " /* C[0]*D[2] */
++              "adox  %%r8, %%rax ;"
++              "mulx 56(%2), %%r10, %%rcx; " /* C[0]*D[3] */
++              "adox %%r10, %%rbx ;"
++              /* r14 is 0: fold the pending OF carry into the top limb */
++              "adox %%r14, %%rcx ;"
++
++              "movq 40(%1), %%rdx; " /* C[1] */
++              "xorl %%r10d, %%r10d ;"
++              "mulx 32(%2),  %%r8,  %%r9; " /* C[1]*D[0] */
++              "adox %%r15,  %%r8 ;"
++              "movq  %%r8, 72(%0);"
++              "mulx 40(%2), %%r10, %%r11; " /* C[1]*D[1] */
++              "adox %%r10,  %%r9 ;"
++              "adcx  %%r9, %%rax ;"
++              "mulx 48(%2),  %%r8, %%r13; " /* C[1]*D[2] */
++              "adox  %%r8, %%r11 ;"
++              "adcx %%r11, %%rbx ;"
++              "mulx 56(%2), %%r10, %%r15; " /* C[1]*D[3] */
++              "adox %%r10, %%r13 ;"
++              "adcx %%r13, %%rcx ;"
++              /* r14 is 0: fold the OF and CF carries into the top limb */
++              "adox %%r14, %%r15 ;"
++              "adcx %%r14, %%r15 ;"
++
++              "movq 48(%1), %%rdx; " /* C[2] */
++              "xorl %%r10d, %%r10d ;"
++              "mulx 32(%2),  %%r8,  %%r9; " /* C[2]*D[0] */
++              "adox %%rax,  %%r8 ;"
++              "movq  %%r8, 80(%0);"
++              "mulx 40(%2), %%r10, %%r11; " /* C[2]*D[1] */
++              "adox %%r10,  %%r9 ;"
++              "adcx  %%r9, %%rbx ;"
++              "mulx 48(%2),  %%r8, %%r13; " /* C[2]*D[2] */
++              "adox  %%r8, %%r11 ;"
++              "adcx %%r11, %%rcx ;"
++              "mulx 56(%2), %%r10, %%rax; " /* C[2]*D[3] */
++              "adox %%r10, %%r13 ;"
++              "adcx %%r13, %%r15 ;"
++              /* r14 is 0: fold the OF and CF carries into the top limb */
++              "adox %%r14, %%rax ;"
++              "adcx %%r14, %%rax ;"
++
++              "movq 56(%1), %%rdx; " /* C[3] */
++              "xorl %%r10d, %%r10d ;"
++              "mulx 32(%2),  %%r8,  %%r9; " /* C[3]*D[0] */
++              "adox %%rbx,  %%r8 ;"
++              "movq  %%r8, 88(%0);"
++              "mulx 40(%2), %%r10, %%r11; " /* C[3]*D[1] */
++              "adox %%r10,  %%r9 ;"
++              "adcx  %%r9, %%rcx ;"
++              "movq %%rcx,  96(%0) ;"
++              "mulx 48(%2),  %%r8, %%r13; " /* C[3]*D[2] */
++              "adox  %%r8, %%r11 ;"
++              "adcx %%r11, %%r15 ;"
++              "movq %%r15, 104(%0) ;"
++              "mulx 56(%2), %%r10, %%rbx; " /* C[3]*D[3] */
++              "adox %%r10, %%r13 ;"
++              "adcx %%r13, %%rax ;"
++              "movq %%rax, 112(%0) ;"
++              /* r14 is 0: fold the OF and CF carries into the top limb */
++              "adox %%r14, %%rbx ;"
++              "adcx %%r14, %%rbx ;"
++              "movq %%rbx, 120(%0) ;"
++              :
++              : "r"(c), "r"(a), "r"(b) /* %0 = c, %1 = a, %2 = b */
++              : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9",
++                "%r10", "%r11", "%r13", "%r14", "%r15");
++}
++
++static void mul2_256x256_integer_bmi2(u64 *const c, const u64 *const a,
++                                    const u64 *const b)
++{
++      asm volatile( /* c[0:7] = a[0:3] * b[0:3], c[8:15] = a[4:7] * b[4:7] */
++              "movq   (%1), %%rdx; "  /* A[0]; A[i] is a[i], B[i] is b[i] */
++              "mulx   (%2),  %%r8, %%r15; " /* A[0]*B[0] */
++              "movq %%r8,  (%0) ;"
++              "mulx  8(%2), %%r10, %%rax; " /* A[0]*B[1] */
++              "addq %%r10, %%r15 ;"
++              "mulx 16(%2),  %%r8, %%rbx; " /* A[0]*B[2] */
++              "adcq  %%r8, %%rax ;"
++              "mulx 24(%2), %%r10, %%rcx; " /* A[0]*B[3] */
++              "adcq %%r10, %%rbx ;"
++              /* add this row's last carry into its top limb */
++              "adcq    $0, %%rcx ;"
++
++              "movq  8(%1), %%rdx; "  /* A[1] */
++              "mulx   (%2),  %%r8,  %%r9; " /* A[1]*B[0] */
++              "addq %%r15,  %%r8 ;"
++              "movq %%r8, 8(%0) ;"
++              "mulx  8(%2), %%r10, %%r11; " /* A[1]*B[1] */
++              "adcq %%r10,  %%r9 ;"
++              "mulx 16(%2),  %%r8, %%r13; " /* A[1]*B[2] */
++              "adcq  %%r8, %%r11 ;"
++              "mulx 24(%2), %%r10, %%r15; " /* A[1]*B[3] */
++              "adcq %%r10, %%r13 ;"
++              /* add this row's last carry into its top limb */
++              "adcq    $0, %%r15 ;"
++
++              "addq  %%r9, %%rax ;" /* add row into running sum */
++              "adcq %%r11, %%rbx ;"
++              "adcq %%r13, %%rcx ;"
++              "adcq    $0, %%r15 ;"
++
++              "movq 16(%1), %%rdx; "  /* A[2] */
++              "mulx   (%2),  %%r8,  %%r9; " /* A[2]*B[0] */
++              "addq %%rax,  %%r8 ;"
++              "movq %%r8, 16(%0) ;"
++              "mulx  8(%2), %%r10, %%r11; " /* A[2]*B[1] */
++              "adcq %%r10,  %%r9 ;"
++              "mulx 16(%2),  %%r8, %%r13; " /* A[2]*B[2] */
++              "adcq  %%r8, %%r11 ;"
++              "mulx 24(%2), %%r10, %%rax; " /* A[2]*B[3] */
++              "adcq %%r10, %%r13 ;"
++              /* add this row's last carry into its top limb */
++              "adcq    $0, %%rax ;"
++
++              "addq  %%r9, %%rbx ;" /* add row into running sum */
++              "adcq %%r11, %%rcx ;"
++              "adcq %%r13, %%r15 ;"
++              "adcq    $0, %%rax ;"
++
++              "movq 24(%1), %%rdx; "  /* A[3] */
++              "mulx   (%2),  %%r8,  %%r9; " /* A[3]*B[0] */
++              "addq %%rbx,  %%r8 ;"
++              "movq %%r8, 24(%0) ;"
++              "mulx  8(%2), %%r10, %%r11; " /* A[3]*B[1] */
++              "adcq %%r10,  %%r9 ;"
++              "mulx 16(%2),  %%r8, %%r13; " /* A[3]*B[2] */
++              "adcq  %%r8, %%r11 ;"
++              "mulx 24(%2), %%r10, %%rbx; " /* A[3]*B[3] */
++              "adcq %%r10, %%r13 ;"
++              /* add this row's last carry into its top limb */
++              "adcq    $0, %%rbx ;"
++
++              "addq  %%r9, %%rcx ;" /* add row into running sum */
++              "movq %%rcx, 32(%0) ;"
++              "adcq %%r11, %%r15 ;"
++              "movq %%r15, 40(%0) ;"
++              "adcq %%r13, %%rax ;"
++              "movq %%rax, 48(%0) ;"
++              "adcq    $0, %%rbx ;"
++              "movq %%rbx, 56(%0) ;"
++
++              "movq 32(%1), %%rdx; "  /* C[0]; C[i] is a[4 + i], D[i] is b[4 + i] */
++              "mulx 32(%2),  %%r8, %%r15; " /* C[0]*D[0] */
++              "movq %%r8, 64(%0) ;"
++              "mulx 40(%2), %%r10, %%rax; " /* C[0]*D[1] */
++              "addq %%r10, %%r15 ;"
++              "mulx 48(%2),  %%r8, %%rbx; " /* C[0]*D[2] */
++              "adcq  %%r8, %%rax ;"
++              "mulx 56(%2), %%r10, %%rcx; " /* C[0]*D[3] */
++              "adcq %%r10, %%rbx ;"
++              /* add this row's last carry into its top limb */
++              "adcq    $0, %%rcx ;"
++
++              "movq 40(%1), %%rdx; "  /* C[1] */
++              "mulx 32(%2),  %%r8,  %%r9; " /* C[1]*D[0] */
++              "addq %%r15,  %%r8 ;"
++              "movq %%r8, 72(%0) ;"
++              "mulx 40(%2), %%r10, %%r11; " /* C[1]*D[1] */
++              "adcq %%r10,  %%r9 ;"
++              "mulx 48(%2),  %%r8, %%r13; " /* C[1]*D[2] */
++              "adcq  %%r8, %%r11 ;"
++              "mulx 56(%2), %%r10, %%r15; " /* C[1]*D[3] */
++              "adcq %%r10, %%r13 ;"
++              /* add this row's last carry into its top limb */
++              "adcq    $0, %%r15 ;"
++
++              "addq  %%r9, %%rax ;" /* add row into running sum */
++              "adcq %%r11, %%rbx ;"
++              "adcq %%r13, %%rcx ;"
++              "adcq    $0, %%r15 ;"
++
++              "movq 48(%1), %%rdx; "  /* C[2] */
++              "mulx 32(%2),  %%r8,  %%r9; " /* C[2]*D[0] */
++              "addq %%rax,  %%r8 ;"
++              "movq %%r8, 80(%0) ;"
++              "mulx 40(%2), %%r10, %%r11; " /* C[2]*D[1] */
++              "adcq %%r10,  %%r9 ;"
++              "mulx 48(%2),  %%r8, %%r13; " /* C[2]*D[2] */
++              "adcq  %%r8, %%r11 ;"
++              "mulx 56(%2), %%r10, %%rax; " /* C[2]*D[3] */
++              "adcq %%r10, %%r13 ;"
++              /* add this row's last carry into its top limb */
++              "adcq    $0, %%rax ;"
++
++              "addq  %%r9, %%rbx ;" /* add row into running sum */
++              "adcq %%r11, %%rcx ;"
++              "adcq %%r13, %%r15 ;"
++              "adcq    $0, %%rax ;"
++
++              "movq 56(%1), %%rdx; "  /* C[3] */
++              "mulx 32(%2),  %%r8,  %%r9; " /* C[3]*D[0] */
++              "addq %%rbx,  %%r8 ;"
++              "movq %%r8, 88(%0) ;"
++              "mulx 40(%2), %%r10, %%r11; " /* C[3]*D[1] */
++              "adcq %%r10,  %%r9 ;"
++              "mulx 48(%2),  %%r8, %%r13; " /* C[3]*D[2] */
++              "adcq  %%r8, %%r11 ;"
++              "mulx 56(%2), %%r10, %%rbx; " /* C[3]*D[3] */
++              "adcq %%r10, %%r13 ;"
++              /* add this row's last carry into its top limb */
++              "adcq    $0, %%rbx ;"
++
++              "addq  %%r9, %%rcx ;" /* add row into running sum */
++              "movq %%rcx,  96(%0) ;"
++              "adcq %%r11, %%r15 ;"
++              "movq %%r15, 104(%0) ;"
++              "adcq %%r13, %%rax ;"
++              "movq %%rax, 112(%0) ;"
++              "adcq    $0, %%rbx ;"
++              "movq %%rbx, 120(%0) ;"
++              :
++              : "r"(c), "r"(a), "r"(b) /* %0 = c, %1 = a, %2 = b */
++              : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9",
++                "%r10", "%r11", "%r13", "%r15");
++}
++
++static void sqr2_256x256_integer_adx(u64 *const c, const u64 *const a)
++{ /* Two 256-bit squarings: c[0..7] = a[0..3]^2 and c[8..15] = a[4..7]^2 (64-bit limbs, least significant first) */
++      asm volatile( /* %0 = c, %1 = a; c is stored while a is still being read, so they presumably must not overlap */
++              "movq   (%1), %%rdx        ;" /* A[0]      */
++              "mulx  8(%1),  %%r8, %%r14 ;" /* A[1]*A[0] */
++              "xorl %%r15d, %%r15d;" /* r15 = 0, CF = OF = 0 */
++              "mulx 16(%1),  %%r9, %%r10 ;" /* A[2]*A[0] */
++              "adcx %%r14,  %%r9 ;"
++              "mulx 24(%1), %%rax, %%rcx ;" /* A[3]*A[0] */
++              "adcx %%rax, %%r10 ;"
++              "movq 24(%1), %%rdx        ;" /* A[3]      */
++              "mulx  8(%1), %%r11, %%rbx ;" /* A[1]*A[3] */
++              "adcx %%rcx, %%r11 ;"
++              "mulx 16(%1), %%rax, %%r13 ;" /* A[2]*A[3] */
++              "adcx %%rax, %%rbx ;"
++              "movq  8(%1), %%rdx        ;" /* A[1]      */
++              "adcx %%r15, %%r13 ;"
++              "mulx 16(%1), %%rax, %%rcx ;" /* A[2]*A[1] */
++              "movq    $0, %%r14 ;" /* mov does not change CF */
++              /* r14 = carry out of the cross-product chain */
++              "adcx %%r15, %%r14 ;"
++              /* OF chain (adox) adds A[2]*A[1]; CF chain (adcx reg,reg) then doubles each limb */
++              "xorl %%r15d, %%r15d;" /* r15 = 0, CF = OF = 0 */
++              "adox %%rax, %%r10 ;"
++              "adcx  %%r8,  %%r8 ;"
++              "adox %%rcx, %%r11 ;"
++              "adcx  %%r9,  %%r9 ;"
++              "adox %%r15, %%rbx ;"
++              "adcx %%r10, %%r10 ;"
++              "adox %%r15, %%r13 ;"
++              "adcx %%r11, %%r11 ;"
++              "adox %%r15, %%r14 ;"
++              "adcx %%rbx, %%rbx ;"
++              "adcx %%r13, %%r13 ;"
++              "adcx %%r14, %%r14 ;"
++              /* Add the squares A[i]^2; the movq/mulx in between leave the add/adc carry untouched */
++              "movq   (%1), %%rdx ;"
++              "mulx %%rdx, %%rax, %%rcx ;" /* A[0]^2 */
++              /*******************/
++              "movq %%rax,  0(%0) ;"
++              "addq %%rcx,  %%r8 ;"
++              "movq  %%r8,  8(%0) ;"
++              "movq  8(%1), %%rdx ;"
++              "mulx %%rdx, %%rax, %%rcx ;" /* A[1]^2 */
++              "adcq %%rax,  %%r9 ;"
++              "movq  %%r9, 16(%0) ;"
++              "adcq %%rcx, %%r10 ;"
++              "movq %%r10, 24(%0) ;"
++              "movq 16(%1), %%rdx ;"
++              "mulx %%rdx, %%rax, %%rcx ;" /* A[2]^2 */
++              "adcq %%rax, %%r11 ;"
++              "movq %%r11, 32(%0) ;"
++              "adcq %%rcx, %%rbx ;"
++              "movq %%rbx, 40(%0) ;"
++              "movq 24(%1), %%rdx ;"
++              "mulx %%rdx, %%rax, %%rcx ;" /* A[3]^2 */
++              "adcq %%rax, %%r13 ;"
++              "movq %%r13, 48(%0) ;"
++              "adcq %%rcx, %%r14 ;"
++              "movq %%r14, 56(%0) ;"
++
++              /* Second squaring, same sequence: B = a[4..7], result stored to c[8..15] */
++              "movq 32(%1), %%rdx        ;" /* B[0]      */
++              "mulx 40(%1),  %%r8, %%r14 ;" /* B[1]*B[0] */
++              "xorl %%r15d, %%r15d;" /* r15 = 0, CF = OF = 0 */
++              "mulx 48(%1),  %%r9, %%r10 ;" /* B[2]*B[0] */
++              "adcx %%r14,  %%r9 ;"
++              "mulx 56(%1), %%rax, %%rcx ;" /* B[3]*B[0] */
++              "adcx %%rax, %%r10 ;"
++              "movq 56(%1), %%rdx        ;" /* B[3]      */
++              "mulx 40(%1), %%r11, %%rbx ;" /* B[1]*B[3] */
++              "adcx %%rcx, %%r11 ;"
++              "mulx 48(%1), %%rax, %%r13 ;" /* B[2]*B[3] */
++              "adcx %%rax, %%rbx ;"
++              "movq 40(%1), %%rdx        ;" /* B[1]      */
++              "adcx %%r15, %%r13 ;"
++              "mulx 48(%1), %%rax, %%rcx ;" /* B[2]*B[1] */
++              "movq    $0, %%r14 ;" /* mov does not change CF */
++              /* r14 = carry out of the cross-product chain */
++              "adcx %%r15, %%r14 ;"
++              /* OF chain (adox) adds B[2]*B[1]; CF chain (adcx reg,reg) then doubles each limb */
++              "xorl %%r15d, %%r15d;" /* r15 = 0, CF = OF = 0 */
++              "adox %%rax, %%r10 ;"
++              "adcx  %%r8,  %%r8 ;"
++              "adox %%rcx, %%r11 ;"
++              "adcx  %%r9,  %%r9 ;"
++              "adox %%r15, %%rbx ;"
++              "adcx %%r10, %%r10 ;"
++              "adox %%r15, %%r13 ;"
++              "adcx %%r11, %%r11 ;"
++              "adox %%r15, %%r14 ;"
++              "adcx %%rbx, %%rbx ;"
++              "adcx %%r13, %%r13 ;"
++              "adcx %%r14, %%r14 ;"
++              /* Add the squares B[i]^2 along the add/adc carry chain */
++              "movq 32(%1), %%rdx ;"
++              "mulx %%rdx, %%rax, %%rcx ;" /* B[0]^2 */
++              /*******************/
++              "movq %%rax,  64(%0) ;"
++              "addq %%rcx,  %%r8 ;"
++              "movq  %%r8,  72(%0) ;"
++              "movq 40(%1), %%rdx ;"
++              "mulx %%rdx, %%rax, %%rcx ;" /* B[1]^2 */
++              "adcq %%rax,  %%r9 ;"
++              "movq  %%r9,  80(%0) ;"
++              "adcq %%rcx, %%r10 ;"
++              "movq %%r10,  88(%0) ;"
++              "movq 48(%1), %%rdx ;"
++              "mulx %%rdx, %%rax, %%rcx ;" /* B[2]^2 */
++              "adcq %%rax, %%r11 ;"
++              "movq %%r11,  96(%0) ;"
++              "adcq %%rcx, %%rbx ;"
++              "movq %%rbx, 104(%0) ;"
++              "movq 56(%1), %%rdx ;"
++              "mulx %%rdx, %%rax, %%rcx ;" /* B[3]^2 */
++              "adcq %%rax, %%r13 ;"
++              "movq %%r13, 112(%0) ;"
++              "adcq %%rcx, %%r14 ;"
++              "movq %%r14, 120(%0) ;"
++              :
++              : "r"(c), "r"(a)
++              : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9",
++                "%r10", "%r11", "%r13", "%r14", "%r15");
++}
++
++static void sqr2_256x256_integer_bmi2(u64 *const c, const u64 *const a)
++{ /* Two 256-bit squarings: c[0..7] = a[0..3]^2 and c[8..15] = a[4..7]^2, using mulx with add/adc and shld only */
++      asm volatile( /* %0 = c, %1 = a; c is stored while a is still being read, so they presumably must not overlap */
++              "movq  8(%1), %%rdx        ;" /* A[1]      */
++              "mulx   (%1),  %%r8,  %%r9 ;" /* A[0]*A[1] */
++              "mulx 16(%1), %%r10, %%r11 ;" /* A[2]*A[1] */
++              "mulx 24(%1), %%rcx, %%r14 ;" /* A[3]*A[1] */
++
++              "movq 16(%1), %%rdx        ;" /* A[2]      */
++              "mulx 24(%1), %%r15, %%r13 ;" /* A[3]*A[2] */
++              "mulx   (%1), %%rax, %%rdx ;" /* A[0]*A[2], high half overwrites rdx */
++              /* Sum the cross products into r8, r9, r10, r11, r15, r13; r14 takes the carry out */
++              "addq %%rax,  %%r9 ;"
++              "adcq %%rdx, %%r10 ;"
++              "adcq %%rcx, %%r11 ;"
++              "adcq %%r14, %%r15 ;"
++              "adcq    $0, %%r13 ;"
++              "movq    $0, %%r14 ;" /* mov does not change CF */
++              "adcq    $0, %%r14 ;"
++
++              "movq   (%1), %%rdx        ;" /* A[0]      */
++              "mulx 24(%1), %%rax, %%rcx ;" /* A[0]*A[3] */
++
++              "addq %%rax, %%r10 ;"
++              "adcq %%rcx, %%r11 ;"
++              "adcq    $0, %%r15 ;"
++              "adcq    $0, %%r13 ;"
++              "adcq    $0, %%r14 ;"
++              /* Double the sum: shift the 7-limb value left by one bit, most significant limb first */
++              "shldq $1, %%r13, %%r14 ;"
++              "shldq $1, %%r15, %%r13 ;"
++              "shldq $1, %%r11, %%r15 ;"
++              "shldq $1, %%r10, %%r11 ;"
++              "shldq $1,  %%r9, %%r10 ;"
++              "shldq $1,  %%r8,  %%r9 ;"
++              "shlq  $1,  %%r8        ;"
++
++              /* rdx still holds A[0] (loaded before the shifts); add the squares A[i]^2 */
++              "mulx %%rdx, %%rax, %%rcx ; " /* A[0]^2 */
++              /*******************/
++              "movq %%rax,  0(%0) ;"
++              "addq %%rcx,  %%r8 ;"
++              "movq  %%r8,  8(%0) ;"
++              "movq  8(%1), %%rdx ;"
++              "mulx %%rdx, %%rax, %%rcx ; " /* A[1]^2 */
++              "adcq %%rax,  %%r9 ;"
++              "movq  %%r9, 16(%0) ;"
++              "adcq %%rcx, %%r10 ;"
++              "movq %%r10, 24(%0) ;"
++              "movq 16(%1), %%rdx ;"
++              "mulx %%rdx, %%rax, %%rcx ; " /* A[2]^2 */
++              "adcq %%rax, %%r11 ;"
++              "movq %%r11, 32(%0) ;"
++              "adcq %%rcx, %%r15 ;"
++              "movq %%r15, 40(%0) ;"
++              "movq 24(%1), %%rdx ;"
++              "mulx %%rdx, %%rax, %%rcx ; " /* A[3]^2 */
++              "adcq %%rax, %%r13 ;"
++              "movq %%r13, 48(%0) ;"
++              "adcq %%rcx, %%r14 ;"
++              "movq %%r14, 56(%0) ;"
++              /* Second squaring, same sequence: B = a[4..7], result stored to c[8..15] */
++              "movq 40(%1), %%rdx        ;" /* B[1]      */
++              "mulx 32(%1),  %%r8,  %%r9 ;" /* B[0]*B[1] */
++              "mulx 48(%1), %%r10, %%r11 ;" /* B[2]*B[1] */
++              "mulx 56(%1), %%rcx, %%r14 ;" /* B[3]*B[1] */
++
++              "movq 48(%1), %%rdx        ;" /* B[2]      */
++              "mulx 56(%1), %%r15, %%r13 ;" /* B[3]*B[2] */
++              "mulx 32(%1), %%rax, %%rdx ;" /* B[0]*B[2], high half overwrites rdx */
++              /* Sum the cross products into r8, r9, r10, r11, r15, r13; r14 takes the carry out */
++              "addq %%rax,  %%r9 ;"
++              "adcq %%rdx, %%r10 ;"
++              "adcq %%rcx, %%r11 ;"
++              "adcq %%r14, %%r15 ;"
++              "adcq    $0, %%r13 ;"
++              "movq    $0, %%r14 ;" /* mov does not change CF */
++              "adcq    $0, %%r14 ;"
++
++              "movq 32(%1), %%rdx        ;" /* B[0]      */
++              "mulx 56(%1), %%rax, %%rcx ;" /* B[0]*B[3] */
++
++              "addq %%rax, %%r10 ;"
++              "adcq %%rcx, %%r11 ;"
++              "adcq    $0, %%r15 ;"
++              "adcq    $0, %%r13 ;"
++              "adcq    $0, %%r14 ;"
++              /* Double the sum: shift the 7-limb value left by one bit, most significant limb first */
++              "shldq $1, %%r13, %%r14 ;"
++              "shldq $1, %%r15, %%r13 ;"
++              "shldq $1, %%r11, %%r15 ;"
++              "shldq $1, %%r10, %%r11 ;"
++              "shldq $1,  %%r9, %%r10 ;"
++              "shldq $1,  %%r8,  %%r9 ;"
++              "shlq  $1,  %%r8        ;"
++
++              /* rdx still holds B[0] (loaded before the shifts); add the squares B[i]^2 */
++              "mulx %%rdx, %%rax, %%rcx ; " /* B[0]^2 */
++              /*******************/
++              "movq %%rax,  64(%0) ;"
++              "addq %%rcx,  %%r8 ;"
++              "movq  %%r8,  72(%0) ;"
++              "movq 40(%1), %%rdx ;"
++              "mulx %%rdx, %%rax, %%rcx ; " /* B[1]^2 */
++              "adcq %%rax,  %%r9 ;"
++              "movq  %%r9,  80(%0) ;"
++              "adcq %%rcx, %%r10 ;"
++              "movq %%r10,  88(%0) ;"
++              "movq 48(%1), %%rdx ;"
++              "mulx %%rdx, %%rax, %%rcx ; " /* B[2]^2 */
++              "adcq %%rax, %%r11 ;"
++              "movq %%r11,  96(%0) ;"
++              "adcq %%rcx, %%r15 ;"
++              "movq %%r15, 104(%0) ;"
++              "movq 56(%1), %%rdx ;"
++              "mulx %%rdx, %%rax, %%rcx ; " /* B[3]^2 */
++              "adcq %%rax, %%r13 ;"
++              "movq %%r13, 112(%0) ;"
++              "adcq %%rcx, %%r14 ;"
++              "movq %%r14, 120(%0) ;"
++              :
++              : "r"(c), "r"(a)
++              : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10",
++                "%r11", "%r13", "%r14", "%r15");
++}
++
++static void red_eltfp25519_2w_adx(u64 *const c, const u64 *const a)
++{ /* Folds two 8-limb values into 4 limbs each: c[0..3] from a[0..7], c[4..7] from a[8..15], as low + 38*high */
++      asm volatile( /* %0 = c, %1 = a; no final subtraction of the modulus is performed here */
++              "movl    $38, %%edx; "  /* 2*c = 38 = 2^256, presumably mod 2^255-19 (c = 19) */
++              "mulx 32(%1),  %%r8, %%r10; " /* c*C[4] */
++              "xorl %%ebx, %%ebx ;" /* rbx = 0, CF = OF = 0 */
++              "adox   (%1),  %%r8 ;" /* OF chain adds the low limbs C[0..3] */
++              "mulx 40(%1),  %%r9, %%r11; " /* c*C[5] */
++              "adcx %%r10,  %%r9 ;" /* CF chain adds the high halves of the products */
++              "adox  8(%1),  %%r9 ;"
++              "mulx 48(%1), %%r10, %%rax; " /* c*C[6] */
++              "adcx %%r11, %%r10 ;"
++              "adox 16(%1), %%r10 ;"
++              "mulx 56(%1), %%r11, %%rcx; " /* c*C[7] */
++              "adcx %%rax, %%r11 ;"
++              "adox 24(%1), %%r11 ;"
++              /* rcx = high half of 38*C[7] plus the carries of both chains */
++              "adcx %%rbx, %%rcx ;"
++              "adox  %%rbx, %%rcx ;"
++              "imul %%rdx, %%rcx ;" /* rcx *= 38 (fold the overflow limb), cf=0, of=0 */
++              "adcx %%rcx,  %%r8 ;"
++              "adcx %%rbx,  %%r9 ;"
++              "movq  %%r9,  8(%0) ;"
++              "adcx %%rbx, %%r10 ;"
++              "movq %%r10, 16(%0) ;"
++              "adcx %%rbx, %%r11 ;"
++              "movq %%r11, 24(%0) ;"
++              "mov     $0, %%ecx ;" /* mov does not change CF */
++              "cmovc %%edx, %%ecx ;" /* rcx = 38 if the fold carried out, else 0 */
++              "addq %%rcx,  %%r8 ;"
++              "movq  %%r8,   (%0) ;"
++              /* Second value: a[8..15] -> c[4..7]; rdx still holds 38 */
++              "mulx  96(%1),  %%r8, %%r10; " /* c*C[4] */
++              "xorl %%ebx, %%ebx ;" /* rbx = 0, CF = OF = 0 */
++              "adox 64(%1),  %%r8 ;"
++              "mulx 104(%1),  %%r9, %%r11; " /* c*C[5] */
++              "adcx %%r10,  %%r9 ;"
++              "adox 72(%1),  %%r9 ;"
++              "mulx 112(%1), %%r10, %%rax; " /* c*C[6] */
++              "adcx %%r11, %%r10 ;"
++              "adox 80(%1), %%r10 ;"
++              "mulx 120(%1), %%r11, %%rcx; " /* c*C[7] */
++              "adcx %%rax, %%r11 ;"
++              "adox 88(%1), %%r11 ;"
++              /* rcx = high half of the last product plus the carries of both chains */
++              "adcx %%rbx, %%rcx ;"
++              "adox  %%rbx, %%rcx ;"
++              "imul %%rdx, %%rcx ;" /* rcx *= 38 (fold the overflow limb), cf=0, of=0 */
++              "adcx %%rcx,  %%r8 ;"
++              "adcx %%rbx,  %%r9 ;"
++              "movq  %%r9, 40(%0) ;"
++              "adcx %%rbx, %%r10 ;"
++              "movq %%r10, 48(%0) ;"
++              "adcx %%rbx, %%r11 ;"
++              "movq %%r11, 56(%0) ;"
++              "mov     $0, %%ecx ;" /* mov does not change CF */
++              "cmovc %%edx, %%ecx ;" /* rcx = 38 if the fold carried out, else 0 */
++              "addq %%rcx,  %%r8 ;"
++              "movq  %%r8, 32(%0) ;"
++              :
++              : "r"(c), "r"(a)
++              : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9",
++                "%r10", "%r11");
++}
++
++static void red_eltfp25519_2w_bmi2(u64 *const c, const u64 *const a)
++{ /* Folds two 8-limb values into 4 limbs each: c[0..3] from a[0..7], c[4..7] from a[8..15], as low + 38*high */
++      asm volatile( /* %0 = c, %1 = a; uses mulx with add/adc only; no final subtraction of the modulus here */
++              "movl    $38, %%edx ; "       /* 2*c = 38 = 2^256, presumably mod 2^255-19 (c = 19) */
++              "mulx 32(%1),  %%r8, %%r10 ;" /* c*C[4] */
++              "mulx 40(%1),  %%r9, %%r11 ;" /* c*C[5] */
++              "addq %%r10,  %%r9 ;"
++              "mulx 48(%1), %%r10, %%rax ;" /* c*C[6] */
++              "adcq %%r11, %%r10 ;"
++              "mulx 56(%1), %%r11, %%rcx ;" /* c*C[7] */
++              "adcq %%rax, %%r11 ;"
++              /* rcx:r11:r10:r9:r8 = 38*C[4..7]; now add the low limbs C[0..3] */
++              "adcq    $0, %%rcx ;"
++              "addq   (%1),  %%r8 ;"
++              "adcq  8(%1),  %%r9 ;"
++              "adcq 16(%1), %%r10 ;"
++              "adcq 24(%1), %%r11 ;"
++              "adcq     $0, %%rcx ;"
++              "imul %%rdx, %%rcx ;" /* rcx *= 38 (fold the overflow limb), cf=0 */
++              "addq %%rcx,  %%r8 ;"
++              "adcq    $0,  %%r9 ;"
++              "movq  %%r9,  8(%0) ;"
++              "adcq    $0, %%r10 ;"
++              "movq %%r10, 16(%0) ;"
++              "adcq    $0, %%r11 ;"
++              "movq %%r11, 24(%0) ;"
++              "mov     $0, %%ecx ;" /* mov does not change CF */
++              "cmovc %%edx, %%ecx ;" /* rcx = 38 if the fold carried out, else 0 */
++              "addq %%rcx,  %%r8 ;"
++              "movq  %%r8,   (%0) ;"
++              /* Second value: a[8..15] -> c[4..7]; rdx still holds 38 */
++              "mulx  96(%1),  %%r8, %%r10 ;" /* c*C[4] */
++              "mulx 104(%1),  %%r9, %%r11 ;" /* c*C[5] */
++              "addq %%r10,  %%r9 ;"
++              "mulx 112(%1), %%r10, %%rax ;" /* c*C[6] */
++              "adcq %%r11, %%r10 ;"
++              "mulx 120(%1), %%r11, %%rcx ;" /* c*C[7] */
++              "adcq %%rax, %%r11 ;"
++              /* rcx:r11:r10:r9:r8 = 38 * high half; now add the low limbs at 64..88(%1) */
++              "adcq    $0, %%rcx ;"
++              "addq 64(%1),  %%r8 ;"
++              "adcq 72(%1),  %%r9 ;"
++              "adcq 80(%1), %%r10 ;"
++              "adcq 88(%1), %%r11 ;"
++              "adcq     $0, %%rcx ;"
++              "imul %%rdx, %%rcx ;" /* rcx *= 38 (fold the overflow limb), cf=0 */
++              "addq %%rcx,  %%r8 ;"
++              "adcq    $0,  %%r9 ;"
++              "movq  %%r9, 40(%0) ;"
++              "adcq    $0, %%r10 ;"
++              "movq %%r10, 48(%0) ;"
++              "adcq    $0, %%r11 ;"
++              "movq %%r11, 56(%0) ;"
++              "mov     $0, %%ecx ;" /* mov does not change CF */
++              "cmovc %%edx, %%ecx ;" /* rcx = 38 if the fold carried out, else 0 */
++              "addq %%rcx,  %%r8 ;"
++              "movq  %%r8, 32(%0) ;"
++              :
++              : "r"(c), "r"(a)
++              : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10",
++                "%r11");
++}
++
++static void mul_256x256_integer_adx(u64 *const c, const u64 *const a,
++                                  const u64 *const b)
++{ /* 256x256 -> 512-bit product: c[0..7] = a[0..3] * b[0..3], one row A[i]*B per step (mulx/adcx/adox) */
++      asm volatile( /* %0 = c, %1 = a, %2 = b; c is used as scratch between rows, so it presumably must not overlap a or b */
++              "movq   (%1), %%rdx; "  /* A[0] */
++              "mulx   (%2),  %%r8,  %%r9; " /* A[0]*B[0] */
++              "xorl %%r10d, %%r10d ;" /* CF = OF = 0 (r10 itself is overwritten below) */
++              "movq  %%r8,  (%0) ;"
++              "mulx  8(%2), %%r10, %%r11; " /* A[0]*B[1] */
++              "adox  %%r9, %%r10 ;"
++              "movq %%r10, 8(%0) ;"
++              "mulx 16(%2), %%r15, %%r13; " /* A[0]*B[2] */
++              "adox %%r11, %%r15 ;"
++              "mulx 24(%2), %%r14, %%rdx; " /* A[0]*B[3], high half into rdx */
++              "adox %%r13, %%r14 ;"
++              "movq $0, %%rax ;" /* mov does not change flags */
++              /* rax = high half of A[0]*B[3] plus the OF carry; running sum now in c[0], c[1], r15, r14, rax */
++              "adox %%rdx, %%rax ;"
++              /* Row 1: adox chains the partial products of A[1]*B, adcx adds the previous running sum */
++              "movq  8(%1), %%rdx; "  /* A[1] */
++              "mulx   (%2),  %%r8,  %%r9; " /* A[1]*B[0] */
++              "xorl %%r10d, %%r10d ;" /* CF = OF = 0 */
++              "adcx 8(%0),  %%r8 ;" /* add back c[1] stored by row 0 */
++              "movq  %%r8,  8(%0) ;"
++              "mulx  8(%2), %%r10, %%r11; " /* A[1]*B[1] */
++              "adox  %%r9, %%r10 ;"
++              "adcx %%r15, %%r10 ;"
++              "movq %%r10, 16(%0) ;"
++              "mulx 16(%2), %%r15, %%r13; " /* A[1]*B[2] */
++              "adox %%r11, %%r15 ;"
++              "adcx %%r14, %%r15 ;"
++              "movq $0, %%r8  ;" /* r8 = 0 for the final adcx; flags untouched */
++              "mulx 24(%2), %%r14, %%rdx; " /* A[1]*B[3] */
++              "adox %%r13, %%r14 ;"
++              "adcx %%rax, %%r14 ;"
++              "movq $0, %%rax ;"
++              /* rax = high half of A[1]*B[3] + OF carry + CF carry */
++              "adox %%rdx, %%rax ;"
++              "adcx  %%r8, %%rax ;"
++              /* Row 2: same pattern, one limb higher */
++              "movq 16(%1), %%rdx; "  /* A[2] */
++              "mulx   (%2),  %%r8,  %%r9; " /* A[2]*B[0] */
++              "xorl %%r10d, %%r10d ;" /* CF = OF = 0 */
++              "adcx 16(%0), %%r8 ;" /* add back c[2] stored by row 1 */
++              "movq  %%r8, 16(%0) ;"
++              "mulx  8(%2), %%r10, %%r11; " /* A[2]*B[1] */
++              "adox  %%r9, %%r10 ;"
++              "adcx %%r15, %%r10 ;"
++              "movq %%r10, 24(%0) ;"
++              "mulx 16(%2), %%r15, %%r13; " /* A[2]*B[2] */
++              "adox %%r11, %%r15 ;"
++              "adcx %%r14, %%r15 ;"
++              "movq $0, %%r8  ;" /* r8 = 0 for the final adcx; flags untouched */
++              "mulx 24(%2), %%r14, %%rdx; " /* A[2]*B[3] */
++              "adox %%r13, %%r14 ;"
++              "adcx %%rax, %%r14 ;"
++              "movq $0, %%rax ;"
++              /* rax = high half of A[2]*B[3] + OF carry + CF carry */
++              "adox %%rdx, %%rax ;"
++              "adcx  %%r8, %%rax ;"
++              /* Row 3: last row, the remaining limbs are stored straight to c[4..7] */
++              "movq 24(%1), %%rdx; "  /* A[3] */
++              "mulx   (%2),  %%r8,  %%r9; " /* A[3]*B[0] */
++              "xorl %%r10d, %%r10d ;" /* CF = OF = 0 */
++              "adcx 24(%0), %%r8 ;" /* add back c[3] stored by row 2 */
++              "movq  %%r8, 24(%0) ;"
++              "mulx  8(%2), %%r10, %%r11; " /* A[3]*B[1] */
++              "adox  %%r9, %%r10 ;"
++              "adcx %%r15, %%r10 ;"
++              "movq %%r10, 32(%0) ;"
++              "mulx 16(%2), %%r15, %%r13; " /* A[3]*B[2] */
++              "adox %%r11, %%r15 ;"
++              "adcx %%r14, %%r15 ;"
++              "movq %%r15, 40(%0) ;"
++              "movq $0, %%r8  ;" /* r8 = 0 for the final adcx; flags untouched */
++              "mulx 24(%2), %%r14, %%rdx; " /* A[3]*B[3] */
++              "adox %%r13, %%r14 ;"
++              "adcx %%rax, %%r14 ;"
++              "movq %%r14, 48(%0) ;"
++              "movq $0, %%rax ;"
++              /* rax = high half of A[3]*B[3] + OF carry + CF carry = top limb */
++              "adox %%rdx, %%rax ;"
++              "adcx  %%r8, %%rax ;"
++              "movq %%rax, 56(%0) ;"
++              :
++              : "r"(c), "r"(a), "r"(b)
++              : "memory", "cc", "%rax", "%rdx", "%r8", "%r9", "%r10", "%r11",
++                "%r13", "%r14", "%r15");
++}
++
++static void mul_256x256_integer_bmi2(u64 *const c, const u64 *const a,
++                                   const u64 *const b)
++{ /* 256x256 -> 512-bit product: c[0..7] = a[0..3] * b[0..3], one row A[i]*B per step (mulx with add/adc) */
++      asm volatile( /* %0 = c, %1 = a, %2 = b; low limbs of c are stored before a and b are fully read */
++              "movq   (%1), %%rdx; "  /* A[0] */
++              "mulx   (%2),  %%r8, %%r15; " /* A[0]*B[0] */
++              "movq %%r8,  (%0) ;"
++              "mulx  8(%2), %%r10, %%rax; " /* A[0]*B[1] */
++              "addq %%r10, %%r15 ;"
++              "mulx 16(%2),  %%r8, %%rbx; " /* A[0]*B[2] */
++              "adcq  %%r8, %%rax ;"
++              "mulx 24(%2), %%r10, %%rcx; " /* A[0]*B[3] */
++              "adcq %%r10, %%rbx ;"
++              /* Running sum above c[0] is now r15, rax, rbx, rcx */
++              "adcq    $0, %%rcx ;"
++              /* Row 1: A[1]*B goes to r8 (stored as c[1]), r9, r11, r13, r15 */
++              "movq  8(%1), %%rdx; "  /* A[1] */
++              "mulx   (%2),  %%r8,  %%r9; " /* A[1]*B[0] */
++              "addq %%r15,  %%r8 ;"
++              "movq %%r8, 8(%0) ;"
++              "mulx  8(%2), %%r10, %%r11; " /* A[1]*B[1] */
++              "adcq %%r10,  %%r9 ;"
++              "mulx 16(%2),  %%r8, %%r13; " /* A[1]*B[2] */
++              "adcq  %%r8, %%r11 ;"
++              "mulx 24(%2), %%r10, %%r15; " /* A[1]*B[3] */
++              "adcq %%r10, %%r13 ;"
++              /******************************************/
++              "adcq    $0, %%r15 ;"
++              /* Add row 1 into the running sum, now rax, rbx, rcx, r15 */
++              "addq  %%r9, %%rax ;"
++              "adcq %%r11, %%rbx ;"
++              "adcq %%r13, %%rcx ;"
++              "adcq    $0, %%r15 ;"
++              /* Row 2: A[2]*B goes to r8 (stored as c[2]), r9, r11, r13, rax */
++              "movq 16(%1), %%rdx; "  /* A[2] */
++              "mulx   (%2),  %%r8,  %%r9; " /* A[2]*B[0] */
++              "addq %%rax,  %%r8 ;"
++              "movq %%r8, 16(%0) ;"
++              "mulx  8(%2), %%r10, %%r11; " /* A[2]*B[1] */
++              "adcq %%r10,  %%r9 ;"
++              "mulx 16(%2),  %%r8, %%r13; " /* A[2]*B[2] */
++              "adcq  %%r8, %%r11 ;"
++              "mulx 24(%2), %%r10, %%rax; " /* A[2]*B[3] */
++              "adcq %%r10, %%r13 ;"
++              /******************************************/
++              "adcq    $0, %%rax ;"
++              /* Add row 2 into the running sum, now rbx, rcx, r15, rax */
++              "addq  %%r9, %%rbx ;"
++              "adcq %%r11, %%rcx ;"
++              "adcq %%r13, %%r15 ;"
++              "adcq    $0, %%rax ;"
++              /* Row 3: A[3]*B goes to r8 (stored as c[3]), r9, r11, r13, rbx */
++              "movq 24(%1), %%rdx; "  /* A[3] */
++              "mulx   (%2),  %%r8,  %%r9; " /* A[3]*B[0] */
++              "addq %%rbx,  %%r8 ;"
++              "movq %%r8, 24(%0) ;"
++              "mulx  8(%2), %%r10, %%r11; " /* A[3]*B[1] */
++              "adcq %%r10,  %%r9 ;"
++              "mulx 16(%2),  %%r8, %%r13; " /* A[3]*B[2] */
++              "adcq  %%r8, %%r11 ;"
++              "mulx 24(%2), %%r10, %%rbx; " /* A[3]*B[3] */
++              "adcq %%r10, %%r13 ;"
++              /******************************************/
++              "adcq    $0, %%rbx ;"
++              /* Final accumulation; the movq stores in between do not disturb the carry chain */
++              "addq  %%r9, %%rcx ;"
++              "movq %%rcx, 32(%0) ;"
++              "adcq %%r11, %%r15 ;"
++              "movq %%r15, 40(%0) ;"
++              "adcq %%r13, %%rax ;"
++              "movq %%rax, 48(%0) ;"
++              "adcq    $0, %%rbx ;"
++              "movq %%rbx, 56(%0) ;"
++              :
++              : "r"(c), "r"(a), "r"(b)
++              : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9",
++                "%r10", "%r11", "%r13", "%r15");
++}
++
++static void sqr_256x256_integer_adx(u64 *const c, const u64 *const a)
++{ /* 256-bit squaring: c[0..7] = a[0..3]^2 (64-bit limbs, least significant first), using mulx/adcx/adox */
++      asm volatile( /* %0 = c, %1 = a; c is stored while a is still being read, so they presumably must not overlap */
++              "movq   (%1), %%rdx        ;" /* A[0]      */
++              "mulx  8(%1),  %%r8, %%r14 ;" /* A[1]*A[0] */
++              "xorl %%r15d, %%r15d;" /* r15 = 0, CF = OF = 0 */
++              "mulx 16(%1),  %%r9, %%r10 ;" /* A[2]*A[0] */
++              "adcx %%r14,  %%r9 ;"
++              "mulx 24(%1), %%rax, %%rcx ;" /* A[3]*A[0] */
++              "adcx %%rax, %%r10 ;"
++              "movq 24(%1), %%rdx        ;" /* A[3]      */
++              "mulx  8(%1), %%r11, %%rbx ;" /* A[1]*A[3] */
++              "adcx %%rcx, %%r11 ;"
++              "mulx 16(%1), %%rax, %%r13 ;" /* A[2]*A[3] */
++              "adcx %%rax, %%rbx ;"
++              "movq  8(%1), %%rdx        ;" /* A[1]      */
++              "adcx %%r15, %%r13 ;"
++              "mulx 16(%1), %%rax, %%rcx ;" /* A[2]*A[1] */
++              "movq    $0, %%r14 ;" /* mov does not change CF */
++              /* r14 = carry out of the cross-product chain */
++              "adcx %%r15, %%r14 ;"
++              /* OF chain (adox) adds A[2]*A[1]; CF chain (adcx reg,reg) then doubles each limb */
++              "xorl %%r15d, %%r15d;" /* r15 = 0, CF = OF = 0 */
++              "adox %%rax, %%r10 ;"
++              "adcx  %%r8,  %%r8 ;"
++              "adox %%rcx, %%r11 ;"
++              "adcx  %%r9,  %%r9 ;"
++              "adox %%r15, %%rbx ;"
++              "adcx %%r10, %%r10 ;"
++              "adox %%r15, %%r13 ;"
++              "adcx %%r11, %%r11 ;"
++              "adox %%r15, %%r14 ;"
++              "adcx %%rbx, %%rbx ;"
++              "adcx %%r13, %%r13 ;"
++              "adcx %%r14, %%r14 ;"
++              /* Add the squares A[i]^2; the movq/mulx in between leave the add/adc carry untouched */
++              "movq   (%1), %%rdx ;"
++              "mulx %%rdx, %%rax, %%rcx ;" /* A[0]^2 */
++              /*******************/
++              "movq %%rax,  0(%0) ;"
++              "addq %%rcx,  %%r8 ;"
++              "movq  %%r8,  8(%0) ;"
++              "movq  8(%1), %%rdx ;"
++              "mulx %%rdx, %%rax, %%rcx ;" /* A[1]^2 */
++              "adcq %%rax,  %%r9 ;"
++              "movq  %%r9, 16(%0) ;"
++              "adcq %%rcx, %%r10 ;"
++              "movq %%r10, 24(%0) ;"
++              "movq 16(%1), %%rdx ;"
++              "mulx %%rdx, %%rax, %%rcx ;" /* A[2]^2 */
++              "adcq %%rax, %%r11 ;"
++              "movq %%r11, 32(%0) ;"
++              "adcq %%rcx, %%rbx ;"
++              "movq %%rbx, 40(%0) ;"
++              "movq 24(%1), %%rdx ;"
++              "mulx %%rdx, %%rax, %%rcx ;" /* A[3]^2 */
++              "adcq %%rax, %%r13 ;"
++              "movq %%r13, 48(%0) ;"
++              "adcq %%rcx, %%r14 ;"
++              "movq %%r14, 56(%0) ;"
++              :
++              : "r"(c), "r"(a)
++              : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9",
++                "%r10", "%r11", "%r13", "%r14", "%r15");
++}
++
++static void sqr_256x256_integer_bmi2(u64 *const c, const u64 *const a)
++{ /* 256-bit squaring: c[0..7] = a[0..3]^2 (64-bit limbs, least significant first), using mulx, add/adc and shld */
++      asm volatile( /* %0 = c, %1 = a; c is stored while a is still being read, so they presumably must not overlap */
++              "movq  8(%1), %%rdx        ;" /* A[1]      */
++              "mulx   (%1),  %%r8,  %%r9 ;" /* A[0]*A[1] */
++              "mulx 16(%1), %%r10, %%r11 ;" /* A[2]*A[1] */
++              "mulx 24(%1), %%rcx, %%r14 ;" /* A[3]*A[1] */
++
++              "movq 16(%1), %%rdx        ;" /* A[2]      */
++              "mulx 24(%1), %%r15, %%r13 ;" /* A[3]*A[2] */
++              "mulx   (%1), %%rax, %%rdx ;" /* A[0]*A[2], high half overwrites rdx */
++              /* Sum the cross products into r8, r9, r10, r11, r15, r13; r14 takes the carry out */
++              "addq %%rax,  %%r9 ;"
++              "adcq %%rdx, %%r10 ;"
++              "adcq %%rcx, %%r11 ;"
++              "adcq %%r14, %%r15 ;"
++              "adcq    $0, %%r13 ;"
++              "movq    $0, %%r14 ;" /* mov does not change CF */
++              "adcq    $0, %%r14 ;"
++
++              "movq   (%1), %%rdx        ;" /* A[0]      */
++              "mulx 24(%1), %%rax, %%rcx ;" /* A[0]*A[3] */
++
++              "addq %%rax, %%r10 ;"
++              "adcq %%rcx, %%r11 ;"
++              "adcq    $0, %%r15 ;"
++              "adcq    $0, %%r13 ;"
++              "adcq    $0, %%r14 ;"
++              /* Double the sum: shift the 7-limb value left by one bit, most significant limb first */
++              "shldq $1, %%r13, %%r14 ;"
++              "shldq $1, %%r15, %%r13 ;"
++              "shldq $1, %%r11, %%r15 ;"
++              "shldq $1, %%r10, %%r11 ;"
++              "shldq $1,  %%r9, %%r10 ;"
++              "shldq $1,  %%r8,  %%r9 ;"
++              "shlq  $1,  %%r8        ;"
++
++              /* rdx still holds A[0] (loaded before the shifts); add the squares A[i]^2 */
++              "mulx %%rdx, %%rax, %%rcx ;" /* A[0]^2 */
++              /*******************/
++              "movq %%rax,  0(%0) ;"
++              "addq %%rcx,  %%r8 ;"
++              "movq  %%r8,  8(%0) ;"
++              "movq  8(%1), %%rdx ;"
++              "mulx %%rdx, %%rax, %%rcx ;" /* A[1]^2 */
++              "adcq %%rax,  %%r9 ;"
++              "movq  %%r9, 16(%0) ;"
++              "adcq %%rcx, %%r10 ;"
++              "movq %%r10, 24(%0) ;"
++              "movq 16(%1), %%rdx ;"
++              "mulx %%rdx, %%rax, %%rcx ;" /* A[2]^2 */
++              "adcq %%rax, %%r11 ;"
++              "movq %%r11, 32(%0) ;"
++              "adcq %%rcx, %%r15 ;"
++              "movq %%r15, 40(%0) ;"
++              "movq 24(%1), %%rdx ;"
++              "mulx %%rdx, %%rax, %%rcx ;" /* A[3]^2 */
++              "adcq %%rax, %%r13 ;"
++              "movq %%r13, 48(%0) ;"
++              "adcq %%rcx, %%r14 ;"
++              "movq %%r14, 56(%0) ;"
++              :
++              : "r"(c), "r"(a)
++              : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10",
++                "%r11", "%r13", "%r14", "%r15");
++}
++
++static void red_eltfp25519_1w_adx(u64 *const c, const u64 *const a)
++{ /* Folds the 8-limb value a[0..7] into 4 limbs c[0..3] as low + 38*high (mulx/adcx/adox) */
++      asm volatile( /* %0 = c, %1 = a; no final subtraction of the modulus is performed here */
++              "movl    $38, %%edx ;"  /* 2*c = 38 = 2^256, presumably mod 2^255-19 (c = 19) */
++              "mulx 32(%1),  %%r8, %%r10 ;" /* c*C[4] */
++              "xorl %%ebx, %%ebx ;" /* rbx = 0, CF = OF = 0 */
++              "adox   (%1),  %%r8 ;" /* OF chain adds the low limbs C[0..3] */
++              "mulx 40(%1),  %%r9, %%r11 ;" /* c*C[5] */
++              "adcx %%r10,  %%r9 ;" /* CF chain adds the high halves of the products */
++              "adox  8(%1),  %%r9 ;"
++              "mulx 48(%1), %%r10, %%rax ;" /* c*C[6] */
++              "adcx %%r11, %%r10 ;"
++              "adox 16(%1), %%r10 ;"
++              "mulx 56(%1), %%r11, %%rcx ;" /* c*C[7] */
++              "adcx %%rax, %%r11 ;"
++              "adox 24(%1), %%r11 ;"
++              /* rcx = high half of 38*C[7] plus the carries of both chains */
++              "adcx %%rbx, %%rcx ;"
++              "adox  %%rbx, %%rcx ;"
++              "imul %%rdx, %%rcx ;" /* rcx *= 38 (fold the overflow limb), cf=0, of=0 */
++              "adcx %%rcx,  %%r8 ;"
++              "adcx %%rbx,  %%r9 ;"
++              "movq  %%r9,  8(%0) ;"
++              "adcx %%rbx, %%r10 ;"
++              "movq %%r10, 16(%0) ;"
++              "adcx %%rbx, %%r11 ;"
++              "movq %%r11, 24(%0) ;"
++              "mov     $0, %%ecx ;" /* mov does not change CF */
++              "cmovc %%edx, %%ecx ;" /* rcx = 38 if the fold carried out, else 0 */
++              "addq %%rcx,  %%r8 ;"
++              "movq  %%r8,   (%0) ;"
++              :
++              : "r"(c), "r"(a)
++              : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9",
++                "%r10", "%r11");
++}
++
++static void red_eltfp25519_1w_bmi2(u64 *const c, const u64 *const a)
++{ /* Folds the 8-limb value a[0..7] into 4 limbs c[0..3] as low + 38*high (mulx with add/adc) */
++      asm volatile( /* %0 = c, %1 = a; no final subtraction of the modulus is performed here */
++              "movl    $38, %%edx ;"  /* 2*c = 38 = 2^256, presumably mod 2^255-19 (c = 19) */
++              "mulx 32(%1),  %%r8, %%r10 ;" /* c*C[4] */
++              "mulx 40(%1),  %%r9, %%r11 ;" /* c*C[5] */
++              "addq %%r10,  %%r9 ;"
++              "mulx 48(%1), %%r10, %%rax ;" /* c*C[6] */
++              "adcq %%r11, %%r10 ;"
++              "mulx 56(%1), %%r11, %%rcx ;" /* c*C[7] */
++              "adcq %%rax, %%r11 ;"
++              /* rcx:r11:r10:r9:r8 = 38*C[4..7]; now add the low limbs C[0..3] */
++              "adcq    $0, %%rcx ;"
++              "addq   (%1),  %%r8 ;"
++              "adcq  8(%1),  %%r9 ;"
++              "adcq 16(%1), %%r10 ;"
++              "adcq 24(%1), %%r11 ;"
++              "adcq     $0, %%rcx ;"
++              "imul %%rdx, %%rcx ;" /* rcx *= 38 (fold the overflow limb), cf=0 */
++              "addq %%rcx,  %%r8 ;"
++              "adcq    $0,  %%r9 ;"
++              "movq  %%r9,  8(%0) ;"
++              "adcq    $0, %%r10 ;"
++              "movq %%r10, 16(%0) ;"
++              "adcq    $0, %%r11 ;"
++              "movq %%r11, 24(%0) ;"
++              "mov     $0, %%ecx ;" /* mov does not change CF */
++              "cmovc %%edx, %%ecx ;" /* rcx = 38 if the fold carried out, else 0 */
++              "addq %%rcx,  %%r8 ;"
++              "movq  %%r8,   (%0) ;"
++              :
++              : "r"(c), "r"(a)
++              : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10",
++                "%r11");
++}
++
++static __always_inline void
++add_eltfp25519_1w_adx(u64 *const c, const u64 *const a, const u64 *const b)
++{ /* c[0..3] = a[0..3] + b[0..3]; a carry out of the top limb is folded back in as +38 (adcx variant) */
++      asm volatile( /* %0 = c, %1 = a, %2 = b */
++              "mov     $38, %%eax ;"
++              "xorl  %%ecx, %%ecx ;" /* rcx = 0, CF = 0 */
++              "movq   (%2),  %%r8 ;"
++              "adcx   (%1),  %%r8 ;"
++              "movq  8(%2),  %%r9 ;"
++              "adcx  8(%1),  %%r9 ;"
++              "movq 16(%2), %%r10 ;"
++              "adcx 16(%1), %%r10 ;"
++              "movq 24(%2), %%r11 ;"
++              "adcx 24(%1), %%r11 ;"
++              "cmovc %%eax, %%ecx ;" /* rcx = 38 if the sum carried out, else 0 */
++              "xorl %%eax, %%eax  ;" /* rax = 0, CF = 0 */
++              "adcx %%rcx,  %%r8  ;"
++              "adcx %%rax,  %%r9  ;"
++              "movq  %%r9,  8(%0) ;"
++              "adcx %%rax, %%r10  ;"
++              "movq %%r10, 16(%0) ;"
++              "adcx %%rax, %%r11  ;"
++              "movq %%r11, 24(%0) ;"
++              "mov     $38, %%ecx ;" /* mov does not change CF */
++              "cmovc %%ecx, %%eax ;" /* rax = 38 if the correction carried out again, else 0 */
++              "addq %%rax,  %%r8  ;"
++              "movq  %%r8,   (%0) ;"
++              :
++              : "r"(c), "r"(a), "r"(b)
++              : "memory", "cc", "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11");
++}
++
++static __always_inline void
++add_eltfp25519_1w_bmi2(u64 *const c, const u64 *const a, const u64 *const b)
++{ /* c[0..3] = a[0..3] + b[0..3]; a carry out of the top limb is folded back in as +38 (add/adc variant) */
++      asm volatile( /* %0 = c, %1 = a, %2 = b */
++              "mov     $38, %%eax ;"
++              "movq   (%2),  %%r8 ;"
++              "addq   (%1),  %%r8 ;"
++              "movq  8(%2),  %%r9 ;"
++              "adcq  8(%1),  %%r9 ;"
++              "movq 16(%2), %%r10 ;"
++              "adcq 16(%1), %%r10 ;"
++              "movq 24(%2), %%r11 ;"
++              "adcq 24(%1), %%r11 ;"
++              "mov      $0, %%ecx ;" /* mov does not change CF */
++              "cmovc %%eax, %%ecx ;" /* rcx = 38 if the sum carried out, else 0 */
++              "addq %%rcx,  %%r8  ;"
++              "adcq    $0,  %%r9  ;"
++              "movq  %%r9,  8(%0) ;"
++              "adcq    $0, %%r10  ;"
++              "movq %%r10, 16(%0) ;"
++              "adcq    $0, %%r11  ;"
++              "movq %%r11, 24(%0) ;"
++              "mov     $0, %%ecx  ;" /* mov does not change CF */
++              "cmovc %%eax, %%ecx ;" /* rcx = 38 if the correction carried out again, else 0 */
++              "addq %%rcx,  %%r8  ;"
++              "movq  %%r8,   (%0) ;"
++              :
++              : "r"(c), "r"(a), "r"(b)
++              : "memory", "cc", "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11");
++}
++
++static __always_inline void
++sub_eltfp25519_1w(u64 *const c, const u64 *const a, const u64 *const b)
++{
++      asm volatile( /* c = a - b on four 64-bit limbs; each borrow out of the 256-bit difference is folded back as -38 */
++              "mov     $38, %%eax ;" /* eax = 38, the wrap-around correction */
++              "movq   (%1),  %%r8 ;"
++              "subq   (%2),  %%r8 ;" /* r8..r11 = a[i] - b[i], borrow chained through sbbq */
++              "movq  8(%1),  %%r9 ;"
++              "sbbq  8(%2),  %%r9 ;"
++              "movq 16(%1), %%r10 ;"
++              "sbbq 16(%2), %%r10 ;"
++              "movq 24(%1), %%r11 ;"
++              "sbbq 24(%2), %%r11 ;"
++              "mov      $0, %%ecx ;" /* mov, not xor: CF from the last sbbq must survive */
++              "cmovc %%eax, %%ecx ;" /* ecx = borrow out of the top limb ? 38 : 0 */
++              "subq %%rcx,  %%r8  ;" /* subtract the correction and ripple its borrow upward */
++              "sbbq    $0,  %%r9  ;"
++              "movq  %%r9,  8(%0) ;"
++              "sbbq    $0, %%r10  ;"
++              "movq %%r10, 16(%0) ;"
++              "sbbq    $0, %%r11  ;"
++              "movq %%r11, 24(%0) ;"
++              "mov     $0, %%ecx  ;"
++              "cmovc %%eax, %%ecx ;" /* ecx = second borrow out ? 38 : 0 */
++              "subq %%rcx,  %%r8  ;"
++              "movq  %%r8,   (%0) ;" /* c[0] is stored last, after both corrections */
++              :
++              : "r"(c), "r"(a), "r"(b) /* %0 = c, %1 = a (minuend), %2 = b (subtrahend) */
++              : "memory", "cc", "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11");
++}
++
++/* Multiplication by a24 = (A+2)/4 = (486662+2)/4 = 121666: c = a24 * a, folded back into four limbs */
++static __always_inline void
++mul_a24_eltfp25519_1w(u64 *const c, const u64 *const a)
++{
++      const u64 a24 = 121666;
++      asm volatile(
++              "movq     %2, %%rdx ;" /* rdx = a24, the implicit multiplicand of mulx */
++              "mulx   (%1),  %%r8, %%r10 ;" /* r8 = low half, r10 = high half of a24 * a[0] */
++              "mulx  8(%1),  %%r9, %%r11 ;"
++              "addq %%r10,  %%r9 ;" /* each high half is added into the next limb's low half */
++              "mulx 16(%1), %%r10, %%rax ;"
++              "adcq %%r11, %%r10 ;"
++              "mulx 24(%1), %%r11, %%rcx ;"
++              "adcq %%rax, %%r11 ;"
++              /**************************/
++              "adcq    $0, %%rcx ;" /* rcx = the part of the product above bit 255 */
++              "movl   $38, %%edx ;" /* 2*c = 38 = 2^256 mod 2^255-19*/
++              "imul %%rdx, %%rcx ;" /* rcx = 38 * overflow word */
++              "addq %%rcx,  %%r8 ;" /* fold it into the low limb and ripple the carry upward */
++              "adcq    $0,  %%r9 ;"
++              "movq  %%r9,  8(%0) ;"
++              "adcq    $0, %%r10 ;"
++              "movq %%r10, 16(%0) ;"
++              "adcq    $0, %%r11 ;"
++              "movq %%r11, 24(%0) ;"
++              "mov     $0, %%ecx ;" /* mov, not xor: CF from the last adcq must survive */
++              "cmovc %%edx, %%ecx ;" /* ecx = carry out ? 38 : 0 */
++              "addq %%rcx,  %%r8 ;"
++              "movq  %%r8,   (%0) ;" /* c[0] is stored last, after both corrections */
++              :
++              : "r"(c), "r"(a), "r"(a24) /* %0 = c, %1 = a, %2 = a24 */
++              : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10",
++                "%r11");
++}
++
++static void inv_eltfp25519_1w_adx(u64 *const c, const u64 *const a)
++{ /* c = a^(-1), computed by a fixed sequence of squarings and multiplications (adx field routines) */
++      struct {
++              eltfp25519_1w_buffer buffer;
++              eltfp25519_1w x0, x1, x2;
++      } __aligned(32) m; /* scratch elements; wiped before returning */
++      u64 *T[4];
++
++      T[0] = m.x0;
++      T[1] = c; /* x^(-1) */
++      T[2] = m.x1;
++      T[3] = m.x2;
++
++      copy_eltfp25519_1w(T[1], a);
++      sqrn_eltfp25519_1w_adx(T[1], 1); /* presumably squares T[1] in place 1 time; helper defined elsewhere */
++      copy_eltfp25519_1w(T[2], T[1]);
++      sqrn_eltfp25519_1w_adx(T[2], 2);
++      mul_eltfp25519_1w_adx(T[0], a, T[2]);
++      mul_eltfp25519_1w_adx(T[1], T[1], T[0]);
++      copy_eltfp25519_1w(T[2], T[1]);
++      sqrn_eltfp25519_1w_adx(T[2], 1);
++      mul_eltfp25519_1w_adx(T[0], T[0], T[2]);
++      copy_eltfp25519_1w(T[2], T[0]);
++      sqrn_eltfp25519_1w_adx(T[2], 5);
++      mul_eltfp25519_1w_adx(T[0], T[0], T[2]);
++      copy_eltfp25519_1w(T[2], T[0]);
++      sqrn_eltfp25519_1w_adx(T[2], 10);
++      mul_eltfp25519_1w_adx(T[2], T[2], T[0]);
++      copy_eltfp25519_1w(T[3], T[2]);
++      sqrn_eltfp25519_1w_adx(T[3], 20);
++      mul_eltfp25519_1w_adx(T[3], T[3], T[2]);
++      sqrn_eltfp25519_1w_adx(T[3], 10);
++      mul_eltfp25519_1w_adx(T[3], T[3], T[0]);
++      copy_eltfp25519_1w(T[0], T[3]);
++      sqrn_eltfp25519_1w_adx(T[0], 50);
++      mul_eltfp25519_1w_adx(T[0], T[0], T[3]);
++      copy_eltfp25519_1w(T[2], T[0]);
++      sqrn_eltfp25519_1w_adx(T[2], 100);
++      mul_eltfp25519_1w_adx(T[2], T[2], T[0]);
++      sqrn_eltfp25519_1w_adx(T[2], 50);
++      mul_eltfp25519_1w_adx(T[2], T[2], T[3]);
++      sqrn_eltfp25519_1w_adx(T[2], 5);
++      mul_eltfp25519_1w_adx(T[1], T[1], T[2]); /* final product lands in T[1], i.e. in c */
++
++      memzero_explicit(&m, sizeof(m)); /* do not leave input-derived intermediates on the stack */
++}
++
++static void inv_eltfp25519_1w_bmi2(u64 *const c, const u64 *const a)
++{ /* c = a^(-1), computed by a fixed sequence of squarings and multiplications (bmi2 field routines) */
++      struct {
++              eltfp25519_1w_buffer buffer;
++              eltfp25519_1w x0, x1, x2;
++      } __aligned(32) m; /* scratch elements; wiped before returning */
++      u64 *T[4]; /* only T[0..3] are used, same as the adx variant */
++
++      T[0] = m.x0;
++      T[1] = c; /* x^(-1) */
++      T[2] = m.x1;
++      T[3] = m.x2;
++
++      copy_eltfp25519_1w(T[1], a);
++      sqrn_eltfp25519_1w_bmi2(T[1], 1); /* presumably squares T[1] in place 1 time; helper defined elsewhere */
++      copy_eltfp25519_1w(T[2], T[1]);
++      sqrn_eltfp25519_1w_bmi2(T[2], 2);
++      mul_eltfp25519_1w_bmi2(T[0], a, T[2]);
++      mul_eltfp25519_1w_bmi2(T[1], T[1], T[0]);
++      copy_eltfp25519_1w(T[2], T[1]);
++      sqrn_eltfp25519_1w_bmi2(T[2], 1);
++      mul_eltfp25519_1w_bmi2(T[0], T[0], T[2]);
++      copy_eltfp25519_1w(T[2], T[0]);
++      sqrn_eltfp25519_1w_bmi2(T[2], 5);
++      mul_eltfp25519_1w_bmi2(T[0], T[0], T[2]);
++      copy_eltfp25519_1w(T[2], T[0]);
++      sqrn_eltfp25519_1w_bmi2(T[2], 10);
++      mul_eltfp25519_1w_bmi2(T[2], T[2], T[0]);
++      copy_eltfp25519_1w(T[3], T[2]);
++      sqrn_eltfp25519_1w_bmi2(T[3], 20);
++      mul_eltfp25519_1w_bmi2(T[3], T[3], T[2]);
++      sqrn_eltfp25519_1w_bmi2(T[3], 10);
++      mul_eltfp25519_1w_bmi2(T[3], T[3], T[0]);
++      copy_eltfp25519_1w(T[0], T[3]);
++      sqrn_eltfp25519_1w_bmi2(T[0], 50);
++      mul_eltfp25519_1w_bmi2(T[0], T[0], T[3]);
++      copy_eltfp25519_1w(T[2], T[0]);
++      sqrn_eltfp25519_1w_bmi2(T[2], 100);
++      mul_eltfp25519_1w_bmi2(T[2], T[2], T[0]);
++      sqrn_eltfp25519_1w_bmi2(T[2], 50);
++      mul_eltfp25519_1w_bmi2(T[2], T[2], T[3]);
++      sqrn_eltfp25519_1w_bmi2(T[2], 5);
++      mul_eltfp25519_1w_bmi2(T[1], T[1], T[2]); /* final product lands in T[1], i.e. in c */
++
++      memzero_explicit(&m, sizeof(m)); /* do not leave input-derived intermediates on the stack */
++}
++
++/* Given c, a 256-bit number in four 64-bit words (c[0] least significant), fred_eltfp25519_1w
++ * updates c in place with a number such that 0 <= C < 2**255-19.
++ */
++static __always_inline void fred_eltfp25519_1w(u64 *const c)
++{
++      u64 tmp0 = 38, tmp1 = 19; /* asm operands %4 and %5 */
++      asm volatile(
++              "btrq   $63,    %3 ;" /* Put bit 255 in carry flag and clear */
++              "cmovncl %k5,   %k4 ;" /* c[255] ? 38 : 19 */
++
++              /* Add either 19 or 38 to c */
++              "addq    %4,   %0 ;"
++              "adcq    $0,   %1 ;"
++              "adcq    $0,   %2 ;"
++              "adcq    $0,   %3 ;"
++
++              /* Test for bit 255 again; only triggered on overflow modulo 2^255-19 */
++              "movl    $0,  %k4 ;" /* mov keeps the sign flag set by the adcq on %3 */
++              "cmovnsl %k5,  %k4 ;" /* c[255] ? 0 : 19 */
++              "btrq   $63,   %3 ;" /* Clear bit 255 */
++
++              /* Subtract 19 if necessary */
++              "subq    %4,   %0 ;"
++              "sbbq    $0,   %1 ;"
++              "sbbq    $0,   %2 ;"
++              "sbbq    $0,   %3 ;"
++
++              : "+r"(c[0]), "+r"(c[1]), "+r"(c[2]), "+r"(c[3]), "+r"(tmp0), /* the four limbs live in registers for the whole sequence */
++                "+r"(tmp1)
++              :
++              : "memory", "cc");
++}
++
++static __always_inline void cswap(u8 bit, u64 *const px, u64 *const py)
++{ /* Exchanges px[0..3] with py[0..3] when bit is non-zero, using conditional moves instead of a branch. */
++      u64 temp;
++      asm volatile(
++              "test %9, %9 ;" /* ZF = (bit == 0); mov/cmov below leave the flags untouched */
++              "movq %0, %8 ;" /* temp = px[0] */
++              "cmovnzq %4, %0 ;" /* if bit: px[0] = py[0] */
++              "cmovnzq %8, %4 ;" /* if bit: py[0] = temp */
++              "movq %1, %8 ;" /* same pattern for limbs 1..3 */
++              "cmovnzq %5, %1 ;"
++              "cmovnzq %8, %5 ;"
++              "movq %2, %8 ;"
++              "cmovnzq %6, %2 ;"
++              "cmovnzq %8, %6 ;"
++              "movq %3, %8 ;"
++              "cmovnzq %7, %3 ;"
++              "cmovnzq %8, %7 ;"
++              : "+r"(px[0]), "+r"(px[1]), "+r"(px[2]), "+r"(px[3]), /* %0..%3 */
++                "+r"(py[0]), "+r"(py[1]), "+r"(py[2]), "+r"(py[3]), /* %4..%7 */
++                "=r"(temp) /* %8: scratch, first written only after %9 has been tested */
++              : "r"(bit) /* %9 */
++              : "cc"
++      );
++}
++
++static __always_inline void cselect(u8 bit, u64 *const px, const u64 *const py)
++{ /* Copies py[0..3] over px[0..3] when bit is non-zero, using conditional moves instead of a branch. */
++      asm volatile(
++              "test %4, %4 ;" /* ZF = (bit == 0); cmov does not modify the flags */
++              "cmovnzq %5, %0 ;" /* if bit: px[i] = py[i] */
++              "cmovnzq %6, %1 ;"
++              "cmovnzq %7, %2 ;"
++              "cmovnzq %8, %3 ;"
++              : "+r"(px[0]), "+r"(px[1]), "+r"(px[2]), "+r"(px[3]) /* %0..%3 */
++              : "r"(bit), "rm"(py[0]), "rm"(py[1]), "rm"(py[2]), "rm"(py[3]) /* %4 = bit, %5..%8 = py limbs (register or memory) */
++              : "cc"
++      );
++}
++
++static void curve25519_adx(u8 shared[CURVE25519_KEY_SIZE],
++                         const u8 private_key[CURVE25519_KEY_SIZE],
++                         const u8 session_key[CURVE25519_KEY_SIZE])
++{ /* shared = (clamped private_key) * session_key, x-coordinate ladder built on the *_adx field routines */
++      struct {
++              u64 buffer[4 * NUM_WORDS_ELTFP25519];
++              u64 coordinates[4 * NUM_WORDS_ELTFP25519];
++              u64 workspace[6 * NUM_WORDS_ELTFP25519];
++              u8 session[CURVE25519_KEY_SIZE];
++              u8 private[CURVE25519_KEY_SIZE];
++      } __aligned(32) m; /* all working state, including key copies; wiped as one unit at the end */
++
++      int i = 0, j = 0;
++      u64 prev = 0;
++      u64 *const X1 = (u64 *)m.session;
++      u64 *const key = (u64 *)m.private;
++      u64 *const Px = m.coordinates + 0;
++      u64 *const Pz = m.coordinates + 4;
++      u64 *const Qx = m.coordinates + 8;
++      u64 *const Qz = m.coordinates + 12;
++      u64 *const X2 = Qx; /* ladder names X2/Z2 and X3/Z3 alias the Q and P slots */
++      u64 *const Z2 = Qz;
++      u64 *const X3 = Px;
++      u64 *const Z3 = Pz;
++      u64 *const X2Z2 = Qx; /* two-element views: X2 followed by Z2, X3 followed by Z3 */
++      u64 *const X3Z3 = Px;
++
++      u64 *const A = m.workspace + 0;
++      u64 *const B = m.workspace + 4;
++      u64 *const D = m.workspace + 8; /* D sits before C so that [A|B]*[D|C] pairs A with D and B with C */
++      u64 *const C = m.workspace + 12;
++      u64 *const DA = m.workspace + 16;
++      u64 *const CB = m.workspace + 20;
++      u64 *const AB = A; /* two-element views over adjacent workspace slots */
++      u64 *const DC = D;
++      u64 *const DACB = DA;
++
++      memcpy(m.private, private_key, sizeof(m.private));
++      memcpy(m.session, session_key, sizeof(m.session));
++
++      curve25519_clamp_secret(m.private);
++
++      /* As in the draft:
++       * When receiving such an array, implementations of curve25519
++       * MUST mask the most-significant bit in the final byte. This
++       * is done to preserve compatibility with point formats which
++       * reserve the sign bit for use in other protocols and to
++       * increase resistance to implementation fingerprinting
++       */
++      m.session[CURVE25519_KEY_SIZE - 1] &= (1 << (255 % 8)) - 1;
++
++      copy_eltfp25519_1w(Px, X1);
++      setzero_eltfp25519_1w(Pz);
++      setzero_eltfp25519_1w(Qx);
++      setzero_eltfp25519_1w(Qz);
++
++      Pz[0] = 1; /* start state: P = (X1 : 1), Q = (1 : 0) */
++      Qx[0] = 1;
++
++      /* main-loop */
++      prev = 0;
++      j = 62; /* top word is scanned from bit 62, i.e. scalar bit 254; bit 255 is never read */
++      for (i = 3; i >= 0; --i) {
++              while (j >= 0) {
++                      u64 bit = (key[i] >> j) & 0x1;
++                      u64 swap = bit ^ prev; /* 1 only when this bit differs from the previous one */
++                      prev = bit;
++
++                      add_eltfp25519_1w_adx(A, X2, Z2);       /* A = (X2+Z2) */
++                      sub_eltfp25519_1w(B, X2, Z2);           /* B = (X2-Z2) */
++                      add_eltfp25519_1w_adx(C, X3, Z3);       /* C = (X3+Z3) */
++                      sub_eltfp25519_1w(D, X3, Z3);           /* D = (X3-Z3) */
++                      mul_eltfp25519_2w_adx(DACB, AB, DC);    /* [DA|CB] = [A|B]*[D|C] */
++
++                      cselect(swap, A, C); /* if swap: A = C */
++                      cselect(swap, B, D); /* if swap: B = D */
++
++                      sqr_eltfp25519_2w_adx(AB);              /* [AA|BB] = [A^2|B^2] */
++                      add_eltfp25519_1w_adx(X3, DA, CB);      /* X3 = (DA+CB) */
++                      sub_eltfp25519_1w(Z3, DA, CB);          /* Z3 = (DA-CB) */
++                      sqr_eltfp25519_2w_adx(X3Z3);            /* [X3|Z3] = [(DA+CB)|(DA-CB)]^2 */
++
++                      copy_eltfp25519_1w(X2, B);              /* X2 = B^2 */
++                      sub_eltfp25519_1w(Z2, A, B);            /* Z2 = E = AA-BB */
++
++                      mul_a24_eltfp25519_1w(B, Z2);           /* B = a24*E */
++                      add_eltfp25519_1w_adx(B, B, X2);        /* B = a24*E+B */
++                      mul_eltfp25519_2w_adx(X2Z2, X2Z2, AB);  /* [X2|Z2] = [B|E]*[A|a24*E+B] */
++                      mul_eltfp25519_1w_adx(Z3, Z3, X1);      /* Z3 = Z3*X1 */
++                      --j;
++              }
++              j = 63; /* every lower word is scanned from its top bit */
++      }
++
++      inv_eltfp25519_1w_adx(A, Qz); /* A = 1/Qz */
++      mul_eltfp25519_1w_adx((u64 *)shared, Qx, A); /* shared = Qx/Qz */
++      fred_eltfp25519_1w((u64 *)shared); /* bring the result into [0, 2^255-19) */
++
++      memzero_explicit(&m, sizeof(m)); /* wipe key copies and every intermediate */
++}
++
++static void curve25519_adx_base(u8 session_key[CURVE25519_KEY_SIZE],
++                              const u8 private_key[CURVE25519_KEY_SIZE])
++{ /* session_key = (clamped private_key) * base point, driven by the table_ladder_8k table (adx field routines) */
++      struct {
++              u64 buffer[4 * NUM_WORDS_ELTFP25519];
++              u64 coordinates[4 * NUM_WORDS_ELTFP25519];
++              u64 workspace[4 * NUM_WORDS_ELTFP25519];
++              u8 private[CURVE25519_KEY_SIZE];
++      } __aligned(32) m; /* all working state, including the key copy; wiped as one unit at the end */
++
++      const int ite[4] = { 64, 64, 64, 63 }; /* per-word upper bound for j: the last word stops before bit 63 */
++      const int q = 3; /* the q lowest scalar bits are skipped by the main loop; q doublings follow it */
++      u64 swap = 1;
++
++      int i = 0, j = 0, k = 0;
++      u64 *const key = (u64 *)m.private;
++      u64 *const Ur1 = m.coordinates + 0;
++      u64 *const Zr1 = m.coordinates + 4;
++      u64 *const Ur2 = m.coordinates + 8;
++      u64 *const Zr2 = m.coordinates + 12;
++
++      u64 *const UZr1 = m.coordinates + 0; /* two-element views over coordinates[0..7] and [8..15] */
++      u64 *const ZUr2 = m.coordinates + 8;
++
++      u64 *const A = m.workspace + 0;
++      u64 *const B = m.workspace + 4;
++      u64 *const C = m.workspace + 8;
++      u64 *const D = m.workspace + 12;
++
++      u64 *const AB = m.workspace + 0; /* two-element views: A followed by B, C followed by D */
++      u64 *const CD = m.workspace + 8;
++
++      const u64 *const P = table_ladder_8k; /* read four words per processed bit, see &P[4 * k] below */
++
++      memcpy(m.private, private_key, sizeof(m.private));
++
++      curve25519_clamp_secret(m.private);
++
++      setzero_eltfp25519_1w(Ur1);
++      setzero_eltfp25519_1w(Zr1);
++      setzero_eltfp25519_1w(Zr2);
++      Ur1[0] = 1;
++      Zr1[0] = 1;
++      Zr2[0] = 1;
++
++      /* G-S */
++      Ur2[3] = 0x1eaecdeee27cab34UL;
++      Ur2[2] = 0xadc7a0b9235d48e2UL;
++      Ur2[1] = 0xbbf095ae14b2edf8UL;
++      Ur2[0] = 0x7e94e1fec82faabdUL;
++
++      /* main-loop */
++      j = q; /* first word starts at bit q; later words start at bit 0 */
++      for (i = 0; i < NUM_WORDS_ELTFP25519; ++i) {
++              while (j < ite[i]) {
++                      u64 bit = (key[i] >> j) & 0x1;
++                      k = (64 * i + j - q); /* table entry index for this scalar bit */
++                      swap = swap ^ bit;
++                      cswap(swap, Ur1, Ur2);
++                      cswap(swap, Zr1, Zr2);
++                      swap = bit;
++                      /* Addition */
++                      sub_eltfp25519_1w(B, Ur1, Zr1);         /* B = Ur1-Zr1 */
++                      add_eltfp25519_1w_adx(A, Ur1, Zr1);     /* A = Ur1+Zr1 */
++                      mul_eltfp25519_1w_adx(C, &P[4 * k], B); /* C = M*B, M = P[4k..4k+3] */
++                      sub_eltfp25519_1w(B, A, C);             /* B = (Ur1+Zr1) - M*(Ur1-Zr1) */
++                      add_eltfp25519_1w_adx(A, A, C);         /* A = (Ur1+Zr1) + M*(Ur1-Zr1) */
++                      sqr_eltfp25519_2w_adx(AB);              /* A = A^2      |  B = B^2 */
++                      mul_eltfp25519_2w_adx(UZr1, ZUr2, AB);  /* Ur1 = Zr2*A  |  Zr1 = Ur2*B */
++                      ++j;
++              }
++              j = 0;
++      }
++
++      /* Doubling */
++      for (i = 0; i < q; ++i) {
++              add_eltfp25519_1w_adx(A, Ur1, Zr1);     /*  A = Ur1+Zr1 */
++              sub_eltfp25519_1w(B, Ur1, Zr1);         /*  B = Ur1-Zr1 */
++              sqr_eltfp25519_2w_adx(AB);              /*  A = A**2     B = B**2 */
++              copy_eltfp25519_1w(C, B);               /*  C = B */
++              sub_eltfp25519_1w(B, A, B);             /*  B = A-B */
++              mul_a24_eltfp25519_1w(D, B);            /*  D = my_a24*B */
++              add_eltfp25519_1w_adx(D, D, C);         /*  D = D+C */
++              mul_eltfp25519_2w_adx(UZr1, AB, CD);    /*  Ur1 = A*C   Zr1 = B*D */
++      }
++
++      /* Convert to affine coordinates */
++      inv_eltfp25519_1w_adx(A, Zr1);
++      mul_eltfp25519_1w_adx((u64 *)session_key, Ur1, A);
++      fred_eltfp25519_1w((u64 *)session_key); /* bring the result into [0, 2^255-19) */
++
++      memzero_explicit(&m, sizeof(m)); /* wipe the key copy and every intermediate */
++}
++
++static void curve25519_bmi2(u8 shared[CURVE25519_KEY_SIZE],
++                          const u8 private_key[CURVE25519_KEY_SIZE],
++                          const u8 session_key[CURVE25519_KEY_SIZE])
++{ /* shared = (clamped private_key) * session_key, x-coordinate ladder built on the *_bmi2 field routines */
++      struct {
++              u64 buffer[4 * NUM_WORDS_ELTFP25519];
++              u64 coordinates[4 * NUM_WORDS_ELTFP25519];
++              u64 workspace[6 * NUM_WORDS_ELTFP25519];
++              u8 session[CURVE25519_KEY_SIZE];
++              u8 private[CURVE25519_KEY_SIZE];
++      } __aligned(32) m; /* all working state, including key copies; wiped as one unit at the end */
++
++      int i = 0, j = 0;
++      u64 prev = 0;
++      u64 *const X1 = (u64 *)m.session;
++      u64 *const key = (u64 *)m.private;
++      u64 *const Px = m.coordinates + 0;
++      u64 *const Pz = m.coordinates + 4;
++      u64 *const Qx = m.coordinates + 8;
++      u64 *const Qz = m.coordinates + 12;
++      u64 *const X2 = Qx; /* ladder names X2/Z2 and X3/Z3 alias the Q and P slots */
++      u64 *const Z2 = Qz;
++      u64 *const X3 = Px;
++      u64 *const Z3 = Pz;
++      u64 *const X2Z2 = Qx; /* two-element views: X2 followed by Z2, X3 followed by Z3 */
++      u64 *const X3Z3 = Px;
++
++      u64 *const A = m.workspace + 0;
++      u64 *const B = m.workspace + 4;
++      u64 *const D = m.workspace + 8; /* D sits before C so that [A|B]*[D|C] pairs A with D and B with C */
++      u64 *const C = m.workspace + 12;
++      u64 *const DA = m.workspace + 16;
++      u64 *const CB = m.workspace + 20;
++      u64 *const AB = A; /* two-element views over adjacent workspace slots */
++      u64 *const DC = D;
++      u64 *const DACB = DA;
++
++      memcpy(m.private, private_key, sizeof(m.private));
++      memcpy(m.session, session_key, sizeof(m.session));
++
++      curve25519_clamp_secret(m.private);
++
++      /* As in the draft:
++       * When receiving such an array, implementations of curve25519
++       * MUST mask the most-significant bit in the final byte. This
++       * is done to preserve compatibility with point formats which
++       * reserve the sign bit for use in other protocols and to
++       * increase resistance to implementation fingerprinting
++       */
++      m.session[CURVE25519_KEY_SIZE - 1] &= (1 << (255 % 8)) - 1;
++
++      copy_eltfp25519_1w(Px, X1);
++      setzero_eltfp25519_1w(Pz);
++      setzero_eltfp25519_1w(Qx);
++      setzero_eltfp25519_1w(Qz);
++
++      Pz[0] = 1; /* start state: P = (X1 : 1), Q = (1 : 0) */
++      Qx[0] = 1;
++
++      /* main-loop */
++      prev = 0;
++      j = 62; /* top word is scanned from bit 62, i.e. scalar bit 254; bit 255 is never read */
++      for (i = 3; i >= 0; --i) {
++              while (j >= 0) {
++                      u64 bit = (key[i] >> j) & 0x1;
++                      u64 swap = bit ^ prev; /* 1 only when this bit differs from the previous one */
++                      prev = bit;
++
++                      add_eltfp25519_1w_bmi2(A, X2, Z2);      /* A = (X2+Z2) */
++                      sub_eltfp25519_1w(B, X2, Z2);           /* B = (X2-Z2) */
++                      add_eltfp25519_1w_bmi2(C, X3, Z3);      /* C = (X3+Z3) */
++                      sub_eltfp25519_1w(D, X3, Z3);           /* D = (X3-Z3) */
++                      mul_eltfp25519_2w_bmi2(DACB, AB, DC);   /* [DA|CB] = [A|B]*[D|C] */
++
++                      cselect(swap, A, C); /* if swap: A = C */
++                      cselect(swap, B, D); /* if swap: B = D */
++
++                      sqr_eltfp25519_2w_bmi2(AB);             /* [AA|BB] = [A^2|B^2] */
++                      add_eltfp25519_1w_bmi2(X3, DA, CB);     /* X3 = (DA+CB) */
++                      sub_eltfp25519_1w(Z3, DA, CB);          /* Z3 = (DA-CB) */
++                      sqr_eltfp25519_2w_bmi2(X3Z3);           /* [X3|Z3] = [(DA+CB)|(DA-CB)]^2 */
++
++                      copy_eltfp25519_1w(X2, B);              /* X2 = B^2 */
++                      sub_eltfp25519_1w(Z2, A, B);            /* Z2 = E = AA-BB */
++
++                      mul_a24_eltfp25519_1w(B, Z2);           /* B = a24*E */
++                      add_eltfp25519_1w_bmi2(B, B, X2);       /* B = a24*E+B */
++                      mul_eltfp25519_2w_bmi2(X2Z2, X2Z2, AB); /* [X2|Z2] = [B|E]*[A|a24*E+B] */
++                      mul_eltfp25519_1w_bmi2(Z3, Z3, X1);     /* Z3 = Z3*X1 */
++                      --j;
++              }
++              j = 63; /* every lower word is scanned from its top bit */
++      }
++
++      inv_eltfp25519_1w_bmi2(A, Qz); /* A = 1/Qz */
++      mul_eltfp25519_1w_bmi2((u64 *)shared, Qx, A); /* shared = Qx/Qz */
++      fred_eltfp25519_1w((u64 *)shared); /* bring the result into [0, 2^255-19) */
++
++      memzero_explicit(&m, sizeof(m)); /* wipe key copies and every intermediate */
++}
++
++static void curve25519_bmi2_base(u8 session_key[CURVE25519_KEY_SIZE],
++                               const u8 private_key[CURVE25519_KEY_SIZE])
++{ /* session_key = (clamped private_key) * base point, driven by the table_ladder_8k table (bmi2 field routines) */
++      struct {
++              u64 buffer[4 * NUM_WORDS_ELTFP25519];
++              u64 coordinates[4 * NUM_WORDS_ELTFP25519];
++              u64 workspace[4 * NUM_WORDS_ELTFP25519];
++              u8 private[CURVE25519_KEY_SIZE];
++      } __aligned(32) m; /* all working state, including the key copy; wiped as one unit at the end */
++
++      const int ite[4] = { 64, 64, 64, 63 }; /* per-word upper bound for j: the last word stops before bit 63 */
++      const int q = 3; /* the q lowest scalar bits are skipped by the main loop; q doublings follow it */
++      u64 swap = 1;
++
++      int i = 0, j = 0, k = 0;
++      u64 *const key = (u64 *)m.private;
++      u64 *const Ur1 = m.coordinates + 0;
++      u64 *const Zr1 = m.coordinates + 4;
++      u64 *const Ur2 = m.coordinates + 8;
++      u64 *const Zr2 = m.coordinates + 12;
++
++      u64 *const UZr1 = m.coordinates + 0; /* two-element views over coordinates[0..7] and [8..15] */
++      u64 *const ZUr2 = m.coordinates + 8;
++
++      u64 *const A = m.workspace + 0;
++      u64 *const B = m.workspace + 4;
++      u64 *const C = m.workspace + 8;
++      u64 *const D = m.workspace + 12;
++
++      u64 *const AB = m.workspace + 0; /* two-element views: A followed by B, C followed by D */
++      u64 *const CD = m.workspace + 8;
++
++      const u64 *const P = table_ladder_8k; /* read four words per processed bit, see &P[4 * k] below */
++
++      memcpy(m.private, private_key, sizeof(m.private));
++
++      curve25519_clamp_secret(m.private);
++
++      setzero_eltfp25519_1w(Ur1);
++      setzero_eltfp25519_1w(Zr1);
++      setzero_eltfp25519_1w(Zr2);
++      Ur1[0] = 1;
++      Zr1[0] = 1;
++      Zr2[0] = 1;
++
++      /* G-S */
++      Ur2[3] = 0x1eaecdeee27cab34UL;
++      Ur2[2] = 0xadc7a0b9235d48e2UL;
++      Ur2[1] = 0xbbf095ae14b2edf8UL;
++      Ur2[0] = 0x7e94e1fec82faabdUL;
++
++      /* main-loop */
++      j = q; /* first word starts at bit q; later words start at bit 0 */
++      for (i = 0; i < NUM_WORDS_ELTFP25519; ++i) {
++              while (j < ite[i]) {
++                      u64 bit = (key[i] >> j) & 0x1;
++                      k = (64 * i + j - q); /* table entry index for this scalar bit */
++                      swap = swap ^ bit;
++                      cswap(swap, Ur1, Ur2);
++                      cswap(swap, Zr1, Zr2);
++                      swap = bit;
++                      /* Addition */
++                      sub_eltfp25519_1w(B, Ur1, Zr1);         /* B = Ur1-Zr1 */
++                      add_eltfp25519_1w_bmi2(A, Ur1, Zr1);    /* A = Ur1+Zr1 */
++                      mul_eltfp25519_1w_bmi2(C, &P[4 * k], B);/* C = M*B, M = P[4k..4k+3] */
++                      sub_eltfp25519_1w(B, A, C);             /* B = (Ur1+Zr1) - M*(Ur1-Zr1) */
++                      add_eltfp25519_1w_bmi2(A, A, C);        /* A = (Ur1+Zr1) + M*(Ur1-Zr1) */
++                      sqr_eltfp25519_2w_bmi2(AB);             /* A = A^2      |  B = B^2 */
++                      mul_eltfp25519_2w_bmi2(UZr1, ZUr2, AB); /* Ur1 = Zr2*A  |  Zr1 = Ur2*B */
++                      ++j;
++              }
++              j = 0;
++      }
++
++      /* Doubling */
++      for (i = 0; i < q; ++i) {
++              add_eltfp25519_1w_bmi2(A, Ur1, Zr1);    /*  A = Ur1+Zr1 */
++              sub_eltfp25519_1w(B, Ur1, Zr1);         /*  B = Ur1-Zr1 */
++              sqr_eltfp25519_2w_bmi2(AB);             /*  A = A**2     B = B**2 */
++              copy_eltfp25519_1w(C, B);               /*  C = B */
++              sub_eltfp25519_1w(B, A, B);             /*  B = A-B */
++              mul_a24_eltfp25519_1w(D, B);            /*  D = my_a24*B */
++              add_eltfp25519_1w_bmi2(D, D, C);        /*  D = D+C */
++              mul_eltfp25519_2w_bmi2(UZr1, AB, CD);   /*  Ur1 = A*C   Zr1 = B*D */
++      }
++
++      /* Convert to affine coordinates */
++      inv_eltfp25519_1w_bmi2(A, Zr1);
++      mul_eltfp25519_1w_bmi2((u64 *)session_key, Ur1, A);
++      fred_eltfp25519_1w((u64 *)session_key); /* bring the result into [0, 2^255-19) */
++
++      memzero_explicit(&m, sizeof(m)); /* wipe the key copy and every intermediate */
++}
++
++void curve25519_arch(u8 mypublic[CURVE25519_KEY_SIZE],
++                   const u8 secret[CURVE25519_KEY_SIZE],
++                   const u8 basepoint[CURVE25519_KEY_SIZE])
++{ /* Exported scalar-multiplication entry point: dispatches on the static keys set at module init. */
++      if (static_branch_likely(&curve25519_use_adx)) /* the adx key is tested first */
++              curve25519_adx(mypublic, secret, basepoint);
++      else if (static_branch_likely(&curve25519_use_bmi2))
++              curve25519_bmi2(mypublic, secret, basepoint);
++      else /* neither key enabled: portable implementation */
++              curve25519_generic(mypublic, secret, basepoint);
++}
++EXPORT_SYMBOL(curve25519_arch);
++
++void curve25519_base_arch(u8 pub[CURVE25519_KEY_SIZE],
++                        const u8 secret[CURVE25519_KEY_SIZE])
++{ /* Exported public-key derivation: same static-key dispatch, using the *_base variants. */
++      if (static_branch_likely(&curve25519_use_adx)) /* the adx key is tested first */
++              curve25519_adx_base(pub, secret);
++      else if (static_branch_likely(&curve25519_use_bmi2))
++              curve25519_bmi2_base(pub, secret);
++      else /* fallback: generic multiplication by curve25519_base_point */
++              curve25519_generic(pub, secret, curve25519_base_point);
++}
++EXPORT_SYMBOL(curve25519_base_arch);
++
++static int curve25519_set_secret(struct crypto_kpp *tfm, const void *buf,
++                               unsigned int len)
++{ /* .set_secret callback: stores the secret in the tfm context; returns 0 or -EINVAL. */
++      u8 *secret = kpp_tfm_ctx(tfm);
++
++      if (!len) /* no key supplied: have one generated into the context */
++              curve25519_generate_secret(secret);
++      else if (len == CURVE25519_KEY_SIZE && /* exact key size, and not equal to curve25519_null_point */
++               crypto_memneq(buf, curve25519_null_point, CURVE25519_KEY_SIZE))
++              memcpy(secret, buf, CURVE25519_KEY_SIZE);
++      else /* wrong length, or the null point */
++              return -EINVAL;
++      return 0;
++}
++
++static int curve25519_generate_public_key(struct kpp_request *req)
++{ /* .generate_public_key callback: derives the public key from the stored secret and copies it to req->dst. */
++      struct crypto_kpp *tfm = crypto_kpp_reqtfm(req);
++      const u8 *secret = kpp_tfm_ctx(tfm);
++      u8 buf[CURVE25519_KEY_SIZE];
++      int copied, nbytes;
++
++      if (req->src) /* a source buffer is rejected for this operation */
++              return -EINVAL;
++
++      curve25519_base_arch(buf, secret);
++
++      /* might want less than we've got */
++      nbytes = min_t(size_t, CURVE25519_KEY_SIZE, req->dst_len);
++      copied = sg_copy_from_buffer(req->dst, sg_nents_for_len(req->dst,
++                                                              nbytes),
++                                   buf, nbytes);
++      if (copied != nbytes) /* fewer bytes copied than requested */
++              return -EINVAL;
++      return 0;
++}
++
++static int curve25519_compute_shared_secret(struct kpp_request *req)
++{ /* .compute_shared_secret callback: reads the peer key from req->src, writes the shared secret to req->dst. */
++      struct crypto_kpp *tfm = crypto_kpp_reqtfm(req);
++      const u8 *secret = kpp_tfm_ctx(tfm);
++      u8 public_key[CURVE25519_KEY_SIZE];
++      u8 buf[CURVE25519_KEY_SIZE];
++      int copied, nbytes;
++
++      if (!req->src) /* the peer's public key is mandatory */
++              return -EINVAL;
++
++      copied = sg_copy_to_buffer(req->src,
++                                 sg_nents_for_len(req->src,
++                                                  CURVE25519_KEY_SIZE),
++                                 public_key, CURVE25519_KEY_SIZE);
++      if (copied != CURVE25519_KEY_SIZE) /* buf has not been written yet, nothing to wipe */
++              return -EINVAL;
++
++      curve25519_arch(buf, secret, public_key);
++
++      /* might want less than we've got */
++      nbytes = min_t(size_t, CURVE25519_KEY_SIZE, req->dst_len);
++      copied = sg_copy_from_buffer(req->dst, sg_nents_for_len(req->dst,
++                                                              nbytes),
++                                   buf, nbytes);
++      /* buf holds the shared secret: wipe it whether or not the copy succeeded */
++      memzero_explicit(buf, sizeof(buf));
++      return copied != nbytes ? -EINVAL : 0;
++}
++
++static unsigned int curve25519_max_size(struct crypto_kpp *tfm)
++{ /* .max_size callback of curve25519_alg: always one key-sized buffer, tfm is not consulted. */
++      return CURVE25519_KEY_SIZE;
++}
++
++static struct kpp_alg curve25519_alg = { /* KPP descriptor registered by curve25519_mod_init() */
++      .base.cra_name          = "curve25519",
++      .base.cra_driver_name   = "curve25519-x86",
++      .base.cra_priority      = 200,
++      .base.cra_module        = THIS_MODULE,
++      .base.cra_ctxsize       = CURVE25519_KEY_SIZE, /* the tfm context holds only the secret, see curve25519_set_secret() */
++
++      .set_secret             = curve25519_set_secret,
++      .generate_public_key    = curve25519_generate_public_key,
++      .compute_shared_secret  = curve25519_compute_shared_secret,
++      .max_size               = curve25519_max_size,
++};
++
++static int __init curve25519_mod_init(void)
++{
++      /* Both paths execute mulx (BMI2); the adx path additionally needs adcx (ADX). */
++      if (!boot_cpu_has(X86_FEATURE_BMI2))
++              return 0; /* no accelerated path: stay on generic code, register nothing */
++      if (boot_cpu_has(X86_FEATURE_ADX))
++              static_branch_enable(&curve25519_use_adx);
++      else
++              static_branch_enable(&curve25519_use_bmi2);
++      return crypto_register_kpp(&curve25519_alg);
++}
++
++static void __exit curve25519_mod_exit(void)
++{
++      if (boot_cpu_has(X86_FEATURE_BMI2)) /* same condition under which init registered the algorithm */
++              crypto_unregister_kpp(&curve25519_alg);
++}
++
++module_init(curve25519_mod_init);
++module_exit(curve25519_mod_exit);
++
++MODULE_ALIAS_CRYPTO("curve25519"); /* the two aliases match cra_name and cra_driver_name of curve25519_alg */
++MODULE_ALIAS_CRYPTO("curve25519-x86");
++MODULE_LICENSE("GPL v2");
+--- a/crypto/Kconfig
++++ b/crypto/Kconfig
+@@ -269,6 +269,12 @@ config CRYPTO_CURVE25519
+       select CRYPTO_KPP
+       select CRYPTO_LIB_CURVE25519_GENERIC
+ 
++config CRYPTO_CURVE25519_X86
++      tristate "x86_64 accelerated Curve25519 scalar multiplication library"
++      depends on X86 && 64BIT
++      select CRYPTO_LIB_CURVE25519_GENERIC
++      select CRYPTO_ARCH_HAVE_LIB_CURVE25519
++
+ comment "Authenticated Encryption with Associated Data"
+ 
+ config CRYPTO_CCM
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0030-crypto-arm-curve25519-import-Bernstein-and-Schwabe-s.patch b/target/linux/generic/backport-5.4/080-wireguard-0030-crypto-arm-curve25519-import-Bernstein-and-Schwabe-s.patch

new file mode 100644 (file)

index 0000000..b15a32b
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0030-crypto-arm-curve25519-import-Bernstein-and-Schwabe-s.patch
@@ -0,0 +1,2135 @@
+From bfc49f5ecdd60f2b37cd2f21a6f4de6ea91625e5 Mon Sep 17 00:00:00 2001
+From: "Jason A. Donenfeld" <Jason@zx2c4.com>
+Date: Fri, 8 Nov 2019 13:22:37 +0100
+Subject: [PATCH 030/124] crypto: arm/curve25519 - import Bernstein and
+ Schwabe's Curve25519 ARM implementation
+
+commit f0fb006b604f98e2309a30f34ef455ac734f7c1c upstream.
+
+This comes from Dan Bernstein and Peter Schwabe's public domain NEON
+code, and is included here in raw form so that subsequent commits that
+fix these up for the kernel can see how it has changed. This code does
+have some entirely cosmetic formatting differences, adding indentation
+and so forth, so that when we actually port it for use in the kernel in
+the subsequent commit, it's obvious what's changed in the process.
+
+This code originates from SUPERCOP 20180818, available at
+<https://bench.cr.yp.to/supercop.html>.
+
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
+Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ arch/arm/crypto/curve25519-core.S | 2105 +++++++++++++++++++++++++++++
+ 1 file changed, 2105 insertions(+)
+ create mode 100644 arch/arm/crypto/curve25519-core.S
+
+--- /dev/null
++++ b/arch/arm/crypto/curve25519-core.S
+@@ -0,0 +1,2105 @@
++/*
++ * Public domain code from Daniel J. Bernstein and Peter Schwabe, from
++ * SUPERCOP's curve25519/neon2/scalarmult.s.
++ */
++
++.fpu neon
++.text
++.align 4
++.global _crypto_scalarmult_curve25519_neon2
++.global crypto_scalarmult_curve25519_neon2
++.type _crypto_scalarmult_curve25519_neon2 STT_FUNC
++.type crypto_scalarmult_curve25519_neon2 STT_FUNC
++      _crypto_scalarmult_curve25519_neon2:
++      crypto_scalarmult_curve25519_neon2:
++      vpush           {q4, q5, q6, q7}
++      mov             r12, sp
++      sub             sp, sp, #736
++      and             sp, sp, #0xffffffe0
++      strd            r4, [sp, #0]
++      strd            r6, [sp, #8]
++      strd            r8, [sp, #16]
++      strd            r10, [sp, #24]
++      str             r12, [sp, #480]
++      str             r14, [sp, #484]
++      mov             r0, r0
++      mov             r1, r1
++      mov             r2, r2
++      add             r3, sp, #32
++      ldr             r4, =0
++      ldr             r5, =254
++      vmov.i32        q0, #1
++      vshr.u64        q1, q0, #7
++      vshr.u64        q0, q0, #8
++      vmov.i32        d4, #19
++      vmov.i32        d5, #38
++      add             r6, sp, #512
++      vst1.8          {d2-d3}, [r6, : 128]
++      add             r6, sp, #528
++      vst1.8          {d0-d1}, [r6, : 128]
++      add             r6, sp, #544
++      vst1.8          {d4-d5}, [r6, : 128]
++      add             r6, r3, #0
++      vmov.i32        q2, #0
++      vst1.8          {d4-d5}, [r6, : 128]!
++      vst1.8          {d4-d5}, [r6, : 128]!
++      vst1.8          d4, [r6, : 64]
++      add             r6, r3, #0
++      ldr             r7, =960
++      sub             r7, r7, #2
++      neg             r7, r7
++      sub             r7, r7, r7, LSL #7
++      str             r7, [r6]
++      add             r6, sp, #704
++      vld1.8          {d4-d5}, [r1]!
++      vld1.8          {d6-d7}, [r1]
++      vst1.8          {d4-d5}, [r6, : 128]!
++      vst1.8          {d6-d7}, [r6, : 128]
++      sub             r1, r6, #16
++      ldrb            r6, [r1]
++      and             r6, r6, #248
++      strb            r6, [r1]
++      ldrb            r6, [r1, #31]
++      and             r6, r6, #127
++      orr             r6, r6, #64
++      strb            r6, [r1, #31]
++      vmov.i64        q2, #0xffffffff
++      vshr.u64        q3, q2, #7
++      vshr.u64        q2, q2, #6
++      vld1.8          {d8}, [r2]
++      vld1.8          {d10}, [r2]
++      add             r2, r2, #6
++      vld1.8          {d12}, [r2]
++      vld1.8          {d14}, [r2]
++      add             r2, r2, #6
++      vld1.8          {d16}, [r2]
++      add             r2, r2, #4
++      vld1.8          {d18}, [r2]
++      vld1.8          {d20}, [r2]
++      add             r2, r2, #6
++      vld1.8          {d22}, [r2]
++      add             r2, r2, #2
++      vld1.8          {d24}, [r2]
++      vld1.8          {d26}, [r2]
++      vshr.u64        q5, q5, #26
++      vshr.u64        q6, q6, #3
++      vshr.u64        q7, q7, #29
++      vshr.u64        q8, q8, #6
++      vshr.u64        q10, q10, #25
++      vshr.u64        q11, q11, #3
++      vshr.u64        q12, q12, #12
++      vshr.u64        q13, q13, #38
++      vand            q4, q4, q2
++      vand            q6, q6, q2
++      vand            q8, q8, q2
++      vand            q10, q10, q2
++      vand            q2, q12, q2
++      vand            q5, q5, q3
++      vand            q7, q7, q3
++      vand            q9, q9, q3
++      vand            q11, q11, q3
++      vand            q3, q13, q3
++      add             r2, r3, #48
++      vadd.i64        q12, q4, q1
++      vadd.i64        q13, q10, q1
++      vshr.s64        q12, q12, #26
++      vshr.s64        q13, q13, #26
++      vadd.i64        q5, q5, q12
++      vshl.i64        q12, q12, #26
++      vadd.i64        q14, q5, q0
++      vadd.i64        q11, q11, q13
++      vshl.i64        q13, q13, #26
++      vadd.i64        q15, q11, q0
++      vsub.i64        q4, q4, q12
++      vshr.s64        q12, q14, #25
++      vsub.i64        q10, q10, q13
++      vshr.s64        q13, q15, #25
++      vadd.i64        q6, q6, q12
++      vshl.i64        q12, q12, #25
++      vadd.i64        q14, q6, q1
++      vadd.i64        q2, q2, q13
++      vsub.i64        q5, q5, q12
++      vshr.s64        q12, q14, #26
++      vshl.i64        q13, q13, #25
++      vadd.i64        q14, q2, q1
++      vadd.i64        q7, q7, q12
++      vshl.i64        q12, q12, #26
++      vadd.i64        q15, q7, q0
++      vsub.i64        q11, q11, q13
++      vshr.s64        q13, q14, #26
++      vsub.i64        q6, q6, q12
++      vshr.s64        q12, q15, #25
++      vadd.i64        q3, q3, q13
++      vshl.i64        q13, q13, #26
++      vadd.i64        q14, q3, q0
++      vadd.i64        q8, q8, q12
++      vshl.i64        q12, q12, #25
++      vadd.i64        q15, q8, q1
++      add             r2, r2, #8
++      vsub.i64        q2, q2, q13
++      vshr.s64        q13, q14, #25
++      vsub.i64        q7, q7, q12
++      vshr.s64        q12, q15, #26
++      vadd.i64        q14, q13, q13
++      vadd.i64        q9, q9, q12
++      vtrn.32         d12, d14
++      vshl.i64        q12, q12, #26
++      vtrn.32         d13, d15
++      vadd.i64        q0, q9, q0
++      vadd.i64        q4, q4, q14
++      vst1.8          d12, [r2, : 64]!
++      vshl.i64        q6, q13, #4
++      vsub.i64        q7, q8, q12
++      vshr.s64        q0, q0, #25
++      vadd.i64        q4, q4, q6
++      vadd.i64        q6, q10, q0
++      vshl.i64        q0, q0, #25
++      vadd.i64        q8, q6, q1
++      vadd.i64        q4, q4, q13
++      vshl.i64        q10, q13, #25
++      vadd.i64        q1, q4, q1
++      vsub.i64        q0, q9, q0
++      vshr.s64        q8, q8, #26
++      vsub.i64        q3, q3, q10
++      vtrn.32         d14, d0
++      vshr.s64        q1, q1, #26
++      vtrn.32         d15, d1
++      vadd.i64        q0, q11, q8
++      vst1.8          d14, [r2, : 64]
++      vshl.i64        q7, q8, #26
++      vadd.i64        q5, q5, q1
++      vtrn.32         d4, d6
++      vshl.i64        q1, q1, #26
++      vtrn.32         d5, d7
++      vsub.i64        q3, q6, q7
++      add             r2, r2, #16
++      vsub.i64        q1, q4, q1
++      vst1.8          d4, [r2, : 64]
++      vtrn.32         d6, d0
++      vtrn.32         d7, d1
++      sub             r2, r2, #8
++      vtrn.32         d2, d10
++      vtrn.32         d3, d11
++      vst1.8          d6, [r2, : 64]
++      sub             r2, r2, #24
++      vst1.8          d2, [r2, : 64]
++      add             r2, r3, #96
++      vmov.i32        q0, #0
++      vmov.i64        d2, #0xff
++      vmov.i64        d3, #0
++      vshr.u32        q1, q1, #7
++      vst1.8          {d2-d3}, [r2, : 128]!
++      vst1.8          {d0-d1}, [r2, : 128]!
++      vst1.8          d0, [r2, : 64]
++      add             r2, r3, #144
++      vmov.i32        q0, #0
++      vst1.8          {d0-d1}, [r2, : 128]!
++      vst1.8          {d0-d1}, [r2, : 128]!
++      vst1.8          d0, [r2, : 64]
++      add             r2, r3, #240
++      vmov.i32        q0, #0
++      vmov.i64        d2, #0xff
++      vmov.i64        d3, #0
++      vshr.u32        q1, q1, #7
++      vst1.8          {d2-d3}, [r2, : 128]!
++      vst1.8          {d0-d1}, [r2, : 128]!
++      vst1.8          d0, [r2, : 64]
++      add             r2, r3, #48
++      add             r6, r3, #192
++      vld1.8          {d0-d1}, [r2, : 128]!
++      vld1.8          {d2-d3}, [r2, : 128]!
++      vld1.8          {d4}, [r2, : 64]
++      vst1.8          {d0-d1}, [r6, : 128]!
++      vst1.8          {d2-d3}, [r6, : 128]!
++      vst1.8          d4, [r6, : 64]
++._mainloop:
++      mov             r2, r5, LSR #3
++      and             r6, r5, #7
++      ldrb            r2, [r1, r2]
++      mov             r2, r2, LSR r6
++      and             r2, r2, #1
++      str             r5, [sp, #488]
++      eor             r4, r4, r2
++      str             r2, [sp, #492]
++      neg             r2, r4
++      add             r4, r3, #96
++      add             r5, r3, #192
++      add             r6, r3, #144
++      vld1.8          {d8-d9}, [r4, : 128]!
++      add             r7, r3, #240
++      vld1.8          {d10-d11}, [r5, : 128]!
++      veor            q6, q4, q5
++      vld1.8          {d14-d15}, [r6, : 128]!
++      vdup.i32        q8, r2
++      vld1.8          {d18-d19}, [r7, : 128]!
++      veor            q10, q7, q9
++      vld1.8          {d22-d23}, [r4, : 128]!
++      vand            q6, q6, q8
++      vld1.8          {d24-d25}, [r5, : 128]!
++      vand            q10, q10, q8
++      vld1.8          {d26-d27}, [r6, : 128]!
++      veor            q4, q4, q6
++      vld1.8          {d28-d29}, [r7, : 128]!
++      veor            q5, q5, q6
++      vld1.8          {d0}, [r4, : 64]
++      veor            q6, q7, q10
++      vld1.8          {d2}, [r5, : 64]
++      veor            q7, q9, q10
++      vld1.8          {d4}, [r6, : 64]
++      veor            q9, q11, q12
++      vld1.8          {d6}, [r7, : 64]
++      veor            q10, q0, q1
++      sub             r2, r4, #32
++      vand            q9, q9, q8
++      sub             r4, r5, #32
++      vand            q10, q10, q8
++      sub             r5, r6, #32
++      veor            q11, q11, q9
++      sub             r6, r7, #32
++      veor            q0, q0, q10
++      veor            q9, q12, q9
++      veor            q1, q1, q10
++      veor            q10, q13, q14
++      veor            q12, q2, q3
++      vand            q10, q10, q8
++      vand            q8, q12, q8
++      veor            q12, q13, q10
++      veor            q2, q2, q8
++      veor            q10, q14, q10
++      veor            q3, q3, q8
++      vadd.i32        q8, q4, q6
++      vsub.i32        q4, q4, q6
++      vst1.8          {d16-d17}, [r2, : 128]!
++      vadd.i32        q6, q11, q12
++      vst1.8          {d8-d9}, [r5, : 128]!
++      vsub.i32        q4, q11, q12
++      vst1.8          {d12-d13}, [r2, : 128]!
++      vadd.i32        q6, q0, q2
++      vst1.8          {d8-d9}, [r5, : 128]!
++      vsub.i32        q0, q0, q2
++      vst1.8          d12, [r2, : 64]
++      vadd.i32        q2, q5, q7
++      vst1.8          d0, [r5, : 64]
++      vsub.i32        q0, q5, q7
++      vst1.8          {d4-d5}, [r4, : 128]!
++      vadd.i32        q2, q9, q10
++      vst1.8          {d0-d1}, [r6, : 128]!
++      vsub.i32        q0, q9, q10
++      vst1.8          {d4-d5}, [r4, : 128]!
++      vadd.i32        q2, q1, q3
++      vst1.8          {d0-d1}, [r6, : 128]!
++      vsub.i32        q0, q1, q3
++      vst1.8          d4, [r4, : 64]
++      vst1.8          d0, [r6, : 64]
++      add             r2, sp, #544
++      add             r4, r3, #96
++      add             r5, r3, #144
++      vld1.8          {d0-d1}, [r2, : 128]
++      vld1.8          {d2-d3}, [r4, : 128]!
++      vld1.8          {d4-d5}, [r5, : 128]!
++      vzip.i32        q1, q2
++      vld1.8          {d6-d7}, [r4, : 128]!
++      vld1.8          {d8-d9}, [r5, : 128]!
++      vshl.i32        q5, q1, #1
++      vzip.i32        q3, q4
++      vshl.i32        q6, q2, #1
++      vld1.8          {d14}, [r4, : 64]
++      vshl.i32        q8, q3, #1
++      vld1.8          {d15}, [r5, : 64]
++      vshl.i32        q9, q4, #1
++      vmul.i32        d21, d7, d1
++      vtrn.32         d14, d15
++      vmul.i32        q11, q4, q0
++      vmul.i32        q0, q7, q0
++      vmull.s32       q12, d2, d2
++      vmlal.s32       q12, d11, d1
++      vmlal.s32       q12, d12, d0
++      vmlal.s32       q12, d13, d23
++      vmlal.s32       q12, d16, d22
++      vmlal.s32       q12, d7, d21
++      vmull.s32       q10, d2, d11
++      vmlal.s32       q10, d4, d1
++      vmlal.s32       q10, d13, d0
++      vmlal.s32       q10, d6, d23
++      vmlal.s32       q10, d17, d22
++      vmull.s32       q13, d10, d4
++      vmlal.s32       q13, d11, d3
++      vmlal.s32       q13, d13, d1
++      vmlal.s32       q13, d16, d0
++      vmlal.s32       q13, d17, d23
++      vmlal.s32       q13, d8, d22
++      vmull.s32       q1, d10, d5
++      vmlal.s32       q1, d11, d4
++      vmlal.s32       q1, d6, d1
++      vmlal.s32       q1, d17, d0
++      vmlal.s32       q1, d8, d23
++      vmull.s32       q14, d10, d6
++      vmlal.s32       q14, d11, d13
++      vmlal.s32       q14, d4, d4
++      vmlal.s32       q14, d17, d1
++      vmlal.s32       q14, d18, d0
++      vmlal.s32       q14, d9, d23
++      vmull.s32       q11, d10, d7
++      vmlal.s32       q11, d11, d6
++      vmlal.s32       q11, d12, d5
++      vmlal.s32       q11, d8, d1
++      vmlal.s32       q11, d19, d0
++      vmull.s32       q15, d10, d8
++      vmlal.s32       q15, d11, d17
++      vmlal.s32       q15, d12, d6
++      vmlal.s32       q15, d13, d5
++      vmlal.s32       q15, d19, d1
++      vmlal.s32       q15, d14, d0
++      vmull.s32       q2, d10, d9
++      vmlal.s32       q2, d11, d8
++      vmlal.s32       q2, d12, d7
++      vmlal.s32       q2, d13, d6
++      vmlal.s32       q2, d14, d1
++      vmull.s32       q0, d15, d1
++      vmlal.s32       q0, d10, d14
++      vmlal.s32       q0, d11, d19
++      vmlal.s32       q0, d12, d8
++      vmlal.s32       q0, d13, d17
++      vmlal.s32       q0, d6, d6
++      add             r2, sp, #512
++      vld1.8          {d18-d19}, [r2, : 128]
++      vmull.s32       q3, d16, d7
++      vmlal.s32       q3, d10, d15
++      vmlal.s32       q3, d11, d14
++      vmlal.s32       q3, d12, d9
++      vmlal.s32       q3, d13, d8
++      add             r2, sp, #528
++      vld1.8          {d8-d9}, [r2, : 128]
++      vadd.i64        q5, q12, q9
++      vadd.i64        q6, q15, q9
++      vshr.s64        q5, q5, #26
++      vshr.s64        q6, q6, #26
++      vadd.i64        q7, q10, q5
++      vshl.i64        q5, q5, #26
++      vadd.i64        q8, q7, q4
++      vadd.i64        q2, q2, q6
++      vshl.i64        q6, q6, #26
++      vadd.i64        q10, q2, q4
++      vsub.i64        q5, q12, q5
++      vshr.s64        q8, q8, #25
++      vsub.i64        q6, q15, q6
++      vshr.s64        q10, q10, #25
++      vadd.i64        q12, q13, q8
++      vshl.i64        q8, q8, #25
++      vadd.i64        q13, q12, q9
++      vadd.i64        q0, q0, q10
++      vsub.i64        q7, q7, q8
++      vshr.s64        q8, q13, #26
++      vshl.i64        q10, q10, #25
++      vadd.i64        q13, q0, q9
++      vadd.i64        q1, q1, q8
++      vshl.i64        q8, q8, #26
++      vadd.i64        q15, q1, q4
++      vsub.i64        q2, q2, q10
++      vshr.s64        q10, q13, #26
++      vsub.i64        q8, q12, q8
++      vshr.s64        q12, q15, #25
++      vadd.i64        q3, q3, q10
++      vshl.i64        q10, q10, #26
++      vadd.i64        q13, q3, q4
++      vadd.i64        q14, q14, q12
++      add             r2, r3, #288
++      vshl.i64        q12, q12, #25
++      add             r4, r3, #336
++      vadd.i64        q15, q14, q9
++      add             r2, r2, #8
++      vsub.i64        q0, q0, q10
++      add             r4, r4, #8
++      vshr.s64        q10, q13, #25
++      vsub.i64        q1, q1, q12
++      vshr.s64        q12, q15, #26
++      vadd.i64        q13, q10, q10
++      vadd.i64        q11, q11, q12
++      vtrn.32         d16, d2
++      vshl.i64        q12, q12, #26
++      vtrn.32         d17, d3
++      vadd.i64        q1, q11, q4
++      vadd.i64        q4, q5, q13
++      vst1.8          d16, [r2, : 64]!
++      vshl.i64        q5, q10, #4
++      vst1.8          d17, [r4, : 64]!
++      vsub.i64        q8, q14, q12
++      vshr.s64        q1, q1, #25
++      vadd.i64        q4, q4, q5
++      vadd.i64        q5, q6, q1
++      vshl.i64        q1, q1, #25
++      vadd.i64        q6, q5, q9
++      vadd.i64        q4, q4, q10
++      vshl.i64        q10, q10, #25
++      vadd.i64        q9, q4, q9
++      vsub.i64        q1, q11, q1
++      vshr.s64        q6, q6, #26
++      vsub.i64        q3, q3, q10
++      vtrn.32         d16, d2
++      vshr.s64        q9, q9, #26
++      vtrn.32         d17, d3
++      vadd.i64        q1, q2, q6
++      vst1.8          d16, [r2, : 64]
++      vshl.i64        q2, q6, #26
++      vst1.8          d17, [r4, : 64]
++      vadd.i64        q6, q7, q9
++      vtrn.32         d0, d6
++      vshl.i64        q7, q9, #26
++      vtrn.32         d1, d7
++      vsub.i64        q2, q5, q2
++      add             r2, r2, #16
++      vsub.i64        q3, q4, q7
++      vst1.8          d0, [r2, : 64]
++      add             r4, r4, #16
++      vst1.8          d1, [r4, : 64]
++      vtrn.32         d4, d2
++      vtrn.32         d5, d3
++      sub             r2, r2, #8
++      sub             r4, r4, #8
++      vtrn.32         d6, d12
++      vtrn.32         d7, d13
++      vst1.8          d4, [r2, : 64]
++      vst1.8          d5, [r4, : 64]
++      sub             r2, r2, #24
++      sub             r4, r4, #24
++      vst1.8          d6, [r2, : 64]
++      vst1.8          d7, [r4, : 64]
++      add             r2, r3, #240
++      add             r4, r3, #96
++      vld1.8          {d0-d1}, [r4, : 128]!
++      vld1.8          {d2-d3}, [r4, : 128]!
++      vld1.8          {d4}, [r4, : 64]
++      add             r4, r3, #144
++      vld1.8          {d6-d7}, [r4, : 128]!
++      vtrn.32         q0, q3
++      vld1.8          {d8-d9}, [r4, : 128]!
++      vshl.i32        q5, q0, #4
++      vtrn.32         q1, q4
++      vshl.i32        q6, q3, #4
++      vadd.i32        q5, q5, q0
++      vadd.i32        q6, q6, q3
++      vshl.i32        q7, q1, #4
++      vld1.8          {d5}, [r4, : 64]
++      vshl.i32        q8, q4, #4
++      vtrn.32         d4, d5
++      vadd.i32        q7, q7, q1
++      vadd.i32        q8, q8, q4
++      vld1.8          {d18-d19}, [r2, : 128]!
++      vshl.i32        q10, q2, #4
++      vld1.8          {d22-d23}, [r2, : 128]!
++      vadd.i32        q10, q10, q2
++      vld1.8          {d24}, [r2, : 64]
++      vadd.i32        q5, q5, q0
++      add             r2, r3, #192
++      vld1.8          {d26-d27}, [r2, : 128]!
++      vadd.i32        q6, q6, q3
++      vld1.8          {d28-d29}, [r2, : 128]!
++      vadd.i32        q8, q8, q4
++      vld1.8          {d25}, [r2, : 64]
++      vadd.i32        q10, q10, q2
++      vtrn.32         q9, q13
++      vadd.i32        q7, q7, q1
++      vadd.i32        q5, q5, q0
++      vtrn.32         q11, q14
++      vadd.i32        q6, q6, q3
++      add             r2, sp, #560
++      vadd.i32        q10, q10, q2
++      vtrn.32         d24, d25
++      vst1.8          {d12-d13}, [r2, : 128]
++      vshl.i32        q6, q13, #1
++      add             r2, sp, #576
++      vst1.8          {d20-d21}, [r2, : 128]
++      vshl.i32        q10, q14, #1
++      add             r2, sp, #592
++      vst1.8          {d12-d13}, [r2, : 128]
++      vshl.i32        q15, q12, #1
++      vadd.i32        q8, q8, q4
++      vext.32         d10, d31, d30, #0
++      vadd.i32        q7, q7, q1
++      add             r2, sp, #608
++      vst1.8          {d16-d17}, [r2, : 128]
++      vmull.s32       q8, d18, d5
++      vmlal.s32       q8, d26, d4
++      vmlal.s32       q8, d19, d9
++      vmlal.s32       q8, d27, d3
++      vmlal.s32       q8, d22, d8
++      vmlal.s32       q8, d28, d2
++      vmlal.s32       q8, d23, d7
++      vmlal.s32       q8, d29, d1
++      vmlal.s32       q8, d24, d6
++      vmlal.s32       q8, d25, d0
++      add             r2, sp, #624
++      vst1.8          {d14-d15}, [r2, : 128]
++      vmull.s32       q2, d18, d4
++      vmlal.s32       q2, d12, d9
++      vmlal.s32       q2, d13, d8
++      vmlal.s32       q2, d19, d3
++      vmlal.s32       q2, d22, d2
++      vmlal.s32       q2, d23, d1
++      vmlal.s32       q2, d24, d0
++      add             r2, sp, #640
++      vst1.8          {d20-d21}, [r2, : 128]
++      vmull.s32       q7, d18, d9
++      vmlal.s32       q7, d26, d3
++      vmlal.s32       q7, d19, d8
++      vmlal.s32       q7, d27, d2
++      vmlal.s32       q7, d22, d7
++      vmlal.s32       q7, d28, d1
++      vmlal.s32       q7, d23, d6
++      vmlal.s32       q7, d29, d0
++      add             r2, sp, #656
++      vst1.8          {d10-d11}, [r2, : 128]
++      vmull.s32       q5, d18, d3
++      vmlal.s32       q5, d19, d2
++      vmlal.s32       q5, d22, d1
++      vmlal.s32       q5, d23, d0
++      vmlal.s32       q5, d12, d8
++      add             r2, sp, #672
++      vst1.8          {d16-d17}, [r2, : 128]
++      vmull.s32       q4, d18, d8
++      vmlal.s32       q4, d26, d2
++      vmlal.s32       q4, d19, d7
++      vmlal.s32       q4, d27, d1
++      vmlal.s32       q4, d22, d6
++      vmlal.s32       q4, d28, d0
++      vmull.s32       q8, d18, d7
++      vmlal.s32       q8, d26, d1
++      vmlal.s32       q8, d19, d6
++      vmlal.s32       q8, d27, d0
++      add             r2, sp, #576
++      vld1.8          {d20-d21}, [r2, : 128]
++      vmlal.s32       q7, d24, d21
++      vmlal.s32       q7, d25, d20
++      vmlal.s32       q4, d23, d21
++      vmlal.s32       q4, d29, d20
++      vmlal.s32       q8, d22, d21
++      vmlal.s32       q8, d28, d20
++      vmlal.s32       q5, d24, d20
++      add             r2, sp, #576
++      vst1.8          {d14-d15}, [r2, : 128]
++      vmull.s32       q7, d18, d6
++      vmlal.s32       q7, d26, d0
++      add             r2, sp, #656
++      vld1.8          {d30-d31}, [r2, : 128]
++      vmlal.s32       q2, d30, d21
++      vmlal.s32       q7, d19, d21
++      vmlal.s32       q7, d27, d20
++      add             r2, sp, #624
++      vld1.8          {d26-d27}, [r2, : 128]
++      vmlal.s32       q4, d25, d27
++      vmlal.s32       q8, d29, d27
++      vmlal.s32       q8, d25, d26
++      vmlal.s32       q7, d28, d27
++      vmlal.s32       q7, d29, d26
++      add             r2, sp, #608
++      vld1.8          {d28-d29}, [r2, : 128]
++      vmlal.s32       q4, d24, d29
++      vmlal.s32       q8, d23, d29
++      vmlal.s32       q8, d24, d28
++      vmlal.s32       q7, d22, d29
++      vmlal.s32       q7, d23, d28
++      add             r2, sp, #608
++      vst1.8          {d8-d9}, [r2, : 128]
++      add             r2, sp, #560
++      vld1.8          {d8-d9}, [r2, : 128]
++      vmlal.s32       q7, d24, d9
++      vmlal.s32       q7, d25, d31
++      vmull.s32       q1, d18, d2
++      vmlal.s32       q1, d19, d1
++      vmlal.s32       q1, d22, d0
++      vmlal.s32       q1, d24, d27
++      vmlal.s32       q1, d23, d20
++      vmlal.s32       q1, d12, d7
++      vmlal.s32       q1, d13, d6
++      vmull.s32       q6, d18, d1
++      vmlal.s32       q6, d19, d0
++      vmlal.s32       q6, d23, d27
++      vmlal.s32       q6, d22, d20
++      vmlal.s32       q6, d24, d26
++      vmull.s32       q0, d18, d0
++      vmlal.s32       q0, d22, d27
++      vmlal.s32       q0, d23, d26
++      vmlal.s32       q0, d24, d31
++      vmlal.s32       q0, d19, d20
++      add             r2, sp, #640
++      vld1.8          {d18-d19}, [r2, : 128]
++      vmlal.s32       q2, d18, d7
++      vmlal.s32       q2, d19, d6
++      vmlal.s32       q5, d18, d6
++      vmlal.s32       q5, d19, d21
++      vmlal.s32       q1, d18, d21
++      vmlal.s32       q1, d19, d29
++      vmlal.s32       q0, d18, d28
++      vmlal.s32       q0, d19, d9
++      vmlal.s32       q6, d18, d29
++      vmlal.s32       q6, d19, d28
++      add             r2, sp, #592
++      vld1.8          {d18-d19}, [r2, : 128]
++      add             r2, sp, #512
++      vld1.8          {d22-d23}, [r2, : 128]
++      vmlal.s32       q5, d19, d7
++      vmlal.s32       q0, d18, d21
++      vmlal.s32       q0, d19, d29
++      vmlal.s32       q6, d18, d6
++      add             r2, sp, #528
++      vld1.8          {d6-d7}, [r2, : 128]
++      vmlal.s32       q6, d19, d21
++      add             r2, sp, #576
++      vld1.8          {d18-d19}, [r2, : 128]
++      vmlal.s32       q0, d30, d8
++      add             r2, sp, #672
++      vld1.8          {d20-d21}, [r2, : 128]
++      vmlal.s32       q5, d30, d29
++      add             r2, sp, #608
++      vld1.8          {d24-d25}, [r2, : 128]
++      vmlal.s32       q1, d30, d28
++      vadd.i64        q13, q0, q11
++      vadd.i64        q14, q5, q11
++      vmlal.s32       q6, d30, d9
++      vshr.s64        q4, q13, #26
++      vshr.s64        q13, q14, #26
++      vadd.i64        q7, q7, q4
++      vshl.i64        q4, q4, #26
++      vadd.i64        q14, q7, q3
++      vadd.i64        q9, q9, q13
++      vshl.i64        q13, q13, #26
++      vadd.i64        q15, q9, q3
++      vsub.i64        q0, q0, q4
++      vshr.s64        q4, q14, #25
++      vsub.i64        q5, q5, q13
++      vshr.s64        q13, q15, #25
++      vadd.i64        q6, q6, q4
++      vshl.i64        q4, q4, #25
++      vadd.i64        q14, q6, q11
++      vadd.i64        q2, q2, q13
++      vsub.i64        q4, q7, q4
++      vshr.s64        q7, q14, #26
++      vshl.i64        q13, q13, #25
++      vadd.i64        q14, q2, q11
++      vadd.i64        q8, q8, q7
++      vshl.i64        q7, q7, #26
++      vadd.i64        q15, q8, q3
++      vsub.i64        q9, q9, q13
++      vshr.s64        q13, q14, #26
++      vsub.i64        q6, q6, q7
++      vshr.s64        q7, q15, #25
++      vadd.i64        q10, q10, q13
++      vshl.i64        q13, q13, #26
++      vadd.i64        q14, q10, q3
++      vadd.i64        q1, q1, q7
++      add             r2, r3, #144
++      vshl.i64        q7, q7, #25
++      add             r4, r3, #96
++      vadd.i64        q15, q1, q11
++      add             r2, r2, #8
++      vsub.i64        q2, q2, q13
++      add             r4, r4, #8
++      vshr.s64        q13, q14, #25
++      vsub.i64        q7, q8, q7
++      vshr.s64        q8, q15, #26
++      vadd.i64        q14, q13, q13
++      vadd.i64        q12, q12, q8
++      vtrn.32         d12, d14
++      vshl.i64        q8, q8, #26
++      vtrn.32         d13, d15
++      vadd.i64        q3, q12, q3
++      vadd.i64        q0, q0, q14
++      vst1.8          d12, [r2, : 64]!
++      vshl.i64        q7, q13, #4
++      vst1.8          d13, [r4, : 64]!
++      vsub.i64        q1, q1, q8
++      vshr.s64        q3, q3, #25
++      vadd.i64        q0, q0, q7
++      vadd.i64        q5, q5, q3
++      vshl.i64        q3, q3, #25
++      vadd.i64        q6, q5, q11
++      vadd.i64        q0, q0, q13
++      vshl.i64        q7, q13, #25
++      vadd.i64        q8, q0, q11
++      vsub.i64        q3, q12, q3
++      vshr.s64        q6, q6, #26
++      vsub.i64        q7, q10, q7
++      vtrn.32         d2, d6
++      vshr.s64        q8, q8, #26
++      vtrn.32         d3, d7
++      vadd.i64        q3, q9, q6
++      vst1.8          d2, [r2, : 64]
++      vshl.i64        q6, q6, #26
++      vst1.8          d3, [r4, : 64]
++      vadd.i64        q1, q4, q8
++      vtrn.32         d4, d14
++      vshl.i64        q4, q8, #26
++      vtrn.32         d5, d15
++      vsub.i64        q5, q5, q6
++      add             r2, r2, #16
++      vsub.i64        q0, q0, q4
++      vst1.8          d4, [r2, : 64]
++      add             r4, r4, #16
++      vst1.8          d5, [r4, : 64]
++      vtrn.32         d10, d6
++      vtrn.32         d11, d7
++      sub             r2, r2, #8
++      sub             r4, r4, #8
++      vtrn.32         d0, d2
++      vtrn.32         d1, d3
++      vst1.8          d10, [r2, : 64]
++      vst1.8          d11, [r4, : 64]
++      sub             r2, r2, #24
++      sub             r4, r4, #24
++      vst1.8          d0, [r2, : 64]
++      vst1.8          d1, [r4, : 64]
++      add             r2, r3, #288
++      add             r4, r3, #336
++      vld1.8          {d0-d1}, [r2, : 128]!
++      vld1.8          {d2-d3}, [r4, : 128]!
++      vsub.i32        q0, q0, q1
++      vld1.8          {d2-d3}, [r2, : 128]!
++      vld1.8          {d4-d5}, [r4, : 128]!
++      vsub.i32        q1, q1, q2
++      add             r5, r3, #240
++      vld1.8          {d4}, [r2, : 64]
++      vld1.8          {d6}, [r4, : 64]
++      vsub.i32        q2, q2, q3
++      vst1.8          {d0-d1}, [r5, : 128]!
++      vst1.8          {d2-d3}, [r5, : 128]!
++      vst1.8          d4, [r5, : 64]
++      add             r2, r3, #144
++      add             r4, r3, #96
++      add             r5, r3, #144
++      add             r6, r3, #192
++      vld1.8          {d0-d1}, [r2, : 128]!
++      vld1.8          {d2-d3}, [r4, : 128]!
++      vsub.i32        q2, q0, q1
++      vadd.i32        q0, q0, q1
++      vld1.8          {d2-d3}, [r2, : 128]!
++      vld1.8          {d6-d7}, [r4, : 128]!
++      vsub.i32        q4, q1, q3
++      vadd.i32        q1, q1, q3
++      vld1.8          {d6}, [r2, : 64]
++      vld1.8          {d10}, [r4, : 64]
++      vsub.i32        q6, q3, q5
++      vadd.i32        q3, q3, q5
++      vst1.8          {d4-d5}, [r5, : 128]!
++      vst1.8          {d0-d1}, [r6, : 128]!
++      vst1.8          {d8-d9}, [r5, : 128]!
++      vst1.8          {d2-d3}, [r6, : 128]!
++      vst1.8          d12, [r5, : 64]
++      vst1.8          d6, [r6, : 64]
++      add             r2, r3, #0
++      add             r4, r3, #240
++      vld1.8          {d0-d1}, [r4, : 128]!
++      vld1.8          {d2-d3}, [r4, : 128]!
++      vld1.8          {d4}, [r4, : 64]
++      add             r4, r3, #336
++      vld1.8          {d6-d7}, [r4, : 128]!
++      vtrn.32         q0, q3
++      vld1.8          {d8-d9}, [r4, : 128]!
++      vshl.i32        q5, q0, #4
++      vtrn.32         q1, q4
++      vshl.i32        q6, q3, #4
++      vadd.i32        q5, q5, q0
++      vadd.i32        q6, q6, q3
++      vshl.i32        q7, q1, #4
++      vld1.8          {d5}, [r4, : 64]
++      vshl.i32        q8, q4, #4
++      vtrn.32         d4, d5
++      vadd.i32        q7, q7, q1
++      vadd.i32        q8, q8, q4
++      vld1.8          {d18-d19}, [r2, : 128]!
++      vshl.i32        q10, q2, #4
++      vld1.8          {d22-d23}, [r2, : 128]!
++      vadd.i32        q10, q10, q2
++      vld1.8          {d24}, [r2, : 64]
++      vadd.i32        q5, q5, q0
++      add             r2, r3, #288
++      vld1.8          {d26-d27}, [r2, : 128]!
++      vadd.i32        q6, q6, q3
++      vld1.8          {d28-d29}, [r2, : 128]!
++      vadd.i32        q8, q8, q4
++      vld1.8          {d25}, [r2, : 64]
++      vadd.i32        q10, q10, q2
++      vtrn.32         q9, q13
++      vadd.i32        q7, q7, q1
++      vadd.i32        q5, q5, q0
++      vtrn.32         q11, q14
++      vadd.i32        q6, q6, q3
++      add             r2, sp, #560
++      vadd.i32        q10, q10, q2
++      vtrn.32         d24, d25
++      vst1.8          {d12-d13}, [r2, : 128]
++      vshl.i32        q6, q13, #1
++      add             r2, sp, #576
++      vst1.8          {d20-d21}, [r2, : 128]
++      vshl.i32        q10, q14, #1
++      add             r2, sp, #592
++      vst1.8          {d12-d13}, [r2, : 128]
++      vshl.i32        q15, q12, #1
++      vadd.i32        q8, q8, q4
++      vext.32         d10, d31, d30, #0
++      vadd.i32        q7, q7, q1
++      add             r2, sp, #608
++      vst1.8          {d16-d17}, [r2, : 128]
++      vmull.s32       q8, d18, d5
++      vmlal.s32       q8, d26, d4
++      vmlal.s32       q8, d19, d9
++      vmlal.s32       q8, d27, d3
++      vmlal.s32       q8, d22, d8
++      vmlal.s32       q8, d28, d2
++      vmlal.s32       q8, d23, d7
++      vmlal.s32       q8, d29, d1
++      vmlal.s32       q8, d24, d6
++      vmlal.s32       q8, d25, d0
++      add             r2, sp, #624
++      vst1.8          {d14-d15}, [r2, : 128]
++      vmull.s32       q2, d18, d4
++      vmlal.s32       q2, d12, d9
++      vmlal.s32       q2, d13, d8
++      vmlal.s32       q2, d19, d3
++      vmlal.s32       q2, d22, d2
++      vmlal.s32       q2, d23, d1
++      vmlal.s32       q2, d24, d0
++      add             r2, sp, #640
++      vst1.8          {d20-d21}, [r2, : 128]
++      vmull.s32       q7, d18, d9
++      vmlal.s32       q7, d26, d3
++      vmlal.s32       q7, d19, d8
++      vmlal.s32       q7, d27, d2
++      vmlal.s32       q7, d22, d7
++      vmlal.s32       q7, d28, d1
++      vmlal.s32       q7, d23, d6
++      vmlal.s32       q7, d29, d0
++      add             r2, sp, #656
++      vst1.8          {d10-d11}, [r2, : 128]
++      vmull.s32       q5, d18, d3
++      vmlal.s32       q5, d19, d2
++      vmlal.s32       q5, d22, d1
++      vmlal.s32       q5, d23, d0
++      vmlal.s32       q5, d12, d8
++      add             r2, sp, #672
++      vst1.8          {d16-d17}, [r2, : 128]
++      vmull.s32       q4, d18, d8
++      vmlal.s32       q4, d26, d2
++      vmlal.s32       q4, d19, d7
++      vmlal.s32       q4, d27, d1
++      vmlal.s32       q4, d22, d6
++      vmlal.s32       q4, d28, d0
++      vmull.s32       q8, d18, d7
++      vmlal.s32       q8, d26, d1
++      vmlal.s32       q8, d19, d6
++      vmlal.s32       q8, d27, d0
++      add             r2, sp, #576
++      vld1.8          {d20-d21}, [r2, : 128]
++      vmlal.s32       q7, d24, d21
++      vmlal.s32       q7, d25, d20
++      vmlal.s32       q4, d23, d21
++      vmlal.s32       q4, d29, d20
++      vmlal.s32       q8, d22, d21
++      vmlal.s32       q8, d28, d20
++      vmlal.s32       q5, d24, d20
++      add             r2, sp, #576
++      vst1.8          {d14-d15}, [r2, : 128]
++      vmull.s32       q7, d18, d6
++      vmlal.s32       q7, d26, d0
++      add             r2, sp, #656
++      vld1.8          {d30-d31}, [r2, : 128]
++      vmlal.s32       q2, d30, d21
++      vmlal.s32       q7, d19, d21
++      vmlal.s32       q7, d27, d20
++      add             r2, sp, #624
++      vld1.8          {d26-d27}, [r2, : 128]
++      vmlal.s32       q4, d25, d27
++      vmlal.s32       q8, d29, d27
++      vmlal.s32       q8, d25, d26
++      vmlal.s32       q7, d28, d27
++      vmlal.s32       q7, d29, d26
++      add             r2, sp, #608
++      vld1.8          {d28-d29}, [r2, : 128]
++      vmlal.s32       q4, d24, d29
++      vmlal.s32       q8, d23, d29
++      vmlal.s32       q8, d24, d28
++      vmlal.s32       q7, d22, d29
++      vmlal.s32       q7, d23, d28
++      add             r2, sp, #608
++      vst1.8          {d8-d9}, [r2, : 128]
++      add             r2, sp, #560
++      vld1.8          {d8-d9}, [r2, : 128]
++      vmlal.s32       q7, d24, d9
++      vmlal.s32       q7, d25, d31
++      vmull.s32       q1, d18, d2
++      vmlal.s32       q1, d19, d1
++      vmlal.s32       q1, d22, d0
++      vmlal.s32       q1, d24, d27
++      vmlal.s32       q1, d23, d20
++      vmlal.s32       q1, d12, d7
++      vmlal.s32       q1, d13, d6
++      vmull.s32       q6, d18, d1
++      vmlal.s32       q6, d19, d0
++      vmlal.s32       q6, d23, d27
++      vmlal.s32       q6, d22, d20
++      vmlal.s32       q6, d24, d26
++      vmull.s32       q0, d18, d0
++      vmlal.s32       q0, d22, d27
++      vmlal.s32       q0, d23, d26
++      vmlal.s32       q0, d24, d31
++      vmlal.s32       q0, d19, d20
++      add             r2, sp, #640
++      vld1.8          {d18-d19}, [r2, : 128]
++      vmlal.s32       q2, d18, d7
++      vmlal.s32       q2, d19, d6
++      vmlal.s32       q5, d18, d6
++      vmlal.s32       q5, d19, d21
++      vmlal.s32       q1, d18, d21
++      vmlal.s32       q1, d19, d29
++      vmlal.s32       q0, d18, d28
++      vmlal.s32       q0, d19, d9
++      vmlal.s32       q6, d18, d29
++      vmlal.s32       q6, d19, d28
++      add             r2, sp, #592
++      vld1.8          {d18-d19}, [r2, : 128]
++      add             r2, sp, #512
++      vld1.8          {d22-d23}, [r2, : 128]
++      vmlal.s32       q5, d19, d7
++      vmlal.s32       q0, d18, d21
++      vmlal.s32       q0, d19, d29
++      vmlal.s32       q6, d18, d6
++      add             r2, sp, #528
++      vld1.8          {d6-d7}, [r2, : 128]
++      vmlal.s32       q6, d19, d21
++      add             r2, sp, #576
++      vld1.8          {d18-d19}, [r2, : 128]
++      vmlal.s32       q0, d30, d8
++      add             r2, sp, #672
++      vld1.8          {d20-d21}, [r2, : 128]
++      vmlal.s32       q5, d30, d29
++      add             r2, sp, #608
++      vld1.8          {d24-d25}, [r2, : 128]
++      vmlal.s32       q1, d30, d28
++      vadd.i64        q13, q0, q11
++      vadd.i64        q14, q5, q11
++      vmlal.s32       q6, d30, d9
++      vshr.s64        q4, q13, #26
++      vshr.s64        q13, q14, #26
++      vadd.i64        q7, q7, q4
++      vshl.i64        q4, q4, #26
++      vadd.i64        q14, q7, q3
++      vadd.i64        q9, q9, q13
++      vshl.i64        q13, q13, #26
++      vadd.i64        q15, q9, q3
++      vsub.i64        q0, q0, q4
++      vshr.s64        q4, q14, #25
++      vsub.i64        q5, q5, q13
++      vshr.s64        q13, q15, #25
++      vadd.i64        q6, q6, q4
++      vshl.i64        q4, q4, #25
++      vadd.i64        q14, q6, q11
++      vadd.i64        q2, q2, q13
++      vsub.i64        q4, q7, q4
++      vshr.s64        q7, q14, #26
++      vshl.i64        q13, q13, #25
++      vadd.i64        q14, q2, q11
++      vadd.i64        q8, q8, q7
++      vshl.i64        q7, q7, #26
++      vadd.i64        q15, q8, q3
++      vsub.i64        q9, q9, q13
++      vshr.s64        q13, q14, #26
++      vsub.i64        q6, q6, q7
++      vshr.s64        q7, q15, #25
++      vadd.i64        q10, q10, q13
++      vshl.i64        q13, q13, #26
++      vadd.i64        q14, q10, q3
++      vadd.i64        q1, q1, q7
++      add             r2, r3, #288
++      vshl.i64        q7, q7, #25
++      add             r4, r3, #96
++      vadd.i64        q15, q1, q11
++      add             r2, r2, #8
++      vsub.i64        q2, q2, q13
++      add             r4, r4, #8
++      vshr.s64        q13, q14, #25
++      vsub.i64        q7, q8, q7
++      vshr.s64        q8, q15, #26
++      vadd.i64        q14, q13, q13
++      vadd.i64        q12, q12, q8
++      vtrn.32         d12, d14
++      vshl.i64        q8, q8, #26
++      vtrn.32         d13, d15
++      vadd.i64        q3, q12, q3
++      vadd.i64        q0, q0, q14
++      vst1.8          d12, [r2, : 64]!
++      vshl.i64        q7, q13, #4
++      vst1.8          d13, [r4, : 64]!
++      vsub.i64        q1, q1, q8
++      vshr.s64        q3, q3, #25
++      vadd.i64        q0, q0, q7
++      vadd.i64        q5, q5, q3
++      vshl.i64        q3, q3, #25
++      vadd.i64        q6, q5, q11
++      vadd.i64        q0, q0, q13
++      vshl.i64        q7, q13, #25
++      vadd.i64        q8, q0, q11
++      vsub.i64        q3, q12, q3
++      vshr.s64        q6, q6, #26
++      vsub.i64        q7, q10, q7
++      vtrn.32         d2, d6
++      vshr.s64        q8, q8, #26
++      vtrn.32         d3, d7
++      vadd.i64        q3, q9, q6
++      vst1.8          d2, [r2, : 64]
++      vshl.i64        q6, q6, #26
++      vst1.8          d3, [r4, : 64]
++      vadd.i64        q1, q4, q8
++      vtrn.32         d4, d14
++      vshl.i64        q4, q8, #26
++      vtrn.32         d5, d15
++      vsub.i64        q5, q5, q6
++      add             r2, r2, #16
++      vsub.i64        q0, q0, q4
++      vst1.8          d4, [r2, : 64]
++      add             r4, r4, #16
++      vst1.8          d5, [r4, : 64]
++      vtrn.32         d10, d6
++      vtrn.32         d11, d7
++      sub             r2, r2, #8
++      sub             r4, r4, #8
++      vtrn.32         d0, d2
++      vtrn.32         d1, d3
++      vst1.8          d10, [r2, : 64]
++      vst1.8          d11, [r4, : 64]
++      sub             r2, r2, #24
++      sub             r4, r4, #24
++      vst1.8          d0, [r2, : 64]
++      vst1.8          d1, [r4, : 64]
++      add             r2, sp, #544
++      add             r4, r3, #144
++      add             r5, r3, #192
++      vld1.8          {d0-d1}, [r2, : 128]
++      vld1.8          {d2-d3}, [r4, : 128]!
++      vld1.8          {d4-d5}, [r5, : 128]!
++      vzip.i32        q1, q2
++      vld1.8          {d6-d7}, [r4, : 128]!
++      vld1.8          {d8-d9}, [r5, : 128]!
++      vshl.i32        q5, q1, #1
++      vzip.i32        q3, q4
++      vshl.i32        q6, q2, #1
++      vld1.8          {d14}, [r4, : 64]
++      vshl.i32        q8, q3, #1
++      vld1.8          {d15}, [r5, : 64]
++      vshl.i32        q9, q4, #1
++      vmul.i32        d21, d7, d1
++      vtrn.32         d14, d15
++      vmul.i32        q11, q4, q0
++      vmul.i32        q0, q7, q0
++      vmull.s32       q12, d2, d2
++      vmlal.s32       q12, d11, d1
++      vmlal.s32       q12, d12, d0
++      vmlal.s32       q12, d13, d23
++      vmlal.s32       q12, d16, d22
++      vmlal.s32       q12, d7, d21
++      vmull.s32       q10, d2, d11
++      vmlal.s32       q10, d4, d1
++      vmlal.s32       q10, d13, d0
++      vmlal.s32       q10, d6, d23
++      vmlal.s32       q10, d17, d22
++      vmull.s32       q13, d10, d4
++      vmlal.s32       q13, d11, d3
++      vmlal.s32       q13, d13, d1
++      vmlal.s32       q13, d16, d0
++      vmlal.s32       q13, d17, d23
++      vmlal.s32       q13, d8, d22
++      vmull.s32       q1, d10, d5
++      vmlal.s32       q1, d11, d4
++      vmlal.s32       q1, d6, d1
++      vmlal.s32       q1, d17, d0
++      vmlal.s32       q1, d8, d23
++      vmull.s32       q14, d10, d6
++      vmlal.s32       q14, d11, d13
++      vmlal.s32       q14, d4, d4
++      vmlal.s32       q14, d17, d1
++      vmlal.s32       q14, d18, d0
++      vmlal.s32       q14, d9, d23
++      vmull.s32       q11, d10, d7
++      vmlal.s32       q11, d11, d6
++      vmlal.s32       q11, d12, d5
++      vmlal.s32       q11, d8, d1
++      vmlal.s32       q11, d19, d0
++      vmull.s32       q15, d10, d8
++      vmlal.s32       q15, d11, d17
++      vmlal.s32       q15, d12, d6
++      vmlal.s32       q15, d13, d5
++      vmlal.s32       q15, d19, d1
++      vmlal.s32       q15, d14, d0
++      vmull.s32       q2, d10, d9
++      vmlal.s32       q2, d11, d8
++      vmlal.s32       q2, d12, d7
++      vmlal.s32       q2, d13, d6
++      vmlal.s32       q2, d14, d1
++      vmull.s32       q0, d15, d1
++      vmlal.s32       q0, d10, d14
++      vmlal.s32       q0, d11, d19
++      vmlal.s32       q0, d12, d8
++      vmlal.s32       q0, d13, d17
++      vmlal.s32       q0, d6, d6
++      add             r2, sp, #512
++      vld1.8          {d18-d19}, [r2, : 128]
++      vmull.s32       q3, d16, d7
++      vmlal.s32       q3, d10, d15
++      vmlal.s32       q3, d11, d14
++      vmlal.s32       q3, d12, d9
++      vmlal.s32       q3, d13, d8
++      add             r2, sp, #528
++      vld1.8          {d8-d9}, [r2, : 128]
++      vadd.i64        q5, q12, q9
++      vadd.i64        q6, q15, q9
++      vshr.s64        q5, q5, #26
++      vshr.s64        q6, q6, #26
++      vadd.i64        q7, q10, q5
++      vshl.i64        q5, q5, #26
++      vadd.i64        q8, q7, q4
++      vadd.i64        q2, q2, q6
++      vshl.i64        q6, q6, #26
++      vadd.i64        q10, q2, q4
++      vsub.i64        q5, q12, q5
++      vshr.s64        q8, q8, #25
++      vsub.i64        q6, q15, q6
++      vshr.s64        q10, q10, #25
++      vadd.i64        q12, q13, q8
++      vshl.i64        q8, q8, #25
++      vadd.i64        q13, q12, q9
++      vadd.i64        q0, q0, q10
++      vsub.i64        q7, q7, q8
++      vshr.s64        q8, q13, #26
++      vshl.i64        q10, q10, #25
++      vadd.i64        q13, q0, q9
++      vadd.i64        q1, q1, q8
++      vshl.i64        q8, q8, #26
++      vadd.i64        q15, q1, q4
++      vsub.i64        q2, q2, q10
++      vshr.s64        q10, q13, #26
++      vsub.i64        q8, q12, q8
++      vshr.s64        q12, q15, #25
++      vadd.i64        q3, q3, q10
++      vshl.i64        q10, q10, #26
++      vadd.i64        q13, q3, q4
++      vadd.i64        q14, q14, q12
++      add             r2, r3, #144
++      vshl.i64        q12, q12, #25
++      add             r4, r3, #192
++      vadd.i64        q15, q14, q9
++      add             r2, r2, #8
++      vsub.i64        q0, q0, q10
++      add             r4, r4, #8
++      vshr.s64        q10, q13, #25
++      vsub.i64        q1, q1, q12
++      vshr.s64        q12, q15, #26
++      vadd.i64        q13, q10, q10
++      vadd.i64        q11, q11, q12
++      vtrn.32         d16, d2
++      vshl.i64        q12, q12, #26
++      vtrn.32         d17, d3
++      vadd.i64        q1, q11, q4
++      vadd.i64        q4, q5, q13
++      vst1.8          d16, [r2, : 64]!
++      vshl.i64        q5, q10, #4
++      vst1.8          d17, [r4, : 64]!
++      vsub.i64        q8, q14, q12
++      vshr.s64        q1, q1, #25
++      vadd.i64        q4, q4, q5
++      vadd.i64        q5, q6, q1
++      vshl.i64        q1, q1, #25
++      vadd.i64        q6, q5, q9
++      vadd.i64        q4, q4, q10
++      vshl.i64        q10, q10, #25
++      vadd.i64        q9, q4, q9
++      vsub.i64        q1, q11, q1
++      vshr.s64        q6, q6, #26
++      vsub.i64        q3, q3, q10
++      vtrn.32         d16, d2
++      vshr.s64        q9, q9, #26
++      vtrn.32         d17, d3
++      vadd.i64        q1, q2, q6
++      vst1.8          d16, [r2, : 64]
++      vshl.i64        q2, q6, #26
++      vst1.8          d17, [r4, : 64]
++      vadd.i64        q6, q7, q9
++      vtrn.32         d0, d6
++      vshl.i64        q7, q9, #26
++      vtrn.32         d1, d7
++      vsub.i64        q2, q5, q2
++      add             r2, r2, #16
++      vsub.i64        q3, q4, q7
++      vst1.8          d0, [r2, : 64]
++      add             r4, r4, #16
++      vst1.8          d1, [r4, : 64]
++      vtrn.32         d4, d2
++      vtrn.32         d5, d3
++      sub             r2, r2, #8
++      sub             r4, r4, #8
++      vtrn.32         d6, d12
++      vtrn.32         d7, d13
++      vst1.8          d4, [r2, : 64]
++      vst1.8          d5, [r4, : 64]
++      sub             r2, r2, #24
++      sub             r4, r4, #24
++      vst1.8          d6, [r2, : 64]
++      vst1.8          d7, [r4, : 64]
++      add             r2, r3, #336
++      add             r4, r3, #288
++      vld1.8          {d0-d1}, [r2, : 128]!
++      vld1.8          {d2-d3}, [r4, : 128]!
++      vadd.i32        q0, q0, q1
++      vld1.8          {d2-d3}, [r2, : 128]!
++      vld1.8          {d4-d5}, [r4, : 128]!
++      vadd.i32        q1, q1, q2
++      add             r5, r3, #288
++      vld1.8          {d4}, [r2, : 64]
++      vld1.8          {d6}, [r4, : 64]
++      vadd.i32        q2, q2, q3
++      vst1.8          {d0-d1}, [r5, : 128]!
++      vst1.8          {d2-d3}, [r5, : 128]!
++      vst1.8          d4, [r5, : 64]
++      add             r2, r3, #48
++      add             r4, r3, #144
++      vld1.8          {d0-d1}, [r4, : 128]!
++      vld1.8          {d2-d3}, [r4, : 128]!
++      vld1.8          {d4}, [r4, : 64]
++      add             r4, r3, #288
++      vld1.8          {d6-d7}, [r4, : 128]!
++      vtrn.32         q0, q3
++      vld1.8          {d8-d9}, [r4, : 128]!
++      vshl.i32        q5, q0, #4
++      vtrn.32         q1, q4
++      vshl.i32        q6, q3, #4
++      vadd.i32        q5, q5, q0
++      vadd.i32        q6, q6, q3
++      vshl.i32        q7, q1, #4
++      vld1.8          {d5}, [r4, : 64]
++      vshl.i32        q8, q4, #4
++      vtrn.32         d4, d5
++      vadd.i32        q7, q7, q1
++      vadd.i32        q8, q8, q4
++      vld1.8          {d18-d19}, [r2, : 128]!
++      vshl.i32        q10, q2, #4
++      vld1.8          {d22-d23}, [r2, : 128]!
++      vadd.i32        q10, q10, q2
++      vld1.8          {d24}, [r2, : 64]
++      vadd.i32        q5, q5, q0
++      add             r2, r3, #240
++      vld1.8          {d26-d27}, [r2, : 128]!
++      vadd.i32        q6, q6, q3
++      vld1.8          {d28-d29}, [r2, : 128]!
++      vadd.i32        q8, q8, q4
++      vld1.8          {d25}, [r2, : 64]
++      vadd.i32        q10, q10, q2
++      vtrn.32         q9, q13
++      vadd.i32        q7, q7, q1
++      vadd.i32        q5, q5, q0
++      vtrn.32         q11, q14
++      vadd.i32        q6, q6, q3
++      add             r2, sp, #560
++      vadd.i32        q10, q10, q2
++      vtrn.32         d24, d25
++      vst1.8          {d12-d13}, [r2, : 128]
++      vshl.i32        q6, q13, #1
++      add             r2, sp, #576
++      vst1.8          {d20-d21}, [r2, : 128]
++      vshl.i32        q10, q14, #1
++      add             r2, sp, #592
++      vst1.8          {d12-d13}, [r2, : 128]
++      vshl.i32        q15, q12, #1
++      vadd.i32        q8, q8, q4
++      vext.32         d10, d31, d30, #0
++      vadd.i32        q7, q7, q1
++      add             r2, sp, #608
++      vst1.8          {d16-d17}, [r2, : 128]
++      vmull.s32       q8, d18, d5
++      vmlal.s32       q8, d26, d4
++      vmlal.s32       q8, d19, d9
++      vmlal.s32       q8, d27, d3
++      vmlal.s32       q8, d22, d8
++      vmlal.s32       q8, d28, d2
++      vmlal.s32       q8, d23, d7
++      vmlal.s32       q8, d29, d1
++      vmlal.s32       q8, d24, d6
++      vmlal.s32       q8, d25, d0
++      add             r2, sp, #624
++      vst1.8          {d14-d15}, [r2, : 128]
++      vmull.s32       q2, d18, d4
++      vmlal.s32       q2, d12, d9
++      vmlal.s32       q2, d13, d8
++      vmlal.s32       q2, d19, d3
++      vmlal.s32       q2, d22, d2
++      vmlal.s32       q2, d23, d1
++      vmlal.s32       q2, d24, d0
++      add             r2, sp, #640
++      vst1.8          {d20-d21}, [r2, : 128]
++      vmull.s32       q7, d18, d9
++      vmlal.s32       q7, d26, d3
++      vmlal.s32       q7, d19, d8
++      vmlal.s32       q7, d27, d2
++      vmlal.s32       q7, d22, d7
++      vmlal.s32       q7, d28, d1
++      vmlal.s32       q7, d23, d6
++      vmlal.s32       q7, d29, d0
++      add             r2, sp, #656
++      vst1.8          {d10-d11}, [r2, : 128]
++      vmull.s32       q5, d18, d3
++      vmlal.s32       q5, d19, d2
++      vmlal.s32       q5, d22, d1
++      vmlal.s32       q5, d23, d0
++      vmlal.s32       q5, d12, d8
++      add             r2, sp, #672
++      vst1.8          {d16-d17}, [r2, : 128]
++      vmull.s32       q4, d18, d8
++      vmlal.s32       q4, d26, d2
++      vmlal.s32       q4, d19, d7
++      vmlal.s32       q4, d27, d1
++      vmlal.s32       q4, d22, d6
++      vmlal.s32       q4, d28, d0
++      vmull.s32       q8, d18, d7
++      vmlal.s32       q8, d26, d1
++      vmlal.s32       q8, d19, d6
++      vmlal.s32       q8, d27, d0
++      add             r2, sp, #576
++      vld1.8          {d20-d21}, [r2, : 128]
++      vmlal.s32       q7, d24, d21
++      vmlal.s32       q7, d25, d20
++      vmlal.s32       q4, d23, d21
++      vmlal.s32       q4, d29, d20
++      vmlal.s32       q8, d22, d21
++      vmlal.s32       q8, d28, d20
++      vmlal.s32       q5, d24, d20
++      add             r2, sp, #576
++      vst1.8          {d14-d15}, [r2, : 128]
++      vmull.s32       q7, d18, d6
++      vmlal.s32       q7, d26, d0
++      add             r2, sp, #656
++      vld1.8          {d30-d31}, [r2, : 128]
++      vmlal.s32       q2, d30, d21
++      vmlal.s32       q7, d19, d21
++      vmlal.s32       q7, d27, d20
++      add             r2, sp, #624
++      vld1.8          {d26-d27}, [r2, : 128]
++      vmlal.s32       q4, d25, d27
++      vmlal.s32       q8, d29, d27
++      vmlal.s32       q8, d25, d26
++      vmlal.s32       q7, d28, d27
++      vmlal.s32       q7, d29, d26
++      add             r2, sp, #608
++      vld1.8          {d28-d29}, [r2, : 128]
++      vmlal.s32       q4, d24, d29
++      vmlal.s32       q8, d23, d29
++      vmlal.s32       q8, d24, d28
++      vmlal.s32       q7, d22, d29
++      vmlal.s32       q7, d23, d28
++      add             r2, sp, #608
++      vst1.8          {d8-d9}, [r2, : 128]
++      add             r2, sp, #560
++      vld1.8          {d8-d9}, [r2, : 128]
++      vmlal.s32       q7, d24, d9
++      vmlal.s32       q7, d25, d31
++      vmull.s32       q1, d18, d2
++      vmlal.s32       q1, d19, d1
++      vmlal.s32       q1, d22, d0
++      vmlal.s32       q1, d24, d27
++      vmlal.s32       q1, d23, d20
++      vmlal.s32       q1, d12, d7
++      vmlal.s32       q1, d13, d6
++      vmull.s32       q6, d18, d1
++      vmlal.s32       q6, d19, d0
++      vmlal.s32       q6, d23, d27
++      vmlal.s32       q6, d22, d20
++      vmlal.s32       q6, d24, d26
++      vmull.s32       q0, d18, d0
++      vmlal.s32       q0, d22, d27
++      vmlal.s32       q0, d23, d26
++      vmlal.s32       q0, d24, d31
++      vmlal.s32       q0, d19, d20
++      add             r2, sp, #640
++      vld1.8          {d18-d19}, [r2, : 128]
++      vmlal.s32       q2, d18, d7
++      vmlal.s32       q2, d19, d6
++      vmlal.s32       q5, d18, d6
++      vmlal.s32       q5, d19, d21
++      vmlal.s32       q1, d18, d21
++      vmlal.s32       q1, d19, d29
++      vmlal.s32       q0, d18, d28
++      vmlal.s32       q0, d19, d9
++      vmlal.s32       q6, d18, d29
++      vmlal.s32       q6, d19, d28
++      add             r2, sp, #592
++      vld1.8          {d18-d19}, [r2, : 128]
++      add             r2, sp, #512
++      vld1.8          {d22-d23}, [r2, : 128]
++      vmlal.s32       q5, d19, d7
++      vmlal.s32       q0, d18, d21
++      vmlal.s32       q0, d19, d29
++      vmlal.s32       q6, d18, d6
++      add             r2, sp, #528
++      vld1.8          {d6-d7}, [r2, : 128]
++      vmlal.s32       q6, d19, d21
++      add             r2, sp, #576
++      vld1.8          {d18-d19}, [r2, : 128]
++      vmlal.s32       q0, d30, d8
++      add             r2, sp, #672
++      vld1.8          {d20-d21}, [r2, : 128]
++      vmlal.s32       q5, d30, d29
++      add             r2, sp, #608
++      vld1.8          {d24-d25}, [r2, : 128]
++      vmlal.s32       q1, d30, d28
++      vadd.i64        q13, q0, q11
++      vadd.i64        q14, q5, q11
++      vmlal.s32       q6, d30, d9
++      vshr.s64        q4, q13, #26
++      vshr.s64        q13, q14, #26
++      vadd.i64        q7, q7, q4
++      vshl.i64        q4, q4, #26
++      vadd.i64        q14, q7, q3
++      vadd.i64        q9, q9, q13
++      vshl.i64        q13, q13, #26
++      vadd.i64        q15, q9, q3
++      vsub.i64        q0, q0, q4
++      vshr.s64        q4, q14, #25
++      vsub.i64        q5, q5, q13
++      vshr.s64        q13, q15, #25
++      vadd.i64        q6, q6, q4
++      vshl.i64        q4, q4, #25
++      vadd.i64        q14, q6, q11
++      vadd.i64        q2, q2, q13
++      vsub.i64        q4, q7, q4
++      vshr.s64        q7, q14, #26
++      vshl.i64        q13, q13, #25
++      vadd.i64        q14, q2, q11
++      vadd.i64        q8, q8, q7
++      vshl.i64        q7, q7, #26
++      vadd.i64        q15, q8, q3
++      vsub.i64        q9, q9, q13
++      vshr.s64        q13, q14, #26
++      vsub.i64        q6, q6, q7
++      vshr.s64        q7, q15, #25
++      vadd.i64        q10, q10, q13
++      vshl.i64        q13, q13, #26
++      vadd.i64        q14, q10, q3
++      vadd.i64        q1, q1, q7
++      add             r2, r3, #240
++      vshl.i64        q7, q7, #25
++      add             r4, r3, #144
++      vadd.i64        q15, q1, q11
++      add             r2, r2, #8
++      vsub.i64        q2, q2, q13
++      add             r4, r4, #8
++      vshr.s64        q13, q14, #25
++      vsub.i64        q7, q8, q7
++      vshr.s64        q8, q15, #26
++      vadd.i64        q14, q13, q13
++      vadd.i64        q12, q12, q8
++      vtrn.32         d12, d14
++      vshl.i64        q8, q8, #26
++      vtrn.32         d13, d15
++      vadd.i64        q3, q12, q3
++      vadd.i64        q0, q0, q14
++      vst1.8          d12, [r2, : 64]!
++      vshl.i64        q7, q13, #4
++      vst1.8          d13, [r4, : 64]!
++      vsub.i64        q1, q1, q8
++      vshr.s64        q3, q3, #25
++      vadd.i64        q0, q0, q7
++      vadd.i64        q5, q5, q3
++      vshl.i64        q3, q3, #25
++      vadd.i64        q6, q5, q11
++      vadd.i64        q0, q0, q13
++      vshl.i64        q7, q13, #25
++      vadd.i64        q8, q0, q11
++      vsub.i64        q3, q12, q3
++      vshr.s64        q6, q6, #26
++      vsub.i64        q7, q10, q7
++      vtrn.32         d2, d6
++      vshr.s64        q8, q8, #26
++      vtrn.32         d3, d7
++      vadd.i64        q3, q9, q6
++      vst1.8          d2, [r2, : 64]
++      vshl.i64        q6, q6, #26
++      vst1.8          d3, [r4, : 64]
++      vadd.i64        q1, q4, q8
++      vtrn.32         d4, d14
++      vshl.i64        q4, q8, #26
++      vtrn.32         d5, d15
++      vsub.i64        q5, q5, q6
++      add             r2, r2, #16
++      vsub.i64        q0, q0, q4
++      vst1.8          d4, [r2, : 64]
++      add             r4, r4, #16
++      vst1.8          d5, [r4, : 64]
++      vtrn.32         d10, d6
++      vtrn.32         d11, d7
++      sub             r2, r2, #8
++      sub             r4, r4, #8
++      vtrn.32         d0, d2
++      vtrn.32         d1, d3
++      vst1.8          d10, [r2, : 64]
++      vst1.8          d11, [r4, : 64]
++      sub             r2, r2, #24
++      sub             r4, r4, #24
++      vst1.8          d0, [r2, : 64]
++      vst1.8          d1, [r4, : 64]
++      ldr             r2, [sp, #488]
++      ldr             r4, [sp, #492]
++      subs            r5, r2, #1
++      bge             ._mainloop
++      add             r1, r3, #144
++      add             r2, r3, #336
++      vld1.8          {d0-d1}, [r1, : 128]!
++      vld1.8          {d2-d3}, [r1, : 128]!
++      vld1.8          {d4}, [r1, : 64]
++      vst1.8          {d0-d1}, [r2, : 128]!
++      vst1.8          {d2-d3}, [r2, : 128]!
++      vst1.8          d4, [r2, : 64]
++      ldr             r1, =0
++._invertloop:
++      add             r2, r3, #144
++      ldr             r4, =0
++      ldr             r5, =2
++      cmp             r1, #1
++      ldreq           r5, =1
++      addeq           r2, r3, #336
++      addeq           r4, r3, #48
++      cmp             r1, #2
++      ldreq           r5, =1
++      addeq           r2, r3, #48
++      cmp             r1, #3
++      ldreq           r5, =5
++      addeq           r4, r3, #336
++      cmp             r1, #4
++      ldreq           r5, =10
++      cmp             r1, #5
++      ldreq           r5, =20
++      cmp             r1, #6
++      ldreq           r5, =10
++      addeq           r2, r3, #336
++      addeq           r4, r3, #336
++      cmp             r1, #7
++      ldreq           r5, =50
++      cmp             r1, #8
++      ldreq           r5, =100
++      cmp             r1, #9
++      ldreq           r5, =50
++      addeq           r2, r3, #336
++      cmp             r1, #10
++      ldreq           r5, =5
++      addeq           r2, r3, #48
++      cmp             r1, #11
++      ldreq           r5, =0
++      addeq           r2, r3, #96
++      add             r6, r3, #144
++      add             r7, r3, #288
++      vld1.8          {d0-d1}, [r6, : 128]!
++      vld1.8          {d2-d3}, [r6, : 128]!
++      vld1.8          {d4}, [r6, : 64]
++      vst1.8          {d0-d1}, [r7, : 128]!
++      vst1.8          {d2-d3}, [r7, : 128]!
++      vst1.8          d4, [r7, : 64]
++      cmp             r5, #0
++      beq             ._skipsquaringloop
++._squaringloop:
++      add             r6, r3, #288
++      add             r7, r3, #288
++      add             r8, r3, #288
++      vmov.i32        q0, #19
++      vmov.i32        q1, #0
++      vmov.i32        q2, #1
++      vzip.i32        q1, q2
++      vld1.8          {d4-d5}, [r7, : 128]!
++      vld1.8          {d6-d7}, [r7, : 128]!
++      vld1.8          {d9}, [r7, : 64]
++      vld1.8          {d10-d11}, [r6, : 128]!
++      add             r7, sp, #416
++      vld1.8          {d12-d13}, [r6, : 128]!
++      vmul.i32        q7, q2, q0
++      vld1.8          {d8}, [r6, : 64]
++      vext.32         d17, d11, d10, #1
++      vmul.i32        q9, q3, q0
++      vext.32         d16, d10, d8, #1
++      vshl.u32        q10, q5, q1
++      vext.32         d22, d14, d4, #1
++      vext.32         d24, d18, d6, #1
++      vshl.u32        q13, q6, q1
++      vshl.u32        d28, d8, d2
++      vrev64.i32      d22, d22
++      vmul.i32        d1, d9, d1
++      vrev64.i32      d24, d24
++      vext.32         d29, d8, d13, #1
++      vext.32         d0, d1, d9, #1
++      vrev64.i32      d0, d0
++      vext.32         d2, d9, d1, #1
++      vext.32         d23, d15, d5, #1
++      vmull.s32       q4, d20, d4
++      vrev64.i32      d23, d23
++      vmlal.s32       q4, d21, d1
++      vrev64.i32      d2, d2
++      vmlal.s32       q4, d26, d19
++      vext.32         d3, d5, d15, #1
++      vmlal.s32       q4, d27, d18
++      vrev64.i32      d3, d3
++      vmlal.s32       q4, d28, d15
++      vext.32         d14, d12, d11, #1
++      vmull.s32       q5, d16, d23
++      vext.32         d15, d13, d12, #1
++      vmlal.s32       q5, d17, d4
++      vst1.8          d8, [r7, : 64]!
++      vmlal.s32       q5, d14, d1
++      vext.32         d12, d9, d8, #0
++      vmlal.s32       q5, d15, d19
++      vmov.i64        d13, #0
++      vmlal.s32       q5, d29, d18
++      vext.32         d25, d19, d7, #1
++      vmlal.s32       q6, d20, d5
++      vrev64.i32      d25, d25
++      vmlal.s32       q6, d21, d4
++      vst1.8          d11, [r7, : 64]!
++      vmlal.s32       q6, d26, d1
++      vext.32         d9, d10, d10, #0
++      vmlal.s32       q6, d27, d19
++      vmov.i64        d8, #0
++      vmlal.s32       q6, d28, d18
++      vmlal.s32       q4, d16, d24
++      vmlal.s32       q4, d17, d5
++      vmlal.s32       q4, d14, d4
++      vst1.8          d12, [r7, : 64]!
++      vmlal.s32       q4, d15, d1
++      vext.32         d10, d13, d12, #0
++      vmlal.s32       q4, d29, d19
++      vmov.i64        d11, #0
++      vmlal.s32       q5, d20, d6
++      vmlal.s32       q5, d21, d5
++      vmlal.s32       q5, d26, d4
++      vext.32         d13, d8, d8, #0
++      vmlal.s32       q5, d27, d1
++      vmov.i64        d12, #0
++      vmlal.s32       q5, d28, d19
++      vst1.8          d9, [r7, : 64]!
++      vmlal.s32       q6, d16, d25
++      vmlal.s32       q6, d17, d6
++      vst1.8          d10, [r7, : 64]
++      vmlal.s32       q6, d14, d5
++      vext.32         d8, d11, d10, #0
++      vmlal.s32       q6, d15, d4
++      vmov.i64        d9, #0
++      vmlal.s32       q6, d29, d1
++      vmlal.s32       q4, d20, d7
++      vmlal.s32       q4, d21, d6
++      vmlal.s32       q4, d26, d5
++      vext.32         d11, d12, d12, #0
++      vmlal.s32       q4, d27, d4
++      vmov.i64        d10, #0
++      vmlal.s32       q4, d28, d1
++      vmlal.s32       q5, d16, d0
++      sub             r6, r7, #32
++      vmlal.s32       q5, d17, d7
++      vmlal.s32       q5, d14, d6
++      vext.32         d30, d9, d8, #0
++      vmlal.s32       q5, d15, d5
++      vld1.8          {d31}, [r6, : 64]!
++      vmlal.s32       q5, d29, d4
++      vmlal.s32       q15, d20, d0
++      vext.32         d0, d6, d18, #1
++      vmlal.s32       q15, d21, d25
++      vrev64.i32      d0, d0
++      vmlal.s32       q15, d26, d24
++      vext.32         d1, d7, d19, #1
++      vext.32         d7, d10, d10, #0
++      vmlal.s32       q15, d27, d23
++      vrev64.i32      d1, d1
++      vld1.8          {d6}, [r6, : 64]
++      vmlal.s32       q15, d28, d22
++      vmlal.s32       q3, d16, d4
++      add             r6, r6, #24
++      vmlal.s32       q3, d17, d2
++      vext.32         d4, d31, d30, #0
++      vmov            d17, d11
++      vmlal.s32       q3, d14, d1
++      vext.32         d11, d13, d13, #0
++      vext.32         d13, d30, d30, #0
++      vmlal.s32       q3, d15, d0
++      vext.32         d1, d8, d8, #0
++      vmlal.s32       q3, d29, d3
++      vld1.8          {d5}, [r6, : 64]
++      sub             r6, r6, #16
++      vext.32         d10, d6, d6, #0
++      vmov.i32        q1, #0xffffffff
++      vshl.i64        q4, q1, #25
++      add             r7, sp, #512
++      vld1.8          {d14-d15}, [r7, : 128]
++      vadd.i64        q9, q2, q7
++      vshl.i64        q1, q1, #26
++      vshr.s64        q10, q9, #26
++      vld1.8          {d0}, [r6, : 64]!
++      vadd.i64        q5, q5, q10
++      vand            q9, q9, q1
++      vld1.8          {d16}, [r6, : 64]!
++      add             r6, sp, #528
++      vld1.8          {d20-d21}, [r6, : 128]
++      vadd.i64        q11, q5, q10
++      vsub.i64        q2, q2, q9
++      vshr.s64        q9, q11, #25
++      vext.32         d12, d5, d4, #0
++      vand            q11, q11, q4
++      vadd.i64        q0, q0, q9
++      vmov            d19, d7
++      vadd.i64        q3, q0, q7
++      vsub.i64        q5, q5, q11
++      vshr.s64        q11, q3, #26
++      vext.32         d18, d11, d10, #0
++      vand            q3, q3, q1
++      vadd.i64        q8, q8, q11
++      vadd.i64        q11, q8, q10
++      vsub.i64        q0, q0, q3
++      vshr.s64        q3, q11, #25
++      vand            q11, q11, q4
++      vadd.i64        q3, q6, q3
++      vadd.i64        q6, q3, q7
++      vsub.i64        q8, q8, q11
++      vshr.s64        q11, q6, #26
++      vand            q6, q6, q1
++      vadd.i64        q9, q9, q11
++      vadd.i64        d25, d19, d21
++      vsub.i64        q3, q3, q6
++      vshr.s64        d23, d25, #25
++      vand            q4, q12, q4
++      vadd.i64        d21, d23, d23
++      vshl.i64        d25, d23, #4
++      vadd.i64        d21, d21, d23
++      vadd.i64        d25, d25, d21
++      vadd.i64        d4, d4, d25
++      vzip.i32        q0, q8
++      vadd.i64        d12, d4, d14
++      add             r6, r8, #8
++      vst1.8          d0, [r6, : 64]
++      vsub.i64        d19, d19, d9
++      add             r6, r6, #16
++      vst1.8          d16, [r6, : 64]
++      vshr.s64        d22, d12, #26
++      vand            q0, q6, q1
++      vadd.i64        d10, d10, d22
++      vzip.i32        q3, q9
++      vsub.i64        d4, d4, d0
++      sub             r6, r6, #8
++      vst1.8          d6, [r6, : 64]
++      add             r6, r6, #16
++      vst1.8          d18, [r6, : 64]
++      vzip.i32        q2, q5
++      sub             r6, r6, #32
++      vst1.8          d4, [r6, : 64]
++      subs            r5, r5, #1
++      bhi             ._squaringloop
++._skipsquaringloop:
++      mov             r2, r2
++      add             r5, r3, #288
++      add             r6, r3, #144
++      vmov.i32        q0, #19
++      vmov.i32        q1, #0
++      vmov.i32        q2, #1
++      vzip.i32        q1, q2
++      vld1.8          {d4-d5}, [r5, : 128]!
++      vld1.8          {d6-d7}, [r5, : 128]!
++      vld1.8          {d9}, [r5, : 64]
++      vld1.8          {d10-d11}, [r2, : 128]!
++      add             r5, sp, #416
++      vld1.8          {d12-d13}, [r2, : 128]!
++      vmul.i32        q7, q2, q0
++      vld1.8          {d8}, [r2, : 64]
++      vext.32         d17, d11, d10, #1
++      vmul.i32        q9, q3, q0
++      vext.32         d16, d10, d8, #1
++      vshl.u32        q10, q5, q1
++      vext.32         d22, d14, d4, #1
++      vext.32         d24, d18, d6, #1
++      vshl.u32        q13, q6, q1
++      vshl.u32        d28, d8, d2
++      vrev64.i32      d22, d22
++      vmul.i32        d1, d9, d1
++      vrev64.i32      d24, d24
++      vext.32         d29, d8, d13, #1
++      vext.32         d0, d1, d9, #1
++      vrev64.i32      d0, d0
++      vext.32         d2, d9, d1, #1
++      vext.32         d23, d15, d5, #1
++      vmull.s32       q4, d20, d4
++      vrev64.i32      d23, d23
++      vmlal.s32       q4, d21, d1
++      vrev64.i32      d2, d2
++      vmlal.s32       q4, d26, d19
++      vext.32         d3, d5, d15, #1
++      vmlal.s32       q4, d27, d18
++      vrev64.i32      d3, d3
++      vmlal.s32       q4, d28, d15
++      vext.32         d14, d12, d11, #1
++      vmull.s32       q5, d16, d23
++      vext.32         d15, d13, d12, #1
++      vmlal.s32       q5, d17, d4
++      vst1.8          d8, [r5, : 64]!
++      vmlal.s32       q5, d14, d1
++      vext.32         d12, d9, d8, #0
++      vmlal.s32       q5, d15, d19
++      vmov.i64        d13, #0
++      vmlal.s32       q5, d29, d18
++      vext.32         d25, d19, d7, #1
++      vmlal.s32       q6, d20, d5
++      vrev64.i32      d25, d25
++      vmlal.s32       q6, d21, d4
++      vst1.8          d11, [r5, : 64]!
++      vmlal.s32       q6, d26, d1
++      vext.32         d9, d10, d10, #0
++      vmlal.s32       q6, d27, d19
++      vmov.i64        d8, #0
++      vmlal.s32       q6, d28, d18
++      vmlal.s32       q4, d16, d24
++      vmlal.s32       q4, d17, d5
++      vmlal.s32       q4, d14, d4
++      vst1.8          d12, [r5, : 64]!
++      vmlal.s32       q4, d15, d1
++      vext.32         d10, d13, d12, #0
++      vmlal.s32       q4, d29, d19
++      vmov.i64        d11, #0
++      vmlal.s32       q5, d20, d6
++      vmlal.s32       q5, d21, d5
++      vmlal.s32       q5, d26, d4
++      vext.32         d13, d8, d8, #0
++      vmlal.s32       q5, d27, d1
++      vmov.i64        d12, #0
++      vmlal.s32       q5, d28, d19
++      vst1.8          d9, [r5, : 64]!
++      vmlal.s32       q6, d16, d25
++      vmlal.s32       q6, d17, d6
++      vst1.8          d10, [r5, : 64]
++      vmlal.s32       q6, d14, d5
++      vext.32         d8, d11, d10, #0
++      vmlal.s32       q6, d15, d4
++      vmov.i64        d9, #0
++      vmlal.s32       q6, d29, d1
++      vmlal.s32       q4, d20, d7
++      vmlal.s32       q4, d21, d6
++      vmlal.s32       q4, d26, d5
++      vext.32         d11, d12, d12, #0
++      vmlal.s32       q4, d27, d4
++      vmov.i64        d10, #0
++      vmlal.s32       q4, d28, d1
++      vmlal.s32       q5, d16, d0
++      sub             r2, r5, #32
++      vmlal.s32       q5, d17, d7
++      vmlal.s32       q5, d14, d6
++      vext.32         d30, d9, d8, #0
++      vmlal.s32       q5, d15, d5
++      vld1.8          {d31}, [r2, : 64]!
++      vmlal.s32       q5, d29, d4
++      vmlal.s32       q15, d20, d0
++      vext.32         d0, d6, d18, #1
++      vmlal.s32       q15, d21, d25
++      vrev64.i32      d0, d0
++      vmlal.s32       q15, d26, d24
++      vext.32         d1, d7, d19, #1
++      vext.32         d7, d10, d10, #0
++      vmlal.s32       q15, d27, d23
++      vrev64.i32      d1, d1
++      vld1.8          {d6}, [r2, : 64]
++      vmlal.s32       q15, d28, d22
++      vmlal.s32       q3, d16, d4
++      add             r2, r2, #24
++      vmlal.s32       q3, d17, d2
++      vext.32         d4, d31, d30, #0
++      vmov            d17, d11
++      vmlal.s32       q3, d14, d1
++      vext.32         d11, d13, d13, #0
++      vext.32         d13, d30, d30, #0
++      vmlal.s32       q3, d15, d0
++      vext.32         d1, d8, d8, #0
++      vmlal.s32       q3, d29, d3
++      vld1.8          {d5}, [r2, : 64]
++      sub             r2, r2, #16
++      vext.32         d10, d6, d6, #0
++      vmov.i32        q1, #0xffffffff
++      vshl.i64        q4, q1, #25
++      add             r5, sp, #512
++      vld1.8          {d14-d15}, [r5, : 128]
++      vadd.i64        q9, q2, q7
++      vshl.i64        q1, q1, #26
++      vshr.s64        q10, q9, #26
++      vld1.8          {d0}, [r2, : 64]!
++      vadd.i64        q5, q5, q10
++      vand            q9, q9, q1
++      vld1.8          {d16}, [r2, : 64]!
++      add             r2, sp, #528
++      vld1.8          {d20-d21}, [r2, : 128]
++      vadd.i64        q11, q5, q10
++      vsub.i64        q2, q2, q9
++      vshr.s64        q9, q11, #25
++      vext.32         d12, d5, d4, #0
++      vand            q11, q11, q4
++      vadd.i64        q0, q0, q9
++      vmov            d19, d7
++      vadd.i64        q3, q0, q7
++      vsub.i64        q5, q5, q11
++      vshr.s64        q11, q3, #26
++      vext.32         d18, d11, d10, #0
++      vand            q3, q3, q1
++      vadd.i64        q8, q8, q11
++      vadd.i64        q11, q8, q10
++      vsub.i64        q0, q0, q3
++      vshr.s64        q3, q11, #25
++      vand            q11, q11, q4
++      vadd.i64        q3, q6, q3
++      vadd.i64        q6, q3, q7
++      vsub.i64        q8, q8, q11
++      vshr.s64        q11, q6, #26
++      vand            q6, q6, q1
++      vadd.i64        q9, q9, q11
++      vadd.i64        d25, d19, d21
++      vsub.i64        q3, q3, q6
++      vshr.s64        d23, d25, #25
++      vand            q4, q12, q4
++      vadd.i64        d21, d23, d23
++      vshl.i64        d25, d23, #4
++      vadd.i64        d21, d21, d23
++      vadd.i64        d25, d25, d21
++      vadd.i64        d4, d4, d25
++      vzip.i32        q0, q8
++      vadd.i64        d12, d4, d14
++      add             r2, r6, #8
++      vst1.8          d0, [r2, : 64]
++      vsub.i64        d19, d19, d9
++      add             r2, r2, #16
++      vst1.8          d16, [r2, : 64]
++      vshr.s64        d22, d12, #26
++      vand            q0, q6, q1
++      vadd.i64        d10, d10, d22
++      vzip.i32        q3, q9
++      vsub.i64        d4, d4, d0
++      sub             r2, r2, #8
++      vst1.8          d6, [r2, : 64]
++      add             r2, r2, #16
++      vst1.8          d18, [r2, : 64]
++      vzip.i32        q2, q5
++      sub             r2, r2, #32
++      vst1.8          d4, [r2, : 64]
++      cmp             r4, #0
++      beq             ._skippostcopy
++      add             r2, r3, #144
++      mov             r4, r4
++      vld1.8          {d0-d1}, [r2, : 128]!
++      vld1.8          {d2-d3}, [r2, : 128]!
++      vld1.8          {d4}, [r2, : 64]
++      vst1.8          {d0-d1}, [r4, : 128]!
++      vst1.8          {d2-d3}, [r4, : 128]!
++      vst1.8          d4, [r4, : 64]
++._skippostcopy:
++      cmp             r1, #1
++      bne             ._skipfinalcopy
++      add             r2, r3, #288
++      add             r4, r3, #144
++      vld1.8          {d0-d1}, [r2, : 128]!
++      vld1.8          {d2-d3}, [r2, : 128]!
++      vld1.8          {d4}, [r2, : 64]
++      vst1.8          {d0-d1}, [r4, : 128]!
++      vst1.8          {d2-d3}, [r4, : 128]!
++      vst1.8          d4, [r4, : 64]
++._skipfinalcopy:
++      add             r1, r1, #1
++      cmp             r1, #12
++      blo             ._invertloop
++      add             r1, r3, #144
++      ldr             r2, [r1], #4
++      ldr             r3, [r1], #4
++      ldr             r4, [r1], #4
++      ldr             r5, [r1], #4
++      ldr             r6, [r1], #4
++      ldr             r7, [r1], #4
++      ldr             r8, [r1], #4
++      ldr             r9, [r1], #4
++      ldr             r10, [r1], #4
++      ldr             r1, [r1]
++      add             r11, r1, r1, LSL #4
++      add             r11, r11, r1, LSL #1
++      add             r11, r11, #16777216
++      mov             r11, r11, ASR #25
++      add             r11, r11, r2
++      mov             r11, r11, ASR #26
++      add             r11, r11, r3
++      mov             r11, r11, ASR #25
++      add             r11, r11, r4
++      mov             r11, r11, ASR #26
++      add             r11, r11, r5
++      mov             r11, r11, ASR #25
++      add             r11, r11, r6
++      mov             r11, r11, ASR #26
++      add             r11, r11, r7
++      mov             r11, r11, ASR #25
++      add             r11, r11, r8
++      mov             r11, r11, ASR #26
++      add             r11, r11, r9
++      mov             r11, r11, ASR #25
++      add             r11, r11, r10
++      mov             r11, r11, ASR #26
++      add             r11, r11, r1
++      mov             r11, r11, ASR #25
++      add             r2, r2, r11
++      add             r2, r2, r11, LSL #1
++      add             r2, r2, r11, LSL #4
++      mov             r11, r2, ASR #26
++      add             r3, r3, r11
++      sub             r2, r2, r11, LSL #26
++      mov             r11, r3, ASR #25
++      add             r4, r4, r11
++      sub             r3, r3, r11, LSL #25
++      mov             r11, r4, ASR #26
++      add             r5, r5, r11
++      sub             r4, r4, r11, LSL #26
++      mov             r11, r5, ASR #25
++      add             r6, r6, r11
++      sub             r5, r5, r11, LSL #25
++      mov             r11, r6, ASR #26
++      add             r7, r7, r11
++      sub             r6, r6, r11, LSL #26
++      mov             r11, r7, ASR #25
++      add             r8, r8, r11
++      sub             r7, r7, r11, LSL #25
++      mov             r11, r8, ASR #26
++      add             r9, r9, r11
++      sub             r8, r8, r11, LSL #26
++      mov             r11, r9, ASR #25
++      add             r10, r10, r11
++      sub             r9, r9, r11, LSL #25
++      mov             r11, r10, ASR #26
++      add             r1, r1, r11
++      sub             r10, r10, r11, LSL #26
++      mov             r11, r1, ASR #25
++      sub             r1, r1, r11, LSL #25
++      add             r2, r2, r3, LSL #26
++      mov             r3, r3, LSR #6
++      add             r3, r3, r4, LSL #19
++      mov             r4, r4, LSR #13
++      add             r4, r4, r5, LSL #13
++      mov             r5, r5, LSR #19
++      add             r5, r5, r6, LSL #6
++      add             r6, r7, r8, LSL #25
++      mov             r7, r8, LSR #7
++      add             r7, r7, r9, LSL #19
++      mov             r8, r9, LSR #13
++      add             r8, r8, r10, LSL #12
++      mov             r9, r10, LSR #20
++      add             r1, r9, r1, LSL #6
++      str             r2, [r0], #4
++      str             r3, [r0], #4
++      str             r4, [r0], #4
++      str             r5, [r0], #4
++      str             r6, [r0], #4
++      str             r7, [r0], #4
++      str             r8, [r0], #4
++      str             r1, [r0]
++      ldrd            r4, [sp, #0]
++      ldrd            r6, [sp, #8]
++      ldrd            r8, [sp, #16]
++      ldrd            r10, [sp, #24]
++      ldr             r12, [sp, #480]
++      ldr             r14, [sp, #484]
++      ldr             r0, =0
++      mov             sp, r12
++      vpop            {q4, q5, q6, q7}
++      bx              lr
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0031-crypto-arm-curve25519-wire-up-NEON-implementation.patch b/target/linux/generic/backport-5.4/080-wireguard-0031-crypto-arm-curve25519-wire-up-NEON-implementation.patch

new file mode 100644 (file)

index 0000000..14a75e1
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0031-crypto-arm-curve25519-wire-up-NEON-implementation.patch
@@ -0,0 +1,1058 @@
+From ec96c25c1ce09c78e44bd4627bc0a3e610b7f5d8 Mon Sep 17 00:00:00 2001
+From: "Jason A. Donenfeld" <Jason@zx2c4.com>
+Date: Fri, 8 Nov 2019 13:22:38 +0100
+Subject: [PATCH 031/124] crypto: arm/curve25519 - wire up NEON implementation
+
+commit d8f1308a025fc7e00414194ed742d5f05a21e13c upstream.
+
+This ports the SUPERCOP implementation for usage in kernel space. In
+addition to the usual header, macro, and style changes required for
+kernel space, it makes a few small changes to the code:
+
+  - The stack alignment is relaxed to 16 bytes.
+  - Superfluous mov statements have been removed.
+  - ldr for constants has been replaced with movw.
+  - ldreq has been replaced with moveq.
+  - The str epilogue has been made more idiomatic.
+  - SIMD registers are not pushed and popped at the beginning and end.
+  - The prologue and epilogue have been made idiomatic.
+  - A hole has been removed from the stack, saving 32 bytes.
+  - We write-back the base register whenever possible for vld1.8.
+  - Some multiplications have been reordered for better A7 performance.
+
+There are more opportunities for cleanup, since this code is from qhasm,
+which doesn't always do the most opportune thing. But even prior to
+extensive hand optimizations, this code delivers significant performance
+improvements (given in get_cycles() per call):
+
+                     ----------- -------------
+                    | generic C | this commit |
+        ------------ ----------- -------------
+       | Cortex-A7  |     49136 |       22395 |
+        ------------ ----------- -------------
+       | Cortex-A17 |     17326 |        4983 |
+        ------------ ----------- -------------
+
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+[ardb: - move to arch/arm/crypto
+       - wire into lib/crypto framework
+       - implement crypto API KPP hooks ]
+Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
+Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ arch/arm/crypto/Kconfig           |   6 +
+ arch/arm/crypto/Makefile          |   2 +
+ arch/arm/crypto/curve25519-core.S | 347 +++++++++++++-----------------
+ arch/arm/crypto/curve25519-glue.c | 127 +++++++++++
+ 4 files changed, 287 insertions(+), 195 deletions(-)
+ create mode 100644 arch/arm/crypto/curve25519-glue.c
+
+--- a/arch/arm/crypto/Kconfig
++++ b/arch/arm/crypto/Kconfig
+@@ -141,4 +141,10 @@ config CRYPTO_NHPOLY1305_NEON
+       depends on KERNEL_MODE_NEON
+       select CRYPTO_NHPOLY1305
+ 
++config CRYPTO_CURVE25519_NEON
++      tristate "NEON accelerated Curve25519 scalar multiplication library"
++      depends on KERNEL_MODE_NEON
++      select CRYPTO_LIB_CURVE25519_GENERIC
++      select CRYPTO_ARCH_HAVE_LIB_CURVE25519
++
+ endif
+--- a/arch/arm/crypto/Makefile
++++ b/arch/arm/crypto/Makefile
+@@ -12,6 +12,7 @@ obj-$(CONFIG_CRYPTO_SHA512_ARM) += sha51
+ obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o
+ obj-$(CONFIG_CRYPTO_POLY1305_ARM) += poly1305-arm.o
+ obj-$(CONFIG_CRYPTO_NHPOLY1305_NEON) += nhpoly1305-neon.o
++obj-$(CONFIG_CRYPTO_CURVE25519_NEON) += curve25519-neon.o
+ 
+ ce-obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o
+ ce-obj-$(CONFIG_CRYPTO_SHA1_ARM_CE) += sha1-arm-ce.o
+@@ -58,6 +59,7 @@ chacha-neon-y := chacha-scalar-core.o ch
+ chacha-neon-$(CONFIG_KERNEL_MODE_NEON) += chacha-neon-core.o
+ poly1305-arm-y := poly1305-core.o poly1305-glue.o
+ nhpoly1305-neon-y := nh-neon-core.o nhpoly1305-neon-glue.o
++curve25519-neon-y := curve25519-core.o curve25519-glue.o
+ 
+ ifdef REGENERATE_ARM_CRYPTO
+ quiet_cmd_perl = PERL    $@
+--- a/arch/arm/crypto/curve25519-core.S
++++ b/arch/arm/crypto/curve25519-core.S
+@@ -1,43 +1,35 @@
++/* SPDX-License-Identifier: GPL-2.0 OR MIT */
+ /*
+- * Public domain code from Daniel J. Bernstein and Peter Schwabe, from
+- * SUPERCOP's curve25519/neon2/scalarmult.s.
++ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
++ *
++ * Based on public domain code from Daniel J. Bernstein and Peter Schwabe. This
++ * began from SUPERCOP's curve25519/neon2/scalarmult.s, but has subsequently been
++ * manually reworked for use in kernel space.
+  */
+ 
+-.fpu neon
++#include <linux/linkage.h>
++
+ .text
++.fpu neon
++.arch armv7-a
+ .align 4
+-.global _crypto_scalarmult_curve25519_neon2
+-.global crypto_scalarmult_curve25519_neon2
+-.type _crypto_scalarmult_curve25519_neon2 STT_FUNC
+-.type crypto_scalarmult_curve25519_neon2 STT_FUNC
+-      _crypto_scalarmult_curve25519_neon2:
+-      crypto_scalarmult_curve25519_neon2:
+-      vpush           {q4, q5, q6, q7}
+-      mov             r12, sp
+-      sub             sp, sp, #736
+-      and             sp, sp, #0xffffffe0
+-      strd            r4, [sp, #0]
+-      strd            r6, [sp, #8]
+-      strd            r8, [sp, #16]
+-      strd            r10, [sp, #24]
+-      str             r12, [sp, #480]
+-      str             r14, [sp, #484]
+-      mov             r0, r0
+-      mov             r1, r1
+-      mov             r2, r2
+-      add             r3, sp, #32
+-      ldr             r4, =0
+-      ldr             r5, =254
++
++ENTRY(curve25519_neon)
++      push            {r4-r11, lr}
++      mov             ip, sp
++      sub             r3, sp, #704
++      and             r3, r3, #0xfffffff0
++      mov             sp, r3
++      movw            r4, #0
++      movw            r5, #254
+       vmov.i32        q0, #1
+       vshr.u64        q1, q0, #7
+       vshr.u64        q0, q0, #8
+       vmov.i32        d4, #19
+       vmov.i32        d5, #38
+-      add             r6, sp, #512
+-      vst1.8          {d2-d3}, [r6, : 128]
+-      add             r6, sp, #528
+-      vst1.8          {d0-d1}, [r6, : 128]
+-      add             r6, sp, #544
++      add             r6, sp, #480
++      vst1.8          {d2-d3}, [r6, : 128]!
++      vst1.8          {d0-d1}, [r6, : 128]!
+       vst1.8          {d4-d5}, [r6, : 128]
+       add             r6, r3, #0
+       vmov.i32        q2, #0
+@@ -45,12 +37,12 @@
+       vst1.8          {d4-d5}, [r6, : 128]!
+       vst1.8          d4, [r6, : 64]
+       add             r6, r3, #0
+-      ldr             r7, =960
++      movw            r7, #960
+       sub             r7, r7, #2
+       neg             r7, r7
+       sub             r7, r7, r7, LSL #7
+       str             r7, [r6]
+-      add             r6, sp, #704
++      add             r6, sp, #672
+       vld1.8          {d4-d5}, [r1]!
+       vld1.8          {d6-d7}, [r1]
+       vst1.8          {d4-d5}, [r6, : 128]!
+@@ -212,15 +204,15 @@
+       vst1.8          {d0-d1}, [r6, : 128]!
+       vst1.8          {d2-d3}, [r6, : 128]!
+       vst1.8          d4, [r6, : 64]
+-._mainloop:
++.Lmainloop:
+       mov             r2, r5, LSR #3
+       and             r6, r5, #7
+       ldrb            r2, [r1, r2]
+       mov             r2, r2, LSR r6
+       and             r2, r2, #1
+-      str             r5, [sp, #488]
++      str             r5, [sp, #456]
+       eor             r4, r4, r2
+-      str             r2, [sp, #492]
++      str             r2, [sp, #460]
+       neg             r2, r4
+       add             r4, r3, #96
+       add             r5, r3, #192
+@@ -291,7 +283,7 @@
+       vsub.i32        q0, q1, q3
+       vst1.8          d4, [r4, : 64]
+       vst1.8          d0, [r6, : 64]
+-      add             r2, sp, #544
++      add             r2, sp, #512
+       add             r4, r3, #96
+       add             r5, r3, #144
+       vld1.8          {d0-d1}, [r2, : 128]
+@@ -361,14 +353,13 @@
+       vmlal.s32       q0, d12, d8
+       vmlal.s32       q0, d13, d17
+       vmlal.s32       q0, d6, d6
+-      add             r2, sp, #512
+-      vld1.8          {d18-d19}, [r2, : 128]
++      add             r2, sp, #480
++      vld1.8          {d18-d19}, [r2, : 128]!
+       vmull.s32       q3, d16, d7
+       vmlal.s32       q3, d10, d15
+       vmlal.s32       q3, d11, d14
+       vmlal.s32       q3, d12, d9
+       vmlal.s32       q3, d13, d8
+-      add             r2, sp, #528
+       vld1.8          {d8-d9}, [r2, : 128]
+       vadd.i64        q5, q12, q9
+       vadd.i64        q6, q15, q9
+@@ -502,22 +493,19 @@
+       vadd.i32        q5, q5, q0
+       vtrn.32         q11, q14
+       vadd.i32        q6, q6, q3
+-      add             r2, sp, #560
++      add             r2, sp, #528
+       vadd.i32        q10, q10, q2
+       vtrn.32         d24, d25
+-      vst1.8          {d12-d13}, [r2, : 128]
++      vst1.8          {d12-d13}, [r2, : 128]!
+       vshl.i32        q6, q13, #1
+-      add             r2, sp, #576
+-      vst1.8          {d20-d21}, [r2, : 128]
++      vst1.8          {d20-d21}, [r2, : 128]!
+       vshl.i32        q10, q14, #1
+-      add             r2, sp, #592
+-      vst1.8          {d12-d13}, [r2, : 128]
++      vst1.8          {d12-d13}, [r2, : 128]!
+       vshl.i32        q15, q12, #1
+       vadd.i32        q8, q8, q4
+       vext.32         d10, d31, d30, #0
+       vadd.i32        q7, q7, q1
+-      add             r2, sp, #608
+-      vst1.8          {d16-d17}, [r2, : 128]
++      vst1.8          {d16-d17}, [r2, : 128]!
+       vmull.s32       q8, d18, d5
+       vmlal.s32       q8, d26, d4
+       vmlal.s32       q8, d19, d9
+@@ -528,8 +516,7 @@
+       vmlal.s32       q8, d29, d1
+       vmlal.s32       q8, d24, d6
+       vmlal.s32       q8, d25, d0
+-      add             r2, sp, #624
+-      vst1.8          {d14-d15}, [r2, : 128]
++      vst1.8          {d14-d15}, [r2, : 128]!
+       vmull.s32       q2, d18, d4
+       vmlal.s32       q2, d12, d9
+       vmlal.s32       q2, d13, d8
+@@ -537,8 +524,7 @@
+       vmlal.s32       q2, d22, d2
+       vmlal.s32       q2, d23, d1
+       vmlal.s32       q2, d24, d0
+-      add             r2, sp, #640
+-      vst1.8          {d20-d21}, [r2, : 128]
++      vst1.8          {d20-d21}, [r2, : 128]!
+       vmull.s32       q7, d18, d9
+       vmlal.s32       q7, d26, d3
+       vmlal.s32       q7, d19, d8
+@@ -547,14 +533,12 @@
+       vmlal.s32       q7, d28, d1
+       vmlal.s32       q7, d23, d6
+       vmlal.s32       q7, d29, d0
+-      add             r2, sp, #656
+-      vst1.8          {d10-d11}, [r2, : 128]
++      vst1.8          {d10-d11}, [r2, : 128]!
+       vmull.s32       q5, d18, d3
+       vmlal.s32       q5, d19, d2
+       vmlal.s32       q5, d22, d1
+       vmlal.s32       q5, d23, d0
+       vmlal.s32       q5, d12, d8
+-      add             r2, sp, #672
+       vst1.8          {d16-d17}, [r2, : 128]
+       vmull.s32       q4, d18, d8
+       vmlal.s32       q4, d26, d2
+@@ -566,7 +550,7 @@
+       vmlal.s32       q8, d26, d1
+       vmlal.s32       q8, d19, d6
+       vmlal.s32       q8, d27, d0
+-      add             r2, sp, #576
++      add             r2, sp, #544
+       vld1.8          {d20-d21}, [r2, : 128]
+       vmlal.s32       q7, d24, d21
+       vmlal.s32       q7, d25, d20
+@@ -575,32 +559,30 @@
+       vmlal.s32       q8, d22, d21
+       vmlal.s32       q8, d28, d20
+       vmlal.s32       q5, d24, d20
+-      add             r2, sp, #576
+       vst1.8          {d14-d15}, [r2, : 128]
+       vmull.s32       q7, d18, d6
+       vmlal.s32       q7, d26, d0
+-      add             r2, sp, #656
++      add             r2, sp, #624
+       vld1.8          {d30-d31}, [r2, : 128]
+       vmlal.s32       q2, d30, d21
+       vmlal.s32       q7, d19, d21
+       vmlal.s32       q7, d27, d20
+-      add             r2, sp, #624
++      add             r2, sp, #592
+       vld1.8          {d26-d27}, [r2, : 128]
+       vmlal.s32       q4, d25, d27
+       vmlal.s32       q8, d29, d27
+       vmlal.s32       q8, d25, d26
+       vmlal.s32       q7, d28, d27
+       vmlal.s32       q7, d29, d26
+-      add             r2, sp, #608
++      add             r2, sp, #576
+       vld1.8          {d28-d29}, [r2, : 128]
+       vmlal.s32       q4, d24, d29
+       vmlal.s32       q8, d23, d29
+       vmlal.s32       q8, d24, d28
+       vmlal.s32       q7, d22, d29
+       vmlal.s32       q7, d23, d28
+-      add             r2, sp, #608
+       vst1.8          {d8-d9}, [r2, : 128]
+-      add             r2, sp, #560
++      add             r2, sp, #528
+       vld1.8          {d8-d9}, [r2, : 128]
+       vmlal.s32       q7, d24, d9
+       vmlal.s32       q7, d25, d31
+@@ -621,36 +603,36 @@
+       vmlal.s32       q0, d23, d26
+       vmlal.s32       q0, d24, d31
+       vmlal.s32       q0, d19, d20
+-      add             r2, sp, #640
++      add             r2, sp, #608
+       vld1.8          {d18-d19}, [r2, : 128]
+       vmlal.s32       q2, d18, d7
+-      vmlal.s32       q2, d19, d6
+       vmlal.s32       q5, d18, d6
+-      vmlal.s32       q5, d19, d21
+       vmlal.s32       q1, d18, d21
+-      vmlal.s32       q1, d19, d29
+       vmlal.s32       q0, d18, d28
+-      vmlal.s32       q0, d19, d9
+       vmlal.s32       q6, d18, d29
++      vmlal.s32       q2, d19, d6
++      vmlal.s32       q5, d19, d21
++      vmlal.s32       q1, d19, d29
++      vmlal.s32       q0, d19, d9
+       vmlal.s32       q6, d19, d28
+-      add             r2, sp, #592
++      add             r2, sp, #560
+       vld1.8          {d18-d19}, [r2, : 128]
+-      add             r2, sp, #512
++      add             r2, sp, #480
+       vld1.8          {d22-d23}, [r2, : 128]
+       vmlal.s32       q5, d19, d7
+       vmlal.s32       q0, d18, d21
+       vmlal.s32       q0, d19, d29
+       vmlal.s32       q6, d18, d6
+-      add             r2, sp, #528
++      add             r2, sp, #496
+       vld1.8          {d6-d7}, [r2, : 128]
+       vmlal.s32       q6, d19, d21
+-      add             r2, sp, #576
++      add             r2, sp, #544
+       vld1.8          {d18-d19}, [r2, : 128]
+       vmlal.s32       q0, d30, d8
+-      add             r2, sp, #672
++      add             r2, sp, #640
+       vld1.8          {d20-d21}, [r2, : 128]
+       vmlal.s32       q5, d30, d29
+-      add             r2, sp, #608
++      add             r2, sp, #576
+       vld1.8          {d24-d25}, [r2, : 128]
+       vmlal.s32       q1, d30, d28
+       vadd.i64        q13, q0, q11
+@@ -823,22 +805,19 @@
+       vadd.i32        q5, q5, q0
+       vtrn.32         q11, q14
+       vadd.i32        q6, q6, q3
+-      add             r2, sp, #560
++      add             r2, sp, #528
+       vadd.i32        q10, q10, q2
+       vtrn.32         d24, d25
+-      vst1.8          {d12-d13}, [r2, : 128]
++      vst1.8          {d12-d13}, [r2, : 128]!
+       vshl.i32        q6, q13, #1
+-      add             r2, sp, #576
+-      vst1.8          {d20-d21}, [r2, : 128]
++      vst1.8          {d20-d21}, [r2, : 128]!
+       vshl.i32        q10, q14, #1
+-      add             r2, sp, #592
+-      vst1.8          {d12-d13}, [r2, : 128]
++      vst1.8          {d12-d13}, [r2, : 128]!
+       vshl.i32        q15, q12, #1
+       vadd.i32        q8, q8, q4
+       vext.32         d10, d31, d30, #0
+       vadd.i32        q7, q7, q1
+-      add             r2, sp, #608
+-      vst1.8          {d16-d17}, [r2, : 128]
++      vst1.8          {d16-d17}, [r2, : 128]!
+       vmull.s32       q8, d18, d5
+       vmlal.s32       q8, d26, d4
+       vmlal.s32       q8, d19, d9
+@@ -849,8 +828,7 @@
+       vmlal.s32       q8, d29, d1
+       vmlal.s32       q8, d24, d6
+       vmlal.s32       q8, d25, d0
+-      add             r2, sp, #624
+-      vst1.8          {d14-d15}, [r2, : 128]
++      vst1.8          {d14-d15}, [r2, : 128]!
+       vmull.s32       q2, d18, d4
+       vmlal.s32       q2, d12, d9
+       vmlal.s32       q2, d13, d8
+@@ -858,8 +836,7 @@
+       vmlal.s32       q2, d22, d2
+       vmlal.s32       q2, d23, d1
+       vmlal.s32       q2, d24, d0
+-      add             r2, sp, #640
+-      vst1.8          {d20-d21}, [r2, : 128]
++      vst1.8          {d20-d21}, [r2, : 128]!
+       vmull.s32       q7, d18, d9
+       vmlal.s32       q7, d26, d3
+       vmlal.s32       q7, d19, d8
+@@ -868,15 +845,13 @@
+       vmlal.s32       q7, d28, d1
+       vmlal.s32       q7, d23, d6
+       vmlal.s32       q7, d29, d0
+-      add             r2, sp, #656
+-      vst1.8          {d10-d11}, [r2, : 128]
++      vst1.8          {d10-d11}, [r2, : 128]!
+       vmull.s32       q5, d18, d3
+       vmlal.s32       q5, d19, d2
+       vmlal.s32       q5, d22, d1
+       vmlal.s32       q5, d23, d0
+       vmlal.s32       q5, d12, d8
+-      add             r2, sp, #672
+-      vst1.8          {d16-d17}, [r2, : 128]
++      vst1.8          {d16-d17}, [r2, : 128]!
+       vmull.s32       q4, d18, d8
+       vmlal.s32       q4, d26, d2
+       vmlal.s32       q4, d19, d7
+@@ -887,7 +862,7 @@
+       vmlal.s32       q8, d26, d1
+       vmlal.s32       q8, d19, d6
+       vmlal.s32       q8, d27, d0
+-      add             r2, sp, #576
++      add             r2, sp, #544
+       vld1.8          {d20-d21}, [r2, : 128]
+       vmlal.s32       q7, d24, d21
+       vmlal.s32       q7, d25, d20
+@@ -896,32 +871,30 @@
+       vmlal.s32       q8, d22, d21
+       vmlal.s32       q8, d28, d20
+       vmlal.s32       q5, d24, d20
+-      add             r2, sp, #576
+       vst1.8          {d14-d15}, [r2, : 128]
+       vmull.s32       q7, d18, d6
+       vmlal.s32       q7, d26, d0
+-      add             r2, sp, #656
++      add             r2, sp, #624
+       vld1.8          {d30-d31}, [r2, : 128]
+       vmlal.s32       q2, d30, d21
+       vmlal.s32       q7, d19, d21
+       vmlal.s32       q7, d27, d20
+-      add             r2, sp, #624
++      add             r2, sp, #592
+       vld1.8          {d26-d27}, [r2, : 128]
+       vmlal.s32       q4, d25, d27
+       vmlal.s32       q8, d29, d27
+       vmlal.s32       q8, d25, d26
+       vmlal.s32       q7, d28, d27
+       vmlal.s32       q7, d29, d26
+-      add             r2, sp, #608
++      add             r2, sp, #576
+       vld1.8          {d28-d29}, [r2, : 128]
+       vmlal.s32       q4, d24, d29
+       vmlal.s32       q8, d23, d29
+       vmlal.s32       q8, d24, d28
+       vmlal.s32       q7, d22, d29
+       vmlal.s32       q7, d23, d28
+-      add             r2, sp, #608
+       vst1.8          {d8-d9}, [r2, : 128]
+-      add             r2, sp, #560
++      add             r2, sp, #528
+       vld1.8          {d8-d9}, [r2, : 128]
+       vmlal.s32       q7, d24, d9
+       vmlal.s32       q7, d25, d31
+@@ -942,36 +915,36 @@
+       vmlal.s32       q0, d23, d26
+       vmlal.s32       q0, d24, d31
+       vmlal.s32       q0, d19, d20
+-      add             r2, sp, #640
++      add             r2, sp, #608
+       vld1.8          {d18-d19}, [r2, : 128]
+       vmlal.s32       q2, d18, d7
+-      vmlal.s32       q2, d19, d6
+       vmlal.s32       q5, d18, d6
+-      vmlal.s32       q5, d19, d21
+       vmlal.s32       q1, d18, d21
+-      vmlal.s32       q1, d19, d29
+       vmlal.s32       q0, d18, d28
+-      vmlal.s32       q0, d19, d9
+       vmlal.s32       q6, d18, d29
++      vmlal.s32       q2, d19, d6
++      vmlal.s32       q5, d19, d21
++      vmlal.s32       q1, d19, d29
++      vmlal.s32       q0, d19, d9
+       vmlal.s32       q6, d19, d28
+-      add             r2, sp, #592
++      add             r2, sp, #560
+       vld1.8          {d18-d19}, [r2, : 128]
+-      add             r2, sp, #512
++      add             r2, sp, #480
+       vld1.8          {d22-d23}, [r2, : 128]
+       vmlal.s32       q5, d19, d7
+       vmlal.s32       q0, d18, d21
+       vmlal.s32       q0, d19, d29
+       vmlal.s32       q6, d18, d6
+-      add             r2, sp, #528
++      add             r2, sp, #496
+       vld1.8          {d6-d7}, [r2, : 128]
+       vmlal.s32       q6, d19, d21
+-      add             r2, sp, #576
++      add             r2, sp, #544
+       vld1.8          {d18-d19}, [r2, : 128]
+       vmlal.s32       q0, d30, d8
+-      add             r2, sp, #672
++      add             r2, sp, #640
+       vld1.8          {d20-d21}, [r2, : 128]
+       vmlal.s32       q5, d30, d29
+-      add             r2, sp, #608
++      add             r2, sp, #576
+       vld1.8          {d24-d25}, [r2, : 128]
+       vmlal.s32       q1, d30, d28
+       vadd.i64        q13, q0, q11
+@@ -1069,7 +1042,7 @@
+       sub             r4, r4, #24
+       vst1.8          d0, [r2, : 64]
+       vst1.8          d1, [r4, : 64]
+-      add             r2, sp, #544
++      add             r2, sp, #512
+       add             r4, r3, #144
+       add             r5, r3, #192
+       vld1.8          {d0-d1}, [r2, : 128]
+@@ -1139,14 +1112,13 @@
+       vmlal.s32       q0, d12, d8
+       vmlal.s32       q0, d13, d17
+       vmlal.s32       q0, d6, d6
+-      add             r2, sp, #512
+-      vld1.8          {d18-d19}, [r2, : 128]
++      add             r2, sp, #480
++      vld1.8          {d18-d19}, [r2, : 128]!
+       vmull.s32       q3, d16, d7
+       vmlal.s32       q3, d10, d15
+       vmlal.s32       q3, d11, d14
+       vmlal.s32       q3, d12, d9
+       vmlal.s32       q3, d13, d8
+-      add             r2, sp, #528
+       vld1.8          {d8-d9}, [r2, : 128]
+       vadd.i64        q5, q12, q9
+       vadd.i64        q6, q15, q9
+@@ -1295,22 +1267,19 @@
+       vadd.i32        q5, q5, q0
+       vtrn.32         q11, q14
+       vadd.i32        q6, q6, q3
+-      add             r2, sp, #560
++      add             r2, sp, #528
+       vadd.i32        q10, q10, q2
+       vtrn.32         d24, d25
+-      vst1.8          {d12-d13}, [r2, : 128]
++      vst1.8          {d12-d13}, [r2, : 128]!
+       vshl.i32        q6, q13, #1
+-      add             r2, sp, #576
+-      vst1.8          {d20-d21}, [r2, : 128]
++      vst1.8          {d20-d21}, [r2, : 128]!
+       vshl.i32        q10, q14, #1
+-      add             r2, sp, #592
+-      vst1.8          {d12-d13}, [r2, : 128]
++      vst1.8          {d12-d13}, [r2, : 128]!
+       vshl.i32        q15, q12, #1
+       vadd.i32        q8, q8, q4
+       vext.32         d10, d31, d30, #0
+       vadd.i32        q7, q7, q1
+-      add             r2, sp, #608
+-      vst1.8          {d16-d17}, [r2, : 128]
++      vst1.8          {d16-d17}, [r2, : 128]!
+       vmull.s32       q8, d18, d5
+       vmlal.s32       q8, d26, d4
+       vmlal.s32       q8, d19, d9
+@@ -1321,8 +1290,7 @@
+       vmlal.s32       q8, d29, d1
+       vmlal.s32       q8, d24, d6
+       vmlal.s32       q8, d25, d0
+-      add             r2, sp, #624
+-      vst1.8          {d14-d15}, [r2, : 128]
++      vst1.8          {d14-d15}, [r2, : 128]!
+       vmull.s32       q2, d18, d4
+       vmlal.s32       q2, d12, d9
+       vmlal.s32       q2, d13, d8
+@@ -1330,8 +1298,7 @@
+       vmlal.s32       q2, d22, d2
+       vmlal.s32       q2, d23, d1
+       vmlal.s32       q2, d24, d0
+-      add             r2, sp, #640
+-      vst1.8          {d20-d21}, [r2, : 128]
++      vst1.8          {d20-d21}, [r2, : 128]!
+       vmull.s32       q7, d18, d9
+       vmlal.s32       q7, d26, d3
+       vmlal.s32       q7, d19, d8
+@@ -1340,15 +1307,13 @@
+       vmlal.s32       q7, d28, d1
+       vmlal.s32       q7, d23, d6
+       vmlal.s32       q7, d29, d0
+-      add             r2, sp, #656
+-      vst1.8          {d10-d11}, [r2, : 128]
++      vst1.8          {d10-d11}, [r2, : 128]!
+       vmull.s32       q5, d18, d3
+       vmlal.s32       q5, d19, d2
+       vmlal.s32       q5, d22, d1
+       vmlal.s32       q5, d23, d0
+       vmlal.s32       q5, d12, d8
+-      add             r2, sp, #672
+-      vst1.8          {d16-d17}, [r2, : 128]
++      vst1.8          {d16-d17}, [r2, : 128]!
+       vmull.s32       q4, d18, d8
+       vmlal.s32       q4, d26, d2
+       vmlal.s32       q4, d19, d7
+@@ -1359,7 +1324,7 @@
+       vmlal.s32       q8, d26, d1
+       vmlal.s32       q8, d19, d6
+       vmlal.s32       q8, d27, d0
+-      add             r2, sp, #576
++      add             r2, sp, #544
+       vld1.8          {d20-d21}, [r2, : 128]
+       vmlal.s32       q7, d24, d21
+       vmlal.s32       q7, d25, d20
+@@ -1368,32 +1333,30 @@
+       vmlal.s32       q8, d22, d21
+       vmlal.s32       q8, d28, d20
+       vmlal.s32       q5, d24, d20
+-      add             r2, sp, #576
+       vst1.8          {d14-d15}, [r2, : 128]
+       vmull.s32       q7, d18, d6
+       vmlal.s32       q7, d26, d0
+-      add             r2, sp, #656
++      add             r2, sp, #624
+       vld1.8          {d30-d31}, [r2, : 128]
+       vmlal.s32       q2, d30, d21
+       vmlal.s32       q7, d19, d21
+       vmlal.s32       q7, d27, d20
+-      add             r2, sp, #624
++      add             r2, sp, #592
+       vld1.8          {d26-d27}, [r2, : 128]
+       vmlal.s32       q4, d25, d27
+       vmlal.s32       q8, d29, d27
+       vmlal.s32       q8, d25, d26
+       vmlal.s32       q7, d28, d27
+       vmlal.s32       q7, d29, d26
+-      add             r2, sp, #608
++      add             r2, sp, #576
+       vld1.8          {d28-d29}, [r2, : 128]
+       vmlal.s32       q4, d24, d29
+       vmlal.s32       q8, d23, d29
+       vmlal.s32       q8, d24, d28
+       vmlal.s32       q7, d22, d29
+       vmlal.s32       q7, d23, d28
+-      add             r2, sp, #608
+       vst1.8          {d8-d9}, [r2, : 128]
+-      add             r2, sp, #560
++      add             r2, sp, #528
+       vld1.8          {d8-d9}, [r2, : 128]
+       vmlal.s32       q7, d24, d9
+       vmlal.s32       q7, d25, d31
+@@ -1414,36 +1377,36 @@
+       vmlal.s32       q0, d23, d26
+       vmlal.s32       q0, d24, d31
+       vmlal.s32       q0, d19, d20
+-      add             r2, sp, #640
++      add             r2, sp, #608
+       vld1.8          {d18-d19}, [r2, : 128]
+       vmlal.s32       q2, d18, d7
+-      vmlal.s32       q2, d19, d6
+       vmlal.s32       q5, d18, d6
+-      vmlal.s32       q5, d19, d21
+       vmlal.s32       q1, d18, d21
+-      vmlal.s32       q1, d19, d29
+       vmlal.s32       q0, d18, d28
+-      vmlal.s32       q0, d19, d9
+       vmlal.s32       q6, d18, d29
++      vmlal.s32       q2, d19, d6
++      vmlal.s32       q5, d19, d21
++      vmlal.s32       q1, d19, d29
++      vmlal.s32       q0, d19, d9
+       vmlal.s32       q6, d19, d28
+-      add             r2, sp, #592
++      add             r2, sp, #560
+       vld1.8          {d18-d19}, [r2, : 128]
+-      add             r2, sp, #512
++      add             r2, sp, #480
+       vld1.8          {d22-d23}, [r2, : 128]
+       vmlal.s32       q5, d19, d7
+       vmlal.s32       q0, d18, d21
+       vmlal.s32       q0, d19, d29
+       vmlal.s32       q6, d18, d6
+-      add             r2, sp, #528
++      add             r2, sp, #496
+       vld1.8          {d6-d7}, [r2, : 128]
+       vmlal.s32       q6, d19, d21
+-      add             r2, sp, #576
++      add             r2, sp, #544
+       vld1.8          {d18-d19}, [r2, : 128]
+       vmlal.s32       q0, d30, d8
+-      add             r2, sp, #672
++      add             r2, sp, #640
+       vld1.8          {d20-d21}, [r2, : 128]
+       vmlal.s32       q5, d30, d29
+-      add             r2, sp, #608
++      add             r2, sp, #576
+       vld1.8          {d24-d25}, [r2, : 128]
+       vmlal.s32       q1, d30, d28
+       vadd.i64        q13, q0, q11
+@@ -1541,10 +1504,10 @@
+       sub             r4, r4, #24
+       vst1.8          d0, [r2, : 64]
+       vst1.8          d1, [r4, : 64]
+-      ldr             r2, [sp, #488]
+-      ldr             r4, [sp, #492]
++      ldr             r2, [sp, #456]
++      ldr             r4, [sp, #460]
+       subs            r5, r2, #1
+-      bge             ._mainloop
++      bge             .Lmainloop
+       add             r1, r3, #144
+       add             r2, r3, #336
+       vld1.8          {d0-d1}, [r1, : 128]!
+@@ -1553,41 +1516,41 @@
+       vst1.8          {d0-d1}, [r2, : 128]!
+       vst1.8          {d2-d3}, [r2, : 128]!
+       vst1.8          d4, [r2, : 64]
+-      ldr             r1, =0
+-._invertloop:
++      movw            r1, #0
++.Linvertloop:
+       add             r2, r3, #144
+-      ldr             r4, =0
+-      ldr             r5, =2
++      movw            r4, #0
++      movw            r5, #2
+       cmp             r1, #1
+-      ldreq           r5, =1
++      moveq           r5, #1
+       addeq           r2, r3, #336
+       addeq           r4, r3, #48
+       cmp             r1, #2
+-      ldreq           r5, =1
++      moveq           r5, #1
+       addeq           r2, r3, #48
+       cmp             r1, #3
+-      ldreq           r5, =5
++      moveq           r5, #5
+       addeq           r4, r3, #336
+       cmp             r1, #4
+-      ldreq           r5, =10
++      moveq           r5, #10
+       cmp             r1, #5
+-      ldreq           r5, =20
++      moveq           r5, #20
+       cmp             r1, #6
+-      ldreq           r5, =10
++      moveq           r5, #10
+       addeq           r2, r3, #336
+       addeq           r4, r3, #336
+       cmp             r1, #7
+-      ldreq           r5, =50
++      moveq           r5, #50
+       cmp             r1, #8
+-      ldreq           r5, =100
++      moveq           r5, #100
+       cmp             r1, #9
+-      ldreq           r5, =50
++      moveq           r5, #50
+       addeq           r2, r3, #336
+       cmp             r1, #10
+-      ldreq           r5, =5
++      moveq           r5, #5
+       addeq           r2, r3, #48
+       cmp             r1, #11
+-      ldreq           r5, =0
++      moveq           r5, #0
+       addeq           r2, r3, #96
+       add             r6, r3, #144
+       add             r7, r3, #288
+@@ -1598,8 +1561,8 @@
+       vst1.8          {d2-d3}, [r7, : 128]!
+       vst1.8          d4, [r7, : 64]
+       cmp             r5, #0
+-      beq             ._skipsquaringloop
+-._squaringloop:
++      beq             .Lskipsquaringloop
++.Lsquaringloop:
+       add             r6, r3, #288
+       add             r7, r3, #288
+       add             r8, r3, #288
+@@ -1611,7 +1574,7 @@
+       vld1.8          {d6-d7}, [r7, : 128]!
+       vld1.8          {d9}, [r7, : 64]
+       vld1.8          {d10-d11}, [r6, : 128]!
+-      add             r7, sp, #416
++      add             r7, sp, #384
+       vld1.8          {d12-d13}, [r6, : 128]!
+       vmul.i32        q7, q2, q0
+       vld1.8          {d8}, [r6, : 64]
+@@ -1726,7 +1689,7 @@
+       vext.32         d10, d6, d6, #0
+       vmov.i32        q1, #0xffffffff
+       vshl.i64        q4, q1, #25
+-      add             r7, sp, #512
++      add             r7, sp, #480
+       vld1.8          {d14-d15}, [r7, : 128]
+       vadd.i64        q9, q2, q7
+       vshl.i64        q1, q1, #26
+@@ -1735,7 +1698,7 @@
+       vadd.i64        q5, q5, q10
+       vand            q9, q9, q1
+       vld1.8          {d16}, [r6, : 64]!
+-      add             r6, sp, #528
++      add             r6, sp, #496
+       vld1.8          {d20-d21}, [r6, : 128]
+       vadd.i64        q11, q5, q10
+       vsub.i64        q2, q2, q9
+@@ -1789,8 +1752,8 @@
+       sub             r6, r6, #32
+       vst1.8          d4, [r6, : 64]
+       subs            r5, r5, #1
+-      bhi             ._squaringloop
+-._skipsquaringloop:
++      bhi             .Lsquaringloop
++.Lskipsquaringloop:
+       mov             r2, r2
+       add             r5, r3, #288
+       add             r6, r3, #144
+@@ -1802,7 +1765,7 @@
+       vld1.8          {d6-d7}, [r5, : 128]!
+       vld1.8          {d9}, [r5, : 64]
+       vld1.8          {d10-d11}, [r2, : 128]!
+-      add             r5, sp, #416
++      add             r5, sp, #384
+       vld1.8          {d12-d13}, [r2, : 128]!
+       vmul.i32        q7, q2, q0
+       vld1.8          {d8}, [r2, : 64]
+@@ -1917,7 +1880,7 @@
+       vext.32         d10, d6, d6, #0
+       vmov.i32        q1, #0xffffffff
+       vshl.i64        q4, q1, #25
+-      add             r5, sp, #512
++      add             r5, sp, #480
+       vld1.8          {d14-d15}, [r5, : 128]
+       vadd.i64        q9, q2, q7
+       vshl.i64        q1, q1, #26
+@@ -1926,7 +1889,7 @@
+       vadd.i64        q5, q5, q10
+       vand            q9, q9, q1
+       vld1.8          {d16}, [r2, : 64]!
+-      add             r2, sp, #528
++      add             r2, sp, #496
+       vld1.8          {d20-d21}, [r2, : 128]
+       vadd.i64        q11, q5, q10
+       vsub.i64        q2, q2, q9
+@@ -1980,7 +1943,7 @@
+       sub             r2, r2, #32
+       vst1.8          d4, [r2, : 64]
+       cmp             r4, #0
+-      beq             ._skippostcopy
++      beq             .Lskippostcopy
+       add             r2, r3, #144
+       mov             r4, r4
+       vld1.8          {d0-d1}, [r2, : 128]!
+@@ -1989,9 +1952,9 @@
+       vst1.8          {d0-d1}, [r4, : 128]!
+       vst1.8          {d2-d3}, [r4, : 128]!
+       vst1.8          d4, [r4, : 64]
+-._skippostcopy:
++.Lskippostcopy:
+       cmp             r1, #1
+-      bne             ._skipfinalcopy
++      bne             .Lskipfinalcopy
+       add             r2, r3, #288
+       add             r4, r3, #144
+       vld1.8          {d0-d1}, [r2, : 128]!
+@@ -2000,10 +1963,10 @@
+       vst1.8          {d0-d1}, [r4, : 128]!
+       vst1.8          {d2-d3}, [r4, : 128]!
+       vst1.8          d4, [r4, : 64]
+-._skipfinalcopy:
++.Lskipfinalcopy:
+       add             r1, r1, #1
+       cmp             r1, #12
+-      blo             ._invertloop
++      blo             .Linvertloop
+       add             r1, r3, #144
+       ldr             r2, [r1], #4
+       ldr             r3, [r1], #4
+@@ -2085,21 +2048,15 @@
+       add             r8, r8, r10, LSL #12
+       mov             r9, r10, LSR #20
+       add             r1, r9, r1, LSL #6
+-      str             r2, [r0], #4
+-      str             r3, [r0], #4
+-      str             r4, [r0], #4
+-      str             r5, [r0], #4
+-      str             r6, [r0], #4
+-      str             r7, [r0], #4
+-      str             r8, [r0], #4
+-      str             r1, [r0]
+-      ldrd            r4, [sp, #0]
+-      ldrd            r6, [sp, #8]
+-      ldrd            r8, [sp, #16]
+-      ldrd            r10, [sp, #24]
+-      ldr             r12, [sp, #480]
+-      ldr             r14, [sp, #484]
+-      ldr             r0, =0
+-      mov             sp, r12
+-      vpop            {q4, q5, q6, q7}
+-      bx              lr
++      str             r2, [r0]
++      str             r3, [r0, #4]
++      str             r4, [r0, #8]
++      str             r5, [r0, #12]
++      str             r6, [r0, #16]
++      str             r7, [r0, #20]
++      str             r8, [r0, #24]
++      str             r1, [r0, #28]
++      movw            r0, #0
++      mov             sp, ip
++      pop             {r4-r11, pc}
++ENDPROC(curve25519_neon)
+--- /dev/null
++++ b/arch/arm/crypto/curve25519-glue.c
+@@ -0,0 +1,127 @@
++// SPDX-License-Identifier: GPL-2.0 OR MIT
++/*
++ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
++ *
++ * Based on public domain code from Daniel J. Bernstein and Peter Schwabe. This
++ * began from SUPERCOP's curve25519/neon2/scalarmult.s, but has subsequently been
++ * manually reworked for use in kernel space.
++ */
++
++#include <asm/hwcap.h>
++#include <asm/neon.h>
++#include <asm/simd.h>
++#include <crypto/internal/kpp.h>
++#include <crypto/internal/simd.h>
++#include <linux/types.h>
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/jump_label.h>
++#include <crypto/curve25519.h>
++
++asmlinkage void curve25519_neon(u8 mypublic[CURVE25519_KEY_SIZE],
++                              const u8 secret[CURVE25519_KEY_SIZE],
++                              const u8 basepoint[CURVE25519_KEY_SIZE]);
++
++static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
++
++void curve25519_arch(u8 out[CURVE25519_KEY_SIZE],
++                   const u8 scalar[CURVE25519_KEY_SIZE],
++                   const u8 point[CURVE25519_KEY_SIZE])
++{
++      if (static_branch_likely(&have_neon) && crypto_simd_usable()) {
++              kernel_neon_begin();
++              curve25519_neon(out, scalar, point);
++              kernel_neon_end();
++      } else {
++              curve25519_generic(out, scalar, point);
++      }
++}
++EXPORT_SYMBOL(curve25519_arch);
++
++static int curve25519_set_secret(struct crypto_kpp *tfm, const void *buf,
++                               unsigned int len)
++{
++      u8 *secret = kpp_tfm_ctx(tfm);
++
++      if (!len)
++              curve25519_generate_secret(secret);
++      else if (len == CURVE25519_KEY_SIZE &&
++               crypto_memneq(buf, curve25519_null_point, CURVE25519_KEY_SIZE))
++              memcpy(secret, buf, CURVE25519_KEY_SIZE);
++      else
++              return -EINVAL;
++      return 0;
++}
++
++static int curve25519_compute_value(struct kpp_request *req)
++{
++      struct crypto_kpp *tfm = crypto_kpp_reqtfm(req);
++      const u8 *secret = kpp_tfm_ctx(tfm);
++      u8 public_key[CURVE25519_KEY_SIZE];
++      u8 buf[CURVE25519_KEY_SIZE];
++      int copied, nbytes;
++      u8 const *bp;
++
++      if (req->src) {
++              copied = sg_copy_to_buffer(req->src,
++                                         sg_nents_for_len(req->src,
++                                                          CURVE25519_KEY_SIZE),
++                                         public_key, CURVE25519_KEY_SIZE);
++              if (copied != CURVE25519_KEY_SIZE)
++                      return -EINVAL;
++              bp = public_key;
++      } else {
++              bp = curve25519_base_point;
++      }
++
++      curve25519_arch(buf, secret, bp);
++
++      /* might want less than we've got */
++      nbytes = min_t(size_t, CURVE25519_KEY_SIZE, req->dst_len);
++      copied = sg_copy_from_buffer(req->dst, sg_nents_for_len(req->dst,
++                                                              nbytes),
++                                   buf, nbytes);
++      if (copied != nbytes)
++              return -EINVAL;
++      return 0;
++}
++
++static unsigned int curve25519_max_size(struct crypto_kpp *tfm)
++{
++      return CURVE25519_KEY_SIZE;
++}
++
++static struct kpp_alg curve25519_alg = {
++      .base.cra_name          = "curve25519",
++      .base.cra_driver_name   = "curve25519-neon",
++      .base.cra_priority      = 200,
++      .base.cra_module        = THIS_MODULE,
++      .base.cra_ctxsize       = CURVE25519_KEY_SIZE,
++
++      .set_secret             = curve25519_set_secret,
++      .generate_public_key    = curve25519_compute_value,
++      .compute_shared_secret  = curve25519_compute_value,
++      .max_size               = curve25519_max_size,
++};
++
++static int __init mod_init(void)
++{
++      if (elf_hwcap & HWCAP_NEON) {
++              static_branch_enable(&have_neon);
++              return crypto_register_kpp(&curve25519_alg);
++      }
++      return 0;
++}
++
++static void __exit mod_exit(void)
++{
++      if (elf_hwcap & HWCAP_NEON)
++              crypto_unregister_kpp(&curve25519_alg);
++}
++
++module_init(mod_init);
++module_exit(mod_exit);
++
++MODULE_ALIAS_CRYPTO("curve25519");
++MODULE_ALIAS_CRYPTO("curve25519-neon");
++MODULE_LICENSE("GPL v2");
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0032-crypto-chacha20poly1305-import-construction-and-self.patch b/target/linux/generic/backport-5.4/080-wireguard-0032-crypto-chacha20poly1305-import-construction-and-self.patch

new file mode 100644 (file)

index 0000000..dde774a
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0032-crypto-chacha20poly1305-import-construction-and-self.patch
@@ -0,0 +1,7677 @@
+From d276ee98ad5275f3e1efb4f8a9f2e3fbece23a5a Mon Sep 17 00:00:00 2001
+From: Ard Biesheuvel <ardb@kernel.org>
+Date: Fri, 8 Nov 2019 13:22:39 +0100
+Subject: [PATCH 032/124] crypto: chacha20poly1305 - import construction and
+ selftest from Zinc
+
+commit ed20078b7e3331e82828be357147af6a3282e4ce upstream.
+
+This incorporates the chacha20poly1305 from the Zinc library, retaining
+the library interface, but replacing the implementation with calls into
+the code that already existed in the kernel's crypto API.
+
+Note that this library API does not implement RFC7539 fully, given that
+it is limited to 64-bit nonces. (The 96-bit nonce version that was part
+of the selftest only has been removed, along with the 96-bit nonce test
+vectors that only tested the selftest but not the actual library itself)
+
+Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
+Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ include/crypto/chacha20poly1305.h      |   37 +
+ lib/crypto/Kconfig                     |    7 +
+ lib/crypto/Makefile                    |    4 +
+ lib/crypto/chacha20poly1305-selftest.c | 7348 ++++++++++++++++++++++++
+ lib/crypto/chacha20poly1305.c          |  219 +
+ 5 files changed, 7615 insertions(+)
+ create mode 100644 include/crypto/chacha20poly1305.h
+ create mode 100644 lib/crypto/chacha20poly1305-selftest.c
+ create mode 100644 lib/crypto/chacha20poly1305.c
+
+--- /dev/null
++++ b/include/crypto/chacha20poly1305.h
+@@ -0,0 +1,37 @@
++/* SPDX-License-Identifier: GPL-2.0 OR MIT */
++/*
++ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
++ */
++
++#ifndef __CHACHA20POLY1305_H
++#define __CHACHA20POLY1305_H
++
++#include <linux/types.h>
++
++enum chacha20poly1305_lengths {
++      XCHACHA20POLY1305_NONCE_SIZE = 24,
++      CHACHA20POLY1305_KEY_SIZE = 32,
++      CHACHA20POLY1305_AUTHTAG_SIZE = 16
++};
++
++void chacha20poly1305_encrypt(u8 *dst, const u8 *src, const size_t src_len,
++                            const u8 *ad, const size_t ad_len,
++                            const u64 nonce,
++                            const u8 key[CHACHA20POLY1305_KEY_SIZE]);
++
++bool __must_check
++chacha20poly1305_decrypt(u8 *dst, const u8 *src, const size_t src_len,
++                       const u8 *ad, const size_t ad_len, const u64 nonce,
++                       const u8 key[CHACHA20POLY1305_KEY_SIZE]);
++
++void xchacha20poly1305_encrypt(u8 *dst, const u8 *src, const size_t src_len,
++                             const u8 *ad, const size_t ad_len,
++                             const u8 nonce[XCHACHA20POLY1305_NONCE_SIZE],
++                             const u8 key[CHACHA20POLY1305_KEY_SIZE]);
++
++bool __must_check xchacha20poly1305_decrypt(
++      u8 *dst, const u8 *src, const size_t src_len, const u8 *ad,
++      const size_t ad_len, const u8 nonce[XCHACHA20POLY1305_NONCE_SIZE],
++      const u8 key[CHACHA20POLY1305_KEY_SIZE]);
++
++#endif /* __CHACHA20POLY1305_H */
+--- a/lib/crypto/Kconfig
++++ b/lib/crypto/Kconfig
+@@ -119,5 +119,12 @@ config CRYPTO_LIB_POLY1305
+         by either the generic implementation or an arch-specific one, if one
+         is available and enabled.
+ 
++config CRYPTO_LIB_CHACHA20POLY1305
++      tristate "ChaCha20-Poly1305 AEAD support (8-byte nonce library version)"
++      depends on CRYPTO_ARCH_HAVE_LIB_CHACHA || !CRYPTO_ARCH_HAVE_LIB_CHACHA
++      depends on CRYPTO_ARCH_HAVE_LIB_POLY1305 || !CRYPTO_ARCH_HAVE_LIB_POLY1305
++      select CRYPTO_LIB_CHACHA
++      select CRYPTO_LIB_POLY1305
++
+ config CRYPTO_LIB_SHA256
+       tristate
+--- a/lib/crypto/Makefile
++++ b/lib/crypto/Makefile
+@@ -16,6 +16,9 @@ libblake2s-generic-y                         += blake2s-gener
+ obj-$(CONFIG_CRYPTO_LIB_BLAKE2S)              += libblake2s.o
+ libblake2s-y                                  += blake2s.o
+ 
++obj-$(CONFIG_CRYPTO_LIB_CHACHA20POLY1305)     += libchacha20poly1305.o
++libchacha20poly1305-y                         += chacha20poly1305.o
++
+ obj-$(CONFIG_CRYPTO_LIB_CURVE25519_GENERIC)   += libcurve25519.o
+ libcurve25519-y                                       := curve25519-fiat32.o
+ libcurve25519-$(CONFIG_ARCH_SUPPORTS_INT128)  := curve25519-hacl64.o
+@@ -32,4 +35,5 @@ libsha256-y                                  := sha256.o
+ 
+ ifneq ($(CONFIG_CRYPTO_MANAGER_DISABLE_TESTS),y)
+ libblake2s-y                                  += blake2s-selftest.o
++libchacha20poly1305-y                         += chacha20poly1305-selftest.o
+ endif
+--- /dev/null
++++ b/lib/crypto/chacha20poly1305-selftest.c
+@@ -0,0 +1,7348 @@
++// SPDX-License-Identifier: GPL-2.0 OR MIT
++/*
++ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
++ */
++
++#include <crypto/chacha20poly1305.h>
++#include <crypto/poly1305.h>
++
++#include <asm/unaligned.h>
++#include <linux/bug.h>
++#include <linux/init.h>
++#include <linux/mm.h>
++#include <linux/kernel.h>
++#include <linux/slab.h>
++
++struct chacha20poly1305_testvec {
++      const u8 *input, *output, *assoc, *nonce, *key;
++      size_t ilen, alen, nlen;
++      bool failure;
++};
++
++/* The first of these are the ChaCha20-Poly1305 AEAD test vectors from RFC7539
++ * 2.8.2. The ones after that were generated by reference implementations. The
++ * final marked ones are taken from Wycheproof, but we only do these for the
++ * encrypt side, because mostly we're stressing the primitives rather than the
++ * actual chapoly construction.
++ */
++
++static const u8 enc_input001[] __initconst = {
++      0x49, 0x6e, 0x74, 0x65, 0x72, 0x6e, 0x65, 0x74,
++      0x2d, 0x44, 0x72, 0x61, 0x66, 0x74, 0x73, 0x20,
++      0x61, 0x72, 0x65, 0x20, 0x64, 0x72, 0x61, 0x66,
++      0x74, 0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65,
++      0x6e, 0x74, 0x73, 0x20, 0x76, 0x61, 0x6c, 0x69,
++      0x64, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x61, 0x20,
++      0x6d, 0x61, 0x78, 0x69, 0x6d, 0x75, 0x6d, 0x20,
++      0x6f, 0x66, 0x20, 0x73, 0x69, 0x78, 0x20, 0x6d,
++      0x6f, 0x6e, 0x74, 0x68, 0x73, 0x20, 0x61, 0x6e,
++      0x64, 0x20, 0x6d, 0x61, 0x79, 0x20, 0x62, 0x65,
++      0x20, 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, 0x64,
++      0x2c, 0x20, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63,
++      0x65, 0x64, 0x2c, 0x20, 0x6f, 0x72, 0x20, 0x6f,
++      0x62, 0x73, 0x6f, 0x6c, 0x65, 0x74, 0x65, 0x64,
++      0x20, 0x62, 0x79, 0x20, 0x6f, 0x74, 0x68, 0x65,
++      0x72, 0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65,
++      0x6e, 0x74, 0x73, 0x20, 0x61, 0x74, 0x20, 0x61,
++      0x6e, 0x79, 0x20, 0x74, 0x69, 0x6d, 0x65, 0x2e,
++      0x20, 0x49, 0x74, 0x20, 0x69, 0x73, 0x20, 0x69,
++      0x6e, 0x61, 0x70, 0x70, 0x72, 0x6f, 0x70, 0x72,
++      0x69, 0x61, 0x74, 0x65, 0x20, 0x74, 0x6f, 0x20,
++      0x75, 0x73, 0x65, 0x20, 0x49, 0x6e, 0x74, 0x65,
++      0x72, 0x6e, 0x65, 0x74, 0x2d, 0x44, 0x72, 0x61,
++      0x66, 0x74, 0x73, 0x20, 0x61, 0x73, 0x20, 0x72,
++      0x65, 0x66, 0x65, 0x72, 0x65, 0x6e, 0x63, 0x65,
++      0x20, 0x6d, 0x61, 0x74, 0x65, 0x72, 0x69, 0x61,
++      0x6c, 0x20, 0x6f, 0x72, 0x20, 0x74, 0x6f, 0x20,
++      0x63, 0x69, 0x74, 0x65, 0x20, 0x74, 0x68, 0x65,
++      0x6d, 0x20, 0x6f, 0x74, 0x68, 0x65, 0x72, 0x20,
++      0x74, 0x68, 0x61, 0x6e, 0x20, 0x61, 0x73, 0x20,
++      0x2f, 0xe2, 0x80, 0x9c, 0x77, 0x6f, 0x72, 0x6b,
++      0x20, 0x69, 0x6e, 0x20, 0x70, 0x72, 0x6f, 0x67,
++      0x72, 0x65, 0x73, 0x73, 0x2e, 0x2f, 0xe2, 0x80,
++      0x9d
++};
++static const u8 enc_output001[] __initconst = {
++      0x64, 0xa0, 0x86, 0x15, 0x75, 0x86, 0x1a, 0xf4,
++      0x60, 0xf0, 0x62, 0xc7, 0x9b, 0xe6, 0x43, 0xbd,
++      0x5e, 0x80, 0x5c, 0xfd, 0x34, 0x5c, 0xf3, 0x89,
++      0xf1, 0x08, 0x67, 0x0a, 0xc7, 0x6c, 0x8c, 0xb2,
++      0x4c, 0x6c, 0xfc, 0x18, 0x75, 0x5d, 0x43, 0xee,
++      0xa0, 0x9e, 0xe9, 0x4e, 0x38, 0x2d, 0x26, 0xb0,
++      0xbd, 0xb7, 0xb7, 0x3c, 0x32, 0x1b, 0x01, 0x00,
++      0xd4, 0xf0, 0x3b, 0x7f, 0x35, 0x58, 0x94, 0xcf,
++      0x33, 0x2f, 0x83, 0x0e, 0x71, 0x0b, 0x97, 0xce,
++      0x98, 0xc8, 0xa8, 0x4a, 0xbd, 0x0b, 0x94, 0x81,
++      0x14, 0xad, 0x17, 0x6e, 0x00, 0x8d, 0x33, 0xbd,
++      0x60, 0xf9, 0x82, 0xb1, 0xff, 0x37, 0xc8, 0x55,
++      0x97, 0x97, 0xa0, 0x6e, 0xf4, 0xf0, 0xef, 0x61,
++      0xc1, 0x86, 0x32, 0x4e, 0x2b, 0x35, 0x06, 0x38,
++      0x36, 0x06, 0x90, 0x7b, 0x6a, 0x7c, 0x02, 0xb0,
++      0xf9, 0xf6, 0x15, 0x7b, 0x53, 0xc8, 0x67, 0xe4,
++      0xb9, 0x16, 0x6c, 0x76, 0x7b, 0x80, 0x4d, 0x46,
++      0xa5, 0x9b, 0x52, 0x16, 0xcd, 0xe7, 0xa4, 0xe9,
++      0x90, 0x40, 0xc5, 0xa4, 0x04, 0x33, 0x22, 0x5e,
++      0xe2, 0x82, 0xa1, 0xb0, 0xa0, 0x6c, 0x52, 0x3e,
++      0xaf, 0x45, 0x34, 0xd7, 0xf8, 0x3f, 0xa1, 0x15,
++      0x5b, 0x00, 0x47, 0x71, 0x8c, 0xbc, 0x54, 0x6a,
++      0x0d, 0x07, 0x2b, 0x04, 0xb3, 0x56, 0x4e, 0xea,
++      0x1b, 0x42, 0x22, 0x73, 0xf5, 0x48, 0x27, 0x1a,
++      0x0b, 0xb2, 0x31, 0x60, 0x53, 0xfa, 0x76, 0x99,
++      0x19, 0x55, 0xeb, 0xd6, 0x31, 0x59, 0x43, 0x4e,
++      0xce, 0xbb, 0x4e, 0x46, 0x6d, 0xae, 0x5a, 0x10,
++      0x73, 0xa6, 0x72, 0x76, 0x27, 0x09, 0x7a, 0x10,
++      0x49, 0xe6, 0x17, 0xd9, 0x1d, 0x36, 0x10, 0x94,
++      0xfa, 0x68, 0xf0, 0xff, 0x77, 0x98, 0x71, 0x30,
++      0x30, 0x5b, 0xea, 0xba, 0x2e, 0xda, 0x04, 0xdf,
++      0x99, 0x7b, 0x71, 0x4d, 0x6c, 0x6f, 0x2c, 0x29,
++      0xa6, 0xad, 0x5c, 0xb4, 0x02, 0x2b, 0x02, 0x70,
++      0x9b, 0xee, 0xad, 0x9d, 0x67, 0x89, 0x0c, 0xbb,
++      0x22, 0x39, 0x23, 0x36, 0xfe, 0xa1, 0x85, 0x1f,
++      0x38
++};
++static const u8 enc_assoc001[] __initconst = {
++      0xf3, 0x33, 0x88, 0x86, 0x00, 0x00, 0x00, 0x00,
++      0x00, 0x00, 0x4e, 0x91
++};
++static const u8 enc_nonce001[] __initconst = {
++      0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08
++};
++static const u8 enc_key001[] __initconst = {
++      0x1c, 0x92, 0x40, 0xa5, 0xeb, 0x55, 0xd3, 0x8a,
++      0xf3, 0x33, 0x88, 0x86, 0x04, 0xf6, 0xb5, 0xf0,
++      0x47, 0x39, 0x17, 0xc1, 0x40, 0x2b, 0x80, 0x09,
++      0x9d, 0xca, 0x5c, 0xbc, 0x20, 0x70, 0x75, 0xc0
++};
++
++static const u8 enc_input002[] __initconst = { };
++static const u8 enc_output002[] __initconst = {
++      0xea, 0xe0, 0x1e, 0x9e, 0x2c, 0x91, 0xaa, 0xe1,
++      0xdb, 0x5d, 0x99, 0x3f, 0x8a, 0xf7, 0x69, 0x92
++};
++static const u8 enc_assoc002[] __initconst = { };
++static const u8 enc_nonce002[] __initconst = {
++      0xca, 0xbf, 0x33, 0x71, 0x32, 0x45, 0x77, 0x8e
++};
++static const u8 enc_key002[] __initconst = {
++      0x4c, 0xf5, 0x96, 0x83, 0x38, 0xe6, 0xae, 0x7f,
++      0x2d, 0x29, 0x25, 0x76, 0xd5, 0x75, 0x27, 0x86,
++      0x91, 0x9a, 0x27, 0x7a, 0xfb, 0x46, 0xc5, 0xef,
++      0x94, 0x81, 0x79, 0x57, 0x14, 0x59, 0x40, 0x68
++};
++
++static const u8 enc_input003[] __initconst = { };
++static const u8 enc_output003[] __initconst = {
++      0xdd, 0x6b, 0x3b, 0x82, 0xce, 0x5a, 0xbd, 0xd6,
++      0xa9, 0x35, 0x83, 0xd8, 0x8c, 0x3d, 0x85, 0x77
++};
++static const u8 enc_assoc003[] __initconst = {
++      0x33, 0x10, 0x41, 0x12, 0x1f, 0xf3, 0xd2, 0x6b
++};
++static const u8 enc_nonce003[] __initconst = {
++      0x3d, 0x86, 0xb5, 0x6b, 0xc8, 0xa3, 0x1f, 0x1d
++};
++static const u8 enc_key003[] __initconst = {
++      0x2d, 0xb0, 0x5d, 0x40, 0xc8, 0xed, 0x44, 0x88,
++      0x34, 0xd1, 0x13, 0xaf, 0x57, 0xa1, 0xeb, 0x3a,
++      0x2a, 0x80, 0x51, 0x36, 0xec, 0x5b, 0xbc, 0x08,
++      0x93, 0x84, 0x21, 0xb5, 0x13, 0x88, 0x3c, 0x0d
++};
++
++static const u8 enc_input004[] __initconst = {
++      0xa4
++};
++static const u8 enc_output004[] __initconst = {
++      0xb7, 0x1b, 0xb0, 0x73, 0x59, 0xb0, 0x84, 0xb2,
++      0x6d, 0x8e, 0xab, 0x94, 0x31, 0xa1, 0xae, 0xac,
++      0x89
++};
++static const u8 enc_assoc004[] __initconst = {
++      0x6a, 0xe2, 0xad, 0x3f, 0x88, 0x39, 0x5a, 0x40
++};
++static const u8 enc_nonce004[] __initconst = {
++      0xd2, 0x32, 0x1f, 0x29, 0x28, 0xc6, 0xc4, 0xc4
++};
++static const u8 enc_key004[] __initconst = {
++      0x4b, 0x28, 0x4b, 0xa3, 0x7b, 0xbe, 0xe9, 0xf8,
++      0x31, 0x80, 0x82, 0xd7, 0xd8, 0xe8, 0xb5, 0xa1,
++      0xe2, 0x18, 0x18, 0x8a, 0x9c, 0xfa, 0xa3, 0x3d,
++      0x25, 0x71, 0x3e, 0x40, 0xbc, 0x54, 0x7a, 0x3e
++};
++
++static const u8 enc_input005[] __initconst = {
++      0x2d
++};
++static const u8 enc_output005[] __initconst = {
++      0xbf, 0xe1, 0x5b, 0x0b, 0xdb, 0x6b, 0xf5, 0x5e,
++      0x6c, 0x5d, 0x84, 0x44, 0x39, 0x81, 0xc1, 0x9c,
++      0xac
++};
++static const u8 enc_assoc005[] __initconst = { };
++static const u8 enc_nonce005[] __initconst = {
++      0x20, 0x1c, 0xaa, 0x5f, 0x9c, 0xbf, 0x92, 0x30
++};
++static const u8 enc_key005[] __initconst = {
++      0x66, 0xca, 0x9c, 0x23, 0x2a, 0x4b, 0x4b, 0x31,
++      0x0e, 0x92, 0x89, 0x8b, 0xf4, 0x93, 0xc7, 0x87,
++      0x98, 0xa3, 0xd8, 0x39, 0xf8, 0xf4, 0xa7, 0x01,
++      0xc0, 0x2e, 0x0a, 0xa6, 0x7e, 0x5a, 0x78, 0x87
++};
++
++static const u8 enc_input006[] __initconst = {
++      0x33, 0x2f, 0x94, 0xc1, 0xa4, 0xef, 0xcc, 0x2a,
++      0x5b, 0xa6, 0xe5, 0x8f, 0x1d, 0x40, 0xf0, 0x92,
++      0x3c, 0xd9, 0x24, 0x11, 0xa9, 0x71, 0xf9, 0x37,
++      0x14, 0x99, 0xfa, 0xbe, 0xe6, 0x80, 0xde, 0x50,
++      0xc9, 0x96, 0xd4, 0xb0, 0xec, 0x9e, 0x17, 0xec,
++      0xd2, 0x5e, 0x72, 0x99, 0xfc, 0x0a, 0xe1, 0xcb,
++      0x48, 0xd2, 0x85, 0xdd, 0x2f, 0x90, 0xe0, 0x66,
++      0x3b, 0xe6, 0x20, 0x74, 0xbe, 0x23, 0x8f, 0xcb,
++      0xb4, 0xe4, 0xda, 0x48, 0x40, 0xa6, 0xd1, 0x1b,
++      0xc7, 0x42, 0xce, 0x2f, 0x0c, 0xa6, 0x85, 0x6e,
++      0x87, 0x37, 0x03, 0xb1, 0x7c, 0x25, 0x96, 0xa3,
++      0x05, 0xd8, 0xb0, 0xf4, 0xed, 0xea, 0xc2, 0xf0,
++      0x31, 0x98, 0x6c, 0xd1, 0x14, 0x25, 0xc0, 0xcb,
++      0x01, 0x74, 0xd0, 0x82, 0xf4, 0x36, 0xf5, 0x41,
++      0xd5, 0xdc, 0xca, 0xc5, 0xbb, 0x98, 0xfe, 0xfc,
++      0x69, 0x21, 0x70, 0xd8, 0xa4, 0x4b, 0xc8, 0xde,
++      0x8f
++};
++static const u8 enc_output006[] __initconst = {
++      0x8b, 0x06, 0xd3, 0x31, 0xb0, 0x93, 0x45, 0xb1,
++      0x75, 0x6e, 0x26, 0xf9, 0x67, 0xbc, 0x90, 0x15,
++      0x81, 0x2c, 0xb5, 0xf0, 0xc6, 0x2b, 0xc7, 0x8c,
++      0x56, 0xd1, 0xbf, 0x69, 0x6c, 0x07, 0xa0, 0xda,
++      0x65, 0x27, 0xc9, 0x90, 0x3d, 0xef, 0x4b, 0x11,
++      0x0f, 0x19, 0x07, 0xfd, 0x29, 0x92, 0xd9, 0xc8,
++      0xf7, 0x99, 0x2e, 0x4a, 0xd0, 0xb8, 0x2c, 0xdc,
++      0x93, 0xf5, 0x9e, 0x33, 0x78, 0xd1, 0x37, 0xc3,
++      0x66, 0xd7, 0x5e, 0xbc, 0x44, 0xbf, 0x53, 0xa5,
++      0xbc, 0xc4, 0xcb, 0x7b, 0x3a, 0x8e, 0x7f, 0x02,
++      0xbd, 0xbb, 0xe7, 0xca, 0xa6, 0x6c, 0x6b, 0x93,
++      0x21, 0x93, 0x10, 0x61, 0xe7, 0x69, 0xd0, 0x78,
++      0xf3, 0x07, 0x5a, 0x1a, 0x8f, 0x73, 0xaa, 0xb1,
++      0x4e, 0xd3, 0xda, 0x4f, 0xf3, 0x32, 0xe1, 0x66,
++      0x3e, 0x6c, 0xc6, 0x13, 0xba, 0x06, 0x5b, 0xfc,
++      0x6a, 0xe5, 0x6f, 0x60, 0xfb, 0x07, 0x40, 0xb0,
++      0x8c, 0x9d, 0x84, 0x43, 0x6b, 0xc1, 0xf7, 0x8d,
++      0x8d, 0x31, 0xf7, 0x7a, 0x39, 0x4d, 0x8f, 0x9a,
++      0xeb
++};
++static const u8 enc_assoc006[] __initconst = {
++      0x70, 0xd3, 0x33, 0xf3, 0x8b, 0x18, 0x0b
++};
++static const u8 enc_nonce006[] __initconst = {
++      0xdf, 0x51, 0x84, 0x82, 0x42, 0x0c, 0x75, 0x9c
++};
++static const u8 enc_key006[] __initconst = {
++      0x68, 0x7b, 0x8d, 0x8e, 0xe3, 0xc4, 0xdd, 0xae,
++      0xdf, 0x72, 0x7f, 0x53, 0x72, 0x25, 0x1e, 0x78,
++      0x91, 0xcb, 0x69, 0x76, 0x1f, 0x49, 0x93, 0xf9,
++      0x6f, 0x21, 0xcc, 0x39, 0x9c, 0xad, 0xb1, 0x01
++};
++
++static const u8 enc_input007[] __initconst = {
++      0x9b, 0x18, 0xdb, 0xdd, 0x9a, 0x0f, 0x3e, 0xa5,
++      0x15, 0x17, 0xde, 0xdf, 0x08, 0x9d, 0x65, 0x0a,
++      0x67, 0x30, 0x12, 0xe2, 0x34, 0x77, 0x4b, 0xc1,
++      0xd9, 0xc6, 0x1f, 0xab, 0xc6, 0x18, 0x50, 0x17,
++      0xa7, 0x9d, 0x3c, 0xa6, 0xc5, 0x35, 0x8c, 0x1c,
++      0xc0, 0xa1, 0x7c, 0x9f, 0x03, 0x89, 0xca, 0xe1,
++      0xe6, 0xe9, 0xd4, 0xd3, 0x88, 0xdb, 0xb4, 0x51,
++      0x9d, 0xec, 0xb4, 0xfc, 0x52, 0xee, 0x6d, 0xf1,
++      0x75, 0x42, 0xc6, 0xfd, 0xbd, 0x7a, 0x8e, 0x86,
++      0xfc, 0x44, 0xb3, 0x4f, 0xf3, 0xea, 0x67, 0x5a,
++      0x41, 0x13, 0xba, 0xb0, 0xdc, 0xe1, 0xd3, 0x2a,
++      0x7c, 0x22, 0xb3, 0xca, 0xac, 0x6a, 0x37, 0x98,
++      0x3e, 0x1d, 0x40, 0x97, 0xf7, 0x9b, 0x1d, 0x36,
++      0x6b, 0xb3, 0x28, 0xbd, 0x60, 0x82, 0x47, 0x34,
++      0xaa, 0x2f, 0x7d, 0xe9, 0xa8, 0x70, 0x81, 0x57,
++      0xd4, 0xb9, 0x77, 0x0a, 0x9d, 0x29, 0xa7, 0x84,
++      0x52, 0x4f, 0xc2, 0x4a, 0x40, 0x3b, 0x3c, 0xd4,
++      0xc9, 0x2a, 0xdb, 0x4a, 0x53, 0xc4, 0xbe, 0x80,
++      0xe9, 0x51, 0x7f, 0x8f, 0xc7, 0xa2, 0xce, 0x82,
++      0x5c, 0x91, 0x1e, 0x74, 0xd9, 0xd0, 0xbd, 0xd5,
++      0xf3, 0xfd, 0xda, 0x4d, 0x25, 0xb4, 0xbb, 0x2d,
++      0xac, 0x2f, 0x3d, 0x71, 0x85, 0x7b, 0xcf, 0x3c,
++      0x7b, 0x3e, 0x0e, 0x22, 0x78, 0x0c, 0x29, 0xbf,
++      0xe4, 0xf4, 0x57, 0xb3, 0xcb, 0x49, 0xa0, 0xfc,
++      0x1e, 0x05, 0x4e, 0x16, 0xbc, 0xd5, 0xa8, 0xa3,
++      0xee, 0x05, 0x35, 0xc6, 0x7c, 0xab, 0x60, 0x14,
++      0x55, 0x1a, 0x8e, 0xc5, 0x88, 0x5d, 0xd5, 0x81,
++      0xc2, 0x81, 0xa5, 0xc4, 0x60, 0xdb, 0xaf, 0x77,
++      0x91, 0xe1, 0xce, 0xa2, 0x7e, 0x7f, 0x42, 0xe3,
++      0xb0, 0x13, 0x1c, 0x1f, 0x25, 0x60, 0x21, 0xe2,
++      0x40, 0x5f, 0x99, 0xb7, 0x73, 0xec, 0x9b, 0x2b,
++      0xf0, 0x65, 0x11, 0xc8, 0xd0, 0x0a, 0x9f, 0xd3
++};
++static const u8 enc_output007[] __initconst = {
++      0x85, 0x04, 0xc2, 0xed, 0x8d, 0xfd, 0x97, 0x5c,
++      0xd2, 0xb7, 0xe2, 0xc1, 0x6b, 0xa3, 0xba, 0xf8,
++      0xc9, 0x50, 0xc3, 0xc6, 0xa5, 0xe3, 0xa4, 0x7c,
++      0xc3, 0x23, 0x49, 0x5e, 0xa9, 0xb9, 0x32, 0xeb,
++      0x8a, 0x7c, 0xca, 0xe5, 0xec, 0xfb, 0x7c, 0xc0,
++      0xcb, 0x7d, 0xdc, 0x2c, 0x9d, 0x92, 0x55, 0x21,
++      0x0a, 0xc8, 0x43, 0x63, 0x59, 0x0a, 0x31, 0x70,
++      0x82, 0x67, 0x41, 0x03, 0xf8, 0xdf, 0xf2, 0xac,
++      0xa7, 0x02, 0xd4, 0xd5, 0x8a, 0x2d, 0xc8, 0x99,
++      0x19, 0x66, 0xd0, 0xf6, 0x88, 0x2c, 0x77, 0xd9,
++      0xd4, 0x0d, 0x6c, 0xbd, 0x98, 0xde, 0xe7, 0x7f,
++      0xad, 0x7e, 0x8a, 0xfb, 0xe9, 0x4b, 0xe5, 0xf7,
++      0xe5, 0x50, 0xa0, 0x90, 0x3f, 0xd6, 0x22, 0x53,
++      0xe3, 0xfe, 0x1b, 0xcc, 0x79, 0x3b, 0xec, 0x12,
++      0x47, 0x52, 0xa7, 0xd6, 0x04, 0xe3, 0x52, 0xe6,
++      0x93, 0x90, 0x91, 0x32, 0x73, 0x79, 0xb8, 0xd0,
++      0x31, 0xde, 0x1f, 0x9f, 0x2f, 0x05, 0x38, 0x54,
++      0x2f, 0x35, 0x04, 0x39, 0xe0, 0xa7, 0xba, 0xc6,
++      0x52, 0xf6, 0x37, 0x65, 0x4c, 0x07, 0xa9, 0x7e,
++      0xb3, 0x21, 0x6f, 0x74, 0x8c, 0xc9, 0xde, 0xdb,
++      0x65, 0x1b, 0x9b, 0xaa, 0x60, 0xb1, 0x03, 0x30,
++      0x6b, 0xb2, 0x03, 0xc4, 0x1c, 0x04, 0xf8, 0x0f,
++      0x64, 0xaf, 0x46, 0xe4, 0x65, 0x99, 0x49, 0xe2,
++      0xea, 0xce, 0x78, 0x00, 0xd8, 0x8b, 0xd5, 0x2e,
++      0xcf, 0xfc, 0x40, 0x49, 0xe8, 0x58, 0xdc, 0x34,
++      0x9c, 0x8c, 0x61, 0xbf, 0x0a, 0x8e, 0xec, 0x39,
++      0xa9, 0x30, 0x05, 0x5a, 0xd2, 0x56, 0x01, 0xc7,
++      0xda, 0x8f, 0x4e, 0xbb, 0x43, 0xa3, 0x3a, 0xf9,
++      0x15, 0x2a, 0xd0, 0xa0, 0x7a, 0x87, 0x34, 0x82,
++      0xfe, 0x8a, 0xd1, 0x2d, 0x5e, 0xc7, 0xbf, 0x04,
++      0x53, 0x5f, 0x3b, 0x36, 0xd4, 0x25, 0x5c, 0x34,
++      0x7a, 0x8d, 0xd5, 0x05, 0xce, 0x72, 0xca, 0xef,
++      0x7a, 0x4b, 0xbc, 0xb0, 0x10, 0x5c, 0x96, 0x42,
++      0x3a, 0x00, 0x98, 0xcd, 0x15, 0xe8, 0xb7, 0x53
++};
++static const u8 enc_assoc007[] __initconst = { };
++static const u8 enc_nonce007[] __initconst = {
++      0xde, 0x7b, 0xef, 0xc3, 0x65, 0x1b, 0x68, 0xb0
++};
++static const u8 enc_key007[] __initconst = {
++      0x8d, 0xb8, 0x91, 0x48, 0xf0, 0xe7, 0x0a, 0xbd,
++      0xf9, 0x3f, 0xcd, 0xd9, 0xa0, 0x1e, 0x42, 0x4c,
++      0xe7, 0xde, 0x25, 0x3d, 0xa3, 0xd7, 0x05, 0x80,
++      0x8d, 0xf2, 0x82, 0xac, 0x44, 0x16, 0x51, 0x01
++};
++
++static const u8 enc_input008[] __initconst = {
++      0xc3, 0x09, 0x94, 0x62, 0xe6, 0x46, 0x2e, 0x10,
++      0xbe, 0x00, 0xe4, 0xfc, 0xf3, 0x40, 0xa3, 0xe2,
++      0x0f, 0xc2, 0x8b, 0x28, 0xdc, 0xba, 0xb4, 0x3c,
++      0xe4, 0x21, 0x58, 0x61, 0xcd, 0x8b, 0xcd, 0xfb,
++      0xac, 0x94, 0xa1, 0x45, 0xf5, 0x1c, 0xe1, 0x12,
++      0xe0, 0x3b, 0x67, 0x21, 0x54, 0x5e, 0x8c, 0xaa,
++      0xcf, 0xdb, 0xb4, 0x51, 0xd4, 0x13, 0xda, 0xe6,
++      0x83, 0x89, 0xb6, 0x92, 0xe9, 0x21, 0x76, 0xa4,
++      0x93, 0x7d, 0x0e, 0xfd, 0x96, 0x36, 0x03, 0x91,
++      0x43, 0x5c, 0x92, 0x49, 0x62, 0x61, 0x7b, 0xeb,
++      0x43, 0x89, 0xb8, 0x12, 0x20, 0x43, 0xd4, 0x47,
++      0x06, 0x84, 0xee, 0x47, 0xe9, 0x8a, 0x73, 0x15,
++      0x0f, 0x72, 0xcf, 0xed, 0xce, 0x96, 0xb2, 0x7f,
++      0x21, 0x45, 0x76, 0xeb, 0x26, 0x28, 0x83, 0x6a,
++      0xad, 0xaa, 0xa6, 0x81, 0xd8, 0x55, 0xb1, 0xa3,
++      0x85, 0xb3, 0x0c, 0xdf, 0xf1, 0x69, 0x2d, 0x97,
++      0x05, 0x2a, 0xbc, 0x7c, 0x7b, 0x25, 0xf8, 0x80,
++      0x9d, 0x39, 0x25, 0xf3, 0x62, 0xf0, 0x66, 0x5e,
++      0xf4, 0xa0, 0xcf, 0xd8, 0xfd, 0x4f, 0xb1, 0x1f,
++      0x60, 0x3a, 0x08, 0x47, 0xaf, 0xe1, 0xf6, 0x10,
++      0x77, 0x09, 0xa7, 0x27, 0x8f, 0x9a, 0x97, 0x5a,
++      0x26, 0xfa, 0xfe, 0x41, 0x32, 0x83, 0x10, 0xe0,
++      0x1d, 0xbf, 0x64, 0x0d, 0xf4, 0x1c, 0x32, 0x35,
++      0xe5, 0x1b, 0x36, 0xef, 0xd4, 0x4a, 0x93, 0x4d,
++      0x00, 0x7c, 0xec, 0x02, 0x07, 0x8b, 0x5d, 0x7d,
++      0x1b, 0x0e, 0xd1, 0xa6, 0xa5, 0x5d, 0x7d, 0x57,
++      0x88, 0xa8, 0xcc, 0x81, 0xb4, 0x86, 0x4e, 0xb4,
++      0x40, 0xe9, 0x1d, 0xc3, 0xb1, 0x24, 0x3e, 0x7f,
++      0xcc, 0x8a, 0x24, 0x9b, 0xdf, 0x6d, 0xf0, 0x39,
++      0x69, 0x3e, 0x4c, 0xc0, 0x96, 0xe4, 0x13, 0xda,
++      0x90, 0xda, 0xf4, 0x95, 0x66, 0x8b, 0x17, 0x17,
++      0xfe, 0x39, 0x43, 0x25, 0xaa, 0xda, 0xa0, 0x43,
++      0x3c, 0xb1, 0x41, 0x02, 0xa3, 0xf0, 0xa7, 0x19,
++      0x59, 0xbc, 0x1d, 0x7d, 0x6c, 0x6d, 0x91, 0x09,
++      0x5c, 0xb7, 0x5b, 0x01, 0xd1, 0x6f, 0x17, 0x21,
++      0x97, 0xbf, 0x89, 0x71, 0xa5, 0xb0, 0x6e, 0x07,
++      0x45, 0xfd, 0x9d, 0xea, 0x07, 0xf6, 0x7a, 0x9f,
++      0x10, 0x18, 0x22, 0x30, 0x73, 0xac, 0xd4, 0x6b,
++      0x72, 0x44, 0xed, 0xd9, 0x19, 0x9b, 0x2d, 0x4a,
++      0x41, 0xdd, 0xd1, 0x85, 0x5e, 0x37, 0x19, 0xed,
++      0xd2, 0x15, 0x8f, 0x5e, 0x91, 0xdb, 0x33, 0xf2,
++      0xe4, 0xdb, 0xff, 0x98, 0xfb, 0xa3, 0xb5, 0xca,
++      0x21, 0x69, 0x08, 0xe7, 0x8a, 0xdf, 0x90, 0xff,
++      0x3e, 0xe9, 0x20, 0x86, 0x3c, 0xe9, 0xfc, 0x0b,
++      0xfe, 0x5c, 0x61, 0xaa, 0x13, 0x92, 0x7f, 0x7b,
++      0xec, 0xe0, 0x6d, 0xa8, 0x23, 0x22, 0xf6, 0x6b,
++      0x77, 0xc4, 0xfe, 0x40, 0x07, 0x3b, 0xb6, 0xf6,
++      0x8e, 0x5f, 0xd4, 0xb9, 0xb7, 0x0f, 0x21, 0x04,
++      0xef, 0x83, 0x63, 0x91, 0x69, 0x40, 0xa3, 0x48,
++      0x5c, 0xd2, 0x60, 0xf9, 0x4f, 0x6c, 0x47, 0x8b,
++      0x3b, 0xb1, 0x9f, 0x8e, 0xee, 0x16, 0x8a, 0x13,
++      0xfc, 0x46, 0x17, 0xc3, 0xc3, 0x32, 0x56, 0xf8,
++      0x3c, 0x85, 0x3a, 0xb6, 0x3e, 0xaa, 0x89, 0x4f,
++      0xb3, 0xdf, 0x38, 0xfd, 0xf1, 0xe4, 0x3a, 0xc0,
++      0xe6, 0x58, 0xb5, 0x8f, 0xc5, 0x29, 0xa2, 0x92,
++      0x4a, 0xb6, 0xa0, 0x34, 0x7f, 0xab, 0xb5, 0x8a,
++      0x90, 0xa1, 0xdb, 0x4d, 0xca, 0xb6, 0x2c, 0x41,
++      0x3c, 0xf7, 0x2b, 0x21, 0xc3, 0xfd, 0xf4, 0x17,
++      0x5c, 0xb5, 0x33, 0x17, 0x68, 0x2b, 0x08, 0x30,
++      0xf3, 0xf7, 0x30, 0x3c, 0x96, 0xe6, 0x6a, 0x20,
++      0x97, 0xe7, 0x4d, 0x10, 0x5f, 0x47, 0x5f, 0x49,
++      0x96, 0x09, 0xf0, 0x27, 0x91, 0xc8, 0xf8, 0x5a,
++      0x2e, 0x79, 0xb5, 0xe2, 0xb8, 0xe8, 0xb9, 0x7b,
++      0xd5, 0x10, 0xcb, 0xff, 0x5d, 0x14, 0x73, 0xf3
++};
++static const u8 enc_output008[] __initconst = {
++      0x14, 0xf6, 0x41, 0x37, 0xa6, 0xd4, 0x27, 0xcd,
++      0xdb, 0x06, 0x3e, 0x9a, 0x4e, 0xab, 0xd5, 0xb1,
++      0x1e, 0x6b, 0xd2, 0xbc, 0x11, 0xf4, 0x28, 0x93,
++      0x63, 0x54, 0xef, 0xbb, 0x5e, 0x1d, 0x3a, 0x1d,
++      0x37, 0x3c, 0x0a, 0x6c, 0x1e, 0xc2, 0xd1, 0x2c,
++      0xb5, 0xa3, 0xb5, 0x7b, 0xb8, 0x8f, 0x25, 0xa6,
++      0x1b, 0x61, 0x1c, 0xec, 0x28, 0x58, 0x26, 0xa4,
++      0xa8, 0x33, 0x28, 0x25, 0x5c, 0x45, 0x05, 0xe5,
++      0x6c, 0x99, 0xe5, 0x45, 0xc4, 0xa2, 0x03, 0x84,
++      0x03, 0x73, 0x1e, 0x8c, 0x49, 0xac, 0x20, 0xdd,
++      0x8d, 0xb3, 0xc4, 0xf5, 0xe7, 0x4f, 0xf1, 0xed,
++      0xa1, 0x98, 0xde, 0xa4, 0x96, 0xdd, 0x2f, 0xab,
++      0xab, 0x97, 0xcf, 0x3e, 0xd2, 0x9e, 0xb8, 0x13,
++      0x07, 0x28, 0x29, 0x19, 0xaf, 0xfd, 0xf2, 0x49,
++      0x43, 0xea, 0x49, 0x26, 0x91, 0xc1, 0x07, 0xd6,
++      0xbb, 0x81, 0x75, 0x35, 0x0d, 0x24, 0x7f, 0xc8,
++      0xda, 0xd4, 0xb7, 0xeb, 0xe8, 0x5c, 0x09, 0xa2,
++      0x2f, 0xdc, 0x28, 0x7d, 0x3a, 0x03, 0xfa, 0x94,
++      0xb5, 0x1d, 0x17, 0x99, 0x36, 0xc3, 0x1c, 0x18,
++      0x34, 0xe3, 0x9f, 0xf5, 0x55, 0x7c, 0xb0, 0x60,
++      0x9d, 0xff, 0xac, 0xd4, 0x61, 0xf2, 0xad, 0xf8,
++      0xce, 0xc7, 0xbe, 0x5c, 0xd2, 0x95, 0xa8, 0x4b,
++      0x77, 0x13, 0x19, 0x59, 0x26, 0xc9, 0xb7, 0x8f,
++      0x6a, 0xcb, 0x2d, 0x37, 0x91, 0xea, 0x92, 0x9c,
++      0x94, 0x5b, 0xda, 0x0b, 0xce, 0xfe, 0x30, 0x20,
++      0xf8, 0x51, 0xad, 0xf2, 0xbe, 0xe7, 0xc7, 0xff,
++      0xb3, 0x33, 0x91, 0x6a, 0xc9, 0x1a, 0x41, 0xc9,
++      0x0f, 0xf3, 0x10, 0x0e, 0xfd, 0x53, 0xff, 0x6c,
++      0x16, 0x52, 0xd9, 0xf3, 0xf7, 0x98, 0x2e, 0xc9,
++      0x07, 0x31, 0x2c, 0x0c, 0x72, 0xd7, 0xc5, 0xc6,
++      0x08, 0x2a, 0x7b, 0xda, 0xbd, 0x7e, 0x02, 0xea,
++      0x1a, 0xbb, 0xf2, 0x04, 0x27, 0x61, 0x28, 0x8e,
++      0xf5, 0x04, 0x03, 0x1f, 0x4c, 0x07, 0x55, 0x82,
++      0xec, 0x1e, 0xd7, 0x8b, 0x2f, 0x65, 0x56, 0xd1,
++      0xd9, 0x1e, 0x3c, 0xe9, 0x1f, 0x5e, 0x98, 0x70,
++      0x38, 0x4a, 0x8c, 0x49, 0xc5, 0x43, 0xa0, 0xa1,
++      0x8b, 0x74, 0x9d, 0x4c, 0x62, 0x0d, 0x10, 0x0c,
++      0xf4, 0x6c, 0x8f, 0xe0, 0xaa, 0x9a, 0x8d, 0xb7,
++      0xe0, 0xbe, 0x4c, 0x87, 0xf1, 0x98, 0x2f, 0xcc,
++      0xed, 0xc0, 0x52, 0x29, 0xdc, 0x83, 0xf8, 0xfc,
++      0x2c, 0x0e, 0xa8, 0x51, 0x4d, 0x80, 0x0d, 0xa3,
++      0xfe, 0xd8, 0x37, 0xe7, 0x41, 0x24, 0xfc, 0xfb,
++      0x75, 0xe3, 0x71, 0x7b, 0x57, 0x45, 0xf5, 0x97,
++      0x73, 0x65, 0x63, 0x14, 0x74, 0xb8, 0x82, 0x9f,
++      0xf8, 0x60, 0x2f, 0x8a, 0xf2, 0x4e, 0xf1, 0x39,
++      0xda, 0x33, 0x91, 0xf8, 0x36, 0xe0, 0x8d, 0x3f,
++      0x1f, 0x3b, 0x56, 0xdc, 0xa0, 0x8f, 0x3c, 0x9d,
++      0x71, 0x52, 0xa7, 0xb8, 0xc0, 0xa5, 0xc6, 0xa2,
++      0x73, 0xda, 0xf4, 0x4b, 0x74, 0x5b, 0x00, 0x3d,
++      0x99, 0xd7, 0x96, 0xba, 0xe6, 0xe1, 0xa6, 0x96,
++      0x38, 0xad, 0xb3, 0xc0, 0xd2, 0xba, 0x91, 0x6b,
++      0xf9, 0x19, 0xdd, 0x3b, 0xbe, 0xbe, 0x9c, 0x20,
++      0x50, 0xba, 0xa1, 0xd0, 0xce, 0x11, 0xbd, 0x95,
++      0xd8, 0xd1, 0xdd, 0x33, 0x85, 0x74, 0xdc, 0xdb,
++      0x66, 0x76, 0x44, 0xdc, 0x03, 0x74, 0x48, 0x35,
++      0x98, 0xb1, 0x18, 0x47, 0x94, 0x7d, 0xff, 0x62,
++      0xe4, 0x58, 0x78, 0xab, 0xed, 0x95, 0x36, 0xd9,
++      0x84, 0x91, 0x82, 0x64, 0x41, 0xbb, 0x58, 0xe6,
++      0x1c, 0x20, 0x6d, 0x15, 0x6b, 0x13, 0x96, 0xe8,
++      0x35, 0x7f, 0xdc, 0x40, 0x2c, 0xe9, 0xbc, 0x8a,
++      0x4f, 0x92, 0xec, 0x06, 0x2d, 0x50, 0xdf, 0x93,
++      0x5d, 0x65, 0x5a, 0xa8, 0xfc, 0x20, 0x50, 0x14,
++      0xa9, 0x8a, 0x7e, 0x1d, 0x08, 0x1f, 0xe2, 0x99,
++      0xd0, 0xbe, 0xfb, 0x3a, 0x21, 0x9d, 0xad, 0x86,
++      0x54, 0xfd, 0x0d, 0x98, 0x1c, 0x5a, 0x6f, 0x1f,
++      0x9a, 0x40, 0xcd, 0xa2, 0xff, 0x6a, 0xf1, 0x54
++};
++static const u8 enc_assoc008[] __initconst = { };
++static const u8 enc_nonce008[] __initconst = {
++      0x0e, 0x0d, 0x57, 0xbb, 0x7b, 0x40, 0x54, 0x02
++};
++static const u8 enc_key008[] __initconst = {
++      0xf2, 0xaa, 0x4f, 0x99, 0xfd, 0x3e, 0xa8, 0x53,
++      0xc1, 0x44, 0xe9, 0x81, 0x18, 0xdc, 0xf5, 0xf0,
++      0x3e, 0x44, 0x15, 0x59, 0xe0, 0xc5, 0x44, 0x86,
++      0xc3, 0x91, 0xa8, 0x75, 0xc0, 0x12, 0x46, 0xba
++};
++
++static const u8 enc_input009[] __initconst = {
++      0xe6, 0xc3, 0xdb, 0x63, 0x55, 0x15, 0xe3, 0x5b,
++      0xb7, 0x4b, 0x27, 0x8b, 0x5a, 0xdd, 0xc2, 0xe8,
++      0x3a, 0x6b, 0xd7, 0x81, 0x96, 0x35, 0x97, 0xca,
++      0xd7, 0x68, 0xe8, 0xef, 0xce, 0xab, 0xda, 0x09,
++      0x6e, 0xd6, 0x8e, 0xcb, 0x55, 0xb5, 0xe1, 0xe5,
++      0x57, 0xfd, 0xc4, 0xe3, 0xe0, 0x18, 0x4f, 0x85,
++      0xf5, 0x3f, 0x7e, 0x4b, 0x88, 0xc9, 0x52, 0x44,
++      0x0f, 0xea, 0xaf, 0x1f, 0x71, 0x48, 0x9f, 0x97,
++      0x6d, 0xb9, 0x6f, 0x00, 0xa6, 0xde, 0x2b, 0x77,
++      0x8b, 0x15, 0xad, 0x10, 0xa0, 0x2b, 0x7b, 0x41,
++      0x90, 0x03, 0x2d, 0x69, 0xae, 0xcc, 0x77, 0x7c,
++      0xa5, 0x9d, 0x29, 0x22, 0xc2, 0xea, 0xb4, 0x00,
++      0x1a, 0xd2, 0x7a, 0x98, 0x8a, 0xf9, 0xf7, 0x82,
++      0xb0, 0xab, 0xd8, 0xa6, 0x94, 0x8d, 0x58, 0x2f,
++      0x01, 0x9e, 0x00, 0x20, 0xfc, 0x49, 0xdc, 0x0e,
++      0x03, 0xe8, 0x45, 0x10, 0xd6, 0xa8, 0xda, 0x55,
++      0x10, 0x9a, 0xdf, 0x67, 0x22, 0x8b, 0x43, 0xab,
++      0x00, 0xbb, 0x02, 0xc8, 0xdd, 0x7b, 0x97, 0x17,
++      0xd7, 0x1d, 0x9e, 0x02, 0x5e, 0x48, 0xde, 0x8e,
++      0xcf, 0x99, 0x07, 0x95, 0x92, 0x3c, 0x5f, 0x9f,
++      0xc5, 0x8a, 0xc0, 0x23, 0xaa, 0xd5, 0x8c, 0x82,
++      0x6e, 0x16, 0x92, 0xb1, 0x12, 0x17, 0x07, 0xc3,
++      0xfb, 0x36, 0xf5, 0x6c, 0x35, 0xd6, 0x06, 0x1f,
++      0x9f, 0xa7, 0x94, 0xa2, 0x38, 0x63, 0x9c, 0xb0,
++      0x71, 0xb3, 0xa5, 0xd2, 0xd8, 0xba, 0x9f, 0x08,
++      0x01, 0xb3, 0xff, 0x04, 0x97, 0x73, 0x45, 0x1b,
++      0xd5, 0xa9, 0x9c, 0x80, 0xaf, 0x04, 0x9a, 0x85,
++      0xdb, 0x32, 0x5b, 0x5d, 0x1a, 0xc1, 0x36, 0x28,
++      0x10, 0x79, 0xf1, 0x3c, 0xbf, 0x1a, 0x41, 0x5c,
++      0x4e, 0xdf, 0xb2, 0x7c, 0x79, 0x3b, 0x7a, 0x62,
++      0x3d, 0x4b, 0xc9, 0x9b, 0x2a, 0x2e, 0x7c, 0xa2,
++      0xb1, 0x11, 0x98, 0xa7, 0x34, 0x1a, 0x00, 0xf3,
++      0xd1, 0xbc, 0x18, 0x22, 0xba, 0x02, 0x56, 0x62,
++      0x31, 0x10, 0x11, 0x6d, 0xe0, 0x54, 0x9d, 0x40,
++      0x1f, 0x26, 0x80, 0x41, 0xca, 0x3f, 0x68, 0x0f,
++      0x32, 0x1d, 0x0a, 0x8e, 0x79, 0xd8, 0xa4, 0x1b,
++      0x29, 0x1c, 0x90, 0x8e, 0xc5, 0xe3, 0xb4, 0x91,
++      0x37, 0x9a, 0x97, 0x86, 0x99, 0xd5, 0x09, 0xc5,
++      0xbb, 0xa3, 0x3f, 0x21, 0x29, 0x82, 0x14, 0x5c,
++      0xab, 0x25, 0xfb, 0xf2, 0x4f, 0x58, 0x26, 0xd4,
++      0x83, 0xaa, 0x66, 0x89, 0x67, 0x7e, 0xc0, 0x49,
++      0xe1, 0x11, 0x10, 0x7f, 0x7a, 0xda, 0x29, 0x04,
++      0xff, 0xf0, 0xcb, 0x09, 0x7c, 0x9d, 0xfa, 0x03,
++      0x6f, 0x81, 0x09, 0x31, 0x60, 0xfb, 0x08, 0xfa,
++      0x74, 0xd3, 0x64, 0x44, 0x7c, 0x55, 0x85, 0xec,
++      0x9c, 0x6e, 0x25, 0xb7, 0x6c, 0xc5, 0x37, 0xb6,
++      0x83, 0x87, 0x72, 0x95, 0x8b, 0x9d, 0xe1, 0x69,
++      0x5c, 0x31, 0x95, 0x42, 0xa6, 0x2c, 0xd1, 0x36,
++      0x47, 0x1f, 0xec, 0x54, 0xab, 0xa2, 0x1c, 0xd8,
++      0x00, 0xcc, 0xbc, 0x0d, 0x65, 0xe2, 0x67, 0xbf,
++      0xbc, 0xea, 0xee, 0x9e, 0xe4, 0x36, 0x95, 0xbe,
++      0x73, 0xd9, 0xa6, 0xd9, 0x0f, 0xa0, 0xcc, 0x82,
++      0x76, 0x26, 0xad, 0x5b, 0x58, 0x6c, 0x4e, 0xab,
++      0x29, 0x64, 0xd3, 0xd9, 0xa9, 0x08, 0x8c, 0x1d,
++      0xa1, 0x4f, 0x80, 0xd8, 0x3f, 0x94, 0xfb, 0xd3,
++      0x7b, 0xfc, 0xd1, 0x2b, 0xc3, 0x21, 0xeb, 0xe5,
++      0x1c, 0x84, 0x23, 0x7f, 0x4b, 0xfa, 0xdb, 0x34,
++      0x18, 0xa2, 0xc2, 0xe5, 0x13, 0xfe, 0x6c, 0x49,
++      0x81, 0xd2, 0x73, 0xe7, 0xe2, 0xd7, 0xe4, 0x4f,
++      0x4b, 0x08, 0x6e, 0xb1, 0x12, 0x22, 0x10, 0x9d,
++      0xac, 0x51, 0x1e, 0x17, 0xd9, 0x8a, 0x0b, 0x42,
++      0x88, 0x16, 0x81, 0x37, 0x7c, 0x6a, 0xf7, 0xef,
++      0x2d, 0xe3, 0xd9, 0xf8, 0x5f, 0xe0, 0x53, 0x27,
++      0x74, 0xb9, 0xe2, 0xd6, 0x1c, 0x80, 0x2c, 0x52,
++      0x65
++};
++static const u8 enc_output009[] __initconst = {
++      0xfd, 0x81, 0x8d, 0xd0, 0x3d, 0xb4, 0xd5, 0xdf,
++      0xd3, 0x42, 0x47, 0x5a, 0x6d, 0x19, 0x27, 0x66,
++      0x4b, 0x2e, 0x0c, 0x27, 0x9c, 0x96, 0x4c, 0x72,
++      0x02, 0xa3, 0x65, 0xc3, 0xb3, 0x6f, 0x2e, 0xbd,
++      0x63, 0x8a, 0x4a, 0x5d, 0x29, 0xa2, 0xd0, 0x28,
++      0x48, 0xc5, 0x3d, 0x98, 0xa3, 0xbc, 0xe0, 0xbe,
++      0x3b, 0x3f, 0xe6, 0x8a, 0xa4, 0x7f, 0x53, 0x06,
++      0xfa, 0x7f, 0x27, 0x76, 0x72, 0x31, 0xa1, 0xf5,
++      0xd6, 0x0c, 0x52, 0x47, 0xba, 0xcd, 0x4f, 0xd7,
++      0xeb, 0x05, 0x48, 0x0d, 0x7c, 0x35, 0x4a, 0x09,
++      0xc9, 0x76, 0x71, 0x02, 0xa3, 0xfb, 0xb7, 0x1a,
++      0x65, 0xb7, 0xed, 0x98, 0xc6, 0x30, 0x8a, 0x00,
++      0xae, 0xa1, 0x31, 0xe5, 0xb5, 0x9e, 0x6d, 0x62,
++      0xda, 0xda, 0x07, 0x0f, 0x38, 0x38, 0xd3, 0xcb,
++      0xc1, 0xb0, 0xad, 0xec, 0x72, 0xec, 0xb1, 0xa2,
++      0x7b, 0x59, 0xf3, 0x3d, 0x2b, 0xef, 0xcd, 0x28,
++      0x5b, 0x83, 0xcc, 0x18, 0x91, 0x88, 0xb0, 0x2e,
++      0xf9, 0x29, 0x31, 0x18, 0xf9, 0x4e, 0xe9, 0x0a,
++      0x91, 0x92, 0x9f, 0xae, 0x2d, 0xad, 0xf4, 0xe6,
++      0x1a, 0xe2, 0xa4, 0xee, 0x47, 0x15, 0xbf, 0x83,
++      0x6e, 0xd7, 0x72, 0x12, 0x3b, 0x2d, 0x24, 0xe9,
++      0xb2, 0x55, 0xcb, 0x3c, 0x10, 0xf0, 0x24, 0x8a,
++      0x4a, 0x02, 0xea, 0x90, 0x25, 0xf0, 0xb4, 0x79,
++      0x3a, 0xef, 0x6e, 0xf5, 0x52, 0xdf, 0xb0, 0x0a,
++      0xcd, 0x24, 0x1c, 0xd3, 0x2e, 0x22, 0x74, 0xea,
++      0x21, 0x6f, 0xe9, 0xbd, 0xc8, 0x3e, 0x36, 0x5b,
++      0x19, 0xf1, 0xca, 0x99, 0x0a, 0xb4, 0xa7, 0x52,
++      0x1a, 0x4e, 0xf2, 0xad, 0x8d, 0x56, 0x85, 0xbb,
++      0x64, 0x89, 0xba, 0x26, 0xf9, 0xc7, 0xe1, 0x89,
++      0x19, 0x22, 0x77, 0xc3, 0xa8, 0xfc, 0xff, 0xad,
++      0xfe, 0xb9, 0x48, 0xae, 0x12, 0x30, 0x9f, 0x19,
++      0xfb, 0x1b, 0xef, 0x14, 0x87, 0x8a, 0x78, 0x71,
++      0xf3, 0xf4, 0xb7, 0x00, 0x9c, 0x1d, 0xb5, 0x3d,
++      0x49, 0x00, 0x0c, 0x06, 0xd4, 0x50, 0xf9, 0x54,
++      0x45, 0xb2, 0x5b, 0x43, 0xdb, 0x6d, 0xcf, 0x1a,
++      0xe9, 0x7a, 0x7a, 0xcf, 0xfc, 0x8a, 0x4e, 0x4d,
++      0x0b, 0x07, 0x63, 0x28, 0xd8, 0xe7, 0x08, 0x95,
++      0xdf, 0xa6, 0x72, 0x93, 0x2e, 0xbb, 0xa0, 0x42,
++      0x89, 0x16, 0xf1, 0xd9, 0x0c, 0xf9, 0xa1, 0x16,
++      0xfd, 0xd9, 0x03, 0xb4, 0x3b, 0x8a, 0xf5, 0xf6,
++      0xe7, 0x6b, 0x2e, 0x8e, 0x4c, 0x3d, 0xe2, 0xaf,
++      0x08, 0x45, 0x03, 0xff, 0x09, 0xb6, 0xeb, 0x2d,
++      0xc6, 0x1b, 0x88, 0x94, 0xac, 0x3e, 0xf1, 0x9f,
++      0x0e, 0x0e, 0x2b, 0xd5, 0x00, 0x4d, 0x3f, 0x3b,
++      0x53, 0xae, 0xaf, 0x1c, 0x33, 0x5f, 0x55, 0x6e,
++      0x8d, 0xaf, 0x05, 0x7a, 0x10, 0x34, 0xc9, 0xf4,
++      0x66, 0xcb, 0x62, 0x12, 0xa6, 0xee, 0xe8, 0x1c,
++      0x5d, 0x12, 0x86, 0xdb, 0x6f, 0x1c, 0x33, 0xc4,
++      0x1c, 0xda, 0x82, 0x2d, 0x3b, 0x59, 0xfe, 0xb1,
++      0xa4, 0x59, 0x41, 0x86, 0xd0, 0xef, 0xae, 0xfb,
++      0xda, 0x6d, 0x11, 0xb8, 0xca, 0xe9, 0x6e, 0xff,
++      0xf7, 0xa9, 0xd9, 0x70, 0x30, 0xfc, 0x53, 0xe2,
++      0xd7, 0xa2, 0x4e, 0xc7, 0x91, 0xd9, 0x07, 0x06,
++      0xaa, 0xdd, 0xb0, 0x59, 0x28, 0x1d, 0x00, 0x66,
++      0xc5, 0x54, 0xc2, 0xfc, 0x06, 0xda, 0x05, 0x90,
++      0x52, 0x1d, 0x37, 0x66, 0xee, 0xf0, 0xb2, 0x55,
++      0x8a, 0x5d, 0xd2, 0x38, 0x86, 0x94, 0x9b, 0xfc,
++      0x10, 0x4c, 0xa1, 0xb9, 0x64, 0x3e, 0x44, 0xb8,
++      0x5f, 0xb0, 0x0c, 0xec, 0xe0, 0xc9, 0xe5, 0x62,
++      0x75, 0x3f, 0x09, 0xd5, 0xf5, 0xd9, 0x26, 0xba,
++      0x9e, 0xd2, 0xf4, 0xb9, 0x48, 0x0a, 0xbc, 0xa2,
++      0xd6, 0x7c, 0x36, 0x11, 0x7d, 0x26, 0x81, 0x89,
++      0xcf, 0xa4, 0xad, 0x73, 0x0e, 0xee, 0xcc, 0x06,
++      0xa9, 0xdb, 0xb1, 0xfd, 0xfb, 0x09, 0x7f, 0x90,
++      0x42, 0x37, 0x2f, 0xe1, 0x9c, 0x0f, 0x6f, 0xcf,
++      0x43, 0xb5, 0xd9, 0x90, 0xe1, 0x85, 0xf5, 0xa8,
++      0xae
++};
++static const u8 enc_assoc009[] __initconst = {
++      0x5a, 0x27, 0xff, 0xeb, 0xdf, 0x84, 0xb2, 0x9e,
++      0xef
++};
++static const u8 enc_nonce009[] __initconst = {
++      0xef, 0x2d, 0x63, 0xee, 0x6b, 0x80, 0x8b, 0x78
++};
++static const u8 enc_key009[] __initconst = {
++      0xea, 0xbc, 0x56, 0x99, 0xe3, 0x50, 0xff, 0xc5,
++      0xcc, 0x1a, 0xd7, 0xc1, 0x57, 0x72, 0xea, 0x86,
++      0x5b, 0x89, 0x88, 0x61, 0x3d, 0x2f, 0x9b, 0xb2,
++      0xe7, 0x9c, 0xec, 0x74, 0x6e, 0x3e, 0xf4, 0x3b
++};
++
++static const u8 enc_input010[] __initconst = {
++      0x42, 0x93, 0xe4, 0xeb, 0x97, 0xb0, 0x57, 0xbf,
++      0x1a, 0x8b, 0x1f, 0xe4, 0x5f, 0x36, 0x20, 0x3c,
++      0xef, 0x0a, 0xa9, 0x48, 0x5f, 0x5f, 0x37, 0x22,
++      0x3a, 0xde, 0xe3, 0xae, 0xbe, 0xad, 0x07, 0xcc,
++      0xb1, 0xf6, 0xf5, 0xf9, 0x56, 0xdd, 0xe7, 0x16,
++      0x1e, 0x7f, 0xdf, 0x7a, 0x9e, 0x75, 0xb7, 0xc7,
++      0xbe, 0xbe, 0x8a, 0x36, 0x04, 0xc0, 0x10, 0xf4,
++      0x95, 0x20, 0x03, 0xec, 0xdc, 0x05, 0xa1, 0x7d,
++      0xc4, 0xa9, 0x2c, 0x82, 0xd0, 0xbc, 0x8b, 0xc5,
++      0xc7, 0x45, 0x50, 0xf6, 0xa2, 0x1a, 0xb5, 0x46,
++      0x3b, 0x73, 0x02, 0xa6, 0x83, 0x4b, 0x73, 0x82,
++      0x58, 0x5e, 0x3b, 0x65, 0x2f, 0x0e, 0xfd, 0x2b,
++      0x59, 0x16, 0xce, 0xa1, 0x60, 0x9c, 0xe8, 0x3a,
++      0x99, 0xed, 0x8d, 0x5a, 0xcf, 0xf6, 0x83, 0xaf,
++      0xba, 0xd7, 0x73, 0x73, 0x40, 0x97, 0x3d, 0xca,
++      0xef, 0x07, 0x57, 0xe6, 0xd9, 0x70, 0x0e, 0x95,
++      0xae, 0xa6, 0x8d, 0x04, 0xcc, 0xee, 0xf7, 0x09,
++      0x31, 0x77, 0x12, 0xa3, 0x23, 0x97, 0x62, 0xb3,
++      0x7b, 0x32, 0xfb, 0x80, 0x14, 0x48, 0x81, 0xc3,
++      0xe5, 0xea, 0x91, 0x39, 0x52, 0x81, 0xa2, 0x4f,
++      0xe4, 0xb3, 0x09, 0xff, 0xde, 0x5e, 0xe9, 0x58,
++      0x84, 0x6e, 0xf9, 0x3d, 0xdf, 0x25, 0xea, 0xad,
++      0xae, 0xe6, 0x9a, 0xd1, 0x89, 0x55, 0xd3, 0xde,
++      0x6c, 0x52, 0xdb, 0x70, 0xfe, 0x37, 0xce, 0x44,
++      0x0a, 0xa8, 0x25, 0x5f, 0x92, 0xc1, 0x33, 0x4a,
++      0x4f, 0x9b, 0x62, 0x35, 0xff, 0xce, 0xc0, 0xa9,
++      0x60, 0xce, 0x52, 0x00, 0x97, 0x51, 0x35, 0x26,
++      0x2e, 0xb9, 0x36, 0xa9, 0x87, 0x6e, 0x1e, 0xcc,
++      0x91, 0x78, 0x53, 0x98, 0x86, 0x5b, 0x9c, 0x74,
++      0x7d, 0x88, 0x33, 0xe1, 0xdf, 0x37, 0x69, 0x2b,
++      0xbb, 0xf1, 0x4d, 0xf4, 0xd1, 0xf1, 0x39, 0x93,
++      0x17, 0x51, 0x19, 0xe3, 0x19, 0x1e, 0x76, 0x37,
++      0x25, 0xfb, 0x09, 0x27, 0x6a, 0xab, 0x67, 0x6f,
++      0x14, 0x12, 0x64, 0xe7, 0xc4, 0x07, 0xdf, 0x4d,
++      0x17, 0xbb, 0x6d, 0xe0, 0xe9, 0xb9, 0xab, 0xca,
++      0x10, 0x68, 0xaf, 0x7e, 0xb7, 0x33, 0x54, 0x73,
++      0x07, 0x6e, 0xf7, 0x81, 0x97, 0x9c, 0x05, 0x6f,
++      0x84, 0x5f, 0xd2, 0x42, 0xfb, 0x38, 0xcf, 0xd1,
++      0x2f, 0x14, 0x30, 0x88, 0x98, 0x4d, 0x5a, 0xa9,
++      0x76, 0xd5, 0x4f, 0x3e, 0x70, 0x6c, 0x85, 0x76,
++      0xd7, 0x01, 0xa0, 0x1a, 0xc8, 0x4e, 0xaa, 0xac,
++      0x78, 0xfe, 0x46, 0xde, 0x6a, 0x05, 0x46, 0xa7,
++      0x43, 0x0c, 0xb9, 0xde, 0xb9, 0x68, 0xfb, 0xce,
++      0x42, 0x99, 0x07, 0x4d, 0x0b, 0x3b, 0x5a, 0x30,
++      0x35, 0xa8, 0xf9, 0x3a, 0x73, 0xef, 0x0f, 0xdb,
++      0x1e, 0x16, 0x42, 0xc4, 0xba, 0xae, 0x58, 0xaa,
++      0xf8, 0xe5, 0x75, 0x2f, 0x1b, 0x15, 0x5c, 0xfd,
++      0x0a, 0x97, 0xd0, 0xe4, 0x37, 0x83, 0x61, 0x5f,
++      0x43, 0xa6, 0xc7, 0x3f, 0x38, 0x59, 0xe6, 0xeb,
++      0xa3, 0x90, 0xc3, 0xaa, 0xaa, 0x5a, 0xd3, 0x34,
++      0xd4, 0x17, 0xc8, 0x65, 0x3e, 0x57, 0xbc, 0x5e,
++      0xdd, 0x9e, 0xb7, 0xf0, 0x2e, 0x5b, 0xb2, 0x1f,
++      0x8a, 0x08, 0x0d, 0x45, 0x91, 0x0b, 0x29, 0x53,
++      0x4f, 0x4c, 0x5a, 0x73, 0x56, 0xfe, 0xaf, 0x41,
++      0x01, 0x39, 0x0a, 0x24, 0x3c, 0x7e, 0xbe, 0x4e,
++      0x53, 0xf3, 0xeb, 0x06, 0x66, 0x51, 0x28, 0x1d,
++      0xbd, 0x41, 0x0a, 0x01, 0xab, 0x16, 0x47, 0x27,
++      0x47, 0x47, 0xf7, 0xcb, 0x46, 0x0a, 0x70, 0x9e,
++      0x01, 0x9c, 0x09, 0xe1, 0x2a, 0x00, 0x1a, 0xd8,
++      0xd4, 0x79, 0x9d, 0x80, 0x15, 0x8e, 0x53, 0x2a,
++      0x65, 0x83, 0x78, 0x3e, 0x03, 0x00, 0x07, 0x12,
++      0x1f, 0x33, 0x3e, 0x7b, 0x13, 0x37, 0xf1, 0xc3,
++      0xef, 0xb7, 0xc1, 0x20, 0x3c, 0x3e, 0x67, 0x66,
++      0x5d, 0x88, 0xa7, 0x7d, 0x33, 0x50, 0x77, 0xb0,
++      0x28, 0x8e, 0xe7, 0x2c, 0x2e, 0x7a, 0xf4, 0x3c,
++      0x8d, 0x74, 0x83, 0xaf, 0x8e, 0x87, 0x0f, 0xe4,
++      0x50, 0xff, 0x84, 0x5c, 0x47, 0x0c, 0x6a, 0x49,
++      0xbf, 0x42, 0x86, 0x77, 0x15, 0x48, 0xa5, 0x90,
++      0x5d, 0x93, 0xd6, 0x2a, 0x11, 0xd5, 0xd5, 0x11,
++      0xaa, 0xce, 0xe7, 0x6f, 0xa5, 0xb0, 0x09, 0x2c,
++      0x8d, 0xd3, 0x92, 0xf0, 0x5a, 0x2a, 0xda, 0x5b,
++      0x1e, 0xd5, 0x9a, 0xc4, 0xc4, 0xf3, 0x49, 0x74,
++      0x41, 0xca, 0xe8, 0xc1, 0xf8, 0x44, 0xd6, 0x3c,
++      0xae, 0x6c, 0x1d, 0x9a, 0x30, 0x04, 0x4d, 0x27,
++      0x0e, 0xb1, 0x5f, 0x59, 0xa2, 0x24, 0xe8, 0xe1,
++      0x98, 0xc5, 0x6a, 0x4c, 0xfe, 0x41, 0xd2, 0x27,
++      0x42, 0x52, 0xe1, 0xe9, 0x7d, 0x62, 0xe4, 0x88,
++      0x0f, 0xad, 0xb2, 0x70, 0xcb, 0x9d, 0x4c, 0x27,
++      0x2e, 0x76, 0x1e, 0x1a, 0x63, 0x65, 0xf5, 0x3b,
++      0xf8, 0x57, 0x69, 0xeb, 0x5b, 0x38, 0x26, 0x39,
++      0x33, 0x25, 0x45, 0x3e, 0x91, 0xb8, 0xd8, 0xc7,
++      0xd5, 0x42, 0xc0, 0x22, 0x31, 0x74, 0xf4, 0xbc,
++      0x0c, 0x23, 0xf1, 0xca, 0xc1, 0x8d, 0xd7, 0xbe,
++      0xc9, 0x62, 0xe4, 0x08, 0x1a, 0xcf, 0x36, 0xd5,
++      0xfe, 0x55, 0x21, 0x59, 0x91, 0x87, 0x87, 0xdf,
++      0x06, 0xdb, 0xdf, 0x96, 0x45, 0x58, 0xda, 0x05,
++      0xcd, 0x50, 0x4d, 0xd2, 0x7d, 0x05, 0x18, 0x73,
++      0x6a, 0x8d, 0x11, 0x85, 0xa6, 0x88, 0xe8, 0xda,
++      0xe6, 0x30, 0x33, 0xa4, 0x89, 0x31, 0x75, 0xbe,
++      0x69, 0x43, 0x84, 0x43, 0x50, 0x87, 0xdd, 0x71,
++      0x36, 0x83, 0xc3, 0x78, 0x74, 0x24, 0x0a, 0xed,
++      0x7b, 0xdb, 0xa4, 0x24, 0x0b, 0xb9, 0x7e, 0x5d,
++      0xff, 0xde, 0xb1, 0xef, 0x61, 0x5a, 0x45, 0x33,
++      0xf6, 0x17, 0x07, 0x08, 0x98, 0x83, 0x92, 0x0f,
++      0x23, 0x6d, 0xe6, 0xaa, 0x17, 0x54, 0xad, 0x6a,
++      0xc8, 0xdb, 0x26, 0xbe, 0xb8, 0xb6, 0x08, 0xfa,
++      0x68, 0xf1, 0xd7, 0x79, 0x6f, 0x18, 0xb4, 0x9e,
++      0x2d, 0x3f, 0x1b, 0x64, 0xaf, 0x8d, 0x06, 0x0e,
++      0x49, 0x28, 0xe0, 0x5d, 0x45, 0x68, 0x13, 0x87,
++      0xfa, 0xde, 0x40, 0x7b, 0xd2, 0xc3, 0x94, 0xd5,
++      0xe1, 0xd9, 0xc2, 0xaf, 0x55, 0x89, 0xeb, 0xb4,
++      0x12, 0x59, 0xa8, 0xd4, 0xc5, 0x29, 0x66, 0x38,
++      0xe6, 0xac, 0x22, 0x22, 0xd9, 0x64, 0x9b, 0x34,
++      0x0a, 0x32, 0x9f, 0xc2, 0xbf, 0x17, 0x6c, 0x3f,
++      0x71, 0x7a, 0x38, 0x6b, 0x98, 0xfb, 0x49, 0x36,
++      0x89, 0xc9, 0xe2, 0xd6, 0xc7, 0x5d, 0xd0, 0x69,
++      0x5f, 0x23, 0x35, 0xc9, 0x30, 0xe2, 0xfd, 0x44,
++      0x58, 0x39, 0xd7, 0x97, 0xfb, 0x5c, 0x00, 0xd5,
++      0x4f, 0x7a, 0x1a, 0x95, 0x8b, 0x62, 0x4b, 0xce,
++      0xe5, 0x91, 0x21, 0x7b, 0x30, 0x00, 0xd6, 0xdd,
++      0x6d, 0x02, 0x86, 0x49, 0x0f, 0x3c, 0x1a, 0x27,
++      0x3c, 0xd3, 0x0e, 0x71, 0xf2, 0xff, 0xf5, 0x2f,
++      0x87, 0xac, 0x67, 0x59, 0x81, 0xa3, 0xf7, 0xf8,
++      0xd6, 0x11, 0x0c, 0x84, 0xa9, 0x03, 0xee, 0x2a,
++      0xc4, 0xf3, 0x22, 0xab, 0x7c, 0xe2, 0x25, 0xf5,
++      0x67, 0xa3, 0xe4, 0x11, 0xe0, 0x59, 0xb3, 0xca,
++      0x87, 0xa0, 0xae, 0xc9, 0xa6, 0x62, 0x1b, 0x6e,
++      0x4d, 0x02, 0x6b, 0x07, 0x9d, 0xfd, 0xd0, 0x92,
++      0x06, 0xe1, 0xb2, 0x9a, 0x4a, 0x1f, 0x1f, 0x13,
++      0x49, 0x99, 0x97, 0x08, 0xde, 0x7f, 0x98, 0xaf,
++      0x51, 0x98, 0xee, 0x2c, 0xcb, 0xf0, 0x0b, 0xc6,
++      0xb6, 0xb7, 0x2d, 0x9a, 0xb1, 0xac, 0xa6, 0xe3,
++      0x15, 0x77, 0x9d, 0x6b, 0x1a, 0xe4, 0xfc, 0x8b,
++      0xf2, 0x17, 0x59, 0x08, 0x04, 0x58, 0x81, 0x9d,
++      0x1b, 0x1b, 0x69, 0x55, 0xc2, 0xb4, 0x3c, 0x1f,
++      0x50, 0xf1, 0x7f, 0x77, 0x90, 0x4c, 0x66, 0x40,
++      0x5a, 0xc0, 0x33, 0x1f, 0xcb, 0x05, 0x6d, 0x5c,
++      0x06, 0x87, 0x52, 0xa2, 0x8f, 0x26, 0xd5, 0x4f
++};
++static const u8 enc_output010[] __initconst = {
++      0xe5, 0x26, 0xa4, 0x3d, 0xbd, 0x33, 0xd0, 0x4b,
++      0x6f, 0x05, 0xa7, 0x6e, 0x12, 0x7a, 0xd2, 0x74,
++      0xa6, 0xdd, 0xbd, 0x95, 0xeb, 0xf9, 0xa4, 0xf1,
++      0x59, 0x93, 0x91, 0x70, 0xd9, 0xfe, 0x9a, 0xcd,
++      0x53, 0x1f, 0x3a, 0xab, 0xa6, 0x7c, 0x9f, 0xa6,
++      0x9e, 0xbd, 0x99, 0xd9, 0xb5, 0x97, 0x44, 0xd5,
++      0x14, 0x48, 0x4d, 0x9d, 0xc0, 0xd0, 0x05, 0x96,
++      0xeb, 0x4c, 0x78, 0x55, 0x09, 0x08, 0x01, 0x02,
++      0x30, 0x90, 0x7b, 0x96, 0x7a, 0x7b, 0x5f, 0x30,
++      0x41, 0x24, 0xce, 0x68, 0x61, 0x49, 0x86, 0x57,
++      0x82, 0xdd, 0x53, 0x1c, 0x51, 0x28, 0x2b, 0x53,
++      0x6e, 0x2d, 0xc2, 0x20, 0x4c, 0xdd, 0x8f, 0x65,
++      0x10, 0x20, 0x50, 0xdd, 0x9d, 0x50, 0xe5, 0x71,
++      0x40, 0x53, 0x69, 0xfc, 0x77, 0x48, 0x11, 0xb9,
++      0xde, 0xa4, 0x8d, 0x58, 0xe4, 0xa6, 0x1a, 0x18,
++      0x47, 0x81, 0x7e, 0xfc, 0xdd, 0xf6, 0xef, 0xce,
++      0x2f, 0x43, 0x68, 0xd6, 0x06, 0xe2, 0x74, 0x6a,
++      0xad, 0x90, 0xf5, 0x37, 0xf3, 0x3d, 0x82, 0x69,
++      0x40, 0xe9, 0x6b, 0xa7, 0x3d, 0xa8, 0x1e, 0xd2,
++      0x02, 0x7c, 0xb7, 0x9b, 0xe4, 0xda, 0x8f, 0x95,
++      0x06, 0xc5, 0xdf, 0x73, 0xa3, 0x20, 0x9a, 0x49,
++      0xde, 0x9c, 0xbc, 0xee, 0x14, 0x3f, 0x81, 0x5e,
++      0xf8, 0x3b, 0x59, 0x3c, 0xe1, 0x68, 0x12, 0x5a,
++      0x3a, 0x76, 0x3a, 0x3f, 0xf7, 0x87, 0x33, 0x0a,
++      0x01, 0xb8, 0xd4, 0xed, 0xb6, 0xbe, 0x94, 0x5e,
++      0x70, 0x40, 0x56, 0x67, 0x1f, 0x50, 0x44, 0x19,
++      0xce, 0x82, 0x70, 0x10, 0x87, 0x13, 0x20, 0x0b,
++      0x4c, 0x5a, 0xb6, 0xf6, 0xa7, 0xae, 0x81, 0x75,
++      0x01, 0x81, 0xe6, 0x4b, 0x57, 0x7c, 0xdd, 0x6d,
++      0xf8, 0x1c, 0x29, 0x32, 0xf7, 0xda, 0x3c, 0x2d,
++      0xf8, 0x9b, 0x25, 0x6e, 0x00, 0xb4, 0xf7, 0x2f,
++      0xf7, 0x04, 0xf7, 0xa1, 0x56, 0xac, 0x4f, 0x1a,
++      0x64, 0xb8, 0x47, 0x55, 0x18, 0x7b, 0x07, 0x4d,
++      0xbd, 0x47, 0x24, 0x80, 0x5d, 0xa2, 0x70, 0xc5,
++      0xdd, 0x8e, 0x82, 0xd4, 0xeb, 0xec, 0xb2, 0x0c,
++      0x39, 0xd2, 0x97, 0xc1, 0xcb, 0xeb, 0xf4, 0x77,
++      0x59, 0xb4, 0x87, 0xef, 0xcb, 0x43, 0x2d, 0x46,
++      0x54, 0xd1, 0xa7, 0xd7, 0x15, 0x99, 0x0a, 0x43,
++      0xa1, 0xe0, 0x99, 0x33, 0x71, 0xc1, 0xed, 0xfe,
++      0x72, 0x46, 0x33, 0x8e, 0x91, 0x08, 0x9f, 0xc8,
++      0x2e, 0xca, 0xfa, 0xdc, 0x59, 0xd5, 0xc3, 0x76,
++      0x84, 0x9f, 0xa3, 0x37, 0x68, 0xc3, 0xf0, 0x47,
++      0x2c, 0x68, 0xdb, 0x5e, 0xc3, 0x49, 0x4c, 0xe8,
++      0x92, 0x85, 0xe2, 0x23, 0xd3, 0x3f, 0xad, 0x32,
++      0xe5, 0x2b, 0x82, 0xd7, 0x8f, 0x99, 0x0a, 0x59,
++      0x5c, 0x45, 0xd9, 0xb4, 0x51, 0x52, 0xc2, 0xae,
++      0xbf, 0x80, 0xcf, 0xc9, 0xc9, 0x51, 0x24, 0x2a,
++      0x3b, 0x3a, 0x4d, 0xae, 0xeb, 0xbd, 0x22, 0xc3,
++      0x0e, 0x0f, 0x59, 0x25, 0x92, 0x17, 0xe9, 0x74,
++      0xc7, 0x8b, 0x70, 0x70, 0x36, 0x55, 0x95, 0x75,
++      0x4b, 0xad, 0x61, 0x2b, 0x09, 0xbc, 0x82, 0xf2,
++      0x6e, 0x94, 0x43, 0xae, 0xc3, 0xd5, 0xcd, 0x8e,
++      0xfe, 0x5b, 0x9a, 0x88, 0x43, 0x01, 0x75, 0xb2,
++      0x23, 0x09, 0xf7, 0x89, 0x83, 0xe7, 0xfa, 0xf9,
++      0xb4, 0x9b, 0xf8, 0xef, 0xbd, 0x1c, 0x92, 0xc1,
++      0xda, 0x7e, 0xfe, 0x05, 0xba, 0x5a, 0xcd, 0x07,
++      0x6a, 0x78, 0x9e, 0x5d, 0xfb, 0x11, 0x2f, 0x79,
++      0x38, 0xb6, 0xc2, 0x5b, 0x6b, 0x51, 0xb4, 0x71,
++      0xdd, 0xf7, 0x2a, 0xe4, 0xf4, 0x72, 0x76, 0xad,
++      0xc2, 0xdd, 0x64, 0x5d, 0x79, 0xb6, 0xf5, 0x7a,
++      0x77, 0x20, 0x05, 0x3d, 0x30, 0x06, 0xd4, 0x4c,
++      0x0a, 0x2c, 0x98, 0x5a, 0xb9, 0xd4, 0x98, 0xa9,
++      0x3f, 0xc6, 0x12, 0xea, 0x3b, 0x4b, 0xc5, 0x79,
++      0x64, 0x63, 0x6b, 0x09, 0x54, 0x3b, 0x14, 0x27,
++      0xba, 0x99, 0x80, 0xc8, 0x72, 0xa8, 0x12, 0x90,
++      0x29, 0xba, 0x40, 0x54, 0x97, 0x2b, 0x7b, 0xfe,
++      0xeb, 0xcd, 0x01, 0x05, 0x44, 0x72, 0xdb, 0x99,
++      0xe4, 0x61, 0xc9, 0x69, 0xd6, 0xb9, 0x28, 0xd1,
++      0x05, 0x3e, 0xf9, 0x0b, 0x49, 0x0a, 0x49, 0xe9,
++      0x8d, 0x0e, 0xa7, 0x4a, 0x0f, 0xaf, 0x32, 0xd0,
++      0xe0, 0xb2, 0x3a, 0x55, 0x58, 0xfe, 0x5c, 0x28,
++      0x70, 0x51, 0x23, 0xb0, 0x7b, 0x6a, 0x5f, 0x1e,
++      0xb8, 0x17, 0xd7, 0x94, 0x15, 0x8f, 0xee, 0x20,
++      0xc7, 0x42, 0x25, 0x3e, 0x9a, 0x14, 0xd7, 0x60,
++      0x72, 0x39, 0x47, 0x48, 0xa9, 0xfe, 0xdd, 0x47,
++      0x0a, 0xb1, 0xe6, 0x60, 0x28, 0x8c, 0x11, 0x68,
++      0xe1, 0xff, 0xd7, 0xce, 0xc8, 0xbe, 0xb3, 0xfe,
++      0x27, 0x30, 0x09, 0x70, 0xd7, 0xfa, 0x02, 0x33,
++      0x3a, 0x61, 0x2e, 0xc7, 0xff, 0xa4, 0x2a, 0xa8,
++      0x6e, 0xb4, 0x79, 0x35, 0x6d, 0x4c, 0x1e, 0x38,
++      0xf8, 0xee, 0xd4, 0x84, 0x4e, 0x6e, 0x28, 0xa7,
++      0xce, 0xc8, 0xc1, 0xcf, 0x80, 0x05, 0xf3, 0x04,
++      0xef, 0xc8, 0x18, 0x28, 0x2e, 0x8d, 0x5e, 0x0c,
++      0xdf, 0xb8, 0x5f, 0x96, 0xe8, 0xc6, 0x9c, 0x2f,
++      0xe5, 0xa6, 0x44, 0xd7, 0xe7, 0x99, 0x44, 0x0c,
++      0xec, 0xd7, 0x05, 0x60, 0x97, 0xbb, 0x74, 0x77,
++      0x58, 0xd5, 0xbb, 0x48, 0xde, 0x5a, 0xb2, 0x54,
++      0x7f, 0x0e, 0x46, 0x70, 0x6a, 0x6f, 0x78, 0xa5,
++      0x08, 0x89, 0x05, 0x4e, 0x7e, 0xa0, 0x69, 0xb4,
++      0x40, 0x60, 0x55, 0x77, 0x75, 0x9b, 0x19, 0xf2,
++      0xd5, 0x13, 0x80, 0x77, 0xf9, 0x4b, 0x3f, 0x1e,
++      0xee, 0xe6, 0x76, 0x84, 0x7b, 0x8c, 0xe5, 0x27,
++      0xa8, 0x0a, 0x91, 0x01, 0x68, 0x71, 0x8a, 0x3f,
++      0x06, 0xab, 0xf6, 0xa9, 0xa5, 0xe6, 0x72, 0x92,
++      0xe4, 0x67, 0xe2, 0xa2, 0x46, 0x35, 0x84, 0x55,
++      0x7d, 0xca, 0xa8, 0x85, 0xd0, 0xf1, 0x3f, 0xbe,
++      0xd7, 0x34, 0x64, 0xfc, 0xae, 0xe3, 0xe4, 0x04,
++      0x9f, 0x66, 0x02, 0xb9, 0x88, 0x10, 0xd9, 0xc4,
++      0x4c, 0x31, 0x43, 0x7a, 0x93, 0xe2, 0x9b, 0x56,
++      0x43, 0x84, 0xdc, 0xdc, 0xde, 0x1d, 0xa4, 0x02,
++      0x0e, 0xc2, 0xef, 0xc3, 0xf8, 0x78, 0xd1, 0xb2,
++      0x6b, 0x63, 0x18, 0xc9, 0xa9, 0xe5, 0x72, 0xd8,
++      0xf3, 0xb9, 0xd1, 0x8a, 0xc7, 0x1a, 0x02, 0x27,
++      0x20, 0x77, 0x10, 0xe5, 0xc8, 0xd4, 0x4a, 0x47,
++      0xe5, 0xdf, 0x5f, 0x01, 0xaa, 0xb0, 0xd4, 0x10,
++      0xbb, 0x69, 0xe3, 0x36, 0xc8, 0xe1, 0x3d, 0x43,
++      0xfb, 0x86, 0xcd, 0xcc, 0xbf, 0xf4, 0x88, 0xe0,
++      0x20, 0xca, 0xb7, 0x1b, 0xf1, 0x2f, 0x5c, 0xee,
++      0xd4, 0xd3, 0xa3, 0xcc, 0xa4, 0x1e, 0x1c, 0x47,
++      0xfb, 0xbf, 0xfc, 0xa2, 0x41, 0x55, 0x9d, 0xf6,
++      0x5a, 0x5e, 0x65, 0x32, 0x34, 0x7b, 0x52, 0x8d,
++      0xd5, 0xd0, 0x20, 0x60, 0x03, 0xab, 0x3f, 0x8c,
++      0xd4, 0x21, 0xea, 0x2a, 0xd9, 0xc4, 0xd0, 0xd3,
++      0x65, 0xd8, 0x7a, 0x13, 0x28, 0x62, 0x32, 0x4b,
++      0x2c, 0x87, 0x93, 0xa8, 0xb4, 0x52, 0x45, 0x09,
++      0x44, 0xec, 0xec, 0xc3, 0x17, 0xdb, 0x9a, 0x4d,
++      0x5c, 0xa9, 0x11, 0xd4, 0x7d, 0xaf, 0x9e, 0xf1,
++      0x2d, 0xb2, 0x66, 0xc5, 0x1d, 0xed, 0xb7, 0xcd,
++      0x0b, 0x25, 0x5e, 0x30, 0x47, 0x3f, 0x40, 0xf4,
++      0xa1, 0xa0, 0x00, 0x94, 0x10, 0xc5, 0x6a, 0x63,
++      0x1a, 0xd5, 0x88, 0x92, 0x8e, 0x82, 0x39, 0x87,
++      0x3c, 0x78, 0x65, 0x58, 0x42, 0x75, 0x5b, 0xdd,
++      0x77, 0x3e, 0x09, 0x4e, 0x76, 0x5b, 0xe6, 0x0e,
++      0x4d, 0x38, 0xb2, 0xc0, 0xb8, 0x95, 0x01, 0x7a,
++      0x10, 0xe0, 0xfb, 0x07, 0xf2, 0xab, 0x2d, 0x8c,
++      0x32, 0xed, 0x2b, 0xc0, 0x46, 0xc2, 0xf5, 0x38,
++      0x83, 0xf0, 0x17, 0xec, 0xc1, 0x20, 0x6a, 0x9a,
++      0x0b, 0x00, 0xa0, 0x98, 0x22, 0x50, 0x23, 0xd5,
++      0x80, 0x6b, 0xf6, 0x1f, 0xc3, 0xcc, 0x97, 0xc9,
++      0x24, 0x9f, 0xf3, 0xaf, 0x43, 0x14, 0xd5, 0xa0
++};
++static const u8 enc_assoc010[] __initconst = {
++      0xd2, 0xa1, 0x70, 0xdb, 0x7a, 0xf8, 0xfa, 0x27,
++      0xba, 0x73, 0x0f, 0xbf, 0x3d, 0x1e, 0x82, 0xb2
++};
++static const u8 enc_nonce010[] __initconst = {
++      0xdb, 0x92, 0x0f, 0x7f, 0x17, 0x54, 0x0c, 0x30
++};
++static const u8 enc_key010[] __initconst = {
++      0x47, 0x11, 0xeb, 0x86, 0x2b, 0x2c, 0xab, 0x44,
++      0x34, 0xda, 0x7f, 0x57, 0x03, 0x39, 0x0c, 0xaf,
++      0x2c, 0x14, 0xfd, 0x65, 0x23, 0xe9, 0x8e, 0x74,
++      0xd5, 0x08, 0x68, 0x08, 0xe7, 0xb4, 0x72, 0xd7
++};
++
++static const u8 enc_input011[] __initconst = {
++      0x7a, 0x57, 0xf2, 0xc7, 0x06, 0x3f, 0x50, 0x7b,
++      0x36, 0x1a, 0x66, 0x5c, 0xb9, 0x0e, 0x5e, 0x3b,
++      0x45, 0x60, 0xbe, 0x9a, 0x31, 0x9f, 0xff, 0x5d,
++      0x66, 0x34, 0xb4, 0xdc, 0xfb, 0x9d, 0x8e, 0xee,
++      0x6a, 0x33, 0xa4, 0x07, 0x3c, 0xf9, 0x4c, 0x30,
++      0xa1, 0x24, 0x52, 0xf9, 0x50, 0x46, 0x88, 0x20,
++      0x02, 0x32, 0x3a, 0x0e, 0x99, 0x63, 0xaf, 0x1f,
++      0x15, 0x28, 0x2a, 0x05, 0xff, 0x57, 0x59, 0x5e,
++      0x18, 0xa1, 0x1f, 0xd0, 0x92, 0x5c, 0x88, 0x66,
++      0x1b, 0x00, 0x64, 0xa5, 0x93, 0x8d, 0x06, 0x46,
++      0xb0, 0x64, 0x8b, 0x8b, 0xef, 0x99, 0x05, 0x35,
++      0x85, 0xb3, 0xf3, 0x33, 0xbb, 0xec, 0x66, 0xb6,
++      0x3d, 0x57, 0x42, 0xe3, 0xb4, 0xc6, 0xaa, 0xb0,
++      0x41, 0x2a, 0xb9, 0x59, 0xa9, 0xf6, 0x3e, 0x15,
++      0x26, 0x12, 0x03, 0x21, 0x4c, 0x74, 0x43, 0x13,
++      0x2a, 0x03, 0x27, 0x09, 0xb4, 0xfb, 0xe7, 0xb7,
++      0x40, 0xff, 0x5e, 0xce, 0x48, 0x9a, 0x60, 0xe3,
++      0x8b, 0x80, 0x8c, 0x38, 0x2d, 0xcb, 0x93, 0x37,
++      0x74, 0x05, 0x52, 0x6f, 0x73, 0x3e, 0xc3, 0xbc,
++      0xca, 0x72, 0x0a, 0xeb, 0xf1, 0x3b, 0xa0, 0x95,
++      0xdc, 0x8a, 0xc4, 0xa9, 0xdc, 0xca, 0x44, 0xd8,
++      0x08, 0x63, 0x6a, 0x36, 0xd3, 0x3c, 0xb8, 0xac,
++      0x46, 0x7d, 0xfd, 0xaa, 0xeb, 0x3e, 0x0f, 0x45,
++      0x8f, 0x49, 0xda, 0x2b, 0xf2, 0x12, 0xbd, 0xaf,
++      0x67, 0x8a, 0x63, 0x48, 0x4b, 0x55, 0x5f, 0x6d,
++      0x8c, 0xb9, 0x76, 0x34, 0x84, 0xae, 0xc2, 0xfc,
++      0x52, 0x64, 0x82, 0xf7, 0xb0, 0x06, 0xf0, 0x45,
++      0x73, 0x12, 0x50, 0x30, 0x72, 0xea, 0x78, 0x9a,
++      0xa8, 0xaf, 0xb5, 0xe3, 0xbb, 0x77, 0x52, 0xec,
++      0x59, 0x84, 0xbf, 0x6b, 0x8f, 0xce, 0x86, 0x5e,
++      0x1f, 0x23, 0xe9, 0xfb, 0x08, 0x86, 0xf7, 0x10,
++      0xb9, 0xf2, 0x44, 0x96, 0x44, 0x63, 0xa9, 0xa8,
++      0x78, 0x00, 0x23, 0xd6, 0xc7, 0xe7, 0x6e, 0x66,
++      0x4f, 0xcc, 0xee, 0x15, 0xb3, 0xbd, 0x1d, 0xa0,
++      0xe5, 0x9c, 0x1b, 0x24, 0x2c, 0x4d, 0x3c, 0x62,
++      0x35, 0x9c, 0x88, 0x59, 0x09, 0xdd, 0x82, 0x1b,
++      0xcf, 0x0a, 0x83, 0x6b, 0x3f, 0xae, 0x03, 0xc4,
++      0xb4, 0xdd, 0x7e, 0x5b, 0x28, 0x76, 0x25, 0x96,
++      0xd9, 0xc9, 0x9d, 0x5f, 0x86, 0xfa, 0xf6, 0xd7,
++      0xd2, 0xe6, 0x76, 0x1d, 0x0f, 0xa1, 0xdc, 0x74,
++      0x05, 0x1b, 0x1d, 0xe0, 0xcd, 0x16, 0xb0, 0xa8,
++      0x8a, 0x34, 0x7b, 0x15, 0x11, 0x77, 0xe5, 0x7b,
++      0x7e, 0x20, 0xf7, 0xda, 0x38, 0xda, 0xce, 0x70,
++      0xe9, 0xf5, 0x6c, 0xd9, 0xbe, 0x0c, 0x4c, 0x95,
++      0x4c, 0xc2, 0x9b, 0x34, 0x55, 0x55, 0xe1, 0xf3,
++      0x46, 0x8e, 0x48, 0x74, 0x14, 0x4f, 0x9d, 0xc9,
++      0xf5, 0xe8, 0x1a, 0xf0, 0x11, 0x4a, 0xc1, 0x8d,
++      0xe0, 0x93, 0xa0, 0xbe, 0x09, 0x1c, 0x2b, 0x4e,
++      0x0f, 0xb2, 0x87, 0x8b, 0x84, 0xfe, 0x92, 0x32,
++      0x14, 0xd7, 0x93, 0xdf, 0xe7, 0x44, 0xbc, 0xc5,
++      0xae, 0x53, 0x69, 0xd8, 0xb3, 0x79, 0x37, 0x80,
++      0xe3, 0x17, 0x5c, 0xec, 0x53, 0x00, 0x9a, 0xe3,
++      0x8e, 0xdc, 0x38, 0xb8, 0x66, 0xf0, 0xd3, 0xad,
++      0x1d, 0x02, 0x96, 0x86, 0x3e, 0x9d, 0x3b, 0x5d,
++      0xa5, 0x7f, 0x21, 0x10, 0xf1, 0x1f, 0x13, 0x20,
++      0xf9, 0x57, 0x87, 0x20, 0xf5, 0x5f, 0xf1, 0x17,
++      0x48, 0x0a, 0x51, 0x5a, 0xcd, 0x19, 0x03, 0xa6,
++      0x5a, 0xd1, 0x12, 0x97, 0xe9, 0x48, 0xe2, 0x1d,
++      0x83, 0x75, 0x50, 0xd9, 0x75, 0x7d, 0x6a, 0x82,
++      0xa1, 0xf9, 0x4e, 0x54, 0x87, 0x89, 0xc9, 0x0c,
++      0xb7, 0x5b, 0x6a, 0x91, 0xc1, 0x9c, 0xb2, 0xa9,
++      0xdc, 0x9a, 0xa4, 0x49, 0x0a, 0x6d, 0x0d, 0xbb,
++      0xde, 0x86, 0x44, 0xdd, 0x5d, 0x89, 0x2b, 0x96,
++      0x0f, 0x23, 0x95, 0xad, 0xcc, 0xa2, 0xb3, 0xb9,
++      0x7e, 0x74, 0x38, 0xba, 0x9f, 0x73, 0xae, 0x5f,
++      0xf8, 0x68, 0xa2, 0xe0, 0xa9, 0xce, 0xbd, 0x40,
++      0xd4, 0x4c, 0x6b, 0xd2, 0x56, 0x62, 0xb0, 0xcc,
++      0x63, 0x7e, 0x5b, 0xd3, 0xae, 0xd1, 0x75, 0xce,
++      0xbb, 0xb4, 0x5b, 0xa8, 0xf8, 0xb4, 0xac, 0x71,
++      0x75, 0xaa, 0xc9, 0x9f, 0xbb, 0x6c, 0xad, 0x0f,
++      0x55, 0x5d, 0xe8, 0x85, 0x7d, 0xf9, 0x21, 0x35,
++      0xea, 0x92, 0x85, 0x2b, 0x00, 0xec, 0x84, 0x90,
++      0x0a, 0x63, 0x96, 0xe4, 0x6b, 0xa9, 0x77, 0xb8,
++      0x91, 0xf8, 0x46, 0x15, 0x72, 0x63, 0x70, 0x01,
++      0x40, 0xa3, 0xa5, 0x76, 0x62, 0x2b, 0xbf, 0xf1,
++      0xe5, 0x8d, 0x9f, 0xa3, 0xfa, 0x9b, 0x03, 0xbe,
++      0xfe, 0x65, 0x6f, 0xa2, 0x29, 0x0d, 0x54, 0xb4,
++      0x71, 0xce, 0xa9, 0xd6, 0x3d, 0x88, 0xf9, 0xaf,
++      0x6b, 0xa8, 0x9e, 0xf4, 0x16, 0x96, 0x36, 0xb9,
++      0x00, 0xdc, 0x10, 0xab, 0xb5, 0x08, 0x31, 0x1f,
++      0x00, 0xb1, 0x3c, 0xd9, 0x38, 0x3e, 0xc6, 0x04,
++      0xa7, 0x4e, 0xe8, 0xae, 0xed, 0x98, 0xc2, 0xf7,
++      0xb9, 0x00, 0x5f, 0x8c, 0x60, 0xd1, 0xe5, 0x15,
++      0xf7, 0xae, 0x1e, 0x84, 0x88, 0xd1, 0xf6, 0xbc,
++      0x3a, 0x89, 0x35, 0x22, 0x83, 0x7c, 0xca, 0xf0,
++      0x33, 0x82, 0x4c, 0x79, 0x3c, 0xfd, 0xb1, 0xae,
++      0x52, 0x62, 0x55, 0xd2, 0x41, 0x60, 0xc6, 0xbb,
++      0xfa, 0x0e, 0x59, 0xd6, 0xa8, 0xfe, 0x5d, 0xed,
++      0x47, 0x3d, 0xe0, 0xea, 0x1f, 0x6e, 0x43, 0x51,
++      0xec, 0x10, 0x52, 0x56, 0x77, 0x42, 0x6b, 0x52,
++      0x87, 0xd8, 0xec, 0xe0, 0xaa, 0x76, 0xa5, 0x84,
++      0x2a, 0x22, 0x24, 0xfd, 0x92, 0x40, 0x88, 0xd5,
++      0x85, 0x1c, 0x1f, 0x6b, 0x47, 0xa0, 0xc4, 0xe4,
++      0xef, 0xf4, 0xea, 0xd7, 0x59, 0xac, 0x2a, 0x9e,
++      0x8c, 0xfa, 0x1f, 0x42, 0x08, 0xfe, 0x4f, 0x74,
++      0xa0, 0x26, 0xf5, 0xb3, 0x84, 0xf6, 0x58, 0x5f,
++      0x26, 0x66, 0x3e, 0xd7, 0xe4, 0x22, 0x91, 0x13,
++      0xc8, 0xac, 0x25, 0x96, 0x23, 0xd8, 0x09, 0xea,
++      0x45, 0x75, 0x23, 0xb8, 0x5f, 0xc2, 0x90, 0x8b,
++      0x09, 0xc4, 0xfc, 0x47, 0x6c, 0x6d, 0x0a, 0xef,
++      0x69, 0xa4, 0x38, 0x19, 0xcf, 0x7d, 0xf9, 0x09,
++      0x73, 0x9b, 0x60, 0x5a, 0xf7, 0x37, 0xb5, 0xfe,
++      0x9f, 0xe3, 0x2b, 0x4c, 0x0d, 0x6e, 0x19, 0xf1,
++      0xd6, 0xc0, 0x70, 0xf3, 0x9d, 0x22, 0x3c, 0xf9,
++      0x49, 0xce, 0x30, 0x8e, 0x44, 0xb5, 0x76, 0x15,
++      0x8f, 0x52, 0xfd, 0xa5, 0x04, 0xb8, 0x55, 0x6a,
++      0x36, 0x59, 0x7c, 0xc4, 0x48, 0xb8, 0xd7, 0xab,
++      0x05, 0x66, 0xe9, 0x5e, 0x21, 0x6f, 0x6b, 0x36,
++      0x29, 0xbb, 0xe9, 0xe3, 0xa2, 0x9a, 0xa8, 0xcd,
++      0x55, 0x25, 0x11, 0xba, 0x5a, 0x58, 0xa0, 0xde,
++      0xae, 0x19, 0x2a, 0x48, 0x5a, 0xff, 0x36, 0xcd,
++      0x6d, 0x16, 0x7a, 0x73, 0x38, 0x46, 0xe5, 0x47,
++      0x59, 0xc8, 0xa2, 0xf6, 0xe2, 0x6c, 0x83, 0xc5,
++      0x36, 0x2c, 0x83, 0x7d, 0xb4, 0x01, 0x05, 0x69,
++      0xe7, 0xaf, 0x5c, 0xc4, 0x64, 0x82, 0x12, 0x21,
++      0xef, 0xf7, 0xd1, 0x7d, 0xb8, 0x8d, 0x8c, 0x98,
++      0x7c, 0x5f, 0x7d, 0x92, 0x88, 0xb9, 0x94, 0x07,
++      0x9c, 0xd8, 0xe9, 0x9c, 0x17, 0x38, 0xe3, 0x57,
++      0x6c, 0xe0, 0xdc, 0xa5, 0x92, 0x42, 0xb3, 0xbd,
++      0x50, 0xa2, 0x7e, 0xb5, 0xb1, 0x52, 0x72, 0x03,
++      0x97, 0xd8, 0xaa, 0x9a, 0x1e, 0x75, 0x41, 0x11,
++      0xa3, 0x4f, 0xcc, 0xd4, 0xe3, 0x73, 0xad, 0x96,
++      0xdc, 0x47, 0x41, 0x9f, 0xb0, 0xbe, 0x79, 0x91,
++      0xf5, 0xb6, 0x18, 0xfe, 0xc2, 0x83, 0x18, 0x7d,
++      0x73, 0xd9, 0x4f, 0x83, 0x84, 0x03, 0xb3, 0xf0,
++      0x77, 0x66, 0x3d, 0x83, 0x63, 0x2e, 0x2c, 0xf9,
++      0xdd, 0xa6, 0x1f, 0x89, 0x82, 0xb8, 0x23, 0x42,
++      0xeb, 0xe2, 0xca, 0x70, 0x82, 0x61, 0x41, 0x0a,
++      0x6d, 0x5f, 0x75, 0xc5, 0xe2, 0xc4, 0x91, 0x18,
++      0x44, 0x22, 0xfa, 0x34, 0x10, 0xf5, 0x20, 0xdc,
++      0xb7, 0xdd, 0x2a, 0x20, 0x77, 0xf5, 0xf9, 0xce,
++      0xdb, 0xa0, 0x0a, 0x52, 0x2a, 0x4e, 0xdd, 0xcc,
++      0x97, 0xdf, 0x05, 0xe4, 0x5e, 0xb7, 0xaa, 0xf0,
++      0xe2, 0x80, 0xff, 0xba, 0x1a, 0x0f, 0xac, 0xdf,
++      0x02, 0x32, 0xe6, 0xf7, 0xc7, 0x17, 0x13, 0xb7,
++      0xfc, 0x98, 0x48, 0x8c, 0x0d, 0x82, 0xc9, 0x80,
++      0x7a, 0xe2, 0x0a, 0xc5, 0xb4, 0xde, 0x7c, 0x3c,
++      0x79, 0x81, 0x0e, 0x28, 0x65, 0x79, 0x67, 0x82,
++      0x69, 0x44, 0x66, 0x09, 0xf7, 0x16, 0x1a, 0xf9,
++      0x7d, 0x80, 0xa1, 0x79, 0x14, 0xa9, 0xc8, 0x20,
++      0xfb, 0xa2, 0x46, 0xbe, 0x08, 0x35, 0x17, 0x58,
++      0xc1, 0x1a, 0xda, 0x2a, 0x6b, 0x2e, 0x1e, 0xe6,
++      0x27, 0x55, 0x7b, 0x19, 0xe2, 0xfb, 0x64, 0xfc,
++      0x5e, 0x15, 0x54, 0x3c, 0xe7, 0xc2, 0x11, 0x50,
++      0x30, 0xb8, 0x72, 0x03, 0x0b, 0x1a, 0x9f, 0x86,
++      0x27, 0x11, 0x5c, 0x06, 0x2b, 0xbd, 0x75, 0x1a,
++      0x0a, 0xda, 0x01, 0xfa, 0x5c, 0x4a, 0xc1, 0x80,
++      0x3a, 0x6e, 0x30, 0xc8, 0x2c, 0xeb, 0x56, 0xec,
++      0x89, 0xfa, 0x35, 0x7b, 0xb2, 0xf0, 0x97, 0x08,
++      0x86, 0x53, 0xbe, 0xbd, 0x40, 0x41, 0x38, 0x1c,
++      0xb4, 0x8b, 0x79, 0x2e, 0x18, 0x96, 0x94, 0xde,
++      0xe8, 0xca, 0xe5, 0x9f, 0x92, 0x9f, 0x15, 0x5d,
++      0x56, 0x60, 0x5c, 0x09, 0xf9, 0x16, 0xf4, 0x17,
++      0x0f, 0xf6, 0x4c, 0xda, 0xe6, 0x67, 0x89, 0x9f,
++      0xca, 0x6c, 0xe7, 0x9b, 0x04, 0x62, 0x0e, 0x26,
++      0xa6, 0x52, 0xbd, 0x29, 0xff, 0xc7, 0xa4, 0x96,
++      0xe6, 0x6a, 0x02, 0xa5, 0x2e, 0x7b, 0xfe, 0x97,
++      0x68, 0x3e, 0x2e, 0x5f, 0x3b, 0x0f, 0x36, 0xd6,
++      0x98, 0x19, 0x59, 0x48, 0xd2, 0xc6, 0xe1, 0x55,
++      0x1a, 0x6e, 0xd6, 0xed, 0x2c, 0xba, 0xc3, 0x9e,
++      0x64, 0xc9, 0x95, 0x86, 0x35, 0x5e, 0x3e, 0x88,
++      0x69, 0x99, 0x4b, 0xee, 0xbe, 0x9a, 0x99, 0xb5,
++      0x6e, 0x58, 0xae, 0xdd, 0x22, 0xdb, 0xdd, 0x6b,
++      0xfc, 0xaf, 0x90, 0xa3, 0x3d, 0xa4, 0xc1, 0x15,
++      0x92, 0x18, 0x8d, 0xd2, 0x4b, 0x7b, 0x06, 0xd1,
++      0x37, 0xb5, 0xe2, 0x7c, 0x2c, 0xf0, 0x25, 0xe4,
++      0x94, 0x2a, 0xbd, 0xe3, 0x82, 0x70, 0x78, 0xa3,
++      0x82, 0x10, 0x5a, 0x90, 0xd7, 0xa4, 0xfa, 0xaf,
++      0x1a, 0x88, 0x59, 0xdc, 0x74, 0x12, 0xb4, 0x8e,
++      0xd7, 0x19, 0x46, 0xf4, 0x84, 0x69, 0x9f, 0xbb,
++      0x70, 0xa8, 0x4c, 0x52, 0x81, 0xa9, 0xff, 0x76,
++      0x1c, 0xae, 0xd8, 0x11, 0x3d, 0x7f, 0x7d, 0xc5,
++      0x12, 0x59, 0x28, 0x18, 0xc2, 0xa2, 0xb7, 0x1c,
++      0x88, 0xf8, 0xd6, 0x1b, 0xa6, 0x7d, 0x9e, 0xde,
++      0x29, 0xf8, 0xed, 0xff, 0xeb, 0x92, 0x24, 0x4f,
++      0x05, 0xaa, 0xd9, 0x49, 0xba, 0x87, 0x59, 0x51,
++      0xc9, 0x20, 0x5c, 0x9b, 0x74, 0xcf, 0x03, 0xd9,
++      0x2d, 0x34, 0xc7, 0x5b, 0xa5, 0x40, 0xb2, 0x99,
++      0xf5, 0xcb, 0xb4, 0xf6, 0xb7, 0x72, 0x4a, 0xd6,
++      0xbd, 0xb0, 0xf3, 0x93, 0xe0, 0x1b, 0xa8, 0x04,
++      0x1e, 0x35, 0xd4, 0x80, 0x20, 0xf4, 0x9c, 0x31,
++      0x6b, 0x45, 0xb9, 0x15, 0xb0, 0x5e, 0xdd, 0x0a,
++      0x33, 0x9c, 0x83, 0xcd, 0x58, 0x89, 0x50, 0x56,
++      0xbb, 0x81, 0x00, 0x91, 0x32, 0xf3, 0x1b, 0x3e,
++      0xcf, 0x45, 0xe1, 0xf9, 0xe1, 0x2c, 0x26, 0x78,
++      0x93, 0x9a, 0x60, 0x46, 0xc9, 0xb5, 0x5e, 0x6a,
++      0x28, 0x92, 0x87, 0x3f, 0x63, 0x7b, 0xdb, 0xf7,
++      0xd0, 0x13, 0x9d, 0x32, 0x40, 0x5e, 0xcf, 0xfb,
++      0x79, 0x68, 0x47, 0x4c, 0xfd, 0x01, 0x17, 0xe6,
++      0x97, 0x93, 0x78, 0xbb, 0xa6, 0x27, 0xa3, 0xe8,
++      0x1a, 0xe8, 0x94, 0x55, 0x7d, 0x08, 0xe5, 0xdc,
++      0x66, 0xa3, 0x69, 0xc8, 0xca, 0xc5, 0xa1, 0x84,
++      0x55, 0xde, 0x08, 0x91, 0x16, 0x3a, 0x0c, 0x86,
++      0xab, 0x27, 0x2b, 0x64, 0x34, 0x02, 0x6c, 0x76,
++      0x8b, 0xc6, 0xaf, 0xcc, 0xe1, 0xd6, 0x8c, 0x2a,
++      0x18, 0x3d, 0xa6, 0x1b, 0x37, 0x75, 0x45, 0x73,
++      0xc2, 0x75, 0xd7, 0x53, 0x78, 0x3a, 0xd6, 0xe8,
++      0x29, 0xd2, 0x4a, 0xa8, 0x1e, 0x82, 0xf6, 0xb6,
++      0x81, 0xde, 0x21, 0xed, 0x2b, 0x56, 0xbb, 0xf2,
++      0xd0, 0x57, 0xc1, 0x7c, 0xd2, 0x6a, 0xd2, 0x56,
++      0xf5, 0x13, 0x5f, 0x1c, 0x6a, 0x0b, 0x74, 0xfb,
++      0xe9, 0xfe, 0x9e, 0xea, 0x95, 0xb2, 0x46, 0xab,
++      0x0a, 0xfc, 0xfd, 0xf3, 0xbb, 0x04, 0x2b, 0x76,
++      0x1b, 0xa4, 0x74, 0xb0, 0xc1, 0x78, 0xc3, 0x69,
++      0xe2, 0xb0, 0x01, 0xe1, 0xde, 0x32, 0x4c, 0x8d,
++      0x1a, 0xb3, 0x38, 0x08, 0xd5, 0xfc, 0x1f, 0xdc,
++      0x0e, 0x2c, 0x9c, 0xb1, 0xa1, 0x63, 0x17, 0x22,
++      0xf5, 0x6c, 0x93, 0x70, 0x74, 0x00, 0xf8, 0x39,
++      0x01, 0x94, 0xd1, 0x32, 0x23, 0x56, 0x5d, 0xa6,
++      0x02, 0x76, 0x76, 0x93, 0xce, 0x2f, 0x19, 0xe9,
++      0x17, 0x52, 0xae, 0x6e, 0x2c, 0x6d, 0x61, 0x7f,
++      0x3b, 0xaa, 0xe0, 0x52, 0x85, 0xc5, 0x65, 0xc1,
++      0xbb, 0x8e, 0x5b, 0x21, 0xd5, 0xc9, 0x78, 0x83,
++      0x07, 0x97, 0x4c, 0x62, 0x61, 0x41, 0xd4, 0xfc,
++      0xc9, 0x39, 0xe3, 0x9b, 0xd0, 0xcc, 0x75, 0xc4,
++      0x97, 0xe6, 0xdd, 0x2a, 0x5f, 0xa6, 0xe8, 0x59,
++      0x6c, 0x98, 0xb9, 0x02, 0xe2, 0xa2, 0xd6, 0x68,
++      0xee, 0x3b, 0x1d, 0xe3, 0x4d, 0x5b, 0x30, 0xef,
++      0x03, 0xf2, 0xeb, 0x18, 0x57, 0x36, 0xe8, 0xa1,
++      0xf4, 0x47, 0xfb, 0xcb, 0x8f, 0xcb, 0xc8, 0xf3,
++      0x4f, 0x74, 0x9d, 0x9d, 0xb1, 0x8d, 0x14, 0x44,
++      0xd9, 0x19, 0xb4, 0x54, 0x4f, 0x75, 0x19, 0x09,
++      0xa0, 0x75, 0xbc, 0x3b, 0x82, 0xc6, 0x3f, 0xb8,
++      0x83, 0x19, 0x6e, 0xd6, 0x37, 0xfe, 0x6e, 0x8a,
++      0x4e, 0xe0, 0x4a, 0xab, 0x7b, 0xc8, 0xb4, 0x1d,
++      0xf4, 0xed, 0x27, 0x03, 0x65, 0xa2, 0xa1, 0xae,
++      0x11, 0xe7, 0x98, 0x78, 0x48, 0x91, 0xd2, 0xd2,
++      0xd4, 0x23, 0x78, 0x50, 0xb1, 0x5b, 0x85, 0x10,
++      0x8d, 0xca, 0x5f, 0x0f, 0x71, 0xae, 0x72, 0x9a,
++      0xf6, 0x25, 0x19, 0x60, 0x06, 0xf7, 0x10, 0x34,
++      0x18, 0x0d, 0xc9, 0x9f, 0x7b, 0x0c, 0x9b, 0x8f,
++      0x91, 0x1b, 0x9f, 0xcd, 0x10, 0xee, 0x75, 0xf9,
++      0x97, 0x66, 0xfc, 0x4d, 0x33, 0x6e, 0x28, 0x2b,
++      0x92, 0x85, 0x4f, 0xab, 0x43, 0x8d, 0x8f, 0x7d,
++      0x86, 0xa7, 0xc7, 0xd8, 0xd3, 0x0b, 0x8b, 0x57,
++      0xb6, 0x1d, 0x95, 0x0d, 0xe9, 0xbc, 0xd9, 0x03,
++      0xd9, 0x10, 0x19, 0xc3, 0x46, 0x63, 0x55, 0x87,
++      0x61, 0x79, 0x6c, 0x95, 0x0e, 0x9c, 0xdd, 0xca,
++      0xc3, 0xf3, 0x64, 0xf0, 0x7d, 0x76, 0xb7, 0x53,
++      0x67, 0x2b, 0x1e, 0x44, 0x56, 0x81, 0xea, 0x8f,
++      0x5c, 0x42, 0x16, 0xb8, 0x28, 0xeb, 0x1b, 0x61,
++      0x10, 0x1e, 0xbf, 0xec, 0xa8
++};
++static const u8 enc_output011[] __initconst = {
++      0x6a, 0xfc, 0x4b, 0x25, 0xdf, 0xc0, 0xe4, 0xe8,
++      0x17, 0x4d, 0x4c, 0xc9, 0x7e, 0xde, 0x3a, 0xcc,
++      0x3c, 0xba, 0x6a, 0x77, 0x47, 0xdb, 0xe3, 0x74,
++      0x7a, 0x4d, 0x5f, 0x8d, 0x37, 0x55, 0x80, 0x73,
++      0x90, 0x66, 0x5d, 0x3a, 0x7d, 0x5d, 0x86, 0x5e,
++      0x8d, 0xfd, 0x83, 0xff, 0x4e, 0x74, 0x6f, 0xf9,
++      0xe6, 0x70, 0x17, 0x70, 0x3e, 0x96, 0xa7, 0x7e,
++      0xcb, 0xab, 0x8f, 0x58, 0x24, 0x9b, 0x01, 0xfd,
++      0xcb, 0xe6, 0x4d, 0x9b, 0xf0, 0x88, 0x94, 0x57,
++      0x66, 0xef, 0x72, 0x4c, 0x42, 0x6e, 0x16, 0x19,
++      0x15, 0xea, 0x70, 0x5b, 0xac, 0x13, 0xdb, 0x9f,
++      0x18, 0xe2, 0x3c, 0x26, 0x97, 0xbc, 0xdc, 0x45,
++      0x8c, 0x6c, 0x24, 0x69, 0x9c, 0xf7, 0x65, 0x1e,
++      0x18, 0x59, 0x31, 0x7c, 0xe4, 0x73, 0xbc, 0x39,
++      0x62, 0xc6, 0x5c, 0x9f, 0xbf, 0xfa, 0x90, 0x03,
++      0xc9, 0x72, 0x26, 0xb6, 0x1b, 0xc2, 0xb7, 0x3f,
++      0xf2, 0x13, 0x77, 0xf2, 0x8d, 0xb9, 0x47, 0xd0,
++      0x53, 0xdd, 0xc8, 0x91, 0x83, 0x8b, 0xb1, 0xce,
++      0xa3, 0xfe, 0xcd, 0xd9, 0xdd, 0x92, 0x7b, 0xdb,
++      0xb8, 0xfb, 0xc9, 0x2d, 0x01, 0x59, 0x39, 0x52,
++      0xad, 0x1b, 0xec, 0xcf, 0xd7, 0x70, 0x13, 0x21,
++      0xf5, 0x47, 0xaa, 0x18, 0x21, 0x5c, 0xc9, 0x9a,
++      0xd2, 0x6b, 0x05, 0x9c, 0x01, 0xa1, 0xda, 0x35,
++      0x5d, 0xb3, 0x70, 0xe6, 0xa9, 0x80, 0x8b, 0x91,
++      0xb7, 0xb3, 0x5f, 0x24, 0x9a, 0xb7, 0xd1, 0x6b,
++      0xa1, 0x1c, 0x50, 0xba, 0x49, 0xe0, 0xee, 0x2e,
++      0x75, 0xac, 0x69, 0xc0, 0xeb, 0x03, 0xdd, 0x19,
++      0xe5, 0xf6, 0x06, 0xdd, 0xc3, 0xd7, 0x2b, 0x07,
++      0x07, 0x30, 0xa7, 0x19, 0x0c, 0xbf, 0xe6, 0x18,
++      0xcc, 0xb1, 0x01, 0x11, 0x85, 0x77, 0x1d, 0x96,
++      0xa7, 0xa3, 0x00, 0x84, 0x02, 0xa2, 0x83, 0x68,
++      0xda, 0x17, 0x27, 0xc8, 0x7f, 0x23, 0xb7, 0xf4,
++      0x13, 0x85, 0xcf, 0xdd, 0x7a, 0x7d, 0x24, 0x57,
++      0xfe, 0x05, 0x93, 0xf5, 0x74, 0xce, 0xed, 0x0c,
++      0x20, 0x98, 0x8d, 0x92, 0x30, 0xa1, 0x29, 0x23,
++      0x1a, 0xa0, 0x4f, 0x69, 0x56, 0x4c, 0xe1, 0xc8,
++      0xce, 0xf6, 0x9a, 0x0c, 0xa4, 0xfa, 0x04, 0xf6,
++      0x62, 0x95, 0xf2, 0xfa, 0xc7, 0x40, 0x68, 0x40,
++      0x8f, 0x41, 0xda, 0xb4, 0x26, 0x6f, 0x70, 0xab,
++      0x40, 0x61, 0xa4, 0x0e, 0x75, 0xfb, 0x86, 0xeb,
++      0x9d, 0x9a, 0x1f, 0xec, 0x76, 0x99, 0xe7, 0xea,
++      0xaa, 0x1e, 0x2d, 0xb5, 0xd4, 0xa6, 0x1a, 0xb8,
++      0x61, 0x0a, 0x1d, 0x16, 0x5b, 0x98, 0xc2, 0x31,
++      0x40, 0xe7, 0x23, 0x1d, 0x66, 0x99, 0xc8, 0xc0,
++      0xd7, 0xce, 0xf3, 0x57, 0x40, 0x04, 0x3f, 0xfc,
++      0xea, 0xb3, 0xfc, 0xd2, 0xd3, 0x99, 0xa4, 0x94,
++      0x69, 0xa0, 0xef, 0xd1, 0x85, 0xb3, 0xa6, 0xb1,
++      0x28, 0xbf, 0x94, 0x67, 0x22, 0xc3, 0x36, 0x46,
++      0xf8, 0xd2, 0x0f, 0x5f, 0xf4, 0x59, 0x80, 0xe6,
++      0x2d, 0x43, 0x08, 0x7d, 0x19, 0x09, 0x97, 0xa7,
++      0x4c, 0x3d, 0x8d, 0xba, 0x65, 0x62, 0xa3, 0x71,
++      0x33, 0x29, 0x62, 0xdb, 0xc1, 0x33, 0x34, 0x1a,
++      0x63, 0x33, 0x16, 0xb6, 0x64, 0x7e, 0xab, 0x33,
++      0xf0, 0xe6, 0x26, 0x68, 0xba, 0x1d, 0x2e, 0x38,
++      0x08, 0xe6, 0x02, 0xd3, 0x25, 0x2c, 0x47, 0x23,
++      0x58, 0x34, 0x0f, 0x9d, 0x63, 0x4f, 0x63, 0xbb,
++      0x7f, 0x3b, 0x34, 0x38, 0xa7, 0xb5, 0x8d, 0x65,
++      0xd9, 0x9f, 0x79, 0x55, 0x3e, 0x4d, 0xe7, 0x73,
++      0xd8, 0xf6, 0x98, 0x97, 0x84, 0x60, 0x9c, 0xc8,
++      0xa9, 0x3c, 0xf6, 0xdc, 0x12, 0x5c, 0xe1, 0xbb,
++      0x0b, 0x8b, 0x98, 0x9c, 0x9d, 0x26, 0x7c, 0x4a,
++      0xe6, 0x46, 0x36, 0x58, 0x21, 0x4a, 0xee, 0xca,
++      0xd7, 0x3b, 0xc2, 0x6c, 0x49, 0x2f, 0xe5, 0xd5,
++      0x03, 0x59, 0x84, 0x53, 0xcb, 0xfe, 0x92, 0x71,
++      0x2e, 0x7c, 0x21, 0xcc, 0x99, 0x85, 0x7f, 0xb8,
++      0x74, 0x90, 0x13, 0x42, 0x3f, 0xe0, 0x6b, 0x1d,
++      0xf2, 0x4d, 0x54, 0xd4, 0xfc, 0x3a, 0x05, 0xe6,
++      0x74, 0xaf, 0xa6, 0xa0, 0x2a, 0x20, 0x23, 0x5d,
++      0x34, 0x5c, 0xd9, 0x3e, 0x4e, 0xfa, 0x93, 0xe7,
++      0xaa, 0xe9, 0x6f, 0x08, 0x43, 0x67, 0x41, 0xc5,
++      0xad, 0xfb, 0x31, 0x95, 0x82, 0x73, 0x32, 0xd8,
++      0xa6, 0xa3, 0xed, 0x0e, 0x2d, 0xf6, 0x5f, 0xfd,
++      0x80, 0xa6, 0x7a, 0xe0, 0xdf, 0x78, 0x15, 0x29,
++      0x74, 0x33, 0xd0, 0x9e, 0x83, 0x86, 0x72, 0x22,
++      0x57, 0x29, 0xb9, 0x9e, 0x5d, 0xd3, 0x1a, 0xb5,
++      0x96, 0x72, 0x41, 0x3d, 0xf1, 0x64, 0x43, 0x67,
++      0xee, 0xaa, 0x5c, 0xd3, 0x9a, 0x96, 0x13, 0x11,
++      0x5d, 0xf3, 0x0c, 0x87, 0x82, 0x1e, 0x41, 0x9e,
++      0xd0, 0x27, 0xd7, 0x54, 0x3b, 0x67, 0x73, 0x09,
++      0x91, 0xe9, 0xd5, 0x36, 0xa7, 0xb5, 0x55, 0xe4,
++      0xf3, 0x21, 0x51, 0x49, 0x22, 0x07, 0x55, 0x4f,
++      0x44, 0x4b, 0xd2, 0x15, 0x93, 0x17, 0x2a, 0xfa,
++      0x4d, 0x4a, 0x57, 0xdb, 0x4c, 0xa6, 0xeb, 0xec,
++      0x53, 0x25, 0x6c, 0x21, 0xed, 0x00, 0x4c, 0x3b,
++      0xca, 0x14, 0x57, 0xa9, 0xd6, 0x6a, 0xcd, 0x8d,
++      0x5e, 0x74, 0xac, 0x72, 0xc1, 0x97, 0xe5, 0x1b,
++      0x45, 0x4e, 0xda, 0xfc, 0xcc, 0x40, 0xe8, 0x48,
++      0x88, 0x0b, 0xa3, 0xe3, 0x8d, 0x83, 0x42, 0xc3,
++      0x23, 0xfd, 0x68, 0xb5, 0x8e, 0xf1, 0x9d, 0x63,
++      0x77, 0xe9, 0xa3, 0x8e, 0x8c, 0x26, 0x6b, 0xbd,
++      0x72, 0x73, 0x35, 0x0c, 0x03, 0xf8, 0x43, 0x78,
++      0x52, 0x71, 0x15, 0x1f, 0x71, 0x5d, 0x6e, 0xed,
++      0xb9, 0xcc, 0x86, 0x30, 0xdb, 0x2b, 0xd3, 0x82,
++      0x88, 0x23, 0x71, 0x90, 0x53, 0x5c, 0xa9, 0x2f,
++      0x76, 0x01, 0xb7, 0x9a, 0xfe, 0x43, 0x55, 0xa3,
++      0x04, 0x9b, 0x0e, 0xe4, 0x59, 0xdf, 0xc9, 0xe9,
++      0xb1, 0xea, 0x29, 0x28, 0x3c, 0x5c, 0xae, 0x72,
++      0x84, 0xb6, 0xc6, 0xeb, 0x0c, 0x27, 0x07, 0x74,
++      0x90, 0x0d, 0x31, 0xb0, 0x00, 0x77, 0xe9, 0x40,
++      0x70, 0x6f, 0x68, 0xa7, 0xfd, 0x06, 0xec, 0x4b,
++      0xc0, 0xb7, 0xac, 0xbc, 0x33, 0xb7, 0x6d, 0x0a,
++      0xbd, 0x12, 0x1b, 0x59, 0xcb, 0xdd, 0x32, 0xf5,
++      0x1d, 0x94, 0x57, 0x76, 0x9e, 0x0c, 0x18, 0x98,
++      0x71, 0xd7, 0x2a, 0xdb, 0x0b, 0x7b, 0xa7, 0x71,
++      0xb7, 0x67, 0x81, 0x23, 0x96, 0xae, 0xb9, 0x7e,
++      0x32, 0x43, 0x92, 0x8a, 0x19, 0xa0, 0xc4, 0xd4,
++      0x3b, 0x57, 0xf9, 0x4a, 0x2c, 0xfb, 0x51, 0x46,
++      0xbb, 0xcb, 0x5d, 0xb3, 0xef, 0x13, 0x93, 0x6e,
++      0x68, 0x42, 0x54, 0x57, 0xd3, 0x6a, 0x3a, 0x8f,
++      0x9d, 0x66, 0xbf, 0xbd, 0x36, 0x23, 0xf5, 0x93,
++      0x83, 0x7b, 0x9c, 0xc0, 0xdd, 0xc5, 0x49, 0xc0,
++      0x64, 0xed, 0x07, 0x12, 0xb3, 0xe6, 0xe4, 0xe5,
++      0x38, 0x95, 0x23, 0xb1, 0xa0, 0x3b, 0x1a, 0x61,
++      0xda, 0x17, 0xac, 0xc3, 0x58, 0xdd, 0x74, 0x64,
++      0x22, 0x11, 0xe8, 0x32, 0x1d, 0x16, 0x93, 0x85,
++      0x99, 0xa5, 0x9c, 0x34, 0x55, 0xb1, 0xe9, 0x20,
++      0x72, 0xc9, 0x28, 0x7b, 0x79, 0x00, 0xa1, 0xa6,
++      0xa3, 0x27, 0x40, 0x18, 0x8a, 0x54, 0xe0, 0xcc,
++      0xe8, 0x4e, 0x8e, 0x43, 0x96, 0xe7, 0x3f, 0xc8,
++      0xe9, 0xb2, 0xf9, 0xc9, 0xda, 0x04, 0x71, 0x50,
++      0x47, 0xe4, 0xaa, 0xce, 0xa2, 0x30, 0xc8, 0xe4,
++      0xac, 0xc7, 0x0d, 0x06, 0x2e, 0xe6, 0xe8, 0x80,
++      0x36, 0x29, 0x9e, 0x01, 0xb8, 0xc3, 0xf0, 0xa0,
++      0x5d, 0x7a, 0xca, 0x4d, 0xa0, 0x57, 0xbd, 0x2a,
++      0x45, 0xa7, 0x7f, 0x9c, 0x93, 0x07, 0x8f, 0x35,
++      0x67, 0x92, 0xe3, 0xe9, 0x7f, 0xa8, 0x61, 0x43,
++      0x9e, 0x25, 0x4f, 0x33, 0x76, 0x13, 0x6e, 0x12,
++      0xb9, 0xdd, 0xa4, 0x7c, 0x08, 0x9f, 0x7c, 0xe7,
++      0x0a, 0x8d, 0x84, 0x06, 0xa4, 0x33, 0x17, 0x34,
++      0x5e, 0x10, 0x7c, 0xc0, 0xa8, 0x3d, 0x1f, 0x42,
++      0x20, 0x51, 0x65, 0x5d, 0x09, 0xc3, 0xaa, 0xc0,
++      0xc8, 0x0d, 0xf0, 0x79, 0xbc, 0x20, 0x1b, 0x95,
++      0xe7, 0x06, 0x7d, 0x47, 0x20, 0x03, 0x1a, 0x74,
++      0xdd, 0xe2, 0xd4, 0xae, 0x38, 0x71, 0x9b, 0xf5,
++      0x80, 0xec, 0x08, 0x4e, 0x56, 0xba, 0x76, 0x12,
++      0x1a, 0xdf, 0x48, 0xf3, 0xae, 0xb3, 0xe6, 0xe6,
++      0xbe, 0xc0, 0x91, 0x2e, 0x01, 0xb3, 0x01, 0x86,
++      0xa2, 0xb9, 0x52, 0xd1, 0x21, 0xae, 0xd4, 0x97,
++      0x1d, 0xef, 0x41, 0x12, 0x95, 0x3d, 0x48, 0x45,
++      0x1c, 0x56, 0x32, 0x8f, 0xb8, 0x43, 0xbb, 0x19,
++      0xf3, 0xca, 0xe9, 0xeb, 0x6d, 0x84, 0xbe, 0x86,
++      0x06, 0xe2, 0x36, 0xb2, 0x62, 0x9d, 0xd3, 0x4c,
++      0x48, 0x18, 0x54, 0x13, 0x4e, 0xcf, 0xfd, 0xba,
++      0x84, 0xb9, 0x30, 0x53, 0xcf, 0xfb, 0xb9, 0x29,
++      0x8f, 0xdc, 0x9f, 0xef, 0x60, 0x0b, 0x64, 0xf6,
++      0x8b, 0xee, 0xa6, 0x91, 0xc2, 0x41, 0x6c, 0xf6,
++      0xfa, 0x79, 0x67, 0x4b, 0xc1, 0x3f, 0xaf, 0x09,
++      0x81, 0xd4, 0x5d, 0xcb, 0x09, 0xdf, 0x36, 0x31,
++      0xc0, 0x14, 0x3c, 0x7c, 0x0e, 0x65, 0x95, 0x99,
++      0x6d, 0xa3, 0xf4, 0xd7, 0x38, 0xee, 0x1a, 0x2b,
++      0x37, 0xe2, 0xa4, 0x3b, 0x4b, 0xd0, 0x65, 0xca,
++      0xf8, 0xc3, 0xe8, 0x15, 0x20, 0xef, 0xf2, 0x00,
++      0xfd, 0x01, 0x09, 0xc5, 0xc8, 0x17, 0x04, 0x93,
++      0xd0, 0x93, 0x03, 0x55, 0xc5, 0xfe, 0x32, 0xa3,
++      0x3e, 0x28, 0x2d, 0x3b, 0x93, 0x8a, 0xcc, 0x07,
++      0x72, 0x80, 0x8b, 0x74, 0x16, 0x24, 0xbb, 0xda,
++      0x94, 0x39, 0x30, 0x8f, 0xb1, 0xcd, 0x4a, 0x90,
++      0x92, 0x7c, 0x14, 0x8f, 0x95, 0x4e, 0xac, 0x9b,
++      0xd8, 0x8f, 0x1a, 0x87, 0xa4, 0x32, 0x27, 0x8a,
++      0xba, 0xf7, 0x41, 0xcf, 0x84, 0x37, 0x19, 0xe6,
++      0x06, 0xf5, 0x0e, 0xcf, 0x36, 0xf5, 0x9e, 0x6c,
++      0xde, 0xbc, 0xff, 0x64, 0x7e, 0x4e, 0x59, 0x57,
++      0x48, 0xfe, 0x14, 0xf7, 0x9c, 0x93, 0x5d, 0x15,
++      0xad, 0xcc, 0x11, 0xb1, 0x17, 0x18, 0xb2, 0x7e,
++      0xcc, 0xab, 0xe9, 0xce, 0x7d, 0x77, 0x5b, 0x51,
++      0x1b, 0x1e, 0x20, 0xa8, 0x32, 0x06, 0x0e, 0x75,
++      0x93, 0xac, 0xdb, 0x35, 0x37, 0x1f, 0xe9, 0x19,
++      0x1d, 0xb4, 0x71, 0x97, 0xd6, 0x4e, 0x2c, 0x08,
++      0xa5, 0x13, 0xf9, 0x0e, 0x7e, 0x78, 0x6e, 0x14,
++      0xe0, 0xa9, 0xb9, 0x96, 0x4c, 0x80, 0x82, 0xba,
++      0x17, 0xb3, 0x9d, 0x69, 0xb0, 0x84, 0x46, 0xff,
++      0xf9, 0x52, 0x79, 0x94, 0x58, 0x3a, 0x62, 0x90,
++      0x15, 0x35, 0x71, 0x10, 0x37, 0xed, 0xa1, 0x8e,
++      0x53, 0x6e, 0xf4, 0x26, 0x57, 0x93, 0x15, 0x93,
++      0xf6, 0x81, 0x2c, 0x5a, 0x10, 0xda, 0x92, 0xad,
++      0x2f, 0xdb, 0x28, 0x31, 0x2d, 0x55, 0x04, 0xd2,
++      0x06, 0x28, 0x8c, 0x1e, 0xdc, 0xea, 0x54, 0xac,
++      0xff, 0xb7, 0x6c, 0x30, 0x15, 0xd4, 0xb4, 0x0d,
++      0x00, 0x93, 0x57, 0xdd, 0xd2, 0x07, 0x07, 0x06,
++      0xd9, 0x43, 0x9b, 0xcd, 0x3a, 0xf4, 0x7d, 0x4c,
++      0x36, 0x5d, 0x23, 0xa2, 0xcc, 0x57, 0x40, 0x91,
++      0xe9, 0x2c, 0x2f, 0x2c, 0xd5, 0x30, 0x9b, 0x17,
++      0xb0, 0xc9, 0xf7, 0xa7, 0x2f, 0xd1, 0x93, 0x20,
++      0x6b, 0xc6, 0xc1, 0xe4, 0x6f, 0xcb, 0xd1, 0xe7,
++      0x09, 0x0f, 0x9e, 0xdc, 0xaa, 0x9f, 0x2f, 0xdf,
++      0x56, 0x9f, 0xd4, 0x33, 0x04, 0xaf, 0xd3, 0x6c,
++      0x58, 0x61, 0xf0, 0x30, 0xec, 0xf2, 0x7f, 0xf2,
++      0x9c, 0xdf, 0x39, 0xbb, 0x6f, 0xa2, 0x8c, 0x7e,
++      0xc4, 0x22, 0x51, 0x71, 0xc0, 0x4d, 0x14, 0x1a,
++      0xc4, 0xcd, 0x04, 0xd9, 0x87, 0x08, 0x50, 0x05,
++      0xcc, 0xaf, 0xf6, 0xf0, 0x8f, 0x92, 0x54, 0x58,
++      0xc2, 0xc7, 0x09, 0x7a, 0x59, 0x02, 0x05, 0xe8,
++      0xb0, 0x86, 0xd9, 0xbf, 0x7b, 0x35, 0x51, 0x4d,
++      0xaf, 0x08, 0x97, 0x2c, 0x65, 0xda, 0x2a, 0x71,
++      0x3a, 0xa8, 0x51, 0xcc, 0xf2, 0x73, 0x27, 0xc3,
++      0xfd, 0x62, 0xcf, 0xe3, 0xb2, 0xca, 0xcb, 0xbe,
++      0x1a, 0x0a, 0xa1, 0x34, 0x7b, 0x77, 0xc4, 0x62,
++      0x68, 0x78, 0x5f, 0x94, 0x07, 0x04, 0x65, 0x16,
++      0x4b, 0x61, 0xcb, 0xff, 0x75, 0x26, 0x50, 0x66,
++      0x1f, 0x6e, 0x93, 0xf8, 0xc5, 0x51, 0xeb, 0xa4,
++      0x4a, 0x48, 0x68, 0x6b, 0xe2, 0x5e, 0x44, 0xb2,
++      0x50, 0x2c, 0x6c, 0xae, 0x79, 0x4e, 0x66, 0x35,
++      0x81, 0x50, 0xac, 0xbc, 0x3f, 0xb1, 0x0c, 0xf3,
++      0x05, 0x3c, 0x4a, 0xa3, 0x6c, 0x2a, 0x79, 0xb4,
++      0xb7, 0xab, 0xca, 0xc7, 0x9b, 0x8e, 0xcd, 0x5f,
++      0x11, 0x03, 0xcb, 0x30, 0xa3, 0xab, 0xda, 0xfe,
++      0x64, 0xb9, 0xbb, 0xd8, 0x5e, 0x3a, 0x1a, 0x56,
++      0xe5, 0x05, 0x48, 0x90, 0x1e, 0x61, 0x69, 0x1b,
++      0x22, 0xe6, 0x1a, 0x3c, 0x75, 0xad, 0x1f, 0x37,
++      0x28, 0xdc, 0xe4, 0x6d, 0xbd, 0x42, 0xdc, 0xd3,
++      0xc8, 0xb6, 0x1c, 0x48, 0xfe, 0x94, 0x77, 0x7f,
++      0xbd, 0x62, 0xac, 0xa3, 0x47, 0x27, 0xcf, 0x5f,
++      0xd9, 0xdb, 0xaf, 0xec, 0xf7, 0x5e, 0xc1, 0xb0,
++      0x9d, 0x01, 0x26, 0x99, 0x7e, 0x8f, 0x03, 0x70,
++      0xb5, 0x42, 0xbe, 0x67, 0x28, 0x1b, 0x7c, 0xbd,
++      0x61, 0x21, 0x97, 0xcc, 0x5c, 0xe1, 0x97, 0x8f,
++      0x8d, 0xde, 0x2b, 0xaa, 0xa7, 0x71, 0x1d, 0x1e,
++      0x02, 0x73, 0x70, 0x58, 0x32, 0x5b, 0x1d, 0x67,
++      0x3d, 0xe0, 0x74, 0x4f, 0x03, 0xf2, 0x70, 0x51,
++      0x79, 0xf1, 0x61, 0x70, 0x15, 0x74, 0x9d, 0x23,
++      0x89, 0xde, 0xac, 0xfd, 0xde, 0xd0, 0x1f, 0xc3,
++      0x87, 0x44, 0x35, 0x4b, 0xe5, 0xb0, 0x60, 0xc5,
++      0x22, 0xe4, 0x9e, 0xca, 0xeb, 0xd5, 0x3a, 0x09,
++      0x45, 0xa4, 0xdb, 0xfa, 0x3f, 0xeb, 0x1b, 0xc7,
++      0xc8, 0x14, 0x99, 0x51, 0x92, 0x10, 0xed, 0xed,
++      0x28, 0xe0, 0xa1, 0xf8, 0x26, 0xcf, 0xcd, 0xcb,
++      0x63, 0xa1, 0x3b, 0xe3, 0xdf, 0x7e, 0xfe, 0xa6,
++      0xf0, 0x81, 0x9a, 0xbf, 0x55, 0xde, 0x54, 0xd5,
++      0x56, 0x60, 0x98, 0x10, 0x68, 0xf4, 0x38, 0x96,
++      0x8e, 0x6f, 0x1d, 0x44, 0x7f, 0xd6, 0x2f, 0xfe,
++      0x55, 0xfb, 0x0c, 0x7e, 0x67, 0xe2, 0x61, 0x44,
++      0xed, 0xf2, 0x35, 0x30, 0x5d, 0xe9, 0xc7, 0xd6,
++      0x6d, 0xe0, 0xa0, 0xed, 0xf3, 0xfc, 0xd8, 0x3e,
++      0x0a, 0x7b, 0xcd, 0xaf, 0x65, 0x68, 0x18, 0xc0,
++      0xec, 0x04, 0x1c, 0x74, 0x6d, 0xe2, 0x6e, 0x79,
++      0xd4, 0x11, 0x2b, 0x62, 0xd5, 0x27, 0xad, 0x4f,
++      0x01, 0x59, 0x73, 0xcc, 0x6a, 0x53, 0xfb, 0x2d,
++      0xd5, 0x4e, 0x99, 0x21, 0x65, 0x4d, 0xf5, 0x82,
++      0xf7, 0xd8, 0x42, 0xce, 0x6f, 0x3d, 0x36, 0x47,
++      0xf1, 0x05, 0x16, 0xe8, 0x1b, 0x6a, 0x8f, 0x93,
++      0xf2, 0x8f, 0x37, 0x40, 0x12, 0x28, 0xa3, 0xe6,
++      0xb9, 0x17, 0x4a, 0x1f, 0xb1, 0xd1, 0x66, 0x69,
++      0x86, 0xc4, 0xfc, 0x97, 0xae, 0x3f, 0x8f, 0x1e,
++      0x2b, 0xdf, 0xcd, 0xf9, 0x3c
++};
++static const u8 enc_assoc011[] __initconst = {
++      0xd6, 0x31, 0xda, 0x5d, 0x42, 0x5e, 0xd7
++};
++static const u8 enc_nonce011[] __initconst = {
++      0xfd, 0x87, 0xd4, 0xd8, 0x62, 0xfd, 0xec, 0xaa
++};
++static const u8 enc_key011[] __initconst = {
++      0x35, 0x4e, 0xb5, 0x70, 0x50, 0x42, 0x8a, 0x85,
++      0xf2, 0xfb, 0xed, 0x7b, 0xd0, 0x9e, 0x97, 0xca,
++      0xfa, 0x98, 0x66, 0x63, 0xee, 0x37, 0xcc, 0x52,
++      0xfe, 0xd1, 0xdf, 0x95, 0x15, 0x34, 0x29, 0x38
++};
++
++static const u8 enc_input012[] __initconst = {
++      0x74, 0xa6, 0x3e, 0xe4, 0xb1, 0xcb, 0xaf, 0xb0,
++      0x40, 0xe5, 0x0f, 0x9e, 0xf1, 0xf2, 0x89, 0xb5,
++      0x42, 0x34, 0x8a, 0xa1, 0x03, 0xb7, 0xe9, 0x57,
++      0x46, 0xbe, 0x20, 0xe4, 0x6e, 0xb0, 0xeb, 0xff,
++      0xea, 0x07, 0x7e, 0xef, 0xe2, 0x55, 0x9f, 0xe5,
++      0x78, 0x3a, 0xb7, 0x83, 0xc2, 0x18, 0x40, 0x7b,
++      0xeb, 0xcd, 0x81, 0xfb, 0x90, 0x12, 0x9e, 0x46,
++      0xa9, 0xd6, 0x4a, 0xba, 0xb0, 0x62, 0xdb, 0x6b,
++      0x99, 0xc4, 0xdb, 0x54, 0x4b, 0xb8, 0xa5, 0x71,
++      0xcb, 0xcd, 0x63, 0x32, 0x55, 0xfb, 0x31, 0xf0,
++      0x38, 0xf5, 0xbe, 0x78, 0xe4, 0x45, 0xce, 0x1b,
++      0x6a, 0x5b, 0x0e, 0xf4, 0x16, 0xe4, 0xb1, 0x3d,
++      0xf6, 0x63, 0x7b, 0xa7, 0x0c, 0xde, 0x6f, 0x8f,
++      0x74, 0xdf, 0xe0, 0x1e, 0x9d, 0xce, 0x8f, 0x24,
++      0xef, 0x23, 0x35, 0x33, 0x7b, 0x83, 0x34, 0x23,
++      0x58, 0x74, 0x14, 0x77, 0x1f, 0xc2, 0x4f, 0x4e,
++      0xc6, 0x89, 0xf9, 0x52, 0x09, 0x37, 0x64, 0x14,
++      0xc4, 0x01, 0x6b, 0x9d, 0x77, 0xe8, 0x90, 0x5d,
++      0xa8, 0x4a, 0x2a, 0xef, 0x5c, 0x7f, 0xeb, 0xbb,
++      0xb2, 0xc6, 0x93, 0x99, 0x66, 0xdc, 0x7f, 0xd4,
++      0x9e, 0x2a, 0xca, 0x8d, 0xdb, 0xe7, 0x20, 0xcf,
++      0xe4, 0x73, 0xae, 0x49, 0x7d, 0x64, 0x0f, 0x0e,
++      0x28, 0x46, 0xa9, 0xa8, 0x32, 0xe4, 0x0e, 0xf6,
++      0x51, 0x53, 0xb8, 0x3c, 0xb1, 0xff, 0xa3, 0x33,
++      0x41, 0x75, 0xff, 0xf1, 0x6f, 0xf1, 0xfb, 0xbb,
++      0x83, 0x7f, 0x06, 0x9b, 0xe7, 0x1b, 0x0a, 0xe0,
++      0x5c, 0x33, 0x60, 0x5b, 0xdb, 0x5b, 0xed, 0xfe,
++      0xa5, 0x16, 0x19, 0x72, 0xa3, 0x64, 0x23, 0x00,
++      0x02, 0xc7, 0xf3, 0x6a, 0x81, 0x3e, 0x44, 0x1d,
++      0x79, 0x15, 0x5f, 0x9a, 0xde, 0xe2, 0xfd, 0x1b,
++      0x73, 0xc1, 0xbc, 0x23, 0xba, 0x31, 0xd2, 0x50,
++      0xd5, 0xad, 0x7f, 0x74, 0xa7, 0xc9, 0xf8, 0x3e,
++      0x2b, 0x26, 0x10, 0xf6, 0x03, 0x36, 0x74, 0xe4,
++      0x0e, 0x6a, 0x72, 0xb7, 0x73, 0x0a, 0x42, 0x28,
++      0xc2, 0xad, 0x5e, 0x03, 0xbe, 0xb8, 0x0b, 0xa8,
++      0x5b, 0xd4, 0xb8, 0xba, 0x52, 0x89, 0xb1, 0x9b,
++      0xc1, 0xc3, 0x65, 0x87, 0xed, 0xa5, 0xf4, 0x86,
++      0xfd, 0x41, 0x80, 0x91, 0x27, 0x59, 0x53, 0x67,
++      0x15, 0x78, 0x54, 0x8b, 0x2d, 0x3d, 0xc7, 0xff,
++      0x02, 0x92, 0x07, 0x5f, 0x7a, 0x4b, 0x60, 0x59,
++      0x3c, 0x6f, 0x5c, 0xd8, 0xec, 0x95, 0xd2, 0xfe,
++      0xa0, 0x3b, 0xd8, 0x3f, 0xd1, 0x69, 0xa6, 0xd6,
++      0x41, 0xb2, 0xf4, 0x4d, 0x12, 0xf4, 0x58, 0x3e,
++      0x66, 0x64, 0x80, 0x31, 0x9b, 0xa8, 0x4c, 0x8b,
++      0x07, 0xb2, 0xec, 0x66, 0x94, 0x66, 0x47, 0x50,
++      0x50, 0x5f, 0x18, 0x0b, 0x0e, 0xd6, 0xc0, 0x39,
++      0x21, 0x13, 0x9e, 0x33, 0xbc, 0x79, 0x36, 0x02,
++      0x96, 0x70, 0xf0, 0x48, 0x67, 0x2f, 0x26, 0xe9,
++      0x6d, 0x10, 0xbb, 0xd6, 0x3f, 0xd1, 0x64, 0x7a,
++      0x2e, 0xbe, 0x0c, 0x61, 0xf0, 0x75, 0x42, 0x38,
++      0x23, 0xb1, 0x9e, 0x9f, 0x7c, 0x67, 0x66, 0xd9,
++      0x58, 0x9a, 0xf1, 0xbb, 0x41, 0x2a, 0x8d, 0x65,
++      0x84, 0x94, 0xfc, 0xdc, 0x6a, 0x50, 0x64, 0xdb,
++      0x56, 0x33, 0x76, 0x00, 0x10, 0xed, 0xbe, 0xd2,
++      0x12, 0xf6, 0xf6, 0x1b, 0xa2, 0x16, 0xde, 0xae,
++      0x31, 0x95, 0xdd, 0xb1, 0x08, 0x7e, 0x4e, 0xee,
++      0xe7, 0xf9, 0xa5, 0xfb, 0x5b, 0x61, 0x43, 0x00,
++      0x40, 0xf6, 0x7e, 0x02, 0x04, 0x32, 0x4e, 0x0c,
++      0xe2, 0x66, 0x0d, 0xd7, 0x07, 0x98, 0x0e, 0xf8,
++      0x72, 0x34, 0x6d, 0x95, 0x86, 0xd7, 0xcb, 0x31,
++      0x54, 0x47, 0xd0, 0x38, 0x29, 0x9c, 0x5a, 0x68,
++      0xd4, 0x87, 0x76, 0xc9, 0xe7, 0x7e, 0xe3, 0xf4,
++      0x81, 0x6d, 0x18, 0xcb, 0xc9, 0x05, 0xaf, 0xa0,
++      0xfb, 0x66, 0xf7, 0xf1, 0x1c, 0xc6, 0x14, 0x11,
++      0x4f, 0x2b, 0x79, 0x42, 0x8b, 0xbc, 0xac, 0xe7,
++      0x6c, 0xfe, 0x0f, 0x58, 0xe7, 0x7c, 0x78, 0x39,
++      0x30, 0xb0, 0x66, 0x2c, 0x9b, 0x6d, 0x3a, 0xe1,
++      0xcf, 0xc9, 0xa4, 0x0e, 0x6d, 0x6d, 0x8a, 0xa1,
++      0x3a, 0xe7, 0x28, 0xd4, 0x78, 0x4c, 0xa6, 0xa2,
++      0x2a, 0xa6, 0x03, 0x30, 0xd7, 0xa8, 0x25, 0x66,
++      0x87, 0x2f, 0x69, 0x5c, 0x4e, 0xdd, 0xa5, 0x49,
++      0x5d, 0x37, 0x4a, 0x59, 0xc4, 0xaf, 0x1f, 0xa2,
++      0xe4, 0xf8, 0xa6, 0x12, 0x97, 0xd5, 0x79, 0xf5,
++      0xe2, 0x4a, 0x2b, 0x5f, 0x61, 0xe4, 0x9e, 0xe3,
++      0xee, 0xb8, 0xa7, 0x5b, 0x2f, 0xf4, 0x9e, 0x6c,
++      0xfb, 0xd1, 0xc6, 0x56, 0x77, 0xba, 0x75, 0xaa,
++      0x3d, 0x1a, 0xa8, 0x0b, 0xb3, 0x68, 0x24, 0x00,
++      0x10, 0x7f, 0xfd, 0xd7, 0xa1, 0x8d, 0x83, 0x54,
++      0x4f, 0x1f, 0xd8, 0x2a, 0xbe, 0x8a, 0x0c, 0x87,
++      0xab, 0xa2, 0xde, 0xc3, 0x39, 0xbf, 0x09, 0x03,
++      0xa5, 0xf3, 0x05, 0x28, 0xe1, 0xe1, 0xee, 0x39,
++      0x70, 0x9c, 0xd8, 0x81, 0x12, 0x1e, 0x02, 0x40,
++      0xd2, 0x6e, 0xf0, 0xeb, 0x1b, 0x3d, 0x22, 0xc6,
++      0xe5, 0xe3, 0xb4, 0x5a, 0x98, 0xbb, 0xf0, 0x22,
++      0x28, 0x8d, 0xe5, 0xd3, 0x16, 0x48, 0x24, 0xa5,
++      0xe6, 0x66, 0x0c, 0xf9, 0x08, 0xf9, 0x7e, 0x1e,
++      0xe1, 0x28, 0x26, 0x22, 0xc7, 0xc7, 0x0a, 0x32,
++      0x47, 0xfa, 0xa3, 0xbe, 0x3c, 0xc4, 0xc5, 0x53,
++      0x0a, 0xd5, 0x94, 0x4a, 0xd7, 0x93, 0xd8, 0x42,
++      0x99, 0xb9, 0x0a, 0xdb, 0x56, 0xf7, 0xb9, 0x1c,
++      0x53, 0x4f, 0xfa, 0xd3, 0x74, 0xad, 0xd9, 0x68,
++      0xf1, 0x1b, 0xdf, 0x61, 0xc6, 0x5e, 0xa8, 0x48,
++      0xfc, 0xd4, 0x4a, 0x4c, 0x3c, 0x32, 0xf7, 0x1c,
++      0x96, 0x21, 0x9b, 0xf9, 0xa3, 0xcc, 0x5a, 0xce,
++      0xd5, 0xd7, 0x08, 0x24, 0xf6, 0x1c, 0xfd, 0xdd,
++      0x38, 0xc2, 0x32, 0xe9, 0xb8, 0xe7, 0xb6, 0xfa,
++      0x9d, 0x45, 0x13, 0x2c, 0x83, 0xfd, 0x4a, 0x69,
++      0x82, 0xcd, 0xdc, 0xb3, 0x76, 0x0c, 0x9e, 0xd8,
++      0xf4, 0x1b, 0x45, 0x15, 0xb4, 0x97, 0xe7, 0x58,
++      0x34, 0xe2, 0x03, 0x29, 0x5a, 0xbf, 0xb6, 0xe0,
++      0x5d, 0x13, 0xd9, 0x2b, 0xb4, 0x80, 0xb2, 0x45,
++      0x81, 0x6a, 0x2e, 0x6c, 0x89, 0x7d, 0xee, 0xbb,
++      0x52, 0xdd, 0x1f, 0x18, 0xe7, 0x13, 0x6b, 0x33,
++      0x0e, 0xea, 0x36, 0x92, 0x77, 0x7b, 0x6d, 0x9c,
++      0x5a, 0x5f, 0x45, 0x7b, 0x7b, 0x35, 0x62, 0x23,
++      0xd1, 0xbf, 0x0f, 0xd0, 0x08, 0x1b, 0x2b, 0x80,
++      0x6b, 0x7e, 0xf1, 0x21, 0x47, 0xb0, 0x57, 0xd1,
++      0x98, 0x72, 0x90, 0x34, 0x1c, 0x20, 0x04, 0xff,
++      0x3d, 0x5c, 0xee, 0x0e, 0x57, 0x5f, 0x6f, 0x24,
++      0x4e, 0x3c, 0xea, 0xfc, 0xa5, 0xa9, 0x83, 0xc9,
++      0x61, 0xb4, 0x51, 0x24, 0xf8, 0x27, 0x5e, 0x46,
++      0x8c, 0xb1, 0x53, 0x02, 0x96, 0x35, 0xba, 0xb8,
++      0x4c, 0x71, 0xd3, 0x15, 0x59, 0x35, 0x22, 0x20,
++      0xad, 0x03, 0x9f, 0x66, 0x44, 0x3b, 0x9c, 0x35,
++      0x37, 0x1f, 0x9b, 0xbb, 0xf3, 0xdb, 0x35, 0x63,
++      0x30, 0x64, 0xaa, 0xa2, 0x06, 0xa8, 0x5d, 0xbb,
++      0xe1, 0x9f, 0x70, 0xec, 0x82, 0x11, 0x06, 0x36,
++      0xec, 0x8b, 0x69, 0x66, 0x24, 0x44, 0xc9, 0x4a,
++      0x57, 0xbb, 0x9b, 0x78, 0x13, 0xce, 0x9c, 0x0c,
++      0xba, 0x92, 0x93, 0x63, 0xb8, 0xe2, 0x95, 0x0f,
++      0x0f, 0x16, 0x39, 0x52, 0xfd, 0x3a, 0x6d, 0x02,
++      0x4b, 0xdf, 0x13, 0xd3, 0x2a, 0x22, 0xb4, 0x03,
++      0x7c, 0x54, 0x49, 0x96, 0x68, 0x54, 0x10, 0xfa,
++      0xef, 0xaa, 0x6c, 0xe8, 0x22, 0xdc, 0x71, 0x16,
++      0x13, 0x1a, 0xf6, 0x28, 0xe5, 0x6d, 0x77, 0x3d,
++      0xcd, 0x30, 0x63, 0xb1, 0x70, 0x52, 0xa1, 0xc5,
++      0x94, 0x5f, 0xcf, 0xe8, 0xb8, 0x26, 0x98, 0xf7,
++      0x06, 0xa0, 0x0a, 0x70, 0xfa, 0x03, 0x80, 0xac,
++      0xc1, 0xec, 0xd6, 0x4c, 0x54, 0xd7, 0xfe, 0x47,
++      0xb6, 0x88, 0x4a, 0xf7, 0x71, 0x24, 0xee, 0xf3,
++      0xd2, 0xc2, 0x4a, 0x7f, 0xfe, 0x61, 0xc7, 0x35,
++      0xc9, 0x37, 0x67, 0xcb, 0x24, 0x35, 0xda, 0x7e,
++      0xca, 0x5f, 0xf3, 0x8d, 0xd4, 0x13, 0x8e, 0xd6,
++      0xcb, 0x4d, 0x53, 0x8f, 0x53, 0x1f, 0xc0, 0x74,
++      0xf7, 0x53, 0xb9, 0x5e, 0x23, 0x37, 0xba, 0x6e,
++      0xe3, 0x9d, 0x07, 0x55, 0x25, 0x7b, 0xe6, 0x2a,
++      0x64, 0xd1, 0x32, 0xdd, 0x54, 0x1b, 0x4b, 0xc0,
++      0xe1, 0xd7, 0x69, 0x58, 0xf8, 0x93, 0x29, 0xc4,
++      0xdd, 0x23, 0x2f, 0xa5, 0xfc, 0x9d, 0x7e, 0xf8,
++      0xd4, 0x90, 0xcd, 0x82, 0x55, 0xdc, 0x16, 0x16,
++      0x9f, 0x07, 0x52, 0x9b, 0x9d, 0x25, 0xed, 0x32,
++      0xc5, 0x7b, 0xdf, 0xf6, 0x83, 0x46, 0x3d, 0x65,
++      0xb7, 0xef, 0x87, 0x7a, 0x12, 0x69, 0x8f, 0x06,
++      0x7c, 0x51, 0x15, 0x4a, 0x08, 0xe8, 0xac, 0x9a,
++      0x0c, 0x24, 0xa7, 0x27, 0xd8, 0x46, 0x2f, 0xe7,
++      0x01, 0x0e, 0x1c, 0xc6, 0x91, 0xb0, 0x6e, 0x85,
++      0x65, 0xf0, 0x29, 0x0d, 0x2e, 0x6b, 0x3b, 0xfb,
++      0x4b, 0xdf, 0xe4, 0x80, 0x93, 0x03, 0x66, 0x46,
++      0x3e, 0x8a, 0x6e, 0xf3, 0x5e, 0x4d, 0x62, 0x0e,
++      0x49, 0x05, 0xaf, 0xd4, 0xf8, 0x21, 0x20, 0x61,
++      0x1d, 0x39, 0x17, 0xf4, 0x61, 0x47, 0x95, 0xfb,
++      0x15, 0x2e, 0xb3, 0x4f, 0xd0, 0x5d, 0xf5, 0x7d,
++      0x40, 0xda, 0x90, 0x3c, 0x6b, 0xcb, 0x17, 0x00,
++      0x13, 0x3b, 0x64, 0x34, 0x1b, 0xf0, 0xf2, 0xe5,
++      0x3b, 0xb2, 0xc7, 0xd3, 0x5f, 0x3a, 0x44, 0xa6,
++      0x9b, 0xb7, 0x78, 0x0e, 0x42, 0x5d, 0x4c, 0xc1,
++      0xe9, 0xd2, 0xcb, 0xb7, 0x78, 0xd1, 0xfe, 0x9a,
++      0xb5, 0x07, 0xe9, 0xe0, 0xbe, 0xe2, 0x8a, 0xa7,
++      0x01, 0x83, 0x00, 0x8c, 0x5c, 0x08, 0xe6, 0x63,
++      0x12, 0x92, 0xb7, 0xb7, 0xa6, 0x19, 0x7d, 0x38,
++      0x13, 0x38, 0x92, 0x87, 0x24, 0xf9, 0x48, 0xb3,
++      0x5e, 0x87, 0x6a, 0x40, 0x39, 0x5c, 0x3f, 0xed,
++      0x8f, 0xee, 0xdb, 0x15, 0x82, 0x06, 0xda, 0x49,
++      0x21, 0x2b, 0xb5, 0xbf, 0x32, 0x7c, 0x9f, 0x42,
++      0x28, 0x63, 0xcf, 0xaf, 0x1e, 0xf8, 0xc6, 0xa0,
++      0xd1, 0x02, 0x43, 0x57, 0x62, 0xec, 0x9b, 0x0f,
++      0x01, 0x9e, 0x71, 0xd8, 0x87, 0x9d, 0x01, 0xc1,
++      0x58, 0x77, 0xd9, 0xaf, 0xb1, 0x10, 0x7e, 0xdd,
++      0xa6, 0x50, 0x96, 0xe5, 0xf0, 0x72, 0x00, 0x6d,
++      0x4b, 0xf8, 0x2a, 0x8f, 0x19, 0xf3, 0x22, 0x88,
++      0x11, 0x4a, 0x8b, 0x7c, 0xfd, 0xb7, 0xed, 0xe1,
++      0xf6, 0x40, 0x39, 0xe0, 0xe9, 0xf6, 0x3d, 0x25,
++      0xe6, 0x74, 0x3c, 0x58, 0x57, 0x7f, 0xe1, 0x22,
++      0x96, 0x47, 0x31, 0x91, 0xba, 0x70, 0x85, 0x28,
++      0x6b, 0x9f, 0x6e, 0x25, 0xac, 0x23, 0x66, 0x2f,
++      0x29, 0x88, 0x28, 0xce, 0x8c, 0x5c, 0x88, 0x53,
++      0xd1, 0x3b, 0xcc, 0x6a, 0x51, 0xb2, 0xe1, 0x28,
++      0x3f, 0x91, 0xb4, 0x0d, 0x00, 0x3a, 0xe3, 0xf8,
++      0xc3, 0x8f, 0xd7, 0x96, 0x62, 0x0e, 0x2e, 0xfc,
++      0xc8, 0x6c, 0x77, 0xa6, 0x1d, 0x22, 0xc1, 0xb8,
++      0xe6, 0x61, 0xd7, 0x67, 0x36, 0x13, 0x7b, 0xbb,
++      0x9b, 0x59, 0x09, 0xa6, 0xdf, 0xf7, 0x6b, 0xa3,
++      0x40, 0x1a, 0xf5, 0x4f, 0xb4, 0xda, 0xd3, 0xf3,
++      0x81, 0x93, 0xc6, 0x18, 0xd9, 0x26, 0xee, 0xac,
++      0xf0, 0xaa, 0xdf, 0xc5, 0x9c, 0xca, 0xc2, 0xa2,
++      0xcc, 0x7b, 0x5c, 0x24, 0xb0, 0xbc, 0xd0, 0x6a,
++      0x4d, 0x89, 0x09, 0xb8, 0x07, 0xfe, 0x87, 0xad,
++      0x0a, 0xea, 0xb8, 0x42, 0xf9, 0x5e, 0xb3, 0x3e,
++      0x36, 0x4c, 0xaf, 0x75, 0x9e, 0x1c, 0xeb, 0xbd,
++      0xbc, 0xbb, 0x80, 0x40, 0xa7, 0x3a, 0x30, 0xbf,
++      0xa8, 0x44, 0xf4, 0xeb, 0x38, 0xad, 0x29, 0xba,
++      0x23, 0xed, 0x41, 0x0c, 0xea, 0xd2, 0xbb, 0x41,
++      0x18, 0xd6, 0xb9, 0xba, 0x65, 0x2b, 0xa3, 0x91,
++      0x6d, 0x1f, 0xa9, 0xf4, 0xd1, 0x25, 0x8d, 0x4d,
++      0x38, 0xff, 0x64, 0xa0, 0xec, 0xde, 0xa6, 0xb6,
++      0x79, 0xab, 0x8e, 0x33, 0x6c, 0x47, 0xde, 0xaf,
++      0x94, 0xa4, 0xa5, 0x86, 0x77, 0x55, 0x09, 0x92,
++      0x81, 0x31, 0x76, 0xc7, 0x34, 0x22, 0x89, 0x8e,
++      0x3d, 0x26, 0x26, 0xd7, 0xfc, 0x1e, 0x16, 0x72,
++      0x13, 0x33, 0x63, 0xd5, 0x22, 0xbe, 0xb8, 0x04,
++      0x34, 0x84, 0x41, 0xbb, 0x80, 0xd0, 0x9f, 0x46,
++      0x48, 0x07, 0xa7, 0xfc, 0x2b, 0x3a, 0x75, 0x55,
++      0x8c, 0xc7, 0x6a, 0xbd, 0x7e, 0x46, 0x08, 0x84,
++      0x0f, 0xd5, 0x74, 0xc0, 0x82, 0x8e, 0xaa, 0x61,
++      0x05, 0x01, 0xb2, 0x47, 0x6e, 0x20, 0x6a, 0x2d,
++      0x58, 0x70, 0x48, 0x32, 0xa7, 0x37, 0xd2, 0xb8,
++      0x82, 0x1a, 0x51, 0xb9, 0x61, 0xdd, 0xfd, 0x9d,
++      0x6b, 0x0e, 0x18, 0x97, 0xf8, 0x45, 0x5f, 0x87,
++      0x10, 0xcf, 0x34, 0x72, 0x45, 0x26, 0x49, 0x70,
++      0xe7, 0xa3, 0x78, 0xe0, 0x52, 0x89, 0x84, 0x94,
++      0x83, 0x82, 0xc2, 0x69, 0x8f, 0xe3, 0xe1, 0x3f,
++      0x60, 0x74, 0x88, 0xc4, 0xf7, 0x75, 0x2c, 0xfb,
++      0xbd, 0xb6, 0xc4, 0x7e, 0x10, 0x0a, 0x6c, 0x90,
++      0x04, 0x9e, 0xc3, 0x3f, 0x59, 0x7c, 0xce, 0x31,
++      0x18, 0x60, 0x57, 0x73, 0x46, 0x94, 0x7d, 0x06,
++      0xa0, 0x6d, 0x44, 0xec, 0xa2, 0x0a, 0x9e, 0x05,
++      0x15, 0xef, 0xca, 0x5c, 0xbf, 0x00, 0xeb, 0xf7,
++      0x3d, 0x32, 0xd4, 0xa5, 0xef, 0x49, 0x89, 0x5e,
++      0x46, 0xb0, 0xa6, 0x63, 0x5b, 0x8a, 0x73, 0xae,
++      0x6f, 0xd5, 0x9d, 0xf8, 0x4f, 0x40, 0xb5, 0xb2,
++      0x6e, 0xd3, 0xb6, 0x01, 0xa9, 0x26, 0xa2, 0x21,
++      0xcf, 0x33, 0x7a, 0x3a, 0xa4, 0x23, 0x13, 0xb0,
++      0x69, 0x6a, 0xee, 0xce, 0xd8, 0x9d, 0x01, 0x1d,
++      0x50, 0xc1, 0x30, 0x6c, 0xb1, 0xcd, 0xa0, 0xf0,
++      0xf0, 0xa2, 0x64, 0x6f, 0xbb, 0xbf, 0x5e, 0xe6,
++      0xab, 0x87, 0xb4, 0x0f, 0x4f, 0x15, 0xaf, 0xb5,
++      0x25, 0xa1, 0xb2, 0xd0, 0x80, 0x2c, 0xfb, 0xf9,
++      0xfe, 0xd2, 0x33, 0xbb, 0x76, 0xfe, 0x7c, 0xa8,
++      0x66, 0xf7, 0xe7, 0x85, 0x9f, 0x1f, 0x85, 0x57,
++      0x88, 0xe1, 0xe9, 0x63, 0xe4, 0xd8, 0x1c, 0xa1,
++      0xfb, 0xda, 0x44, 0x05, 0x2e, 0x1d, 0x3a, 0x1c,
++      0xff, 0xc8, 0x3b, 0xc0, 0xfe, 0xda, 0x22, 0x0b,
++      0x43, 0xd6, 0x88, 0x39, 0x4c, 0x4a, 0xa6, 0x69,
++      0x18, 0x93, 0x42, 0x4e, 0xb5, 0xcc, 0x66, 0x0d,
++      0x09, 0xf8, 0x1e, 0x7c, 0xd3, 0x3c, 0x99, 0x0d,
++      0x50, 0x1d, 0x62, 0xe9, 0x57, 0x06, 0xbf, 0x19,
++      0x88, 0xdd, 0xad, 0x7b, 0x4f, 0xf9, 0xc7, 0x82,
++      0x6d, 0x8d, 0xc8, 0xc4, 0xc5, 0x78, 0x17, 0x20,
++      0x15, 0xc5, 0x52, 0x41, 0xcf, 0x5b, 0xd6, 0x7f,
++      0x94, 0x02, 0x41, 0xe0, 0x40, 0x22, 0x03, 0x5e,
++      0xd1, 0x53, 0xd4, 0x86, 0xd3, 0x2c, 0x9f, 0x0f,
++      0x96, 0xe3, 0x6b, 0x9a, 0x76, 0x32, 0x06, 0x47,
++      0x4b, 0x11, 0xb3, 0xdd, 0x03, 0x65, 0xbd, 0x9b,
++      0x01, 0xda, 0x9c, 0xb9, 0x7e, 0x3f, 0x6a, 0xc4,
++      0x7b, 0xea, 0xd4, 0x3c, 0xb9, 0xfb, 0x5c, 0x6b,
++      0x64, 0x33, 0x52, 0xba, 0x64, 0x78, 0x8f, 0xa4,
++      0xaf, 0x7a, 0x61, 0x8d, 0xbc, 0xc5, 0x73, 0xe9,
++      0x6b, 0x58, 0x97, 0x4b, 0xbf, 0x63, 0x22, 0xd3,
++      0x37, 0x02, 0x54, 0xc5, 0xb9, 0x16, 0x4a, 0xf0,
++      0x19, 0xd8, 0x94, 0x57, 0xb8, 0x8a, 0xb3, 0x16,
++      0x3b, 0xd0, 0x84, 0x8e, 0x67, 0xa6, 0xa3, 0x7d,
++      0x78, 0xec, 0x00
++};
++static const u8 enc_output012[] __initconst = {
++      0x52, 0x34, 0xb3, 0x65, 0x3b, 0xb7, 0xe5, 0xd3,
++      0xab, 0x49, 0x17, 0x60, 0xd2, 0x52, 0x56, 0xdf,
++      0xdf, 0x34, 0x56, 0x82, 0xe2, 0xbe, 0xe5, 0xe1,
++      0x28, 0xd1, 0x4e, 0x5f, 0x4f, 0x01, 0x7d, 0x3f,
++      0x99, 0x6b, 0x30, 0x6e, 0x1a, 0x7c, 0x4c, 0x8e,
++      0x62, 0x81, 0xae, 0x86, 0x3f, 0x6b, 0xd0, 0xb5,
++      0xa9, 0xcf, 0x50, 0xf1, 0x02, 0x12, 0xa0, 0x0b,
++      0x24, 0xe9, 0xe6, 0x72, 0x89, 0x2c, 0x52, 0x1b,
++      0x34, 0x38, 0xf8, 0x75, 0x5f, 0xa0, 0x74, 0xe2,
++      0x99, 0xdd, 0xa6, 0x4b, 0x14, 0x50, 0x4e, 0xf1,
++      0xbe, 0xd6, 0x9e, 0xdb, 0xb2, 0x24, 0x27, 0x74,
++      0x12, 0x4a, 0x78, 0x78, 0x17, 0xa5, 0x58, 0x8e,
++      0x2f, 0xf9, 0xf4, 0x8d, 0xee, 0x03, 0x88, 0xae,
++      0xb8, 0x29, 0xa1, 0x2f, 0x4b, 0xee, 0x92, 0xbd,
++      0x87, 0xb3, 0xce, 0x34, 0x21, 0x57, 0x46, 0x04,
++      0x49, 0x0c, 0x80, 0xf2, 0x01, 0x13, 0xa1, 0x55,
++      0xb3, 0xff, 0x44, 0x30, 0x3c, 0x1c, 0xd0, 0xef,
++      0xbc, 0x18, 0x74, 0x26, 0xad, 0x41, 0x5b, 0x5b,
++      0x3e, 0x9a, 0x7a, 0x46, 0x4f, 0x16, 0xd6, 0x74,
++      0x5a, 0xb7, 0x3a, 0x28, 0x31, 0xd8, 0xae, 0x26,
++      0xac, 0x50, 0x53, 0x86, 0xf2, 0x56, 0xd7, 0x3f,
++      0x29, 0xbc, 0x45, 0x68, 0x8e, 0xcb, 0x98, 0x64,
++      0xdd, 0xc9, 0xba, 0xb8, 0x4b, 0x7b, 0x82, 0xdd,
++      0x14, 0xa7, 0xcb, 0x71, 0x72, 0x00, 0x5c, 0xad,
++      0x7b, 0x6a, 0x89, 0xa4, 0x3d, 0xbf, 0xb5, 0x4b,
++      0x3e, 0x7c, 0x5a, 0xcf, 0xb8, 0xa1, 0xc5, 0x6e,
++      0xc8, 0xb6, 0x31, 0x57, 0x7b, 0xdf, 0xa5, 0x7e,
++      0xb1, 0xd6, 0x42, 0x2a, 0x31, 0x36, 0xd1, 0xd0,
++      0x3f, 0x7a, 0xe5, 0x94, 0xd6, 0x36, 0xa0, 0x6f,
++      0xb7, 0x40, 0x7d, 0x37, 0xc6, 0x55, 0x7c, 0x50,
++      0x40, 0x6d, 0x29, 0x89, 0xe3, 0x5a, 0xae, 0x97,
++      0xe7, 0x44, 0x49, 0x6e, 0xbd, 0x81, 0x3d, 0x03,
++      0x93, 0x06, 0x12, 0x06, 0xe2, 0x41, 0x12, 0x4a,
++      0xf1, 0x6a, 0xa4, 0x58, 0xa2, 0xfb, 0xd2, 0x15,
++      0xba, 0xc9, 0x79, 0xc9, 0xce, 0x5e, 0x13, 0xbb,
++      0xf1, 0x09, 0x04, 0xcc, 0xfd, 0xe8, 0x51, 0x34,
++      0x6a, 0xe8, 0x61, 0x88, 0xda, 0xed, 0x01, 0x47,
++      0x84, 0xf5, 0x73, 0x25, 0xf9, 0x1c, 0x42, 0x86,
++      0x07, 0xf3, 0x5b, 0x1a, 0x01, 0xb3, 0xeb, 0x24,
++      0x32, 0x8d, 0xf6, 0xed, 0x7c, 0x4b, 0xeb, 0x3c,
++      0x36, 0x42, 0x28, 0xdf, 0xdf, 0xb6, 0xbe, 0xd9,
++      0x8c, 0x52, 0xd3, 0x2b, 0x08, 0x90, 0x8c, 0xe7,
++      0x98, 0x31, 0xe2, 0x32, 0x8e, 0xfc, 0x11, 0x48,
++      0x00, 0xa8, 0x6a, 0x42, 0x4a, 0x02, 0xc6, 0x4b,
++      0x09, 0xf1, 0xe3, 0x49, 0xf3, 0x45, 0x1f, 0x0e,
++      0xbc, 0x56, 0xe2, 0xe4, 0xdf, 0xfb, 0xeb, 0x61,
++      0xfa, 0x24, 0xc1, 0x63, 0x75, 0xbb, 0x47, 0x75,
++      0xaf, 0xe1, 0x53, 0x16, 0x96, 0x21, 0x85, 0x26,
++      0x11, 0xb3, 0x76, 0xe3, 0x23, 0xa1, 0x6b, 0x74,
++      0x37, 0xd0, 0xde, 0x06, 0x90, 0x71, 0x5d, 0x43,
++      0x88, 0x9b, 0x00, 0x54, 0xa6, 0x75, 0x2f, 0xa1,
++      0xc2, 0x0b, 0x73, 0x20, 0x1d, 0xb6, 0x21, 0x79,
++      0x57, 0x3f, 0xfa, 0x09, 0xbe, 0x8a, 0x33, 0xc3,
++      0x52, 0xf0, 0x1d, 0x82, 0x31, 0xd1, 0x55, 0xb5,
++      0x6c, 0x99, 0x25, 0xcf, 0x5c, 0x32, 0xce, 0xe9,
++      0x0d, 0xfa, 0x69, 0x2c, 0xd5, 0x0d, 0xc5, 0x6d,
++      0x86, 0xd0, 0x0c, 0x3b, 0x06, 0x50, 0x79, 0xe8,
++      0xc3, 0xae, 0x04, 0xe6, 0xcd, 0x51, 0xe4, 0x26,
++      0x9b, 0x4f, 0x7e, 0xa6, 0x0f, 0xab, 0xd8, 0xe5,
++      0xde, 0xa9, 0x00, 0x95, 0xbe, 0xa3, 0x9d, 0x5d,
++      0xb2, 0x09, 0x70, 0x18, 0x1c, 0xf0, 0xac, 0x29,
++      0x23, 0x02, 0x29, 0x28, 0xd2, 0x74, 0x35, 0x57,
++      0x62, 0x0f, 0x24, 0xea, 0x5e, 0x33, 0xc2, 0x92,
++      0xf3, 0x78, 0x4d, 0x30, 0x1e, 0xa1, 0x99, 0xa9,
++      0x82, 0xb0, 0x42, 0x31, 0x8d, 0xad, 0x8a, 0xbc,
++      0xfc, 0xd4, 0x57, 0x47, 0x3e, 0xb4, 0x50, 0xdd,
++      0x6e, 0x2c, 0x80, 0x4d, 0x22, 0xf1, 0xfb, 0x57,
++      0xc4, 0xdd, 0x17, 0xe1, 0x8a, 0x36, 0x4a, 0xb3,
++      0x37, 0xca, 0xc9, 0x4e, 0xab, 0xd5, 0x69, 0xc4,
++      0xf4, 0xbc, 0x0b, 0x3b, 0x44, 0x4b, 0x29, 0x9c,
++      0xee, 0xd4, 0x35, 0x22, 0x21, 0xb0, 0x1f, 0x27,
++      0x64, 0xa8, 0x51, 0x1b, 0xf0, 0x9f, 0x19, 0x5c,
++      0xfb, 0x5a, 0x64, 0x74, 0x70, 0x45, 0x09, 0xf5,
++      0x64, 0xfe, 0x1a, 0x2d, 0xc9, 0x14, 0x04, 0x14,
++      0xcf, 0xd5, 0x7d, 0x60, 0xaf, 0x94, 0x39, 0x94,
++      0xe2, 0x7d, 0x79, 0x82, 0xd0, 0x65, 0x3b, 0x6b,
++      0x9c, 0x19, 0x84, 0xb4, 0x6d, 0xb3, 0x0c, 0x99,
++      0xc0, 0x56, 0xa8, 0xbd, 0x73, 0xce, 0x05, 0x84,
++      0x3e, 0x30, 0xaa, 0xc4, 0x9b, 0x1b, 0x04, 0x2a,
++      0x9f, 0xd7, 0x43, 0x2b, 0x23, 0xdf, 0xbf, 0xaa,
++      0xd5, 0xc2, 0x43, 0x2d, 0x70, 0xab, 0xdc, 0x75,
++      0xad, 0xac, 0xf7, 0xc0, 0xbe, 0x67, 0xb2, 0x74,
++      0xed, 0x67, 0x10, 0x4a, 0x92, 0x60, 0xc1, 0x40,
++      0x50, 0x19, 0x8a, 0x8a, 0x8c, 0x09, 0x0e, 0x72,
++      0xe1, 0x73, 0x5e, 0xe8, 0x41, 0x85, 0x63, 0x9f,
++      0x3f, 0xd7, 0x7d, 0xc4, 0xfb, 0x22, 0x5d, 0x92,
++      0x6c, 0xb3, 0x1e, 0xe2, 0x50, 0x2f, 0x82, 0xa8,
++      0x28, 0xc0, 0xb5, 0xd7, 0x5f, 0x68, 0x0d, 0x2c,
++      0x2d, 0xaf, 0x7e, 0xfa, 0x2e, 0x08, 0x0f, 0x1f,
++      0x70, 0x9f, 0xe9, 0x19, 0x72, 0x55, 0xf8, 0xfb,
++      0x51, 0xd2, 0x33, 0x5d, 0xa0, 0xd3, 0x2b, 0x0a,
++      0x6c, 0xbc, 0x4e, 0xcf, 0x36, 0x4d, 0xdc, 0x3b,
++      0xe9, 0x3e, 0x81, 0x7c, 0x61, 0xdb, 0x20, 0x2d,
++      0x3a, 0xc3, 0xb3, 0x0c, 0x1e, 0x00, 0xb9, 0x7c,
++      0xf5, 0xca, 0x10, 0x5f, 0x3a, 0x71, 0xb3, 0xe4,
++      0x20, 0xdb, 0x0c, 0x2a, 0x98, 0x63, 0x45, 0x00,
++      0x58, 0xf6, 0x68, 0xe4, 0x0b, 0xda, 0x13, 0x3b,
++      0x60, 0x5c, 0x76, 0xdb, 0xb9, 0x97, 0x71, 0xe4,
++      0xd9, 0xb7, 0xdb, 0xbd, 0x68, 0xc7, 0x84, 0x84,
++      0xaa, 0x7c, 0x68, 0x62, 0x5e, 0x16, 0xfc, 0xba,
++      0x72, 0xaa, 0x9a, 0xa9, 0xeb, 0x7c, 0x75, 0x47,
++      0x97, 0x7e, 0xad, 0xe2, 0xd9, 0x91, 0xe8, 0xe4,
++      0xa5, 0x31, 0xd7, 0x01, 0x8e, 0xa2, 0x11, 0x88,
++      0x95, 0xb9, 0xf2, 0x9b, 0xd3, 0x7f, 0x1b, 0x81,
++      0x22, 0xf7, 0x98, 0x60, 0x0a, 0x64, 0xa6, 0xc1,
++      0xf6, 0x49, 0xc7, 0xe3, 0x07, 0x4d, 0x94, 0x7a,
++      0xcf, 0x6e, 0x68, 0x0c, 0x1b, 0x3f, 0x6e, 0x2e,
++      0xee, 0x92, 0xfa, 0x52, 0xb3, 0x59, 0xf8, 0xf1,
++      0x8f, 0x6a, 0x66, 0xa3, 0x82, 0x76, 0x4a, 0x07,
++      0x1a, 0xc7, 0xdd, 0xf5, 0xda, 0x9c, 0x3c, 0x24,
++      0xbf, 0xfd, 0x42, 0xa1, 0x10, 0x64, 0x6a, 0x0f,
++      0x89, 0xee, 0x36, 0xa5, 0xce, 0x99, 0x48, 0x6a,
++      0xf0, 0x9f, 0x9e, 0x69, 0xa4, 0x40, 0x20, 0xe9,
++      0x16, 0x15, 0xf7, 0xdb, 0x75, 0x02, 0xcb, 0xe9,
++      0x73, 0x8b, 0x3b, 0x49, 0x2f, 0xf0, 0xaf, 0x51,
++      0x06, 0x5c, 0xdf, 0x27, 0x27, 0x49, 0x6a, 0xd1,
++      0xcc, 0xc7, 0xb5, 0x63, 0xb5, 0xfc, 0xb8, 0x5c,
++      0x87, 0x7f, 0x84, 0xb4, 0xcc, 0x14, 0xa9, 0x53,
++      0xda, 0xa4, 0x56, 0xf8, 0xb6, 0x1b, 0xcc, 0x40,
++      0x27, 0x52, 0x06, 0x5a, 0x13, 0x81, 0xd7, 0x3a,
++      0xd4, 0x3b, 0xfb, 0x49, 0x65, 0x31, 0x33, 0xb2,
++      0xfa, 0xcd, 0xad, 0x58, 0x4e, 0x2b, 0xae, 0xd2,
++      0x20, 0xfb, 0x1a, 0x48, 0xb4, 0x3f, 0x9a, 0xd8,
++      0x7a, 0x35, 0x4a, 0xc8, 0xee, 0x88, 0x5e, 0x07,
++      0x66, 0x54, 0xb9, 0xec, 0x9f, 0xa3, 0xe3, 0xb9,
++      0x37, 0xaa, 0x49, 0x76, 0x31, 0xda, 0x74, 0x2d,
++      0x3c, 0xa4, 0x65, 0x10, 0x32, 0x38, 0xf0, 0xde,
++      0xd3, 0x99, 0x17, 0xaa, 0x71, 0xaa, 0x8f, 0x0f,
++      0x8c, 0xaf, 0xa2, 0xf8, 0x5d, 0x64, 0xba, 0x1d,
++      0xa3, 0xef, 0x96, 0x73, 0xe8, 0xa1, 0x02, 0x8d,
++      0x0c, 0x6d, 0xb8, 0x06, 0x90, 0xb8, 0x08, 0x56,
++      0x2c, 0xa7, 0x06, 0xc9, 0xc2, 0x38, 0xdb, 0x7c,
++      0x63, 0xb1, 0x57, 0x8e, 0xea, 0x7c, 0x79, 0xf3,
++      0x49, 0x1d, 0xfe, 0x9f, 0xf3, 0x6e, 0xb1, 0x1d,
++      0xba, 0x19, 0x80, 0x1a, 0x0a, 0xd3, 0xb0, 0x26,
++      0x21, 0x40, 0xb1, 0x7c, 0xf9, 0x4d, 0x8d, 0x10,
++      0xc1, 0x7e, 0xf4, 0xf6, 0x3c, 0xa8, 0xfd, 0x7c,
++      0xa3, 0x92, 0xb2, 0x0f, 0xaa, 0xcc, 0xa6, 0x11,
++      0xfe, 0x04, 0xe3, 0xd1, 0x7a, 0x32, 0x89, 0xdf,
++      0x0d, 0xc4, 0x8f, 0x79, 0x6b, 0xca, 0x16, 0x7c,
++      0x6e, 0xf9, 0xad, 0x0f, 0xf6, 0xfe, 0x27, 0xdb,
++      0xc4, 0x13, 0x70, 0xf1, 0x62, 0x1a, 0x4f, 0x79,
++      0x40, 0xc9, 0x9b, 0x8b, 0x21, 0xea, 0x84, 0xfa,
++      0xf5, 0xf1, 0x89, 0xce, 0xb7, 0x55, 0x0a, 0x80,
++      0x39, 0x2f, 0x55, 0x36, 0x16, 0x9c, 0x7b, 0x08,
++      0xbd, 0x87, 0x0d, 0xa5, 0x32, 0xf1, 0x52, 0x7c,
++      0xe8, 0x55, 0x60, 0x5b, 0xd7, 0x69, 0xe4, 0xfc,
++      0xfa, 0x12, 0x85, 0x96, 0xea, 0x50, 0x28, 0xab,
++      0x8a, 0xf7, 0xbb, 0x0e, 0x53, 0x74, 0xca, 0xa6,
++      0x27, 0x09, 0xc2, 0xb5, 0xde, 0x18, 0x14, 0xd9,
++      0xea, 0xe5, 0x29, 0x1c, 0x40, 0x56, 0xcf, 0xd7,
++      0xae, 0x05, 0x3f, 0x65, 0xaf, 0x05, 0x73, 0xe2,
++      0x35, 0x96, 0x27, 0x07, 0x14, 0xc0, 0xad, 0x33,
++      0xf1, 0xdc, 0x44, 0x7a, 0x89, 0x17, 0x77, 0xd2,
++      0x9c, 0x58, 0x60, 0xf0, 0x3f, 0x7b, 0x2d, 0x2e,
++      0x57, 0x95, 0x54, 0x87, 0xed, 0xf2, 0xc7, 0x4c,
++      0xf0, 0xae, 0x56, 0x29, 0x19, 0x7d, 0x66, 0x4b,
++      0x9b, 0x83, 0x84, 0x42, 0x3b, 0x01, 0x25, 0x66,
++      0x8e, 0x02, 0xde, 0xb9, 0x83, 0x54, 0x19, 0xf6,
++      0x9f, 0x79, 0x0d, 0x67, 0xc5, 0x1d, 0x7a, 0x44,
++      0x02, 0x98, 0xa7, 0x16, 0x1c, 0x29, 0x0d, 0x74,
++      0xff, 0x85, 0x40, 0x06, 0xef, 0x2c, 0xa9, 0xc6,
++      0xf5, 0x53, 0x07, 0x06, 0xae, 0xe4, 0xfa, 0x5f,
++      0xd8, 0x39, 0x4d, 0xf1, 0x9b, 0x6b, 0xd9, 0x24,
++      0x84, 0xfe, 0x03, 0x4c, 0xb2, 0x3f, 0xdf, 0xa1,
++      0x05, 0x9e, 0x50, 0x14, 0x5a, 0xd9, 0x1a, 0xa2,
++      0xa7, 0xfa, 0xfa, 0x17, 0xf7, 0x78, 0xd6, 0xb5,
++      0x92, 0x61, 0x91, 0xac, 0x36, 0xfa, 0x56, 0x0d,
++      0x38, 0x32, 0x18, 0x85, 0x08, 0x58, 0x37, 0xf0,
++      0x4b, 0xdb, 0x59, 0xe7, 0xa4, 0x34, 0xc0, 0x1b,
++      0x01, 0xaf, 0x2d, 0xde, 0xa1, 0xaa, 0x5d, 0xd3,
++      0xec, 0xe1, 0xd4, 0xf7, 0xe6, 0x54, 0x68, 0xf0,
++      0x51, 0x97, 0xa7, 0x89, 0xea, 0x24, 0xad, 0xd3,
++      0x6e, 0x47, 0x93, 0x8b, 0x4b, 0xb4, 0xf7, 0x1c,
++      0x42, 0x06, 0x67, 0xe8, 0x99, 0xf6, 0xf5, 0x7b,
++      0x85, 0xb5, 0x65, 0xb5, 0xb5, 0xd2, 0x37, 0xf5,
++      0xf3, 0x02, 0xa6, 0x4d, 0x11, 0xa7, 0xdc, 0x51,
++      0x09, 0x7f, 0xa0, 0xd8, 0x88, 0x1c, 0x13, 0x71,
++      0xae, 0x9c, 0xb7, 0x7b, 0x34, 0xd6, 0x4e, 0x68,
++      0x26, 0x83, 0x51, 0xaf, 0x1d, 0xee, 0x8b, 0xbb,
++      0x69, 0x43, 0x2b, 0x9e, 0x8a, 0xbc, 0x02, 0x0e,
++      0xa0, 0x1b, 0xe0, 0xa8, 0x5f, 0x6f, 0xaf, 0x1b,
++      0x8f, 0xe7, 0x64, 0x71, 0x74, 0x11, 0x7e, 0xa8,
++      0xd8, 0xf9, 0x97, 0x06, 0xc3, 0xb6, 0xfb, 0xfb,
++      0xb7, 0x3d, 0x35, 0x9d, 0x3b, 0x52, 0xed, 0x54,
++      0xca, 0xf4, 0x81, 0x01, 0x2d, 0x1b, 0xc3, 0xa7,
++      0x00, 0x3d, 0x1a, 0x39, 0x54, 0xe1, 0xf6, 0xff,
++      0xed, 0x6f, 0x0b, 0x5a, 0x68, 0xda, 0x58, 0xdd,
++      0xa9, 0xcf, 0x5c, 0x4a, 0xe5, 0x09, 0x4e, 0xde,
++      0x9d, 0xbc, 0x3e, 0xee, 0x5a, 0x00, 0x3b, 0x2c,
++      0x87, 0x10, 0x65, 0x60, 0xdd, 0xd7, 0x56, 0xd1,
++      0x4c, 0x64, 0x45, 0xe4, 0x21, 0xec, 0x78, 0xf8,
++      0x25, 0x7a, 0x3e, 0x16, 0x5d, 0x09, 0x53, 0x14,
++      0xbe, 0x4f, 0xae, 0x87, 0xd8, 0xd1, 0xaa, 0x3c,
++      0xf6, 0x3e, 0xa4, 0x70, 0x8c, 0x5e, 0x70, 0xa4,
++      0xb3, 0x6b, 0x66, 0x73, 0xd3, 0xbf, 0x31, 0x06,
++      0x19, 0x62, 0x93, 0x15, 0xf2, 0x86, 0xe4, 0x52,
++      0x7e, 0x53, 0x4c, 0x12, 0x38, 0xcc, 0x34, 0x7d,
++      0x57, 0xf6, 0x42, 0x93, 0x8a, 0xc4, 0xee, 0x5c,
++      0x8a, 0xe1, 0x52, 0x8f, 0x56, 0x64, 0xf6, 0xa6,
++      0xd1, 0x91, 0x57, 0x70, 0xcd, 0x11, 0x76, 0xf5,
++      0x59, 0x60, 0x60, 0x3c, 0xc1, 0xc3, 0x0b, 0x7f,
++      0x58, 0x1a, 0x50, 0x91, 0xf1, 0x68, 0x8f, 0x6e,
++      0x74, 0x74, 0xa8, 0x51, 0x0b, 0xf7, 0x7a, 0x98,
++      0x37, 0xf2, 0x0a, 0x0e, 0xa4, 0x97, 0x04, 0xb8,
++      0x9b, 0xfd, 0xa0, 0xea, 0xf7, 0x0d, 0xe1, 0xdb,
++      0x03, 0xf0, 0x31, 0x29, 0xf8, 0xdd, 0x6b, 0x8b,
++      0x5d, 0xd8, 0x59, 0xa9, 0x29, 0xcf, 0x9a, 0x79,
++      0x89, 0x19, 0x63, 0x46, 0x09, 0x79, 0x6a, 0x11,
++      0xda, 0x63, 0x68, 0x48, 0x77, 0x23, 0xfb, 0x7d,
++      0x3a, 0x43, 0xcb, 0x02, 0x3b, 0x7a, 0x6d, 0x10,
++      0x2a, 0x9e, 0xac, 0xf1, 0xd4, 0x19, 0xf8, 0x23,
++      0x64, 0x1d, 0x2c, 0x5f, 0xf2, 0xb0, 0x5c, 0x23,
++      0x27, 0xf7, 0x27, 0x30, 0x16, 0x37, 0xb1, 0x90,
++      0xab, 0x38, 0xfb, 0x55, 0xcd, 0x78, 0x58, 0xd4,
++      0x7d, 0x43, 0xf6, 0x45, 0x5e, 0x55, 0x8d, 0xb1,
++      0x02, 0x65, 0x58, 0xb4, 0x13, 0x4b, 0x36, 0xf7,
++      0xcc, 0xfe, 0x3d, 0x0b, 0x82, 0xe2, 0x12, 0x11,
++      0xbb, 0xe6, 0xb8, 0x3a, 0x48, 0x71, 0xc7, 0x50,
++      0x06, 0x16, 0x3a, 0xe6, 0x7c, 0x05, 0xc7, 0xc8,
++      0x4d, 0x2f, 0x08, 0x6a, 0x17, 0x9a, 0x95, 0x97,
++      0x50, 0x68, 0xdc, 0x28, 0x18, 0xc4, 0x61, 0x38,
++      0xb9, 0xe0, 0x3e, 0x78, 0xdb, 0x29, 0xe0, 0x9f,
++      0x52, 0xdd, 0xf8, 0x4f, 0x91, 0xc1, 0xd0, 0x33,
++      0xa1, 0x7a, 0x8e, 0x30, 0x13, 0x82, 0x07, 0x9f,
++      0xd3, 0x31, 0x0f, 0x23, 0xbe, 0x32, 0x5a, 0x75,
++      0xcf, 0x96, 0xb2, 0xec, 0xb5, 0x32, 0xac, 0x21,
++      0xd1, 0x82, 0x33, 0xd3, 0x15, 0x74, 0xbd, 0x90,
++      0xf1, 0x2c, 0xe6, 0x5f, 0x8d, 0xe3, 0x02, 0xe8,
++      0xe9, 0xc4, 0xca, 0x96, 0xeb, 0x0e, 0xbc, 0x91,
++      0xf4, 0xb9, 0xea, 0xd9, 0x1b, 0x75, 0xbd, 0xe1,
++      0xac, 0x2a, 0x05, 0x37, 0x52, 0x9b, 0x1b, 0x3f,
++      0x5a, 0xdc, 0x21, 0xc3, 0x98, 0xbb, 0xaf, 0xa3,
++      0xf2, 0x00, 0xbf, 0x0d, 0x30, 0x89, 0x05, 0xcc,
++      0xa5, 0x76, 0xf5, 0x06, 0xf0, 0xc6, 0x54, 0x8a,
++      0x5d, 0xd4, 0x1e, 0xc1, 0xf2, 0xce, 0xb0, 0x62,
++      0xc8, 0xfc, 0x59, 0x42, 0x9a, 0x90, 0x60, 0x55,
++      0xfe, 0x88, 0xa5, 0x8b, 0xb8, 0x33, 0x0c, 0x23,
++      0x24, 0x0d, 0x15, 0x70, 0x37, 0x1e, 0x3d, 0xf6,
++      0xd2, 0xea, 0x92, 0x10, 0xb2, 0xc4, 0x51, 0xac,
++      0xf2, 0xac, 0xf3, 0x6b, 0x6c, 0xaa, 0xcf, 0x12,
++      0xc5, 0x6c, 0x90, 0x50, 0xb5, 0x0c, 0xfc, 0x1a,
++      0x15, 0x52, 0xe9, 0x26, 0xc6, 0x52, 0xa4, 0xe7,
++      0x81, 0x69, 0xe1, 0xe7, 0x9e, 0x30, 0x01, 0xec,
++      0x84, 0x89, 0xb2, 0x0d, 0x66, 0xdd, 0xce, 0x28,
++      0x5c, 0xec, 0x98, 0x46, 0x68, 0x21, 0x9f, 0x88,
++      0x3f, 0x1f, 0x42, 0x77, 0xce, 0xd0, 0x61, 0xd4,
++      0x20, 0xa7, 0xff, 0x53, 0xad, 0x37, 0xd0, 0x17,
++      0x35, 0xc9, 0xfc, 0xba, 0x0a, 0x78, 0x3f, 0xf2,
++      0xcc, 0x86, 0x89, 0xe8, 0x4b, 0x3c, 0x48, 0x33,
++      0x09, 0x7f, 0xc6, 0xc0, 0xdd, 0xb8, 0xfd, 0x7a,
++      0x66, 0x66, 0x65, 0xeb, 0x47, 0xa7, 0x04, 0x28,
++      0xa3, 0x19, 0x8e, 0xa9, 0xb1, 0x13, 0x67, 0x62,
++      0x70, 0xcf, 0xd6
++};
++static const u8 enc_assoc012[] __initconst = {
++      0xb1, 0x69, 0x83, 0x87, 0x30, 0xaa, 0x5d, 0xb8,
++      0x77, 0xe8, 0x21, 0xff, 0x06, 0x59, 0x35, 0xce,
++      0x75, 0xfe, 0x38, 0xef, 0xb8, 0x91, 0x43, 0x8c,
++      0xcf, 0x70, 0xdd, 0x0a, 0x68, 0xbf, 0xd4, 0xbc,
++      0x16, 0x76, 0x99, 0x36, 0x1e, 0x58, 0x79, 0x5e,
++      0xd4, 0x29, 0xf7, 0x33, 0x93, 0x48, 0xdb, 0x5f,
++      0x01, 0xae, 0x9c, 0xb6, 0xe4, 0x88, 0x6d, 0x2b,
++      0x76, 0x75, 0xe0, 0xf3, 0x74, 0xe2, 0xc9
++};
++static const u8 enc_nonce012[] __initconst = {
++      0x05, 0xa3, 0x93, 0xed, 0x30, 0xc5, 0xa2, 0x06
++};
++static const u8 enc_key012[] __initconst = {
++      0xb3, 0x35, 0x50, 0x03, 0x54, 0x2e, 0x40, 0x5e,
++      0x8f, 0x59, 0x8e, 0xc5, 0x90, 0xd5, 0x27, 0x2d,
++      0xba, 0x29, 0x2e, 0xcb, 0x1b, 0x70, 0x44, 0x1e,
++      0x65, 0x91, 0x6e, 0x2a, 0x79, 0x22, 0xda, 0x64
++};
++
++/* wycheproof - misc */
++static const u8 enc_input053[] __initconst = {
++      0x25, 0x6d, 0x40, 0x88, 0x80, 0x94, 0x17, 0x83,
++      0x55, 0xd3, 0x04, 0x84, 0x64, 0x43, 0xfe, 0xe8,
++      0xdf, 0x99, 0x47, 0x03, 0x03, 0xfb, 0x3b, 0x7b,
++      0x80, 0xe0, 0x30, 0xbe, 0xeb, 0xd3, 0x29, 0xbe
++};
++static const u8 enc_output053[] __initconst = {
++      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++      0xe6, 0xd3, 0xd7, 0x32, 0x4a, 0x1c, 0xbb, 0xa7,
++      0x77, 0xbb, 0xb0, 0xec, 0xdd, 0xa3, 0x78, 0x07
++};
++static const u8 enc_assoc053[] __initconst = {
++      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
++};
++static const u8 enc_nonce053[] __initconst = {
++      0x00, 0x00, 0x00, 0x00, 0x01, 0xee, 0x32, 0x00
++};
++static const u8 enc_key053[] __initconst = {
++      0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
++      0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
++      0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
++      0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f
++};
++
++/* wycheproof - misc */
++static const u8 enc_input054[] __initconst = {
++      0x25, 0x6d, 0x40, 0x88, 0x80, 0x94, 0x17, 0x83,
++      0x55, 0xd3, 0x04, 0x84, 0x64, 0x43, 0xfe, 0xe8,
++      0xdf, 0x99, 0x47, 0x03, 0x03, 0xfb, 0x3b, 0x7b,
++      0x80, 0xe0, 0x30, 0xbe, 0xeb, 0xd3, 0x29, 0xbe,
++      0xe3, 0xbc, 0xdb, 0x5b, 0x1e, 0xde, 0xfc, 0xfe,
++      0x8b, 0xcd, 0xa1, 0xb6, 0xa1, 0x5c, 0x8c, 0x2b,
++      0x08, 0x69, 0xff, 0xd2, 0xec, 0x5e, 0x26, 0xe5,
++      0x53, 0xb7, 0xb2, 0x27, 0xfe, 0x87, 0xfd, 0xbd
++};
++static const u8 enc_output054[] __initconst = {
++      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++      0x06, 0x2d, 0xe6, 0x79, 0x5f, 0x27, 0x4f, 0xd2,
++      0xa3, 0x05, 0xd7, 0x69, 0x80, 0xbc, 0x9c, 0xce
++};
++static const u8 enc_assoc054[] __initconst = {
++      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
++};
++static const u8 enc_nonce054[] __initconst = {
++      0x00, 0x00, 0x00, 0x00, 0x01, 0xee, 0x32, 0x00
++};
++static const u8 enc_key054[] __initconst = {
++      0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
++      0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
++      0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
++      0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f
++};
++
++/* wycheproof - misc */
++static const u8 enc_input055[] __initconst = {
++      0x25, 0x6d, 0x40, 0x88, 0x80, 0x94, 0x17, 0x83,
++      0x55, 0xd3, 0x04, 0x84, 0x64, 0x43, 0xfe, 0xe8,
++      0xdf, 0x99, 0x47, 0x03, 0x03, 0xfb, 0x3b, 0x7b,
++      0x80, 0xe0, 0x30, 0xbe, 0xeb, 0xd3, 0x29, 0xbe,
++      0xe3, 0xbc, 0xdb, 0x5b, 0x1e, 0xde, 0xfc, 0xfe,
++      0x8b, 0xcd, 0xa1, 0xb6, 0xa1, 0x5c, 0x8c, 0x2b,
++      0x08, 0x69, 0xff, 0xd2, 0xec, 0x5e, 0x26, 0xe5,
++      0x53, 0xb7, 0xb2, 0x27, 0xfe, 0x87, 0xfd, 0xbd,
++      0x7a, 0xda, 0x44, 0x42, 0x42, 0x69, 0xbf, 0xfa,
++      0x55, 0x27, 0xf2, 0x70, 0xac, 0xf6, 0x85, 0x02,
++      0xb7, 0x4c, 0x5a, 0xe2, 0xe6, 0x0c, 0x05, 0x80,
++      0x98, 0x1a, 0x49, 0x38, 0x45, 0x93, 0x92, 0xc4,
++      0x9b, 0xb2, 0xf2, 0x84, 0xb6, 0x46, 0xef, 0xc7,
++      0xf3, 0xf0, 0xb1, 0x36, 0x1d, 0xc3, 0x48, 0xed,
++      0x77, 0xd3, 0x0b, 0xc5, 0x76, 0x92, 0xed, 0x38,
++      0xfb, 0xac, 0x01, 0x88, 0x38, 0x04, 0x88, 0xc7
++};
++static const u8 enc_output055[] __initconst = {
++      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++      0xd8, 0xb4, 0x79, 0x02, 0xba, 0xae, 0xaf, 0xb3,
++      0x42, 0x03, 0x05, 0x15, 0x29, 0xaf, 0x28, 0x2e
++};
++static const u8 enc_assoc055[] __initconst = {
++      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
++};
++static const u8 enc_nonce055[] __initconst = {
++      0x00, 0x00, 0x00, 0x00, 0x01, 0xee, 0x32, 0x00
++};
++static const u8 enc_key055[] __initconst = {
++      0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
++      0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
++      0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
++      0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f
++};
++
++/* wycheproof - misc */
++static const u8 enc_input056[] __initconst = {
++      0xda, 0x92, 0xbf, 0x77, 0x7f, 0x6b, 0xe8, 0x7c,
++      0xaa, 0x2c, 0xfb, 0x7b, 0x9b, 0xbc, 0x01, 0x17,
++      0x20, 0x66, 0xb8, 0xfc, 0xfc, 0x04, 0xc4, 0x84,
++      0x7f, 0x1f, 0xcf, 0x41, 0x14, 0x2c, 0xd6, 0x41
++};
++static const u8 enc_output056[] __initconst = {
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xb3, 0x89, 0x1c, 0x84, 0x9c, 0xb5, 0x2c, 0x27,
++      0x74, 0x7e, 0xdf, 0xcf, 0x31, 0x21, 0x3b, 0xb6
++};
++static const u8 enc_assoc056[] __initconst = {
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
++};
++static const u8 enc_nonce056[] __initconst = {
++      0x00, 0x00, 0x00, 0x00, 0x01, 0xee, 0x32, 0x00
++};
++static const u8 enc_key056[] __initconst = {
++      0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
++      0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
++      0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
++      0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f
++};
++
++/* wycheproof - misc */
++static const u8 enc_input057[] __initconst = {
++      0xda, 0x92, 0xbf, 0x77, 0x7f, 0x6b, 0xe8, 0x7c,
++      0xaa, 0x2c, 0xfb, 0x7b, 0x9b, 0xbc, 0x01, 0x17,
++      0x20, 0x66, 0xb8, 0xfc, 0xfc, 0x04, 0xc4, 0x84,
++      0x7f, 0x1f, 0xcf, 0x41, 0x14, 0x2c, 0xd6, 0x41,
++      0x1c, 0x43, 0x24, 0xa4, 0xe1, 0x21, 0x03, 0x01,
++      0x74, 0x32, 0x5e, 0x49, 0x5e, 0xa3, 0x73, 0xd4,
++      0xf7, 0x96, 0x00, 0x2d, 0x13, 0xa1, 0xd9, 0x1a,
++      0xac, 0x48, 0x4d, 0xd8, 0x01, 0x78, 0x02, 0x42
++};
++static const u8 enc_output057[] __initconst = {
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xf0, 0xc1, 0x2d, 0x26, 0xef, 0x03, 0x02, 0x9b,
++      0x62, 0xc0, 0x08, 0xda, 0x27, 0xc5, 0xdc, 0x68
++};
++static const u8 enc_assoc057[] __initconst = {
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
++};
++static const u8 enc_nonce057[] __initconst = {
++      0x00, 0x00, 0x00, 0x00, 0x01, 0xee, 0x32, 0x00
++};
++static const u8 enc_key057[] __initconst = {
++      0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
++      0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
++      0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
++      0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f
++};
++
++/* wycheproof - misc */
++static const u8 enc_input058[] __initconst = {
++      0xda, 0x92, 0xbf, 0x77, 0x7f, 0x6b, 0xe8, 0x7c,
++      0xaa, 0x2c, 0xfb, 0x7b, 0x9b, 0xbc, 0x01, 0x17,
++      0x20, 0x66, 0xb8, 0xfc, 0xfc, 0x04, 0xc4, 0x84,
++      0x7f, 0x1f, 0xcf, 0x41, 0x14, 0x2c, 0xd6, 0x41,
++      0x1c, 0x43, 0x24, 0xa4, 0xe1, 0x21, 0x03, 0x01,
++      0x74, 0x32, 0x5e, 0x49, 0x5e, 0xa3, 0x73, 0xd4,
++      0xf7, 0x96, 0x00, 0x2d, 0x13, 0xa1, 0xd9, 0x1a,
++      0xac, 0x48, 0x4d, 0xd8, 0x01, 0x78, 0x02, 0x42,
++      0x85, 0x25, 0xbb, 0xbd, 0xbd, 0x96, 0x40, 0x05,
++      0xaa, 0xd8, 0x0d, 0x8f, 0x53, 0x09, 0x7a, 0xfd,
++      0x48, 0xb3, 0xa5, 0x1d, 0x19, 0xf3, 0xfa, 0x7f,
++      0x67, 0xe5, 0xb6, 0xc7, 0xba, 0x6c, 0x6d, 0x3b,
++      0x64, 0x4d, 0x0d, 0x7b, 0x49, 0xb9, 0x10, 0x38,
++      0x0c, 0x0f, 0x4e, 0xc9, 0xe2, 0x3c, 0xb7, 0x12,
++      0x88, 0x2c, 0xf4, 0x3a, 0x89, 0x6d, 0x12, 0xc7,
++      0x04, 0x53, 0xfe, 0x77, 0xc7, 0xfb, 0x77, 0x38
++};
++static const u8 enc_output058[] __initconst = {
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xee, 0x65, 0x78, 0x30, 0x01, 0xc2, 0x56, 0x91,
++      0xfa, 0x28, 0xd0, 0xf5, 0xf1, 0xc1, 0xd7, 0x62
++};
++static const u8 enc_assoc058[] __initconst = {
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
++};
++static const u8 enc_nonce058[] __initconst = {
++      0x00, 0x00, 0x00, 0x00, 0x01, 0xee, 0x32, 0x00
++};
++static const u8 enc_key058[] __initconst = {
++      0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
++      0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
++      0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
++      0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f
++};
++
++/* wycheproof - misc */
++static const u8 enc_input059[] __initconst = {
++      0x25, 0x6d, 0x40, 0x08, 0x80, 0x94, 0x17, 0x03,
++      0x55, 0xd3, 0x04, 0x04, 0x64, 0x43, 0xfe, 0x68,
++      0xdf, 0x99, 0x47, 0x83, 0x03, 0xfb, 0x3b, 0xfb,
++      0x80, 0xe0, 0x30, 0x3e, 0xeb, 0xd3, 0x29, 0x3e
++};
++static const u8 enc_output059[] __initconst = {
++      0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80,
++      0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80,
++      0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80,
++      0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80,
++      0x79, 0xba, 0x7a, 0x29, 0xf5, 0xa7, 0xbb, 0x75,
++      0x79, 0x7a, 0xf8, 0x7a, 0x61, 0x01, 0x29, 0xa4
++};
++static const u8 enc_assoc059[] __initconst = {
++      0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80,
++      0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80
++};
++static const u8 enc_nonce059[] __initconst = {
++      0x00, 0x00, 0x00, 0x00, 0x01, 0xee, 0x32, 0x00
++};
++static const u8 enc_key059[] __initconst = {
++      0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
++      0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
++      0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
++      0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f
++};
++
++/* wycheproof - misc */
++static const u8 enc_input060[] __initconst = {
++      0x25, 0x6d, 0x40, 0x08, 0x80, 0x94, 0x17, 0x03,
++      0x55, 0xd3, 0x04, 0x04, 0x64, 0x43, 0xfe, 0x68,
++      0xdf, 0x99, 0x47, 0x83, 0x03, 0xfb, 0x3b, 0xfb,
++      0x80, 0xe0, 0x30, 0x3e, 0xeb, 0xd3, 0x29, 0x3e,
++      0xe3, 0xbc, 0xdb, 0xdb, 0x1e, 0xde, 0xfc, 0x7e,
++      0x8b, 0xcd, 0xa1, 0x36, 0xa1, 0x5c, 0x8c, 0xab,
++      0x08, 0x69, 0xff, 0x52, 0xec, 0x5e, 0x26, 0x65,
++      0x53, 0xb7, 0xb2, 0xa7, 0xfe, 0x87, 0xfd, 0x3d
++};
++static const u8 enc_output060[] __initconst = {
++      0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80,
++      0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80,
++      0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80,
++      0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80,
++      0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80,
++      0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80,
++      0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80,
++      0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80,
++      0x36, 0xb1, 0x74, 0x38, 0x19, 0xe1, 0xb9, 0xba,
++      0x15, 0x51, 0xe8, 0xed, 0x92, 0x2a, 0x95, 0x9a
++};
++static const u8 enc_assoc060[] __initconst = {
++      0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80,
++      0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80
++};
++static const u8 enc_nonce060[] __initconst = {
++      0x00, 0x00, 0x00, 0x00, 0x01, 0xee, 0x32, 0x00
++};
++static const u8 enc_key060[] __initconst = {
++      0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
++      0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
++      0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
++      0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f
++};
++
++/* wycheproof - misc */
++static const u8 enc_input061[] __initconst = {
++      0x25, 0x6d, 0x40, 0x08, 0x80, 0x94, 0x17, 0x03,
++      0x55, 0xd3, 0x04, 0x04, 0x64, 0x43, 0xfe, 0x68,
++      0xdf, 0x99, 0x47, 0x83, 0x03, 0xfb, 0x3b, 0xfb,
++      0x80, 0xe0, 0x30, 0x3e, 0xeb, 0xd3, 0x29, 0x3e,
++      0xe3, 0xbc, 0xdb, 0xdb, 0x1e, 0xde, 0xfc, 0x7e,
++      0x8b, 0xcd, 0xa1, 0x36, 0xa1, 0x5c, 0x8c, 0xab,
++      0x08, 0x69, 0xff, 0x52, 0xec, 0x5e, 0x26, 0x65,
++      0x53, 0xb7, 0xb2, 0xa7, 0xfe, 0x87, 0xfd, 0x3d,
++      0x7a, 0xda, 0x44, 0xc2, 0x42, 0x69, 0xbf, 0x7a,
++      0x55, 0x27, 0xf2, 0xf0, 0xac, 0xf6, 0x85, 0x82,
++      0xb7, 0x4c, 0x5a, 0x62, 0xe6, 0x0c, 0x05, 0x00,
++      0x98, 0x1a, 0x49, 0xb8, 0x45, 0x93, 0x92, 0x44,
++      0x9b, 0xb2, 0xf2, 0x04, 0xb6, 0x46, 0xef, 0x47,
++      0xf3, 0xf0, 0xb1, 0xb6, 0x1d, 0xc3, 0x48, 0x6d,
++      0x77, 0xd3, 0x0b, 0x45, 0x76, 0x92, 0xed, 0xb8,
++      0xfb, 0xac, 0x01, 0x08, 0x38, 0x04, 0x88, 0x47
++};
++static const u8 enc_output061[] __initconst = {
++      0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80,
++      0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80,
++      0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80,
++      0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80,
++      0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80,
++      0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80,
++      0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80,
++      0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80,
++      0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80,
++      0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80,
++      0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80,
++      0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80,
++      0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80,
++      0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80,
++      0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80,
++      0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80,
++      0xfe, 0xac, 0x49, 0x55, 0x55, 0x4e, 0x80, 0x6f,
++      0x3a, 0x19, 0x02, 0xe2, 0x44, 0x32, 0xc0, 0x8a
++};
++static const u8 enc_assoc061[] __initconst = {
++      0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80,
++      0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80
++};
++static const u8 enc_nonce061[] __initconst = {
++      0x00, 0x00, 0x00, 0x00, 0x01, 0xee, 0x32, 0x00
++};
++static const u8 enc_key061[] __initconst = {
++      0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
++      0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
++      0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
++      0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f
++};
++
++/* wycheproof - misc */
++static const u8 enc_input062[] __initconst = {
++      0xda, 0x92, 0xbf, 0xf7, 0x7f, 0x6b, 0xe8, 0xfc,
++      0xaa, 0x2c, 0xfb, 0xfb, 0x9b, 0xbc, 0x01, 0x97,
++      0x20, 0x66, 0xb8, 0x7c, 0xfc, 0x04, 0xc4, 0x04,
++      0x7f, 0x1f, 0xcf, 0xc1, 0x14, 0x2c, 0xd6, 0xc1
++};
++static const u8 enc_output062[] __initconst = {
++      0xff, 0xff, 0xff, 0x7f, 0xff, 0xff, 0xff, 0x7f,
++      0xff, 0xff, 0xff, 0x7f, 0xff, 0xff, 0xff, 0x7f,
++      0xff, 0xff, 0xff, 0x7f, 0xff, 0xff, 0xff, 0x7f,
++      0xff, 0xff, 0xff, 0x7f, 0xff, 0xff, 0xff, 0x7f,
++      0x20, 0xa3, 0x79, 0x8d, 0xf1, 0x29, 0x2c, 0x59,
++      0x72, 0xbf, 0x97, 0x41, 0xae, 0xc3, 0x8a, 0x19
++};
++static const u8 enc_assoc062[] __initconst = {
++      0xff, 0xff, 0xff, 0x7f, 0xff, 0xff, 0xff, 0x7f,
++      0xff, 0xff, 0xff, 0x7f, 0xff, 0xff, 0xff, 0x7f
++};
++static const u8 enc_nonce062[] __initconst = {
++      0x00, 0x00, 0x00, 0x00, 0x01, 0xee, 0x32, 0x00
++};
++static const u8 enc_key062[] __initconst = {
++      0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
++      0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
++      0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
++      0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f
++};
++
++/* wycheproof - misc */
++static const u8 enc_input063[] __initconst = {
++      0xda, 0x92, 0xbf, 0xf7, 0x7f, 0x6b, 0xe8, 0xfc,
++      0xaa, 0x2c, 0xfb, 0xfb, 0x9b, 0xbc, 0x01, 0x97,
++      0x20, 0x66, 0xb8, 0x7c, 0xfc, 0x04, 0xc4, 0x04,
++      0x7f, 0x1f, 0xcf, 0xc1, 0x14, 0x2c, 0xd6, 0xc1,
++      0x1c, 0x43, 0x24, 0x24, 0xe1, 0x21, 0x03, 0x81,
++      0x74, 0x32, 0x5e, 0xc9, 0x5e, 0xa3, 0x73, 0x54,
++      0xf7, 0x96, 0x00, 0xad, 0x13, 0xa1, 0xd9, 0x9a,
++      0xac, 0x48, 0x4d, 0x58, 0x01, 0x78, 0x02, 0xc2
++};
++static const u8 enc_output063[] __initconst = {
++      0xff, 0xff, 0xff, 0x7f, 0xff, 0xff, 0xff, 0x7f,
++      0xff, 0xff, 0xff, 0x7f, 0xff, 0xff, 0xff, 0x7f,
++      0xff, 0xff, 0xff, 0x7f, 0xff, 0xff, 0xff, 0x7f,
++      0xff, 0xff, 0xff, 0x7f, 0xff, 0xff, 0xff, 0x7f,
++      0xff, 0xff, 0xff, 0x7f, 0xff, 0xff, 0xff, 0x7f,
++      0xff, 0xff, 0xff, 0x7f, 0xff, 0xff, 0xff, 0x7f,
++      0xff, 0xff, 0xff, 0x7f, 0xff, 0xff, 0xff, 0x7f,
++      0xff, 0xff, 0xff, 0x7f, 0xff, 0xff, 0xff, 0x7f,
++      0xc0, 0x3d, 0x9f, 0x67, 0x35, 0x4a, 0x97, 0xb2,
++      0xf0, 0x74, 0xf7, 0x55, 0x15, 0x57, 0xe4, 0x9c
++};
++static const u8 enc_assoc063[] __initconst = {
++      0xff, 0xff, 0xff, 0x7f, 0xff, 0xff, 0xff, 0x7f,
++      0xff, 0xff, 0xff, 0x7f, 0xff, 0xff, 0xff, 0x7f
++};
++static const u8 enc_nonce063[] __initconst = {
++      0x00, 0x00, 0x00, 0x00, 0x01, 0xee, 0x32, 0x00
++};
++static const u8 enc_key063[] __initconst = {
++      0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
++      0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
++      0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
++      0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f
++};
++
++/* wycheproof - misc */
++static const u8 enc_input064[] __initconst = {
++      0xda, 0x92, 0xbf, 0xf7, 0x7f, 0x6b, 0xe8, 0xfc,
++      0xaa, 0x2c, 0xfb, 0xfb, 0x9b, 0xbc, 0x01, 0x97,
++      0x20, 0x66, 0xb8, 0x7c, 0xfc, 0x04, 0xc4, 0x04,
++      0x7f, 0x1f, 0xcf, 0xc1, 0x14, 0x2c, 0xd6, 0xc1,
++      0x1c, 0x43, 0x24, 0x24, 0xe1, 0x21, 0x03, 0x81,
++      0x74, 0x32, 0x5e, 0xc9, 0x5e, 0xa3, 0x73, 0x54,
++      0xf7, 0x96, 0x00, 0xad, 0x13, 0xa1, 0xd9, 0x9a,
++      0xac, 0x48, 0x4d, 0x58, 0x01, 0x78, 0x02, 0xc2,
++      0x85, 0x25, 0xbb, 0x3d, 0xbd, 0x96, 0x40, 0x85,
++      0xaa, 0xd8, 0x0d, 0x0f, 0x53, 0x09, 0x7a, 0x7d,
++      0x48, 0xb3, 0xa5, 0x9d, 0x19, 0xf3, 0xfa, 0xff,
++      0x67, 0xe5, 0xb6, 0x47, 0xba, 0x6c, 0x6d, 0xbb,
++      0x64, 0x4d, 0x0d, 0xfb, 0x49, 0xb9, 0x10, 0xb8,
++      0x0c, 0x0f, 0x4e, 0x49, 0xe2, 0x3c, 0xb7, 0x92,
++      0x88, 0x2c, 0xf4, 0xba, 0x89, 0x6d, 0x12, 0x47,
++      0x04, 0x53, 0xfe, 0xf7, 0xc7, 0xfb, 0x77, 0xb8
++};
++static const u8 enc_output064[] __initconst = {
++      0xff, 0xff, 0xff, 0x7f, 0xff, 0xff, 0xff, 0x7f,
++      0xff, 0xff, 0xff, 0x7f, 0xff, 0xff, 0xff, 0x7f,
++      0xff, 0xff, 0xff, 0x7f, 0xff, 0xff, 0xff, 0x7f,
++      0xff, 0xff, 0xff, 0x7f, 0xff, 0xff, 0xff, 0x7f,
++      0xff, 0xff, 0xff, 0x7f, 0xff, 0xff, 0xff, 0x7f,
++      0xff, 0xff, 0xff, 0x7f, 0xff, 0xff, 0xff, 0x7f,
++      0xff, 0xff, 0xff, 0x7f, 0xff, 0xff, 0xff, 0x7f,
++      0xff, 0xff, 0xff, 0x7f, 0xff, 0xff, 0xff, 0x7f,
++      0xff, 0xff, 0xff, 0x7f, 0xff, 0xff, 0xff, 0x7f,
++      0xff, 0xff, 0xff, 0x7f, 0xff, 0xff, 0xff, 0x7f,
++      0xff, 0xff, 0xff, 0x7f, 0xff, 0xff, 0xff, 0x7f,
++      0xff, 0xff, 0xff, 0x7f, 0xff, 0xff, 0xff, 0x7f,
++      0xff, 0xff, 0xff, 0x7f, 0xff, 0xff, 0xff, 0x7f,
++      0xff, 0xff, 0xff, 0x7f, 0xff, 0xff, 0xff, 0x7f,
++      0xff, 0xff, 0xff, 0x7f, 0xff, 0xff, 0xff, 0x7f,
++      0xff, 0xff, 0xff, 0x7f, 0xff, 0xff, 0xff, 0x7f,
++      0xc8, 0x6d, 0xa8, 0xdd, 0x65, 0x22, 0x86, 0xd5,
++      0x02, 0x13, 0xd3, 0x28, 0xd6, 0x3e, 0x40, 0x06
++};
++static const u8 enc_assoc064[] __initconst = {
++      0xff, 0xff, 0xff, 0x7f, 0xff, 0xff, 0xff, 0x7f,
++      0xff, 0xff, 0xff, 0x7f, 0xff, 0xff, 0xff, 0x7f
++};
++static const u8 enc_nonce064[] __initconst = {
++      0x00, 0x00, 0x00, 0x00, 0x01, 0xee, 0x32, 0x00
++};
++static const u8 enc_key064[] __initconst = {
++      0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
++      0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
++      0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
++      0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f
++};
++
++/* wycheproof - misc */
++static const u8 enc_input065[] __initconst = {
++      0x5a, 0x92, 0xbf, 0x77, 0xff, 0x6b, 0xe8, 0x7c,
++      0x2a, 0x2c, 0xfb, 0x7b, 0x1b, 0xbc, 0x01, 0x17,
++      0xa0, 0x66, 0xb8, 0xfc, 0x7c, 0x04, 0xc4, 0x84,
++      0xff, 0x1f, 0xcf, 0x41, 0x94, 0x2c, 0xd6, 0x41
++};
++static const u8 enc_output065[] __initconst = {
++      0x7f, 0xff, 0xff, 0xff, 0x7f, 0xff, 0xff, 0xff,
++      0x7f, 0xff, 0xff, 0xff, 0x7f, 0xff, 0xff, 0xff,
++      0x7f, 0xff, 0xff, 0xff, 0x7f, 0xff, 0xff, 0xff,
++      0x7f, 0xff, 0xff, 0xff, 0x7f, 0xff, 0xff, 0xff,
++      0xbe, 0xde, 0x90, 0x83, 0xce, 0xb3, 0x6d, 0xdf,
++      0xe5, 0xfa, 0x81, 0x1f, 0x95, 0x47, 0x1c, 0x67
++};
++static const u8 enc_assoc065[] __initconst = {
++      0x7f, 0xff, 0xff, 0xff, 0x7f, 0xff, 0xff, 0xff,
++      0x7f, 0xff, 0xff, 0xff, 0x7f, 0xff, 0xff, 0xff
++};
++static const u8 enc_nonce065[] __initconst = {
++      0x00, 0x00, 0x00, 0x00, 0x01, 0xee, 0x32, 0x00
++};
++static const u8 enc_key065[] __initconst = {
++      0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
++      0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
++      0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
++      0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f
++};
++
++/* wycheproof - misc */
++static const u8 enc_input066[] __initconst = {
++      0x5a, 0x92, 0xbf, 0x77, 0xff, 0x6b, 0xe8, 0x7c,
++      0x2a, 0x2c, 0xfb, 0x7b, 0x1b, 0xbc, 0x01, 0x17,
++      0xa0, 0x66, 0xb8, 0xfc, 0x7c, 0x04, 0xc4, 0x84,
++      0xff, 0x1f, 0xcf, 0x41, 0x94, 0x2c, 0xd6, 0x41,
++      0x9c, 0x43, 0x24, 0xa4, 0x61, 0x21, 0x03, 0x01,
++      0xf4, 0x32, 0x5e, 0x49, 0xde, 0xa3, 0x73, 0xd4,
++      0x77, 0x96, 0x00, 0x2d, 0x93, 0xa1, 0xd9, 0x1a,
++      0x2c, 0x48, 0x4d, 0xd8, 0x81, 0x78, 0x02, 0x42
++};
++static const u8 enc_output066[] __initconst = {
++      0x7f, 0xff, 0xff, 0xff, 0x7f, 0xff, 0xff, 0xff,
++      0x7f, 0xff, 0xff, 0xff, 0x7f, 0xff, 0xff, 0xff,
++      0x7f, 0xff, 0xff, 0xff, 0x7f, 0xff, 0xff, 0xff,
++      0x7f, 0xff, 0xff, 0xff, 0x7f, 0xff, 0xff, 0xff,
++      0x7f, 0xff, 0xff, 0xff, 0x7f, 0xff, 0xff, 0xff,
++      0x7f, 0xff, 0xff, 0xff, 0x7f, 0xff, 0xff, 0xff,
++      0x7f, 0xff, 0xff, 0xff, 0x7f, 0xff, 0xff, 0xff,
++      0x7f, 0xff, 0xff, 0xff, 0x7f, 0xff, 0xff, 0xff,
++      0x30, 0x08, 0x74, 0xbb, 0x06, 0x92, 0xb6, 0x89,
++      0xde, 0xad, 0x9a, 0xe1, 0x5b, 0x06, 0x73, 0x90
++};
++static const u8 enc_assoc066[] __initconst = {
++      0x7f, 0xff, 0xff, 0xff, 0x7f, 0xff, 0xff, 0xff,
++      0x7f, 0xff, 0xff, 0xff, 0x7f, 0xff, 0xff, 0xff
++};
++static const u8 enc_nonce066[] __initconst = {
++      0x00, 0x00, 0x00, 0x00, 0x01, 0xee, 0x32, 0x00
++};
++static const u8 enc_key066[] __initconst = {
++      0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
++      0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
++      0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
++      0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f
++};
++
++/* wycheproof - misc */
++static const u8 enc_input067[] __initconst = {
++      0x5a, 0x92, 0xbf, 0x77, 0xff, 0x6b, 0xe8, 0x7c,
++      0x2a, 0x2c, 0xfb, 0x7b, 0x1b, 0xbc, 0x01, 0x17,
++      0xa0, 0x66, 0xb8, 0xfc, 0x7c, 0x04, 0xc4, 0x84,
++      0xff, 0x1f, 0xcf, 0x41, 0x94, 0x2c, 0xd6, 0x41,
++      0x9c, 0x43, 0x24, 0xa4, 0x61, 0x21, 0x03, 0x01,
++      0xf4, 0x32, 0x5e, 0x49, 0xde, 0xa3, 0x73, 0xd4,
++      0x77, 0x96, 0x00, 0x2d, 0x93, 0xa1, 0xd9, 0x1a,
++      0x2c, 0x48, 0x4d, 0xd8, 0x81, 0x78, 0x02, 0x42,
++      0x05, 0x25, 0xbb, 0xbd, 0x3d, 0x96, 0x40, 0x05,
++      0x2a, 0xd8, 0x0d, 0x8f, 0xd3, 0x09, 0x7a, 0xfd,
++      0xc8, 0xb3, 0xa5, 0x1d, 0x99, 0xf3, 0xfa, 0x7f,
++      0xe7, 0xe5, 0xb6, 0xc7, 0x3a, 0x6c, 0x6d, 0x3b,
++      0xe4, 0x4d, 0x0d, 0x7b, 0xc9, 0xb9, 0x10, 0x38,
++      0x8c, 0x0f, 0x4e, 0xc9, 0x62, 0x3c, 0xb7, 0x12,
++      0x08, 0x2c, 0xf4, 0x3a, 0x09, 0x6d, 0x12, 0xc7,
++      0x84, 0x53, 0xfe, 0x77, 0x47, 0xfb, 0x77, 0x38
++};
++static const u8 enc_output067[] __initconst = {
++      0x7f, 0xff, 0xff, 0xff, 0x7f, 0xff, 0xff, 0xff,
++      0x7f, 0xff, 0xff, 0xff, 0x7f, 0xff, 0xff, 0xff,
++      0x7f, 0xff, 0xff, 0xff, 0x7f, 0xff, 0xff, 0xff,
++      0x7f, 0xff, 0xff, 0xff, 0x7f, 0xff, 0xff, 0xff,
++      0x7f, 0xff, 0xff, 0xff, 0x7f, 0xff, 0xff, 0xff,
++      0x7f, 0xff, 0xff, 0xff, 0x7f, 0xff, 0xff, 0xff,
++      0x7f, 0xff, 0xff, 0xff, 0x7f, 0xff, 0xff, 0xff,
++      0x7f, 0xff, 0xff, 0xff, 0x7f, 0xff, 0xff, 0xff,
++      0x7f, 0xff, 0xff, 0xff, 0x7f, 0xff, 0xff, 0xff,
++      0x7f, 0xff, 0xff, 0xff, 0x7f, 0xff, 0xff, 0xff,
++      0x7f, 0xff, 0xff, 0xff, 0x7f, 0xff, 0xff, 0xff,
++      0x7f, 0xff, 0xff, 0xff, 0x7f, 0xff, 0xff, 0xff,
++      0x7f, 0xff, 0xff, 0xff, 0x7f, 0xff, 0xff, 0xff,
++      0x7f, 0xff, 0xff, 0xff, 0x7f, 0xff, 0xff, 0xff,
++      0x7f, 0xff, 0xff, 0xff, 0x7f, 0xff, 0xff, 0xff,
++      0x7f, 0xff, 0xff, 0xff, 0x7f, 0xff, 0xff, 0xff,
++      0x99, 0xca, 0xd8, 0x5f, 0x45, 0xca, 0x40, 0x94,
++      0x2d, 0x0d, 0x4d, 0x5e, 0x95, 0x0a, 0xde, 0x22
++};
++static const u8 enc_assoc067[] __initconst = {
++      0x7f, 0xff, 0xff, 0xff, 0x7f, 0xff, 0xff, 0xff,
++      0x7f, 0xff, 0xff, 0xff, 0x7f, 0xff, 0xff, 0xff
++};
++static const u8 enc_nonce067[] __initconst = {
++      0x00, 0x00, 0x00, 0x00, 0x01, 0xee, 0x32, 0x00
++};
++static const u8 enc_key067[] __initconst = {
++      0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
++      0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
++      0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
++      0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f
++};
++
++/* wycheproof - misc */
++static const u8 enc_input068[] __initconst = {
++      0x25, 0x6d, 0x40, 0x88, 0x7f, 0x6b, 0xe8, 0x7c,
++      0x55, 0xd3, 0x04, 0x84, 0x9b, 0xbc, 0x01, 0x17,
++      0xdf, 0x99, 0x47, 0x03, 0xfc, 0x04, 0xc4, 0x84,
++      0x80, 0xe0, 0x30, 0xbe, 0x14, 0x2c, 0xd6, 0x41
++};
++static const u8 enc_output068[] __initconst = {
++      0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
++      0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
++      0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
++      0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
++      0x8b, 0xbe, 0x14, 0x52, 0x72, 0xe7, 0xc2, 0xd9,
++      0xa1, 0x89, 0x1a, 0x3a, 0xb0, 0x98, 0x3d, 0x9d
++};
++static const u8 enc_assoc068[] __initconst = {
++      0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
++      0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff
++};
++static const u8 enc_nonce068[] __initconst = {
++      0x00, 0x00, 0x00, 0x00, 0x01, 0xee, 0x32, 0x00
++};
++static const u8 enc_key068[] __initconst = {
++      0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
++      0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
++      0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
++      0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f
++};
++
++/* wycheproof - misc */
++static const u8 enc_input069[] __initconst = {
++      0x25, 0x6d, 0x40, 0x88, 0x7f, 0x6b, 0xe8, 0x7c,
++      0x55, 0xd3, 0x04, 0x84, 0x9b, 0xbc, 0x01, 0x17,
++      0xdf, 0x99, 0x47, 0x03, 0xfc, 0x04, 0xc4, 0x84,
++      0x80, 0xe0, 0x30, 0xbe, 0x14, 0x2c, 0xd6, 0x41,
++      0xe3, 0xbc, 0xdb, 0x5b, 0xe1, 0x21, 0x03, 0x01,
++      0x8b, 0xcd, 0xa1, 0xb6, 0x5e, 0xa3, 0x73, 0xd4,
++      0x08, 0x69, 0xff, 0xd2, 0x13, 0xa1, 0xd9, 0x1a,
++      0x53, 0xb7, 0xb2, 0x27, 0x01, 0x78, 0x02, 0x42
++};
++static const u8 enc_output069[] __initconst = {
++      0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
++      0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
++      0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
++      0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
++      0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
++      0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
++      0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
++      0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
++      0x3b, 0x41, 0x86, 0x19, 0x13, 0xa8, 0xf6, 0xde,
++      0x7f, 0x61, 0xe2, 0x25, 0x63, 0x1b, 0xc3, 0x82
++};
++static const u8 enc_assoc069[] __initconst = {
++      0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
++      0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff
++};
++static const u8 enc_nonce069[] __initconst = {
++      0x00, 0x00, 0x00, 0x00, 0x01, 0xee, 0x32, 0x00
++};
++static const u8 enc_key069[] __initconst = {
++      0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
++      0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
++      0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
++      0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f
++};
++
++/* wycheproof - misc */
++static const u8 enc_input070[] __initconst = {
++      0x25, 0x6d, 0x40, 0x88, 0x7f, 0x6b, 0xe8, 0x7c,
++      0x55, 0xd3, 0x04, 0x84, 0x9b, 0xbc, 0x01, 0x17,
++      0xdf, 0x99, 0x47, 0x03, 0xfc, 0x04, 0xc4, 0x84,
++      0x80, 0xe0, 0x30, 0xbe, 0x14, 0x2c, 0xd6, 0x41,
++      0xe3, 0xbc, 0xdb, 0x5b, 0xe1, 0x21, 0x03, 0x01,
++      0x8b, 0xcd, 0xa1, 0xb6, 0x5e, 0xa3, 0x73, 0xd4,
++      0x08, 0x69, 0xff, 0xd2, 0x13, 0xa1, 0xd9, 0x1a,
++      0x53, 0xb7, 0xb2, 0x27, 0x01, 0x78, 0x02, 0x42,
++      0x7a, 0xda, 0x44, 0x42, 0xbd, 0x96, 0x40, 0x05,
++      0x55, 0x27, 0xf2, 0x70, 0x53, 0x09, 0x7a, 0xfd,
++      0xb7, 0x4c, 0x5a, 0xe2, 0x19, 0xf3, 0xfa, 0x7f,
++      0x98, 0x1a, 0x49, 0x38, 0xba, 0x6c, 0x6d, 0x3b,
++      0x9b, 0xb2, 0xf2, 0x84, 0x49, 0xb9, 0x10, 0x38,
++      0xf3, 0xf0, 0xb1, 0x36, 0xe2, 0x3c, 0xb7, 0x12,
++      0x77, 0xd3, 0x0b, 0xc5, 0x89, 0x6d, 0x12, 0xc7,
++      0xfb, 0xac, 0x01, 0x88, 0xc7, 0xfb, 0x77, 0x38
++};
++static const u8 enc_output070[] __initconst = {
++      0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
++      0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
++      0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
++      0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
++      0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
++      0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
++      0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
++      0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
++      0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
++      0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
++      0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
++      0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
++      0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
++      0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
++      0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
++      0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
++      0x84, 0x28, 0xbc, 0xf0, 0x23, 0xec, 0x6b, 0xf3,
++      0x1f, 0xd9, 0xef, 0xb2, 0x03, 0xff, 0x08, 0x71
++};
++static const u8 enc_assoc070[] __initconst = {
++      0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
++      0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff
++};
++static const u8 enc_nonce070[] __initconst = {
++      0x00, 0x00, 0x00, 0x00, 0x01, 0xee, 0x32, 0x00
++};
++static const u8 enc_key070[] __initconst = {
++      0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
++      0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
++      0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
++      0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f
++};
++
++/* wycheproof - misc */
++static const u8 enc_input071[] __initconst = {
++      0xda, 0x92, 0xbf, 0x77, 0x80, 0x94, 0x17, 0x83,
++      0xaa, 0x2c, 0xfb, 0x7b, 0x64, 0x43, 0xfe, 0xe8,
++      0x20, 0x66, 0xb8, 0xfc, 0x03, 0xfb, 0x3b, 0x7b,
++      0x7f, 0x1f, 0xcf, 0x41, 0xeb, 0xd3, 0x29, 0xbe
++};
++static const u8 enc_output071[] __initconst = {
++      0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00,
++      0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00,
++      0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00,
++      0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00,
++      0x13, 0x9f, 0xdf, 0x64, 0x74, 0xea, 0x24, 0xf5,
++      0x49, 0xb0, 0x75, 0x82, 0x5f, 0x2c, 0x76, 0x20
++};
++static const u8 enc_assoc071[] __initconst = {
++      0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00,
++      0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00
++};
++static const u8 enc_nonce071[] __initconst = {
++      0x00, 0x00, 0x00, 0x00, 0x01, 0xee, 0x32, 0x00
++};
++static const u8 enc_key071[] __initconst = {
++      0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
++      0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
++      0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
++      0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f
++};
++
++/* wycheproof - misc */
++static const u8 enc_input072[] __initconst = {
++      0xda, 0x92, 0xbf, 0x77, 0x80, 0x94, 0x17, 0x83,
++      0xaa, 0x2c, 0xfb, 0x7b, 0x64, 0x43, 0xfe, 0xe8,
++      0x20, 0x66, 0xb8, 0xfc, 0x03, 0xfb, 0x3b, 0x7b,
++      0x7f, 0x1f, 0xcf, 0x41, 0xeb, 0xd3, 0x29, 0xbe,
++      0x1c, 0x43, 0x24, 0xa4, 0x1e, 0xde, 0xfc, 0xfe,
++      0x74, 0x32, 0x5e, 0x49, 0xa1, 0x5c, 0x8c, 0x2b,
++      0xf7, 0x96, 0x00, 0x2d, 0xec, 0x5e, 0x26, 0xe5,
++      0xac, 0x48, 0x4d, 0xd8, 0xfe, 0x87, 0xfd, 0xbd
++};
++static const u8 enc_output072[] __initconst = {
++      0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00,
++      0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00,
++      0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00,
++      0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00,
++      0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00,
++      0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00,
++      0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00,
++      0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00,
++      0xbb, 0xad, 0x8d, 0x86, 0x3b, 0x83, 0x5a, 0x8e,
++      0x86, 0x64, 0xfd, 0x1d, 0x45, 0x66, 0xb6, 0xb4
++};
++static const u8 enc_assoc072[] __initconst = {
++      0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00,
++      0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00
++};
++static const u8 enc_nonce072[] __initconst = {
++      0x00, 0x00, 0x00, 0x00, 0x01, 0xee, 0x32, 0x00
++};
++static const u8 enc_key072[] __initconst = {
++      0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
++      0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
++      0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
++      0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f
++};
++
++/* wycheproof - misc */
++static const u8 enc_input073[] __initconst = {
++      0xda, 0x92, 0xbf, 0x77, 0x80, 0x94, 0x17, 0x83,
++      0xaa, 0x2c, 0xfb, 0x7b, 0x64, 0x43, 0xfe, 0xe8,
++      0x20, 0x66, 0xb8, 0xfc, 0x03, 0xfb, 0x3b, 0x7b,
++      0x7f, 0x1f, 0xcf, 0x41, 0xeb, 0xd3, 0x29, 0xbe,
++      0x1c, 0x43, 0x24, 0xa4, 0x1e, 0xde, 0xfc, 0xfe,
++      0x74, 0x32, 0x5e, 0x49, 0xa1, 0x5c, 0x8c, 0x2b,
++      0xf7, 0x96, 0x00, 0x2d, 0xec, 0x5e, 0x26, 0xe5,
++      0xac, 0x48, 0x4d, 0xd8, 0xfe, 0x87, 0xfd, 0xbd,
++      0x85, 0x25, 0xbb, 0xbd, 0x42, 0x69, 0xbf, 0xfa,
++      0xaa, 0xd8, 0x0d, 0x8f, 0xac, 0xf6, 0x85, 0x02,
++      0x48, 0xb3, 0xa5, 0x1d, 0xe6, 0x0c, 0x05, 0x80,
++      0x67, 0xe5, 0xb6, 0xc7, 0x45, 0x93, 0x92, 0xc4,
++      0x64, 0x4d, 0x0d, 0x7b, 0xb6, 0x46, 0xef, 0xc7,
++      0x0c, 0x0f, 0x4e, 0xc9, 0x1d, 0xc3, 0x48, 0xed,
++      0x88, 0x2c, 0xf4, 0x3a, 0x76, 0x92, 0xed, 0x38,
++      0x04, 0x53, 0xfe, 0x77, 0x38, 0x04, 0x88, 0xc7
++};
++static const u8 enc_output073[] __initconst = {
++      0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00,
++      0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00,
++      0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00,
++      0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00,
++      0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00,
++      0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00,
++      0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00,
++      0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00,
++      0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00,
++      0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00,
++      0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00,
++      0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00,
++      0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00,
++      0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00,
++      0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00,
++      0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00,
++      0x42, 0xf2, 0x35, 0x42, 0x97, 0x84, 0x9a, 0x51,
++      0x1d, 0x53, 0xe5, 0x57, 0x17, 0x72, 0xf7, 0x1f
++};
++static const u8 enc_assoc073[] __initconst = {
++      0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00,
++      0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00
++};
++static const u8 enc_nonce073[] __initconst = {
++      0x00, 0x00, 0x00, 0x00, 0x01, 0xee, 0x32, 0x00
++};
++static const u8 enc_key073[] __initconst = {
++      0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
++      0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
++      0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
++      0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f
++};
++
++/* wycheproof - checking for int overflows */
++static const u8 enc_input076[] __initconst = {
++      0x1b, 0x99, 0x6f, 0x9a, 0x3c, 0xcc, 0x67, 0x85,
++      0xde, 0x22, 0xff, 0x5b, 0x8a, 0xdd, 0x95, 0x02,
++      0xce, 0x03, 0xa0, 0xfa, 0xf5, 0x99, 0x2a, 0x09,
++      0x52, 0x2c, 0xdd, 0x12, 0x06, 0xd2, 0x20, 0xb8,
++      0xf8, 0xbd, 0x07, 0xd1, 0xf1, 0xf5, 0xa1, 0xbd,
++      0x9a, 0x71, 0xd1, 0x1c, 0x7f, 0x57, 0x9b, 0x85,
++      0x58, 0x18, 0xc0, 0x8d, 0x4d, 0xe0, 0x36, 0x39,
++      0x31, 0x83, 0xb7, 0xf5, 0x90, 0xb3, 0x35, 0xae,
++      0xd8, 0xde, 0x5b, 0x57, 0xb1, 0x3c, 0x5f, 0xed,
++      0xe2, 0x44, 0x1c, 0x3e, 0x18, 0x4a, 0xa9, 0xd4,
++      0x6e, 0x61, 0x59, 0x85, 0x06, 0xb3, 0xe1, 0x1c,
++      0x43, 0xc6, 0x2c, 0xbc, 0xac, 0xec, 0xed, 0x33,
++      0x19, 0x08, 0x75, 0xb0, 0x12, 0x21, 0x8b, 0x19,
++      0x30, 0xfb, 0x7c, 0x38, 0xec, 0x45, 0xac, 0x11,
++      0xc3, 0x53, 0xd0, 0xcf, 0x93, 0x8d, 0xcc, 0xb9,
++      0xef, 0xad, 0x8f, 0xed, 0xbe, 0x46, 0xda, 0xa5
++};
++static const u8 enc_output076[] __initconst = {
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0x4b, 0x0b, 0xda, 0x8a, 0xd0, 0x43, 0x83, 0x0d,
++      0x83, 0x19, 0xab, 0x82, 0xc5, 0x0c, 0x76, 0x63
++};
++static const u8 enc_assoc076[] __initconst = {
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
++};
++static const u8 enc_nonce076[] __initconst = {
++      0x00, 0x00, 0x00, 0x00, 0x00, 0x07, 0xb4, 0xf0
++};
++static const u8 enc_key076[] __initconst = {
++      0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30,
++      0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30,
++      0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30,
++      0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30
++};
++
++/* wycheproof - checking for int overflows */
++static const u8 enc_input077[] __initconst = {
++      0x86, 0xcb, 0xac, 0xae, 0x4d, 0x3f, 0x74, 0xae,
++      0x01, 0x21, 0x3e, 0x05, 0x51, 0xcc, 0x15, 0x16,
++      0x0e, 0xa1, 0xbe, 0x84, 0x08, 0xe3, 0xd5, 0xd7,
++      0x4f, 0x01, 0x46, 0x49, 0x95, 0xa6, 0x9e, 0x61,
++      0x76, 0xcb, 0x9e, 0x02, 0xb2, 0x24, 0x7e, 0xd2,
++      0x99, 0x89, 0x2f, 0x91, 0x82, 0xa4, 0x5c, 0xaf,
++      0x4c, 0x69, 0x40, 0x56, 0x11, 0x76, 0x6e, 0xdf,
++      0xaf, 0xdc, 0x28, 0x55, 0x19, 0xea, 0x30, 0x48,
++      0x0c, 0x44, 0xf0, 0x5e, 0x78, 0x1e, 0xac, 0xf8,
++      0xfc, 0xec, 0xc7, 0x09, 0x0a, 0xbb, 0x28, 0xfa,
++      0x5f, 0xd5, 0x85, 0xac, 0x8c, 0xda, 0x7e, 0x87,
++      0x72, 0xe5, 0x94, 0xe4, 0xce, 0x6c, 0x88, 0x32,
++      0x81, 0x93, 0x2e, 0x0f, 0x89, 0xf8, 0x77, 0xa1,
++      0xf0, 0x4d, 0x9c, 0x32, 0xb0, 0x6c, 0xf9, 0x0b,
++      0x0e, 0x76, 0x2b, 0x43, 0x0c, 0x4d, 0x51, 0x7c,
++      0x97, 0x10, 0x70, 0x68, 0xf4, 0x98, 0xef, 0x7f
++};
++static const u8 enc_output077[] __initconst = {
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0x4b, 0xc9, 0x8f, 0x72, 0xc4, 0x94, 0xc2, 0xa4,
++      0x3c, 0x2b, 0x15, 0xa1, 0x04, 0x3f, 0x1c, 0xfa
++};
++static const u8 enc_assoc077[] __initconst = {
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
++};
++static const u8 enc_nonce077[] __initconst = {
++      0x00, 0x00, 0x00, 0x00, 0x00, 0x20, 0xfb, 0x66
++};
++static const u8 enc_key077[] __initconst = {
++      0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30,
++      0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30,
++      0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30,
++      0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30
++};
++
++/* wycheproof - checking for int overflows */
++static const u8 enc_input078[] __initconst = {
++      0xfa, 0xb1, 0xcd, 0xdf, 0x4f, 0xe1, 0x98, 0xef,
++      0x63, 0xad, 0xd8, 0x81, 0xd6, 0xea, 0xd6, 0xc5,
++      0x76, 0x37, 0xbb, 0xe9, 0x20, 0x18, 0xca, 0x7c,
++      0x0b, 0x96, 0xfb, 0xa0, 0x87, 0x1e, 0x93, 0x2d,
++      0xb1, 0xfb, 0xf9, 0x07, 0x61, 0xbe, 0x25, 0xdf,
++      0x8d, 0xfa, 0xf9, 0x31, 0xce, 0x57, 0x57, 0xe6,
++      0x17, 0xb3, 0xd7, 0xa9, 0xf0, 0xbf, 0x0f, 0xfe,
++      0x5d, 0x59, 0x1a, 0x33, 0xc1, 0x43, 0xb8, 0xf5,
++      0x3f, 0xd0, 0xb5, 0xa1, 0x96, 0x09, 0xfd, 0x62,
++      0xe5, 0xc2, 0x51, 0xa4, 0x28, 0x1a, 0x20, 0x0c,
++      0xfd, 0xc3, 0x4f, 0x28, 0x17, 0x10, 0x40, 0x6f,
++      0x4e, 0x37, 0x62, 0x54, 0x46, 0xff, 0x6e, 0xf2,
++      0x24, 0x91, 0x3d, 0xeb, 0x0d, 0x89, 0xaf, 0x33,
++      0x71, 0x28, 0xe3, 0xd1, 0x55, 0xd1, 0x6d, 0x3e,
++      0xc3, 0x24, 0x60, 0x41, 0x43, 0x21, 0x43, 0xe9,
++      0xab, 0x3a, 0x6d, 0x2c, 0xcc, 0x2f, 0x4d, 0x62
++};
++static const u8 enc_output078[] __initconst = {
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xf7, 0xe9, 0xe1, 0x51, 0xb0, 0x25, 0x33, 0xc7,
++      0x46, 0x58, 0xbf, 0xc7, 0x73, 0x7c, 0x68, 0x0d
++};
++static const u8 enc_assoc078[] __initconst = {
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
++};
++static const u8 enc_nonce078[] __initconst = {
++      0x00, 0x00, 0x00, 0x00, 0x00, 0x38, 0xbb, 0x90
++};
++static const u8 enc_key078[] __initconst = {
++      0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30,
++      0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30,
++      0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30,
++      0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30
++};
++
++/* wycheproof - checking for int overflows */
++static const u8 enc_input079[] __initconst = {
++      0x22, 0x72, 0x02, 0xbe, 0x7f, 0x35, 0x15, 0xe9,
++      0xd1, 0xc0, 0x2e, 0xea, 0x2f, 0x19, 0x50, 0xb6,
++      0x48, 0x1b, 0x04, 0x8a, 0x4c, 0x91, 0x50, 0x6c,
++      0xb4, 0x0d, 0x50, 0x4e, 0x6c, 0x94, 0x9f, 0x82,
++      0xd1, 0x97, 0xc2, 0x5a, 0xd1, 0x7d, 0xc7, 0x21,
++      0x65, 0x11, 0x25, 0x78, 0x2a, 0xc7, 0xa7, 0x12,
++      0x47, 0xfe, 0xae, 0xf3, 0x2f, 0x1f, 0x25, 0x0c,
++      0xe4, 0xbb, 0x8f, 0x79, 0xac, 0xaa, 0x17, 0x9d,
++      0x45, 0xa7, 0xb0, 0x54, 0x5f, 0x09, 0x24, 0x32,
++      0x5e, 0xfa, 0x87, 0xd5, 0xe4, 0x41, 0xd2, 0x84,
++      0x78, 0xc6, 0x1f, 0x22, 0x23, 0xee, 0x67, 0xc3,
++      0xb4, 0x1f, 0x43, 0x94, 0x53, 0x5e, 0x2a, 0x24,
++      0x36, 0x9a, 0x2e, 0x16, 0x61, 0x3c, 0x45, 0x94,
++      0x90, 0xc1, 0x4f, 0xb1, 0xd7, 0x55, 0xfe, 0x53,
++      0xfb, 0xe1, 0xee, 0x45, 0xb1, 0xb2, 0x1f, 0x71,
++      0x62, 0xe2, 0xfc, 0xaa, 0x74, 0x2a, 0xbe, 0xfd
++};
++static const u8 enc_output079[] __initconst = {
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0x79, 0x5b, 0xcf, 0xf6, 0x47, 0xc5, 0x53, 0xc2,
++      0xe4, 0xeb, 0x6e, 0x0e, 0xaf, 0xd9, 0xe0, 0x4e
++};
++static const u8 enc_assoc079[] __initconst = {
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
++};
++static const u8 enc_nonce079[] __initconst = {
++      0x00, 0x00, 0x00, 0x00, 0x00, 0x70, 0x48, 0x4a
++};
++static const u8 enc_key079[] __initconst = {
++      0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30,
++      0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30,
++      0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30,
++      0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30
++};
++
++/* wycheproof - checking for int overflows */
++static const u8 enc_input080[] __initconst = {
++      0xfa, 0xe5, 0x83, 0x45, 0xc1, 0x6c, 0xb0, 0xf5,
++      0xcc, 0x53, 0x7f, 0x2b, 0x1b, 0x34, 0x69, 0xc9,
++      0x69, 0x46, 0x3b, 0x3e, 0xa7, 0x1b, 0xcf, 0x6b,
++      0x98, 0xd6, 0x69, 0xa8, 0xe6, 0x0e, 0x04, 0xfc,
++      0x08, 0xd5, 0xfd, 0x06, 0x9c, 0x36, 0x26, 0x38,
++      0xe3, 0x40, 0x0e, 0xf4, 0xcb, 0x24, 0x2e, 0x27,
++      0xe2, 0x24, 0x5e, 0x68, 0xcb, 0x9e, 0xc5, 0x83,
++      0xda, 0x53, 0x40, 0xb1, 0x2e, 0xdf, 0x42, 0x3b,
++      0x73, 0x26, 0xad, 0x20, 0xfe, 0xeb, 0x57, 0xda,
++      0xca, 0x2e, 0x04, 0x67, 0xa3, 0x28, 0x99, 0xb4,
++      0x2d, 0xf8, 0xe5, 0x6d, 0x84, 0xe0, 0x06, 0xbc,
++      0x8a, 0x7a, 0xcc, 0x73, 0x1e, 0x7c, 0x1f, 0x6b,
++      0xec, 0xb5, 0x71, 0x9f, 0x70, 0x77, 0xf0, 0xd4,
++      0xf4, 0xc6, 0x1a, 0xb1, 0x1e, 0xba, 0xc1, 0x00,
++      0x18, 0x01, 0xce, 0x33, 0xc4, 0xe4, 0xa7, 0x7d,
++      0x83, 0x1d, 0x3c, 0xe3, 0x4e, 0x84, 0x10, 0xe1
++};
++static const u8 enc_output080[] __initconst = {
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0x19, 0x46, 0xd6, 0x53, 0x96, 0x0f, 0x94, 0x7a,
++      0x74, 0xd3, 0xe8, 0x09, 0x3c, 0xf4, 0x85, 0x02
++};
++static const u8 enc_assoc080[] __initconst = {
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
++};
++static const u8 enc_nonce080[] __initconst = {
++      0x00, 0x00, 0x00, 0x00, 0x00, 0x93, 0x2f, 0x40
++};
++static const u8 enc_key080[] __initconst = {
++      0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30,
++      0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30,
++      0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30,
++      0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30
++};
++
++/* wycheproof - checking for int overflows */
++static const u8 enc_input081[] __initconst = {
++      0xeb, 0xb2, 0x16, 0xdd, 0xd7, 0xca, 0x70, 0x92,
++      0x15, 0xf5, 0x03, 0xdf, 0x9c, 0xe6, 0x3c, 0x5c,
++      0xd2, 0x19, 0x4e, 0x7d, 0x90, 0x99, 0xe8, 0xa9,
++      0x0b, 0x2a, 0xfa, 0xad, 0x5e, 0xba, 0x35, 0x06,
++      0x99, 0x25, 0xa6, 0x03, 0xfd, 0xbc, 0x34, 0x1a,
++      0xae, 0xd4, 0x15, 0x05, 0xb1, 0x09, 0x41, 0xfa,
++      0x38, 0x56, 0xa7, 0xe2, 0x47, 0xb1, 0x04, 0x07,
++      0x09, 0x74, 0x6c, 0xfc, 0x20, 0x96, 0xca, 0xa6,
++      0x31, 0xb2, 0xff, 0xf4, 0x1c, 0x25, 0x05, 0x06,
++      0xd8, 0x89, 0xc1, 0xc9, 0x06, 0x71, 0xad, 0xe8,
++      0x53, 0xee, 0x63, 0x94, 0xc1, 0x91, 0x92, 0xa5,
++      0xcf, 0x37, 0x10, 0xd1, 0x07, 0x30, 0x99, 0xe5,
++      0xbc, 0x94, 0x65, 0x82, 0xfc, 0x0f, 0xab, 0x9f,
++      0x54, 0x3c, 0x71, 0x6a, 0xe2, 0x48, 0x6a, 0x86,
++      0x83, 0xfd, 0xca, 0x39, 0xd2, 0xe1, 0x4f, 0x23,
++      0xd0, 0x0a, 0x58, 0x26, 0x64, 0xf4, 0xec, 0xb1
++};
++static const u8 enc_output081[] __initconst = {
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0x36, 0xc3, 0x00, 0x29, 0x85, 0xdd, 0x21, 0xba,
++      0xf8, 0x95, 0xd6, 0x33, 0x57, 0x3f, 0x12, 0xc0
++};
++static const u8 enc_assoc081[] __initconst = {
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
++};
++static const u8 enc_nonce081[] __initconst = {
++      0x00, 0x00, 0x00, 0x00, 0x00, 0xe2, 0x93, 0x35
++};
++static const u8 enc_key081[] __initconst = {
++      0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30,
++      0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30,
++      0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30,
++      0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30
++};
++
++/* wycheproof - checking for int overflows */
++static const u8 enc_input082[] __initconst = {
++      0x40, 0x8a, 0xe6, 0xef, 0x1c, 0x7e, 0xf0, 0xfb,
++      0x2c, 0x2d, 0x61, 0x08, 0x16, 0xfc, 0x78, 0x49,
++      0xef, 0xa5, 0x8f, 0x78, 0x27, 0x3f, 0x5f, 0x16,
++      0x6e, 0xa6, 0x5f, 0x81, 0xb5, 0x75, 0x74, 0x7d,
++      0x03, 0x5b, 0x30, 0x40, 0xfe, 0xde, 0x1e, 0xb9,
++      0x45, 0x97, 0x88, 0x66, 0x97, 0x88, 0x40, 0x8e,
++      0x00, 0x41, 0x3b, 0x3e, 0x37, 0x6d, 0x15, 0x2d,
++      0x20, 0x4a, 0xa2, 0xb7, 0xa8, 0x35, 0x58, 0xfc,
++      0xd4, 0x8a, 0x0e, 0xf7, 0xa2, 0x6b, 0x1c, 0xd6,
++      0xd3, 0x5d, 0x23, 0xb3, 0xf5, 0xdf, 0xe0, 0xca,
++      0x77, 0xa4, 0xce, 0x32, 0xb9, 0x4a, 0xbf, 0x83,
++      0xda, 0x2a, 0xef, 0xca, 0xf0, 0x68, 0x38, 0x08,
++      0x79, 0xe8, 0x9f, 0xb0, 0xa3, 0x82, 0x95, 0x95,
++      0xcf, 0x44, 0xc3, 0x85, 0x2a, 0xe2, 0xcc, 0x66,
++      0x2b, 0x68, 0x9f, 0x93, 0x55, 0xd9, 0xc1, 0x83,
++      0x80, 0x1f, 0x6a, 0xcc, 0x31, 0x3f, 0x89, 0x07
++};
++static const u8 enc_output082[] __initconst = {
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0x65, 0x14, 0x51, 0x8e, 0x0a, 0x26, 0x41, 0x42,
++      0xe0, 0xb7, 0x35, 0x1f, 0x96, 0x7f, 0xc2, 0xae
++};
++static const u8 enc_assoc082[] __initconst = {
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
++};
++static const u8 enc_nonce082[] __initconst = {
++      0x00, 0x00, 0x00, 0x00, 0x00, 0x0e, 0xf7, 0xd5
++};
++static const u8 enc_key082[] __initconst = {
++      0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
++      0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
++      0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
++      0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f
++};
++
++/* wycheproof - checking for int overflows */
++static const u8 enc_input083[] __initconst = {
++      0x0a, 0x0a, 0x24, 0x49, 0x9b, 0xca, 0xde, 0x58,
++      0xcf, 0x15, 0x76, 0xc3, 0x12, 0xac, 0xa9, 0x84,
++      0x71, 0x8c, 0xb4, 0xcc, 0x7e, 0x01, 0x53, 0xf5,
++      0xa9, 0x01, 0x58, 0x10, 0x85, 0x96, 0x44, 0xdf,
++      0xc0, 0x21, 0x17, 0x4e, 0x0b, 0x06, 0x0a, 0x39,
++      0x74, 0x48, 0xde, 0x8b, 0x48, 0x4a, 0x86, 0x03,
++      0xbe, 0x68, 0x0a, 0x69, 0x34, 0xc0, 0x90, 0x6f,
++      0x30, 0xdd, 0x17, 0xea, 0xe2, 0xd4, 0xc5, 0xfa,
++      0xa7, 0x77, 0xf8, 0xca, 0x53, 0x37, 0x0e, 0x08,
++      0x33, 0x1b, 0x88, 0xc3, 0x42, 0xba, 0xc9, 0x59,
++      0x78, 0x7b, 0xbb, 0x33, 0x93, 0x0e, 0x3b, 0x56,
++      0xbe, 0x86, 0xda, 0x7f, 0x2a, 0x6e, 0xb1, 0xf9,
++      0x40, 0x89, 0xd1, 0xd1, 0x81, 0x07, 0x4d, 0x43,
++      0x02, 0xf8, 0xe0, 0x55, 0x2d, 0x0d, 0xe1, 0xfa,
++      0xb3, 0x06, 0xa2, 0x1b, 0x42, 0xd4, 0xc3, 0xba,
++      0x6e, 0x6f, 0x0c, 0xbc, 0xc8, 0x1e, 0x87, 0x7a
++};
++static const u8 enc_output083[] __initconst = {
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0x4c, 0x19, 0x4d, 0xa6, 0xa9, 0x9f, 0xd6, 0x5b,
++      0x40, 0xe9, 0xca, 0xd7, 0x98, 0xf4, 0x4b, 0x19
++};
++static const u8 enc_assoc083[] __initconst = {
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
++};
++static const u8 enc_nonce083[] __initconst = {
++      0x00, 0x00, 0x00, 0x00, 0x00, 0x3d, 0xfc, 0xe4
++};
++static const u8 enc_key083[] __initconst = {
++      0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
++      0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
++      0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
++      0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f
++};
++
++/* wycheproof - checking for int overflows */
++static const u8 enc_input084[] __initconst = {
++      0x4a, 0x0a, 0xaf, 0xf8, 0x49, 0x47, 0x29, 0x18,
++      0x86, 0x91, 0x70, 0x13, 0x40, 0xf3, 0xce, 0x2b,
++      0x8a, 0x78, 0xee, 0xd3, 0xa0, 0xf0, 0x65, 0x99,
++      0x4b, 0x72, 0x48, 0x4e, 0x79, 0x91, 0xd2, 0x5c,
++      0x29, 0xaa, 0x07, 0x5e, 0xb1, 0xfc, 0x16, 0xde,
++      0x93, 0xfe, 0x06, 0x90, 0x58, 0x11, 0x2a, 0xb2,
++      0x84, 0xa3, 0xed, 0x18, 0x78, 0x03, 0x26, 0xd1,
++      0x25, 0x8a, 0x47, 0x22, 0x2f, 0xa6, 0x33, 0xd8,
++      0xb2, 0x9f, 0x3b, 0xd9, 0x15, 0x0b, 0x23, 0x9b,
++      0x15, 0x46, 0xc2, 0xbb, 0x9b, 0x9f, 0x41, 0x0f,
++      0xeb, 0xea, 0xd3, 0x96, 0x00, 0x0e, 0xe4, 0x77,
++      0x70, 0x15, 0x32, 0xc3, 0xd0, 0xf5, 0xfb, 0xf8,
++      0x95, 0xd2, 0x80, 0x19, 0x6d, 0x2f, 0x73, 0x7c,
++      0x5e, 0x9f, 0xec, 0x50, 0xd9, 0x2b, 0xb0, 0xdf,
++      0x5d, 0x7e, 0x51, 0x3b, 0xe5, 0xb8, 0xea, 0x97,
++      0x13, 0x10, 0xd5, 0xbf, 0x16, 0xba, 0x7a, 0xee
++};
++static const u8 enc_output084[] __initconst = {
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xc8, 0xae, 0x77, 0x88, 0xcd, 0x28, 0x74, 0xab,
++      0xc1, 0x38, 0x54, 0x1e, 0x11, 0xfd, 0x05, 0x87
++};
++static const u8 enc_assoc084[] __initconst = {
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
++};
++static const u8 enc_nonce084[] __initconst = {
++      0x00, 0x00, 0x00, 0x00, 0x01, 0x84, 0x86, 0xa8
++};
++static const u8 enc_key084[] __initconst = {
++      0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
++      0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
++      0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
++      0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f
++};
++
++/* wycheproof - checking for int overflows */
++static const u8 enc_input085[] __initconst = {
++      0xff, 0x94, 0x28, 0xd0, 0x79, 0x35, 0x1f, 0x66,
++      0x5c, 0xd0, 0x01, 0x35, 0x43, 0x19, 0x87, 0x5c,
++      0x78, 0x3d, 0x35, 0xf6, 0x13, 0xe6, 0xd9, 0x09,
++      0x3d, 0x38, 0xe9, 0x75, 0xc3, 0x8f, 0xe3, 0xb8,
++      0x9f, 0x7a, 0xed, 0x35, 0xcb, 0x5a, 0x2f, 0xca,
++      0xa0, 0x34, 0x6e, 0xfb, 0x93, 0x65, 0x54, 0x64,
++      0x9c, 0xf6, 0x37, 0x81, 0x71, 0xea, 0xe4, 0x39,
++      0x6e, 0xa1, 0x5d, 0xc2, 0x40, 0xd1, 0xab, 0xf4,
++      0x47, 0x2d, 0x90, 0x96, 0x52, 0x4f, 0xa1, 0xb2,
++      0xb0, 0x23, 0xb8, 0xb2, 0x88, 0x22, 0x27, 0x73,
++      0xd4, 0xd2, 0x06, 0x61, 0x6f, 0x92, 0x93, 0xf6,
++      0x5b, 0x45, 0xdb, 0xbc, 0x74, 0xe7, 0xc2, 0xed,
++      0xfb, 0xcb, 0xbf, 0x1c, 0xfb, 0x67, 0x9b, 0xb7,
++      0x39, 0xa5, 0x86, 0x2d, 0xe2, 0xbc, 0xb9, 0x37,
++      0xf7, 0x4d, 0x5b, 0xf8, 0x67, 0x1c, 0x5a, 0x8a,
++      0x50, 0x92, 0xf6, 0x1d, 0x54, 0xc9, 0xaa, 0x5b
++};
++static const u8 enc_output085[] __initconst = {
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0x93, 0x3a, 0x51, 0x63, 0xc7, 0xf6, 0x23, 0x68,
++      0x32, 0x7b, 0x3f, 0xbc, 0x10, 0x36, 0xc9, 0x43
++};
++static const u8 enc_assoc085[] __initconst = {
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
++};
++static const u8 enc_nonce085[] __initconst = {
++      0x00, 0x00, 0x00, 0x00, 0x06, 0x4c, 0x2d, 0x52
++};
++static const u8 enc_key085[] __initconst = {
++      0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
++      0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
++      0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
++      0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f
++};
++
++/* wycheproof - edge case intermediate sums in poly1305 */
++static const u8 enc_input093[] __initconst = {
++      0x00, 0x52, 0x35, 0xd2, 0xa9, 0x19, 0xf2, 0x8d,
++      0x3d, 0xb7, 0x66, 0x4a, 0x34, 0xae, 0x6b, 0x44,
++      0x4d, 0x3d, 0x35, 0xf6, 0x13, 0xe6, 0xd9, 0x09,
++      0x3d, 0x38, 0xe9, 0x75, 0xc3, 0x8f, 0xe3, 0xb8,
++      0x5b, 0x8b, 0x94, 0x50, 0x9e, 0x2b, 0x74, 0xa3,
++      0x6d, 0x34, 0x6e, 0x33, 0xd5, 0x72, 0x65, 0x9b,
++      0xa9, 0xf6, 0x37, 0x81, 0x71, 0xea, 0xe4, 0x39,
++      0x6e, 0xa1, 0x5d, 0xc2, 0x40, 0xd1, 0xab, 0xf4,
++      0x83, 0xdc, 0xe9, 0xf3, 0x07, 0x3e, 0xfa, 0xdb,
++      0x7d, 0x23, 0xb8, 0x7a, 0xce, 0x35, 0x16, 0x8c
++};
++static const u8 enc_output093[] __initconst = {
++      0x00, 0x39, 0xe2, 0xfd, 0x2f, 0xd3, 0x12, 0x14,
++      0x9e, 0x98, 0x98, 0x80, 0x88, 0x48, 0x13, 0xe7,
++      0xca, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0x3b, 0x0e, 0x86, 0x9a, 0xaa, 0x8e, 0xa4, 0x96,
++      0x32, 0xff, 0xff, 0x37, 0xb9, 0xe8, 0xce, 0x00,
++      0xca, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0x3b, 0x0e, 0x86, 0x9a, 0xaa, 0x8e, 0xa4, 0x96,
++      0x32, 0xff, 0xff, 0x37, 0xb9, 0xe8, 0xce, 0x00,
++      0xa5, 0x19, 0xac, 0x1a, 0x35, 0xb4, 0xa5, 0x77,
++      0x87, 0x51, 0x0a, 0xf7, 0x8d, 0x8d, 0x20, 0x0a
++};
++static const u8 enc_assoc093[] __initconst = {
++      0xff, 0xff, 0xff, 0xff
++};
++static const u8 enc_nonce093[] __initconst = {
++      0x00, 0x00, 0x00, 0x00, 0x06, 0x4c, 0x2d, 0x52
++};
++static const u8 enc_key093[] __initconst = {
++      0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
++      0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
++      0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
++      0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f
++};
++
++/* wycheproof - edge case intermediate sums in poly1305 */
++static const u8 enc_input094[] __initconst = {
++      0xd3, 0x94, 0x28, 0xd0, 0x79, 0x35, 0x1f, 0x66,
++      0x5c, 0xd0, 0x01, 0x35, 0x43, 0x19, 0x87, 0x5c,
++      0xe5, 0xda, 0x78, 0x76, 0x6f, 0xa1, 0x92, 0x90,
++      0xc0, 0x31, 0xf7, 0x52, 0x08, 0x50, 0x67, 0x45,
++      0xae, 0x7a, 0xed, 0x35, 0xcb, 0x5a, 0x2f, 0xca,
++      0xa0, 0x34, 0x6e, 0xfb, 0x93, 0x65, 0x54, 0x64,
++      0x49, 0x6d, 0xde, 0xb0, 0x55, 0x09, 0xc6, 0xef,
++      0xff, 0xab, 0x75, 0xeb, 0x2d, 0xf4, 0xab, 0x09,
++      0x76, 0x2d, 0x90, 0x96, 0x52, 0x4f, 0xa1, 0xb2,
++      0xb0, 0x23, 0xb8, 0xb2, 0x88, 0x22, 0x27, 0x73,
++      0x01, 0x49, 0xef, 0x50, 0x4b, 0x71, 0xb1, 0x20,
++      0xca, 0x4f, 0xf3, 0x95, 0x19, 0xc2, 0xc2, 0x10
++};
++static const u8 enc_output094[] __initconst = {
++      0xd3, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0x62, 0x18, 0xb2, 0x7f, 0x83, 0xb8, 0xb4, 0x66,
++      0x02, 0xf6, 0xe1, 0xd8, 0x34, 0x20, 0x7b, 0x02,
++      0xce, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0x2a, 0x64, 0x16, 0xce, 0xdb, 0x1c, 0xdd, 0x29,
++      0x6e, 0xf5, 0xd7, 0xd6, 0x92, 0xda, 0xff, 0x02,
++      0xce, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0x2a, 0x64, 0x16, 0xce, 0xdb, 0x1c, 0xdd, 0x29,
++      0x6e, 0xf5, 0xd7, 0xd6, 0x92, 0xda, 0xff, 0x02,
++      0x30, 0x2f, 0xe8, 0x2a, 0xb0, 0xa0, 0x9a, 0xf6,
++      0x44, 0x00, 0xd0, 0x15, 0xae, 0x83, 0xd9, 0xcc
++};
++static const u8 enc_assoc094[] __initconst = {
++      0xff, 0xff, 0xff, 0xff
++};
++static const u8 enc_nonce094[] __initconst = {
++      0x00, 0x00, 0x00, 0x00, 0x06, 0x4c, 0x2d, 0x52
++};
++static const u8 enc_key094[] __initconst = {
++      0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
++      0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
++      0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
++      0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f
++};
++
++/* wycheproof - edge case intermediate sums in poly1305 */
++static const u8 enc_input095[] __initconst = {
++      0xe9, 0x94, 0x28, 0xd0, 0x79, 0x35, 0x1f, 0x66,
++      0x5c, 0xd0, 0x01, 0x35, 0x43, 0x19, 0x87, 0x5c,
++      0x6d, 0xf1, 0x39, 0x4e, 0xdc, 0x53, 0x9b, 0x5b,
++      0x3a, 0x09, 0x57, 0xbe, 0x0f, 0xb8, 0x59, 0x46,
++      0x80, 0x7a, 0xed, 0x35, 0xcb, 0x5a, 0x2f, 0xca,
++      0xa0, 0x34, 0x6e, 0xfb, 0x93, 0x65, 0x54, 0x64,
++      0xd1, 0x76, 0x9f, 0xe8, 0x06, 0xbb, 0xfe, 0xb6,
++      0xf5, 0x90, 0x95, 0x0f, 0x2e, 0xac, 0x9e, 0x0a,
++      0x58, 0x2d, 0x90, 0x96, 0x52, 0x4f, 0xa1, 0xb2,
++      0xb0, 0x23, 0xb8, 0xb2, 0x88, 0x22, 0x27, 0x73,
++      0x99, 0x52, 0xae, 0x08, 0x18, 0xc3, 0x89, 0x79,
++      0xc0, 0x74, 0x13, 0x71, 0x1a, 0x9a, 0xf7, 0x13
++};
++static const u8 enc_output095[] __initconst = {
++      0xe9, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xea, 0x33, 0xf3, 0x47, 0x30, 0x4a, 0xbd, 0xad,
++      0xf8, 0xce, 0x41, 0x34, 0x33, 0xc8, 0x45, 0x01,
++      0xe0, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xb2, 0x7f, 0x57, 0x96, 0x88, 0xae, 0xe5, 0x70,
++      0x64, 0xce, 0x37, 0x32, 0x91, 0x82, 0xca, 0x01,
++      0xe0, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xb2, 0x7f, 0x57, 0x96, 0x88, 0xae, 0xe5, 0x70,
++      0x64, 0xce, 0x37, 0x32, 0x91, 0x82, 0xca, 0x01,
++      0x98, 0xa7, 0xe8, 0x36, 0xe0, 0xee, 0x4d, 0x02,
++      0x35, 0x00, 0xd0, 0x55, 0x7e, 0xc2, 0xcb, 0xe0
++};
++static const u8 enc_assoc095[] __initconst = {
++      0xff, 0xff, 0xff, 0xff
++};
++static const u8 enc_nonce095[] __initconst = {
++      0x00, 0x00, 0x00, 0x00, 0x06, 0x4c, 0x2d, 0x52
++};
++static const u8 enc_key095[] __initconst = {
++      0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
++      0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
++      0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
++      0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f
++};
++
++/* wycheproof - edge case intermediate sums in poly1305 */
++static const u8 enc_input096[] __initconst = {
++      0xff, 0x94, 0x28, 0xd0, 0x79, 0x35, 0x1f, 0x66,
++      0x5c, 0xd0, 0x01, 0x35, 0x43, 0x19, 0x87, 0x5c,
++      0x64, 0xf9, 0x0f, 0x5b, 0x26, 0x92, 0xb8, 0x60,
++      0xd4, 0x59, 0x6f, 0xf4, 0xb3, 0x40, 0x2c, 0x5c,
++      0x00, 0xb9, 0xbb, 0x53, 0x70, 0x7a, 0xa6, 0x67,
++      0xd3, 0x56, 0xfe, 0x50, 0xc7, 0x19, 0x96, 0x94,
++      0x03, 0x35, 0x61, 0xe7, 0xca, 0xca, 0x6d, 0x94,
++      0x1d, 0xc3, 0xcd, 0x69, 0x14, 0xad, 0x69, 0x04
++};
++static const u8 enc_output096[] __initconst = {
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xe3, 0x3b, 0xc5, 0x52, 0xca, 0x8b, 0x9e, 0x96,
++      0x16, 0x9e, 0x79, 0x7e, 0x8f, 0x30, 0x30, 0x1b,
++      0x60, 0x3c, 0xa9, 0x99, 0x44, 0xdf, 0x76, 0x52,
++      0x8c, 0x9d, 0x6f, 0x54, 0xab, 0x83, 0x3d, 0x0f,
++      0x60, 0x3c, 0xa9, 0x99, 0x44, 0xdf, 0x76, 0x52,
++      0x8c, 0x9d, 0x6f, 0x54, 0xab, 0x83, 0x3d, 0x0f,
++      0x6a, 0xb8, 0xdc, 0xe2, 0xc5, 0x9d, 0xa4, 0x73,
++      0x71, 0x30, 0xb0, 0x25, 0x2f, 0x68, 0xa8, 0xd8
++};
++static const u8 enc_assoc096[] __initconst = {
++      0xff, 0xff, 0xff, 0xff
++};
++static const u8 enc_nonce096[] __initconst = {
++      0x00, 0x00, 0x00, 0x00, 0x06, 0x4c, 0x2d, 0x52
++};
++static const u8 enc_key096[] __initconst = {
++      0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
++      0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
++      0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
++      0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f
++};
++
++/* wycheproof - edge case intermediate sums in poly1305 */
++static const u8 enc_input097[] __initconst = {
++      0x68, 0x94, 0x28, 0xd0, 0x79, 0x35, 0x1f, 0x66,
++      0x5c, 0xd0, 0x01, 0x35, 0x43, 0x19, 0x87, 0x5c,
++      0xb0, 0x8f, 0x25, 0x67, 0x5b, 0x9b, 0xcb, 0xf6,
++      0xe3, 0x84, 0x07, 0xde, 0x2e, 0xc7, 0x5a, 0x47,
++      0x9f, 0x7a, 0xed, 0x35, 0xcb, 0x5a, 0x2f, 0xca,
++      0xa0, 0x34, 0x6e, 0xfb, 0x93, 0x65, 0x54, 0x64,
++      0x2d, 0x2a, 0xf7, 0xcd, 0x6b, 0x08, 0x05, 0x01,
++      0xd3, 0x1b, 0xa5, 0x4f, 0xb2, 0xeb, 0x75, 0x96,
++      0x47, 0x2d, 0x90, 0x96, 0x52, 0x4f, 0xa1, 0xb2,
++      0xb0, 0x23, 0xb8, 0xb2, 0x88, 0x22, 0x27, 0x73,
++      0x65, 0x0e, 0xc6, 0x2d, 0x75, 0x70, 0x72, 0xce,
++      0xe6, 0xff, 0x23, 0x31, 0x86, 0xdd, 0x1c, 0x8f
++};
++static const u8 enc_output097[] __initconst = {
++      0x68, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0x37, 0x4d, 0xef, 0x6e, 0xb7, 0x82, 0xed, 0x00,
++      0x21, 0x43, 0x11, 0x54, 0x12, 0xb7, 0x46, 0x00,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0x4e, 0x23, 0x3f, 0xb3, 0xe5, 0x1d, 0x1e, 0xc7,
++      0x42, 0x45, 0x07, 0x72, 0x0d, 0xc5, 0x21, 0x9d,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0x4e, 0x23, 0x3f, 0xb3, 0xe5, 0x1d, 0x1e, 0xc7,
++      0x42, 0x45, 0x07, 0x72, 0x0d, 0xc5, 0x21, 0x9d,
++      0x04, 0x4d, 0xea, 0x60, 0x88, 0x80, 0x41, 0x2b,
++      0xfd, 0xff, 0xcf, 0x35, 0x57, 0x9e, 0x9b, 0x26
++};
++static const u8 enc_assoc097[] __initconst = {
++      0xff, 0xff, 0xff, 0xff
++};
++static const u8 enc_nonce097[] __initconst = {
++      0x00, 0x00, 0x00, 0x00, 0x06, 0x4c, 0x2d, 0x52
++};
++static const u8 enc_key097[] __initconst = {
++      0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
++      0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
++      0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
++      0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f
++};
++
++/* wycheproof - edge case intermediate sums in poly1305 */
++static const u8 enc_input098[] __initconst = {
++      0x6d, 0x94, 0x28, 0xd0, 0x79, 0x35, 0x1f, 0x66,
++      0x5c, 0xd0, 0x01, 0x35, 0x43, 0x19, 0x87, 0x5c,
++      0xa1, 0x61, 0xb5, 0xab, 0x04, 0x09, 0x00, 0x62,
++      0x9e, 0xfe, 0xff, 0x78, 0xd7, 0xd8, 0x6b, 0x45,
++      0x9f, 0x7a, 0xed, 0x35, 0xcb, 0x5a, 0x2f, 0xca,
++      0xa0, 0x34, 0x6e, 0xfb, 0x93, 0x65, 0x54, 0x64,
++      0xc6, 0xf8, 0x07, 0x8c, 0xc8, 0xef, 0x12, 0xa0,
++      0xff, 0x65, 0x7d, 0x6d, 0x08, 0xdb, 0x10, 0xb8,
++      0x47, 0x2d, 0x90, 0x96, 0x52, 0x4f, 0xa1, 0xb2,
++      0xb0, 0x23, 0xb8, 0xb2, 0x88, 0x22, 0x27, 0x73,
++      0x8e, 0xdc, 0x36, 0x6c, 0xd6, 0x97, 0x65, 0x6f,
++      0xca, 0x81, 0xfb, 0x13, 0x3c, 0xed, 0x79, 0xa1
++};
++static const u8 enc_output098[] __initconst = {
++      0x6d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0x26, 0xa3, 0x7f, 0xa2, 0xe8, 0x10, 0x26, 0x94,
++      0x5c, 0x39, 0xe9, 0xf2, 0xeb, 0xa8, 0x77, 0x02,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xa5, 0xf1, 0xcf, 0xf2, 0x46, 0xfa, 0x09, 0x66,
++      0x6e, 0x3b, 0xdf, 0x50, 0xb7, 0xf5, 0x44, 0xb3,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xa5, 0xf1, 0xcf, 0xf2, 0x46, 0xfa, 0x09, 0x66,
++      0x6e, 0x3b, 0xdf, 0x50, 0xb7, 0xf5, 0x44, 0xb3,
++      0x1e, 0x6b, 0xea, 0x63, 0x14, 0x54, 0x2e, 0x2e,
++      0xf9, 0xff, 0xcf, 0x45, 0x0b, 0x2e, 0x98, 0x2b
++};
++static const u8 enc_assoc098[] __initconst = {
++      0xff, 0xff, 0xff, 0xff
++};
++static const u8 enc_nonce098[] __initconst = {
++      0x00, 0x00, 0x00, 0x00, 0x06, 0x4c, 0x2d, 0x52
++};
++static const u8 enc_key098[] __initconst = {
++      0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
++      0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
++      0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
++      0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f
++};
++
++/* wycheproof - edge case intermediate sums in poly1305 */
++static const u8 enc_input099[] __initconst = {
++      0xff, 0x94, 0x28, 0xd0, 0x79, 0x35, 0x1f, 0x66,
++      0x5c, 0xd0, 0x01, 0x35, 0x43, 0x19, 0x87, 0x5c,
++      0xfc, 0x01, 0xb8, 0x91, 0xe5, 0xf0, 0xf9, 0x12,
++      0x8d, 0x7d, 0x1c, 0x57, 0x91, 0x92, 0xb6, 0x98,
++      0x63, 0x41, 0x44, 0x15, 0xb6, 0x99, 0x68, 0x95,
++      0x9a, 0x72, 0x91, 0xb7, 0xa5, 0xaf, 0x13, 0x48,
++      0x60, 0xcd, 0x9e, 0xa1, 0x0c, 0x29, 0xa3, 0x66,
++      0x54, 0xe7, 0xa2, 0x8e, 0x76, 0x1b, 0xec, 0xd8
++};
++static const u8 enc_output099[] __initconst = {
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0x7b, 0xc3, 0x72, 0x98, 0x09, 0xe9, 0xdf, 0xe4,
++      0x4f, 0xba, 0x0a, 0xdd, 0xad, 0xe2, 0xaa, 0xdf,
++      0x03, 0xc4, 0x56, 0xdf, 0x82, 0x3c, 0xb8, 0xa0,
++      0xc5, 0xb9, 0x00, 0xb3, 0xc9, 0x35, 0xb8, 0xd3,
++      0x03, 0xc4, 0x56, 0xdf, 0x82, 0x3c, 0xb8, 0xa0,
++      0xc5, 0xb9, 0x00, 0xb3, 0xc9, 0x35, 0xb8, 0xd3,
++      0xed, 0x20, 0x17, 0xc8, 0xdb, 0xa4, 0x77, 0x56,
++      0x29, 0x04, 0x9d, 0x78, 0x6e, 0x3b, 0xce, 0xb1
++};
++static const u8 enc_assoc099[] __initconst = {
++      0xff, 0xff, 0xff, 0xff
++};
++static const u8 enc_nonce099[] __initconst = {
++      0x00, 0x00, 0x00, 0x00, 0x06, 0x4c, 0x2d, 0x52
++};
++static const u8 enc_key099[] __initconst = {
++      0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
++      0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
++      0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
++      0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f
++};
++
++/* wycheproof - edge case intermediate sums in poly1305 */
++static const u8 enc_input100[] __initconst = {
++      0xff, 0x94, 0x28, 0xd0, 0x79, 0x35, 0x1f, 0x66,
++      0x5c, 0xd0, 0x01, 0x35, 0x43, 0x19, 0x87, 0x5c,
++      0x6b, 0x6d, 0xc9, 0xd2, 0x1a, 0x81, 0x9e, 0x70,
++      0xb5, 0x77, 0xf4, 0x41, 0x37, 0xd3, 0xd6, 0xbd,
++      0x13, 0x35, 0xf5, 0xeb, 0x44, 0x49, 0x40, 0x77,
++      0xb2, 0x64, 0x49, 0xa5, 0x4b, 0x6c, 0x7c, 0x75,
++      0x10, 0xb9, 0x2f, 0x5f, 0xfe, 0xf9, 0x8b, 0x84,
++      0x7c, 0xf1, 0x7a, 0x9c, 0x98, 0xd8, 0x83, 0xe5
++};
++static const u8 enc_output100[] __initconst = {
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xec, 0xaf, 0x03, 0xdb, 0xf6, 0x98, 0xb8, 0x86,
++      0x77, 0xb0, 0xe2, 0xcb, 0x0b, 0xa3, 0xca, 0xfa,
++      0x73, 0xb0, 0xe7, 0x21, 0x70, 0xec, 0x90, 0x42,
++      0xed, 0xaf, 0xd8, 0xa1, 0x27, 0xf6, 0xd7, 0xee,
++      0x73, 0xb0, 0xe7, 0x21, 0x70, 0xec, 0x90, 0x42,
++      0xed, 0xaf, 0xd8, 0xa1, 0x27, 0xf6, 0xd7, 0xee,
++      0x07, 0x3f, 0x17, 0xcb, 0x67, 0x78, 0x64, 0x59,
++      0x25, 0x04, 0x9d, 0x88, 0x22, 0xcb, 0xca, 0xb6
++};
++static const u8 enc_assoc100[] __initconst = {
++      0xff, 0xff, 0xff, 0xff
++};
++static const u8 enc_nonce100[] __initconst = {
++      0x00, 0x00, 0x00, 0x00, 0x06, 0x4c, 0x2d, 0x52
++};
++static const u8 enc_key100[] __initconst = {
++      0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
++      0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
++      0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
++      0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f
++};
++
++/* wycheproof - edge case intermediate sums in poly1305 */
++static const u8 enc_input101[] __initconst = {
++      0xff, 0xcb, 0x2b, 0x11, 0x06, 0xf8, 0x23, 0x4c,
++      0x5e, 0x99, 0xd4, 0xdb, 0x4c, 0x70, 0x48, 0xde,
++      0x32, 0x3d, 0x35, 0xf6, 0x13, 0xe6, 0xd9, 0x09,
++      0x3d, 0x38, 0xe9, 0x75, 0xc3, 0x8f, 0xe3, 0xb8,
++      0x16, 0xe9, 0x88, 0x4a, 0x11, 0x4f, 0x0e, 0x92,
++      0x66, 0xce, 0xa3, 0x88, 0x5f, 0xe3, 0x6b, 0x9f,
++      0xd6, 0xf6, 0x37, 0x81, 0x71, 0xea, 0xe4, 0x39,
++      0x6e, 0xa1, 0x5d, 0xc2, 0x40, 0xd1, 0xab, 0xf4,
++      0xce, 0xbe, 0xf5, 0xe9, 0x88, 0x5a, 0x80, 0xea,
++      0x76, 0xd9, 0x75, 0xc1, 0x44, 0xa4, 0x18, 0x88
++};
++static const u8 enc_output101[] __initconst = {
++      0xff, 0xa0, 0xfc, 0x3e, 0x80, 0x32, 0xc3, 0xd5,
++      0xfd, 0xb6, 0x2a, 0x11, 0xf0, 0x96, 0x30, 0x7d,
++      0xb5, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0x76, 0x6c, 0x9a, 0x80, 0x25, 0xea, 0xde, 0xa7,
++      0x39, 0x05, 0x32, 0x8c, 0x33, 0x79, 0xc0, 0x04,
++      0xb5, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0x76, 0x6c, 0x9a, 0x80, 0x25, 0xea, 0xde, 0xa7,
++      0x39, 0x05, 0x32, 0x8c, 0x33, 0x79, 0xc0, 0x04,
++      0x8b, 0x9b, 0xb4, 0xb4, 0x86, 0x12, 0x89, 0x65,
++      0x8c, 0x69, 0x6a, 0x83, 0x40, 0x15, 0x04, 0x05
++};
++static const u8 enc_assoc101[] __initconst = {
++      0xff, 0xff, 0xff, 0xff
++};
++static const u8 enc_nonce101[] __initconst = {
++      0x00, 0x00, 0x00, 0x00, 0x06, 0x4c, 0x2d, 0x52
++};
++static const u8 enc_key101[] __initconst = {
++      0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
++      0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
++      0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
++      0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f
++};
++
++/* wycheproof - edge case intermediate sums in poly1305 */
++static const u8 enc_input102[] __initconst = {
++      0x6f, 0x9e, 0x70, 0xed, 0x3b, 0x8b, 0xac, 0xa0,
++      0x26, 0xe4, 0x6a, 0x5a, 0x09, 0x43, 0x15, 0x8d,
++      0x21, 0x3d, 0x35, 0xf6, 0x13, 0xe6, 0xd9, 0x09,
++      0x3d, 0x38, 0xe9, 0x75, 0xc3, 0x8f, 0xe3, 0xb8,
++      0x0c, 0x61, 0x2c, 0x5e, 0x8d, 0x89, 0xa8, 0x73,
++      0xdb, 0xca, 0xad, 0x5b, 0x73, 0x46, 0x42, 0x9b,
++      0xc5, 0xf6, 0x37, 0x81, 0x71, 0xea, 0xe4, 0x39,
++      0x6e, 0xa1, 0x5d, 0xc2, 0x40, 0xd1, 0xab, 0xf4,
++      0xd4, 0x36, 0x51, 0xfd, 0x14, 0x9c, 0x26, 0x0b,
++      0xcb, 0xdd, 0x7b, 0x12, 0x68, 0x01, 0x31, 0x8c
++};
++static const u8 enc_output102[] __initconst = {
++      0x6f, 0xf5, 0xa7, 0xc2, 0xbd, 0x41, 0x4c, 0x39,
++      0x85, 0xcb, 0x94, 0x90, 0xb5, 0xa5, 0x6d, 0x2e,
++      0xa6, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0x6c, 0xe4, 0x3e, 0x94, 0xb9, 0x2c, 0x78, 0x46,
++      0x84, 0x01, 0x3c, 0x5f, 0x1f, 0xdc, 0xe9, 0x00,
++      0xa6, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0x6c, 0xe4, 0x3e, 0x94, 0xb9, 0x2c, 0x78, 0x46,
++      0x84, 0x01, 0x3c, 0x5f, 0x1f, 0xdc, 0xe9, 0x00,
++      0x8b, 0x3b, 0xbd, 0x51, 0x64, 0x44, 0x59, 0x56,
++      0x8d, 0x81, 0xca, 0x1f, 0xa7, 0x2c, 0xe4, 0x04
++};
++static const u8 enc_assoc102[] __initconst = {
++      0xff, 0xff, 0xff, 0xff
++};
++static const u8 enc_nonce102[] __initconst = {
++      0x00, 0x00, 0x00, 0x00, 0x06, 0x4c, 0x2d, 0x52
++};
++static const u8 enc_key102[] __initconst = {
++      0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
++      0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
++      0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
++      0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f
++};
++
++/* wycheproof - edge case intermediate sums in poly1305 */
++static const u8 enc_input103[] __initconst = {
++      0x41, 0x2b, 0x08, 0x0a, 0x3e, 0x19, 0xc1, 0x0d,
++      0x44, 0xa1, 0xaf, 0x1e, 0xab, 0xde, 0xb4, 0xce,
++      0x35, 0x3d, 0x35, 0xf6, 0x13, 0xe6, 0xd9, 0x09,
++      0x3d, 0x38, 0xe9, 0x75, 0xc3, 0x8f, 0xe3, 0xb8,
++      0x6b, 0x83, 0x94, 0x33, 0x09, 0x21, 0x48, 0x6c,
++      0xa1, 0x1d, 0x29, 0x1c, 0x3e, 0x97, 0xee, 0x9a,
++      0xd1, 0xf6, 0x37, 0x81, 0x71, 0xea, 0xe4, 0x39,
++      0x6e, 0xa1, 0x5d, 0xc2, 0x40, 0xd1, 0xab, 0xf4,
++      0xb3, 0xd4, 0xe9, 0x90, 0x90, 0x34, 0xc6, 0x14,
++      0xb1, 0x0a, 0xff, 0x55, 0x25, 0xd0, 0x9d, 0x8d
++};
++static const u8 enc_output103[] __initconst = {
++      0x41, 0x40, 0xdf, 0x25, 0xb8, 0xd3, 0x21, 0x94,
++      0xe7, 0x8e, 0x51, 0xd4, 0x17, 0x38, 0xcc, 0x6d,
++      0xb2, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0x0b, 0x06, 0x86, 0xf9, 0x3d, 0x84, 0x98, 0x59,
++      0xfe, 0xd6, 0xb8, 0x18, 0x52, 0x0d, 0x45, 0x01,
++      0xb2, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0x0b, 0x06, 0x86, 0xf9, 0x3d, 0x84, 0x98, 0x59,
++      0xfe, 0xd6, 0xb8, 0x18, 0x52, 0x0d, 0x45, 0x01,
++      0x86, 0xfb, 0xab, 0x2b, 0x4a, 0x94, 0xf4, 0x7a,
++      0xa5, 0x6f, 0x0a, 0xea, 0x65, 0xd1, 0x10, 0x08
++};
++static const u8 enc_assoc103[] __initconst = {
++      0xff, 0xff, 0xff, 0xff
++};
++static const u8 enc_nonce103[] __initconst = {
++      0x00, 0x00, 0x00, 0x00, 0x06, 0x4c, 0x2d, 0x52
++};
++static const u8 enc_key103[] __initconst = {
++      0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
++      0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
++      0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
++      0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f
++};
++
++/* wycheproof - edge case intermediate sums in poly1305 */
++static const u8 enc_input104[] __initconst = {
++      0xb2, 0x47, 0xa7, 0x47, 0x23, 0x49, 0x1a, 0xac,
++      0xac, 0xaa, 0xd7, 0x09, 0xc9, 0x1e, 0x93, 0x2b,
++      0x31, 0x3d, 0x35, 0xf6, 0x13, 0xe6, 0xd9, 0x09,
++      0x3d, 0x38, 0xe9, 0x75, 0xc3, 0x8f, 0xe3, 0xb8,
++      0x9a, 0xde, 0x04, 0xe7, 0x5b, 0xb7, 0x01, 0xd9,
++      0x66, 0x06, 0x01, 0xb3, 0x47, 0x65, 0xde, 0x98,
++      0xd5, 0xf6, 0x37, 0x81, 0x71, 0xea, 0xe4, 0x39,
++      0x6e, 0xa1, 0x5d, 0xc2, 0x40, 0xd1, 0xab, 0xf4,
++      0x42, 0x89, 0x79, 0x44, 0xc2, 0xa2, 0x8f, 0xa1,
++      0x76, 0x11, 0xd7, 0xfa, 0x5c, 0x22, 0xad, 0x8f
++};
++static const u8 enc_output104[] __initconst = {
++      0xb2, 0x2c, 0x70, 0x68, 0xa5, 0x83, 0xfa, 0x35,
++      0x0f, 0x85, 0x29, 0xc3, 0x75, 0xf8, 0xeb, 0x88,
++      0xb6, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xfa, 0x5b, 0x16, 0x2d, 0x6f, 0x12, 0xd1, 0xec,
++      0x39, 0xcd, 0x90, 0xb7, 0x2b, 0xff, 0x75, 0x03,
++      0xb6, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xfa, 0x5b, 0x16, 0x2d, 0x6f, 0x12, 0xd1, 0xec,
++      0x39, 0xcd, 0x90, 0xb7, 0x2b, 0xff, 0x75, 0x03,
++      0xa0, 0x19, 0xac, 0x2e, 0xd6, 0x67, 0xe1, 0x7d,
++      0xa1, 0x6f, 0x0a, 0xfa, 0x19, 0x61, 0x0d, 0x0d
++};
++static const u8 enc_assoc104[] __initconst = {
++      0xff, 0xff, 0xff, 0xff
++};
++static const u8 enc_nonce104[] __initconst = {
++      0x00, 0x00, 0x00, 0x00, 0x06, 0x4c, 0x2d, 0x52
++};
++static const u8 enc_key104[] __initconst = {
++      0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
++      0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
++      0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
++      0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f
++};
++
++/* wycheproof - edge case intermediate sums in poly1305 */
++static const u8 enc_input105[] __initconst = {
++      0x74, 0x0f, 0x9e, 0x49, 0xf6, 0x10, 0xef, 0xa5,
++      0x85, 0xb6, 0x59, 0xca, 0x6e, 0xd8, 0xb4, 0x99,
++      0x2d, 0x3d, 0x35, 0xf6, 0x13, 0xe6, 0xd9, 0x09,
++      0x3d, 0x38, 0xe9, 0x75, 0xc3, 0x8f, 0xe3, 0xb8,
++      0x41, 0x2d, 0x96, 0xaf, 0xbe, 0x80, 0xec, 0x3e,
++      0x79, 0xd4, 0x51, 0xb0, 0x0a, 0x2d, 0xb2, 0x9a,
++      0xc9, 0xf6, 0x37, 0x81, 0x71, 0xea, 0xe4, 0x39,
++      0x6e, 0xa1, 0x5d, 0xc2, 0x40, 0xd1, 0xab, 0xf4,
++      0x99, 0x7a, 0xeb, 0x0c, 0x27, 0x95, 0x62, 0x46,
++      0x69, 0xc3, 0x87, 0xf9, 0x11, 0x6a, 0xc1, 0x8d
++};
++static const u8 enc_output105[] __initconst = {
++      0x74, 0x64, 0x49, 0x66, 0x70, 0xda, 0x0f, 0x3c,
++      0x26, 0x99, 0xa7, 0x00, 0xd2, 0x3e, 0xcc, 0x3a,
++      0xaa, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0x21, 0xa8, 0x84, 0x65, 0x8a, 0x25, 0x3c, 0x0b,
++      0x26, 0x1f, 0xc0, 0xb4, 0x66, 0xb7, 0x19, 0x01,
++      0xaa, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0x21, 0xa8, 0x84, 0x65, 0x8a, 0x25, 0x3c, 0x0b,
++      0x26, 0x1f, 0xc0, 0xb4, 0x66, 0xb7, 0x19, 0x01,
++      0x73, 0x6e, 0x18, 0x18, 0x16, 0x96, 0xa5, 0x88,
++      0x9c, 0x31, 0x59, 0xfa, 0xab, 0xab, 0x20, 0xfd
++};
++static const u8 enc_assoc105[] __initconst = {
++      0xff, 0xff, 0xff, 0xff
++};
++static const u8 enc_nonce105[] __initconst = {
++      0x00, 0x00, 0x00, 0x00, 0x06, 0x4c, 0x2d, 0x52
++};
++static const u8 enc_key105[] __initconst = {
++      0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
++      0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
++      0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
++      0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f
++};
++
++/* wycheproof - edge case intermediate sums in poly1305 */
++static const u8 enc_input106[] __initconst = {
++      0xad, 0xba, 0x5d, 0x10, 0x5b, 0xc8, 0xaa, 0x06,
++      0x2c, 0x23, 0x36, 0xcb, 0x88, 0x9d, 0xdb, 0xd5,
++      0x37, 0x3d, 0x35, 0xf6, 0x13, 0xe6, 0xd9, 0x09,
++      0x3d, 0x38, 0xe9, 0x75, 0xc3, 0x8f, 0xe3, 0xb8,
++      0x17, 0x7c, 0x5f, 0xfe, 0x28, 0x75, 0xf4, 0x68,
++      0xf6, 0xc2, 0x96, 0x57, 0x48, 0xf3, 0x59, 0x9a,
++      0xd3, 0xf6, 0x37, 0x81, 0x71, 0xea, 0xe4, 0x39,
++      0x6e, 0xa1, 0x5d, 0xc2, 0x40, 0xd1, 0xab, 0xf4,
++      0xcf, 0x2b, 0x22, 0x5d, 0xb1, 0x60, 0x7a, 0x10,
++      0xe6, 0xd5, 0x40, 0x1e, 0x53, 0xb4, 0x2a, 0x8d
++};
++static const u8 enc_output106[] __initconst = {
++      0xad, 0xd1, 0x8a, 0x3f, 0xdd, 0x02, 0x4a, 0x9f,
++      0x8f, 0x0c, 0xc8, 0x01, 0x34, 0x7b, 0xa3, 0x76,
++      0xb0, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0x77, 0xf9, 0x4d, 0x34, 0x1c, 0xd0, 0x24, 0x5d,
++      0xa9, 0x09, 0x07, 0x53, 0x24, 0x69, 0xf2, 0x01,
++      0xb0, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0x77, 0xf9, 0x4d, 0x34, 0x1c, 0xd0, 0x24, 0x5d,
++      0xa9, 0x09, 0x07, 0x53, 0x24, 0x69, 0xf2, 0x01,
++      0xba, 0xd5, 0x8f, 0x10, 0xa9, 0x1e, 0x6a, 0x88,
++      0x9a, 0xba, 0x32, 0xfd, 0x17, 0xd8, 0x33, 0x1a
++};
++static const u8 enc_assoc106[] __initconst = {
++      0xff, 0xff, 0xff, 0xff
++};
++static const u8 enc_nonce106[] __initconst = {
++      0x00, 0x00, 0x00, 0x00, 0x06, 0x4c, 0x2d, 0x52
++};
++static const u8 enc_key106[] __initconst = {
++      0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
++      0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
++      0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
++      0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f
++};
++
++/* wycheproof - edge case intermediate sums in poly1305 */
++static const u8 enc_input107[] __initconst = {
++      0xfe, 0x94, 0x28, 0xd0, 0x79, 0x35, 0x1f, 0x66,
++      0x5c, 0xd0, 0x01, 0x35, 0x43, 0x19, 0x87, 0x5c,
++      0xc0, 0x01, 0xed, 0xc5, 0xda, 0x44, 0x2e, 0x71,
++      0x9b, 0xce, 0x9a, 0xbe, 0x27, 0x3a, 0xf1, 0x44,
++      0xb4, 0x7a, 0xed, 0x35, 0xcb, 0x5a, 0x2f, 0xca,
++      0xa0, 0x34, 0x6e, 0xfb, 0x93, 0x65, 0x54, 0x64,
++      0x48, 0x02, 0x5f, 0x41, 0xfa, 0x4e, 0x33, 0x6c,
++      0x78, 0x69, 0x57, 0xa2, 0xa7, 0xc4, 0x93, 0x0a,
++      0x6c, 0x2d, 0x90, 0x96, 0x52, 0x4f, 0xa1, 0xb2,
++      0xb0, 0x23, 0xb8, 0xb2, 0x88, 0x22, 0x27, 0x73,
++      0x00, 0x26, 0x6e, 0xa1, 0xe4, 0x36, 0x44, 0xa3,
++      0x4d, 0x8d, 0xd1, 0xdc, 0x93, 0xf2, 0xfa, 0x13
++};
++static const u8 enc_output107[] __initconst = {
++      0xfe, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0x47, 0xc3, 0x27, 0xcc, 0x36, 0x5d, 0x08, 0x87,
++      0x59, 0x09, 0x8c, 0x34, 0x1b, 0x4a, 0xed, 0x03,
++      0xd4, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0x2b, 0x0b, 0x97, 0x3f, 0x74, 0x5b, 0x28, 0xaa,
++      0xe9, 0x37, 0xf5, 0x9f, 0x18, 0xea, 0xc7, 0x01,
++      0xd4, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0x2b, 0x0b, 0x97, 0x3f, 0x74, 0x5b, 0x28, 0xaa,
++      0xe9, 0x37, 0xf5, 0x9f, 0x18, 0xea, 0xc7, 0x01,
++      0xd6, 0x8c, 0xe1, 0x74, 0x07, 0x9a, 0xdd, 0x02,
++      0x8d, 0xd0, 0x5c, 0xf8, 0x14, 0x63, 0x04, 0x88
++};
++static const u8 enc_assoc107[] __initconst = {
++      0xff, 0xff, 0xff, 0xff
++};
++static const u8 enc_nonce107[] __initconst = {
++      0x00, 0x00, 0x00, 0x00, 0x06, 0x4c, 0x2d, 0x52
++};
++static const u8 enc_key107[] __initconst = {
++      0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
++      0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
++      0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
++      0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f
++};
++
++/* wycheproof - edge case intermediate sums in poly1305 */
++static const u8 enc_input108[] __initconst = {
++      0xb5, 0x13, 0xb0, 0x6a, 0xb9, 0xac, 0x14, 0x43,
++      0x5a, 0xcb, 0x8a, 0xa3, 0xa3, 0x7a, 0xfd, 0xb6,
++      0x54, 0x3d, 0x35, 0xf6, 0x13, 0xe6, 0xd9, 0x09,
++      0x3d, 0x38, 0xe9, 0x75, 0xc3, 0x8f, 0xe3, 0xb8,
++      0x61, 0x95, 0x01, 0x93, 0xb1, 0xbf, 0x03, 0x11,
++      0xff, 0x11, 0x79, 0x89, 0xae, 0xd9, 0xa9, 0x99,
++      0xb0, 0xf6, 0x37, 0x81, 0x71, 0xea, 0xe4, 0x39,
++      0x6e, 0xa1, 0x5d, 0xc2, 0x40, 0xd1, 0xab, 0xf4,
++      0xb9, 0xc2, 0x7c, 0x30, 0x28, 0xaa, 0x8d, 0x69,
++      0xef, 0x06, 0xaf, 0xc0, 0xb5, 0x9e, 0xda, 0x8e
++};
++static const u8 enc_output108[] __initconst = {
++      0xb5, 0x78, 0x67, 0x45, 0x3f, 0x66, 0xf4, 0xda,
++      0xf9, 0xe4, 0x74, 0x69, 0x1f, 0x9c, 0x85, 0x15,
++      0xd3, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0x01, 0x10, 0x13, 0x59, 0x85, 0x1a, 0xd3, 0x24,
++      0xa0, 0xda, 0xe8, 0x8d, 0xc2, 0x43, 0x02, 0x02,
++      0xd3, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0x01, 0x10, 0x13, 0x59, 0x85, 0x1a, 0xd3, 0x24,
++      0xa0, 0xda, 0xe8, 0x8d, 0xc2, 0x43, 0x02, 0x02,
++      0xaa, 0x48, 0xa3, 0x88, 0x7d, 0x4b, 0x05, 0x96,
++      0x99, 0xc2, 0xfd, 0xf9, 0xc6, 0x78, 0x7e, 0x0a
++};
++static const u8 enc_assoc108[] __initconst = {
++      0xff, 0xff, 0xff, 0xff
++};
++static const u8 enc_nonce108[] __initconst = {
++      0x00, 0x00, 0x00, 0x00, 0x06, 0x4c, 0x2d, 0x52
++};
++static const u8 enc_key108[] __initconst = {
++      0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
++      0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
++      0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
++      0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f
++};
++
++/* wycheproof - edge case intermediate sums in poly1305 */
++static const u8 enc_input109[] __initconst = {
++      0xff, 0x94, 0x28, 0xd0, 0x79, 0x35, 0x1f, 0x66,
++      0x5c, 0xd0, 0x01, 0x35, 0x43, 0x19, 0x87, 0x5c,
++      0xd4, 0xf1, 0x09, 0xe8, 0x14, 0xce, 0xa8, 0x5a,
++      0x08, 0xc0, 0x11, 0xd8, 0x50, 0xdd, 0x1d, 0xcb,
++      0xcf, 0x7a, 0xed, 0x35, 0xcb, 0x5a, 0x2f, 0xca,
++      0xa0, 0x34, 0x6e, 0xfb, 0x93, 0x65, 0x54, 0x64,
++      0x53, 0x40, 0xb8, 0x5a, 0x9a, 0xa0, 0x82, 0x96,
++      0xb7, 0x7a, 0x5f, 0xc3, 0x96, 0x1f, 0x66, 0x0f,
++      0x17, 0x2d, 0x90, 0x96, 0x52, 0x4f, 0xa1, 0xb2,
++      0xb0, 0x23, 0xb8, 0xb2, 0x88, 0x22, 0x27, 0x73,
++      0x1b, 0x64, 0x89, 0xba, 0x84, 0xd8, 0xf5, 0x59,
++      0x82, 0x9e, 0xd9, 0xbd, 0xa2, 0x29, 0x0f, 0x16
++};
++static const u8 enc_output109[] __initconst = {
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0x53, 0x33, 0xc3, 0xe1, 0xf8, 0xd7, 0x8e, 0xac,
++      0xca, 0x07, 0x07, 0x52, 0x6c, 0xad, 0x01, 0x8c,
++      0xaf, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0x30, 0x49, 0x70, 0x24, 0x14, 0xb5, 0x99, 0x50,
++      0x26, 0x24, 0xfd, 0xfe, 0x29, 0x31, 0x32, 0x04,
++      0xaf, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0x30, 0x49, 0x70, 0x24, 0x14, 0xb5, 0x99, 0x50,
++      0x26, 0x24, 0xfd, 0xfe, 0x29, 0x31, 0x32, 0x04,
++      0xb9, 0x36, 0xa8, 0x17, 0xf2, 0x21, 0x1a, 0xf1,
++      0x29, 0xe2, 0xcf, 0x16, 0x0f, 0xd4, 0x2b, 0xcb
++};
++static const u8 enc_assoc109[] __initconst = {
++      0xff, 0xff, 0xff, 0xff
++};
++static const u8 enc_nonce109[] __initconst = {
++      0x00, 0x00, 0x00, 0x00, 0x06, 0x4c, 0x2d, 0x52
++};
++static const u8 enc_key109[] __initconst = {
++      0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
++      0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
++      0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
++      0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f
++};
++
++/* wycheproof - edge case intermediate sums in poly1305 */
++static const u8 enc_input110[] __initconst = {
++      0xff, 0x94, 0x28, 0xd0, 0x79, 0x35, 0x1f, 0x66,
++      0x5c, 0xd0, 0x01, 0x35, 0x43, 0x19, 0x87, 0x5c,
++      0xdf, 0x4c, 0x62, 0x03, 0x2d, 0x41, 0x19, 0xb5,
++      0x88, 0x47, 0x7e, 0x99, 0x92, 0x5a, 0x56, 0xd9,
++      0xd6, 0x7a, 0xed, 0x35, 0xcb, 0x5a, 0x2f, 0xca,
++      0xa0, 0x34, 0x6e, 0xfb, 0x93, 0x65, 0x54, 0x64,
++      0xfa, 0x84, 0xf0, 0x64, 0x55, 0x36, 0x42, 0x1b,
++      0x2b, 0xb9, 0x24, 0x6e, 0xc2, 0x19, 0xed, 0x0b,
++      0x0e, 0x2d, 0x90, 0x96, 0x52, 0x4f, 0xa1, 0xb2,
++      0xb0, 0x23, 0xb8, 0xb2, 0x88, 0x22, 0x27, 0x73,
++      0xb2, 0xa0, 0xc1, 0x84, 0x4b, 0x4e, 0x35, 0xd4,
++      0x1e, 0x5d, 0xa2, 0x10, 0xf6, 0x2f, 0x84, 0x12
++};
++static const u8 enc_output110[] __initconst = {
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0x58, 0x8e, 0xa8, 0x0a, 0xc1, 0x58, 0x3f, 0x43,
++      0x4a, 0x80, 0x68, 0x13, 0xae, 0x2a, 0x4a, 0x9e,
++      0xb6, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0x99, 0x8d, 0x38, 0x1a, 0xdb, 0x23, 0x59, 0xdd,
++      0xba, 0xe7, 0x86, 0x53, 0x7d, 0x37, 0xb9, 0x00,
++      0xb6, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0x99, 0x8d, 0x38, 0x1a, 0xdb, 0x23, 0x59, 0xdd,
++      0xba, 0xe7, 0x86, 0x53, 0x7d, 0x37, 0xb9, 0x00,
++      0x9f, 0x7a, 0xc4, 0x35, 0x1f, 0x6b, 0x91, 0xe6,
++      0x30, 0x97, 0xa7, 0x13, 0x11, 0x5d, 0x05, 0xbe
++};
++static const u8 enc_assoc110[] __initconst = {
++      0xff, 0xff, 0xff, 0xff
++};
++static const u8 enc_nonce110[] __initconst = {
++      0x00, 0x00, 0x00, 0x00, 0x06, 0x4c, 0x2d, 0x52
++};
++static const u8 enc_key110[] __initconst = {
++      0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
++      0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
++      0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
++      0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f
++};
++
++/* wycheproof - edge case intermediate sums in poly1305 */
++static const u8 enc_input111[] __initconst = {
++      0xff, 0x94, 0x28, 0xd0, 0x79, 0x35, 0x1f, 0x66,
++      0x5c, 0xd0, 0x01, 0x35, 0x43, 0x19, 0x87, 0x5c,
++      0x13, 0xf8, 0x0a, 0x00, 0x6d, 0xc1, 0xbb, 0xda,
++      0xd6, 0x39, 0xa9, 0x2f, 0xc7, 0xec, 0xa6, 0x55,
++      0xf7, 0x7a, 0xed, 0x35, 0xcb, 0x5a, 0x2f, 0xca,
++      0xa0, 0x34, 0x6e, 0xfb, 0x93, 0x65, 0x54, 0x64,
++      0x63, 0x48, 0xb8, 0xfd, 0x29, 0xbf, 0x96, 0xd5,
++      0x63, 0xa5, 0x17, 0xe2, 0x7d, 0x7b, 0xfc, 0x0f,
++      0x2f, 0x2d, 0x90, 0x96, 0x52, 0x4f, 0xa1, 0xb2,
++      0xb0, 0x23, 0xb8, 0xb2, 0x88, 0x22, 0x27, 0x73,
++      0x2b, 0x6c, 0x89, 0x1d, 0x37, 0xc7, 0xe1, 0x1a,
++      0x56, 0x41, 0x91, 0x9c, 0x49, 0x4d, 0x95, 0x16
++};
++static const u8 enc_output111[] __initconst = {
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0x94, 0x3a, 0xc0, 0x09, 0x81, 0xd8, 0x9d, 0x2c,
++      0x14, 0xfe, 0xbf, 0xa5, 0xfb, 0x9c, 0xba, 0x12,
++      0x97, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0x00, 0x41, 0x70, 0x83, 0xa7, 0xaa, 0x8d, 0x13,
++      0xf2, 0xfb, 0xb5, 0xdf, 0xc2, 0x55, 0xa8, 0x04,
++      0x97, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0x00, 0x41, 0x70, 0x83, 0xa7, 0xaa, 0x8d, 0x13,
++      0xf2, 0xfb, 0xb5, 0xdf, 0xc2, 0x55, 0xa8, 0x04,
++      0x9a, 0x18, 0xa8, 0x28, 0x07, 0x02, 0x69, 0xf4,
++      0x47, 0x00, 0xd0, 0x09, 0xe7, 0x17, 0x1c, 0xc9
++};
++static const u8 enc_assoc111[] __initconst = {
++      0xff, 0xff, 0xff, 0xff
++};
++static const u8 enc_nonce111[] __initconst = {
++      0x00, 0x00, 0x00, 0x00, 0x06, 0x4c, 0x2d, 0x52
++};
++static const u8 enc_key111[] __initconst = {
++      0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
++      0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
++      0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
++      0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f
++};
++
++/* wycheproof - edge case intermediate sums in poly1305 */
++static const u8 enc_input112[] __initconst = {
++      0xff, 0x94, 0x28, 0xd0, 0x79, 0x35, 0x1f, 0x66,
++      0x5c, 0xd0, 0x01, 0x35, 0x43, 0x19, 0x87, 0x5c,
++      0x82, 0xe5, 0x9b, 0x45, 0x82, 0x91, 0x50, 0x38,
++      0xf9, 0x33, 0x81, 0x1e, 0x65, 0x2d, 0xc6, 0x6a,
++      0xfc, 0x7a, 0xed, 0x35, 0xcb, 0x5a, 0x2f, 0xca,
++      0xa0, 0x34, 0x6e, 0xfb, 0x93, 0x65, 0x54, 0x64,
++      0xb6, 0x71, 0xc8, 0xca, 0xc2, 0x70, 0xc2, 0x65,
++      0xa0, 0xac, 0x2f, 0x53, 0x57, 0x99, 0x88, 0x0a,
++      0x24, 0x2d, 0x90, 0x96, 0x52, 0x4f, 0xa1, 0xb2,
++      0xb0, 0x23, 0xb8, 0xb2, 0x88, 0x22, 0x27, 0x73,
++      0xfe, 0x55, 0xf9, 0x2a, 0xdc, 0x08, 0xb5, 0xaa,
++      0x95, 0x48, 0xa9, 0x2d, 0x63, 0xaf, 0xe1, 0x13
++};
++static const u8 enc_output112[] __initconst = {
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0x05, 0x27, 0x51, 0x4c, 0x6e, 0x88, 0x76, 0xce,
++      0x3b, 0xf4, 0x97, 0x94, 0x59, 0x5d, 0xda, 0x2d,
++      0x9c, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xd5, 0x78, 0x00, 0xb4, 0x4c, 0x65, 0xd9, 0xa3,
++      0x31, 0xf2, 0x8d, 0x6e, 0xe8, 0xb7, 0xdc, 0x01,
++      0x9c, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xd5, 0x78, 0x00, 0xb4, 0x4c, 0x65, 0xd9, 0xa3,
++      0x31, 0xf2, 0x8d, 0x6e, 0xe8, 0xb7, 0xdc, 0x01,
++      0xb4, 0x36, 0xa8, 0x2b, 0x93, 0xd5, 0x55, 0xf7,
++      0x43, 0x00, 0xd0, 0x19, 0x9b, 0xa7, 0x18, 0xce
++};
++static const u8 enc_assoc112[] __initconst = {
++      0xff, 0xff, 0xff, 0xff
++};
++static const u8 enc_nonce112[] __initconst = {
++      0x00, 0x00, 0x00, 0x00, 0x06, 0x4c, 0x2d, 0x52
++};
++static const u8 enc_key112[] __initconst = {
++      0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
++      0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
++      0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
++      0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f
++};
++
++/* wycheproof - edge case intermediate sums in poly1305 */
++static const u8 enc_input113[] __initconst = {
++      0xff, 0x94, 0x28, 0xd0, 0x79, 0x35, 0x1f, 0x66,
++      0x5c, 0xd0, 0x01, 0x35, 0x43, 0x19, 0x87, 0x5c,
++      0xf1, 0xd1, 0x28, 0x87, 0xb7, 0x21, 0x69, 0x86,
++      0xa1, 0x2d, 0x79, 0x09, 0x8b, 0x6d, 0xe6, 0x0f,
++      0xc0, 0x7a, 0xed, 0x35, 0xcb, 0x5a, 0x2f, 0xca,
++      0xa0, 0x34, 0x6e, 0xfb, 0x93, 0x65, 0x54, 0x64,
++      0xa7, 0xc7, 0x58, 0x99, 0xf3, 0xe6, 0x0a, 0xf1,
++      0xfc, 0xb6, 0xc7, 0x30, 0x7d, 0x87, 0x59, 0x0f,
++      0x18, 0x2d, 0x90, 0x96, 0x52, 0x4f, 0xa1, 0xb2,
++      0xb0, 0x23, 0xb8, 0xb2, 0x88, 0x22, 0x27, 0x73,
++      0xef, 0xe3, 0x69, 0x79, 0xed, 0x9e, 0x7d, 0x3e,
++      0xc9, 0x52, 0x41, 0x4e, 0x49, 0xb1, 0x30, 0x16
++};
++static const u8 enc_output113[] __initconst = {
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0x76, 0x13, 0xe2, 0x8e, 0x5b, 0x38, 0x4f, 0x70,
++      0x63, 0xea, 0x6f, 0x83, 0xb7, 0x1d, 0xfa, 0x48,
++      0xa0, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xc4, 0xce, 0x90, 0xe7, 0x7d, 0xf3, 0x11, 0x37,
++      0x6d, 0xe8, 0x65, 0x0d, 0xc2, 0xa9, 0x0d, 0x04,
++      0xa0, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xc4, 0xce, 0x90, 0xe7, 0x7d, 0xf3, 0x11, 0x37,
++      0x6d, 0xe8, 0x65, 0x0d, 0xc2, 0xa9, 0x0d, 0x04,
++      0xce, 0x54, 0xa8, 0x2e, 0x1f, 0xa9, 0x42, 0xfa,
++      0x3f, 0x00, 0xd0, 0x29, 0x4f, 0x37, 0x15, 0xd3
++};
++static const u8 enc_assoc113[] __initconst = {
++      0xff, 0xff, 0xff, 0xff
++};
++static const u8 enc_nonce113[] __initconst = {
++      0x00, 0x00, 0x00, 0x00, 0x06, 0x4c, 0x2d, 0x52
++};
++static const u8 enc_key113[] __initconst = {
++      0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
++      0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
++      0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
++      0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f
++};
++
++/* wycheproof - edge case intermediate sums in poly1305 */
++static const u8 enc_input114[] __initconst = {
++      0xcb, 0xf1, 0xda, 0x9e, 0x0b, 0xa9, 0x37, 0x73,
++      0x74, 0xe6, 0x9e, 0x1c, 0x0e, 0x60, 0x0c, 0xfc,
++      0x34, 0x3d, 0x35, 0xf6, 0x13, 0xe6, 0xd9, 0x09,
++      0x3d, 0x38, 0xe9, 0x75, 0xc3, 0x8f, 0xe3, 0xb8,
++      0xbe, 0x3f, 0xa6, 0x6b, 0x6c, 0xe7, 0x80, 0x8a,
++      0xa3, 0xe4, 0x59, 0x49, 0xf9, 0x44, 0x64, 0x9f,
++      0xd0, 0xf6, 0x37, 0x81, 0x71, 0xea, 0xe4, 0x39,
++      0x6e, 0xa1, 0x5d, 0xc2, 0x40, 0xd1, 0xab, 0xf4,
++      0x66, 0x68, 0xdb, 0xc8, 0xf5, 0xf2, 0x0e, 0xf2,
++      0xb3, 0xf3, 0x8f, 0x00, 0xe2, 0x03, 0x17, 0x88
++};
++static const u8 enc_output114[] __initconst = {
++      0xcb, 0x9a, 0x0d, 0xb1, 0x8d, 0x63, 0xd7, 0xea,
++      0xd7, 0xc9, 0x60, 0xd6, 0xb2, 0x86, 0x74, 0x5f,
++      0xb3, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xde, 0xba, 0xb4, 0xa1, 0x58, 0x42, 0x50, 0xbf,
++      0xfc, 0x2f, 0xc8, 0x4d, 0x95, 0xde, 0xcf, 0x04,
++      0xb3, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xde, 0xba, 0xb4, 0xa1, 0x58, 0x42, 0x50, 0xbf,
++      0xfc, 0x2f, 0xc8, 0x4d, 0x95, 0xde, 0xcf, 0x04,
++      0x23, 0x83, 0xab, 0x0b, 0x79, 0x92, 0x05, 0x69,
++      0x9b, 0x51, 0x0a, 0xa7, 0x09, 0xbf, 0x31, 0xf1
++};
++static const u8 enc_assoc114[] __initconst = {
++      0xff, 0xff, 0xff, 0xff
++};
++static const u8 enc_nonce114[] __initconst = {
++      0x00, 0x00, 0x00, 0x00, 0x06, 0x4c, 0x2d, 0x52
++};
++static const u8 enc_key114[] __initconst = {
++      0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
++      0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
++      0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
++      0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f
++};
++
++/* wycheproof - edge case intermediate sums in poly1305 */
++static const u8 enc_input115[] __initconst = {
++      0x8f, 0x27, 0x86, 0x94, 0xc4, 0xe9, 0xda, 0xeb,
++      0xd5, 0x8d, 0x3e, 0x5b, 0x96, 0x6e, 0x8b, 0x68,
++      0x42, 0x3d, 0x35, 0xf6, 0x13, 0xe6, 0xd9, 0x09,
++      0x3d, 0x38, 0xe9, 0x75, 0xc3, 0x8f, 0xe3, 0xb8,
++      0x06, 0x53, 0xe7, 0xa3, 0x31, 0x71, 0x88, 0x33,
++      0xac, 0xc3, 0xb9, 0xad, 0xff, 0x1c, 0x31, 0x98,
++      0xa6, 0xf6, 0x37, 0x81, 0x71, 0xea, 0xe4, 0x39,
++      0x6e, 0xa1, 0x5d, 0xc2, 0x40, 0xd1, 0xab, 0xf4,
++      0xde, 0x04, 0x9a, 0x00, 0xa8, 0x64, 0x06, 0x4b,
++      0xbc, 0xd4, 0x6f, 0xe4, 0xe4, 0x5b, 0x42, 0x8f
++};
++static const u8 enc_output115[] __initconst = {
++      0x8f, 0x4c, 0x51, 0xbb, 0x42, 0x23, 0x3a, 0x72,
++      0x76, 0xa2, 0xc0, 0x91, 0x2a, 0x88, 0xf3, 0xcb,
++      0xc5, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0x66, 0xd6, 0xf5, 0x69, 0x05, 0xd4, 0x58, 0x06,
++      0xf3, 0x08, 0x28, 0xa9, 0x93, 0x86, 0x9a, 0x03,
++      0xc5, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0x66, 0xd6, 0xf5, 0x69, 0x05, 0xd4, 0x58, 0x06,
++      0xf3, 0x08, 0x28, 0xa9, 0x93, 0x86, 0x9a, 0x03,
++      0x8b, 0xfb, 0xab, 0x17, 0xa9, 0xe0, 0xb8, 0x74,
++      0x8b, 0x51, 0x0a, 0xe7, 0xd9, 0xfd, 0x23, 0x05
++};
++static const u8 enc_assoc115[] __initconst = {
++      0xff, 0xff, 0xff, 0xff
++};
++static const u8 enc_nonce115[] __initconst = {
++      0x00, 0x00, 0x00, 0x00, 0x06, 0x4c, 0x2d, 0x52
++};
++static const u8 enc_key115[] __initconst = {
++      0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
++      0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
++      0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
++      0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f
++};
++
++/* wycheproof - edge case intermediate sums in poly1305 */
++static const u8 enc_input116[] __initconst = {
++      0xd5, 0x94, 0x28, 0xd0, 0x79, 0x35, 0x1f, 0x66,
++      0x5c, 0xd0, 0x01, 0x35, 0x43, 0x19, 0x87, 0x5c,
++      0x9a, 0x22, 0xd7, 0x0a, 0x48, 0xe2, 0x4f, 0xdd,
++      0xcd, 0xd4, 0x41, 0x9d, 0xe6, 0x4c, 0x8f, 0x44,
++      0xfc, 0x7a, 0xed, 0x35, 0xcb, 0x5a, 0x2f, 0xca,
++      0xa0, 0x34, 0x6e, 0xfb, 0x93, 0x65, 0x54, 0x64,
++      0x77, 0xb5, 0xc9, 0x07, 0xd9, 0xc9, 0xe1, 0xea,
++      0x51, 0x85, 0x1a, 0x20, 0x4a, 0xad, 0x9f, 0x0a,
++      0x24, 0x2d, 0x90, 0x96, 0x52, 0x4f, 0xa1, 0xb2,
++      0xb0, 0x23, 0xb8, 0xb2, 0x88, 0x22, 0x27, 0x73,
++      0x3f, 0x91, 0xf8, 0xe7, 0xc7, 0xb1, 0x96, 0x25,
++      0x64, 0x61, 0x9c, 0x5e, 0x7e, 0x9b, 0xf6, 0x13
++};
++static const u8 enc_output116[] __initconst = {
++      0xd5, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0x1d, 0xe0, 0x1d, 0x03, 0xa4, 0xfb, 0x69, 0x2b,
++      0x0f, 0x13, 0x57, 0x17, 0xda, 0x3c, 0x93, 0x03,
++      0x9c, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0x14, 0xbc, 0x01, 0x79, 0x57, 0xdc, 0xfa, 0x2c,
++      0xc0, 0xdb, 0xb8, 0x1d, 0xf5, 0x83, 0xcb, 0x01,
++      0x9c, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0x14, 0xbc, 0x01, 0x79, 0x57, 0xdc, 0xfa, 0x2c,
++      0xc0, 0xdb, 0xb8, 0x1d, 0xf5, 0x83, 0xcb, 0x01,
++      0x49, 0xbc, 0x6e, 0x9f, 0xc5, 0x1c, 0x4d, 0x50,
++      0x30, 0x36, 0x64, 0x4d, 0x84, 0x27, 0x73, 0xd2
++};
++static const u8 enc_assoc116[] __initconst = {
++      0xff, 0xff, 0xff, 0xff
++};
++static const u8 enc_nonce116[] __initconst = {
++      0x00, 0x00, 0x00, 0x00, 0x06, 0x4c, 0x2d, 0x52
++};
++static const u8 enc_key116[] __initconst = {
++      0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
++      0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
++      0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
++      0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f
++};
++
++/* wycheproof - edge case intermediate sums in poly1305 */
++static const u8 enc_input117[] __initconst = {
++      0xdb, 0x94, 0x28, 0xd0, 0x79, 0x35, 0x1f, 0x66,
++      0x5c, 0xd0, 0x01, 0x35, 0x43, 0x19, 0x87, 0x5c,
++      0x75, 0xd5, 0x64, 0x3a, 0xa5, 0xaf, 0x93, 0x4d,
++      0x8c, 0xce, 0x39, 0x2c, 0xc3, 0xee, 0xdb, 0x47,
++      0xc0, 0x7a, 0xed, 0x35, 0xcb, 0x5a, 0x2f, 0xca,
++      0xa0, 0x34, 0x6e, 0xfb, 0x93, 0x65, 0x54, 0x64,
++      0x60, 0x1b, 0x5a, 0xd2, 0x06, 0x7f, 0x28, 0x06,
++      0x6a, 0x8f, 0x32, 0x81, 0x71, 0x5b, 0xa8, 0x08,
++      0x18, 0x2d, 0x90, 0x96, 0x52, 0x4f, 0xa1, 0xb2,
++      0xb0, 0x23, 0xb8, 0xb2, 0x88, 0x22, 0x27, 0x73,
++      0x28, 0x3f, 0x6b, 0x32, 0x18, 0x07, 0x5f, 0xc9,
++      0x5f, 0x6b, 0xb4, 0xff, 0x45, 0x6d, 0xc1, 0x11
++};
++static const u8 enc_output117[] __initconst = {
++      0xdb, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xf2, 0x17, 0xae, 0x33, 0x49, 0xb6, 0xb5, 0xbb,
++      0x4e, 0x09, 0x2f, 0xa6, 0xff, 0x9e, 0xc7, 0x00,
++      0xa0, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0x03, 0x12, 0x92, 0xac, 0x88, 0x6a, 0x33, 0xc0,
++      0xfb, 0xd1, 0x90, 0xbc, 0xce, 0x75, 0xfc, 0x03,
++      0xa0, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0x03, 0x12, 0x92, 0xac, 0x88, 0x6a, 0x33, 0xc0,
++      0xfb, 0xd1, 0x90, 0xbc, 0xce, 0x75, 0xfc, 0x03,
++      0x63, 0xda, 0x6e, 0xa2, 0x51, 0xf0, 0x39, 0x53,
++      0x2c, 0x36, 0x64, 0x5d, 0x38, 0xb7, 0x6f, 0xd7
++};
++static const u8 enc_assoc117[] __initconst = {
++      0xff, 0xff, 0xff, 0xff
++};
++static const u8 enc_nonce117[] __initconst = {
++      0x00, 0x00, 0x00, 0x00, 0x06, 0x4c, 0x2d, 0x52
++};
++static const u8 enc_key117[] __initconst = {
++      0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
++      0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
++      0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
++      0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f
++};
++
++/* wycheproof - edge case intermediate sums in poly1305 */
++static const u8 enc_input118[] __initconst = {
++      0x93, 0x94, 0x28, 0xd0, 0x79, 0x35, 0x1f, 0x66,
++      0x5c, 0xd0, 0x01, 0x35, 0x43, 0x19, 0x87, 0x5c,
++      0x62, 0x48, 0x39, 0x60, 0x42, 0x16, 0xe4, 0x03,
++      0xeb, 0xcc, 0x6a, 0xf5, 0x59, 0xec, 0x8b, 0x43,
++      0x97, 0x7a, 0xed, 0x35, 0xcb, 0x5a, 0x2f, 0xca,
++      0xa0, 0x34, 0x6e, 0xfb, 0x93, 0x65, 0x54, 0x64,
++      0xd8, 0xc8, 0xc3, 0xfa, 0x1a, 0x9e, 0x47, 0x4a,
++      0xbe, 0x52, 0xd0, 0x2c, 0x81, 0x87, 0xe9, 0x0f,
++      0x4f, 0x2d, 0x90, 0x96, 0x52, 0x4f, 0xa1, 0xb2,
++      0xb0, 0x23, 0xb8, 0xb2, 0x88, 0x22, 0x27, 0x73,
++      0x90, 0xec, 0xf2, 0x1a, 0x04, 0xe6, 0x30, 0x85,
++      0x8b, 0xb6, 0x56, 0x52, 0xb5, 0xb1, 0x80, 0x16
++};
++static const u8 enc_output118[] __initconst = {
++      0x93, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xe5, 0x8a, 0xf3, 0x69, 0xae, 0x0f, 0xc2, 0xf5,
++      0x29, 0x0b, 0x7c, 0x7f, 0x65, 0x9c, 0x97, 0x04,
++      0xf7, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xbb, 0xc1, 0x0b, 0x84, 0x94, 0x8b, 0x5c, 0x8c,
++      0x2f, 0x0c, 0x72, 0x11, 0x3e, 0xa9, 0xbd, 0x04,
++      0xf7, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xbb, 0xc1, 0x0b, 0x84, 0x94, 0x8b, 0x5c, 0x8c,
++      0x2f, 0x0c, 0x72, 0x11, 0x3e, 0xa9, 0xbd, 0x04,
++      0x73, 0xeb, 0x27, 0x24, 0xb5, 0xc4, 0x05, 0xf0,
++      0x4d, 0x00, 0xd0, 0xf1, 0x58, 0x40, 0xa1, 0xc1
++};
++static const u8 enc_assoc118[] __initconst = {
++      0xff, 0xff, 0xff, 0xff
++};
++static const u8 enc_nonce118[] __initconst = {
++      0x00, 0x00, 0x00, 0x00, 0x06, 0x4c, 0x2d, 0x52
++};
++static const u8 enc_key118[] __initconst = {
++      0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
++      0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
++      0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
++      0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f
++};
++
++static const struct chacha20poly1305_testvec
++chacha20poly1305_enc_vectors[] __initconst = {
++      { enc_input001, enc_output001, enc_assoc001, enc_nonce001, enc_key001,
++        sizeof(enc_input001), sizeof(enc_assoc001), sizeof(enc_nonce001) },
++      { enc_input002, enc_output002, enc_assoc002, enc_nonce002, enc_key002,
++        sizeof(enc_input002), sizeof(enc_assoc002), sizeof(enc_nonce002) },
++      { enc_input003, enc_output003, enc_assoc003, enc_nonce003, enc_key003,
++        sizeof(enc_input003), sizeof(enc_assoc003), sizeof(enc_nonce003) },
++      { enc_input004, enc_output004, enc_assoc004, enc_nonce004, enc_key004,
++        sizeof(enc_input004), sizeof(enc_assoc004), sizeof(enc_nonce004) },
++      { enc_input005, enc_output005, enc_assoc005, enc_nonce005, enc_key005,
++        sizeof(enc_input005), sizeof(enc_assoc005), sizeof(enc_nonce005) },
++      { enc_input006, enc_output006, enc_assoc006, enc_nonce006, enc_key006,
++        sizeof(enc_input006), sizeof(enc_assoc006), sizeof(enc_nonce006) },
++      { enc_input007, enc_output007, enc_assoc007, enc_nonce007, enc_key007,
++        sizeof(enc_input007), sizeof(enc_assoc007), sizeof(enc_nonce007) },
++      { enc_input008, enc_output008, enc_assoc008, enc_nonce008, enc_key008,
++        sizeof(enc_input008), sizeof(enc_assoc008), sizeof(enc_nonce008) },
++      { enc_input009, enc_output009, enc_assoc009, enc_nonce009, enc_key009,
++        sizeof(enc_input009), sizeof(enc_assoc009), sizeof(enc_nonce009) },
++      { enc_input010, enc_output010, enc_assoc010, enc_nonce010, enc_key010,
++        sizeof(enc_input010), sizeof(enc_assoc010), sizeof(enc_nonce010) },
++      { enc_input011, enc_output011, enc_assoc011, enc_nonce011, enc_key011,
++        sizeof(enc_input011), sizeof(enc_assoc011), sizeof(enc_nonce011) },
++      { enc_input012, enc_output012, enc_assoc012, enc_nonce012, enc_key012,
++        sizeof(enc_input012), sizeof(enc_assoc012), sizeof(enc_nonce012) },
++      { enc_input053, enc_output053, enc_assoc053, enc_nonce053, enc_key053,
++        sizeof(enc_input053), sizeof(enc_assoc053), sizeof(enc_nonce053) },
++      { enc_input054, enc_output054, enc_assoc054, enc_nonce054, enc_key054,
++        sizeof(enc_input054), sizeof(enc_assoc054), sizeof(enc_nonce054) },
++      { enc_input055, enc_output055, enc_assoc055, enc_nonce055, enc_key055,
++        sizeof(enc_input055), sizeof(enc_assoc055), sizeof(enc_nonce055) },
++      { enc_input056, enc_output056, enc_assoc056, enc_nonce056, enc_key056,
++        sizeof(enc_input056), sizeof(enc_assoc056), sizeof(enc_nonce056) },
++      { enc_input057, enc_output057, enc_assoc057, enc_nonce057, enc_key057,
++        sizeof(enc_input057), sizeof(enc_assoc057), sizeof(enc_nonce057) },
++      { enc_input058, enc_output058, enc_assoc058, enc_nonce058, enc_key058,
++        sizeof(enc_input058), sizeof(enc_assoc058), sizeof(enc_nonce058) },
++      { enc_input059, enc_output059, enc_assoc059, enc_nonce059, enc_key059,
++        sizeof(enc_input059), sizeof(enc_assoc059), sizeof(enc_nonce059) },
++      { enc_input060, enc_output060, enc_assoc060, enc_nonce060, enc_key060,
++        sizeof(enc_input060), sizeof(enc_assoc060), sizeof(enc_nonce060) },
++      { enc_input061, enc_output061, enc_assoc061, enc_nonce061, enc_key061,
++        sizeof(enc_input061), sizeof(enc_assoc061), sizeof(enc_nonce061) },
++      { enc_input062, enc_output062, enc_assoc062, enc_nonce062, enc_key062,
++        sizeof(enc_input062), sizeof(enc_assoc062), sizeof(enc_nonce062) },
++      { enc_input063, enc_output063, enc_assoc063, enc_nonce063, enc_key063,
++        sizeof(enc_input063), sizeof(enc_assoc063), sizeof(enc_nonce063) },
++      { enc_input064, enc_output064, enc_assoc064, enc_nonce064, enc_key064,
++        sizeof(enc_input064), sizeof(enc_assoc064), sizeof(enc_nonce064) },
++      { enc_input065, enc_output065, enc_assoc065, enc_nonce065, enc_key065,
++        sizeof(enc_input065), sizeof(enc_assoc065), sizeof(enc_nonce065) },
++      { enc_input066, enc_output066, enc_assoc066, enc_nonce066, enc_key066,
++        sizeof(enc_input066), sizeof(enc_assoc066), sizeof(enc_nonce066) },
++      { enc_input067, enc_output067, enc_assoc067, enc_nonce067, enc_key067,
++        sizeof(enc_input067), sizeof(enc_assoc067), sizeof(enc_nonce067) },
++      { enc_input068, enc_output068, enc_assoc068, enc_nonce068, enc_key068,
++        sizeof(enc_input068), sizeof(enc_assoc068), sizeof(enc_nonce068) },
++      { enc_input069, enc_output069, enc_assoc069, enc_nonce069, enc_key069,
++        sizeof(enc_input069), sizeof(enc_assoc069), sizeof(enc_nonce069) },
++      { enc_input070, enc_output070, enc_assoc070, enc_nonce070, enc_key070,
++        sizeof(enc_input070), sizeof(enc_assoc070), sizeof(enc_nonce070) },
++      { enc_input071, enc_output071, enc_assoc071, enc_nonce071, enc_key071,
++        sizeof(enc_input071), sizeof(enc_assoc071), sizeof(enc_nonce071) },
++      { enc_input072, enc_output072, enc_assoc072, enc_nonce072, enc_key072,
++        sizeof(enc_input072), sizeof(enc_assoc072), sizeof(enc_nonce072) },
++      { enc_input073, enc_output073, enc_assoc073, enc_nonce073, enc_key073,
++        sizeof(enc_input073), sizeof(enc_assoc073), sizeof(enc_nonce073) },
++      { enc_input076, enc_output076, enc_assoc076, enc_nonce076, enc_key076,
++        sizeof(enc_input076), sizeof(enc_assoc076), sizeof(enc_nonce076) },
++      { enc_input077, enc_output077, enc_assoc077, enc_nonce077, enc_key077,
++        sizeof(enc_input077), sizeof(enc_assoc077), sizeof(enc_nonce077) },
++      { enc_input078, enc_output078, enc_assoc078, enc_nonce078, enc_key078,
++        sizeof(enc_input078), sizeof(enc_assoc078), sizeof(enc_nonce078) },
++      { enc_input079, enc_output079, enc_assoc079, enc_nonce079, enc_key079,
++        sizeof(enc_input079), sizeof(enc_assoc079), sizeof(enc_nonce079) },
++      { enc_input080, enc_output080, enc_assoc080, enc_nonce080, enc_key080,
++        sizeof(enc_input080), sizeof(enc_assoc080), sizeof(enc_nonce080) },
++      { enc_input081, enc_output081, enc_assoc081, enc_nonce081, enc_key081,
++        sizeof(enc_input081), sizeof(enc_assoc081), sizeof(enc_nonce081) },
++      { enc_input082, enc_output082, enc_assoc082, enc_nonce082, enc_key082,
++        sizeof(enc_input082), sizeof(enc_assoc082), sizeof(enc_nonce082) },
++      { enc_input083, enc_output083, enc_assoc083, enc_nonce083, enc_key083,
++        sizeof(enc_input083), sizeof(enc_assoc083), sizeof(enc_nonce083) },
++      { enc_input084, enc_output084, enc_assoc084, enc_nonce084, enc_key084,
++        sizeof(enc_input084), sizeof(enc_assoc084), sizeof(enc_nonce084) },
++      { enc_input085, enc_output085, enc_assoc085, enc_nonce085, enc_key085,
++        sizeof(enc_input085), sizeof(enc_assoc085), sizeof(enc_nonce085) },
++      { enc_input093, enc_output093, enc_assoc093, enc_nonce093, enc_key093,
++        sizeof(enc_input093), sizeof(enc_assoc093), sizeof(enc_nonce093) },
++      { enc_input094, enc_output094, enc_assoc094, enc_nonce094, enc_key094,
++        sizeof(enc_input094), sizeof(enc_assoc094), sizeof(enc_nonce094) },
++      { enc_input095, enc_output095, enc_assoc095, enc_nonce095, enc_key095,
++        sizeof(enc_input095), sizeof(enc_assoc095), sizeof(enc_nonce095) },
++      { enc_input096, enc_output096, enc_assoc096, enc_nonce096, enc_key096,
++        sizeof(enc_input096), sizeof(enc_assoc096), sizeof(enc_nonce096) },
++      { enc_input097, enc_output097, enc_assoc097, enc_nonce097, enc_key097,
++        sizeof(enc_input097), sizeof(enc_assoc097), sizeof(enc_nonce097) },
++      { enc_input098, enc_output098, enc_assoc098, enc_nonce098, enc_key098,
++        sizeof(enc_input098), sizeof(enc_assoc098), sizeof(enc_nonce098) },
++      { enc_input099, enc_output099, enc_assoc099, enc_nonce099, enc_key099,
++        sizeof(enc_input099), sizeof(enc_assoc099), sizeof(enc_nonce099) },
++      { enc_input100, enc_output100, enc_assoc100, enc_nonce100, enc_key100,
++        sizeof(enc_input100), sizeof(enc_assoc100), sizeof(enc_nonce100) },
++      { enc_input101, enc_output101, enc_assoc101, enc_nonce101, enc_key101,
++        sizeof(enc_input101), sizeof(enc_assoc101), sizeof(enc_nonce101) },
++      { enc_input102, enc_output102, enc_assoc102, enc_nonce102, enc_key102,
++        sizeof(enc_input102), sizeof(enc_assoc102), sizeof(enc_nonce102) },
++      { enc_input103, enc_output103, enc_assoc103, enc_nonce103, enc_key103,
++        sizeof(enc_input103), sizeof(enc_assoc103), sizeof(enc_nonce103) },
++      { enc_input104, enc_output104, enc_assoc104, enc_nonce104, enc_key104,
++        sizeof(enc_input104), sizeof(enc_assoc104), sizeof(enc_nonce104) },
++      { enc_input105, enc_output105, enc_assoc105, enc_nonce105, enc_key105,
++        sizeof(enc_input105), sizeof(enc_assoc105), sizeof(enc_nonce105) },
++      { enc_input106, enc_output106, enc_assoc106, enc_nonce106, enc_key106,
++        sizeof(enc_input106), sizeof(enc_assoc106), sizeof(enc_nonce106) },
++      { enc_input107, enc_output107, enc_assoc107, enc_nonce107, enc_key107,
++        sizeof(enc_input107), sizeof(enc_assoc107), sizeof(enc_nonce107) },
++      { enc_input108, enc_output108, enc_assoc108, enc_nonce108, enc_key108,
++        sizeof(enc_input108), sizeof(enc_assoc108), sizeof(enc_nonce108) },
++      { enc_input109, enc_output109, enc_assoc109, enc_nonce109, enc_key109,
++        sizeof(enc_input109), sizeof(enc_assoc109), sizeof(enc_nonce109) },
++      { enc_input110, enc_output110, enc_assoc110, enc_nonce110, enc_key110,
++        sizeof(enc_input110), sizeof(enc_assoc110), sizeof(enc_nonce110) },
++      { enc_input111, enc_output111, enc_assoc111, enc_nonce111, enc_key111,
++        sizeof(enc_input111), sizeof(enc_assoc111), sizeof(enc_nonce111) },
++      { enc_input112, enc_output112, enc_assoc112, enc_nonce112, enc_key112,
++        sizeof(enc_input112), sizeof(enc_assoc112), sizeof(enc_nonce112) },
++      { enc_input113, enc_output113, enc_assoc113, enc_nonce113, enc_key113,
++        sizeof(enc_input113), sizeof(enc_assoc113), sizeof(enc_nonce113) },
++      { enc_input114, enc_output114, enc_assoc114, enc_nonce114, enc_key114,
++        sizeof(enc_input114), sizeof(enc_assoc114), sizeof(enc_nonce114) },
++      { enc_input115, enc_output115, enc_assoc115, enc_nonce115, enc_key115,
++        sizeof(enc_input115), sizeof(enc_assoc115), sizeof(enc_nonce115) },
++      { enc_input116, enc_output116, enc_assoc116, enc_nonce116, enc_key116,
++        sizeof(enc_input116), sizeof(enc_assoc116), sizeof(enc_nonce116) },
++      { enc_input117, enc_output117, enc_assoc117, enc_nonce117, enc_key117,
++        sizeof(enc_input117), sizeof(enc_assoc117), sizeof(enc_nonce117) },
++      { enc_input118, enc_output118, enc_assoc118, enc_nonce118, enc_key118,
++        sizeof(enc_input118), sizeof(enc_assoc118), sizeof(enc_nonce118) }
++};
++
++static const u8 dec_input001[] __initconst = {
++      0x64, 0xa0, 0x86, 0x15, 0x75, 0x86, 0x1a, 0xf4,
++      0x60, 0xf0, 0x62, 0xc7, 0x9b, 0xe6, 0x43, 0xbd,
++      0x5e, 0x80, 0x5c, 0xfd, 0x34, 0x5c, 0xf3, 0x89,
++      0xf1, 0x08, 0x67, 0x0a, 0xc7, 0x6c, 0x8c, 0xb2,
++      0x4c, 0x6c, 0xfc, 0x18, 0x75, 0x5d, 0x43, 0xee,
++      0xa0, 0x9e, 0xe9, 0x4e, 0x38, 0x2d, 0x26, 0xb0,
++      0xbd, 0xb7, 0xb7, 0x3c, 0x32, 0x1b, 0x01, 0x00,
++      0xd4, 0xf0, 0x3b, 0x7f, 0x35, 0x58, 0x94, 0xcf,
++      0x33, 0x2f, 0x83, 0x0e, 0x71, 0x0b, 0x97, 0xce,
++      0x98, 0xc8, 0xa8, 0x4a, 0xbd, 0x0b, 0x94, 0x81,
++      0x14, 0xad, 0x17, 0x6e, 0x00, 0x8d, 0x33, 0xbd,
++      0x60, 0xf9, 0x82, 0xb1, 0xff, 0x37, 0xc8, 0x55,
++      0x97, 0x97, 0xa0, 0x6e, 0xf4, 0xf0, 0xef, 0x61,
++      0xc1, 0x86, 0x32, 0x4e, 0x2b, 0x35, 0x06, 0x38,
++      0x36, 0x06, 0x90, 0x7b, 0x6a, 0x7c, 0x02, 0xb0,
++      0xf9, 0xf6, 0x15, 0x7b, 0x53, 0xc8, 0x67, 0xe4,
++      0xb9, 0x16, 0x6c, 0x76, 0x7b, 0x80, 0x4d, 0x46,
++      0xa5, 0x9b, 0x52, 0x16, 0xcd, 0xe7, 0xa4, 0xe9,
++      0x90, 0x40, 0xc5, 0xa4, 0x04, 0x33, 0x22, 0x5e,
++      0xe2, 0x82, 0xa1, 0xb0, 0xa0, 0x6c, 0x52, 0x3e,
++      0xaf, 0x45, 0x34, 0xd7, 0xf8, 0x3f, 0xa1, 0x15,
++      0x5b, 0x00, 0x47, 0x71, 0x8c, 0xbc, 0x54, 0x6a,
++      0x0d, 0x07, 0x2b, 0x04, 0xb3, 0x56, 0x4e, 0xea,
++      0x1b, 0x42, 0x22, 0x73, 0xf5, 0x48, 0x27, 0x1a,
++      0x0b, 0xb2, 0x31, 0x60, 0x53, 0xfa, 0x76, 0x99,
++      0x19, 0x55, 0xeb, 0xd6, 0x31, 0x59, 0x43, 0x4e,
++      0xce, 0xbb, 0x4e, 0x46, 0x6d, 0xae, 0x5a, 0x10,
++      0x73, 0xa6, 0x72, 0x76, 0x27, 0x09, 0x7a, 0x10,
++      0x49, 0xe6, 0x17, 0xd9, 0x1d, 0x36, 0x10, 0x94,
++      0xfa, 0x68, 0xf0, 0xff, 0x77, 0x98, 0x71, 0x30,
++      0x30, 0x5b, 0xea, 0xba, 0x2e, 0xda, 0x04, 0xdf,
++      0x99, 0x7b, 0x71, 0x4d, 0x6c, 0x6f, 0x2c, 0x29,
++      0xa6, 0xad, 0x5c, 0xb4, 0x02, 0x2b, 0x02, 0x70,
++      0x9b, 0xee, 0xad, 0x9d, 0x67, 0x89, 0x0c, 0xbb,
++      0x22, 0x39, 0x23, 0x36, 0xfe, 0xa1, 0x85, 0x1f,
++      0x38
++};
++static const u8 dec_output001[] __initconst = {
++      0x49, 0x6e, 0x74, 0x65, 0x72, 0x6e, 0x65, 0x74,
++      0x2d, 0x44, 0x72, 0x61, 0x66, 0x74, 0x73, 0x20,
++      0x61, 0x72, 0x65, 0x20, 0x64, 0x72, 0x61, 0x66,
++      0x74, 0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65,
++      0x6e, 0x74, 0x73, 0x20, 0x76, 0x61, 0x6c, 0x69,
++      0x64, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x61, 0x20,
++      0x6d, 0x61, 0x78, 0x69, 0x6d, 0x75, 0x6d, 0x20,
++      0x6f, 0x66, 0x20, 0x73, 0x69, 0x78, 0x20, 0x6d,
++      0x6f, 0x6e, 0x74, 0x68, 0x73, 0x20, 0x61, 0x6e,
++      0x64, 0x20, 0x6d, 0x61, 0x79, 0x20, 0x62, 0x65,
++      0x20, 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, 0x64,
++      0x2c, 0x20, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63,
++      0x65, 0x64, 0x2c, 0x20, 0x6f, 0x72, 0x20, 0x6f,
++      0x62, 0x73, 0x6f, 0x6c, 0x65, 0x74, 0x65, 0x64,
++      0x20, 0x62, 0x79, 0x20, 0x6f, 0x74, 0x68, 0x65,
++      0x72, 0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65,
++      0x6e, 0x74, 0x73, 0x20, 0x61, 0x74, 0x20, 0x61,
++      0x6e, 0x79, 0x20, 0x74, 0x69, 0x6d, 0x65, 0x2e,
++      0x20, 0x49, 0x74, 0x20, 0x69, 0x73, 0x20, 0x69,
++      0x6e, 0x61, 0x70, 0x70, 0x72, 0x6f, 0x70, 0x72,
++      0x69, 0x61, 0x74, 0x65, 0x20, 0x74, 0x6f, 0x20,
++      0x75, 0x73, 0x65, 0x20, 0x49, 0x6e, 0x74, 0x65,
++      0x72, 0x6e, 0x65, 0x74, 0x2d, 0x44, 0x72, 0x61,
++      0x66, 0x74, 0x73, 0x20, 0x61, 0x73, 0x20, 0x72,
++      0x65, 0x66, 0x65, 0x72, 0x65, 0x6e, 0x63, 0x65,
++      0x20, 0x6d, 0x61, 0x74, 0x65, 0x72, 0x69, 0x61,
++      0x6c, 0x20, 0x6f, 0x72, 0x20, 0x74, 0x6f, 0x20,
++      0x63, 0x69, 0x74, 0x65, 0x20, 0x74, 0x68, 0x65,
++      0x6d, 0x20, 0x6f, 0x74, 0x68, 0x65, 0x72, 0x20,
++      0x74, 0x68, 0x61, 0x6e, 0x20, 0x61, 0x73, 0x20,
++      0x2f, 0xe2, 0x80, 0x9c, 0x77, 0x6f, 0x72, 0x6b,
++      0x20, 0x69, 0x6e, 0x20, 0x70, 0x72, 0x6f, 0x67,
++      0x72, 0x65, 0x73, 0x73, 0x2e, 0x2f, 0xe2, 0x80,
++      0x9d
++};
++static const u8 dec_assoc001[] __initconst = {
++      0xf3, 0x33, 0x88, 0x86, 0x00, 0x00, 0x00, 0x00,
++      0x00, 0x00, 0x4e, 0x91
++};
++static const u8 dec_nonce001[] __initconst = {
++      0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08
++};
++static const u8 dec_key001[] __initconst = {
++      0x1c, 0x92, 0x40, 0xa5, 0xeb, 0x55, 0xd3, 0x8a,
++      0xf3, 0x33, 0x88, 0x86, 0x04, 0xf6, 0xb5, 0xf0,
++      0x47, 0x39, 0x17, 0xc1, 0x40, 0x2b, 0x80, 0x09,
++      0x9d, 0xca, 0x5c, 0xbc, 0x20, 0x70, 0x75, 0xc0
++};
++
++static const u8 dec_input002[] __initconst = {
++      0xea, 0xe0, 0x1e, 0x9e, 0x2c, 0x91, 0xaa, 0xe1,
++      0xdb, 0x5d, 0x99, 0x3f, 0x8a, 0xf7, 0x69, 0x92
++};
++static const u8 dec_output002[] __initconst = { };
++static const u8 dec_assoc002[] __initconst = { };
++static const u8 dec_nonce002[] __initconst = {
++      0xca, 0xbf, 0x33, 0x71, 0x32, 0x45, 0x77, 0x8e
++};
++static const u8 dec_key002[] __initconst = {
++      0x4c, 0xf5, 0x96, 0x83, 0x38, 0xe6, 0xae, 0x7f,
++      0x2d, 0x29, 0x25, 0x76, 0xd5, 0x75, 0x27, 0x86,
++      0x91, 0x9a, 0x27, 0x7a, 0xfb, 0x46, 0xc5, 0xef,
++      0x94, 0x81, 0x79, 0x57, 0x14, 0x59, 0x40, 0x68
++};
++
++static const u8 dec_input003[] __initconst = {
++      0xdd, 0x6b, 0x3b, 0x82, 0xce, 0x5a, 0xbd, 0xd6,
++      0xa9, 0x35, 0x83, 0xd8, 0x8c, 0x3d, 0x85, 0x77
++};
++static const u8 dec_output003[] __initconst = { };
++static const u8 dec_assoc003[] __initconst = {
++      0x33, 0x10, 0x41, 0x12, 0x1f, 0xf3, 0xd2, 0x6b
++};
++static const u8 dec_nonce003[] __initconst = {
++      0x3d, 0x86, 0xb5, 0x6b, 0xc8, 0xa3, 0x1f, 0x1d
++};
++static const u8 dec_key003[] __initconst = {
++      0x2d, 0xb0, 0x5d, 0x40, 0xc8, 0xed, 0x44, 0x88,
++      0x34, 0xd1, 0x13, 0xaf, 0x57, 0xa1, 0xeb, 0x3a,
++      0x2a, 0x80, 0x51, 0x36, 0xec, 0x5b, 0xbc, 0x08,
++      0x93, 0x84, 0x21, 0xb5, 0x13, 0x88, 0x3c, 0x0d
++};
++
++static const u8 dec_input004[] __initconst = {
++      0xb7, 0x1b, 0xb0, 0x73, 0x59, 0xb0, 0x84, 0xb2,
++      0x6d, 0x8e, 0xab, 0x94, 0x31, 0xa1, 0xae, 0xac,
++      0x89
++};
++static const u8 dec_output004[] __initconst = {
++      0xa4
++};
++static const u8 dec_assoc004[] __initconst = {
++      0x6a, 0xe2, 0xad, 0x3f, 0x88, 0x39, 0x5a, 0x40
++};
++static const u8 dec_nonce004[] __initconst = {
++      0xd2, 0x32, 0x1f, 0x29, 0x28, 0xc6, 0xc4, 0xc4
++};
++static const u8 dec_key004[] __initconst = {
++      0x4b, 0x28, 0x4b, 0xa3, 0x7b, 0xbe, 0xe9, 0xf8,
++      0x31, 0x80, 0x82, 0xd7, 0xd8, 0xe8, 0xb5, 0xa1,
++      0xe2, 0x18, 0x18, 0x8a, 0x9c, 0xfa, 0xa3, 0x3d,
++      0x25, 0x71, 0x3e, 0x40, 0xbc, 0x54, 0x7a, 0x3e
++};
++
++static const u8 dec_input005[] __initconst = {
++      0xbf, 0xe1, 0x5b, 0x0b, 0xdb, 0x6b, 0xf5, 0x5e,
++      0x6c, 0x5d, 0x84, 0x44, 0x39, 0x81, 0xc1, 0x9c,
++      0xac
++};
++static const u8 dec_output005[] __initconst = {
++      0x2d
++};
++static const u8 dec_assoc005[] __initconst = { };
++static const u8 dec_nonce005[] __initconst = {
++      0x20, 0x1c, 0xaa, 0x5f, 0x9c, 0xbf, 0x92, 0x30
++};
++static const u8 dec_key005[] __initconst = {
++      0x66, 0xca, 0x9c, 0x23, 0x2a, 0x4b, 0x4b, 0x31,
++      0x0e, 0x92, 0x89, 0x8b, 0xf4, 0x93, 0xc7, 0x87,
++      0x98, 0xa3, 0xd8, 0x39, 0xf8, 0xf4, 0xa7, 0x01,
++      0xc0, 0x2e, 0x0a, 0xa6, 0x7e, 0x5a, 0x78, 0x87
++};
++
++static const u8 dec_input006[] __initconst = {
++      0x8b, 0x06, 0xd3, 0x31, 0xb0, 0x93, 0x45, 0xb1,
++      0x75, 0x6e, 0x26, 0xf9, 0x67, 0xbc, 0x90, 0x15,
++      0x81, 0x2c, 0xb5, 0xf0, 0xc6, 0x2b, 0xc7, 0x8c,
++      0x56, 0xd1, 0xbf, 0x69, 0x6c, 0x07, 0xa0, 0xda,
++      0x65, 0x27, 0xc9, 0x90, 0x3d, 0xef, 0x4b, 0x11,
++      0x0f, 0x19, 0x07, 0xfd, 0x29, 0x92, 0xd9, 0xc8,
++      0xf7, 0x99, 0x2e, 0x4a, 0xd0, 0xb8, 0x2c, 0xdc,
++      0x93, 0xf5, 0x9e, 0x33, 0x78, 0xd1, 0x37, 0xc3,
++      0x66, 0xd7, 0x5e, 0xbc, 0x44, 0xbf, 0x53, 0xa5,
++      0xbc, 0xc4, 0xcb, 0x7b, 0x3a, 0x8e, 0x7f, 0x02,
++      0xbd, 0xbb, 0xe7, 0xca, 0xa6, 0x6c, 0x6b, 0x93,
++      0x21, 0x93, 0x10, 0x61, 0xe7, 0x69, 0xd0, 0x78,
++      0xf3, 0x07, 0x5a, 0x1a, 0x8f, 0x73, 0xaa, 0xb1,
++      0x4e, 0xd3, 0xda, 0x4f, 0xf3, 0x32, 0xe1, 0x66,
++      0x3e, 0x6c, 0xc6, 0x13, 0xba, 0x06, 0x5b, 0xfc,
++      0x6a, 0xe5, 0x6f, 0x60, 0xfb, 0x07, 0x40, 0xb0,
++      0x8c, 0x9d, 0x84, 0x43, 0x6b, 0xc1, 0xf7, 0x8d,
++      0x8d, 0x31, 0xf7, 0x7a, 0x39, 0x4d, 0x8f, 0x9a,
++      0xeb
++};
++static const u8 dec_output006[] __initconst = {
++      0x33, 0x2f, 0x94, 0xc1, 0xa4, 0xef, 0xcc, 0x2a,
++      0x5b, 0xa6, 0xe5, 0x8f, 0x1d, 0x40, 0xf0, 0x92,
++      0x3c, 0xd9, 0x24, 0x11, 0xa9, 0x71, 0xf9, 0x37,
++      0x14, 0x99, 0xfa, 0xbe, 0xe6, 0x80, 0xde, 0x50,
++      0xc9, 0x96, 0xd4, 0xb0, 0xec, 0x9e, 0x17, 0xec,
++      0xd2, 0x5e, 0x72, 0x99, 0xfc, 0x0a, 0xe1, 0xcb,
++      0x48, 0xd2, 0x85, 0xdd, 0x2f, 0x90, 0xe0, 0x66,
++      0x3b, 0xe6, 0x20, 0x74, 0xbe, 0x23, 0x8f, 0xcb,
++      0xb4, 0xe4, 0xda, 0x48, 0x40, 0xa6, 0xd1, 0x1b,
++      0xc7, 0x42, 0xce, 0x2f, 0x0c, 0xa6, 0x85, 0x6e,
++      0x87, 0x37, 0x03, 0xb1, 0x7c, 0x25, 0x96, 0xa3,
++      0x05, 0xd8, 0xb0, 0xf4, 0xed, 0xea, 0xc2, 0xf0,
++      0x31, 0x98, 0x6c, 0xd1, 0x14, 0x25, 0xc0, 0xcb,
++      0x01, 0x74, 0xd0, 0x82, 0xf4, 0x36, 0xf5, 0x41,
++      0xd5, 0xdc, 0xca, 0xc5, 0xbb, 0x98, 0xfe, 0xfc,
++      0x69, 0x21, 0x70, 0xd8, 0xa4, 0x4b, 0xc8, 0xde,
++      0x8f
++};
++static const u8 dec_assoc006[] __initconst = {
++      0x70, 0xd3, 0x33, 0xf3, 0x8b, 0x18, 0x0b
++};
++static const u8 dec_nonce006[] __initconst = {
++      0xdf, 0x51, 0x84, 0x82, 0x42, 0x0c, 0x75, 0x9c
++};
++static const u8 dec_key006[] __initconst = {
++      0x68, 0x7b, 0x8d, 0x8e, 0xe3, 0xc4, 0xdd, 0xae,
++      0xdf, 0x72, 0x7f, 0x53, 0x72, 0x25, 0x1e, 0x78,
++      0x91, 0xcb, 0x69, 0x76, 0x1f, 0x49, 0x93, 0xf9,
++      0x6f, 0x21, 0xcc, 0x39, 0x9c, 0xad, 0xb1, 0x01
++};
++
++static const u8 dec_input007[] __initconst = {
++      0x85, 0x04, 0xc2, 0xed, 0x8d, 0xfd, 0x97, 0x5c,
++      0xd2, 0xb7, 0xe2, 0xc1, 0x6b, 0xa3, 0xba, 0xf8,
++      0xc9, 0x50, 0xc3, 0xc6, 0xa5, 0xe3, 0xa4, 0x7c,
++      0xc3, 0x23, 0x49, 0x5e, 0xa9, 0xb9, 0x32, 0xeb,
++      0x8a, 0x7c, 0xca, 0xe5, 0xec, 0xfb, 0x7c, 0xc0,
++      0xcb, 0x7d, 0xdc, 0x2c, 0x9d, 0x92, 0x55, 0x21,
++      0x0a, 0xc8, 0x43, 0x63, 0x59, 0x0a, 0x31, 0x70,
++      0x82, 0x67, 0x41, 0x03, 0xf8, 0xdf, 0xf2, 0xac,
++      0xa7, 0x02, 0xd4, 0xd5, 0x8a, 0x2d, 0xc8, 0x99,
++      0x19, 0x66, 0xd0, 0xf6, 0x88, 0x2c, 0x77, 0xd9,
++      0xd4, 0x0d, 0x6c, 0xbd, 0x98, 0xde, 0xe7, 0x7f,
++      0xad, 0x7e, 0x8a, 0xfb, 0xe9, 0x4b, 0xe5, 0xf7,
++      0xe5, 0x50, 0xa0, 0x90, 0x3f, 0xd6, 0x22, 0x53,
++      0xe3, 0xfe, 0x1b, 0xcc, 0x79, 0x3b, 0xec, 0x12,
++      0x47, 0x52, 0xa7, 0xd6, 0x04, 0xe3, 0x52, 0xe6,
++      0x93, 0x90, 0x91, 0x32, 0x73, 0x79, 0xb8, 0xd0,
++      0x31, 0xde, 0x1f, 0x9f, 0x2f, 0x05, 0x38, 0x54,
++      0x2f, 0x35, 0x04, 0x39, 0xe0, 0xa7, 0xba, 0xc6,
++      0x52, 0xf6, 0x37, 0x65, 0x4c, 0x07, 0xa9, 0x7e,
++      0xb3, 0x21, 0x6f, 0x74, 0x8c, 0xc9, 0xde, 0xdb,
++      0x65, 0x1b, 0x9b, 0xaa, 0x60, 0xb1, 0x03, 0x30,
++      0x6b, 0xb2, 0x03, 0xc4, 0x1c, 0x04, 0xf8, 0x0f,
++      0x64, 0xaf, 0x46, 0xe4, 0x65, 0x99, 0x49, 0xe2,
++      0xea, 0xce, 0x78, 0x00, 0xd8, 0x8b, 0xd5, 0x2e,
++      0xcf, 0xfc, 0x40, 0x49, 0xe8, 0x58, 0xdc, 0x34,
++      0x9c, 0x8c, 0x61, 0xbf, 0x0a, 0x8e, 0xec, 0x39,
++      0xa9, 0x30, 0x05, 0x5a, 0xd2, 0x56, 0x01, 0xc7,
++      0xda, 0x8f, 0x4e, 0xbb, 0x43, 0xa3, 0x3a, 0xf9,
++      0x15, 0x2a, 0xd0, 0xa0, 0x7a, 0x87, 0x34, 0x82,
++      0xfe, 0x8a, 0xd1, 0x2d, 0x5e, 0xc7, 0xbf, 0x04,
++      0x53, 0x5f, 0x3b, 0x36, 0xd4, 0x25, 0x5c, 0x34,
++      0x7a, 0x8d, 0xd5, 0x05, 0xce, 0x72, 0xca, 0xef,
++      0x7a, 0x4b, 0xbc, 0xb0, 0x10, 0x5c, 0x96, 0x42,
++      0x3a, 0x00, 0x98, 0xcd, 0x15, 0xe8, 0xb7, 0x53
++};
++static const u8 dec_output007[] __initconst = {
++      0x9b, 0x18, 0xdb, 0xdd, 0x9a, 0x0f, 0x3e, 0xa5,
++      0x15, 0x17, 0xde, 0xdf, 0x08, 0x9d, 0x65, 0x0a,
++      0x67, 0x30, 0x12, 0xe2, 0x34, 0x77, 0x4b, 0xc1,
++      0xd9, 0xc6, 0x1f, 0xab, 0xc6, 0x18, 0x50, 0x17,
++      0xa7, 0x9d, 0x3c, 0xa6, 0xc5, 0x35, 0x8c, 0x1c,
++      0xc0, 0xa1, 0x7c, 0x9f, 0x03, 0x89, 0xca, 0xe1,
++      0xe6, 0xe9, 0xd4, 0xd3, 0x88, 0xdb, 0xb4, 0x51,
++      0x9d, 0xec, 0xb4, 0xfc, 0x52, 0xee, 0x6d, 0xf1,
++      0x75, 0x42, 0xc6, 0xfd, 0xbd, 0x7a, 0x8e, 0x86,
++      0xfc, 0x44, 0xb3, 0x4f, 0xf3, 0xea, 0x67, 0x5a,
++      0x41, 0x13, 0xba, 0xb0, 0xdc, 0xe1, 0xd3, 0x2a,
++      0x7c, 0x22, 0xb3, 0xca, 0xac, 0x6a, 0x37, 0x98,
++      0x3e, 0x1d, 0x40, 0x97, 0xf7, 0x9b, 0x1d, 0x36,
++      0x6b, 0xb3, 0x28, 0xbd, 0x60, 0x82, 0x47, 0x34,
++      0xaa, 0x2f, 0x7d, 0xe9, 0xa8, 0x70, 0x81, 0x57,
++      0xd4, 0xb9, 0x77, 0x0a, 0x9d, 0x29, 0xa7, 0x84,
++      0x52, 0x4f, 0xc2, 0x4a, 0x40, 0x3b, 0x3c, 0xd4,
++      0xc9, 0x2a, 0xdb, 0x4a, 0x53, 0xc4, 0xbe, 0x80,
++      0xe9, 0x51, 0x7f, 0x8f, 0xc7, 0xa2, 0xce, 0x82,
++      0x5c, 0x91, 0x1e, 0x74, 0xd9, 0xd0, 0xbd, 0xd5,
++      0xf3, 0xfd, 0xda, 0x4d, 0x25, 0xb4, 0xbb, 0x2d,
++      0xac, 0x2f, 0x3d, 0x71, 0x85, 0x7b, 0xcf, 0x3c,
++      0x7b, 0x3e, 0x0e, 0x22, 0x78, 0x0c, 0x29, 0xbf,
++      0xe4, 0xf4, 0x57, 0xb3, 0xcb, 0x49, 0xa0, 0xfc,
++      0x1e, 0x05, 0x4e, 0x16, 0xbc, 0xd5, 0xa8, 0xa3,
++      0xee, 0x05, 0x35, 0xc6, 0x7c, 0xab, 0x60, 0x14,
++      0x55, 0x1a, 0x8e, 0xc5, 0x88, 0x5d, 0xd5, 0x81,
++      0xc2, 0x81, 0xa5, 0xc4, 0x60, 0xdb, 0xaf, 0x77,
++      0x91, 0xe1, 0xce, 0xa2, 0x7e, 0x7f, 0x42, 0xe3,
++      0xb0, 0x13, 0x1c, 0x1f, 0x25, 0x60, 0x21, 0xe2,
++      0x40, 0x5f, 0x99, 0xb7, 0x73, 0xec, 0x9b, 0x2b,
++      0xf0, 0x65, 0x11, 0xc8, 0xd0, 0x0a, 0x9f, 0xd3
++};
++static const u8 dec_assoc007[] __initconst = { };
++static const u8 dec_nonce007[] __initconst = {
++      0xde, 0x7b, 0xef, 0xc3, 0x65, 0x1b, 0x68, 0xb0
++};
++static const u8 dec_key007[] __initconst = {
++      0x8d, 0xb8, 0x91, 0x48, 0xf0, 0xe7, 0x0a, 0xbd,
++      0xf9, 0x3f, 0xcd, 0xd9, 0xa0, 0x1e, 0x42, 0x4c,
++      0xe7, 0xde, 0x25, 0x3d, 0xa3, 0xd7, 0x05, 0x80,
++      0x8d, 0xf2, 0x82, 0xac, 0x44, 0x16, 0x51, 0x01
++};
++
++static const u8 dec_input008[] __initconst = {
++      0x14, 0xf6, 0x41, 0x37, 0xa6, 0xd4, 0x27, 0xcd,
++      0xdb, 0x06, 0x3e, 0x9a, 0x4e, 0xab, 0xd5, 0xb1,
++      0x1e, 0x6b, 0xd2, 0xbc, 0x11, 0xf4, 0x28, 0x93,
++      0x63, 0x54, 0xef, 0xbb, 0x5e, 0x1d, 0x3a, 0x1d,
++      0x37, 0x3c, 0x0a, 0x6c, 0x1e, 0xc2, 0xd1, 0x2c,
++      0xb5, 0xa3, 0xb5, 0x7b, 0xb8, 0x8f, 0x25, 0xa6,
++      0x1b, 0x61, 0x1c, 0xec, 0x28, 0x58, 0x26, 0xa4,
++      0xa8, 0x33, 0x28, 0x25, 0x5c, 0x45, 0x05, 0xe5,
++      0x6c, 0x99, 0xe5, 0x45, 0xc4, 0xa2, 0x03, 0x84,
++      0x03, 0x73, 0x1e, 0x8c, 0x49, 0xac, 0x20, 0xdd,
++      0x8d, 0xb3, 0xc4, 0xf5, 0xe7, 0x4f, 0xf1, 0xed,
++      0xa1, 0x98, 0xde, 0xa4, 0x96, 0xdd, 0x2f, 0xab,
++      0xab, 0x97, 0xcf, 0x3e, 0xd2, 0x9e, 0xb8, 0x13,
++      0x07, 0x28, 0x29, 0x19, 0xaf, 0xfd, 0xf2, 0x49,
++      0x43, 0xea, 0x49, 0x26, 0x91, 0xc1, 0x07, 0xd6,
++      0xbb, 0x81, 0x75, 0x35, 0x0d, 0x24, 0x7f, 0xc8,
++      0xda, 0xd4, 0xb7, 0xeb, 0xe8, 0x5c, 0x09, 0xa2,
++      0x2f, 0xdc, 0x28, 0x7d, 0x3a, 0x03, 0xfa, 0x94,
++      0xb5, 0x1d, 0x17, 0x99, 0x36, 0xc3, 0x1c, 0x18,
++      0x34, 0xe3, 0x9f, 0xf5, 0x55, 0x7c, 0xb0, 0x60,
++      0x9d, 0xff, 0xac, 0xd4, 0x61, 0xf2, 0xad, 0xf8,
++      0xce, 0xc7, 0xbe, 0x5c, 0xd2, 0x95, 0xa8, 0x4b,
++      0x77, 0x13, 0x19, 0x59, 0x26, 0xc9, 0xb7, 0x8f,
++      0x6a, 0xcb, 0x2d, 0x37, 0x91, 0xea, 0x92, 0x9c,
++      0x94, 0x5b, 0xda, 0x0b, 0xce, 0xfe, 0x30, 0x20,
++      0xf8, 0x51, 0xad, 0xf2, 0xbe, 0xe7, 0xc7, 0xff,
++      0xb3, 0x33, 0x91, 0x6a, 0xc9, 0x1a, 0x41, 0xc9,
++      0x0f, 0xf3, 0x10, 0x0e, 0xfd, 0x53, 0xff, 0x6c,
++      0x16, 0x52, 0xd9, 0xf3, 0xf7, 0x98, 0x2e, 0xc9,
++      0x07, 0x31, 0x2c, 0x0c, 0x72, 0xd7, 0xc5, 0xc6,
++      0x08, 0x2a, 0x7b, 0xda, 0xbd, 0x7e, 0x02, 0xea,
++      0x1a, 0xbb, 0xf2, 0x04, 0x27, 0x61, 0x28, 0x8e,
++      0xf5, 0x04, 0x03, 0x1f, 0x4c, 0x07, 0x55, 0x82,
++      0xec, 0x1e, 0xd7, 0x8b, 0x2f, 0x65, 0x56, 0xd1,
++      0xd9, 0x1e, 0x3c, 0xe9, 0x1f, 0x5e, 0x98, 0x70,
++      0x38, 0x4a, 0x8c, 0x49, 0xc5, 0x43, 0xa0, 0xa1,
++      0x8b, 0x74, 0x9d, 0x4c, 0x62, 0x0d, 0x10, 0x0c,
++      0xf4, 0x6c, 0x8f, 0xe0, 0xaa, 0x9a, 0x8d, 0xb7,
++      0xe0, 0xbe, 0x4c, 0x87, 0xf1, 0x98, 0x2f, 0xcc,
++      0xed, 0xc0, 0x52, 0x29, 0xdc, 0x83, 0xf8, 0xfc,
++      0x2c, 0x0e, 0xa8, 0x51, 0x4d, 0x80, 0x0d, 0xa3,
++      0xfe, 0xd8, 0x37, 0xe7, 0x41, 0x24, 0xfc, 0xfb,
++      0x75, 0xe3, 0x71, 0x7b, 0x57, 0x45, 0xf5, 0x97,
++      0x73, 0x65, 0x63, 0x14, 0x74, 0xb8, 0x82, 0x9f,
++      0xf8, 0x60, 0x2f, 0x8a, 0xf2, 0x4e, 0xf1, 0x39,
++      0xda, 0x33, 0x91, 0xf8, 0x36, 0xe0, 0x8d, 0x3f,
++      0x1f, 0x3b, 0x56, 0xdc, 0xa0, 0x8f, 0x3c, 0x9d,
++      0x71, 0x52, 0xa7, 0xb8, 0xc0, 0xa5, 0xc6, 0xa2,
++      0x73, 0xda, 0xf4, 0x4b, 0x74, 0x5b, 0x00, 0x3d,
++      0x99, 0xd7, 0x96, 0xba, 0xe6, 0xe1, 0xa6, 0x96,
++      0x38, 0xad, 0xb3, 0xc0, 0xd2, 0xba, 0x91, 0x6b,
++      0xf9, 0x19, 0xdd, 0x3b, 0xbe, 0xbe, 0x9c, 0x20,
++      0x50, 0xba, 0xa1, 0xd0, 0xce, 0x11, 0xbd, 0x95,
++      0xd8, 0xd1, 0xdd, 0x33, 0x85, 0x74, 0xdc, 0xdb,
++      0x66, 0x76, 0x44, 0xdc, 0x03, 0x74, 0x48, 0x35,
++      0x98, 0xb1, 0x18, 0x47, 0x94, 0x7d, 0xff, 0x62,
++      0xe4, 0x58, 0x78, 0xab, 0xed, 0x95, 0x36, 0xd9,
++      0x84, 0x91, 0x82, 0x64, 0x41, 0xbb, 0x58, 0xe6,
++      0x1c, 0x20, 0x6d, 0x15, 0x6b, 0x13, 0x96, 0xe8,
++      0x35, 0x7f, 0xdc, 0x40, 0x2c, 0xe9, 0xbc, 0x8a,
++      0x4f, 0x92, 0xec, 0x06, 0x2d, 0x50, 0xdf, 0x93,
++      0x5d, 0x65, 0x5a, 0xa8, 0xfc, 0x20, 0x50, 0x14,
++      0xa9, 0x8a, 0x7e, 0x1d, 0x08, 0x1f, 0xe2, 0x99,
++      0xd0, 0xbe, 0xfb, 0x3a, 0x21, 0x9d, 0xad, 0x86,
++      0x54, 0xfd, 0x0d, 0x98, 0x1c, 0x5a, 0x6f, 0x1f,
++      0x9a, 0x40, 0xcd, 0xa2, 0xff, 0x6a, 0xf1, 0x54
++};
++static const u8 dec_output008[] __initconst = {
++      0xc3, 0x09, 0x94, 0x62, 0xe6, 0x46, 0x2e, 0x10,
++      0xbe, 0x00, 0xe4, 0xfc, 0xf3, 0x40, 0xa3, 0xe2,
++      0x0f, 0xc2, 0x8b, 0x28, 0xdc, 0xba, 0xb4, 0x3c,
++      0xe4, 0x21, 0x58, 0x61, 0xcd, 0x8b, 0xcd, 0xfb,
++      0xac, 0x94, 0xa1, 0x45, 0xf5, 0x1c, 0xe1, 0x12,
++      0xe0, 0x3b, 0x67, 0x21, 0x54, 0x5e, 0x8c, 0xaa,
++      0xcf, 0xdb, 0xb4, 0x51, 0xd4, 0x13, 0xda, 0xe6,
++      0x83, 0x89, 0xb6, 0x92, 0xe9, 0x21, 0x76, 0xa4,
++      0x93, 0x7d, 0x0e, 0xfd, 0x96, 0x36, 0x03, 0x91,
++      0x43, 0x5c, 0x92, 0x49, 0x62, 0x61, 0x7b, 0xeb,
++      0x43, 0x89, 0xb8, 0x12, 0x20, 0x43, 0xd4, 0x47,
++      0x06, 0x84, 0xee, 0x47, 0xe9, 0x8a, 0x73, 0x15,
++      0x0f, 0x72, 0xcf, 0xed, 0xce, 0x96, 0xb2, 0x7f,
++      0x21, 0x45, 0x76, 0xeb, 0x26, 0x28, 0x83, 0x6a,
++      0xad, 0xaa, 0xa6, 0x81, 0xd8, 0x55, 0xb1, 0xa3,
++      0x85, 0xb3, 0x0c, 0xdf, 0xf1, 0x69, 0x2d, 0x97,
++      0x05, 0x2a, 0xbc, 0x7c, 0x7b, 0x25, 0xf8, 0x80,
++      0x9d, 0x39, 0x25, 0xf3, 0x62, 0xf0, 0x66, 0x5e,
++      0xf4, 0xa0, 0xcf, 0xd8, 0xfd, 0x4f, 0xb1, 0x1f,
++      0x60, 0x3a, 0x08, 0x47, 0xaf, 0xe1, 0xf6, 0x10,
++      0x77, 0x09, 0xa7, 0x27, 0x8f, 0x9a, 0x97, 0x5a,
++      0x26, 0xfa, 0xfe, 0x41, 0x32, 0x83, 0x10, 0xe0,
++      0x1d, 0xbf, 0x64, 0x0d, 0xf4, 0x1c, 0x32, 0x35,
++      0xe5, 0x1b, 0x36, 0xef, 0xd4, 0x4a, 0x93, 0x4d,
++      0x00, 0x7c, 0xec, 0x02, 0x07, 0x8b, 0x5d, 0x7d,
++      0x1b, 0x0e, 0xd1, 0xa6, 0xa5, 0x5d, 0x7d, 0x57,
++      0x88, 0xa8, 0xcc, 0x81, 0xb4, 0x86, 0x4e, 0xb4,
++      0x40, 0xe9, 0x1d, 0xc3, 0xb1, 0x24, 0x3e, 0x7f,
++      0xcc, 0x8a, 0x24, 0x9b, 0xdf, 0x6d, 0xf0, 0x39,
++      0x69, 0x3e, 0x4c, 0xc0, 0x96, 0xe4, 0x13, 0xda,
++      0x90, 0xda, 0xf4, 0x95, 0x66, 0x8b, 0x17, 0x17,
++      0xfe, 0x39, 0x43, 0x25, 0xaa, 0xda, 0xa0, 0x43,
++      0x3c, 0xb1, 0x41, 0x02, 0xa3, 0xf0, 0xa7, 0x19,
++      0x59, 0xbc, 0x1d, 0x7d, 0x6c, 0x6d, 0x91, 0x09,
++      0x5c, 0xb7, 0x5b, 0x01, 0xd1, 0x6f, 0x17, 0x21,
++      0x97, 0xbf, 0x89, 0x71, 0xa5, 0xb0, 0x6e, 0x07,
++      0x45, 0xfd, 0x9d, 0xea, 0x07, 0xf6, 0x7a, 0x9f,
++      0x10, 0x18, 0x22, 0x30, 0x73, 0xac, 0xd4, 0x6b,
++      0x72, 0x44, 0xed, 0xd9, 0x19, 0x9b, 0x2d, 0x4a,
++      0x41, 0xdd, 0xd1, 0x85, 0x5e, 0x37, 0x19, 0xed,
++      0xd2, 0x15, 0x8f, 0x5e, 0x91, 0xdb, 0x33, 0xf2,
++      0xe4, 0xdb, 0xff, 0x98, 0xfb, 0xa3, 0xb5, 0xca,
++      0x21, 0x69, 0x08, 0xe7, 0x8a, 0xdf, 0x90, 0xff,
++      0x3e, 0xe9, 0x20, 0x86, 0x3c, 0xe9, 0xfc, 0x0b,
++      0xfe, 0x5c, 0x61, 0xaa, 0x13, 0x92, 0x7f, 0x7b,
++      0xec, 0xe0, 0x6d, 0xa8, 0x23, 0x22, 0xf6, 0x6b,
++      0x77, 0xc4, 0xfe, 0x40, 0x07, 0x3b, 0xb6, 0xf6,
++      0x8e, 0x5f, 0xd4, 0xb9, 0xb7, 0x0f, 0x21, 0x04,
++      0xef, 0x83, 0x63, 0x91, 0x69, 0x40, 0xa3, 0x48,
++      0x5c, 0xd2, 0x60, 0xf9, 0x4f, 0x6c, 0x47, 0x8b,
++      0x3b, 0xb1, 0x9f, 0x8e, 0xee, 0x16, 0x8a, 0x13,
++      0xfc, 0x46, 0x17, 0xc3, 0xc3, 0x32, 0x56, 0xf8,
++      0x3c, 0x85, 0x3a, 0xb6, 0x3e, 0xaa, 0x89, 0x4f,
++      0xb3, 0xdf, 0x38, 0xfd, 0xf1, 0xe4, 0x3a, 0xc0,
++      0xe6, 0x58, 0xb5, 0x8f, 0xc5, 0x29, 0xa2, 0x92,
++      0x4a, 0xb6, 0xa0, 0x34, 0x7f, 0xab, 0xb5, 0x8a,
++      0x90, 0xa1, 0xdb, 0x4d, 0xca, 0xb6, 0x2c, 0x41,
++      0x3c, 0xf7, 0x2b, 0x21, 0xc3, 0xfd, 0xf4, 0x17,
++      0x5c, 0xb5, 0x33, 0x17, 0x68, 0x2b, 0x08, 0x30,
++      0xf3, 0xf7, 0x30, 0x3c, 0x96, 0xe6, 0x6a, 0x20,
++      0x97, 0xe7, 0x4d, 0x10, 0x5f, 0x47, 0x5f, 0x49,
++      0x96, 0x09, 0xf0, 0x27, 0x91, 0xc8, 0xf8, 0x5a,
++      0x2e, 0x79, 0xb5, 0xe2, 0xb8, 0xe8, 0xb9, 0x7b,
++      0xd5, 0x10, 0xcb, 0xff, 0x5d, 0x14, 0x73, 0xf3
++};
++static const u8 dec_assoc008[] __initconst = { };
++static const u8 dec_nonce008[] __initconst = {
++      0x0e, 0x0d, 0x57, 0xbb, 0x7b, 0x40, 0x54, 0x02
++};
++static const u8 dec_key008[] __initconst = {
++      0xf2, 0xaa, 0x4f, 0x99, 0xfd, 0x3e, 0xa8, 0x53,
++      0xc1, 0x44, 0xe9, 0x81, 0x18, 0xdc, 0xf5, 0xf0,
++      0x3e, 0x44, 0x15, 0x59, 0xe0, 0xc5, 0x44, 0x86,
++      0xc3, 0x91, 0xa8, 0x75, 0xc0, 0x12, 0x46, 0xba
++};
++
++static const u8 dec_input009[] __initconst = {
++      0xfd, 0x81, 0x8d, 0xd0, 0x3d, 0xb4, 0xd5, 0xdf,
++      0xd3, 0x42, 0x47, 0x5a, 0x6d, 0x19, 0x27, 0x66,
++      0x4b, 0x2e, 0x0c, 0x27, 0x9c, 0x96, 0x4c, 0x72,
++      0x02, 0xa3, 0x65, 0xc3, 0xb3, 0x6f, 0x2e, 0xbd,
++      0x63, 0x8a, 0x4a, 0x5d, 0x29, 0xa2, 0xd0, 0x28,
++      0x48, 0xc5, 0x3d, 0x98, 0xa3, 0xbc, 0xe0, 0xbe,
++      0x3b, 0x3f, 0xe6, 0x8a, 0xa4, 0x7f, 0x53, 0x06,
++      0xfa, 0x7f, 0x27, 0x76, 0x72, 0x31, 0xa1, 0xf5,
++      0xd6, 0x0c, 0x52, 0x47, 0xba, 0xcd, 0x4f, 0xd7,
++      0xeb, 0x05, 0x48, 0x0d, 0x7c, 0x35, 0x4a, 0x09,
++      0xc9, 0x76, 0x71, 0x02, 0xa3, 0xfb, 0xb7, 0x1a,
++      0x65, 0xb7, 0xed, 0x98, 0xc6, 0x30, 0x8a, 0x00,
++      0xae, 0xa1, 0x31, 0xe5, 0xb5, 0x9e, 0x6d, 0x62,
++      0xda, 0xda, 0x07, 0x0f, 0x38, 0x38, 0xd3, 0xcb,
++      0xc1, 0xb0, 0xad, 0xec, 0x72, 0xec, 0xb1, 0xa2,
++      0x7b, 0x59, 0xf3, 0x3d, 0x2b, 0xef, 0xcd, 0x28,
++      0x5b, 0x83, 0xcc, 0x18, 0x91, 0x88, 0xb0, 0x2e,
++      0xf9, 0x29, 0x31, 0x18, 0xf9, 0x4e, 0xe9, 0x0a,
++      0x91, 0x92, 0x9f, 0xae, 0x2d, 0xad, 0xf4, 0xe6,
++      0x1a, 0xe2, 0xa4, 0xee, 0x47, 0x15, 0xbf, 0x83,
++      0x6e, 0xd7, 0x72, 0x12, 0x3b, 0x2d, 0x24, 0xe9,
++      0xb2, 0x55, 0xcb, 0x3c, 0x10, 0xf0, 0x24, 0x8a,
++      0x4a, 0x02, 0xea, 0x90, 0x25, 0xf0, 0xb4, 0x79,
++      0x3a, 0xef, 0x6e, 0xf5, 0x52, 0xdf, 0xb0, 0x0a,
++      0xcd, 0x24, 0x1c, 0xd3, 0x2e, 0x22, 0x74, 0xea,
++      0x21, 0x6f, 0xe9, 0xbd, 0xc8, 0x3e, 0x36, 0x5b,
++      0x19, 0xf1, 0xca, 0x99, 0x0a, 0xb4, 0xa7, 0x52,
++      0x1a, 0x4e, 0xf2, 0xad, 0x8d, 0x56, 0x85, 0xbb,
++      0x64, 0x89, 0xba, 0x26, 0xf9, 0xc7, 0xe1, 0x89,
++      0x19, 0x22, 0x77, 0xc3, 0xa8, 0xfc, 0xff, 0xad,
++      0xfe, 0xb9, 0x48, 0xae, 0x12, 0x30, 0x9f, 0x19,
++      0xfb, 0x1b, 0xef, 0x14, 0x87, 0x8a, 0x78, 0x71,
++      0xf3, 0xf4, 0xb7, 0x00, 0x9c, 0x1d, 0xb5, 0x3d,
++      0x49, 0x00, 0x0c, 0x06, 0xd4, 0x50, 0xf9, 0x54,
++      0x45, 0xb2, 0x5b, 0x43, 0xdb, 0x6d, 0xcf, 0x1a,
++      0xe9, 0x7a, 0x7a, 0xcf, 0xfc, 0x8a, 0x4e, 0x4d,
++      0x0b, 0x07, 0x63, 0x28, 0xd8, 0xe7, 0x08, 0x95,
++      0xdf, 0xa6, 0x72, 0x93, 0x2e, 0xbb, 0xa0, 0x42,
++      0x89, 0x16, 0xf1, 0xd9, 0x0c, 0xf9, 0xa1, 0x16,
++      0xfd, 0xd9, 0x03, 0xb4, 0x3b, 0x8a, 0xf5, 0xf6,
++      0xe7, 0x6b, 0x2e, 0x8e, 0x4c, 0x3d, 0xe2, 0xaf,
++      0x08, 0x45, 0x03, 0xff, 0x09, 0xb6, 0xeb, 0x2d,
++      0xc6, 0x1b, 0x88, 0x94, 0xac, 0x3e, 0xf1, 0x9f,
++      0x0e, 0x0e, 0x2b, 0xd5, 0x00, 0x4d, 0x3f, 0x3b,
++      0x53, 0xae, 0xaf, 0x1c, 0x33, 0x5f, 0x55, 0x6e,
++      0x8d, 0xaf, 0x05, 0x7a, 0x10, 0x34, 0xc9, 0xf4,
++      0x66, 0xcb, 0x62, 0x12, 0xa6, 0xee, 0xe8, 0x1c,
++      0x5d, 0x12, 0x86, 0xdb, 0x6f, 0x1c, 0x33, 0xc4,
++      0x1c, 0xda, 0x82, 0x2d, 0x3b, 0x59, 0xfe, 0xb1,
++      0xa4, 0x59, 0x41, 0x86, 0xd0, 0xef, 0xae, 0xfb,
++      0xda, 0x6d, 0x11, 0xb8, 0xca, 0xe9, 0x6e, 0xff,
++      0xf7, 0xa9, 0xd9, 0x70, 0x30, 0xfc, 0x53, 0xe2,
++      0xd7, 0xa2, 0x4e, 0xc7, 0x91, 0xd9, 0x07, 0x06,
++      0xaa, 0xdd, 0xb0, 0x59, 0x28, 0x1d, 0x00, 0x66,
++      0xc5, 0x54, 0xc2, 0xfc, 0x06, 0xda, 0x05, 0x90,
++      0x52, 0x1d, 0x37, 0x66, 0xee, 0xf0, 0xb2, 0x55,
++      0x8a, 0x5d, 0xd2, 0x38, 0x86, 0x94, 0x9b, 0xfc,
++      0x10, 0x4c, 0xa1, 0xb9, 0x64, 0x3e, 0x44, 0xb8,
++      0x5f, 0xb0, 0x0c, 0xec, 0xe0, 0xc9, 0xe5, 0x62,
++      0x75, 0x3f, 0x09, 0xd5, 0xf5, 0xd9, 0x26, 0xba,
++      0x9e, 0xd2, 0xf4, 0xb9, 0x48, 0x0a, 0xbc, 0xa2,
++      0xd6, 0x7c, 0x36, 0x11, 0x7d, 0x26, 0x81, 0x89,
++      0xcf, 0xa4, 0xad, 0x73, 0x0e, 0xee, 0xcc, 0x06,
++      0xa9, 0xdb, 0xb1, 0xfd, 0xfb, 0x09, 0x7f, 0x90,
++      0x42, 0x37, 0x2f, 0xe1, 0x9c, 0x0f, 0x6f, 0xcf,
++      0x43, 0xb5, 0xd9, 0x90, 0xe1, 0x85, 0xf5, 0xa8,
++      0xae
++};
++static const u8 dec_output009[] __initconst = {
++      0xe6, 0xc3, 0xdb, 0x63, 0x55, 0x15, 0xe3, 0x5b,
++      0xb7, 0x4b, 0x27, 0x8b, 0x5a, 0xdd, 0xc2, 0xe8,
++      0x3a, 0x6b, 0xd7, 0x81, 0x96, 0x35, 0x97, 0xca,
++      0xd7, 0x68, 0xe8, 0xef, 0xce, 0xab, 0xda, 0x09,
++      0x6e, 0xd6, 0x8e, 0xcb, 0x55, 0xb5, 0xe1, 0xe5,
++      0x57, 0xfd, 0xc4, 0xe3, 0xe0, 0x18, 0x4f, 0x85,
++      0xf5, 0x3f, 0x7e, 0x4b, 0x88, 0xc9, 0x52, 0x44,
++      0x0f, 0xea, 0xaf, 0x1f, 0x71, 0x48, 0x9f, 0x97,
++      0x6d, 0xb9, 0x6f, 0x00, 0xa6, 0xde, 0x2b, 0x77,
++      0x8b, 0x15, 0xad, 0x10, 0xa0, 0x2b, 0x7b, 0x41,
++      0x90, 0x03, 0x2d, 0x69, 0xae, 0xcc, 0x77, 0x7c,
++      0xa5, 0x9d, 0x29, 0x22, 0xc2, 0xea, 0xb4, 0x00,
++      0x1a, 0xd2, 0x7a, 0x98, 0x8a, 0xf9, 0xf7, 0x82,
++      0xb0, 0xab, 0xd8, 0xa6, 0x94, 0x8d, 0x58, 0x2f,
++      0x01, 0x9e, 0x00, 0x20, 0xfc, 0x49, 0xdc, 0x0e,
++      0x03, 0xe8, 0x45, 0x10, 0xd6, 0xa8, 0xda, 0x55,
++      0x10, 0x9a, 0xdf, 0x67, 0x22, 0x8b, 0x43, 0xab,
++      0x00, 0xbb, 0x02, 0xc8, 0xdd, 0x7b, 0x97, 0x17,
++      0xd7, 0x1d, 0x9e, 0x02, 0x5e, 0x48, 0xde, 0x8e,
++      0xcf, 0x99, 0x07, 0x95, 0x92, 0x3c, 0x5f, 0x9f,
++      0xc5, 0x8a, 0xc0, 0x23, 0xaa, 0xd5, 0x8c, 0x82,
++      0x6e, 0x16, 0x92, 0xb1, 0x12, 0x17, 0x07, 0xc3,
++      0xfb, 0x36, 0xf5, 0x6c, 0x35, 0xd6, 0x06, 0x1f,
++      0x9f, 0xa7, 0x94, 0xa2, 0x38, 0x63, 0x9c, 0xb0,
++      0x71, 0xb3, 0xa5, 0xd2, 0xd8, 0xba, 0x9f, 0x08,
++      0x01, 0xb3, 0xff, 0x04, 0x97, 0x73, 0x45, 0x1b,
++      0xd5, 0xa9, 0x9c, 0x80, 0xaf, 0x04, 0x9a, 0x85,
++      0xdb, 0x32, 0x5b, 0x5d, 0x1a, 0xc1, 0x36, 0x28,
++      0x10, 0x79, 0xf1, 0x3c, 0xbf, 0x1a, 0x41, 0x5c,
++      0x4e, 0xdf, 0xb2, 0x7c, 0x79, 0x3b, 0x7a, 0x62,
++      0x3d, 0x4b, 0xc9, 0x9b, 0x2a, 0x2e, 0x7c, 0xa2,
++      0xb1, 0x11, 0x98, 0xa7, 0x34, 0x1a, 0x00, 0xf3,
++      0xd1, 0xbc, 0x18, 0x22, 0xba, 0x02, 0x56, 0x62,
++      0x31, 0x10, 0x11, 0x6d, 0xe0, 0x54, 0x9d, 0x40,
++      0x1f, 0x26, 0x80, 0x41, 0xca, 0x3f, 0x68, 0x0f,
++      0x32, 0x1d, 0x0a, 0x8e, 0x79, 0xd8, 0xa4, 0x1b,
++      0x29, 0x1c, 0x90, 0x8e, 0xc5, 0xe3, 0xb4, 0x91,
++      0x37, 0x9a, 0x97, 0x86, 0x99, 0xd5, 0x09, 0xc5,
++      0xbb, 0xa3, 0x3f, 0x21, 0x29, 0x82, 0x14, 0x5c,
++      0xab, 0x25, 0xfb, 0xf2, 0x4f, 0x58, 0x26, 0xd4,
++      0x83, 0xaa, 0x66, 0x89, 0x67, 0x7e, 0xc0, 0x49,
++      0xe1, 0x11, 0x10, 0x7f, 0x7a, 0xda, 0x29, 0x04,
++      0xff, 0xf0, 0xcb, 0x09, 0x7c, 0x9d, 0xfa, 0x03,
++      0x6f, 0x81, 0x09, 0x31, 0x60, 0xfb, 0x08, 0xfa,
++      0x74, 0xd3, 0x64, 0x44, 0x7c, 0x55, 0x85, 0xec,
++      0x9c, 0x6e, 0x25, 0xb7, 0x6c, 0xc5, 0x37, 0xb6,
++      0x83, 0x87, 0x72, 0x95, 0x8b, 0x9d, 0xe1, 0x69,
++      0x5c, 0x31, 0x95, 0x42, 0xa6, 0x2c, 0xd1, 0x36,
++      0x47, 0x1f, 0xec, 0x54, 0xab, 0xa2, 0x1c, 0xd8,
++      0x00, 0xcc, 0xbc, 0x0d, 0x65, 0xe2, 0x67, 0xbf,
++      0xbc, 0xea, 0xee, 0x9e, 0xe4, 0x36, 0x95, 0xbe,
++      0x73, 0xd9, 0xa6, 0xd9, 0x0f, 0xa0, 0xcc, 0x82,
++      0x76, 0x26, 0xad, 0x5b, 0x58, 0x6c, 0x4e, 0xab,
++      0x29, 0x64, 0xd3, 0xd9, 0xa9, 0x08, 0x8c, 0x1d,
++      0xa1, 0x4f, 0x80, 0xd8, 0x3f, 0x94, 0xfb, 0xd3,
++      0x7b, 0xfc, 0xd1, 0x2b, 0xc3, 0x21, 0xeb, 0xe5,
++      0x1c, 0x84, 0x23, 0x7f, 0x4b, 0xfa, 0xdb, 0x34,
++      0x18, 0xa2, 0xc2, 0xe5, 0x13, 0xfe, 0x6c, 0x49,
++      0x81, 0xd2, 0x73, 0xe7, 0xe2, 0xd7, 0xe4, 0x4f,
++      0x4b, 0x08, 0x6e, 0xb1, 0x12, 0x22, 0x10, 0x9d,
++      0xac, 0x51, 0x1e, 0x17, 0xd9, 0x8a, 0x0b, 0x42,
++      0x88, 0x16, 0x81, 0x37, 0x7c, 0x6a, 0xf7, 0xef,
++      0x2d, 0xe3, 0xd9, 0xf8, 0x5f, 0xe0, 0x53, 0x27,
++      0x74, 0xb9, 0xe2, 0xd6, 0x1c, 0x80, 0x2c, 0x52,
++      0x65
++};
++static const u8 dec_assoc009[] __initconst = {
++      0x5a, 0x27, 0xff, 0xeb, 0xdf, 0x84, 0xb2, 0x9e,
++      0xef
++};
++static const u8 dec_nonce009[] __initconst = {
++      0xef, 0x2d, 0x63, 0xee, 0x6b, 0x80, 0x8b, 0x78
++};
++static const u8 dec_key009[] __initconst = {
++      0xea, 0xbc, 0x56, 0x99, 0xe3, 0x50, 0xff, 0xc5,
++      0xcc, 0x1a, 0xd7, 0xc1, 0x57, 0x72, 0xea, 0x86,
++      0x5b, 0x89, 0x88, 0x61, 0x3d, 0x2f, 0x9b, 0xb2,
++      0xe7, 0x9c, 0xec, 0x74, 0x6e, 0x3e, 0xf4, 0x3b
++};
++
++static const u8 dec_input010[] __initconst = {
++      0xe5, 0x26, 0xa4, 0x3d, 0xbd, 0x33, 0xd0, 0x4b,
++      0x6f, 0x05, 0xa7, 0x6e, 0x12, 0x7a, 0xd2, 0x74,
++      0xa6, 0xdd, 0xbd, 0x95, 0xeb, 0xf9, 0xa4, 0xf1,
++      0x59, 0x93, 0x91, 0x70, 0xd9, 0xfe, 0x9a, 0xcd,
++      0x53, 0x1f, 0x3a, 0xab, 0xa6, 0x7c, 0x9f, 0xa6,
++      0x9e, 0xbd, 0x99, 0xd9, 0xb5, 0x97, 0x44, 0xd5,
++      0x14, 0x48, 0x4d, 0x9d, 0xc0, 0xd0, 0x05, 0x96,
++      0xeb, 0x4c, 0x78, 0x55, 0x09, 0x08, 0x01, 0x02,
++      0x30, 0x90, 0x7b, 0x96, 0x7a, 0x7b, 0x5f, 0x30,
++      0x41, 0x24, 0xce, 0x68, 0x61, 0x49, 0x86, 0x57,
++      0x82, 0xdd, 0x53, 0x1c, 0x51, 0x28, 0x2b, 0x53,
++      0x6e, 0x2d, 0xc2, 0x20, 0x4c, 0xdd, 0x8f, 0x65,
++      0x10, 0x20, 0x50, 0xdd, 0x9d, 0x50, 0xe5, 0x71,
++      0x40, 0x53, 0x69, 0xfc, 0x77, 0x48, 0x11, 0xb9,
++      0xde, 0xa4, 0x8d, 0x58, 0xe4, 0xa6, 0x1a, 0x18,
++      0x47, 0x81, 0x7e, 0xfc, 0xdd, 0xf6, 0xef, 0xce,
++      0x2f, 0x43, 0x68, 0xd6, 0x06, 0xe2, 0x74, 0x6a,
++      0xad, 0x90, 0xf5, 0x37, 0xf3, 0x3d, 0x82, 0x69,
++      0x40, 0xe9, 0x6b, 0xa7, 0x3d, 0xa8, 0x1e, 0xd2,
++      0x02, 0x7c, 0xb7, 0x9b, 0xe4, 0xda, 0x8f, 0x95,
++      0x06, 0xc5, 0xdf, 0x73, 0xa3, 0x20, 0x9a, 0x49,
++      0xde, 0x9c, 0xbc, 0xee, 0x14, 0x3f, 0x81, 0x5e,
++      0xf8, 0x3b, 0x59, 0x3c, 0xe1, 0x68, 0x12, 0x5a,
++      0x3a, 0x76, 0x3a, 0x3f, 0xf7, 0x87, 0x33, 0x0a,
++      0x01, 0xb8, 0xd4, 0xed, 0xb6, 0xbe, 0x94, 0x5e,
++      0x70, 0x40, 0x56, 0x67, 0x1f, 0x50, 0x44, 0x19,
++      0xce, 0x82, 0x70, 0x10, 0x87, 0x13, 0x20, 0x0b,
++      0x4c, 0x5a, 0xb6, 0xf6, 0xa7, 0xae, 0x81, 0x75,
++      0x01, 0x81, 0xe6, 0x4b, 0x57, 0x7c, 0xdd, 0x6d,
++      0xf8, 0x1c, 0x29, 0x32, 0xf7, 0xda, 0x3c, 0x2d,
++      0xf8, 0x9b, 0x25, 0x6e, 0x00, 0xb4, 0xf7, 0x2f,
++      0xf7, 0x04, 0xf7, 0xa1, 0x56, 0xac, 0x4f, 0x1a,
++      0x64, 0xb8, 0x47, 0x55, 0x18, 0x7b, 0x07, 0x4d,
++      0xbd, 0x47, 0x24, 0x80, 0x5d, 0xa2, 0x70, 0xc5,
++      0xdd, 0x8e, 0x82, 0xd4, 0xeb, 0xec, 0xb2, 0x0c,
++      0x39, 0xd2, 0x97, 0xc1, 0xcb, 0xeb, 0xf4, 0x77,
++      0x59, 0xb4, 0x87, 0xef, 0xcb, 0x43, 0x2d, 0x46,
++      0x54, 0xd1, 0xa7, 0xd7, 0x15, 0x99, 0x0a, 0x43,
++      0xa1, 0xe0, 0x99, 0x33, 0x71, 0xc1, 0xed, 0xfe,
++      0x72, 0x46, 0x33, 0x8e, 0x91, 0x08, 0x9f, 0xc8,
++      0x2e, 0xca, 0xfa, 0xdc, 0x59, 0xd5, 0xc3, 0x76,
++      0x84, 0x9f, 0xa3, 0x37, 0x68, 0xc3, 0xf0, 0x47,
++      0x2c, 0x68, 0xdb, 0x5e, 0xc3, 0x49, 0x4c, 0xe8,
++      0x92, 0x85, 0xe2, 0x23, 0xd3, 0x3f, 0xad, 0x32,
++      0xe5, 0x2b, 0x82, 0xd7, 0x8f, 0x99, 0x0a, 0x59,
++      0x5c, 0x45, 0xd9, 0xb4, 0x51, 0x52, 0xc2, 0xae,
++      0xbf, 0x80, 0xcf, 0xc9, 0xc9, 0x51, 0x24, 0x2a,
++      0x3b, 0x3a, 0x4d, 0xae, 0xeb, 0xbd, 0x22, 0xc3,
++      0x0e, 0x0f, 0x59, 0x25, 0x92, 0x17, 0xe9, 0x74,
++      0xc7, 0x8b, 0x70, 0x70, 0x36, 0x55, 0x95, 0x75,
++      0x4b, 0xad, 0x61, 0x2b, 0x09, 0xbc, 0x82, 0xf2,
++      0x6e, 0x94, 0x43, 0xae, 0xc3, 0xd5, 0xcd, 0x8e,
++      0xfe, 0x5b, 0x9a, 0x88, 0x43, 0x01, 0x75, 0xb2,
++      0x23, 0x09, 0xf7, 0x89, 0x83, 0xe7, 0xfa, 0xf9,
++      0xb4, 0x9b, 0xf8, 0xef, 0xbd, 0x1c, 0x92, 0xc1,
++      0xda, 0x7e, 0xfe, 0x05, 0xba, 0x5a, 0xcd, 0x07,
++      0x6a, 0x78, 0x9e, 0x5d, 0xfb, 0x11, 0x2f, 0x79,
++      0x38, 0xb6, 0xc2, 0x5b, 0x6b, 0x51, 0xb4, 0x71,
++      0xdd, 0xf7, 0x2a, 0xe4, 0xf4, 0x72, 0x76, 0xad,
++      0xc2, 0xdd, 0x64, 0x5d, 0x79, 0xb6, 0xf5, 0x7a,
++      0x77, 0x20, 0x05, 0x3d, 0x30, 0x06, 0xd4, 0x4c,
++      0x0a, 0x2c, 0x98, 0x5a, 0xb9, 0xd4, 0x98, 0xa9,
++      0x3f, 0xc6, 0x12, 0xea, 0x3b, 0x4b, 0xc5, 0x79,
++      0x64, 0x63, 0x6b, 0x09, 0x54, 0x3b, 0x14, 0x27,
++      0xba, 0x99, 0x80, 0xc8, 0x72, 0xa8, 0x12, 0x90,
++      0x29, 0xba, 0x40, 0x54, 0x97, 0x2b, 0x7b, 0xfe,
++      0xeb, 0xcd, 0x01, 0x05, 0x44, 0x72, 0xdb, 0x99,
++      0xe4, 0x61, 0xc9, 0x69, 0xd6, 0xb9, 0x28, 0xd1,
++      0x05, 0x3e, 0xf9, 0x0b, 0x49, 0x0a, 0x49, 0xe9,
++      0x8d, 0x0e, 0xa7, 0x4a, 0x0f, 0xaf, 0x32, 0xd0,
++      0xe0, 0xb2, 0x3a, 0x55, 0x58, 0xfe, 0x5c, 0x28,
++      0x70, 0x51, 0x23, 0xb0, 0x7b, 0x6a, 0x5f, 0x1e,
++      0xb8, 0x17, 0xd7, 0x94, 0x15, 0x8f, 0xee, 0x20,
++      0xc7, 0x42, 0x25, 0x3e, 0x9a, 0x14, 0xd7, 0x60,
++      0x72, 0x39, 0x47, 0x48, 0xa9, 0xfe, 0xdd, 0x47,
++      0x0a, 0xb1, 0xe6, 0x60, 0x28, 0x8c, 0x11, 0x68,
++      0xe1, 0xff, 0xd7, 0xce, 0xc8, 0xbe, 0xb3, 0xfe,
++      0x27, 0x30, 0x09, 0x70, 0xd7, 0xfa, 0x02, 0x33,
++      0x3a, 0x61, 0x2e, 0xc7, 0xff, 0xa4, 0x2a, 0xa8,
++      0x6e, 0xb4, 0x79, 0x35, 0x6d, 0x4c, 0x1e, 0x38,
++      0xf8, 0xee, 0xd4, 0x84, 0x4e, 0x6e, 0x28, 0xa7,
++      0xce, 0xc8, 0xc1, 0xcf, 0x80, 0x05, 0xf3, 0x04,
++      0xef, 0xc8, 0x18, 0x28, 0x2e, 0x8d, 0x5e, 0x0c,
++      0xdf, 0xb8, 0x5f, 0x96, 0xe8, 0xc6, 0x9c, 0x2f,
++      0xe5, 0xa6, 0x44, 0xd7, 0xe7, 0x99, 0x44, 0x0c,
++      0xec, 0xd7, 0x05, 0x60, 0x97, 0xbb, 0x74, 0x77,
++      0x58, 0xd5, 0xbb, 0x48, 0xde, 0x5a, 0xb2, 0x54,
++      0x7f, 0x0e, 0x46, 0x70, 0x6a, 0x6f, 0x78, 0xa5,
++      0x08, 0x89, 0x05, 0x4e, 0x7e, 0xa0, 0x69, 0xb4,
++      0x40, 0x60, 0x55, 0x77, 0x75, 0x9b, 0x19, 0xf2,
++      0xd5, 0x13, 0x80, 0x77, 0xf9, 0x4b, 0x3f, 0x1e,
++      0xee, 0xe6, 0x76, 0x84, 0x7b, 0x8c, 0xe5, 0x27,
++      0xa8, 0x0a, 0x91, 0x01, 0x68, 0x71, 0x8a, 0x3f,
++      0x06, 0xab, 0xf6, 0xa9, 0xa5, 0xe6, 0x72, 0x92,
++      0xe4, 0x67, 0xe2, 0xa2, 0x46, 0x35, 0x84, 0x55,
++      0x7d, 0xca, 0xa8, 0x85, 0xd0, 0xf1, 0x3f, 0xbe,
++      0xd7, 0x34, 0x64, 0xfc, 0xae, 0xe3, 0xe4, 0x04,
++      0x9f, 0x66, 0x02, 0xb9, 0x88, 0x10, 0xd9, 0xc4,
++      0x4c, 0x31, 0x43, 0x7a, 0x93, 0xe2, 0x9b, 0x56,
++      0x43, 0x84, 0xdc, 0xdc, 0xde, 0x1d, 0xa4, 0x02,
++      0x0e, 0xc2, 0xef, 0xc3, 0xf8, 0x78, 0xd1, 0xb2,
++      0x6b, 0x63, 0x18, 0xc9, 0xa9, 0xe5, 0x72, 0xd8,
++      0xf3, 0xb9, 0xd1, 0x8a, 0xc7, 0x1a, 0x02, 0x27,
++      0x20, 0x77, 0x10, 0xe5, 0xc8, 0xd4, 0x4a, 0x47,
++      0xe5, 0xdf, 0x5f, 0x01, 0xaa, 0xb0, 0xd4, 0x10,
++      0xbb, 0x69, 0xe3, 0x36, 0xc8, 0xe1, 0x3d, 0x43,
++      0xfb, 0x86, 0xcd, 0xcc, 0xbf, 0xf4, 0x88, 0xe0,
++      0x20, 0xca, 0xb7, 0x1b, 0xf1, 0x2f, 0x5c, 0xee,
++      0xd4, 0xd3, 0xa3, 0xcc, 0xa4, 0x1e, 0x1c, 0x47,
++      0xfb, 0xbf, 0xfc, 0xa2, 0x41, 0x55, 0x9d, 0xf6,
++      0x5a, 0x5e, 0x65, 0x32, 0x34, 0x7b, 0x52, 0x8d,
++      0xd5, 0xd0, 0x20, 0x60, 0x03, 0xab, 0x3f, 0x8c,
++      0xd4, 0x21, 0xea, 0x2a, 0xd9, 0xc4, 0xd0, 0xd3,
++      0x65, 0xd8, 0x7a, 0x13, 0x28, 0x62, 0x32, 0x4b,
++      0x2c, 0x87, 0x93, 0xa8, 0xb4, 0x52, 0x45, 0x09,
++      0x44, 0xec, 0xec, 0xc3, 0x17, 0xdb, 0x9a, 0x4d,
++      0x5c, 0xa9, 0x11, 0xd4, 0x7d, 0xaf, 0x9e, 0xf1,
++      0x2d, 0xb2, 0x66, 0xc5, 0x1d, 0xed, 0xb7, 0xcd,
++      0x0b, 0x25, 0x5e, 0x30, 0x47, 0x3f, 0x40, 0xf4,
++      0xa1, 0xa0, 0x00, 0x94, 0x10, 0xc5, 0x6a, 0x63,
++      0x1a, 0xd5, 0x88, 0x92, 0x8e, 0x82, 0x39, 0x87,
++      0x3c, 0x78, 0x65, 0x58, 0x42, 0x75, 0x5b, 0xdd,
++      0x77, 0x3e, 0x09, 0x4e, 0x76, 0x5b, 0xe6, 0x0e,
++      0x4d, 0x38, 0xb2, 0xc0, 0xb8, 0x95, 0x01, 0x7a,
++      0x10, 0xe0, 0xfb, 0x07, 0xf2, 0xab, 0x2d, 0x8c,
++      0x32, 0xed, 0x2b, 0xc0, 0x46, 0xc2, 0xf5, 0x38,
++      0x83, 0xf0, 0x17, 0xec, 0xc1, 0x20, 0x6a, 0x9a,
++      0x0b, 0x00, 0xa0, 0x98, 0x22, 0x50, 0x23, 0xd5,
++      0x80, 0x6b, 0xf6, 0x1f, 0xc3, 0xcc, 0x97, 0xc9,
++      0x24, 0x9f, 0xf3, 0xaf, 0x43, 0x14, 0xd5, 0xa0
++};
++static const u8 dec_output010[] __initconst = {
++      0x42, 0x93, 0xe4, 0xeb, 0x97, 0xb0, 0x57, 0xbf,
++      0x1a, 0x8b, 0x1f, 0xe4, 0x5f, 0x36, 0x20, 0x3c,
++      0xef, 0x0a, 0xa9, 0x48, 0x5f, 0x5f, 0x37, 0x22,
++      0x3a, 0xde, 0xe3, 0xae, 0xbe, 0xad, 0x07, 0xcc,
++      0xb1, 0xf6, 0xf5, 0xf9, 0x56, 0xdd, 0xe7, 0x16,
++      0x1e, 0x7f, 0xdf, 0x7a, 0x9e, 0x75, 0xb7, 0xc7,
++      0xbe, 0xbe, 0x8a, 0x36, 0x04, 0xc0, 0x10, 0xf4,
++      0x95, 0x20, 0x03, 0xec, 0xdc, 0x05, 0xa1, 0x7d,
++      0xc4, 0xa9, 0x2c, 0x82, 0xd0, 0xbc, 0x8b, 0xc5,
++      0xc7, 0x45, 0x50, 0xf6, 0xa2, 0x1a, 0xb5, 0x46,
++      0x3b, 0x73, 0x02, 0xa6, 0x83, 0x4b, 0x73, 0x82,
++      0x58, 0x5e, 0x3b, 0x65, 0x2f, 0x0e, 0xfd, 0x2b,
++      0x59, 0x16, 0xce, 0xa1, 0x60, 0x9c, 0xe8, 0x3a,
++      0x99, 0xed, 0x8d, 0x5a, 0xcf, 0xf6, 0x83, 0xaf,
++      0xba, 0xd7, 0x73, 0x73, 0x40, 0x97, 0x3d, 0xca,
++      0xef, 0x07, 0x57, 0xe6, 0xd9, 0x70, 0x0e, 0x95,
++      0xae, 0xa6, 0x8d, 0x04, 0xcc, 0xee, 0xf7, 0x09,
++      0x31, 0x77, 0x12, 0xa3, 0x23, 0x97, 0x62, 0xb3,
++      0x7b, 0x32, 0xfb, 0x80, 0x14, 0x48, 0x81, 0xc3,
++      0xe5, 0xea, 0x91, 0x39, 0x52, 0x81, 0xa2, 0x4f,
++      0xe4, 0xb3, 0x09, 0xff, 0xde, 0x5e, 0xe9, 0x58,
++      0x84, 0x6e, 0xf9, 0x3d, 0xdf, 0x25, 0xea, 0xad,
++      0xae, 0xe6, 0x9a, 0xd1, 0x89, 0x55, 0xd3, 0xde,
++      0x6c, 0x52, 0xdb, 0x70, 0xfe, 0x37, 0xce, 0x44,
++      0x0a, 0xa8, 0x25, 0x5f, 0x92, 0xc1, 0x33, 0x4a,
++      0x4f, 0x9b, 0x62, 0x35, 0xff, 0xce, 0xc0, 0xa9,
++      0x60, 0xce, 0x52, 0x00, 0x97, 0x51, 0x35, 0x26,
++      0x2e, 0xb9, 0x36, 0xa9, 0x87, 0x6e, 0x1e, 0xcc,
++      0x91, 0x78, 0x53, 0x98, 0x86, 0x5b, 0x9c, 0x74,
++      0x7d, 0x88, 0x33, 0xe1, 0xdf, 0x37, 0x69, 0x2b,
++      0xbb, 0xf1, 0x4d, 0xf4, 0xd1, 0xf1, 0x39, 0x93,
++      0x17, 0x51, 0x19, 0xe3, 0x19, 0x1e, 0x76, 0x37,
++      0x25, 0xfb, 0x09, 0x27, 0x6a, 0xab, 0x67, 0x6f,
++      0x14, 0x12, 0x64, 0xe7, 0xc4, 0x07, 0xdf, 0x4d,
++      0x17, 0xbb, 0x6d, 0xe0, 0xe9, 0xb9, 0xab, 0xca,
++      0x10, 0x68, 0xaf, 0x7e, 0xb7, 0x33, 0x54, 0x73,
++      0x07, 0x6e, 0xf7, 0x81, 0x97, 0x9c, 0x05, 0x6f,
++      0x84, 0x5f, 0xd2, 0x42, 0xfb, 0x38, 0xcf, 0xd1,
++      0x2f, 0x14, 0x30, 0x88, 0x98, 0x4d, 0x5a, 0xa9,
++      0x76, 0xd5, 0x4f, 0x3e, 0x70, 0x6c, 0x85, 0x76,
++      0xd7, 0x01, 0xa0, 0x1a, 0xc8, 0x4e, 0xaa, 0xac,
++      0x78, 0xfe, 0x46, 0xde, 0x6a, 0x05, 0x46, 0xa7,
++      0x43, 0x0c, 0xb9, 0xde, 0xb9, 0x68, 0xfb, 0xce,
++      0x42, 0x99, 0x07, 0x4d, 0x0b, 0x3b, 0x5a, 0x30,
++      0x35, 0xa8, 0xf9, 0x3a, 0x73, 0xef, 0x0f, 0xdb,
++      0x1e, 0x16, 0x42, 0xc4, 0xba, 0xae, 0x58, 0xaa,
++      0xf8, 0xe5, 0x75, 0x2f, 0x1b, 0x15, 0x5c, 0xfd,
++      0x0a, 0x97, 0xd0, 0xe4, 0x37, 0x83, 0x61, 0x5f,
++      0x43, 0xa6, 0xc7, 0x3f, 0x38, 0x59, 0xe6, 0xeb,
++      0xa3, 0x90, 0xc3, 0xaa, 0xaa, 0x5a, 0xd3, 0x34,
++      0xd4, 0x17, 0xc8, 0x65, 0x3e, 0x57, 0xbc, 0x5e,
++      0xdd, 0x9e, 0xb7, 0xf0, 0x2e, 0x5b, 0xb2, 0x1f,
++      0x8a, 0x08, 0x0d, 0x45, 0x91, 0x0b, 0x29, 0x53,
++      0x4f, 0x4c, 0x5a, 0x73, 0x56, 0xfe, 0xaf, 0x41,
++      0x01, 0x39, 0x0a, 0x24, 0x3c, 0x7e, 0xbe, 0x4e,
++      0x53, 0xf3, 0xeb, 0x06, 0x66, 0x51, 0x28, 0x1d,
++      0xbd, 0x41, 0x0a, 0x01, 0xab, 0x16, 0x47, 0x27,
++      0x47, 0x47, 0xf7, 0xcb, 0x46, 0x0a, 0x70, 0x9e,
++      0x01, 0x9c, 0x09, 0xe1, 0x2a, 0x00, 0x1a, 0xd8,
++      0xd4, 0x79, 0x9d, 0x80, 0x15, 0x8e, 0x53, 0x2a,
++      0x65, 0x83, 0x78, 0x3e, 0x03, 0x00, 0x07, 0x12,
++      0x1f, 0x33, 0x3e, 0x7b, 0x13, 0x37, 0xf1, 0xc3,
++      0xef, 0xb7, 0xc1, 0x20, 0x3c, 0x3e, 0x67, 0x66,
++      0x5d, 0x88, 0xa7, 0x7d, 0x33, 0x50, 0x77, 0xb0,
++      0x28, 0x8e, 0xe7, 0x2c, 0x2e, 0x7a, 0xf4, 0x3c,
++      0x8d, 0x74, 0x83, 0xaf, 0x8e, 0x87, 0x0f, 0xe4,
++      0x50, 0xff, 0x84, 0x5c, 0x47, 0x0c, 0x6a, 0x49,
++      0xbf, 0x42, 0x86, 0x77, 0x15, 0x48, 0xa5, 0x90,
++      0x5d, 0x93, 0xd6, 0x2a, 0x11, 0xd5, 0xd5, 0x11,
++      0xaa, 0xce, 0xe7, 0x6f, 0xa5, 0xb0, 0x09, 0x2c,
++      0x8d, 0xd3, 0x92, 0xf0, 0x5a, 0x2a, 0xda, 0x5b,
++      0x1e, 0xd5, 0x9a, 0xc4, 0xc4, 0xf3, 0x49, 0x74,
++      0x41, 0xca, 0xe8, 0xc1, 0xf8, 0x44, 0xd6, 0x3c,
++      0xae, 0x6c, 0x1d, 0x9a, 0x30, 0x04, 0x4d, 0x27,
++      0x0e, 0xb1, 0x5f, 0x59, 0xa2, 0x24, 0xe8, 0xe1,
++      0x98, 0xc5, 0x6a, 0x4c, 0xfe, 0x41, 0xd2, 0x27,
++      0x42, 0x52, 0xe1, 0xe9, 0x7d, 0x62, 0xe4, 0x88,
++      0x0f, 0xad, 0xb2, 0x70, 0xcb, 0x9d, 0x4c, 0x27,
++      0x2e, 0x76, 0x1e, 0x1a, 0x63, 0x65, 0xf5, 0x3b,
++      0xf8, 0x57, 0x69, 0xeb, 0x5b, 0x38, 0x26, 0x39,
++      0x33, 0x25, 0x45, 0x3e, 0x91, 0xb8, 0xd8, 0xc7,
++      0xd5, 0x42, 0xc0, 0x22, 0x31, 0x74, 0xf4, 0xbc,
++      0x0c, 0x23, 0xf1, 0xca, 0xc1, 0x8d, 0xd7, 0xbe,
++      0xc9, 0x62, 0xe4, 0x08, 0x1a, 0xcf, 0x36, 0xd5,
++      0xfe, 0x55, 0x21, 0x59, 0x91, 0x87, 0x87, 0xdf,
++      0x06, 0xdb, 0xdf, 0x96, 0x45, 0x58, 0xda, 0x05,
++      0xcd, 0x50, 0x4d, 0xd2, 0x7d, 0x05, 0x18, 0x73,
++      0x6a, 0x8d, 0x11, 0x85, 0xa6, 0x88, 0xe8, 0xda,
++      0xe6, 0x30, 0x33, 0xa4, 0x89, 0x31, 0x75, 0xbe,
++      0x69, 0x43, 0x84, 0x43, 0x50, 0x87, 0xdd, 0x71,
++      0x36, 0x83, 0xc3, 0x78, 0x74, 0x24, 0x0a, 0xed,
++      0x7b, 0xdb, 0xa4, 0x24, 0x0b, 0xb9, 0x7e, 0x5d,
++      0xff, 0xde, 0xb1, 0xef, 0x61, 0x5a, 0x45, 0x33,
++      0xf6, 0x17, 0x07, 0x08, 0x98, 0x83, 0x92, 0x0f,
++      0x23, 0x6d, 0xe6, 0xaa, 0x17, 0x54, 0xad, 0x6a,
++      0xc8, 0xdb, 0x26, 0xbe, 0xb8, 0xb6, 0x08, 0xfa,
++      0x68, 0xf1, 0xd7, 0x79, 0x6f, 0x18, 0xb4, 0x9e,
++      0x2d, 0x3f, 0x1b, 0x64, 0xaf, 0x8d, 0x06, 0x0e,
++      0x49, 0x28, 0xe0, 0x5d, 0x45, 0x68, 0x13, 0x87,
++      0xfa, 0xde, 0x40, 0x7b, 0xd2, 0xc3, 0x94, 0xd5,
++      0xe1, 0xd9, 0xc2, 0xaf, 0x55, 0x89, 0xeb, 0xb4,
++      0x12, 0x59, 0xa8, 0xd4, 0xc5, 0x29, 0x66, 0x38,
++      0xe6, 0xac, 0x22, 0x22, 0xd9, 0x64, 0x9b, 0x34,
++      0x0a, 0x32, 0x9f, 0xc2, 0xbf, 0x17, 0x6c, 0x3f,
++      0x71, 0x7a, 0x38, 0x6b, 0x98, 0xfb, 0x49, 0x36,
++      0x89, 0xc9, 0xe2, 0xd6, 0xc7, 0x5d, 0xd0, 0x69,
++      0x5f, 0x23, 0x35, 0xc9, 0x30, 0xe2, 0xfd, 0x44,
++      0x58, 0x39, 0xd7, 0x97, 0xfb, 0x5c, 0x00, 0xd5,
++      0x4f, 0x7a, 0x1a, 0x95, 0x8b, 0x62, 0x4b, 0xce,
++      0xe5, 0x91, 0x21, 0x7b, 0x30, 0x00, 0xd6, 0xdd,
++      0x6d, 0x02, 0x86, 0x49, 0x0f, 0x3c, 0x1a, 0x27,
++      0x3c, 0xd3, 0x0e, 0x71, 0xf2, 0xff, 0xf5, 0x2f,
++      0x87, 0xac, 0x67, 0x59, 0x81, 0xa3, 0xf7, 0xf8,
++      0xd6, 0x11, 0x0c, 0x84, 0xa9, 0x03, 0xee, 0x2a,
++      0xc4, 0xf3, 0x22, 0xab, 0x7c, 0xe2, 0x25, 0xf5,
++      0x67, 0xa3, 0xe4, 0x11, 0xe0, 0x59, 0xb3, 0xca,
++      0x87, 0xa0, 0xae, 0xc9, 0xa6, 0x62, 0x1b, 0x6e,
++      0x4d, 0x02, 0x6b, 0x07, 0x9d, 0xfd, 0xd0, 0x92,
++      0x06, 0xe1, 0xb2, 0x9a, 0x4a, 0x1f, 0x1f, 0x13,
++      0x49, 0x99, 0x97, 0x08, 0xde, 0x7f, 0x98, 0xaf,
++      0x51, 0x98, 0xee, 0x2c, 0xcb, 0xf0, 0x0b, 0xc6,
++      0xb6, 0xb7, 0x2d, 0x9a, 0xb1, 0xac, 0xa6, 0xe3,
++      0x15, 0x77, 0x9d, 0x6b, 0x1a, 0xe4, 0xfc, 0x8b,
++      0xf2, 0x17, 0x59, 0x08, 0x04, 0x58, 0x81, 0x9d,
++      0x1b, 0x1b, 0x69, 0x55, 0xc2, 0xb4, 0x3c, 0x1f,
++      0x50, 0xf1, 0x7f, 0x77, 0x90, 0x4c, 0x66, 0x40,
++      0x5a, 0xc0, 0x33, 0x1f, 0xcb, 0x05, 0x6d, 0x5c,
++      0x06, 0x87, 0x52, 0xa2, 0x8f, 0x26, 0xd5, 0x4f
++};
++static const u8 dec_assoc010[] __initconst = {
++      0xd2, 0xa1, 0x70, 0xdb, 0x7a, 0xf8, 0xfa, 0x27,
++      0xba, 0x73, 0x0f, 0xbf, 0x3d, 0x1e, 0x82, 0xb2
++};
++static const u8 dec_nonce010[] __initconst = {
++      0xdb, 0x92, 0x0f, 0x7f, 0x17, 0x54, 0x0c, 0x30
++};
++static const u8 dec_key010[] __initconst = {
++      0x47, 0x11, 0xeb, 0x86, 0x2b, 0x2c, 0xab, 0x44,
++      0x34, 0xda, 0x7f, 0x57, 0x03, 0x39, 0x0c, 0xaf,
++      0x2c, 0x14, 0xfd, 0x65, 0x23, 0xe9, 0x8e, 0x74,
++      0xd5, 0x08, 0x68, 0x08, 0xe7, 0xb4, 0x72, 0xd7
++};
++
++static const u8 dec_input011[] __initconst = {
++      0x6a, 0xfc, 0x4b, 0x25, 0xdf, 0xc0, 0xe4, 0xe8,
++      0x17, 0x4d, 0x4c, 0xc9, 0x7e, 0xde, 0x3a, 0xcc,
++      0x3c, 0xba, 0x6a, 0x77, 0x47, 0xdb, 0xe3, 0x74,
++      0x7a, 0x4d, 0x5f, 0x8d, 0x37, 0x55, 0x80, 0x73,
++      0x90, 0x66, 0x5d, 0x3a, 0x7d, 0x5d, 0x86, 0x5e,
++      0x8d, 0xfd, 0x83, 0xff, 0x4e, 0x74, 0x6f, 0xf9,
++      0xe6, 0x70, 0x17, 0x70, 0x3e, 0x96, 0xa7, 0x7e,
++      0xcb, 0xab, 0x8f, 0x58, 0x24, 0x9b, 0x01, 0xfd,
++      0xcb, 0xe6, 0x4d, 0x9b, 0xf0, 0x88, 0x94, 0x57,
++      0x66, 0xef, 0x72, 0x4c, 0x42, 0x6e, 0x16, 0x19,
++      0x15, 0xea, 0x70, 0x5b, 0xac, 0x13, 0xdb, 0x9f,
++      0x18, 0xe2, 0x3c, 0x26, 0x97, 0xbc, 0xdc, 0x45,
++      0x8c, 0x6c, 0x24, 0x69, 0x9c, 0xf7, 0x65, 0x1e,
++      0x18, 0x59, 0x31, 0x7c, 0xe4, 0x73, 0xbc, 0x39,
++      0x62, 0xc6, 0x5c, 0x9f, 0xbf, 0xfa, 0x90, 0x03,
++      0xc9, 0x72, 0x26, 0xb6, 0x1b, 0xc2, 0xb7, 0x3f,
++      0xf2, 0x13, 0x77, 0xf2, 0x8d, 0xb9, 0x47, 0xd0,
++      0x53, 0xdd, 0xc8, 0x91, 0x83, 0x8b, 0xb1, 0xce,
++      0xa3, 0xfe, 0xcd, 0xd9, 0xdd, 0x92, 0x7b, 0xdb,
++      0xb8, 0xfb, 0xc9, 0x2d, 0x01, 0x59, 0x39, 0x52,
++      0xad, 0x1b, 0xec, 0xcf, 0xd7, 0x70, 0x13, 0x21,
++      0xf5, 0x47, 0xaa, 0x18, 0x21, 0x5c, 0xc9, 0x9a,
++      0xd2, 0x6b, 0x05, 0x9c, 0x01, 0xa1, 0xda, 0x35,
++      0x5d, 0xb3, 0x70, 0xe6, 0xa9, 0x80, 0x8b, 0x91,
++      0xb7, 0xb3, 0x5f, 0x24, 0x9a, 0xb7, 0xd1, 0x6b,
++      0xa1, 0x1c, 0x50, 0xba, 0x49, 0xe0, 0xee, 0x2e,
++      0x75, 0xac, 0x69, 0xc0, 0xeb, 0x03, 0xdd, 0x19,
++      0xe5, 0xf6, 0x06, 0xdd, 0xc3, 0xd7, 0x2b, 0x07,
++      0x07, 0x30, 0xa7, 0x19, 0x0c, 0xbf, 0xe6, 0x18,
++      0xcc, 0xb1, 0x01, 0x11, 0x85, 0x77, 0x1d, 0x96,
++      0xa7, 0xa3, 0x00, 0x84, 0x02, 0xa2, 0x83, 0x68,
++      0xda, 0x17, 0x27, 0xc8, 0x7f, 0x23, 0xb7, 0xf4,
++      0x13, 0x85, 0xcf, 0xdd, 0x7a, 0x7d, 0x24, 0x57,
++      0xfe, 0x05, 0x93, 0xf5, 0x74, 0xce, 0xed, 0x0c,
++      0x20, 0x98, 0x8d, 0x92, 0x30, 0xa1, 0x29, 0x23,
++      0x1a, 0xa0, 0x4f, 0x69, 0x56, 0x4c, 0xe1, 0xc8,
++      0xce, 0xf6, 0x9a, 0x0c, 0xa4, 0xfa, 0x04, 0xf6,
++      0x62, 0x95, 0xf2, 0xfa, 0xc7, 0x40, 0x68, 0x40,
++      0x8f, 0x41, 0xda, 0xb4, 0x26, 0x6f, 0x70, 0xab,
++      0x40, 0x61, 0xa4, 0x0e, 0x75, 0xfb, 0x86, 0xeb,
++      0x9d, 0x9a, 0x1f, 0xec, 0x76, 0x99, 0xe7, 0xea,
++      0xaa, 0x1e, 0x2d, 0xb5, 0xd4, 0xa6, 0x1a, 0xb8,
++      0x61, 0x0a, 0x1d, 0x16, 0x5b, 0x98, 0xc2, 0x31,
++      0x40, 0xe7, 0x23, 0x1d, 0x66, 0x99, 0xc8, 0xc0,
++      0xd7, 0xce, 0xf3, 0x57, 0x40, 0x04, 0x3f, 0xfc,
++      0xea, 0xb3, 0xfc, 0xd2, 0xd3, 0x99, 0xa4, 0x94,
++      0x69, 0xa0, 0xef, 0xd1, 0x85, 0xb3, 0xa6, 0xb1,
++      0x28, 0xbf, 0x94, 0x67, 0x22, 0xc3, 0x36, 0x46,
++      0xf8, 0xd2, 0x0f, 0x5f, 0xf4, 0x59, 0x80, 0xe6,
++      0x2d, 0x43, 0x08, 0x7d, 0x19, 0x09, 0x97, 0xa7,
++      0x4c, 0x3d, 0x8d, 0xba, 0x65, 0x62, 0xa3, 0x71,
++      0x33, 0x29, 0x62, 0xdb, 0xc1, 0x33, 0x34, 0x1a,
++      0x63, 0x33, 0x16, 0xb6, 0x64, 0x7e, 0xab, 0x33,
++      0xf0, 0xe6, 0x26, 0x68, 0xba, 0x1d, 0x2e, 0x38,
++      0x08, 0xe6, 0x02, 0xd3, 0x25, 0x2c, 0x47, 0x23,
++      0x58, 0x34, 0x0f, 0x9d, 0x63, 0x4f, 0x63, 0xbb,
++      0x7f, 0x3b, 0x34, 0x38, 0xa7, 0xb5, 0x8d, 0x65,
++      0xd9, 0x9f, 0x79, 0x55, 0x3e, 0x4d, 0xe7, 0x73,
++      0xd8, 0xf6, 0x98, 0x97, 0x84, 0x60, 0x9c, 0xc8,
++      0xa9, 0x3c, 0xf6, 0xdc, 0x12, 0x5c, 0xe1, 0xbb,
++      0x0b, 0x8b, 0x98, 0x9c, 0x9d, 0x26, 0x7c, 0x4a,
++      0xe6, 0x46, 0x36, 0x58, 0x21, 0x4a, 0xee, 0xca,
++      0xd7, 0x3b, 0xc2, 0x6c, 0x49, 0x2f, 0xe5, 0xd5,
++      0x03, 0x59, 0x84, 0x53, 0xcb, 0xfe, 0x92, 0x71,
++      0x2e, 0x7c, 0x21, 0xcc, 0x99, 0x85, 0x7f, 0xb8,
++      0x74, 0x90, 0x13, 0x42, 0x3f, 0xe0, 0x6b, 0x1d,
++      0xf2, 0x4d, 0x54, 0xd4, 0xfc, 0x3a, 0x05, 0xe6,
++      0x74, 0xaf, 0xa6, 0xa0, 0x2a, 0x20, 0x23, 0x5d,
++      0x34, 0x5c, 0xd9, 0x3e, 0x4e, 0xfa, 0x93, 0xe7,
++      0xaa, 0xe9, 0x6f, 0x08, 0x43, 0x67, 0x41, 0xc5,
++      0xad, 0xfb, 0x31, 0x95, 0x82, 0x73, 0x32, 0xd8,
++      0xa6, 0xa3, 0xed, 0x0e, 0x2d, 0xf6, 0x5f, 0xfd,
++      0x80, 0xa6, 0x7a, 0xe0, 0xdf, 0x78, 0x15, 0x29,
++      0x74, 0x33, 0xd0, 0x9e, 0x83, 0x86, 0x72, 0x22,
++      0x57, 0x29, 0xb9, 0x9e, 0x5d, 0xd3, 0x1a, 0xb5,
++      0x96, 0x72, 0x41, 0x3d, 0xf1, 0x64, 0x43, 0x67,
++      0xee, 0xaa, 0x5c, 0xd3, 0x9a, 0x96, 0x13, 0x11,
++      0x5d, 0xf3, 0x0c, 0x87, 0x82, 0x1e, 0x41, 0x9e,
++      0xd0, 0x27, 0xd7, 0x54, 0x3b, 0x67, 0x73, 0x09,
++      0x91, 0xe9, 0xd5, 0x36, 0xa7, 0xb5, 0x55, 0xe4,
++      0xf3, 0x21, 0x51, 0x49, 0x22, 0x07, 0x55, 0x4f,
++      0x44, 0x4b, 0xd2, 0x15, 0x93, 0x17, 0x2a, 0xfa,
++      0x4d, 0x4a, 0x57, 0xdb, 0x4c, 0xa6, 0xeb, 0xec,
++      0x53, 0x25, 0x6c, 0x21, 0xed, 0x00, 0x4c, 0x3b,
++      0xca, 0x14, 0x57, 0xa9, 0xd6, 0x6a, 0xcd, 0x8d,
++      0x5e, 0x74, 0xac, 0x72, 0xc1, 0x97, 0xe5, 0x1b,
++      0x45, 0x4e, 0xda, 0xfc, 0xcc, 0x40, 0xe8, 0x48,
++      0x88, 0x0b, 0xa3, 0xe3, 0x8d, 0x83, 0x42, 0xc3,
++      0x23, 0xfd, 0x68, 0xb5, 0x8e, 0xf1, 0x9d, 0x63,
++      0x77, 0xe9, 0xa3, 0x8e, 0x8c, 0x26, 0x6b, 0xbd,
++      0x72, 0x73, 0x35, 0x0c, 0x03, 0xf8, 0x43, 0x78,
++      0x52, 0x71, 0x15, 0x1f, 0x71, 0x5d, 0x6e, 0xed,
++      0xb9, 0xcc, 0x86, 0x30, 0xdb, 0x2b, 0xd3, 0x82,
++      0x88, 0x23, 0x71, 0x90, 0x53, 0x5c, 0xa9, 0x2f,
++      0x76, 0x01, 0xb7, 0x9a, 0xfe, 0x43, 0x55, 0xa3,
++      0x04, 0x9b, 0x0e, 0xe4, 0x59, 0xdf, 0xc9, 0xe9,
++      0xb1, 0xea, 0x29, 0x28, 0x3c, 0x5c, 0xae, 0x72,
++      0x84, 0xb6, 0xc6, 0xeb, 0x0c, 0x27, 0x07, 0x74,
++      0x90, 0x0d, 0x31, 0xb0, 0x00, 0x77, 0xe9, 0x40,
++      0x70, 0x6f, 0x68, 0xa7, 0xfd, 0x06, 0xec, 0x4b,
++      0xc0, 0xb7, 0xac, 0xbc, 0x33, 0xb7, 0x6d, 0x0a,
++      0xbd, 0x12, 0x1b, 0x59, 0xcb, 0xdd, 0x32, 0xf5,
++      0x1d, 0x94, 0x57, 0x76, 0x9e, 0x0c, 0x18, 0x98,
++      0x71, 0xd7, 0x2a, 0xdb, 0x0b, 0x7b, 0xa7, 0x71,
++      0xb7, 0x67, 0x81, 0x23, 0x96, 0xae, 0xb9, 0x7e,
++      0x32, 0x43, 0x92, 0x8a, 0x19, 0xa0, 0xc4, 0xd4,
++      0x3b, 0x57, 0xf9, 0x4a, 0x2c, 0xfb, 0x51, 0x46,
++      0xbb, 0xcb, 0x5d, 0xb3, 0xef, 0x13, 0x93, 0x6e,
++      0x68, 0x42, 0x54, 0x57, 0xd3, 0x6a, 0x3a, 0x8f,
++      0x9d, 0x66, 0xbf, 0xbd, 0x36, 0x23, 0xf5, 0x93,
++      0x83, 0x7b, 0x9c, 0xc0, 0xdd, 0xc5, 0x49, 0xc0,
++      0x64, 0xed, 0x07, 0x12, 0xb3, 0xe6, 0xe4, 0xe5,
++      0x38, 0x95, 0x23, 0xb1, 0xa0, 0x3b, 0x1a, 0x61,
++      0xda, 0x17, 0xac, 0xc3, 0x58, 0xdd, 0x74, 0x64,
++      0x22, 0x11, 0xe8, 0x32, 0x1d, 0x16, 0x93, 0x85,
++      0x99, 0xa5, 0x9c, 0x34, 0x55, 0xb1, 0xe9, 0x20,
++      0x72, 0xc9, 0x28, 0x7b, 0x79, 0x00, 0xa1, 0xa6,
++      0xa3, 0x27, 0x40, 0x18, 0x8a, 0x54, 0xe0, 0xcc,
++      0xe8, 0x4e, 0x8e, 0x43, 0x96, 0xe7, 0x3f, 0xc8,
++      0xe9, 0xb2, 0xf9, 0xc9, 0xda, 0x04, 0x71, 0x50,
++      0x47, 0xe4, 0xaa, 0xce, 0xa2, 0x30, 0xc8, 0xe4,
++      0xac, 0xc7, 0x0d, 0x06, 0x2e, 0xe6, 0xe8, 0x80,
++      0x36, 0x29, 0x9e, 0x01, 0xb8, 0xc3, 0xf0, 0xa0,
++      0x5d, 0x7a, 0xca, 0x4d, 0xa0, 0x57, 0xbd, 0x2a,
++      0x45, 0xa7, 0x7f, 0x9c, 0x93, 0x07, 0x8f, 0x35,
++      0x67, 0x92, 0xe3, 0xe9, 0x7f, 0xa8, 0x61, 0x43,
++      0x9e, 0x25, 0x4f, 0x33, 0x76, 0x13, 0x6e, 0x12,
++      0xb9, 0xdd, 0xa4, 0x7c, 0x08, 0x9f, 0x7c, 0xe7,
++      0x0a, 0x8d, 0x84, 0x06, 0xa4, 0x33, 0x17, 0x34,
++      0x5e, 0x10, 0x7c, 0xc0, 0xa8, 0x3d, 0x1f, 0x42,
++      0x20, 0x51, 0x65, 0x5d, 0x09, 0xc3, 0xaa, 0xc0,
++      0xc8, 0x0d, 0xf0, 0x79, 0xbc, 0x20, 0x1b, 0x95,
++      0xe7, 0x06, 0x7d, 0x47, 0x20, 0x03, 0x1a, 0x74,
++      0xdd, 0xe2, 0xd4, 0xae, 0x38, 0x71, 0x9b, 0xf5,
++      0x80, 0xec, 0x08, 0x4e, 0x56, 0xba, 0x76, 0x12,
++      0x1a, 0xdf, 0x48, 0xf3, 0xae, 0xb3, 0xe6, 0xe6,
++      0xbe, 0xc0, 0x91, 0x2e, 0x01, 0xb3, 0x01, 0x86,
++      0xa2, 0xb9, 0x52, 0xd1, 0x21, 0xae, 0xd4, 0x97,
++      0x1d, 0xef, 0x41, 0x12, 0x95, 0x3d, 0x48, 0x45,
++      0x1c, 0x56, 0x32, 0x8f, 0xb8, 0x43, 0xbb, 0x19,
++      0xf3, 0xca, 0xe9, 0xeb, 0x6d, 0x84, 0xbe, 0x86,
++      0x06, 0xe2, 0x36, 0xb2, 0x62, 0x9d, 0xd3, 0x4c,
++      0x48, 0x18, 0x54, 0x13, 0x4e, 0xcf, 0xfd, 0xba,
++      0x84, 0xb9, 0x30, 0x53, 0xcf, 0xfb, 0xb9, 0x29,
++      0x8f, 0xdc, 0x9f, 0xef, 0x60, 0x0b, 0x64, 0xf6,
++      0x8b, 0xee, 0xa6, 0x91, 0xc2, 0x41, 0x6c, 0xf6,
++      0xfa, 0x79, 0x67, 0x4b, 0xc1, 0x3f, 0xaf, 0x09,
++      0x81, 0xd4, 0x5d, 0xcb, 0x09, 0xdf, 0x36, 0x31,
++      0xc0, 0x14, 0x3c, 0x7c, 0x0e, 0x65, 0x95, 0x99,
++      0x6d, 0xa3, 0xf4, 0xd7, 0x38, 0xee, 0x1a, 0x2b,
++      0x37, 0xe2, 0xa4, 0x3b, 0x4b, 0xd0, 0x65, 0xca,
++      0xf8, 0xc3, 0xe8, 0x15, 0x20, 0xef, 0xf2, 0x00,
++      0xfd, 0x01, 0x09, 0xc5, 0xc8, 0x17, 0x04, 0x93,
++      0xd0, 0x93, 0x03, 0x55, 0xc5, 0xfe, 0x32, 0xa3,
++      0x3e, 0x28, 0x2d, 0x3b, 0x93, 0x8a, 0xcc, 0x07,
++      0x72, 0x80, 0x8b, 0x74, 0x16, 0x24, 0xbb, 0xda,
++      0x94, 0x39, 0x30, 0x8f, 0xb1, 0xcd, 0x4a, 0x90,
++      0x92, 0x7c, 0x14, 0x8f, 0x95, 0x4e, 0xac, 0x9b,
++      0xd8, 0x8f, 0x1a, 0x87, 0xa4, 0x32, 0x27, 0x8a,
++      0xba, 0xf7, 0x41, 0xcf, 0x84, 0x37, 0x19, 0xe6,
++      0x06, 0xf5, 0x0e, 0xcf, 0x36, 0xf5, 0x9e, 0x6c,
++      0xde, 0xbc, 0xff, 0x64, 0x7e, 0x4e, 0x59, 0x57,
++      0x48, 0xfe, 0x14, 0xf7, 0x9c, 0x93, 0x5d, 0x15,
++      0xad, 0xcc, 0x11, 0xb1, 0x17, 0x18, 0xb2, 0x7e,
++      0xcc, 0xab, 0xe9, 0xce, 0x7d, 0x77, 0x5b, 0x51,
++      0x1b, 0x1e, 0x20, 0xa8, 0x32, 0x06, 0x0e, 0x75,
++      0x93, 0xac, 0xdb, 0x35, 0x37, 0x1f, 0xe9, 0x19,
++      0x1d, 0xb4, 0x71, 0x97, 0xd6, 0x4e, 0x2c, 0x08,
++      0xa5, 0x13, 0xf9, 0x0e, 0x7e, 0x78, 0x6e, 0x14,
++      0xe0, 0xa9, 0xb9, 0x96, 0x4c, 0x80, 0x82, 0xba,
++      0x17, 0xb3, 0x9d, 0x69, 0xb0, 0x84, 0x46, 0xff,
++      0xf9, 0x52, 0x79, 0x94, 0x58, 0x3a, 0x62, 0x90,
++      0x15, 0x35, 0x71, 0x10, 0x37, 0xed, 0xa1, 0x8e,
++      0x53, 0x6e, 0xf4, 0x26, 0x57, 0x93, 0x15, 0x93,
++      0xf6, 0x81, 0x2c, 0x5a, 0x10, 0xda, 0x92, 0xad,
++      0x2f, 0xdb, 0x28, 0x31, 0x2d, 0x55, 0x04, 0xd2,
++      0x06, 0x28, 0x8c, 0x1e, 0xdc, 0xea, 0x54, 0xac,
++      0xff, 0xb7, 0x6c, 0x30, 0x15, 0xd4, 0xb4, 0x0d,
++      0x00, 0x93, 0x57, 0xdd, 0xd2, 0x07, 0x07, 0x06,
++      0xd9, 0x43, 0x9b, 0xcd, 0x3a, 0xf4, 0x7d, 0x4c,
++      0x36, 0x5d, 0x23, 0xa2, 0xcc, 0x57, 0x40, 0x91,
++      0xe9, 0x2c, 0x2f, 0x2c, 0xd5, 0x30, 0x9b, 0x17,
++      0xb0, 0xc9, 0xf7, 0xa7, 0x2f, 0xd1, 0x93, 0x20,
++      0x6b, 0xc6, 0xc1, 0xe4, 0x6f, 0xcb, 0xd1, 0xe7,
++      0x09, 0x0f, 0x9e, 0xdc, 0xaa, 0x9f, 0x2f, 0xdf,
++      0x56, 0x9f, 0xd4, 0x33, 0x04, 0xaf, 0xd3, 0x6c,
++      0x58, 0x61, 0xf0, 0x30, 0xec, 0xf2, 0x7f, 0xf2,
++      0x9c, 0xdf, 0x39, 0xbb, 0x6f, 0xa2, 0x8c, 0x7e,
++      0xc4, 0x22, 0x51, 0x71, 0xc0, 0x4d, 0x14, 0x1a,
++      0xc4, 0xcd, 0x04, 0xd9, 0x87, 0x08, 0x50, 0x05,
++      0xcc, 0xaf, 0xf6, 0xf0, 0x8f, 0x92, 0x54, 0x58,
++      0xc2, 0xc7, 0x09, 0x7a, 0x59, 0x02, 0x05, 0xe8,
++      0xb0, 0x86, 0xd9, 0xbf, 0x7b, 0x35, 0x51, 0x4d,
++      0xaf, 0x08, 0x97, 0x2c, 0x65, 0xda, 0x2a, 0x71,
++      0x3a, 0xa8, 0x51, 0xcc, 0xf2, 0x73, 0x27, 0xc3,
++      0xfd, 0x62, 0xcf, 0xe3, 0xb2, 0xca, 0xcb, 0xbe,
++      0x1a, 0x0a, 0xa1, 0x34, 0x7b, 0x77, 0xc4, 0x62,
++      0x68, 0x78, 0x5f, 0x94, 0x07, 0x04, 0x65, 0x16,
++      0x4b, 0x61, 0xcb, 0xff, 0x75, 0x26, 0x50, 0x66,
++      0x1f, 0x6e, 0x93, 0xf8, 0xc5, 0x51, 0xeb, 0xa4,
++      0x4a, 0x48, 0x68, 0x6b, 0xe2, 0x5e, 0x44, 0xb2,
++      0x50, 0x2c, 0x6c, 0xae, 0x79, 0x4e, 0x66, 0x35,
++      0x81, 0x50, 0xac, 0xbc, 0x3f, 0xb1, 0x0c, 0xf3,
++      0x05, 0x3c, 0x4a, 0xa3, 0x6c, 0x2a, 0x79, 0xb4,
++      0xb7, 0xab, 0xca, 0xc7, 0x9b, 0x8e, 0xcd, 0x5f,
++      0x11, 0x03, 0xcb, 0x30, 0xa3, 0xab, 0xda, 0xfe,
++      0x64, 0xb9, 0xbb, 0xd8, 0x5e, 0x3a, 0x1a, 0x56,
++      0xe5, 0x05, 0x48, 0x90, 0x1e, 0x61, 0x69, 0x1b,
++      0x22, 0xe6, 0x1a, 0x3c, 0x75, 0xad, 0x1f, 0x37,
++      0x28, 0xdc, 0xe4, 0x6d, 0xbd, 0x42, 0xdc, 0xd3,
++      0xc8, 0xb6, 0x1c, 0x48, 0xfe, 0x94, 0x77, 0x7f,
++      0xbd, 0x62, 0xac, 0xa3, 0x47, 0x27, 0xcf, 0x5f,
++      0xd9, 0xdb, 0xaf, 0xec, 0xf7, 0x5e, 0xc1, 0xb0,
++      0x9d, 0x01, 0x26, 0x99, 0x7e, 0x8f, 0x03, 0x70,
++      0xb5, 0x42, 0xbe, 0x67, 0x28, 0x1b, 0x7c, 0xbd,
++      0x61, 0x21, 0x97, 0xcc, 0x5c, 0xe1, 0x97, 0x8f,
++      0x8d, 0xde, 0x2b, 0xaa, 0xa7, 0x71, 0x1d, 0x1e,
++      0x02, 0x73, 0x70, 0x58, 0x32, 0x5b, 0x1d, 0x67,
++      0x3d, 0xe0, 0x74, 0x4f, 0x03, 0xf2, 0x70, 0x51,
++      0x79, 0xf1, 0x61, 0x70, 0x15, 0x74, 0x9d, 0x23,
++      0x89, 0xde, 0xac, 0xfd, 0xde, 0xd0, 0x1f, 0xc3,
++      0x87, 0x44, 0x35, 0x4b, 0xe5, 0xb0, 0x60, 0xc5,
++      0x22, 0xe4, 0x9e, 0xca, 0xeb, 0xd5, 0x3a, 0x09,
++      0x45, 0xa4, 0xdb, 0xfa, 0x3f, 0xeb, 0x1b, 0xc7,
++      0xc8, 0x14, 0x99, 0x51, 0x92, 0x10, 0xed, 0xed,
++      0x28, 0xe0, 0xa1, 0xf8, 0x26, 0xcf, 0xcd, 0xcb,
++      0x63, 0xa1, 0x3b, 0xe3, 0xdf, 0x7e, 0xfe, 0xa6,
++      0xf0, 0x81, 0x9a, 0xbf, 0x55, 0xde, 0x54, 0xd5,
++      0x56, 0x60, 0x98, 0x10, 0x68, 0xf4, 0x38, 0x96,
++      0x8e, 0x6f, 0x1d, 0x44, 0x7f, 0xd6, 0x2f, 0xfe,
++      0x55, 0xfb, 0x0c, 0x7e, 0x67, 0xe2, 0x61, 0x44,
++      0xed, 0xf2, 0x35, 0x30, 0x5d, 0xe9, 0xc7, 0xd6,
++      0x6d, 0xe0, 0xa0, 0xed, 0xf3, 0xfc, 0xd8, 0x3e,
++      0x0a, 0x7b, 0xcd, 0xaf, 0x65, 0x68, 0x18, 0xc0,
++      0xec, 0x04, 0x1c, 0x74, 0x6d, 0xe2, 0x6e, 0x79,
++      0xd4, 0x11, 0x2b, 0x62, 0xd5, 0x27, 0xad, 0x4f,
++      0x01, 0x59, 0x73, 0xcc, 0x6a, 0x53, 0xfb, 0x2d,
++      0xd5, 0x4e, 0x99, 0x21, 0x65, 0x4d, 0xf5, 0x82,
++      0xf7, 0xd8, 0x42, 0xce, 0x6f, 0x3d, 0x36, 0x47,
++      0xf1, 0x05, 0x16, 0xe8, 0x1b, 0x6a, 0x8f, 0x93,
++      0xf2, 0x8f, 0x37, 0x40, 0x12, 0x28, 0xa3, 0xe6,
++      0xb9, 0x17, 0x4a, 0x1f, 0xb1, 0xd1, 0x66, 0x69,
++      0x86, 0xc4, 0xfc, 0x97, 0xae, 0x3f, 0x8f, 0x1e,
++      0x2b, 0xdf, 0xcd, 0xf9, 0x3c
++};
++static const u8 dec_output011[] __initconst = {
++      0x7a, 0x57, 0xf2, 0xc7, 0x06, 0x3f, 0x50, 0x7b,
++      0x36, 0x1a, 0x66, 0x5c, 0xb9, 0x0e, 0x5e, 0x3b,
++      0x45, 0x60, 0xbe, 0x9a, 0x31, 0x9f, 0xff, 0x5d,
++      0x66, 0x34, 0xb4, 0xdc, 0xfb, 0x9d, 0x8e, 0xee,
++      0x6a, 0x33, 0xa4, 0x07, 0x3c, 0xf9, 0x4c, 0x30,
++      0xa1, 0x24, 0x52, 0xf9, 0x50, 0x46, 0x88, 0x20,
++      0x02, 0x32, 0x3a, 0x0e, 0x99, 0x63, 0xaf, 0x1f,
++      0x15, 0x28, 0x2a, 0x05, 0xff, 0x57, 0x59, 0x5e,
++      0x18, 0xa1, 0x1f, 0xd0, 0x92, 0x5c, 0x88, 0x66,
++      0x1b, 0x00, 0x64, 0xa5, 0x93, 0x8d, 0x06, 0x46,
++      0xb0, 0x64, 0x8b, 0x8b, 0xef, 0x99, 0x05, 0x35,
++      0x85, 0xb3, 0xf3, 0x33, 0xbb, 0xec, 0x66, 0xb6,
++      0x3d, 0x57, 0x42, 0xe3, 0xb4, 0xc6, 0xaa, 0xb0,
++      0x41, 0x2a, 0xb9, 0x59, 0xa9, 0xf6, 0x3e, 0x15,
++      0x26, 0x12, 0x03, 0x21, 0x4c, 0x74, 0x43, 0x13,
++      0x2a, 0x03, 0x27, 0x09, 0xb4, 0xfb, 0xe7, 0xb7,
++      0x40, 0xff, 0x5e, 0xce, 0x48, 0x9a, 0x60, 0xe3,
++      0x8b, 0x80, 0x8c, 0x38, 0x2d, 0xcb, 0x93, 0x37,
++      0x74, 0x05, 0x52, 0x6f, 0x73, 0x3e, 0xc3, 0xbc,
++      0xca, 0x72, 0x0a, 0xeb, 0xf1, 0x3b, 0xa0, 0x95,
++      0xdc, 0x8a, 0xc4, 0xa9, 0xdc, 0xca, 0x44, 0xd8,
++      0x08, 0x63, 0x6a, 0x36, 0xd3, 0x3c, 0xb8, 0xac,
++      0x46, 0x7d, 0xfd, 0xaa, 0xeb, 0x3e, 0x0f, 0x45,
++      0x8f, 0x49, 0xda, 0x2b, 0xf2, 0x12, 0xbd, 0xaf,
++      0x67, 0x8a, 0x63, 0x48, 0x4b, 0x55, 0x5f, 0x6d,
++      0x8c, 0xb9, 0x76, 0x34, 0x84, 0xae, 0xc2, 0xfc,
++      0x52, 0x64, 0x82, 0xf7, 0xb0, 0x06, 0xf0, 0x45,
++      0x73, 0x12, 0x50, 0x30, 0x72, 0xea, 0x78, 0x9a,
++      0xa8, 0xaf, 0xb5, 0xe3, 0xbb, 0x77, 0x52, 0xec,
++      0x59, 0x84, 0xbf, 0x6b, 0x8f, 0xce, 0x86, 0x5e,
++      0x1f, 0x23, 0xe9, 0xfb, 0x08, 0x86, 0xf7, 0x10,
++      0xb9, 0xf2, 0x44, 0x96, 0x44, 0x63, 0xa9, 0xa8,
++      0x78, 0x00, 0x23, 0xd6, 0xc7, 0xe7, 0x6e, 0x66,
++      0x4f, 0xcc, 0xee, 0x15, 0xb3, 0xbd, 0x1d, 0xa0,
++      0xe5, 0x9c, 0x1b, 0x24, 0x2c, 0x4d, 0x3c, 0x62,
++      0x35, 0x9c, 0x88, 0x59, 0x09, 0xdd, 0x82, 0x1b,
++      0xcf, 0x0a, 0x83, 0x6b, 0x3f, 0xae, 0x03, 0xc4,
++      0xb4, 0xdd, 0x7e, 0x5b, 0x28, 0x76, 0x25, 0x96,
++      0xd9, 0xc9, 0x9d, 0x5f, 0x86, 0xfa, 0xf6, 0xd7,
++      0xd2, 0xe6, 0x76, 0x1d, 0x0f, 0xa1, 0xdc, 0x74,
++      0x05, 0x1b, 0x1d, 0xe0, 0xcd, 0x16, 0xb0, 0xa8,
++      0x8a, 0x34, 0x7b, 0x15, 0x11, 0x77, 0xe5, 0x7b,
++      0x7e, 0x20, 0xf7, 0xda, 0x38, 0xda, 0xce, 0x70,
++      0xe9, 0xf5, 0x6c, 0xd9, 0xbe, 0x0c, 0x4c, 0x95,
++      0x4c, 0xc2, 0x9b, 0x34, 0x55, 0x55, 0xe1, 0xf3,
++      0x46, 0x8e, 0x48, 0x74, 0x14, 0x4f, 0x9d, 0xc9,
++      0xf5, 0xe8, 0x1a, 0xf0, 0x11, 0x4a, 0xc1, 0x8d,
++      0xe0, 0x93, 0xa0, 0xbe, 0x09, 0x1c, 0x2b, 0x4e,
++      0x0f, 0xb2, 0x87, 0x8b, 0x84, 0xfe, 0x92, 0x32,
++      0x14, 0xd7, 0x93, 0xdf, 0xe7, 0x44, 0xbc, 0xc5,
++      0xae, 0x53, 0x69, 0xd8, 0xb3, 0x79, 0x37, 0x80,
++      0xe3, 0x17, 0x5c, 0xec, 0x53, 0x00, 0x9a, 0xe3,
++      0x8e, 0xdc, 0x38, 0xb8, 0x66, 0xf0, 0xd3, 0xad,
++      0x1d, 0x02, 0x96, 0x86, 0x3e, 0x9d, 0x3b, 0x5d,
++      0xa5, 0x7f, 0x21, 0x10, 0xf1, 0x1f, 0x13, 0x20,
++      0xf9, 0x57, 0x87, 0x20, 0xf5, 0x5f, 0xf1, 0x17,
++      0x48, 0x0a, 0x51, 0x5a, 0xcd, 0x19, 0x03, 0xa6,
++      0x5a, 0xd1, 0x12, 0x97, 0xe9, 0x48, 0xe2, 0x1d,
++      0x83, 0x75, 0x50, 0xd9, 0x75, 0x7d, 0x6a, 0x82,
++      0xa1, 0xf9, 0x4e, 0x54, 0x87, 0x89, 0xc9, 0x0c,
++      0xb7, 0x5b, 0x6a, 0x91, 0xc1, 0x9c, 0xb2, 0xa9,
++      0xdc, 0x9a, 0xa4, 0x49, 0x0a, 0x6d, 0x0d, 0xbb,
++      0xde, 0x86, 0x44, 0xdd, 0x5d, 0x89, 0x2b, 0x96,
++      0x0f, 0x23, 0x95, 0xad, 0xcc, 0xa2, 0xb3, 0xb9,
++      0x7e, 0x74, 0x38, 0xba, 0x9f, 0x73, 0xae, 0x5f,
++      0xf8, 0x68, 0xa2, 0xe0, 0xa9, 0xce, 0xbd, 0x40,
++      0xd4, 0x4c, 0x6b, 0xd2, 0x56, 0x62, 0xb0, 0xcc,
++      0x63, 0x7e, 0x5b, 0xd3, 0xae, 0xd1, 0x75, 0xce,
++      0xbb, 0xb4, 0x5b, 0xa8, 0xf8, 0xb4, 0xac, 0x71,
++      0x75, 0xaa, 0xc9, 0x9f, 0xbb, 0x6c, 0xad, 0x0f,
++      0x55, 0x5d, 0xe8, 0x85, 0x7d, 0xf9, 0x21, 0x35,
++      0xea, 0x92, 0x85, 0x2b, 0x00, 0xec, 0x84, 0x90,
++      0x0a, 0x63, 0x96, 0xe4, 0x6b, 0xa9, 0x77, 0xb8,
++      0x91, 0xf8, 0x46, 0x15, 0x72, 0x63, 0x70, 0x01,
++      0x40, 0xa3, 0xa5, 0x76, 0x62, 0x2b, 0xbf, 0xf1,
++      0xe5, 0x8d, 0x9f, 0xa3, 0xfa, 0x9b, 0x03, 0xbe,
++      0xfe, 0x65, 0x6f, 0xa2, 0x29, 0x0d, 0x54, 0xb4,
++      0x71, 0xce, 0xa9, 0xd6, 0x3d, 0x88, 0xf9, 0xaf,
++      0x6b, 0xa8, 0x9e, 0xf4, 0x16, 0x96, 0x36, 0xb9,
++      0x00, 0xdc, 0x10, 0xab, 0xb5, 0x08, 0x31, 0x1f,
++      0x00, 0xb1, 0x3c, 0xd9, 0x38, 0x3e, 0xc6, 0x04,
++      0xa7, 0x4e, 0xe8, 0xae, 0xed, 0x98, 0xc2, 0xf7,
++      0xb9, 0x00, 0x5f, 0x8c, 0x60, 0xd1, 0xe5, 0x15,
++      0xf7, 0xae, 0x1e, 0x84, 0x88, 0xd1, 0xf6, 0xbc,
++      0x3a, 0x89, 0x35, 0x22, 0x83, 0x7c, 0xca, 0xf0,
++      0x33, 0x82, 0x4c, 0x79, 0x3c, 0xfd, 0xb1, 0xae,
++      0x52, 0x62, 0x55, 0xd2, 0x41, 0x60, 0xc6, 0xbb,
++      0xfa, 0x0e, 0x59, 0xd6, 0xa8, 0xfe, 0x5d, 0xed,
++      0x47, 0x3d, 0xe0, 0xea, 0x1f, 0x6e, 0x43, 0x51,
++      0xec, 0x10, 0x52, 0x56, 0x77, 0x42, 0x6b, 0x52,
++      0x87, 0xd8, 0xec, 0xe0, 0xaa, 0x76, 0xa5, 0x84,
++      0x2a, 0x22, 0x24, 0xfd, 0x92, 0x40, 0x88, 0xd5,
++      0x85, 0x1c, 0x1f, 0x6b, 0x47, 0xa0, 0xc4, 0xe4,
++      0xef, 0xf4, 0xea, 0xd7, 0x59, 0xac, 0x2a, 0x9e,
++      0x8c, 0xfa, 0x1f, 0x42, 0x08, 0xfe, 0x4f, 0x74,
++      0xa0, 0x26, 0xf5, 0xb3, 0x84, 0xf6, 0x58, 0x5f,
++      0x26, 0x66, 0x3e, 0xd7, 0xe4, 0x22, 0x91, 0x13,
++      0xc8, 0xac, 0x25, 0x96, 0x23, 0xd8, 0x09, 0xea,
++      0x45, 0x75, 0x23, 0xb8, 0x5f, 0xc2, 0x90, 0x8b,
++      0x09, 0xc4, 0xfc, 0x47, 0x6c, 0x6d, 0x0a, 0xef,
++      0x69, 0xa4, 0x38, 0x19, 0xcf, 0x7d, 0xf9, 0x09,
++      0x73, 0x9b, 0x60, 0x5a, 0xf7, 0x37, 0xb5, 0xfe,
++      0x9f, 0xe3, 0x2b, 0x4c, 0x0d, 0x6e, 0x19, 0xf1,
++      0xd6, 0xc0, 0x70, 0xf3, 0x9d, 0x22, 0x3c, 0xf9,
++      0x49, 0xce, 0x30, 0x8e, 0x44, 0xb5, 0x76, 0x15,
++      0x8f, 0x52, 0xfd, 0xa5, 0x04, 0xb8, 0x55, 0x6a,
++      0x36, 0x59, 0x7c, 0xc4, 0x48, 0xb8, 0xd7, 0xab,
++      0x05, 0x66, 0xe9, 0x5e, 0x21, 0x6f, 0x6b, 0x36,
++      0x29, 0xbb, 0xe9, 0xe3, 0xa2, 0x9a, 0xa8, 0xcd,
++      0x55, 0x25, 0x11, 0xba, 0x5a, 0x58, 0xa0, 0xde,
++      0xae, 0x19, 0x2a, 0x48, 0x5a, 0xff, 0x36, 0xcd,
++      0x6d, 0x16, 0x7a, 0x73, 0x38, 0x46, 0xe5, 0x47,
++      0x59, 0xc8, 0xa2, 0xf6, 0xe2, 0x6c, 0x83, 0xc5,
++      0x36, 0x2c, 0x83, 0x7d, 0xb4, 0x01, 0x05, 0x69,
++      0xe7, 0xaf, 0x5c, 0xc4, 0x64, 0x82, 0x12, 0x21,
++      0xef, 0xf7, 0xd1, 0x7d, 0xb8, 0x8d, 0x8c, 0x98,
++      0x7c, 0x5f, 0x7d, 0x92, 0x88, 0xb9, 0x94, 0x07,
++      0x9c, 0xd8, 0xe9, 0x9c, 0x17, 0x38, 0xe3, 0x57,
++      0x6c, 0xe0, 0xdc, 0xa5, 0x92, 0x42, 0xb3, 0xbd,
++      0x50, 0xa2, 0x7e, 0xb5, 0xb1, 0x52, 0x72, 0x03,
++      0x97, 0xd8, 0xaa, 0x9a, 0x1e, 0x75, 0x41, 0x11,
++      0xa3, 0x4f, 0xcc, 0xd4, 0xe3, 0x73, 0xad, 0x96,
++      0xdc, 0x47, 0x41, 0x9f, 0xb0, 0xbe, 0x79, 0x91,
++      0xf5, 0xb6, 0x18, 0xfe, 0xc2, 0x83, 0x18, 0x7d,
++      0x73, 0xd9, 0x4f, 0x83, 0x84, 0x03, 0xb3, 0xf0,
++      0x77, 0x66, 0x3d, 0x83, 0x63, 0x2e, 0x2c, 0xf9,
++      0xdd, 0xa6, 0x1f, 0x89, 0x82, 0xb8, 0x23, 0x42,
++      0xeb, 0xe2, 0xca, 0x70, 0x82, 0x61, 0x41, 0x0a,
++      0x6d, 0x5f, 0x75, 0xc5, 0xe2, 0xc4, 0x91, 0x18,
++      0x44, 0x22, 0xfa, 0x34, 0x10, 0xf5, 0x20, 0xdc,
++      0xb7, 0xdd, 0x2a, 0x20, 0x77, 0xf5, 0xf9, 0xce,
++      0xdb, 0xa0, 0x0a, 0x52, 0x2a, 0x4e, 0xdd, 0xcc,
++      0x97, 0xdf, 0x05, 0xe4, 0x5e, 0xb7, 0xaa, 0xf0,
++      0xe2, 0x80, 0xff, 0xba, 0x1a, 0x0f, 0xac, 0xdf,
++      0x02, 0x32, 0xe6, 0xf7, 0xc7, 0x17, 0x13, 0xb7,
++      0xfc, 0x98, 0x48, 0x8c, 0x0d, 0x82, 0xc9, 0x80,
++      0x7a, 0xe2, 0x0a, 0xc5, 0xb4, 0xde, 0x7c, 0x3c,
++      0x79, 0x81, 0x0e, 0x28, 0x65, 0x79, 0x67, 0x82,
++      0x69, 0x44, 0x66, 0x09, 0xf7, 0x16, 0x1a, 0xf9,
++      0x7d, 0x80, 0xa1, 0x79, 0x14, 0xa9, 0xc8, 0x20,
++      0xfb, 0xa2, 0x46, 0xbe, 0x08, 0x35, 0x17, 0x58,
++      0xc1, 0x1a, 0xda, 0x2a, 0x6b, 0x2e, 0x1e, 0xe6,
++      0x27, 0x55, 0x7b, 0x19, 0xe2, 0xfb, 0x64, 0xfc,
++      0x5e, 0x15, 0x54, 0x3c, 0xe7, 0xc2, 0x11, 0x50,
++      0x30, 0xb8, 0x72, 0x03, 0x0b, 0x1a, 0x9f, 0x86,
++      0x27, 0x11, 0x5c, 0x06, 0x2b, 0xbd, 0x75, 0x1a,
++      0x0a, 0xda, 0x01, 0xfa, 0x5c, 0x4a, 0xc1, 0x80,
++      0x3a, 0x6e, 0x30, 0xc8, 0x2c, 0xeb, 0x56, 0xec,
++      0x89, 0xfa, 0x35, 0x7b, 0xb2, 0xf0, 0x97, 0x08,
++      0x86, 0x53, 0xbe, 0xbd, 0x40, 0x41, 0x38, 0x1c,
++      0xb4, 0x8b, 0x79, 0x2e, 0x18, 0x96, 0x94, 0xde,
++      0xe8, 0xca, 0xe5, 0x9f, 0x92, 0x9f, 0x15, 0x5d,
++      0x56, 0x60, 0x5c, 0x09, 0xf9, 0x16, 0xf4, 0x17,
++      0x0f, 0xf6, 0x4c, 0xda, 0xe6, 0x67, 0x89, 0x9f,
++      0xca, 0x6c, 0xe7, 0x9b, 0x04, 0x62, 0x0e, 0x26,
++      0xa6, 0x52, 0xbd, 0x29, 0xff, 0xc7, 0xa4, 0x96,
++      0xe6, 0x6a, 0x02, 0xa5, 0x2e, 0x7b, 0xfe, 0x97,
++      0x68, 0x3e, 0x2e, 0x5f, 0x3b, 0x0f, 0x36, 0xd6,
++      0x98, 0x19, 0x59, 0x48, 0xd2, 0xc6, 0xe1, 0x55,
++      0x1a, 0x6e, 0xd6, 0xed, 0x2c, 0xba, 0xc3, 0x9e,
++      0x64, 0xc9, 0x95, 0x86, 0x35, 0x5e, 0x3e, 0x88,
++      0x69, 0x99, 0x4b, 0xee, 0xbe, 0x9a, 0x99, 0xb5,
++      0x6e, 0x58, 0xae, 0xdd, 0x22, 0xdb, 0xdd, 0x6b,
++      0xfc, 0xaf, 0x90, 0xa3, 0x3d, 0xa4, 0xc1, 0x15,
++      0x92, 0x18, 0x8d, 0xd2, 0x4b, 0x7b, 0x06, 0xd1,
++      0x37, 0xb5, 0xe2, 0x7c, 0x2c, 0xf0, 0x25, 0xe4,
++      0x94, 0x2a, 0xbd, 0xe3, 0x82, 0x70, 0x78, 0xa3,
++      0x82, 0x10, 0x5a, 0x90, 0xd7, 0xa4, 0xfa, 0xaf,
++      0x1a, 0x88, 0x59, 0xdc, 0x74, 0x12, 0xb4, 0x8e,
++      0xd7, 0x19, 0x46, 0xf4, 0x84, 0x69, 0x9f, 0xbb,
++      0x70, 0xa8, 0x4c, 0x52, 0x81, 0xa9, 0xff, 0x76,
++      0x1c, 0xae, 0xd8, 0x11, 0x3d, 0x7f, 0x7d, 0xc5,
++      0x12, 0x59, 0x28, 0x18, 0xc2, 0xa2, 0xb7, 0x1c,
++      0x88, 0xf8, 0xd6, 0x1b, 0xa6, 0x7d, 0x9e, 0xde,
++      0x29, 0xf8, 0xed, 0xff, 0xeb, 0x92, 0x24, 0x4f,
++      0x05, 0xaa, 0xd9, 0x49, 0xba, 0x87, 0x59, 0x51,
++      0xc9, 0x20, 0x5c, 0x9b, 0x74, 0xcf, 0x03, 0xd9,
++      0x2d, 0x34, 0xc7, 0x5b, 0xa5, 0x40, 0xb2, 0x99,
++      0xf5, 0xcb, 0xb4, 0xf6, 0xb7, 0x72, 0x4a, 0xd6,
++      0xbd, 0xb0, 0xf3, 0x93, 0xe0, 0x1b, 0xa8, 0x04,
++      0x1e, 0x35, 0xd4, 0x80, 0x20, 0xf4, 0x9c, 0x31,
++      0x6b, 0x45, 0xb9, 0x15, 0xb0, 0x5e, 0xdd, 0x0a,
++      0x33, 0x9c, 0x83, 0xcd, 0x58, 0x89, 0x50, 0x56,
++      0xbb, 0x81, 0x00, 0x91, 0x32, 0xf3, 0x1b, 0x3e,
++      0xcf, 0x45, 0xe1, 0xf9, 0xe1, 0x2c, 0x26, 0x78,
++      0x93, 0x9a, 0x60, 0x46, 0xc9, 0xb5, 0x5e, 0x6a,
++      0x28, 0x92, 0x87, 0x3f, 0x63, 0x7b, 0xdb, 0xf7,
++      0xd0, 0x13, 0x9d, 0x32, 0x40, 0x5e, 0xcf, 0xfb,
++      0x79, 0x68, 0x47, 0x4c, 0xfd, 0x01, 0x17, 0xe6,
++      0x97, 0x93, 0x78, 0xbb, 0xa6, 0x27, 0xa3, 0xe8,
++      0x1a, 0xe8, 0x94, 0x55, 0x7d, 0x08, 0xe5, 0xdc,
++      0x66, 0xa3, 0x69, 0xc8, 0xca, 0xc5, 0xa1, 0x84,
++      0x55, 0xde, 0x08, 0x91, 0x16, 0x3a, 0x0c, 0x86,
++      0xab, 0x27, 0x2b, 0x64, 0x34, 0x02, 0x6c, 0x76,
++      0x8b, 0xc6, 0xaf, 0xcc, 0xe1, 0xd6, 0x8c, 0x2a,
++      0x18, 0x3d, 0xa6, 0x1b, 0x37, 0x75, 0x45, 0x73,
++      0xc2, 0x75, 0xd7, 0x53, 0x78, 0x3a, 0xd6, 0xe8,
++      0x29, 0xd2, 0x4a, 0xa8, 0x1e, 0x82, 0xf6, 0xb6,
++      0x81, 0xde, 0x21, 0xed, 0x2b, 0x56, 0xbb, 0xf2,
++      0xd0, 0x57, 0xc1, 0x7c, 0xd2, 0x6a, 0xd2, 0x56,
++      0xf5, 0x13, 0x5f, 0x1c, 0x6a, 0x0b, 0x74, 0xfb,
++      0xe9, 0xfe, 0x9e, 0xea, 0x95, 0xb2, 0x46, 0xab,
++      0x0a, 0xfc, 0xfd, 0xf3, 0xbb, 0x04, 0x2b, 0x76,
++      0x1b, 0xa4, 0x74, 0xb0, 0xc1, 0x78, 0xc3, 0x69,
++      0xe2, 0xb0, 0x01, 0xe1, 0xde, 0x32, 0x4c, 0x8d,
++      0x1a, 0xb3, 0x38, 0x08, 0xd5, 0xfc, 0x1f, 0xdc,
++      0x0e, 0x2c, 0x9c, 0xb1, 0xa1, 0x63, 0x17, 0x22,
++      0xf5, 0x6c, 0x93, 0x70, 0x74, 0x00, 0xf8, 0x39,
++      0x01, 0x94, 0xd1, 0x32, 0x23, 0x56, 0x5d, 0xa6,
++      0x02, 0x76, 0x76, 0x93, 0xce, 0x2f, 0x19, 0xe9,
++      0x17, 0x52, 0xae, 0x6e, 0x2c, 0x6d, 0x61, 0x7f,
++      0x3b, 0xaa, 0xe0, 0x52, 0x85, 0xc5, 0x65, 0xc1,
++      0xbb, 0x8e, 0x5b, 0x21, 0xd5, 0xc9, 0x78, 0x83,
++      0x07, 0x97, 0x4c, 0x62, 0x61, 0x41, 0xd4, 0xfc,
++      0xc9, 0x39, 0xe3, 0x9b, 0xd0, 0xcc, 0x75, 0xc4,
++      0x97, 0xe6, 0xdd, 0x2a, 0x5f, 0xa6, 0xe8, 0x59,
++      0x6c, 0x98, 0xb9, 0x02, 0xe2, 0xa2, 0xd6, 0x68,
++      0xee, 0x3b, 0x1d, 0xe3, 0x4d, 0x5b, 0x30, 0xef,
++      0x03, 0xf2, 0xeb, 0x18, 0x57, 0x36, 0xe8, 0xa1,
++      0xf4, 0x47, 0xfb, 0xcb, 0x8f, 0xcb, 0xc8, 0xf3,
++      0x4f, 0x74, 0x9d, 0x9d, 0xb1, 0x8d, 0x14, 0x44,
++      0xd9, 0x19, 0xb4, 0x54, 0x4f, 0x75, 0x19, 0x09,
++      0xa0, 0x75, 0xbc, 0x3b, 0x82, 0xc6, 0x3f, 0xb8,
++      0x83, 0x19, 0x6e, 0xd6, 0x37, 0xfe, 0x6e, 0x8a,
++      0x4e, 0xe0, 0x4a, 0xab, 0x7b, 0xc8, 0xb4, 0x1d,
++      0xf4, 0xed, 0x27, 0x03, 0x65, 0xa2, 0xa1, 0xae,
++      0x11, 0xe7, 0x98, 0x78, 0x48, 0x91, 0xd2, 0xd2,
++      0xd4, 0x23, 0x78, 0x50, 0xb1, 0x5b, 0x85, 0x10,
++      0x8d, 0xca, 0x5f, 0x0f, 0x71, 0xae, 0x72, 0x9a,
++      0xf6, 0x25, 0x19, 0x60, 0x06, 0xf7, 0x10, 0x34,
++      0x18, 0x0d, 0xc9, 0x9f, 0x7b, 0x0c, 0x9b, 0x8f,
++      0x91, 0x1b, 0x9f, 0xcd, 0x10, 0xee, 0x75, 0xf9,
++      0x97, 0x66, 0xfc, 0x4d, 0x33, 0x6e, 0x28, 0x2b,
++      0x92, 0x85, 0x4f, 0xab, 0x43, 0x8d, 0x8f, 0x7d,
++      0x86, 0xa7, 0xc7, 0xd8, 0xd3, 0x0b, 0x8b, 0x57,
++      0xb6, 0x1d, 0x95, 0x0d, 0xe9, 0xbc, 0xd9, 0x03,
++      0xd9, 0x10, 0x19, 0xc3, 0x46, 0x63, 0x55, 0x87,
++      0x61, 0x79, 0x6c, 0x95, 0x0e, 0x9c, 0xdd, 0xca,
++      0xc3, 0xf3, 0x64, 0xf0, 0x7d, 0x76, 0xb7, 0x53,
++      0x67, 0x2b, 0x1e, 0x44, 0x56, 0x81, 0xea, 0x8f,
++      0x5c, 0x42, 0x16, 0xb8, 0x28, 0xeb, 0x1b, 0x61,
++      0x10, 0x1e, 0xbf, 0xec, 0xa8
++};
++static const u8 dec_assoc011[] __initconst = {
++      0xd6, 0x31, 0xda, 0x5d, 0x42, 0x5e, 0xd7
++};
++static const u8 dec_nonce011[] __initconst = {
++      0xfd, 0x87, 0xd4, 0xd8, 0x62, 0xfd, 0xec, 0xaa
++};
++static const u8 dec_key011[] __initconst = {
++      0x35, 0x4e, 0xb5, 0x70, 0x50, 0x42, 0x8a, 0x85,
++      0xf2, 0xfb, 0xed, 0x7b, 0xd0, 0x9e, 0x97, 0xca,
++      0xfa, 0x98, 0x66, 0x63, 0xee, 0x37, 0xcc, 0x52,
++      0xfe, 0xd1, 0xdf, 0x95, 0x15, 0x34, 0x29, 0x38
++};
++
++static const u8 dec_input012[] __initconst = {
++      0x52, 0x34, 0xb3, 0x65, 0x3b, 0xb7, 0xe5, 0xd3,
++      0xab, 0x49, 0x17, 0x60, 0xd2, 0x52, 0x56, 0xdf,
++      0xdf, 0x34, 0x56, 0x82, 0xe2, 0xbe, 0xe5, 0xe1,
++      0x28, 0xd1, 0x4e, 0x5f, 0x4f, 0x01, 0x7d, 0x3f,
++      0x99, 0x6b, 0x30, 0x6e, 0x1a, 0x7c, 0x4c, 0x8e,
++      0x62, 0x81, 0xae, 0x86, 0x3f, 0x6b, 0xd0, 0xb5,
++      0xa9, 0xcf, 0x50, 0xf1, 0x02, 0x12, 0xa0, 0x0b,
++      0x24, 0xe9, 0xe6, 0x72, 0x89, 0x2c, 0x52, 0x1b,
++      0x34, 0x38, 0xf8, 0x75, 0x5f, 0xa0, 0x74, 0xe2,
++      0x99, 0xdd, 0xa6, 0x4b, 0x14, 0x50, 0x4e, 0xf1,
++      0xbe, 0xd6, 0x9e, 0xdb, 0xb2, 0x24, 0x27, 0x74,
++      0x12, 0x4a, 0x78, 0x78, 0x17, 0xa5, 0x58, 0x8e,
++      0x2f, 0xf9, 0xf4, 0x8d, 0xee, 0x03, 0x88, 0xae,
++      0xb8, 0x29, 0xa1, 0x2f, 0x4b, 0xee, 0x92, 0xbd,
++      0x87, 0xb3, 0xce, 0x34, 0x21, 0x57, 0x46, 0x04,
++      0x49, 0x0c, 0x80, 0xf2, 0x01, 0x13, 0xa1, 0x55,
++      0xb3, 0xff, 0x44, 0x30, 0x3c, 0x1c, 0xd0, 0xef,
++      0xbc, 0x18, 0x74, 0x26, 0xad, 0x41, 0x5b, 0x5b,
++      0x3e, 0x9a, 0x7a, 0x46, 0x4f, 0x16, 0xd6, 0x74,
++      0x5a, 0xb7, 0x3a, 0x28, 0x31, 0xd8, 0xae, 0x26,
++      0xac, 0x50, 0x53, 0x86, 0xf2, 0x56, 0xd7, 0x3f,
++      0x29, 0xbc, 0x45, 0x68, 0x8e, 0xcb, 0x98, 0x64,
++      0xdd, 0xc9, 0xba, 0xb8, 0x4b, 0x7b, 0x82, 0xdd,
++      0x14, 0xa7, 0xcb, 0x71, 0x72, 0x00, 0x5c, 0xad,
++      0x7b, 0x6a, 0x89, 0xa4, 0x3d, 0xbf, 0xb5, 0x4b,
++      0x3e, 0x7c, 0x5a, 0xcf, 0xb8, 0xa1, 0xc5, 0x6e,
++      0xc8, 0xb6, 0x31, 0x57, 0x7b, 0xdf, 0xa5, 0x7e,
++      0xb1, 0xd6, 0x42, 0x2a, 0x31, 0x36, 0xd1, 0xd0,
++      0x3f, 0x7a, 0xe5, 0x94, 0xd6, 0x36, 0xa0, 0x6f,
++      0xb7, 0x40, 0x7d, 0x37, 0xc6, 0x55, 0x7c, 0x50,
++      0x40, 0x6d, 0x29, 0x89, 0xe3, 0x5a, 0xae, 0x97,
++      0xe7, 0x44, 0x49, 0x6e, 0xbd, 0x81, 0x3d, 0x03,
++      0x93, 0x06, 0x12, 0x06, 0xe2, 0x41, 0x12, 0x4a,
++      0xf1, 0x6a, 0xa4, 0x58, 0xa2, 0xfb, 0xd2, 0x15,
++      0xba, 0xc9, 0x79, 0xc9, 0xce, 0x5e, 0x13, 0xbb,
++      0xf1, 0x09, 0x04, 0xcc, 0xfd, 0xe8, 0x51, 0x34,
++      0x6a, 0xe8, 0x61, 0x88, 0xda, 0xed, 0x01, 0x47,
++      0x84, 0xf5, 0x73, 0x25, 0xf9, 0x1c, 0x42, 0x86,
++      0x07, 0xf3, 0x5b, 0x1a, 0x01, 0xb3, 0xeb, 0x24,
++      0x32, 0x8d, 0xf6, 0xed, 0x7c, 0x4b, 0xeb, 0x3c,
++      0x36, 0x42, 0x28, 0xdf, 0xdf, 0xb6, 0xbe, 0xd9,
++      0x8c, 0x52, 0xd3, 0x2b, 0x08, 0x90, 0x8c, 0xe7,
++      0x98, 0x31, 0xe2, 0x32, 0x8e, 0xfc, 0x11, 0x48,
++      0x00, 0xa8, 0x6a, 0x42, 0x4a, 0x02, 0xc6, 0x4b,
++      0x09, 0xf1, 0xe3, 0x49, 0xf3, 0x45, 0x1f, 0x0e,
++      0xbc, 0x56, 0xe2, 0xe4, 0xdf, 0xfb, 0xeb, 0x61,
++      0xfa, 0x24, 0xc1, 0x63, 0x75, 0xbb, 0x47, 0x75,
++      0xaf, 0xe1, 0x53, 0x16, 0x96, 0x21, 0x85, 0x26,
++      0x11, 0xb3, 0x76, 0xe3, 0x23, 0xa1, 0x6b, 0x74,
++      0x37, 0xd0, 0xde, 0x06, 0x90, 0x71, 0x5d, 0x43,
++      0x88, 0x9b, 0x00, 0x54, 0xa6, 0x75, 0x2f, 0xa1,
++      0xc2, 0x0b, 0x73, 0x20, 0x1d, 0xb6, 0x21, 0x79,
++      0x57, 0x3f, 0xfa, 0x09, 0xbe, 0x8a, 0x33, 0xc3,
++      0x52, 0xf0, 0x1d, 0x82, 0x31, 0xd1, 0x55, 0xb5,
++      0x6c, 0x99, 0x25, 0xcf, 0x5c, 0x32, 0xce, 0xe9,
++      0x0d, 0xfa, 0x69, 0x2c, 0xd5, 0x0d, 0xc5, 0x6d,
++      0x86, 0xd0, 0x0c, 0x3b, 0x06, 0x50, 0x79, 0xe8,
++      0xc3, 0xae, 0x04, 0xe6, 0xcd, 0x51, 0xe4, 0x26,
++      0x9b, 0x4f, 0x7e, 0xa6, 0x0f, 0xab, 0xd8, 0xe5,
++      0xde, 0xa9, 0x00, 0x95, 0xbe, 0xa3, 0x9d, 0x5d,
++      0xb2, 0x09, 0x70, 0x18, 0x1c, 0xf0, 0xac, 0x29,
++      0x23, 0x02, 0x29, 0x28, 0xd2, 0x74, 0x35, 0x57,
++      0x62, 0x0f, 0x24, 0xea, 0x5e, 0x33, 0xc2, 0x92,
++      0xf3, 0x78, 0x4d, 0x30, 0x1e, 0xa1, 0x99, 0xa9,
++      0x82, 0xb0, 0x42, 0x31, 0x8d, 0xad, 0x8a, 0xbc,
++      0xfc, 0xd4, 0x57, 0x47, 0x3e, 0xb4, 0x50, 0xdd,
++      0x6e, 0x2c, 0x80, 0x4d, 0x22, 0xf1, 0xfb, 0x57,
++      0xc4, 0xdd, 0x17, 0xe1, 0x8a, 0x36, 0x4a, 0xb3,
++      0x37, 0xca, 0xc9, 0x4e, 0xab, 0xd5, 0x69, 0xc4,
++      0xf4, 0xbc, 0x0b, 0x3b, 0x44, 0x4b, 0x29, 0x9c,
++      0xee, 0xd4, 0x35, 0x22, 0x21, 0xb0, 0x1f, 0x27,
++      0x64, 0xa8, 0x51, 0x1b, 0xf0, 0x9f, 0x19, 0x5c,
++      0xfb, 0x5a, 0x64, 0x74, 0x70, 0x45, 0x09, 0xf5,
++      0x64, 0xfe, 0x1a, 0x2d, 0xc9, 0x14, 0x04, 0x14,
++      0xcf, 0xd5, 0x7d, 0x60, 0xaf, 0x94, 0x39, 0x94,
++      0xe2, 0x7d, 0x79, 0x82, 0xd0, 0x65, 0x3b, 0x6b,
++      0x9c, 0x19, 0x84, 0xb4, 0x6d, 0xb3, 0x0c, 0x99,
++      0xc0, 0x56, 0xa8, 0xbd, 0x73, 0xce, 0x05, 0x84,
++      0x3e, 0x30, 0xaa, 0xc4, 0x9b, 0x1b, 0x04, 0x2a,
++      0x9f, 0xd7, 0x43, 0x2b, 0x23, 0xdf, 0xbf, 0xaa,
++      0xd5, 0xc2, 0x43, 0x2d, 0x70, 0xab, 0xdc, 0x75,
++      0xad, 0xac, 0xf7, 0xc0, 0xbe, 0x67, 0xb2, 0x74,
++      0xed, 0x67, 0x10, 0x4a, 0x92, 0x60, 0xc1, 0x40,
++      0x50, 0x19, 0x8a, 0x8a, 0x8c, 0x09, 0x0e, 0x72,
++      0xe1, 0x73, 0x5e, 0xe8, 0x41, 0x85, 0x63, 0x9f,
++      0x3f, 0xd7, 0x7d, 0xc4, 0xfb, 0x22, 0x5d, 0x92,
++      0x6c, 0xb3, 0x1e, 0xe2, 0x50, 0x2f, 0x82, 0xa8,
++      0x28, 0xc0, 0xb5, 0xd7, 0x5f, 0x68, 0x0d, 0x2c,
++      0x2d, 0xaf, 0x7e, 0xfa, 0x2e, 0x08, 0x0f, 0x1f,
++      0x70, 0x9f, 0xe9, 0x19, 0x72, 0x55, 0xf8, 0xfb,
++      0x51, 0xd2, 0x33, 0x5d, 0xa0, 0xd3, 0x2b, 0x0a,
++      0x6c, 0xbc, 0x4e, 0xcf, 0x36, 0x4d, 0xdc, 0x3b,
++      0xe9, 0x3e, 0x81, 0x7c, 0x61, 0xdb, 0x20, 0x2d,
++      0x3a, 0xc3, 0xb3, 0x0c, 0x1e, 0x00, 0xb9, 0x7c,
++      0xf5, 0xca, 0x10, 0x5f, 0x3a, 0x71, 0xb3, 0xe4,
++      0x20, 0xdb, 0x0c, 0x2a, 0x98, 0x63, 0x45, 0x00,
++      0x58, 0xf6, 0x68, 0xe4, 0x0b, 0xda, 0x13, 0x3b,
++      0x60, 0x5c, 0x76, 0xdb, 0xb9, 0x97, 0x71, 0xe4,
++      0xd9, 0xb7, 0xdb, 0xbd, 0x68, 0xc7, 0x84, 0x84,
++      0xaa, 0x7c, 0x68, 0x62, 0x5e, 0x16, 0xfc, 0xba,
++      0x72, 0xaa, 0x9a, 0xa9, 0xeb, 0x7c, 0x75, 0x47,
++      0x97, 0x7e, 0xad, 0xe2, 0xd9, 0x91, 0xe8, 0xe4,
++      0xa5, 0x31, 0xd7, 0x01, 0x8e, 0xa2, 0x11, 0x88,
++      0x95, 0xb9, 0xf2, 0x9b, 0xd3, 0x7f, 0x1b, 0x81,
++      0x22, 0xf7, 0x98, 0x60, 0x0a, 0x64, 0xa6, 0xc1,
++      0xf6, 0x49, 0xc7, 0xe3, 0x07, 0x4d, 0x94, 0x7a,
++      0xcf, 0x6e, 0x68, 0x0c, 0x1b, 0x3f, 0x6e, 0x2e,
++      0xee, 0x92, 0xfa, 0x52, 0xb3, 0x59, 0xf8, 0xf1,
++      0x8f, 0x6a, 0x66, 0xa3, 0x82, 0x76, 0x4a, 0x07,
++      0x1a, 0xc7, 0xdd, 0xf5, 0xda, 0x9c, 0x3c, 0x24,
++      0xbf, 0xfd, 0x42, 0xa1, 0x10, 0x64, 0x6a, 0x0f,
++      0x89, 0xee, 0x36, 0xa5, 0xce, 0x99, 0x48, 0x6a,
++      0xf0, 0x9f, 0x9e, 0x69, 0xa4, 0x40, 0x20, 0xe9,
++      0x16, 0x15, 0xf7, 0xdb, 0x75, 0x02, 0xcb, 0xe9,
++      0x73, 0x8b, 0x3b, 0x49, 0x2f, 0xf0, 0xaf, 0x51,
++      0x06, 0x5c, 0xdf, 0x27, 0x27, 0x49, 0x6a, 0xd1,
++      0xcc, 0xc7, 0xb5, 0x63, 0xb5, 0xfc, 0xb8, 0x5c,
++      0x87, 0x7f, 0x84, 0xb4, 0xcc, 0x14, 0xa9, 0x53,
++      0xda, 0xa4, 0x56, 0xf8, 0xb6, 0x1b, 0xcc, 0x40,
++      0x27, 0x52, 0x06, 0x5a, 0x13, 0x81, 0xd7, 0x3a,
++      0xd4, 0x3b, 0xfb, 0x49, 0x65, 0x31, 0x33, 0xb2,
++      0xfa, 0xcd, 0xad, 0x58, 0x4e, 0x2b, 0xae, 0xd2,
++      0x20, 0xfb, 0x1a, 0x48, 0xb4, 0x3f, 0x9a, 0xd8,
++      0x7a, 0x35, 0x4a, 0xc8, 0xee, 0x88, 0x5e, 0x07,
++      0x66, 0x54, 0xb9, 0xec, 0x9f, 0xa3, 0xe3, 0xb9,
++      0x37, 0xaa, 0x49, 0x76, 0x31, 0xda, 0x74, 0x2d,
++      0x3c, 0xa4, 0x65, 0x10, 0x32, 0x38, 0xf0, 0xde,
++      0xd3, 0x99, 0x17, 0xaa, 0x71, 0xaa, 0x8f, 0x0f,
++      0x8c, 0xaf, 0xa2, 0xf8, 0x5d, 0x64, 0xba, 0x1d,
++      0xa3, 0xef, 0x96, 0x73, 0xe8, 0xa1, 0x02, 0x8d,
++      0x0c, 0x6d, 0xb8, 0x06, 0x90, 0xb8, 0x08, 0x56,
++      0x2c, 0xa7, 0x06, 0xc9, 0xc2, 0x38, 0xdb, 0x7c,
++      0x63, 0xb1, 0x57, 0x8e, 0xea, 0x7c, 0x79, 0xf3,
++      0x49, 0x1d, 0xfe, 0x9f, 0xf3, 0x6e, 0xb1, 0x1d,
++      0xba, 0x19, 0x80, 0x1a, 0x0a, 0xd3, 0xb0, 0x26,
++      0x21, 0x40, 0xb1, 0x7c, 0xf9, 0x4d, 0x8d, 0x10,
++      0xc1, 0x7e, 0xf4, 0xf6, 0x3c, 0xa8, 0xfd, 0x7c,
++      0xa3, 0x92, 0xb2, 0x0f, 0xaa, 0xcc, 0xa6, 0x11,
++      0xfe, 0x04, 0xe3, 0xd1, 0x7a, 0x32, 0x89, 0xdf,
++      0x0d, 0xc4, 0x8f, 0x79, 0x6b, 0xca, 0x16, 0x7c,
++      0x6e, 0xf9, 0xad, 0x0f, 0xf6, 0xfe, 0x27, 0xdb,
++      0xc4, 0x13, 0x70, 0xf1, 0x62, 0x1a, 0x4f, 0x79,
++      0x40, 0xc9, 0x9b, 0x8b, 0x21, 0xea, 0x84, 0xfa,
++      0xf5, 0xf1, 0x89, 0xce, 0xb7, 0x55, 0x0a, 0x80,
++      0x39, 0x2f, 0x55, 0x36, 0x16, 0x9c, 0x7b, 0x08,
++      0xbd, 0x87, 0x0d, 0xa5, 0x32, 0xf1, 0x52, 0x7c,
++      0xe8, 0x55, 0x60, 0x5b, 0xd7, 0x69, 0xe4, 0xfc,
++      0xfa, 0x12, 0x85, 0x96, 0xea, 0x50, 0x28, 0xab,
++      0x8a, 0xf7, 0xbb, 0x0e, 0x53, 0x74, 0xca, 0xa6,
++      0x27, 0x09, 0xc2, 0xb5, 0xde, 0x18, 0x14, 0xd9,
++      0xea, 0xe5, 0x29, 0x1c, 0x40, 0x56, 0xcf, 0xd7,
++      0xae, 0x05, 0x3f, 0x65, 0xaf, 0x05, 0x73, 0xe2,
++      0x35, 0x96, 0x27, 0x07, 0x14, 0xc0, 0xad, 0x33,
++      0xf1, 0xdc, 0x44, 0x7a, 0x89, 0x17, 0x77, 0xd2,
++      0x9c, 0x58, 0x60, 0xf0, 0x3f, 0x7b, 0x2d, 0x2e,
++      0x57, 0x95, 0x54, 0x87, 0xed, 0xf2, 0xc7, 0x4c,
++      0xf0, 0xae, 0x56, 0x29, 0x19, 0x7d, 0x66, 0x4b,
++      0x9b, 0x83, 0x84, 0x42, 0x3b, 0x01, 0x25, 0x66,
++      0x8e, 0x02, 0xde, 0xb9, 0x83, 0x54, 0x19, 0xf6,
++      0x9f, 0x79, 0x0d, 0x67, 0xc5, 0x1d, 0x7a, 0x44,
++      0x02, 0x98, 0xa7, 0x16, 0x1c, 0x29, 0x0d, 0x74,
++      0xff, 0x85, 0x40, 0x06, 0xef, 0x2c, 0xa9, 0xc6,
++      0xf5, 0x53, 0x07, 0x06, 0xae, 0xe4, 0xfa, 0x5f,
++      0xd8, 0x39, 0x4d, 0xf1, 0x9b, 0x6b, 0xd9, 0x24,
++      0x84, 0xfe, 0x03, 0x4c, 0xb2, 0x3f, 0xdf, 0xa1,
++      0x05, 0x9e, 0x50, 0x14, 0x5a, 0xd9, 0x1a, 0xa2,
++      0xa7, 0xfa, 0xfa, 0x17, 0xf7, 0x78, 0xd6, 0xb5,
++      0x92, 0x61, 0x91, 0xac, 0x36, 0xfa, 0x56, 0x0d,
++      0x38, 0x32, 0x18, 0x85, 0x08, 0x58, 0x37, 0xf0,
++      0x4b, 0xdb, 0x59, 0xe7, 0xa4, 0x34, 0xc0, 0x1b,
++      0x01, 0xaf, 0x2d, 0xde, 0xa1, 0xaa, 0x5d, 0xd3,
++      0xec, 0xe1, 0xd4, 0xf7, 0xe6, 0x54, 0x68, 0xf0,
++      0x51, 0x97, 0xa7, 0x89, 0xea, 0x24, 0xad, 0xd3,
++      0x6e, 0x47, 0x93, 0x8b, 0x4b, 0xb4, 0xf7, 0x1c,
++      0x42, 0x06, 0x67, 0xe8, 0x99, 0xf6, 0xf5, 0x7b,
++      0x85, 0xb5, 0x65, 0xb5, 0xb5, 0xd2, 0x37, 0xf5,
++      0xf3, 0x02, 0xa6, 0x4d, 0x11, 0xa7, 0xdc, 0x51,
++      0x09, 0x7f, 0xa0, 0xd8, 0x88, 0x1c, 0x13, 0x71,
++      0xae, 0x9c, 0xb7, 0x7b, 0x34, 0xd6, 0x4e, 0x68,
++      0x26, 0x83, 0x51, 0xaf, 0x1d, 0xee, 0x8b, 0xbb,
++      0x69, 0x43, 0x2b, 0x9e, 0x8a, 0xbc, 0x02, 0x0e,
++      0xa0, 0x1b, 0xe0, 0xa8, 0x5f, 0x6f, 0xaf, 0x1b,
++      0x8f, 0xe7, 0x64, 0x71, 0x74, 0x11, 0x7e, 0xa8,
++      0xd8, 0xf9, 0x97, 0x06, 0xc3, 0xb6, 0xfb, 0xfb,
++      0xb7, 0x3d, 0x35, 0x9d, 0x3b, 0x52, 0xed, 0x54,
++      0xca, 0xf4, 0x81, 0x01, 0x2d, 0x1b, 0xc3, 0xa7,
++      0x00, 0x3d, 0x1a, 0x39, 0x54, 0xe1, 0xf6, 0xff,
++      0xed, 0x6f, 0x0b, 0x5a, 0x68, 0xda, 0x58, 0xdd,
++      0xa9, 0xcf, 0x5c, 0x4a, 0xe5, 0x09, 0x4e, 0xde,
++      0x9d, 0xbc, 0x3e, 0xee, 0x5a, 0x00, 0x3b, 0x2c,
++      0x87, 0x10, 0x65, 0x60, 0xdd, 0xd7, 0x56, 0xd1,
++      0x4c, 0x64, 0x45, 0xe4, 0x21, 0xec, 0x78, 0xf8,
++      0x25, 0x7a, 0x3e, 0x16, 0x5d, 0x09, 0x53, 0x14,
++      0xbe, 0x4f, 0xae, 0x87, 0xd8, 0xd1, 0xaa, 0x3c,
++      0xf6, 0x3e, 0xa4, 0x70, 0x8c, 0x5e, 0x70, 0xa4,
++      0xb3, 0x6b, 0x66, 0x73, 0xd3, 0xbf, 0x31, 0x06,
++      0x19, 0x62, 0x93, 0x15, 0xf2, 0x86, 0xe4, 0x52,
++      0x7e, 0x53, 0x4c, 0x12, 0x38, 0xcc, 0x34, 0x7d,
++      0x57, 0xf6, 0x42, 0x93, 0x8a, 0xc4, 0xee, 0x5c,
++      0x8a, 0xe1, 0x52, 0x8f, 0x56, 0x64, 0xf6, 0xa6,
++      0xd1, 0x91, 0x57, 0x70, 0xcd, 0x11, 0x76, 0xf5,
++      0x59, 0x60, 0x60, 0x3c, 0xc1, 0xc3, 0x0b, 0x7f,
++      0x58, 0x1a, 0x50, 0x91, 0xf1, 0x68, 0x8f, 0x6e,
++      0x74, 0x74, 0xa8, 0x51, 0x0b, 0xf7, 0x7a, 0x98,
++      0x37, 0xf2, 0x0a, 0x0e, 0xa4, 0x97, 0x04, 0xb8,
++      0x9b, 0xfd, 0xa0, 0xea, 0xf7, 0x0d, 0xe1, 0xdb,
++      0x03, 0xf0, 0x31, 0x29, 0xf8, 0xdd, 0x6b, 0x8b,
++      0x5d, 0xd8, 0x59, 0xa9, 0x29, 0xcf, 0x9a, 0x79,
++      0x89, 0x19, 0x63, 0x46, 0x09, 0x79, 0x6a, 0x11,
++      0xda, 0x63, 0x68, 0x48, 0x77, 0x23, 0xfb, 0x7d,
++      0x3a, 0x43, 0xcb, 0x02, 0x3b, 0x7a, 0x6d, 0x10,
++      0x2a, 0x9e, 0xac, 0xf1, 0xd4, 0x19, 0xf8, 0x23,
++      0x64, 0x1d, 0x2c, 0x5f, 0xf2, 0xb0, 0x5c, 0x23,
++      0x27, 0xf7, 0x27, 0x30, 0x16, 0x37, 0xb1, 0x90,
++      0xab, 0x38, 0xfb, 0x55, 0xcd, 0x78, 0x58, 0xd4,
++      0x7d, 0x43, 0xf6, 0x45, 0x5e, 0x55, 0x8d, 0xb1,
++      0x02, 0x65, 0x58, 0xb4, 0x13, 0x4b, 0x36, 0xf7,
++      0xcc, 0xfe, 0x3d, 0x0b, 0x82, 0xe2, 0x12, 0x11,
++      0xbb, 0xe6, 0xb8, 0x3a, 0x48, 0x71, 0xc7, 0x50,
++      0x06, 0x16, 0x3a, 0xe6, 0x7c, 0x05, 0xc7, 0xc8,
++      0x4d, 0x2f, 0x08, 0x6a, 0x17, 0x9a, 0x95, 0x97,
++      0x50, 0x68, 0xdc, 0x28, 0x18, 0xc4, 0x61, 0x38,
++      0xb9, 0xe0, 0x3e, 0x78, 0xdb, 0x29, 0xe0, 0x9f,
++      0x52, 0xdd, 0xf8, 0x4f, 0x91, 0xc1, 0xd0, 0x33,
++      0xa1, 0x7a, 0x8e, 0x30, 0x13, 0x82, 0x07, 0x9f,
++      0xd3, 0x31, 0x0f, 0x23, 0xbe, 0x32, 0x5a, 0x75,
++      0xcf, 0x96, 0xb2, 0xec, 0xb5, 0x32, 0xac, 0x21,
++      0xd1, 0x82, 0x33, 0xd3, 0x15, 0x74, 0xbd, 0x90,
++      0xf1, 0x2c, 0xe6, 0x5f, 0x8d, 0xe3, 0x02, 0xe8,
++      0xe9, 0xc4, 0xca, 0x96, 0xeb, 0x0e, 0xbc, 0x91,
++      0xf4, 0xb9, 0xea, 0xd9, 0x1b, 0x75, 0xbd, 0xe1,
++      0xac, 0x2a, 0x05, 0x37, 0x52, 0x9b, 0x1b, 0x3f,
++      0x5a, 0xdc, 0x21, 0xc3, 0x98, 0xbb, 0xaf, 0xa3,
++      0xf2, 0x00, 0xbf, 0x0d, 0x30, 0x89, 0x05, 0xcc,
++      0xa5, 0x76, 0xf5, 0x06, 0xf0, 0xc6, 0x54, 0x8a,
++      0x5d, 0xd4, 0x1e, 0xc1, 0xf2, 0xce, 0xb0, 0x62,
++      0xc8, 0xfc, 0x59, 0x42, 0x9a, 0x90, 0x60, 0x55,
++      0xfe, 0x88, 0xa5, 0x8b, 0xb8, 0x33, 0x0c, 0x23,
++      0x24, 0x0d, 0x15, 0x70, 0x37, 0x1e, 0x3d, 0xf6,
++      0xd2, 0xea, 0x92, 0x10, 0xb2, 0xc4, 0x51, 0xac,
++      0xf2, 0xac, 0xf3, 0x6b, 0x6c, 0xaa, 0xcf, 0x12,
++      0xc5, 0x6c, 0x90, 0x50, 0xb5, 0x0c, 0xfc, 0x1a,
++      0x15, 0x52, 0xe9, 0x26, 0xc6, 0x52, 0xa4, 0xe7,
++      0x81, 0x69, 0xe1, 0xe7, 0x9e, 0x30, 0x01, 0xec,
++      0x84, 0x89, 0xb2, 0x0d, 0x66, 0xdd, 0xce, 0x28,
++      0x5c, 0xec, 0x98, 0x46, 0x68, 0x21, 0x9f, 0x88,
++      0x3f, 0x1f, 0x42, 0x77, 0xce, 0xd0, 0x61, 0xd4,
++      0x20, 0xa7, 0xff, 0x53, 0xad, 0x37, 0xd0, 0x17,
++      0x35, 0xc9, 0xfc, 0xba, 0x0a, 0x78, 0x3f, 0xf2,
++      0xcc, 0x86, 0x89, 0xe8, 0x4b, 0x3c, 0x48, 0x33,
++      0x09, 0x7f, 0xc6, 0xc0, 0xdd, 0xb8, 0xfd, 0x7a,
++      0x66, 0x66, 0x65, 0xeb, 0x47, 0xa7, 0x04, 0x28,
++      0xa3, 0x19, 0x8e, 0xa9, 0xb1, 0x13, 0x67, 0x62,
++      0x70, 0xcf, 0xd6
++};
++static const u8 dec_output012[] __initconst = {
++      0x74, 0xa6, 0x3e, 0xe4, 0xb1, 0xcb, 0xaf, 0xb0,
++      0x40, 0xe5, 0x0f, 0x9e, 0xf1, 0xf2, 0x89, 0xb5,
++      0x42, 0x34, 0x8a, 0xa1, 0x03, 0xb7, 0xe9, 0x57,
++      0x46, 0xbe, 0x20, 0xe4, 0x6e, 0xb0, 0xeb, 0xff,
++      0xea, 0x07, 0x7e, 0xef, 0xe2, 0x55, 0x9f, 0xe5,
++      0x78, 0x3a, 0xb7, 0x83, 0xc2, 0x18, 0x40, 0x7b,
++      0xeb, 0xcd, 0x81, 0xfb, 0x90, 0x12, 0x9e, 0x46,
++      0xa9, 0xd6, 0x4a, 0xba, 0xb0, 0x62, 0xdb, 0x6b,
++      0x99, 0xc4, 0xdb, 0x54, 0x4b, 0xb8, 0xa5, 0x71,
++      0xcb, 0xcd, 0x63, 0x32, 0x55, 0xfb, 0x31, 0xf0,
++      0x38, 0xf5, 0xbe, 0x78, 0xe4, 0x45, 0xce, 0x1b,
++      0x6a, 0x5b, 0x0e, 0xf4, 0x16, 0xe4, 0xb1, 0x3d,
++      0xf6, 0x63, 0x7b, 0xa7, 0x0c, 0xde, 0x6f, 0x8f,
++      0x74, 0xdf, 0xe0, 0x1e, 0x9d, 0xce, 0x8f, 0x24,
++      0xef, 0x23, 0x35, 0x33, 0x7b, 0x83, 0x34, 0x23,
++      0x58, 0x74, 0x14, 0x77, 0x1f, 0xc2, 0x4f, 0x4e,
++      0xc6, 0x89, 0xf9, 0x52, 0x09, 0x37, 0x64, 0x14,
++      0xc4, 0x01, 0x6b, 0x9d, 0x77, 0xe8, 0x90, 0x5d,
++      0xa8, 0x4a, 0x2a, 0xef, 0x5c, 0x7f, 0xeb, 0xbb,
++      0xb2, 0xc6, 0x93, 0x99, 0x66, 0xdc, 0x7f, 0xd4,
++      0x9e, 0x2a, 0xca, 0x8d, 0xdb, 0xe7, 0x20, 0xcf,
++      0xe4, 0x73, 0xae, 0x49, 0x7d, 0x64, 0x0f, 0x0e,
++      0x28, 0x46, 0xa9, 0xa8, 0x32, 0xe4, 0x0e, 0xf6,
++      0x51, 0x53, 0xb8, 0x3c, 0xb1, 0xff, 0xa3, 0x33,
++      0x41, 0x75, 0xff, 0xf1, 0x6f, 0xf1, 0xfb, 0xbb,
++      0x83, 0x7f, 0x06, 0x9b, 0xe7, 0x1b, 0x0a, 0xe0,
++      0x5c, 0x33, 0x60, 0x5b, 0xdb, 0x5b, 0xed, 0xfe,
++      0xa5, 0x16, 0x19, 0x72, 0xa3, 0x64, 0x23, 0x00,
++      0x02, 0xc7, 0xf3, 0x6a, 0x81, 0x3e, 0x44, 0x1d,
++      0x79, 0x15, 0x5f, 0x9a, 0xde, 0xe2, 0xfd, 0x1b,
++      0x73, 0xc1, 0xbc, 0x23, 0xba, 0x31, 0xd2, 0x50,
++      0xd5, 0xad, 0x7f, 0x74, 0xa7, 0xc9, 0xf8, 0x3e,
++      0x2b, 0x26, 0x10, 0xf6, 0x03, 0x36, 0x74, 0xe4,
++      0x0e, 0x6a, 0x72, 0xb7, 0x73, 0x0a, 0x42, 0x28,
++      0xc2, 0xad, 0x5e, 0x03, 0xbe, 0xb8, 0x0b, 0xa8,
++      0x5b, 0xd4, 0xb8, 0xba, 0x52, 0x89, 0xb1, 0x9b,
++      0xc1, 0xc3, 0x65, 0x87, 0xed, 0xa5, 0xf4, 0x86,
++      0xfd, 0x41, 0x80, 0x91, 0x27, 0x59, 0x53, 0x67,
++      0x15, 0x78, 0x54, 0x8b, 0x2d, 0x3d, 0xc7, 0xff,
++      0x02, 0x92, 0x07, 0x5f, 0x7a, 0x4b, 0x60, 0x59,
++      0x3c, 0x6f, 0x5c, 0xd8, 0xec, 0x95, 0xd2, 0xfe,
++      0xa0, 0x3b, 0xd8, 0x3f, 0xd1, 0x69, 0xa6, 0xd6,
++      0x41, 0xb2, 0xf4, 0x4d, 0x12, 0xf4, 0x58, 0x3e,
++      0x66, 0x64, 0x80, 0x31, 0x9b, 0xa8, 0x4c, 0x8b,
++      0x07, 0xb2, 0xec, 0x66, 0x94, 0x66, 0x47, 0x50,
++      0x50, 0x5f, 0x18, 0x0b, 0x0e, 0xd6, 0xc0, 0x39,
++      0x21, 0x13, 0x9e, 0x33, 0xbc, 0x79, 0x36, 0x02,
++      0x96, 0x70, 0xf0, 0x48, 0x67, 0x2f, 0x26, 0xe9,
++      0x6d, 0x10, 0xbb, 0xd6, 0x3f, 0xd1, 0x64, 0x7a,
++      0x2e, 0xbe, 0x0c, 0x61, 0xf0, 0x75, 0x42, 0x38,
++      0x23, 0xb1, 0x9e, 0x9f, 0x7c, 0x67, 0x66, 0xd9,
++      0x58, 0x9a, 0xf1, 0xbb, 0x41, 0x2a, 0x8d, 0x65,
++      0x84, 0x94, 0xfc, 0xdc, 0x6a, 0x50, 0x64, 0xdb,
++      0x56, 0x33, 0x76, 0x00, 0x10, 0xed, 0xbe, 0xd2,
++      0x12, 0xf6, 0xf6, 0x1b, 0xa2, 0x16, 0xde, 0xae,
++      0x31, 0x95, 0xdd, 0xb1, 0x08, 0x7e, 0x4e, 0xee,
++      0xe7, 0xf9, 0xa5, 0xfb, 0x5b, 0x61, 0x43, 0x00,
++      0x40, 0xf6, 0x7e, 0x02, 0x04, 0x32, 0x4e, 0x0c,
++      0xe2, 0x66, 0x0d, 0xd7, 0x07, 0x98, 0x0e, 0xf8,
++      0x72, 0x34, 0x6d, 0x95, 0x86, 0xd7, 0xcb, 0x31,
++      0x54, 0x47, 0xd0, 0x38, 0x29, 0x9c, 0x5a, 0x68,
++      0xd4, 0x87, 0x76, 0xc9, 0xe7, 0x7e, 0xe3, 0xf4,
++      0x81, 0x6d, 0x18, 0xcb, 0xc9, 0x05, 0xaf, 0xa0,
++      0xfb, 0x66, 0xf7, 0xf1, 0x1c, 0xc6, 0x14, 0x11,
++      0x4f, 0x2b, 0x79, 0x42, 0x8b, 0xbc, 0xac, 0xe7,
++      0x6c, 0xfe, 0x0f, 0x58, 0xe7, 0x7c, 0x78, 0x39,
++      0x30, 0xb0, 0x66, 0x2c, 0x9b, 0x6d, 0x3a, 0xe1,
++      0xcf, 0xc9, 0xa4, 0x0e, 0x6d, 0x6d, 0x8a, 0xa1,
++      0x3a, 0xe7, 0x28, 0xd4, 0x78, 0x4c, 0xa6, 0xa2,
++      0x2a, 0xa6, 0x03, 0x30, 0xd7, 0xa8, 0x25, 0x66,
++      0x87, 0x2f, 0x69, 0x5c, 0x4e, 0xdd, 0xa5, 0x49,
++      0x5d, 0x37, 0x4a, 0x59, 0xc4, 0xaf, 0x1f, 0xa2,
++      0xe4, 0xf8, 0xa6, 0x12, 0x97, 0xd5, 0x79, 0xf5,
++      0xe2, 0x4a, 0x2b, 0x5f, 0x61, 0xe4, 0x9e, 0xe3,
++      0xee, 0xb8, 0xa7, 0x5b, 0x2f, 0xf4, 0x9e, 0x6c,
++      0xfb, 0xd1, 0xc6, 0x56, 0x77, 0xba, 0x75, 0xaa,
++      0x3d, 0x1a, 0xa8, 0x0b, 0xb3, 0x68, 0x24, 0x00,
++      0x10, 0x7f, 0xfd, 0xd7, 0xa1, 0x8d, 0x83, 0x54,
++      0x4f, 0x1f, 0xd8, 0x2a, 0xbe, 0x8a, 0x0c, 0x87,
++      0xab, 0xa2, 0xde, 0xc3, 0x39, 0xbf, 0x09, 0x03,
++      0xa5, 0xf3, 0x05, 0x28, 0xe1, 0xe1, 0xee, 0x39,
++      0x70, 0x9c, 0xd8, 0x81, 0x12, 0x1e, 0x02, 0x40,
++      0xd2, 0x6e, 0xf0, 0xeb, 0x1b, 0x3d, 0x22, 0xc6,
++      0xe5, 0xe3, 0xb4, 0x5a, 0x98, 0xbb, 0xf0, 0x22,
++      0x28, 0x8d, 0xe5, 0xd3, 0x16, 0x48, 0x24, 0xa5,
++      0xe6, 0x66, 0x0c, 0xf9, 0x08, 0xf9, 0x7e, 0x1e,
++      0xe1, 0x28, 0x26, 0x22, 0xc7, 0xc7, 0x0a, 0x32,
++      0x47, 0xfa, 0xa3, 0xbe, 0x3c, 0xc4, 0xc5, 0x53,
++      0x0a, 0xd5, 0x94, 0x4a, 0xd7, 0x93, 0xd8, 0x42,
++      0x99, 0xb9, 0x0a, 0xdb, 0x56, 0xf7, 0xb9, 0x1c,
++      0x53, 0x4f, 0xfa, 0xd3, 0x74, 0xad, 0xd9, 0x68,
++      0xf1, 0x1b, 0xdf, 0x61, 0xc6, 0x5e, 0xa8, 0x48,
++      0xfc, 0xd4, 0x4a, 0x4c, 0x3c, 0x32, 0xf7, 0x1c,
++      0x96, 0x21, 0x9b, 0xf9, 0xa3, 0xcc, 0x5a, 0xce,
++      0xd5, 0xd7, 0x08, 0x24, 0xf6, 0x1c, 0xfd, 0xdd,
++      0x38, 0xc2, 0x32, 0xe9, 0xb8, 0xe7, 0xb6, 0xfa,
++      0x9d, 0x45, 0x13, 0x2c, 0x83, 0xfd, 0x4a, 0x69,
++      0x82, 0xcd, 0xdc, 0xb3, 0x76, 0x0c, 0x9e, 0xd8,
++      0xf4, 0x1b, 0x45, 0x15, 0xb4, 0x97, 0xe7, 0x58,
++      0x34, 0xe2, 0x03, 0x29, 0x5a, 0xbf, 0xb6, 0xe0,
++      0x5d, 0x13, 0xd9, 0x2b, 0xb4, 0x80, 0xb2, 0x45,
++      0x81, 0x6a, 0x2e, 0x6c, 0x89, 0x7d, 0xee, 0xbb,
++      0x52, 0xdd, 0x1f, 0x18, 0xe7, 0x13, 0x6b, 0x33,
++      0x0e, 0xea, 0x36, 0x92, 0x77, 0x7b, 0x6d, 0x9c,
++      0x5a, 0x5f, 0x45, 0x7b, 0x7b, 0x35, 0x62, 0x23,
++      0xd1, 0xbf, 0x0f, 0xd0, 0x08, 0x1b, 0x2b, 0x80,
++      0x6b, 0x7e, 0xf1, 0x21, 0x47, 0xb0, 0x57, 0xd1,
++      0x98, 0x72, 0x90, 0x34, 0x1c, 0x20, 0x04, 0xff,
++      0x3d, 0x5c, 0xee, 0x0e, 0x57, 0x5f, 0x6f, 0x24,
++      0x4e, 0x3c, 0xea, 0xfc, 0xa5, 0xa9, 0x83, 0xc9,
++      0x61, 0xb4, 0x51, 0x24, 0xf8, 0x27, 0x5e, 0x46,
++      0x8c, 0xb1, 0x53, 0x02, 0x96, 0x35, 0xba, 0xb8,
++      0x4c, 0x71, 0xd3, 0x15, 0x59, 0x35, 0x22, 0x20,
++      0xad, 0x03, 0x9f, 0x66, 0x44, 0x3b, 0x9c, 0x35,
++      0x37, 0x1f, 0x9b, 0xbb, 0xf3, 0xdb, 0x35, 0x63,
++      0x30, 0x64, 0xaa, 0xa2, 0x06, 0xa8, 0x5d, 0xbb,
++      0xe1, 0x9f, 0x70, 0xec, 0x82, 0x11, 0x06, 0x36,
++      0xec, 0x8b, 0x69, 0x66, 0x24, 0x44, 0xc9, 0x4a,
++      0x57, 0xbb, 0x9b, 0x78, 0x13, 0xce, 0x9c, 0x0c,
++      0xba, 0x92, 0x93, 0x63, 0xb8, 0xe2, 0x95, 0x0f,
++      0x0f, 0x16, 0x39, 0x52, 0xfd, 0x3a, 0x6d, 0x02,
++      0x4b, 0xdf, 0x13, 0xd3, 0x2a, 0x22, 0xb4, 0x03,
++      0x7c, 0x54, 0x49, 0x96, 0x68, 0x54, 0x10, 0xfa,
++      0xef, 0xaa, 0x6c, 0xe8, 0x22, 0xdc, 0x71, 0x16,
++      0x13, 0x1a, 0xf6, 0x28, 0xe5, 0x6d, 0x77, 0x3d,
++      0xcd, 0x30, 0x63, 0xb1, 0x70, 0x52, 0xa1, 0xc5,
++      0x94, 0x5f, 0xcf, 0xe8, 0xb8, 0x26, 0x98, 0xf7,
++      0x06, 0xa0, 0x0a, 0x70, 0xfa, 0x03, 0x80, 0xac,
++      0xc1, 0xec, 0xd6, 0x4c, 0x54, 0xd7, 0xfe, 0x47,
++      0xb6, 0x88, 0x4a, 0xf7, 0x71, 0x24, 0xee, 0xf3,
++      0xd2, 0xc2, 0x4a, 0x7f, 0xfe, 0x61, 0xc7, 0x35,
++      0xc9, 0x37, 0x67, 0xcb, 0x24, 0x35, 0xda, 0x7e,
++      0xca, 0x5f, 0xf3, 0x8d, 0xd4, 0x13, 0x8e, 0xd6,
++      0xcb, 0x4d, 0x53, 0x8f, 0x53, 0x1f, 0xc0, 0x74,
++      0xf7, 0x53, 0xb9, 0x5e, 0x23, 0x37, 0xba, 0x6e,
++      0xe3, 0x9d, 0x07, 0x55, 0x25, 0x7b, 0xe6, 0x2a,
++      0x64, 0xd1, 0x32, 0xdd, 0x54, 0x1b, 0x4b, 0xc0,
++      0xe1, 0xd7, 0x69, 0x58, 0xf8, 0x93, 0x29, 0xc4,
++      0xdd, 0x23, 0x2f, 0xa5, 0xfc, 0x9d, 0x7e, 0xf8,
++      0xd4, 0x90, 0xcd, 0x82, 0x55, 0xdc, 0x16, 0x16,
++      0x9f, 0x07, 0x52, 0x9b, 0x9d, 0x25, 0xed, 0x32,
++      0xc5, 0x7b, 0xdf, 0xf6, 0x83, 0x46, 0x3d, 0x65,
++      0xb7, 0xef, 0x87, 0x7a, 0x12, 0x69, 0x8f, 0x06,
++      0x7c, 0x51, 0x15, 0x4a, 0x08, 0xe8, 0xac, 0x9a,
++      0x0c, 0x24, 0xa7, 0x27, 0xd8, 0x46, 0x2f, 0xe7,
++      0x01, 0x0e, 0x1c, 0xc6, 0x91, 0xb0, 0x6e, 0x85,
++      0x65, 0xf0, 0x29, 0x0d, 0x2e, 0x6b, 0x3b, 0xfb,
++      0x4b, 0xdf, 0xe4, 0x80, 0x93, 0x03, 0x66, 0x46,
++      0x3e, 0x8a, 0x6e, 0xf3, 0x5e, 0x4d, 0x62, 0x0e,
++      0x49, 0x05, 0xaf, 0xd4, 0xf8, 0x21, 0x20, 0x61,
++      0x1d, 0x39, 0x17, 0xf4, 0x61, 0x47, 0x95, 0xfb,
++      0x15, 0x2e, 0xb3, 0x4f, 0xd0, 0x5d, 0xf5, 0x7d,
++      0x40, 0xda, 0x90, 0x3c, 0x6b, 0xcb, 0x17, 0x00,
++      0x13, 0x3b, 0x64, 0x34, 0x1b, 0xf0, 0xf2, 0xe5,
++      0x3b, 0xb2, 0xc7, 0xd3, 0x5f, 0x3a, 0x44, 0xa6,
++      0x9b, 0xb7, 0x78, 0x0e, 0x42, 0x5d, 0x4c, 0xc1,
++      0xe9, 0xd2, 0xcb, 0xb7, 0x78, 0xd1, 0xfe, 0x9a,
++      0xb5, 0x07, 0xe9, 0xe0, 0xbe, 0xe2, 0x8a, 0xa7,
++      0x01, 0x83, 0x00, 0x8c, 0x5c, 0x08, 0xe6, 0x63,
++      0x12, 0x92, 0xb7, 0xb7, 0xa6, 0x19, 0x7d, 0x38,
++      0x13, 0x38, 0x92, 0x87, 0x24, 0xf9, 0x48, 0xb3,
++      0x5e, 0x87, 0x6a, 0x40, 0x39, 0x5c, 0x3f, 0xed,
++      0x8f, 0xee, 0xdb, 0x15, 0x82, 0x06, 0xda, 0x49,
++      0x21, 0x2b, 0xb5, 0xbf, 0x32, 0x7c, 0x9f, 0x42,
++      0x28, 0x63, 0xcf, 0xaf, 0x1e, 0xf8, 0xc6, 0xa0,
++      0xd1, 0x02, 0x43, 0x57, 0x62, 0xec, 0x9b, 0x0f,
++      0x01, 0x9e, 0x71, 0xd8, 0x87, 0x9d, 0x01, 0xc1,
++      0x58, 0x77, 0xd9, 0xaf, 0xb1, 0x10, 0x7e, 0xdd,
++      0xa6, 0x50, 0x96, 0xe5, 0xf0, 0x72, 0x00, 0x6d,
++      0x4b, 0xf8, 0x2a, 0x8f, 0x19, 0xf3, 0x22, 0x88,
++      0x11, 0x4a, 0x8b, 0x7c, 0xfd, 0xb7, 0xed, 0xe1,
++      0xf6, 0x40, 0x39, 0xe0, 0xe9, 0xf6, 0x3d, 0x25,
++      0xe6, 0x74, 0x3c, 0x58, 0x57, 0x7f, 0xe1, 0x22,
++      0x96, 0x47, 0x31, 0x91, 0xba, 0x70, 0x85, 0x28,
++      0x6b, 0x9f, 0x6e, 0x25, 0xac, 0x23, 0x66, 0x2f,
++      0x29, 0x88, 0x28, 0xce, 0x8c, 0x5c, 0x88, 0x53,
++      0xd1, 0x3b, 0xcc, 0x6a, 0x51, 0xb2, 0xe1, 0x28,
++      0x3f, 0x91, 0xb4, 0x0d, 0x00, 0x3a, 0xe3, 0xf8,
++      0xc3, 0x8f, 0xd7, 0x96, 0x62, 0x0e, 0x2e, 0xfc,
++      0xc8, 0x6c, 0x77, 0xa6, 0x1d, 0x22, 0xc1, 0xb8,
++      0xe6, 0x61, 0xd7, 0x67, 0x36, 0x13, 0x7b, 0xbb,
++      0x9b, 0x59, 0x09, 0xa6, 0xdf, 0xf7, 0x6b, 0xa3,
++      0x40, 0x1a, 0xf5, 0x4f, 0xb4, 0xda, 0xd3, 0xf3,
++      0x81, 0x93, 0xc6, 0x18, 0xd9, 0x26, 0xee, 0xac,
++      0xf0, 0xaa, 0xdf, 0xc5, 0x9c, 0xca, 0xc2, 0xa2,
++      0xcc, 0x7b, 0x5c, 0x24, 0xb0, 0xbc, 0xd0, 0x6a,
++      0x4d, 0x89, 0x09, 0xb8, 0x07, 0xfe, 0x87, 0xad,
++      0x0a, 0xea, 0xb8, 0x42, 0xf9, 0x5e, 0xb3, 0x3e,
++      0x36, 0x4c, 0xaf, 0x75, 0x9e, 0x1c, 0xeb, 0xbd,
++      0xbc, 0xbb, 0x80, 0x40, 0xa7, 0x3a, 0x30, 0xbf,
++      0xa8, 0x44, 0xf4, 0xeb, 0x38, 0xad, 0x29, 0xba,
++      0x23, 0xed, 0x41, 0x0c, 0xea, 0xd2, 0xbb, 0x41,
++      0x18, 0xd6, 0xb9, 0xba, 0x65, 0x2b, 0xa3, 0x91,
++      0x6d, 0x1f, 0xa9, 0xf4, 0xd1, 0x25, 0x8d, 0x4d,
++      0x38, 0xff, 0x64, 0xa0, 0xec, 0xde, 0xa6, 0xb6,
++      0x79, 0xab, 0x8e, 0x33, 0x6c, 0x47, 0xde, 0xaf,
++      0x94, 0xa4, 0xa5, 0x86, 0x77, 0x55, 0x09, 0x92,
++      0x81, 0x31, 0x76, 0xc7, 0x34, 0x22, 0x89, 0x8e,
++      0x3d, 0x26, 0x26, 0xd7, 0xfc, 0x1e, 0x16, 0x72,
++      0x13, 0x33, 0x63, 0xd5, 0x22, 0xbe, 0xb8, 0x04,
++      0x34, 0x84, 0x41, 0xbb, 0x80, 0xd0, 0x9f, 0x46,
++      0x48, 0x07, 0xa7, 0xfc, 0x2b, 0x3a, 0x75, 0x55,
++      0x8c, 0xc7, 0x6a, 0xbd, 0x7e, 0x46, 0x08, 0x84,
++      0x0f, 0xd5, 0x74, 0xc0, 0x82, 0x8e, 0xaa, 0x61,
++      0x05, 0x01, 0xb2, 0x47, 0x6e, 0x20, 0x6a, 0x2d,
++      0x58, 0x70, 0x48, 0x32, 0xa7, 0x37, 0xd2, 0xb8,
++      0x82, 0x1a, 0x51, 0xb9, 0x61, 0xdd, 0xfd, 0x9d,
++      0x6b, 0x0e, 0x18, 0x97, 0xf8, 0x45, 0x5f, 0x87,
++      0x10, 0xcf, 0x34, 0x72, 0x45, 0x26, 0x49, 0x70,
++      0xe7, 0xa3, 0x78, 0xe0, 0x52, 0x89, 0x84, 0x94,
++      0x83, 0x82, 0xc2, 0x69, 0x8f, 0xe3, 0xe1, 0x3f,
++      0x60, 0x74, 0x88, 0xc4, 0xf7, 0x75, 0x2c, 0xfb,
++      0xbd, 0xb6, 0xc4, 0x7e, 0x10, 0x0a, 0x6c, 0x90,
++      0x04, 0x9e, 0xc3, 0x3f, 0x59, 0x7c, 0xce, 0x31,
++      0x18, 0x60, 0x57, 0x73, 0x46, 0x94, 0x7d, 0x06,
++      0xa0, 0x6d, 0x44, 0xec, 0xa2, 0x0a, 0x9e, 0x05,
++      0x15, 0xef, 0xca, 0x5c, 0xbf, 0x00, 0xeb, 0xf7,
++      0x3d, 0x32, 0xd4, 0xa5, 0xef, 0x49, 0x89, 0x5e,
++      0x46, 0xb0, 0xa6, 0x63, 0x5b, 0x8a, 0x73, 0xae,
++      0x6f, 0xd5, 0x9d, 0xf8, 0x4f, 0x40, 0xb5, 0xb2,
++      0x6e, 0xd3, 0xb6, 0x01, 0xa9, 0x26, 0xa2, 0x21,
++      0xcf, 0x33, 0x7a, 0x3a, 0xa4, 0x23, 0x13, 0xb0,
++      0x69, 0x6a, 0xee, 0xce, 0xd8, 0x9d, 0x01, 0x1d,
++      0x50, 0xc1, 0x30, 0x6c, 0xb1, 0xcd, 0xa0, 0xf0,
++      0xf0, 0xa2, 0x64, 0x6f, 0xbb, 0xbf, 0x5e, 0xe6,
++      0xab, 0x87, 0xb4, 0x0f, 0x4f, 0x15, 0xaf, 0xb5,
++      0x25, 0xa1, 0xb2, 0xd0, 0x80, 0x2c, 0xfb, 0xf9,
++      0xfe, 0xd2, 0x33, 0xbb, 0x76, 0xfe, 0x7c, 0xa8,
++      0x66, 0xf7, 0xe7, 0x85, 0x9f, 0x1f, 0x85, 0x57,
++      0x88, 0xe1, 0xe9, 0x63, 0xe4, 0xd8, 0x1c, 0xa1,
++      0xfb, 0xda, 0x44, 0x05, 0x2e, 0x1d, 0x3a, 0x1c,
++      0xff, 0xc8, 0x3b, 0xc0, 0xfe, 0xda, 0x22, 0x0b,
++      0x43, 0xd6, 0x88, 0x39, 0x4c, 0x4a, 0xa6, 0x69,
++      0x18, 0x93, 0x42, 0x4e, 0xb5, 0xcc, 0x66, 0x0d,
++      0x09, 0xf8, 0x1e, 0x7c, 0xd3, 0x3c, 0x99, 0x0d,
++      0x50, 0x1d, 0x62, 0xe9, 0x57, 0x06, 0xbf, 0x19,
++      0x88, 0xdd, 0xad, 0x7b, 0x4f, 0xf9, 0xc7, 0x82,
++      0x6d, 0x8d, 0xc8, 0xc4, 0xc5, 0x78, 0x17, 0x20,
++      0x15, 0xc5, 0x52, 0x41, 0xcf, 0x5b, 0xd6, 0x7f,
++      0x94, 0x02, 0x41, 0xe0, 0x40, 0x22, 0x03, 0x5e,
++      0xd1, 0x53, 0xd4, 0x86, 0xd3, 0x2c, 0x9f, 0x0f,
++      0x96, 0xe3, 0x6b, 0x9a, 0x76, 0x32, 0x06, 0x47,
++      0x4b, 0x11, 0xb3, 0xdd, 0x03, 0x65, 0xbd, 0x9b,
++      0x01, 0xda, 0x9c, 0xb9, 0x7e, 0x3f, 0x6a, 0xc4,
++      0x7b, 0xea, 0xd4, 0x3c, 0xb9, 0xfb, 0x5c, 0x6b,
++      0x64, 0x33, 0x52, 0xba, 0x64, 0x78, 0x8f, 0xa4,
++      0xaf, 0x7a, 0x61, 0x8d, 0xbc, 0xc5, 0x73, 0xe9,
++      0x6b, 0x58, 0x97, 0x4b, 0xbf, 0x63, 0x22, 0xd3,
++      0x37, 0x02, 0x54, 0xc5, 0xb9, 0x16, 0x4a, 0xf0,
++      0x19, 0xd8, 0x94, 0x57, 0xb8, 0x8a, 0xb3, 0x16,
++      0x3b, 0xd0, 0x84, 0x8e, 0x67, 0xa6, 0xa3, 0x7d,
++      0x78, 0xec, 0x00
++};
++static const u8 dec_assoc012[] __initconst = {
++      0xb1, 0x69, 0x83, 0x87, 0x30, 0xaa, 0x5d, 0xb8,
++      0x77, 0xe8, 0x21, 0xff, 0x06, 0x59, 0x35, 0xce,
++      0x75, 0xfe, 0x38, 0xef, 0xb8, 0x91, 0x43, 0x8c,
++      0xcf, 0x70, 0xdd, 0x0a, 0x68, 0xbf, 0xd4, 0xbc,
++      0x16, 0x76, 0x99, 0x36, 0x1e, 0x58, 0x79, 0x5e,
++      0xd4, 0x29, 0xf7, 0x33, 0x93, 0x48, 0xdb, 0x5f,
++      0x01, 0xae, 0x9c, 0xb6, 0xe4, 0x88, 0x6d, 0x2b,
++      0x76, 0x75, 0xe0, 0xf3, 0x74, 0xe2, 0xc9
++};
++static const u8 dec_nonce012[] __initconst = {
++      0x05, 0xa3, 0x93, 0xed, 0x30, 0xc5, 0xa2, 0x06
++};
++static const u8 dec_key012[] __initconst = {
++      0xb3, 0x35, 0x50, 0x03, 0x54, 0x2e, 0x40, 0x5e,
++      0x8f, 0x59, 0x8e, 0xc5, 0x90, 0xd5, 0x27, 0x2d,
++      0xba, 0x29, 0x2e, 0xcb, 0x1b, 0x70, 0x44, 0x1e,
++      0x65, 0x91, 0x6e, 0x2a, 0x79, 0x22, 0xda, 0x64
++};
++
++static const u8 dec_input013[] __initconst = {
++      0x52, 0x34, 0xb3, 0x65, 0x3b, 0xb7, 0xe5, 0xd3,
++      0xab, 0x49, 0x17, 0x60, 0xd2, 0x52, 0x56, 0xdf,
++      0xdf, 0x34, 0x56, 0x82, 0xe2, 0xbe, 0xe5, 0xe1,
++      0x28, 0xd1, 0x4e, 0x5f, 0x4f, 0x01, 0x7d, 0x3f,
++      0x99, 0x6b, 0x30, 0x6e, 0x1a, 0x7c, 0x4c, 0x8e,
++      0x62, 0x81, 0xae, 0x86, 0x3f, 0x6b, 0xd0, 0xb5,
++      0xa9, 0xcf, 0x50, 0xf1, 0x02, 0x12, 0xa0, 0x0b,
++      0x24, 0xe9, 0xe6, 0x72, 0x89, 0x2c, 0x52, 0x1b,
++      0x34, 0x38, 0xf8, 0x75, 0x5f, 0xa0, 0x74, 0xe2,
++      0x99, 0xdd, 0xa6, 0x4b, 0x14, 0x50, 0x4e, 0xf1,
++      0xbe, 0xd6, 0x9e, 0xdb, 0xb2, 0x24, 0x27, 0x74,
++      0x12, 0x4a, 0x78, 0x78, 0x17, 0xa5, 0x58, 0x8e,
++      0x2f, 0xf9, 0xf4, 0x8d, 0xee, 0x03, 0x88, 0xae,
++      0xb8, 0x29, 0xa1, 0x2f, 0x4b, 0xee, 0x92, 0xbd,
++      0x87, 0xb3, 0xce, 0x34, 0x21, 0x57, 0x46, 0x04,
++      0x49, 0x0c, 0x80, 0xf2, 0x01, 0x13, 0xa1, 0x55,
++      0xb3, 0xff, 0x44, 0x30, 0x3c, 0x1c, 0xd0, 0xef,
++      0xbc, 0x18, 0x74, 0x26, 0xad, 0x41, 0x5b, 0x5b,
++      0x3e, 0x9a, 0x7a, 0x46, 0x4f, 0x16, 0xd6, 0x74,
++      0x5a, 0xb7, 0x3a, 0x28, 0x31, 0xd8, 0xae, 0x26,
++      0xac, 0x50, 0x53, 0x86, 0xf2, 0x56, 0xd7, 0x3f,
++      0x29, 0xbc, 0x45, 0x68, 0x8e, 0xcb, 0x98, 0x64,
++      0xdd, 0xc9, 0xba, 0xb8, 0x4b, 0x7b, 0x82, 0xdd,
++      0x14, 0xa7, 0xcb, 0x71, 0x72, 0x00, 0x5c, 0xad,
++      0x7b, 0x6a, 0x89, 0xa4, 0x3d, 0xbf, 0xb5, 0x4b,
++      0x3e, 0x7c, 0x5a, 0xcf, 0xb8, 0xa1, 0xc5, 0x6e,
++      0xc8, 0xb6, 0x31, 0x57, 0x7b, 0xdf, 0xa5, 0x7e,
++      0xb1, 0xd6, 0x42, 0x2a, 0x31, 0x36, 0xd1, 0xd0,
++      0x3f, 0x7a, 0xe5, 0x94, 0xd6, 0x36, 0xa0, 0x6f,
++      0xb7, 0x40, 0x7d, 0x37, 0xc6, 0x55, 0x7c, 0x50,
++      0x40, 0x6d, 0x29, 0x89, 0xe3, 0x5a, 0xae, 0x97,
++      0xe7, 0x44, 0x49, 0x6e, 0xbd, 0x81, 0x3d, 0x03,
++      0x93, 0x06, 0x12, 0x06, 0xe2, 0x41, 0x12, 0x4a,
++      0xf1, 0x6a, 0xa4, 0x58, 0xa2, 0xfb, 0xd2, 0x15,
++      0xba, 0xc9, 0x79, 0xc9, 0xce, 0x5e, 0x13, 0xbb,
++      0xf1, 0x09, 0x04, 0xcc, 0xfd, 0xe8, 0x51, 0x34,
++      0x6a, 0xe8, 0x61, 0x88, 0xda, 0xed, 0x01, 0x47,
++      0x84, 0xf5, 0x73, 0x25, 0xf9, 0x1c, 0x42, 0x86,
++      0x07, 0xf3, 0x5b, 0x1a, 0x01, 0xb3, 0xeb, 0x24,
++      0x32, 0x8d, 0xf6, 0xed, 0x7c, 0x4b, 0xeb, 0x3c,
++      0x36, 0x42, 0x28, 0xdf, 0xdf, 0xb6, 0xbe, 0xd9,
++      0x8c, 0x52, 0xd3, 0x2b, 0x08, 0x90, 0x8c, 0xe7,
++      0x98, 0x31, 0xe2, 0x32, 0x8e, 0xfc, 0x11, 0x48,
++      0x00, 0xa8, 0x6a, 0x42, 0x4a, 0x02, 0xc6, 0x4b,
++      0x09, 0xf1, 0xe3, 0x49, 0xf3, 0x45, 0x1f, 0x0e,
++      0xbc, 0x56, 0xe2, 0xe4, 0xdf, 0xfb, 0xeb, 0x61,
++      0xfa, 0x24, 0xc1, 0x63, 0x75, 0xbb, 0x47, 0x75,
++      0xaf, 0xe1, 0x53, 0x16, 0x96, 0x21, 0x85, 0x26,
++      0x11, 0xb3, 0x76, 0xe3, 0x23, 0xa1, 0x6b, 0x74,
++      0x37, 0xd0, 0xde, 0x06, 0x90, 0x71, 0x5d, 0x43,
++      0x88, 0x9b, 0x00, 0x54, 0xa6, 0x75, 0x2f, 0xa1,
++      0xc2, 0x0b, 0x73, 0x20, 0x1d, 0xb6, 0x21, 0x79,
++      0x57, 0x3f, 0xfa, 0x09, 0xbe, 0x8a, 0x33, 0xc3,
++      0x52, 0xf0, 0x1d, 0x82, 0x31, 0xd1, 0x55, 0xb5,
++      0x6c, 0x99, 0x25, 0xcf, 0x5c, 0x32, 0xce, 0xe9,
++      0x0d, 0xfa, 0x69, 0x2c, 0xd5, 0x0d, 0xc5, 0x6d,
++      0x86, 0xd0, 0x0c, 0x3b, 0x06, 0x50, 0x79, 0xe8,
++      0xc3, 0xae, 0x04, 0xe6, 0xcd, 0x51, 0xe4, 0x26,
++      0x9b, 0x4f, 0x7e, 0xa6, 0x0f, 0xab, 0xd8, 0xe5,
++      0xde, 0xa9, 0x00, 0x95, 0xbe, 0xa3, 0x9d, 0x5d,
++      0xb2, 0x09, 0x70, 0x18, 0x1c, 0xf0, 0xac, 0x29,
++      0x23, 0x02, 0x29, 0x28, 0xd2, 0x74, 0x35, 0x57,
++      0x62, 0x0f, 0x24, 0xea, 0x5e, 0x33, 0xc2, 0x92,
++      0xf3, 0x78, 0x4d, 0x30, 0x1e, 0xa1, 0x99, 0xa9,
++      0x82, 0xb0, 0x42, 0x31, 0x8d, 0xad, 0x8a, 0xbc,
++      0xfc, 0xd4, 0x57, 0x47, 0x3e, 0xb4, 0x50, 0xdd,
++      0x6e, 0x2c, 0x80, 0x4d, 0x22, 0xf1, 0xfb, 0x57,
++      0xc4, 0xdd, 0x17, 0xe1, 0x8a, 0x36, 0x4a, 0xb3,
++      0x37, 0xca, 0xc9, 0x4e, 0xab, 0xd5, 0x69, 0xc4,
++      0xf4, 0xbc, 0x0b, 0x3b, 0x44, 0x4b, 0x29, 0x9c,
++      0xee, 0xd4, 0x35, 0x22, 0x21, 0xb0, 0x1f, 0x27,
++      0x64, 0xa8, 0x51, 0x1b, 0xf0, 0x9f, 0x19, 0x5c,
++      0xfb, 0x5a, 0x64, 0x74, 0x70, 0x45, 0x09, 0xf5,
++      0x64, 0xfe, 0x1a, 0x2d, 0xc9, 0x14, 0x04, 0x14,
++      0xcf, 0xd5, 0x7d, 0x60, 0xaf, 0x94, 0x39, 0x94,
++      0xe2, 0x7d, 0x79, 0x82, 0xd0, 0x65, 0x3b, 0x6b,
++      0x9c, 0x19, 0x84, 0xb4, 0x6d, 0xb3, 0x0c, 0x99,
++      0xc0, 0x56, 0xa8, 0xbd, 0x73, 0xce, 0x05, 0x84,
++      0x3e, 0x30, 0xaa, 0xc4, 0x9b, 0x1b, 0x04, 0x2a,
++      0x9f, 0xd7, 0x43, 0x2b, 0x23, 0xdf, 0xbf, 0xaa,
++      0xd5, 0xc2, 0x43, 0x2d, 0x70, 0xab, 0xdc, 0x75,
++      0xad, 0xac, 0xf7, 0xc0, 0xbe, 0x67, 0xb2, 0x74,
++      0xed, 0x67, 0x10, 0x4a, 0x92, 0x60, 0xc1, 0x40,
++      0x50, 0x19, 0x8a, 0x8a, 0x8c, 0x09, 0x0e, 0x72,
++      0xe1, 0x73, 0x5e, 0xe8, 0x41, 0x85, 0x63, 0x9f,
++      0x3f, 0xd7, 0x7d, 0xc4, 0xfb, 0x22, 0x5d, 0x92,
++      0x6c, 0xb3, 0x1e, 0xe2, 0x50, 0x2f, 0x82, 0xa8,
++      0x28, 0xc0, 0xb5, 0xd7, 0x5f, 0x68, 0x0d, 0x2c,
++      0x2d, 0xaf, 0x7e, 0xfa, 0x2e, 0x08, 0x0f, 0x1f,
++      0x70, 0x9f, 0xe9, 0x19, 0x72, 0x55, 0xf8, 0xfb,
++      0x51, 0xd2, 0x33, 0x5d, 0xa0, 0xd3, 0x2b, 0x0a,
++      0x6c, 0xbc, 0x4e, 0xcf, 0x36, 0x4d, 0xdc, 0x3b,
++      0xe9, 0x3e, 0x81, 0x7c, 0x61, 0xdb, 0x20, 0x2d,
++      0x3a, 0xc3, 0xb3, 0x0c, 0x1e, 0x00, 0xb9, 0x7c,
++      0xf5, 0xca, 0x10, 0x5f, 0x3a, 0x71, 0xb3, 0xe4,
++      0x20, 0xdb, 0x0c, 0x2a, 0x98, 0x63, 0x45, 0x00,
++      0x58, 0xf6, 0x68, 0xe4, 0x0b, 0xda, 0x13, 0x3b,
++      0x60, 0x5c, 0x76, 0xdb, 0xb9, 0x97, 0x71, 0xe4,
++      0xd9, 0xb7, 0xdb, 0xbd, 0x68, 0xc7, 0x84, 0x84,
++      0xaa, 0x7c, 0x68, 0x62, 0x5e, 0x16, 0xfc, 0xba,
++      0x72, 0xaa, 0x9a, 0xa9, 0xeb, 0x7c, 0x75, 0x47,
++      0x97, 0x7e, 0xad, 0xe2, 0xd9, 0x91, 0xe8, 0xe4,
++      0xa5, 0x31, 0xd7, 0x01, 0x8e, 0xa2, 0x11, 0x88,
++      0x95, 0xb9, 0xf2, 0x9b, 0xd3, 0x7f, 0x1b, 0x81,
++      0x22, 0xf7, 0x98, 0x60, 0x0a, 0x64, 0xa6, 0xc1,
++      0xf6, 0x49, 0xc7, 0xe3, 0x07, 0x4d, 0x94, 0x7a,
++      0xcf, 0x6e, 0x68, 0x0c, 0x1b, 0x3f, 0x6e, 0x2e,
++      0xee, 0x92, 0xfa, 0x52, 0xb3, 0x59, 0xf8, 0xf1,
++      0x8f, 0x6a, 0x66, 0xa3, 0x82, 0x76, 0x4a, 0x07,
++      0x1a, 0xc7, 0xdd, 0xf5, 0xda, 0x9c, 0x3c, 0x24,
++      0xbf, 0xfd, 0x42, 0xa1, 0x10, 0x64, 0x6a, 0x0f,
++      0x89, 0xee, 0x36, 0xa5, 0xce, 0x99, 0x48, 0x6a,
++      0xf0, 0x9f, 0x9e, 0x69, 0xa4, 0x40, 0x20, 0xe9,
++      0x16, 0x15, 0xf7, 0xdb, 0x75, 0x02, 0xcb, 0xe9,
++      0x73, 0x8b, 0x3b, 0x49, 0x2f, 0xf0, 0xaf, 0x51,
++      0x06, 0x5c, 0xdf, 0x27, 0x27, 0x49, 0x6a, 0xd1,
++      0xcc, 0xc7, 0xb5, 0x63, 0xb5, 0xfc, 0xb8, 0x5c,
++      0x87, 0x7f, 0x84, 0xb4, 0xcc, 0x14, 0xa9, 0x53,
++      0xda, 0xa4, 0x56, 0xf8, 0xb6, 0x1b, 0xcc, 0x40,
++      0x27, 0x52, 0x06, 0x5a, 0x13, 0x81, 0xd7, 0x3a,
++      0xd4, 0x3b, 0xfb, 0x49, 0x65, 0x31, 0x33, 0xb2,
++      0xfa, 0xcd, 0xad, 0x58, 0x4e, 0x2b, 0xae, 0xd2,
++      0x20, 0xfb, 0x1a, 0x48, 0xb4, 0x3f, 0x9a, 0xd8,
++      0x7a, 0x35, 0x4a, 0xc8, 0xee, 0x88, 0x5e, 0x07,
++      0x66, 0x54, 0xb9, 0xec, 0x9f, 0xa3, 0xe3, 0xb9,
++      0x37, 0xaa, 0x49, 0x76, 0x31, 0xda, 0x74, 0x2d,
++      0x3c, 0xa4, 0x65, 0x10, 0x32, 0x38, 0xf0, 0xde,
++      0xd3, 0x99, 0x17, 0xaa, 0x71, 0xaa, 0x8f, 0x0f,
++      0x8c, 0xaf, 0xa2, 0xf8, 0x5d, 0x64, 0xba, 0x1d,
++      0xa3, 0xef, 0x96, 0x73, 0xe8, 0xa1, 0x02, 0x8d,
++      0x0c, 0x6d, 0xb8, 0x06, 0x90, 0xb8, 0x08, 0x56,
++      0x2c, 0xa7, 0x06, 0xc9, 0xc2, 0x38, 0xdb, 0x7c,
++      0x63, 0xb1, 0x57, 0x8e, 0xea, 0x7c, 0x79, 0xf3,
++      0x49, 0x1d, 0xfe, 0x9f, 0xf3, 0x6e, 0xb1, 0x1d,
++      0xba, 0x19, 0x80, 0x1a, 0x0a, 0xd3, 0xb0, 0x26,
++      0x21, 0x40, 0xb1, 0x7c, 0xf9, 0x4d, 0x8d, 0x10,
++      0xc1, 0x7e, 0xf4, 0xf6, 0x3c, 0xa8, 0xfd, 0x7c,
++      0xa3, 0x92, 0xb2, 0x0f, 0xaa, 0xcc, 0xa6, 0x11,
++      0xfe, 0x04, 0xe3, 0xd1, 0x7a, 0x32, 0x89, 0xdf,
++      0x0d, 0xc4, 0x8f, 0x79, 0x6b, 0xca, 0x16, 0x7c,
++      0x6e, 0xf9, 0xad, 0x0f, 0xf6, 0xfe, 0x27, 0xdb,
++      0xc4, 0x13, 0x70, 0xf1, 0x62, 0x1a, 0x4f, 0x79,
++      0x40, 0xc9, 0x9b, 0x8b, 0x21, 0xea, 0x84, 0xfa,
++      0xf5, 0xf1, 0x89, 0xce, 0xb7, 0x55, 0x0a, 0x80,
++      0x39, 0x2f, 0x55, 0x36, 0x16, 0x9c, 0x7b, 0x08,
++      0xbd, 0x87, 0x0d, 0xa5, 0x32, 0xf1, 0x52, 0x7c,
++      0xe8, 0x55, 0x60, 0x5b, 0xd7, 0x69, 0xe4, 0xfc,
++      0xfa, 0x12, 0x85, 0x96, 0xea, 0x50, 0x28, 0xab,
++      0x8a, 0xf7, 0xbb, 0x0e, 0x53, 0x74, 0xca, 0xa6,
++      0x27, 0x09, 0xc2, 0xb5, 0xde, 0x18, 0x14, 0xd9,
++      0xea, 0xe5, 0x29, 0x1c, 0x40, 0x56, 0xcf, 0xd7,
++      0xae, 0x05, 0x3f, 0x65, 0xaf, 0x05, 0x73, 0xe2,
++      0x35, 0x96, 0x27, 0x07, 0x14, 0xc0, 0xad, 0x33,
++      0xf1, 0xdc, 0x44, 0x7a, 0x89, 0x17, 0x77, 0xd2,
++      0x9c, 0x58, 0x60, 0xf0, 0x3f, 0x7b, 0x2d, 0x2e,
++      0x57, 0x95, 0x54, 0x87, 0xed, 0xf2, 0xc7, 0x4c,
++      0xf0, 0xae, 0x56, 0x29, 0x19, 0x7d, 0x66, 0x4b,
++      0x9b, 0x83, 0x84, 0x42, 0x3b, 0x01, 0x25, 0x66,
++      0x8e, 0x02, 0xde, 0xb9, 0x83, 0x54, 0x19, 0xf6,
++      0x9f, 0x79, 0x0d, 0x67, 0xc5, 0x1d, 0x7a, 0x44,
++      0x02, 0x98, 0xa7, 0x16, 0x1c, 0x29, 0x0d, 0x74,
++      0xff, 0x85, 0x40, 0x06, 0xef, 0x2c, 0xa9, 0xc6,
++      0xf5, 0x53, 0x07, 0x06, 0xae, 0xe4, 0xfa, 0x5f,
++      0xd8, 0x39, 0x4d, 0xf1, 0x9b, 0x6b, 0xd9, 0x24,
++      0x84, 0xfe, 0x03, 0x4c, 0xb2, 0x3f, 0xdf, 0xa1,
++      0x05, 0x9e, 0x50, 0x14, 0x5a, 0xd9, 0x1a, 0xa2,
++      0xa7, 0xfa, 0xfa, 0x17, 0xf7, 0x78, 0xd6, 0xb5,
++      0x92, 0x61, 0x91, 0xac, 0x36, 0xfa, 0x56, 0x0d,
++      0x38, 0x32, 0x18, 0x85, 0x08, 0x58, 0x37, 0xf0,
++      0x4b, 0xdb, 0x59, 0xe7, 0xa4, 0x34, 0xc0, 0x1b,
++      0x01, 0xaf, 0x2d, 0xde, 0xa1, 0xaa, 0x5d, 0xd3,
++      0xec, 0xe1, 0xd4, 0xf7, 0xe6, 0x54, 0x68, 0xf0,
++      0x51, 0x97, 0xa7, 0x89, 0xea, 0x24, 0xad, 0xd3,
++      0x6e, 0x47, 0x93, 0x8b, 0x4b, 0xb4, 0xf7, 0x1c,
++      0x42, 0x06, 0x67, 0xe8, 0x99, 0xf6, 0xf5, 0x7b,
++      0x85, 0xb5, 0x65, 0xb5, 0xb5, 0xd2, 0x37, 0xf5,
++      0xf3, 0x02, 0xa6, 0x4d, 0x11, 0xa7, 0xdc, 0x51,
++      0x09, 0x7f, 0xa0, 0xd8, 0x88, 0x1c, 0x13, 0x71,
++      0xae, 0x9c, 0xb7, 0x7b, 0x34, 0xd6, 0x4e, 0x68,
++      0x26, 0x83, 0x51, 0xaf, 0x1d, 0xee, 0x8b, 0xbb,
++      0x69, 0x43, 0x2b, 0x9e, 0x8a, 0xbc, 0x02, 0x0e,
++      0xa0, 0x1b, 0xe0, 0xa8, 0x5f, 0x6f, 0xaf, 0x1b,
++      0x8f, 0xe7, 0x64, 0x71, 0x74, 0x11, 0x7e, 0xa8,
++      0xd8, 0xf9, 0x97, 0x06, 0xc3, 0xb6, 0xfb, 0xfb,
++      0xb7, 0x3d, 0x35, 0x9d, 0x3b, 0x52, 0xed, 0x54,
++      0xca, 0xf4, 0x81, 0x01, 0x2d, 0x1b, 0xc3, 0xa7,
++      0x00, 0x3d, 0x1a, 0x39, 0x54, 0xe1, 0xf6, 0xff,
++      0xed, 0x6f, 0x0b, 0x5a, 0x68, 0xda, 0x58, 0xdd,
++      0xa9, 0xcf, 0x5c, 0x4a, 0xe5, 0x09, 0x4e, 0xde,
++      0x9d, 0xbc, 0x3e, 0xee, 0x5a, 0x00, 0x3b, 0x2c,
++      0x87, 0x10, 0x65, 0x60, 0xdd, 0xd7, 0x56, 0xd1,
++      0x4c, 0x64, 0x45, 0xe4, 0x21, 0xec, 0x78, 0xf8,
++      0x25, 0x7a, 0x3e, 0x16, 0x5d, 0x09, 0x53, 0x14,
++      0xbe, 0x4f, 0xae, 0x87, 0xd8, 0xd1, 0xaa, 0x3c,
++      0xf6, 0x3e, 0xa4, 0x70, 0x8c, 0x5e, 0x70, 0xa4,
++      0xb3, 0x6b, 0x66, 0x73, 0xd3, 0xbf, 0x31, 0x06,
++      0x19, 0x62, 0x93, 0x15, 0xf2, 0x86, 0xe4, 0x52,
++      0x7e, 0x53, 0x4c, 0x12, 0x38, 0xcc, 0x34, 0x7d,
++      0x57, 0xf6, 0x42, 0x93, 0x8a, 0xc4, 0xee, 0x5c,
++      0x8a, 0xe1, 0x52, 0x8f, 0x56, 0x64, 0xf6, 0xa6,
++      0xd1, 0x91, 0x57, 0x70, 0xcd, 0x11, 0x76, 0xf5,
++      0x59, 0x60, 0x60, 0x3c, 0xc1, 0xc3, 0x0b, 0x7f,
++      0x58, 0x1a, 0x50, 0x91, 0xf1, 0x68, 0x8f, 0x6e,
++      0x74, 0x74, 0xa8, 0x51, 0x0b, 0xf7, 0x7a, 0x98,
++      0x37, 0xf2, 0x0a, 0x0e, 0xa4, 0x97, 0x04, 0xb8,
++      0x9b, 0xfd, 0xa0, 0xea, 0xf7, 0x0d, 0xe1, 0xdb,
++      0x03, 0xf0, 0x31, 0x29, 0xf8, 0xdd, 0x6b, 0x8b,
++      0x5d, 0xd8, 0x59, 0xa9, 0x29, 0xcf, 0x9a, 0x79,
++      0x89, 0x19, 0x63, 0x46, 0x09, 0x79, 0x6a, 0x11,
++      0xda, 0x63, 0x68, 0x48, 0x77, 0x23, 0xfb, 0x7d,
++      0x3a, 0x43, 0xcb, 0x02, 0x3b, 0x7a, 0x6d, 0x10,
++      0x2a, 0x9e, 0xac, 0xf1, 0xd4, 0x19, 0xf8, 0x23,
++      0x64, 0x1d, 0x2c, 0x5f, 0xf2, 0xb0, 0x5c, 0x23,
++      0x27, 0xf7, 0x27, 0x30, 0x16, 0x37, 0xb1, 0x90,
++      0xab, 0x38, 0xfb, 0x55, 0xcd, 0x78, 0x58, 0xd4,
++      0x7d, 0x43, 0xf6, 0x45, 0x5e, 0x55, 0x8d, 0xb1,
++      0x02, 0x65, 0x58, 0xb4, 0x13, 0x4b, 0x36, 0xf7,
++      0xcc, 0xfe, 0x3d, 0x0b, 0x82, 0xe2, 0x12, 0x11,
++      0xbb, 0xe6, 0xb8, 0x3a, 0x48, 0x71, 0xc7, 0x50,
++      0x06, 0x16, 0x3a, 0xe6, 0x7c, 0x05, 0xc7, 0xc8,
++      0x4d, 0x2f, 0x08, 0x6a, 0x17, 0x9a, 0x95, 0x97,
++      0x50, 0x68, 0xdc, 0x28, 0x18, 0xc4, 0x61, 0x38,
++      0xb9, 0xe0, 0x3e, 0x78, 0xdb, 0x29, 0xe0, 0x9f,
++      0x52, 0xdd, 0xf8, 0x4f, 0x91, 0xc1, 0xd0, 0x33,
++      0xa1, 0x7a, 0x8e, 0x30, 0x13, 0x82, 0x07, 0x9f,
++      0xd3, 0x31, 0x0f, 0x23, 0xbe, 0x32, 0x5a, 0x75,
++      0xcf, 0x96, 0xb2, 0xec, 0xb5, 0x32, 0xac, 0x21,
++      0xd1, 0x82, 0x33, 0xd3, 0x15, 0x74, 0xbd, 0x90,
++      0xf1, 0x2c, 0xe6, 0x5f, 0x8d, 0xe3, 0x02, 0xe8,
++      0xe9, 0xc4, 0xca, 0x96, 0xeb, 0x0e, 0xbc, 0x91,
++      0xf4, 0xb9, 0xea, 0xd9, 0x1b, 0x75, 0xbd, 0xe1,
++      0xac, 0x2a, 0x05, 0x37, 0x52, 0x9b, 0x1b, 0x3f,
++      0x5a, 0xdc, 0x21, 0xc3, 0x98, 0xbb, 0xaf, 0xa3,
++      0xf2, 0x00, 0xbf, 0x0d, 0x30, 0x89, 0x05, 0xcc,
++      0xa5, 0x76, 0xf5, 0x06, 0xf0, 0xc6, 0x54, 0x8a,
++      0x5d, 0xd4, 0x1e, 0xc1, 0xf2, 0xce, 0xb0, 0x62,
++      0xc8, 0xfc, 0x59, 0x42, 0x9a, 0x90, 0x60, 0x55,
++      0xfe, 0x88, 0xa5, 0x8b, 0xb8, 0x33, 0x0c, 0x23,
++      0x24, 0x0d, 0x15, 0x70, 0x37, 0x1e, 0x3d, 0xf6,
++      0xd2, 0xea, 0x92, 0x10, 0xb2, 0xc4, 0x51, 0xac,
++      0xf2, 0xac, 0xf3, 0x6b, 0x6c, 0xaa, 0xcf, 0x12,
++      0xc5, 0x6c, 0x90, 0x50, 0xb5, 0x0c, 0xfc, 0x1a,
++      0x15, 0x52, 0xe9, 0x26, 0xc6, 0x52, 0xa4, 0xe7,
++      0x81, 0x69, 0xe1, 0xe7, 0x9e, 0x30, 0x01, 0xec,
++      0x84, 0x89, 0xb2, 0x0d, 0x66, 0xdd, 0xce, 0x28,
++      0x5c, 0xec, 0x98, 0x46, 0x68, 0x21, 0x9f, 0x88,
++      0x3f, 0x1f, 0x42, 0x77, 0xce, 0xd0, 0x61, 0xd4,
++      0x20, 0xa7, 0xff, 0x53, 0xad, 0x37, 0xd0, 0x17,
++      0x35, 0xc9, 0xfc, 0xba, 0x0a, 0x78, 0x3f, 0xf2,
++      0xcc, 0x86, 0x89, 0xe8, 0x4b, 0x3c, 0x48, 0x33,
++      0x09, 0x7f, 0xc6, 0xc0, 0xdd, 0xb8, 0xfd, 0x7a,
++      0x66, 0x66, 0x65, 0xeb, 0x47, 0xa7, 0x04, 0x28,
++      0xa3, 0x19, 0x8e, 0xa9, 0xb1, 0x13, 0x67, 0x62,
++      0x70, 0xcf, 0xd7
++};
++static const u8 dec_output013[] __initconst = {
++      0x74, 0xa6, 0x3e, 0xe4, 0xb1, 0xcb, 0xaf, 0xb0,
++      0x40, 0xe5, 0x0f, 0x9e, 0xf1, 0xf2, 0x89, 0xb5,
++      0x42, 0x34, 0x8a, 0xa1, 0x03, 0xb7, 0xe9, 0x57,
++      0x46, 0xbe, 0x20, 0xe4, 0x6e, 0xb0, 0xeb, 0xff,
++      0xea, 0x07, 0x7e, 0xef, 0xe2, 0x55, 0x9f, 0xe5,
++      0x78, 0x3a, 0xb7, 0x83, 0xc2, 0x18, 0x40, 0x7b,
++      0xeb, 0xcd, 0x81, 0xfb, 0x90, 0x12, 0x9e, 0x46,
++      0xa9, 0xd6, 0x4a, 0xba, 0xb0, 0x62, 0xdb, 0x6b,
++      0x99, 0xc4, 0xdb, 0x54, 0x4b, 0xb8, 0xa5, 0x71,
++      0xcb, 0xcd, 0x63, 0x32, 0x55, 0xfb, 0x31, 0xf0,
++      0x38, 0xf5, 0xbe, 0x78, 0xe4, 0x45, 0xce, 0x1b,
++      0x6a, 0x5b, 0x0e, 0xf4, 0x16, 0xe4, 0xb1, 0x3d,
++      0xf6, 0x63, 0x7b, 0xa7, 0x0c, 0xde, 0x6f, 0x8f,
++      0x74, 0xdf, 0xe0, 0x1e, 0x9d, 0xce, 0x8f, 0x24,
++      0xef, 0x23, 0x35, 0x33, 0x7b, 0x83, 0x34, 0x23,
++      0x58, 0x74, 0x14, 0x77, 0x1f, 0xc2, 0x4f, 0x4e,
++      0xc6, 0x89, 0xf9, 0x52, 0x09, 0x37, 0x64, 0x14,
++      0xc4, 0x01, 0x6b, 0x9d, 0x77, 0xe8, 0x90, 0x5d,
++      0xa8, 0x4a, 0x2a, 0xef, 0x5c, 0x7f, 0xeb, 0xbb,
++      0xb2, 0xc6, 0x93, 0x99, 0x66, 0xdc, 0x7f, 0xd4,
++      0x9e, 0x2a, 0xca, 0x8d, 0xdb, 0xe7, 0x20, 0xcf,
++      0xe4, 0x73, 0xae, 0x49, 0x7d, 0x64, 0x0f, 0x0e,
++      0x28, 0x46, 0xa9, 0xa8, 0x32, 0xe4, 0x0e, 0xf6,
++      0x51, 0x53, 0xb8, 0x3c, 0xb1, 0xff, 0xa3, 0x33,
++      0x41, 0x75, 0xff, 0xf1, 0x6f, 0xf1, 0xfb, 0xbb,
++      0x83, 0x7f, 0x06, 0x9b, 0xe7, 0x1b, 0x0a, 0xe0,
++      0x5c, 0x33, 0x60, 0x5b, 0xdb, 0x5b, 0xed, 0xfe,
++      0xa5, 0x16, 0x19, 0x72, 0xa3, 0x64, 0x23, 0x00,
++      0x02, 0xc7, 0xf3, 0x6a, 0x81, 0x3e, 0x44, 0x1d,
++      0x79, 0x15, 0x5f, 0x9a, 0xde, 0xe2, 0xfd, 0x1b,
++      0x73, 0xc1, 0xbc, 0x23, 0xba, 0x31, 0xd2, 0x50,
++      0xd5, 0xad, 0x7f, 0x74, 0xa7, 0xc9, 0xf8, 0x3e,
++      0x2b, 0x26, 0x10, 0xf6, 0x03, 0x36, 0x74, 0xe4,
++      0x0e, 0x6a, 0x72, 0xb7, 0x73, 0x0a, 0x42, 0x28,
++      0xc2, 0xad, 0x5e, 0x03, 0xbe, 0xb8, 0x0b, 0xa8,
++      0x5b, 0xd4, 0xb8, 0xba, 0x52, 0x89, 0xb1, 0x9b,
++      0xc1, 0xc3, 0x65, 0x87, 0xed, 0xa5, 0xf4, 0x86,
++      0xfd, 0x41, 0x80, 0x91, 0x27, 0x59, 0x53, 0x67,
++      0x15, 0x78, 0x54, 0x8b, 0x2d, 0x3d, 0xc7, 0xff,
++      0x02, 0x92, 0x07, 0x5f, 0x7a, 0x4b, 0x60, 0x59,
++      0x3c, 0x6f, 0x5c, 0xd8, 0xec, 0x95, 0xd2, 0xfe,
++      0xa0, 0x3b, 0xd8, 0x3f, 0xd1, 0x69, 0xa6, 0xd6,
++      0x41, 0xb2, 0xf4, 0x4d, 0x12, 0xf4, 0x58, 0x3e,
++      0x66, 0x64, 0x80, 0x31, 0x9b, 0xa8, 0x4c, 0x8b,
++      0x07, 0xb2, 0xec, 0x66, 0x94, 0x66, 0x47, 0x50,
++      0x50, 0x5f, 0x18, 0x0b, 0x0e, 0xd6, 0xc0, 0x39,
++      0x21, 0x13, 0x9e, 0x33, 0xbc, 0x79, 0x36, 0x02,
++      0x96, 0x70, 0xf0, 0x48, 0x67, 0x2f, 0x26, 0xe9,
++      0x6d, 0x10, 0xbb, 0xd6, 0x3f, 0xd1, 0x64, 0x7a,
++      0x2e, 0xbe, 0x0c, 0x61, 0xf0, 0x75, 0x42, 0x38,
++      0x23, 0xb1, 0x9e, 0x9f, 0x7c, 0x67, 0x66, 0xd9,
++      0x58, 0x9a, 0xf1, 0xbb, 0x41, 0x2a, 0x8d, 0x65,
++      0x84, 0x94, 0xfc, 0xdc, 0x6a, 0x50, 0x64, 0xdb,
++      0x56, 0x33, 0x76, 0x00, 0x10, 0xed, 0xbe, 0xd2,
++      0x12, 0xf6, 0xf6, 0x1b, 0xa2, 0x16, 0xde, 0xae,
++      0x31, 0x95, 0xdd, 0xb1, 0x08, 0x7e, 0x4e, 0xee,
++      0xe7, 0xf9, 0xa5, 0xfb, 0x5b, 0x61, 0x43, 0x00,
++      0x40, 0xf6, 0x7e, 0x02, 0x04, 0x32, 0x4e, 0x0c,
++      0xe2, 0x66, 0x0d, 0xd7, 0x07, 0x98, 0x0e, 0xf8,
++      0x72, 0x34, 0x6d, 0x95, 0x86, 0xd7, 0xcb, 0x31,
++      0x54, 0x47, 0xd0, 0x38, 0x29, 0x9c, 0x5a, 0x68,
++      0xd4, 0x87, 0x76, 0xc9, 0xe7, 0x7e, 0xe3, 0xf4,
++      0x81, 0x6d, 0x18, 0xcb, 0xc9, 0x05, 0xaf, 0xa0,
++      0xfb, 0x66, 0xf7, 0xf1, 0x1c, 0xc6, 0x14, 0x11,
++      0x4f, 0x2b, 0x79, 0x42, 0x8b, 0xbc, 0xac, 0xe7,
++      0x6c, 0xfe, 0x0f, 0x58, 0xe7, 0x7c, 0x78, 0x39,
++      0x30, 0xb0, 0x66, 0x2c, 0x9b, 0x6d, 0x3a, 0xe1,
++      0xcf, 0xc9, 0xa4, 0x0e, 0x6d, 0x6d, 0x8a, 0xa1,
++      0x3a, 0xe7, 0x28, 0xd4, 0x78, 0x4c, 0xa6, 0xa2,
++      0x2a, 0xa6, 0x03, 0x30, 0xd7, 0xa8, 0x25, 0x66,
++      0x87, 0x2f, 0x69, 0x5c, 0x4e, 0xdd, 0xa5, 0x49,
++      0x5d, 0x37, 0x4a, 0x59, 0xc4, 0xaf, 0x1f, 0xa2,
++      0xe4, 0xf8, 0xa6, 0x12, 0x97, 0xd5, 0x79, 0xf5,
++      0xe2, 0x4a, 0x2b, 0x5f, 0x61, 0xe4, 0x9e, 0xe3,
++      0xee, 0xb8, 0xa7, 0x5b, 0x2f, 0xf4, 0x9e, 0x6c,
++      0xfb, 0xd1, 0xc6, 0x56, 0x77, 0xba, 0x75, 0xaa,
++      0x3d, 0x1a, 0xa8, 0x0b, 0xb3, 0x68, 0x24, 0x00,
++      0x10, 0x7f, 0xfd, 0xd7, 0xa1, 0x8d, 0x83, 0x54,
++      0x4f, 0x1f, 0xd8, 0x2a, 0xbe, 0x8a, 0x0c, 0x87,
++      0xab, 0xa2, 0xde, 0xc3, 0x39, 0xbf, 0x09, 0x03,
++      0xa5, 0xf3, 0x05, 0x28, 0xe1, 0xe1, 0xee, 0x39,
++      0x70, 0x9c, 0xd8, 0x81, 0x12, 0x1e, 0x02, 0x40,
++      0xd2, 0x6e, 0xf0, 0xeb, 0x1b, 0x3d, 0x22, 0xc6,
++      0xe5, 0xe3, 0xb4, 0x5a, 0x98, 0xbb, 0xf0, 0x22,
++      0x28, 0x8d, 0xe5, 0xd3, 0x16, 0x48, 0x24, 0xa5,
++      0xe6, 0x66, 0x0c, 0xf9, 0x08, 0xf9, 0x7e, 0x1e,
++      0xe1, 0x28, 0x26, 0x22, 0xc7, 0xc7, 0x0a, 0x32,
++      0x47, 0xfa, 0xa3, 0xbe, 0x3c, 0xc4, 0xc5, 0x53,
++      0x0a, 0xd5, 0x94, 0x4a, 0xd7, 0x93, 0xd8, 0x42,
++      0x99, 0xb9, 0x0a, 0xdb, 0x56, 0xf7, 0xb9, 0x1c,
++      0x53, 0x4f, 0xfa, 0xd3, 0x74, 0xad, 0xd9, 0x68,
++      0xf1, 0x1b, 0xdf, 0x61, 0xc6, 0x5e, 0xa8, 0x48,
++      0xfc, 0xd4, 0x4a, 0x4c, 0x3c, 0x32, 0xf7, 0x1c,
++      0x96, 0x21, 0x9b, 0xf9, 0xa3, 0xcc, 0x5a, 0xce,
++      0xd5, 0xd7, 0x08, 0x24, 0xf6, 0x1c, 0xfd, 0xdd,
++      0x38, 0xc2, 0x32, 0xe9, 0xb8, 0xe7, 0xb6, 0xfa,
++      0x9d, 0x45, 0x13, 0x2c, 0x83, 0xfd, 0x4a, 0x69,
++      0x82, 0xcd, 0xdc, 0xb3, 0x76, 0x0c, 0x9e, 0xd8,
++      0xf4, 0x1b, 0x45, 0x15, 0xb4, 0x97, 0xe7, 0x58,
++      0x34, 0xe2, 0x03, 0x29, 0x5a, 0xbf, 0xb6, 0xe0,
++      0x5d, 0x13, 0xd9, 0x2b, 0xb4, 0x80, 0xb2, 0x45,
++      0x81, 0x6a, 0x2e, 0x6c, 0x89, 0x7d, 0xee, 0xbb,
++      0x52, 0xdd, 0x1f, 0x18, 0xe7, 0x13, 0x6b, 0x33,
++      0x0e, 0xea, 0x36, 0x92, 0x77, 0x7b, 0x6d, 0x9c,
++      0x5a, 0x5f, 0x45, 0x7b, 0x7b, 0x35, 0x62, 0x23,
++      0xd1, 0xbf, 0x0f, 0xd0, 0x08, 0x1b, 0x2b, 0x80,
++      0x6b, 0x7e, 0xf1, 0x21, 0x47, 0xb0, 0x57, 0xd1,
++      0x98, 0x72, 0x90, 0x34, 0x1c, 0x20, 0x04, 0xff,
++      0x3d, 0x5c, 0xee, 0x0e, 0x57, 0x5f, 0x6f, 0x24,
++      0x4e, 0x3c, 0xea, 0xfc, 0xa5, 0xa9, 0x83, 0xc9,
++      0x61, 0xb4, 0x51, 0x24, 0xf8, 0x27, 0x5e, 0x46,
++      0x8c, 0xb1, 0x53, 0x02, 0x96, 0x35, 0xba, 0xb8,
++      0x4c, 0x71, 0xd3, 0x15, 0x59, 0x35, 0x22, 0x20,
++      0xad, 0x03, 0x9f, 0x66, 0x44, 0x3b, 0x9c, 0x35,
++      0x37, 0x1f, 0x9b, 0xbb, 0xf3, 0xdb, 0x35, 0x63,
++      0x30, 0x64, 0xaa, 0xa2, 0x06, 0xa8, 0x5d, 0xbb,
++      0xe1, 0x9f, 0x70, 0xec, 0x82, 0x11, 0x06, 0x36,
++      0xec, 0x8b, 0x69, 0x66, 0x24, 0x44, 0xc9, 0x4a,
++      0x57, 0xbb, 0x9b, 0x78, 0x13, 0xce, 0x9c, 0x0c,
++      0xba, 0x92, 0x93, 0x63, 0xb8, 0xe2, 0x95, 0x0f,
++      0x0f, 0x16, 0x39, 0x52, 0xfd, 0x3a, 0x6d, 0x02,
++      0x4b, 0xdf, 0x13, 0xd3, 0x2a, 0x22, 0xb4, 0x03,
++      0x7c, 0x54, 0x49, 0x96, 0x68, 0x54, 0x10, 0xfa,
++      0xef, 0xaa, 0x6c, 0xe8, 0x22, 0xdc, 0x71, 0x16,
++      0x13, 0x1a, 0xf6, 0x28, 0xe5, 0x6d, 0x77, 0x3d,
++      0xcd, 0x30, 0x63, 0xb1, 0x70, 0x52, 0xa1, 0xc5,
++      0x94, 0x5f, 0xcf, 0xe8, 0xb8, 0x26, 0x98, 0xf7,
++      0x06, 0xa0, 0x0a, 0x70, 0xfa, 0x03, 0x80, 0xac,
++      0xc1, 0xec, 0xd6, 0x4c, 0x54, 0xd7, 0xfe, 0x47,
++      0xb6, 0x88, 0x4a, 0xf7, 0x71, 0x24, 0xee, 0xf3,
++      0xd2, 0xc2, 0x4a, 0x7f, 0xfe, 0x61, 0xc7, 0x35,
++      0xc9, 0x37, 0x67, 0xcb, 0x24, 0x35, 0xda, 0x7e,
++      0xca, 0x5f, 0xf3, 0x8d, 0xd4, 0x13, 0x8e, 0xd6,
++      0xcb, 0x4d, 0x53, 0x8f, 0x53, 0x1f, 0xc0, 0x74,
++      0xf7, 0x53, 0xb9, 0x5e, 0x23, 0x37, 0xba, 0x6e,
++      0xe3, 0x9d, 0x07, 0x55, 0x25, 0x7b, 0xe6, 0x2a,
++      0x64, 0xd1, 0x32, 0xdd, 0x54, 0x1b, 0x4b, 0xc0,
++      0xe1, 0xd7, 0x69, 0x58, 0xf8, 0x93, 0x29, 0xc4,
++      0xdd, 0x23, 0x2f, 0xa5, 0xfc, 0x9d, 0x7e, 0xf8,
++      0xd4, 0x90, 0xcd, 0x82, 0x55, 0xdc, 0x16, 0x16,
++      0x9f, 0x07, 0x52, 0x9b, 0x9d, 0x25, 0xed, 0x32,
++      0xc5, 0x7b, 0xdf, 0xf6, 0x83, 0x46, 0x3d, 0x65,
++      0xb7, 0xef, 0x87, 0x7a, 0x12, 0x69, 0x8f, 0x06,
++      0x7c, 0x51, 0x15, 0x4a, 0x08, 0xe8, 0xac, 0x9a,
++      0x0c, 0x24, 0xa7, 0x27, 0xd8, 0x46, 0x2f, 0xe7,
++      0x01, 0x0e, 0x1c, 0xc6, 0x91, 0xb0, 0x6e, 0x85,
++      0x65, 0xf0, 0x29, 0x0d, 0x2e, 0x6b, 0x3b, 0xfb,
++      0x4b, 0xdf, 0xe4, 0x80, 0x93, 0x03, 0x66, 0x46,
++      0x3e, 0x8a, 0x6e, 0xf3, 0x5e, 0x4d, 0x62, 0x0e,
++      0x49, 0x05, 0xaf, 0xd4, 0xf8, 0x21, 0x20, 0x61,
++      0x1d, 0x39, 0x17, 0xf4, 0x61, 0x47, 0x95, 0xfb,
++      0x15, 0x2e, 0xb3, 0x4f, 0xd0, 0x5d, 0xf5, 0x7d,
++      0x40, 0xda, 0x90, 0x3c, 0x6b, 0xcb, 0x17, 0x00,
++      0x13, 0x3b, 0x64, 0x34, 0x1b, 0xf0, 0xf2, 0xe5,
++      0x3b, 0xb2, 0xc7, 0xd3, 0x5f, 0x3a, 0x44, 0xa6,
++      0x9b, 0xb7, 0x78, 0x0e, 0x42, 0x5d, 0x4c, 0xc1,
++      0xe9, 0xd2, 0xcb, 0xb7, 0x78, 0xd1, 0xfe, 0x9a,
++      0xb5, 0x07, 0xe9, 0xe0, 0xbe, 0xe2, 0x8a, 0xa7,
++      0x01, 0x83, 0x00, 0x8c, 0x5c, 0x08, 0xe6, 0x63,
++      0x12, 0x92, 0xb7, 0xb7, 0xa6, 0x19, 0x7d, 0x38,
++      0x13, 0x38, 0x92, 0x87, 0x24, 0xf9, 0x48, 0xb3,
++      0x5e, 0x87, 0x6a, 0x40, 0x39, 0x5c, 0x3f, 0xed,
++      0x8f, 0xee, 0xdb, 0x15, 0x82, 0x06, 0xda, 0x49,
++      0x21, 0x2b, 0xb5, 0xbf, 0x32, 0x7c, 0x9f, 0x42,
++      0x28, 0x63, 0xcf, 0xaf, 0x1e, 0xf8, 0xc6, 0xa0,
++      0xd1, 0x02, 0x43, 0x57, 0x62, 0xec, 0x9b, 0x0f,
++      0x01, 0x9e, 0x71, 0xd8, 0x87, 0x9d, 0x01, 0xc1,
++      0x58, 0x77, 0xd9, 0xaf, 0xb1, 0x10, 0x7e, 0xdd,
++      0xa6, 0x50, 0x96, 0xe5, 0xf0, 0x72, 0x00, 0x6d,
++      0x4b, 0xf8, 0x2a, 0x8f, 0x19, 0xf3, 0x22, 0x88,
++      0x11, 0x4a, 0x8b, 0x7c, 0xfd, 0xb7, 0xed, 0xe1,
++      0xf6, 0x40, 0x39, 0xe0, 0xe9, 0xf6, 0x3d, 0x25,
++      0xe6, 0x74, 0x3c, 0x58, 0x57, 0x7f, 0xe1, 0x22,
++      0x96, 0x47, 0x31, 0x91, 0xba, 0x70, 0x85, 0x28,
++      0x6b, 0x9f, 0x6e, 0x25, 0xac, 0x23, 0x66, 0x2f,
++      0x29, 0x88, 0x28, 0xce, 0x8c, 0x5c, 0x88, 0x53,
++      0xd1, 0x3b, 0xcc, 0x6a, 0x51, 0xb2, 0xe1, 0x28,
++      0x3f, 0x91, 0xb4, 0x0d, 0x00, 0x3a, 0xe3, 0xf8,
++      0xc3, 0x8f, 0xd7, 0x96, 0x62, 0x0e, 0x2e, 0xfc,
++      0xc8, 0x6c, 0x77, 0xa6, 0x1d, 0x22, 0xc1, 0xb8,
++      0xe6, 0x61, 0xd7, 0x67, 0x36, 0x13, 0x7b, 0xbb,
++      0x9b, 0x59, 0x09, 0xa6, 0xdf, 0xf7, 0x6b, 0xa3,
++      0x40, 0x1a, 0xf5, 0x4f, 0xb4, 0xda, 0xd3, 0xf3,
++      0x81, 0x93, 0xc6, 0x18, 0xd9, 0x26, 0xee, 0xac,
++      0xf0, 0xaa, 0xdf, 0xc5, 0x9c, 0xca, 0xc2, 0xa2,
++      0xcc, 0x7b, 0x5c, 0x24, 0xb0, 0xbc, 0xd0, 0x6a,
++      0x4d, 0x89, 0x09, 0xb8, 0x07, 0xfe, 0x87, 0xad,
++      0x0a, 0xea, 0xb8, 0x42, 0xf9, 0x5e, 0xb3, 0x3e,
++      0x36, 0x4c, 0xaf, 0x75, 0x9e, 0x1c, 0xeb, 0xbd,
++      0xbc, 0xbb, 0x80, 0x40, 0xa7, 0x3a, 0x30, 0xbf,
++      0xa8, 0x44, 0xf4, 0xeb, 0x38, 0xad, 0x29, 0xba,
++      0x23, 0xed, 0x41, 0x0c, 0xea, 0xd2, 0xbb, 0x41,
++      0x18, 0xd6, 0xb9, 0xba, 0x65, 0x2b, 0xa3, 0x91,
++      0x6d, 0x1f, 0xa9, 0xf4, 0xd1, 0x25, 0x8d, 0x4d,
++      0x38, 0xff, 0x64, 0xa0, 0xec, 0xde, 0xa6, 0xb6,
++      0x79, 0xab, 0x8e, 0x33, 0x6c, 0x47, 0xde, 0xaf,
++      0x94, 0xa4, 0xa5, 0x86, 0x77, 0x55, 0x09, 0x92,
++      0x81, 0x31, 0x76, 0xc7, 0x34, 0x22, 0x89, 0x8e,
++      0x3d, 0x26, 0x26, 0xd7, 0xfc, 0x1e, 0x16, 0x72,
++      0x13, 0x33, 0x63, 0xd5, 0x22, 0xbe, 0xb8, 0x04,
++      0x34, 0x84, 0x41, 0xbb, 0x80, 0xd0, 0x9f, 0x46,
++      0x48, 0x07, 0xa7, 0xfc, 0x2b, 0x3a, 0x75, 0x55,
++      0x8c, 0xc7, 0x6a, 0xbd, 0x7e, 0x46, 0x08, 0x84,
++      0x0f, 0xd5, 0x74, 0xc0, 0x82, 0x8e, 0xaa, 0x61,
++      0x05, 0x01, 0xb2, 0x47, 0x6e, 0x20, 0x6a, 0x2d,
++      0x58, 0x70, 0x48, 0x32, 0xa7, 0x37, 0xd2, 0xb8,
++      0x82, 0x1a, 0x51, 0xb9, 0x61, 0xdd, 0xfd, 0x9d,
++      0x6b, 0x0e, 0x18, 0x97, 0xf8, 0x45, 0x5f, 0x87,
++      0x10, 0xcf, 0x34, 0x72, 0x45, 0x26, 0x49, 0x70,
++      0xe7, 0xa3, 0x78, 0xe0, 0x52, 0x89, 0x84, 0x94,
++      0x83, 0x82, 0xc2, 0x69, 0x8f, 0xe3, 0xe1, 0x3f,
++      0x60, 0x74, 0x88, 0xc4, 0xf7, 0x75, 0x2c, 0xfb,
++      0xbd, 0xb6, 0xc4, 0x7e, 0x10, 0x0a, 0x6c, 0x90,
++      0x04, 0x9e, 0xc3, 0x3f, 0x59, 0x7c, 0xce, 0x31,
++      0x18, 0x60, 0x57, 0x73, 0x46, 0x94, 0x7d, 0x06,
++      0xa0, 0x6d, 0x44, 0xec, 0xa2, 0x0a, 0x9e, 0x05,
++      0x15, 0xef, 0xca, 0x5c, 0xbf, 0x00, 0xeb, 0xf7,
++      0x3d, 0x32, 0xd4, 0xa5, 0xef, 0x49, 0x89, 0x5e,
++      0x46, 0xb0, 0xa6, 0x63, 0x5b, 0x8a, 0x73, 0xae,
++      0x6f, 0xd5, 0x9d, 0xf8, 0x4f, 0x40, 0xb5, 0xb2,
++      0x6e, 0xd3, 0xb6, 0x01, 0xa9, 0x26, 0xa2, 0x21,
++      0xcf, 0x33, 0x7a, 0x3a, 0xa4, 0x23, 0x13, 0xb0,
++      0x69, 0x6a, 0xee, 0xce, 0xd8, 0x9d, 0x01, 0x1d,
++      0x50, 0xc1, 0x30, 0x6c, 0xb1, 0xcd, 0xa0, 0xf0,
++      0xf0, 0xa2, 0x64, 0x6f, 0xbb, 0xbf, 0x5e, 0xe6,
++      0xab, 0x87, 0xb4, 0x0f, 0x4f, 0x15, 0xaf, 0xb5,
++      0x25, 0xa1, 0xb2, 0xd0, 0x80, 0x2c, 0xfb, 0xf9,
++      0xfe, 0xd2, 0x33, 0xbb, 0x76, 0xfe, 0x7c, 0xa8,
++      0x66, 0xf7, 0xe7, 0x85, 0x9f, 0x1f, 0x85, 0x57,
++      0x88, 0xe1, 0xe9, 0x63, 0xe4, 0xd8, 0x1c, 0xa1,
++      0xfb, 0xda, 0x44, 0x05, 0x2e, 0x1d, 0x3a, 0x1c,
++      0xff, 0xc8, 0x3b, 0xc0, 0xfe, 0xda, 0x22, 0x0b,
++      0x43, 0xd6, 0x88, 0x39, 0x4c, 0x4a, 0xa6, 0x69,
++      0x18, 0x93, 0x42, 0x4e, 0xb5, 0xcc, 0x66, 0x0d,
++      0x09, 0xf8, 0x1e, 0x7c, 0xd3, 0x3c, 0x99, 0x0d,
++      0x50, 0x1d, 0x62, 0xe9, 0x57, 0x06, 0xbf, 0x19,
++      0x88, 0xdd, 0xad, 0x7b, 0x4f, 0xf9, 0xc7, 0x82,
++      0x6d, 0x8d, 0xc8, 0xc4, 0xc5, 0x78, 0x17, 0x20,
++      0x15, 0xc5, 0x52, 0x41, 0xcf, 0x5b, 0xd6, 0x7f,
++      0x94, 0x02, 0x41, 0xe0, 0x40, 0x22, 0x03, 0x5e,
++      0xd1, 0x53, 0xd4, 0x86, 0xd3, 0x2c, 0x9f, 0x0f,
++      0x96, 0xe3, 0x6b, 0x9a, 0x76, 0x32, 0x06, 0x47,
++      0x4b, 0x11, 0xb3, 0xdd, 0x03, 0x65, 0xbd, 0x9b,
++      0x01, 0xda, 0x9c, 0xb9, 0x7e, 0x3f, 0x6a, 0xc4,
++      0x7b, 0xea, 0xd4, 0x3c, 0xb9, 0xfb, 0x5c, 0x6b,
++      0x64, 0x33, 0x52, 0xba, 0x64, 0x78, 0x8f, 0xa4,
++      0xaf, 0x7a, 0x61, 0x8d, 0xbc, 0xc5, 0x73, 0xe9,
++      0x6b, 0x58, 0x97, 0x4b, 0xbf, 0x63, 0x22, 0xd3,
++      0x37, 0x02, 0x54, 0xc5, 0xb9, 0x16, 0x4a, 0xf0,
++      0x19, 0xd8, 0x94, 0x57, 0xb8, 0x8a, 0xb3, 0x16,
++      0x3b, 0xd0, 0x84, 0x8e, 0x67, 0xa6, 0xa3, 0x7d,
++      0x78, 0xec, 0x00
++};
++static const u8 dec_assoc013[] __initconst = {
++      0xb1, 0x69, 0x83, 0x87, 0x30, 0xaa, 0x5d, 0xb8,
++      0x77, 0xe8, 0x21, 0xff, 0x06, 0x59, 0x35, 0xce,
++      0x75, 0xfe, 0x38, 0xef, 0xb8, 0x91, 0x43, 0x8c,
++      0xcf, 0x70, 0xdd, 0x0a, 0x68, 0xbf, 0xd4, 0xbc,
++      0x16, 0x76, 0x99, 0x36, 0x1e, 0x58, 0x79, 0x5e,
++      0xd4, 0x29, 0xf7, 0x33, 0x93, 0x48, 0xdb, 0x5f,
++      0x01, 0xae, 0x9c, 0xb6, 0xe4, 0x88, 0x6d, 0x2b,
++      0x76, 0x75, 0xe0, 0xf3, 0x74, 0xe2, 0xc9
++};
++static const u8 dec_nonce013[] __initconst = {
++      0x05, 0xa3, 0x93, 0xed, 0x30, 0xc5, 0xa2, 0x06
++};
++static const u8 dec_key013[] __initconst = {
++      0xb3, 0x35, 0x50, 0x03, 0x54, 0x2e, 0x40, 0x5e,
++      0x8f, 0x59, 0x8e, 0xc5, 0x90, 0xd5, 0x27, 0x2d,
++      0xba, 0x29, 0x2e, 0xcb, 0x1b, 0x70, 0x44, 0x1e,
++      0x65, 0x91, 0x6e, 0x2a, 0x79, 0x22, 0xda, 0x64
++};
++
++static const struct chacha20poly1305_testvec
++chacha20poly1305_dec_vectors[] __initconst = {
++      { dec_input001, dec_output001, dec_assoc001, dec_nonce001, dec_key001,
++        sizeof(dec_input001), sizeof(dec_assoc001), sizeof(dec_nonce001) },
++      { dec_input002, dec_output002, dec_assoc002, dec_nonce002, dec_key002,
++        sizeof(dec_input002), sizeof(dec_assoc002), sizeof(dec_nonce002) },
++      { dec_input003, dec_output003, dec_assoc003, dec_nonce003, dec_key003,
++        sizeof(dec_input003), sizeof(dec_assoc003), sizeof(dec_nonce003) },
++      { dec_input004, dec_output004, dec_assoc004, dec_nonce004, dec_key004,
++        sizeof(dec_input004), sizeof(dec_assoc004), sizeof(dec_nonce004) },
++      { dec_input005, dec_output005, dec_assoc005, dec_nonce005, dec_key005,
++        sizeof(dec_input005), sizeof(dec_assoc005), sizeof(dec_nonce005) },
++      { dec_input006, dec_output006, dec_assoc006, dec_nonce006, dec_key006,
++        sizeof(dec_input006), sizeof(dec_assoc006), sizeof(dec_nonce006) },
++      { dec_input007, dec_output007, dec_assoc007, dec_nonce007, dec_key007,
++        sizeof(dec_input007), sizeof(dec_assoc007), sizeof(dec_nonce007) },
++      { dec_input008, dec_output008, dec_assoc008, dec_nonce008, dec_key008,
++        sizeof(dec_input008), sizeof(dec_assoc008), sizeof(dec_nonce008) },
++      { dec_input009, dec_output009, dec_assoc009, dec_nonce009, dec_key009,
++        sizeof(dec_input009), sizeof(dec_assoc009), sizeof(dec_nonce009) },
++      { dec_input010, dec_output010, dec_assoc010, dec_nonce010, dec_key010,
++        sizeof(dec_input010), sizeof(dec_assoc010), sizeof(dec_nonce010) },
++      { dec_input011, dec_output011, dec_assoc011, dec_nonce011, dec_key011,
++        sizeof(dec_input011), sizeof(dec_assoc011), sizeof(dec_nonce011) },
++      { dec_input012, dec_output012, dec_assoc012, dec_nonce012, dec_key012,
++        sizeof(dec_input012), sizeof(dec_assoc012), sizeof(dec_nonce012) },
++      { dec_input013, dec_output013, dec_assoc013, dec_nonce013, dec_key013,
++        sizeof(dec_input013), sizeof(dec_assoc013), sizeof(dec_nonce013),
++        true }
++};
++
++static const u8 xenc_input001[] __initconst = {
++      0x49, 0x6e, 0x74, 0x65, 0x72, 0x6e, 0x65, 0x74,
++      0x2d, 0x44, 0x72, 0x61, 0x66, 0x74, 0x73, 0x20,
++      0x61, 0x72, 0x65, 0x20, 0x64, 0x72, 0x61, 0x66,
++      0x74, 0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65,
++      0x6e, 0x74, 0x73, 0x20, 0x76, 0x61, 0x6c, 0x69,
++      0x64, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x61, 0x20,
++      0x6d, 0x61, 0x78, 0x69, 0x6d, 0x75, 0x6d, 0x20,
++      0x6f, 0x66, 0x20, 0x73, 0x69, 0x78, 0x20, 0x6d,
++      0x6f, 0x6e, 0x74, 0x68, 0x73, 0x20, 0x61, 0x6e,
++      0x64, 0x20, 0x6d, 0x61, 0x79, 0x20, 0x62, 0x65,
++      0x20, 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, 0x64,
++      0x2c, 0x20, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63,
++      0x65, 0x64, 0x2c, 0x20, 0x6f, 0x72, 0x20, 0x6f,
++      0x62, 0x73, 0x6f, 0x6c, 0x65, 0x74, 0x65, 0x64,
++      0x20, 0x62, 0x79, 0x20, 0x6f, 0x74, 0x68, 0x65,
++      0x72, 0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65,
++      0x6e, 0x74, 0x73, 0x20, 0x61, 0x74, 0x20, 0x61,
++      0x6e, 0x79, 0x20, 0x74, 0x69, 0x6d, 0x65, 0x2e,
++      0x20, 0x49, 0x74, 0x20, 0x69, 0x73, 0x20, 0x69,
++      0x6e, 0x61, 0x70, 0x70, 0x72, 0x6f, 0x70, 0x72,
++      0x69, 0x61, 0x74, 0x65, 0x20, 0x74, 0x6f, 0x20,
++      0x75, 0x73, 0x65, 0x20, 0x49, 0x6e, 0x74, 0x65,
++      0x72, 0x6e, 0x65, 0x74, 0x2d, 0x44, 0x72, 0x61,
++      0x66, 0x74, 0x73, 0x20, 0x61, 0x73, 0x20, 0x72,
++      0x65, 0x66, 0x65, 0x72, 0x65, 0x6e, 0x63, 0x65,
++      0x20, 0x6d, 0x61, 0x74, 0x65, 0x72, 0x69, 0x61,
++      0x6c, 0x20, 0x6f, 0x72, 0x20, 0x74, 0x6f, 0x20,
++      0x63, 0x69, 0x74, 0x65, 0x20, 0x74, 0x68, 0x65,
++      0x6d, 0x20, 0x6f, 0x74, 0x68, 0x65, 0x72, 0x20,
++      0x74, 0x68, 0x61, 0x6e, 0x20, 0x61, 0x73, 0x20,
++      0x2f, 0xe2, 0x80, 0x9c, 0x77, 0x6f, 0x72, 0x6b,
++      0x20, 0x69, 0x6e, 0x20, 0x70, 0x72, 0x6f, 0x67,
++      0x72, 0x65, 0x73, 0x73, 0x2e, 0x2f, 0xe2, 0x80,
++      0x9d
++};
++static const u8 xenc_output001[] __initconst = {
++      0x1a, 0x6e, 0x3a, 0xd9, 0xfd, 0x41, 0x3f, 0x77,
++      0x54, 0x72, 0x0a, 0x70, 0x9a, 0xa0, 0x29, 0x92,
++      0x2e, 0xed, 0x93, 0xcf, 0x0f, 0x71, 0x88, 0x18,
++      0x7a, 0x9d, 0x2d, 0x24, 0xe0, 0xf5, 0xea, 0x3d,
++      0x55, 0x64, 0xd7, 0xad, 0x2a, 0x1a, 0x1f, 0x7e,
++      0x86, 0x6d, 0xb0, 0xce, 0x80, 0x41, 0x72, 0x86,
++      0x26, 0xee, 0x84, 0xd7, 0xef, 0x82, 0x9e, 0xe2,
++      0x60, 0x9d, 0x5a, 0xfc, 0xf0, 0xe4, 0x19, 0x85,
++      0xea, 0x09, 0xc6, 0xfb, 0xb3, 0xa9, 0x50, 0x09,
++      0xec, 0x5e, 0x11, 0x90, 0xa1, 0xc5, 0x4e, 0x49,
++      0xef, 0x50, 0xd8, 0x8f, 0xe0, 0x78, 0xd7, 0xfd,
++      0xb9, 0x3b, 0xc9, 0xf2, 0x91, 0xc8, 0x25, 0xc8,
++      0xa7, 0x63, 0x60, 0xce, 0x10, 0xcd, 0xc6, 0x7f,
++      0xf8, 0x16, 0xf8, 0xe1, 0x0a, 0xd9, 0xde, 0x79,
++      0x50, 0x33, 0xf2, 0x16, 0x0f, 0x17, 0xba, 0xb8,
++      0x5d, 0xd8, 0xdf, 0x4e, 0x51, 0xa8, 0x39, 0xd0,
++      0x85, 0xca, 0x46, 0x6a, 0x10, 0xa7, 0xa3, 0x88,
++      0xef, 0x79, 0xb9, 0xf8, 0x24, 0xf3, 0xe0, 0x71,
++      0x7b, 0x76, 0x28, 0x46, 0x3a, 0x3a, 0x1b, 0x91,
++      0xb6, 0xd4, 0x3e, 0x23, 0xe5, 0x44, 0x15, 0xbf,
++      0x60, 0x43, 0x9d, 0xa4, 0xbb, 0xd5, 0x5f, 0x89,
++      0xeb, 0xef, 0x8e, 0xfd, 0xdd, 0xb4, 0x0d, 0x46,
++      0xf0, 0x69, 0x23, 0x63, 0xae, 0x94, 0xf5, 0x5e,
++      0xa5, 0xad, 0x13, 0x1c, 0x41, 0x76, 0xe6, 0x90,
++      0xd6, 0x6d, 0xa2, 0x8f, 0x97, 0x4c, 0xa8, 0x0b,
++      0xcf, 0x8d, 0x43, 0x2b, 0x9c, 0x9b, 0xc5, 0x58,
++      0xa5, 0xb6, 0x95, 0x9a, 0xbf, 0x81, 0xc6, 0x54,
++      0xc9, 0x66, 0x0c, 0xe5, 0x4f, 0x6a, 0x53, 0xa1,
++      0xe5, 0x0c, 0xba, 0x31, 0xde, 0x34, 0x64, 0x73,
++      0x8a, 0x3b, 0xbd, 0x92, 0x01, 0xdb, 0x71, 0x69,
++      0xf3, 0x58, 0x99, 0xbc, 0xd1, 0xcb, 0x4a, 0x05,
++      0xe2, 0x58, 0x9c, 0x25, 0x17, 0xcd, 0xdc, 0x83,
++      0xb7, 0xff, 0xfb, 0x09, 0x61, 0xad, 0xbf, 0x13,
++      0x5b, 0x5e, 0xed, 0x46, 0x82, 0x6f, 0x22, 0xd8,
++      0x93, 0xa6, 0x85, 0x5b, 0x40, 0x39, 0x5c, 0xc5,
++      0x9c
++};
++static const u8 xenc_assoc001[] __initconst = {
++      0xf3, 0x33, 0x88, 0x86, 0x00, 0x00, 0x00, 0x00,
++      0x00, 0x00, 0x4e, 0x91
++};
++static const u8 xenc_nonce001[] __initconst = {
++      0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
++      0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
++      0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
++};
++static const u8 xenc_key001[] __initconst = {
++      0x1c, 0x92, 0x40, 0xa5, 0xeb, 0x55, 0xd3, 0x8a,
++      0xf3, 0x33, 0x88, 0x86, 0x04, 0xf6, 0xb5, 0xf0,
++      0x47, 0x39, 0x17, 0xc1, 0x40, 0x2b, 0x80, 0x09,
++      0x9d, 0xca, 0x5c, 0xbc, 0x20, 0x70, 0x75, 0xc0
++};
++
++static const struct chacha20poly1305_testvec
++xchacha20poly1305_enc_vectors[] __initconst = {
++      { xenc_input001, xenc_output001, xenc_assoc001, xenc_nonce001, xenc_key001,
++        sizeof(xenc_input001), sizeof(xenc_assoc001), sizeof(xenc_nonce001) }
++};
++
++static const u8 xdec_input001[] __initconst = {
++      0x1a, 0x6e, 0x3a, 0xd9, 0xfd, 0x41, 0x3f, 0x77,
++      0x54, 0x72, 0x0a, 0x70, 0x9a, 0xa0, 0x29, 0x92,
++      0x2e, 0xed, 0x93, 0xcf, 0x0f, 0x71, 0x88, 0x18,
++      0x7a, 0x9d, 0x2d, 0x24, 0xe0, 0xf5, 0xea, 0x3d,
++      0x55, 0x64, 0xd7, 0xad, 0x2a, 0x1a, 0x1f, 0x7e,
++      0x86, 0x6d, 0xb0, 0xce, 0x80, 0x41, 0x72, 0x86,
++      0x26, 0xee, 0x84, 0xd7, 0xef, 0x82, 0x9e, 0xe2,
++      0x60, 0x9d, 0x5a, 0xfc, 0xf0, 0xe4, 0x19, 0x85,
++      0xea, 0x09, 0xc6, 0xfb, 0xb3, 0xa9, 0x50, 0x09,
++      0xec, 0x5e, 0x11, 0x90, 0xa1, 0xc5, 0x4e, 0x49,
++      0xef, 0x50, 0xd8, 0x8f, 0xe0, 0x78, 0xd7, 0xfd,
++      0xb9, 0x3b, 0xc9, 0xf2, 0x91, 0xc8, 0x25, 0xc8,
++      0xa7, 0x63, 0x60, 0xce, 0x10, 0xcd, 0xc6, 0x7f,
++      0xf8, 0x16, 0xf8, 0xe1, 0x0a, 0xd9, 0xde, 0x79,
++      0x50, 0x33, 0xf2, 0x16, 0x0f, 0x17, 0xba, 0xb8,
++      0x5d, 0xd8, 0xdf, 0x4e, 0x51, 0xa8, 0x39, 0xd0,
++      0x85, 0xca, 0x46, 0x6a, 0x10, 0xa7, 0xa3, 0x88,
++      0xef, 0x79, 0xb9, 0xf8, 0x24, 0xf3, 0xe0, 0x71,
++      0x7b, 0x76, 0x28, 0x46, 0x3a, 0x3a, 0x1b, 0x91,
++      0xb6, 0xd4, 0x3e, 0x23, 0xe5, 0x44, 0x15, 0xbf,
++      0x60, 0x43, 0x9d, 0xa4, 0xbb, 0xd5, 0x5f, 0x89,
++      0xeb, 0xef, 0x8e, 0xfd, 0xdd, 0xb4, 0x0d, 0x46,
++      0xf0, 0x69, 0x23, 0x63, 0xae, 0x94, 0xf5, 0x5e,
++      0xa5, 0xad, 0x13, 0x1c, 0x41, 0x76, 0xe6, 0x90,
++      0xd6, 0x6d, 0xa2, 0x8f, 0x97, 0x4c, 0xa8, 0x0b,
++      0xcf, 0x8d, 0x43, 0x2b, 0x9c, 0x9b, 0xc5, 0x58,
++      0xa5, 0xb6, 0x95, 0x9a, 0xbf, 0x81, 0xc6, 0x54,
++      0xc9, 0x66, 0x0c, 0xe5, 0x4f, 0x6a, 0x53, 0xa1,
++      0xe5, 0x0c, 0xba, 0x31, 0xde, 0x34, 0x64, 0x73,
++      0x8a, 0x3b, 0xbd, 0x92, 0x01, 0xdb, 0x71, 0x69,
++      0xf3, 0x58, 0x99, 0xbc, 0xd1, 0xcb, 0x4a, 0x05,
++      0xe2, 0x58, 0x9c, 0x25, 0x17, 0xcd, 0xdc, 0x83,
++      0xb7, 0xff, 0xfb, 0x09, 0x61, 0xad, 0xbf, 0x13,
++      0x5b, 0x5e, 0xed, 0x46, 0x82, 0x6f, 0x22, 0xd8,
++      0x93, 0xa6, 0x85, 0x5b, 0x40, 0x39, 0x5c, 0xc5,
++      0x9c
++};
++static const u8 xdec_output001[] __initconst = {
++      0x49, 0x6e, 0x74, 0x65, 0x72, 0x6e, 0x65, 0x74,
++      0x2d, 0x44, 0x72, 0x61, 0x66, 0x74, 0x73, 0x20,
++      0x61, 0x72, 0x65, 0x20, 0x64, 0x72, 0x61, 0x66,
++      0x74, 0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65,
++      0x6e, 0x74, 0x73, 0x20, 0x76, 0x61, 0x6c, 0x69,
++      0x64, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x61, 0x20,
++      0x6d, 0x61, 0x78, 0x69, 0x6d, 0x75, 0x6d, 0x20,
++      0x6f, 0x66, 0x20, 0x73, 0x69, 0x78, 0x20, 0x6d,
++      0x6f, 0x6e, 0x74, 0x68, 0x73, 0x20, 0x61, 0x6e,
++      0x64, 0x20, 0x6d, 0x61, 0x79, 0x20, 0x62, 0x65,
++      0x20, 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, 0x64,
++      0x2c, 0x20, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63,
++      0x65, 0x64, 0x2c, 0x20, 0x6f, 0x72, 0x20, 0x6f,
++      0x62, 0x73, 0x6f, 0x6c, 0x65, 0x74, 0x65, 0x64,
++      0x20, 0x62, 0x79, 0x20, 0x6f, 0x74, 0x68, 0x65,
++      0x72, 0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65,
++      0x6e, 0x74, 0x73, 0x20, 0x61, 0x74, 0x20, 0x61,
++      0x6e, 0x79, 0x20, 0x74, 0x69, 0x6d, 0x65, 0x2e,
++      0x20, 0x49, 0x74, 0x20, 0x69, 0x73, 0x20, 0x69,
++      0x6e, 0x61, 0x70, 0x70, 0x72, 0x6f, 0x70, 0x72,
++      0x69, 0x61, 0x74, 0x65, 0x20, 0x74, 0x6f, 0x20,
++      0x75, 0x73, 0x65, 0x20, 0x49, 0x6e, 0x74, 0x65,
++      0x72, 0x6e, 0x65, 0x74, 0x2d, 0x44, 0x72, 0x61,
++      0x66, 0x74, 0x73, 0x20, 0x61, 0x73, 0x20, 0x72,
++      0x65, 0x66, 0x65, 0x72, 0x65, 0x6e, 0x63, 0x65,
++      0x20, 0x6d, 0x61, 0x74, 0x65, 0x72, 0x69, 0x61,
++      0x6c, 0x20, 0x6f, 0x72, 0x20, 0x74, 0x6f, 0x20,
++      0x63, 0x69, 0x74, 0x65, 0x20, 0x74, 0x68, 0x65,
++      0x6d, 0x20, 0x6f, 0x74, 0x68, 0x65, 0x72, 0x20,
++      0x74, 0x68, 0x61, 0x6e, 0x20, 0x61, 0x73, 0x20,
++      0x2f, 0xe2, 0x80, 0x9c, 0x77, 0x6f, 0x72, 0x6b,
++      0x20, 0x69, 0x6e, 0x20, 0x70, 0x72, 0x6f, 0x67,
++      0x72, 0x65, 0x73, 0x73, 0x2e, 0x2f, 0xe2, 0x80,
++      0x9d
++};
++static const u8 xdec_assoc001[] __initconst = {
++      0xf3, 0x33, 0x88, 0x86, 0x00, 0x00, 0x00, 0x00,
++      0x00, 0x00, 0x4e, 0x91
++};
++static const u8 xdec_nonce001[] __initconst = {
++      0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
++      0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
++      0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
++};
++static const u8 xdec_key001[] __initconst = {
++      0x1c, 0x92, 0x40, 0xa5, 0xeb, 0x55, 0xd3, 0x8a,
++      0xf3, 0x33, 0x88, 0x86, 0x04, 0xf6, 0xb5, 0xf0,
++      0x47, 0x39, 0x17, 0xc1, 0x40, 0x2b, 0x80, 0x09,
++      0x9d, 0xca, 0x5c, 0xbc, 0x20, 0x70, 0x75, 0xc0
++};
++
++static const struct chacha20poly1305_testvec
++xchacha20poly1305_dec_vectors[] __initconst = {
++      { xdec_input001, xdec_output001, xdec_assoc001, xdec_nonce001, xdec_key001,
++        sizeof(xdec_input001), sizeof(xdec_assoc001), sizeof(xdec_nonce001) }
++};
++
++static void __init
++chacha20poly1305_selftest_encrypt(u8 *dst, const u8 *src, const size_t src_len,
++                                const u8 *ad, const size_t ad_len,
++                                const u8 *nonce, const size_t nonce_len,
++                                const u8 key[CHACHA20POLY1305_KEY_SIZE])
++{
++      if (nonce_len == 8)
++              chacha20poly1305_encrypt(dst, src, src_len, ad, ad_len,
++                                       get_unaligned_le64(nonce), key);
++      else
++              BUG();
++}
++
++static bool __init
++decryption_success(bool func_ret, bool expect_failure, int memcmp_result)
++{
++      if (expect_failure)
++              return !func_ret;
++      return func_ret && !memcmp_result;
++}
++
++bool __init chacha20poly1305_selftest(void)
++{
++      enum { MAXIMUM_TEST_BUFFER_LEN = 1UL << 12 };
++      size_t i;
++      u8 *computed_output = NULL, *heap_src = NULL;
++      bool success = true, ret;
++
++      heap_src = kmalloc(MAXIMUM_TEST_BUFFER_LEN, GFP_KERNEL);
++      computed_output = kmalloc(MAXIMUM_TEST_BUFFER_LEN, GFP_KERNEL);
++      if (!heap_src || !computed_output) {
++              pr_err("chacha20poly1305 self-test malloc: FAIL\n");
++              success = false;
++              goto out;
++      }
++
++      for (i = 0; i < ARRAY_SIZE(chacha20poly1305_enc_vectors); ++i) {
++              memset(computed_output, 0, MAXIMUM_TEST_BUFFER_LEN);
++              chacha20poly1305_selftest_encrypt(computed_output,
++                                      chacha20poly1305_enc_vectors[i].input,
++                                      chacha20poly1305_enc_vectors[i].ilen,
++                                      chacha20poly1305_enc_vectors[i].assoc,
++                                      chacha20poly1305_enc_vectors[i].alen,
++                                      chacha20poly1305_enc_vectors[i].nonce,
++                                      chacha20poly1305_enc_vectors[i].nlen,
++                                      chacha20poly1305_enc_vectors[i].key);
++              if (memcmp(computed_output,
++                         chacha20poly1305_enc_vectors[i].output,
++                         chacha20poly1305_enc_vectors[i].ilen +
++                                                      POLY1305_DIGEST_SIZE)) {
++                      pr_err("chacha20poly1305 encryption self-test %zu: FAIL\n",
++                             i + 1);
++                      success = false;
++              }
++      }
++
++      for (i = 0; i < ARRAY_SIZE(chacha20poly1305_dec_vectors); ++i) {
++              memset(computed_output, 0, MAXIMUM_TEST_BUFFER_LEN);
++              ret = chacha20poly1305_decrypt(computed_output,
++                      chacha20poly1305_dec_vectors[i].input,
++                      chacha20poly1305_dec_vectors[i].ilen,
++                      chacha20poly1305_dec_vectors[i].assoc,
++                      chacha20poly1305_dec_vectors[i].alen,
++                      get_unaligned_le64(chacha20poly1305_dec_vectors[i].nonce),
++                      chacha20poly1305_dec_vectors[i].key);
++              if (!decryption_success(ret,
++                              chacha20poly1305_dec_vectors[i].failure,
++                              memcmp(computed_output,
++                                     chacha20poly1305_dec_vectors[i].output,
++                                     chacha20poly1305_dec_vectors[i].ilen -
++                                                      POLY1305_DIGEST_SIZE))) {
++                      pr_err("chacha20poly1305 decryption self-test %zu: FAIL\n",
++                             i + 1);
++                      success = false;
++              }
++      }
++
++
++      for (i = 0; i < ARRAY_SIZE(xchacha20poly1305_enc_vectors); ++i) {
++              memset(computed_output, 0, MAXIMUM_TEST_BUFFER_LEN);
++              xchacha20poly1305_encrypt(computed_output,
++                                      xchacha20poly1305_enc_vectors[i].input,
++                                      xchacha20poly1305_enc_vectors[i].ilen,
++                                      xchacha20poly1305_enc_vectors[i].assoc,
++                                      xchacha20poly1305_enc_vectors[i].alen,
++                                      xchacha20poly1305_enc_vectors[i].nonce,
++                                      xchacha20poly1305_enc_vectors[i].key);
++              if (memcmp(computed_output,
++                         xchacha20poly1305_enc_vectors[i].output,
++                         xchacha20poly1305_enc_vectors[i].ilen +
++                                                      POLY1305_DIGEST_SIZE)) {
++                      pr_err("xchacha20poly1305 encryption self-test %zu: FAIL\n",
++                             i + 1);
++                      success = false;
++              }
++      }
++      for (i = 0; i < ARRAY_SIZE(xchacha20poly1305_dec_vectors); ++i) {
++              memset(computed_output, 0, MAXIMUM_TEST_BUFFER_LEN);
++              ret = xchacha20poly1305_decrypt(computed_output,
++                                      xchacha20poly1305_dec_vectors[i].input,
++                                      xchacha20poly1305_dec_vectors[i].ilen,
++                                      xchacha20poly1305_dec_vectors[i].assoc,
++                                      xchacha20poly1305_dec_vectors[i].alen,
++                                      xchacha20poly1305_dec_vectors[i].nonce,
++                                      xchacha20poly1305_dec_vectors[i].key);
++              if (!decryption_success(ret,
++                              xchacha20poly1305_dec_vectors[i].failure,
++                              memcmp(computed_output,
++                                     xchacha20poly1305_dec_vectors[i].output,
++                                     xchacha20poly1305_dec_vectors[i].ilen -
++                                                      POLY1305_DIGEST_SIZE))) {
++                      pr_err("xchacha20poly1305 decryption self-test %zu: FAIL\n",
++                             i + 1);
++                      success = false;
++              }
++      }
++
++out:
++      kfree(heap_src);
++      kfree(computed_output);
++      return success;
++}
+--- /dev/null
++++ b/lib/crypto/chacha20poly1305.c
+@@ -0,0 +1,219 @@
++// SPDX-License-Identifier: GPL-2.0 OR MIT
++/*
++ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
++ *
++ * This is an implementation of the ChaCha20Poly1305 AEAD construction.
++ *
++ * Information: https://tools.ietf.org/html/rfc8439
++ */
++
++#include <crypto/algapi.h>
++#include <crypto/chacha20poly1305.h>
++#include <crypto/chacha.h>
++#include <crypto/poly1305.h>
++
++#include <asm/unaligned.h>
++#include <linux/kernel.h>
++#include <linux/init.h>
++#include <linux/mm.h>
++#include <linux/module.h>
++
++#define CHACHA_KEY_WORDS      (CHACHA_KEY_SIZE / sizeof(u32))
++
++bool __init chacha20poly1305_selftest(void);
++
++static void chacha_load_key(u32 *k, const u8 *in)
++{
++      k[0] = get_unaligned_le32(in);
++      k[1] = get_unaligned_le32(in + 4);
++      k[2] = get_unaligned_le32(in + 8);
++      k[3] = get_unaligned_le32(in + 12);
++      k[4] = get_unaligned_le32(in + 16);
++      k[5] = get_unaligned_le32(in + 20);
++      k[6] = get_unaligned_le32(in + 24);
++      k[7] = get_unaligned_le32(in + 28);
++}
++
++static void xchacha_init(u32 *chacha_state, const u8 *key, const u8 *nonce)
++{
++      u32 k[CHACHA_KEY_WORDS];
++      u8 iv[CHACHA_IV_SIZE];
++
++      memset(iv, 0, 8);
++      memcpy(iv + 8, nonce + 16, 8);
++
++      chacha_load_key(k, key);
++
++      /* Compute the subkey given the original key and first 128 nonce bits */
++      chacha_init(chacha_state, k, nonce);
++      hchacha_block(chacha_state, k, 20);
++
++      chacha_init(chacha_state, k, iv);
++
++      memzero_explicit(k, sizeof(k));
++      memzero_explicit(iv, sizeof(iv));
++}
++
++static void
++__chacha20poly1305_encrypt(u8 *dst, const u8 *src, const size_t src_len,
++                         const u8 *ad, const size_t ad_len, u32 *chacha_state)
++{
++      const u8 *pad0 = page_address(ZERO_PAGE(0));
++      struct poly1305_desc_ctx poly1305_state;
++      union {
++              u8 block0[POLY1305_KEY_SIZE];
++              __le64 lens[2];
++      } b;
++
++      chacha_crypt(chacha_state, b.block0, pad0, sizeof(b.block0), 20);
++      poly1305_init(&poly1305_state, b.block0);
++
++      poly1305_update(&poly1305_state, ad, ad_len);
++      if (ad_len & 0xf)
++              poly1305_update(&poly1305_state, pad0, 0x10 - (ad_len & 0xf));
++
++      chacha_crypt(chacha_state, dst, src, src_len, 20);
++
++      poly1305_update(&poly1305_state, dst, src_len);
++      if (src_len & 0xf)
++              poly1305_update(&poly1305_state, pad0, 0x10 - (src_len & 0xf));
++
++      b.lens[0] = cpu_to_le64(ad_len);
++      b.lens[1] = cpu_to_le64(src_len);
++      poly1305_update(&poly1305_state, (u8 *)b.lens, sizeof(b.lens));
++
++      poly1305_final(&poly1305_state, dst + src_len);
++
++      memzero_explicit(chacha_state, CHACHA_STATE_WORDS * sizeof(u32));
++      memzero_explicit(&b, sizeof(b));
++}
++
++void chacha20poly1305_encrypt(u8 *dst, const u8 *src, const size_t src_len,
++                            const u8 *ad, const size_t ad_len,
++                            const u64 nonce,
++                            const u8 key[CHACHA20POLY1305_KEY_SIZE])
++{
++      u32 chacha_state[CHACHA_STATE_WORDS];
++      u32 k[CHACHA_KEY_WORDS];
++      __le64 iv[2];
++
++      chacha_load_key(k, key);
++
++      iv[0] = 0;
++      iv[1] = cpu_to_le64(nonce);
++
++      chacha_init(chacha_state, k, (u8 *)iv);
++      __chacha20poly1305_encrypt(dst, src, src_len, ad, ad_len, chacha_state);
++
++      memzero_explicit(iv, sizeof(iv));
++      memzero_explicit(k, sizeof(k));
++}
++EXPORT_SYMBOL(chacha20poly1305_encrypt);
++
++void xchacha20poly1305_encrypt(u8 *dst, const u8 *src, const size_t src_len,
++                             const u8 *ad, const size_t ad_len,
++                             const u8 nonce[XCHACHA20POLY1305_NONCE_SIZE],
++                             const u8 key[CHACHA20POLY1305_KEY_SIZE])
++{
++      u32 chacha_state[CHACHA_STATE_WORDS];
++
++      xchacha_init(chacha_state, key, nonce);
++      __chacha20poly1305_encrypt(dst, src, src_len, ad, ad_len, chacha_state);
++}
++EXPORT_SYMBOL(xchacha20poly1305_encrypt);
++
++static bool
++__chacha20poly1305_decrypt(u8 *dst, const u8 *src, const size_t src_len,
++                         const u8 *ad, const size_t ad_len, u32 *chacha_state)
++{
++      const u8 *pad0 = page_address(ZERO_PAGE(0));
++      struct poly1305_desc_ctx poly1305_state;
++      size_t dst_len;
++      int ret;
++      union {
++              u8 block0[POLY1305_KEY_SIZE];
++              u8 mac[POLY1305_DIGEST_SIZE];
++              __le64 lens[2];
++      } b;
++
++      if (unlikely(src_len < POLY1305_DIGEST_SIZE))
++              return false;
++
++      chacha_crypt(chacha_state, b.block0, pad0, sizeof(b.block0), 20);
++      poly1305_init(&poly1305_state, b.block0);
++
++      poly1305_update(&poly1305_state, ad, ad_len);
++      if (ad_len & 0xf)
++              poly1305_update(&poly1305_state, pad0, 0x10 - (ad_len & 0xf));
++
++      dst_len = src_len - POLY1305_DIGEST_SIZE;
++      poly1305_update(&poly1305_state, src, dst_len);
++      if (dst_len & 0xf)
++              poly1305_update(&poly1305_state, pad0, 0x10 - (dst_len & 0xf));
++
++      b.lens[0] = cpu_to_le64(ad_len);
++      b.lens[1] = cpu_to_le64(dst_len);
++      poly1305_update(&poly1305_state, (u8 *)b.lens, sizeof(b.lens));
++
++      poly1305_final(&poly1305_state, b.mac);
++
++      ret = crypto_memneq(b.mac, src + dst_len, POLY1305_DIGEST_SIZE);
++      if (likely(!ret))
++              chacha_crypt(chacha_state, dst, src, dst_len, 20);
++
++      memzero_explicit(&b, sizeof(b));
++
++      return !ret;
++}
++
++bool chacha20poly1305_decrypt(u8 *dst, const u8 *src, const size_t src_len,
++                            const u8 *ad, const size_t ad_len,
++                            const u64 nonce,
++                            const u8 key[CHACHA20POLY1305_KEY_SIZE])
++{
++      u32 chacha_state[CHACHA_STATE_WORDS];
++      u32 k[CHACHA_KEY_WORDS];
++      __le64 iv[2];
++      bool ret;
++
++      chacha_load_key(k, key);
++
++      iv[0] = 0;
++      iv[1] = cpu_to_le64(nonce);
++
++      chacha_init(chacha_state, k, (u8 *)iv);
++      ret = __chacha20poly1305_decrypt(dst, src, src_len, ad, ad_len,
++                                       chacha_state);
++
++      memzero_explicit(chacha_state, sizeof(chacha_state));
++      memzero_explicit(iv, sizeof(iv));
++      memzero_explicit(k, sizeof(k));
++      return ret;
++}
++EXPORT_SYMBOL(chacha20poly1305_decrypt);
++
++bool xchacha20poly1305_decrypt(u8 *dst, const u8 *src, const size_t src_len,
++                             const u8 *ad, const size_t ad_len,
++                             const u8 nonce[XCHACHA20POLY1305_NONCE_SIZE],
++                             const u8 key[CHACHA20POLY1305_KEY_SIZE])
++{
++      u32 chacha_state[CHACHA_STATE_WORDS];
++
++      xchacha_init(chacha_state, key, nonce);
++      return __chacha20poly1305_decrypt(dst, src, src_len, ad, ad_len,
++                                        chacha_state);
++}
++EXPORT_SYMBOL(xchacha20poly1305_decrypt);
++
++static int __init mod_init(void)
++{
++      if (!IS_ENABLED(CONFIG_CRYPTO_MANAGER_DISABLE_TESTS) &&
++          WARN_ON(!chacha20poly1305_selftest()))
++              return -ENODEV;
++      return 0;
++}
++
++module_init(mod_init);
++MODULE_LICENSE("GPL v2");
++MODULE_DESCRIPTION("ChaCha20Poly1305 AEAD construction");
++MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>");
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0033-crypto-lib-chacha20poly1305-reimplement-crypt_from_s.patch b/target/linux/generic/backport-5.4/080-wireguard-0033-crypto-lib-chacha20poly1305-reimplement-crypt_from_s.patch

new file mode 100644 (file)

index 0000000..a7811eb
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0033-crypto-lib-chacha20poly1305-reimplement-crypt_from_s.patch
@@ -0,0 +1,295 @@
+From b7af0c213ba3afe27da21845419756aec63b43b4 Mon Sep 17 00:00:00 2001
+From: Ard Biesheuvel <ardb@kernel.org>
+Date: Fri, 8 Nov 2019 13:22:40 +0100
+Subject: [PATCH 033/124] crypto: lib/chacha20poly1305 - reimplement
+ crypt_from_sg() routine
+
+commit d95312a3ccc0cd544d374be2fc45aeaa803e5fd9 upstream.
+
+Reimplement the library routines to perform chacha20poly1305 en/decryption
+on scatterlists, without [ab]using the [deprecated] blkcipher interface,
+which is rather heavyweight and does things we don't really need.
+
+Instead, we use the sg_miter API in a novel and clever way, to iterate
+over the scatterlist in-place (i.e., source == destination, which is the
+only way this library is expected to be used). That way, we don't have to
+iterate over two scatterlists in parallel.
+
+Another optimization is that, instead of relying on the blkcipher walker
+to present the input in suitable chunks, we recognize that ChaCha is a
+stream cipher, and so we can simply deal with partial blocks by keeping a
+block of keystream on the stack and use crypto_xor() to mix it with
+the in/output.
+
+Finally, we omit the scatterwalk_map_and_copy() call if the last element of
+the scatterlist covers the MAC as well (which is the common case),
+avoiding the need to walk the scatterlist and kmap() the page twice.
+
+Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
+Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ include/crypto/chacha20poly1305.h      |  11 ++
+ lib/crypto/chacha20poly1305-selftest.c |  45 ++++++++
+ lib/crypto/chacha20poly1305.c          | 150 +++++++++++++++++++++++++
+ 3 files changed, 206 insertions(+)
+
+--- a/include/crypto/chacha20poly1305.h
++++ b/include/crypto/chacha20poly1305.h
+@@ -7,6 +7,7 @@
+ #define __CHACHA20POLY1305_H
+ 
+ #include <linux/types.h>
++#include <linux/scatterlist.h>
+ 
+ enum chacha20poly1305_lengths {
+       XCHACHA20POLY1305_NONCE_SIZE = 24,
+@@ -34,4 +35,14 @@ bool __must_check xchacha20poly1305_decr
+       const size_t ad_len, const u8 nonce[XCHACHA20POLY1305_NONCE_SIZE],
+       const u8 key[CHACHA20POLY1305_KEY_SIZE]);
+ 
++bool chacha20poly1305_encrypt_sg_inplace(struct scatterlist *src, size_t src_len,
++                                       const u8 *ad, const size_t ad_len,
++                                       const u64 nonce,
++                                       const u8 key[CHACHA20POLY1305_KEY_SIZE]);
++
++bool chacha20poly1305_decrypt_sg_inplace(struct scatterlist *src, size_t src_len,
++                                       const u8 *ad, const size_t ad_len,
++                                       const u64 nonce,
++                                       const u8 key[CHACHA20POLY1305_KEY_SIZE]);
++
+ #endif /* __CHACHA20POLY1305_H */
+--- a/lib/crypto/chacha20poly1305-selftest.c
++++ b/lib/crypto/chacha20poly1305-selftest.c
+@@ -7250,6 +7250,7 @@ bool __init chacha20poly1305_selftest(vo
+       enum { MAXIMUM_TEST_BUFFER_LEN = 1UL << 12 };
+       size_t i;
+       u8 *computed_output = NULL, *heap_src = NULL;
++      struct scatterlist sg_src;
+       bool success = true, ret;
+ 
+       heap_src = kmalloc(MAXIMUM_TEST_BUFFER_LEN, GFP_KERNEL);
+@@ -7280,6 +7281,29 @@ bool __init chacha20poly1305_selftest(vo
+               }
+       }
+ 
++      for (i = 0; i < ARRAY_SIZE(chacha20poly1305_enc_vectors); ++i) {
++              if (chacha20poly1305_enc_vectors[i].nlen != 8)
++                      continue;
++              memcpy(heap_src, chacha20poly1305_enc_vectors[i].input,
++                     chacha20poly1305_enc_vectors[i].ilen);
++              sg_init_one(&sg_src, heap_src,
++                          chacha20poly1305_enc_vectors[i].ilen + POLY1305_DIGEST_SIZE);
++              chacha20poly1305_encrypt_sg_inplace(&sg_src,
++                      chacha20poly1305_enc_vectors[i].ilen,
++                      chacha20poly1305_enc_vectors[i].assoc,
++                      chacha20poly1305_enc_vectors[i].alen,
++                      get_unaligned_le64(chacha20poly1305_enc_vectors[i].nonce),
++                      chacha20poly1305_enc_vectors[i].key);
++              if (memcmp(heap_src,
++                                 chacha20poly1305_enc_vectors[i].output,
++                                 chacha20poly1305_enc_vectors[i].ilen +
++                                                      POLY1305_DIGEST_SIZE)) {
++                      pr_err("chacha20poly1305 sg encryption self-test %zu: FAIL\n",
++                             i + 1);
++                      success = false;
++              }
++      }
++
+       for (i = 0; i < ARRAY_SIZE(chacha20poly1305_dec_vectors); ++i) {
+               memset(computed_output, 0, MAXIMUM_TEST_BUFFER_LEN);
+               ret = chacha20poly1305_decrypt(computed_output,
+@@ -7301,6 +7325,27 @@ bool __init chacha20poly1305_selftest(vo
+               }
+       }
+ 
++      for (i = 0; i < ARRAY_SIZE(chacha20poly1305_dec_vectors); ++i) {
++              memcpy(heap_src, chacha20poly1305_dec_vectors[i].input,
++                     chacha20poly1305_dec_vectors[i].ilen);
++              sg_init_one(&sg_src, heap_src,
++                          chacha20poly1305_dec_vectors[i].ilen);
++              ret = chacha20poly1305_decrypt_sg_inplace(&sg_src,
++                      chacha20poly1305_dec_vectors[i].ilen,
++                      chacha20poly1305_dec_vectors[i].assoc,
++                      chacha20poly1305_dec_vectors[i].alen,
++                      get_unaligned_le64(chacha20poly1305_dec_vectors[i].nonce),
++                      chacha20poly1305_dec_vectors[i].key);
++              if (!decryption_success(ret,
++                      chacha20poly1305_dec_vectors[i].failure,
++                      memcmp(heap_src, chacha20poly1305_dec_vectors[i].output,
++                             chacha20poly1305_dec_vectors[i].ilen -
++                                                      POLY1305_DIGEST_SIZE))) {
++                      pr_err("chacha20poly1305 sg decryption self-test %zu: FAIL\n",
++                             i + 1);
++                      success = false;
++              }
++      }
+ 
+       for (i = 0; i < ARRAY_SIZE(xchacha20poly1305_enc_vectors); ++i) {
+               memset(computed_output, 0, MAXIMUM_TEST_BUFFER_LEN);
+--- a/lib/crypto/chacha20poly1305.c
++++ b/lib/crypto/chacha20poly1305.c
+@@ -11,6 +11,7 @@
+ #include <crypto/chacha20poly1305.h>
+ #include <crypto/chacha.h>
+ #include <crypto/poly1305.h>
++#include <crypto/scatterwalk.h>
+ 
+ #include <asm/unaligned.h>
+ #include <linux/kernel.h>
+@@ -205,6 +206,155 @@ bool xchacha20poly1305_decrypt(u8 *dst,
+ }
+ EXPORT_SYMBOL(xchacha20poly1305_decrypt);
+ 
++static
++bool chacha20poly1305_crypt_sg_inplace(struct scatterlist *src,
++                                     const size_t src_len,
++                                     const u8 *ad, const size_t ad_len,
++                                     const u64 nonce,
++                                     const u8 key[CHACHA20POLY1305_KEY_SIZE],
++                                     int encrypt)
++{
++      const u8 *pad0 = page_address(ZERO_PAGE(0));
++      struct poly1305_desc_ctx poly1305_state;
++      u32 chacha_state[CHACHA_STATE_WORDS];
++      struct sg_mapping_iter miter;
++      size_t partial = 0;
++      unsigned int flags;
++      bool ret = true;
++      int sl;
++      union {
++              struct {
++                      u32 k[CHACHA_KEY_WORDS];
++                      __le64 iv[2];
++              };
++              u8 block0[POLY1305_KEY_SIZE];
++              u8 chacha_stream[CHACHA_BLOCK_SIZE];
++              struct {
++                      u8 mac[2][POLY1305_DIGEST_SIZE];
++              };
++              __le64 lens[2];
++      } b __aligned(16);
++
++      chacha_load_key(b.k, key);
++
++      b.iv[0] = 0;
++      b.iv[1] = cpu_to_le64(nonce);
++
++      chacha_init(chacha_state, b.k, (u8 *)b.iv);
++      chacha_crypt(chacha_state, b.block0, pad0, sizeof(b.block0), 20);
++      poly1305_init(&poly1305_state, b.block0);
++
++      if (unlikely(ad_len)) {
++              poly1305_update(&poly1305_state, ad, ad_len);
++              if (ad_len & 0xf)
++                      poly1305_update(&poly1305_state, pad0, 0x10 - (ad_len & 0xf));
++      }
++
++      flags = SG_MITER_TO_SG;
++      if (!preemptible())
++              flags |= SG_MITER_ATOMIC;
++
++      sg_miter_start(&miter, src, sg_nents(src), flags);
++
++      for (sl = src_len; sl > 0 && sg_miter_next(&miter); sl -= miter.length) {
++              u8 *addr = miter.addr;
++              size_t length = min_t(size_t, sl, miter.length);
++
++              if (!encrypt)
++                      poly1305_update(&poly1305_state, addr, length);
++
++              if (unlikely(partial)) {
++                      size_t l = min(length, CHACHA_BLOCK_SIZE - partial);
++
++                      crypto_xor(addr, b.chacha_stream + partial, l);
++                      partial = (partial + l) & (CHACHA_BLOCK_SIZE - 1);
++
++                      addr += l;
++                      length -= l;
++              }
++
++              if (likely(length >= CHACHA_BLOCK_SIZE || length == sl)) {
++                      size_t l = length;
++
++                      if (unlikely(length < sl))
++                              l &= ~(CHACHA_BLOCK_SIZE - 1);
++                      chacha_crypt(chacha_state, addr, addr, l, 20);
++                      addr += l;
++                      length -= l;
++              }
++
++              if (unlikely(length > 0)) {
++                      chacha_crypt(chacha_state, b.chacha_stream, pad0,
++                                   CHACHA_BLOCK_SIZE, 20);
++                      crypto_xor(addr, b.chacha_stream, length);
++                      partial = length;
++              }
++
++              if (encrypt)
++                      poly1305_update(&poly1305_state, miter.addr,
++                                      min_t(size_t, sl, miter.length));
++      }
++
++      if (src_len & 0xf)
++              poly1305_update(&poly1305_state, pad0, 0x10 - (src_len & 0xf));
++
++      b.lens[0] = cpu_to_le64(ad_len);
++      b.lens[1] = cpu_to_le64(src_len);
++      poly1305_update(&poly1305_state, (u8 *)b.lens, sizeof(b.lens));
++
++      if (likely(sl <= -POLY1305_DIGEST_SIZE)) {
++              if (encrypt) {
++                      poly1305_final(&poly1305_state,
++                                     miter.addr + miter.length + sl);
++                      ret = true;
++              } else {
++                      poly1305_final(&poly1305_state, b.mac[0]);
++                      ret = !crypto_memneq(b.mac[0],
++                                           miter.addr + miter.length + sl,
++                                           POLY1305_DIGEST_SIZE);
++              }
++      }
++
++      sg_miter_stop(&miter);
++
++      if (unlikely(sl > -POLY1305_DIGEST_SIZE)) {
++              poly1305_final(&poly1305_state, b.mac[1]);
++              scatterwalk_map_and_copy(b.mac[encrypt], src, src_len,
++                                       sizeof(b.mac[1]), encrypt);
++              ret = encrypt ||
++                    !crypto_memneq(b.mac[0], b.mac[1], POLY1305_DIGEST_SIZE);
++      }
++
++      memzero_explicit(chacha_state, sizeof(chacha_state));
++      memzero_explicit(&b, sizeof(b));
++
++      return ret;
++}
++
++bool chacha20poly1305_encrypt_sg_inplace(struct scatterlist *src, size_t src_len,
++                                       const u8 *ad, const size_t ad_len,
++                                       const u64 nonce,
++                                       const u8 key[CHACHA20POLY1305_KEY_SIZE])
++{
++      return chacha20poly1305_crypt_sg_inplace(src, src_len, ad, ad_len,
++                                               nonce, key, 1);
++}
++EXPORT_SYMBOL(chacha20poly1305_encrypt_sg_inplace);
++
++bool chacha20poly1305_decrypt_sg_inplace(struct scatterlist *src, size_t src_len,
++                                       const u8 *ad, const size_t ad_len,
++                                       const u64 nonce,
++                                       const u8 key[CHACHA20POLY1305_KEY_SIZE])
++{
++      if (unlikely(src_len < POLY1305_DIGEST_SIZE))
++              return false;
++
++      return chacha20poly1305_crypt_sg_inplace(src,
++                                               src_len - POLY1305_DIGEST_SIZE,
++                                               ad, ad_len, nonce, key, 0);
++}
++EXPORT_SYMBOL(chacha20poly1305_decrypt_sg_inplace);
++
+ static int __init mod_init(void)
+ {
+       if (!IS_ENABLED(CONFIG_CRYPTO_MANAGER_DISABLE_TESTS) &&
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0034-crypto-chacha_generic-remove-unnecessary-setkey-func.patch b/target/linux/generic/backport-5.4/080-wireguard-0034-crypto-chacha_generic-remove-unnecessary-setkey-func.patch

new file mode 100644 (file)

index 0000000..493da3a
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0034-crypto-chacha_generic-remove-unnecessary-setkey-func.patch
@@ -0,0 +1,68 @@
+From d59a7ffb8aa6735586929c5a2d90e142c6d6952d Mon Sep 17 00:00:00 2001
+From: Eric Biggers <ebiggers@google.com>
+Date: Sun, 17 Nov 2019 23:21:29 -0800
+Subject: [PATCH 034/124] crypto: chacha_generic - remove unnecessary setkey()
+ functions
+
+commit 2043323a799a660bc84bbee404cf7a2617ec6157 upstream.
+
+Use chacha20_setkey() and chacha12_setkey() from
+<crypto/internal/chacha.h> instead of defining them again in
+chacha_generic.c.
+
+Signed-off-by: Eric Biggers <ebiggers@google.com>
+Acked-by: Ard Biesheuvel <ardb@kernel.org>
+Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ crypto/chacha_generic.c | 18 +++---------------
+ 1 file changed, 3 insertions(+), 15 deletions(-)
+
+--- a/crypto/chacha_generic.c
++++ b/crypto/chacha_generic.c
+@@ -37,18 +37,6 @@ static int chacha_stream_xor(struct skci
+       return err;
+ }
+ 
+-static int crypto_chacha20_setkey(struct crypto_skcipher *tfm, const u8 *key,
+-                                unsigned int keysize)
+-{
+-      return chacha_setkey(tfm, key, keysize, 20);
+-}
+-
+-static int crypto_chacha12_setkey(struct crypto_skcipher *tfm, const u8 *key,
+-                               unsigned int keysize)
+-{
+-      return chacha_setkey(tfm, key, keysize, 12);
+-}
+-
+ static int crypto_chacha_crypt(struct skcipher_request *req)
+ {
+       struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+@@ -91,7 +79,7 @@ static struct skcipher_alg algs[] = {
+               .max_keysize            = CHACHA_KEY_SIZE,
+               .ivsize                 = CHACHA_IV_SIZE,
+               .chunksize              = CHACHA_BLOCK_SIZE,
+-              .setkey                 = crypto_chacha20_setkey,
++              .setkey                 = chacha20_setkey,
+               .encrypt                = crypto_chacha_crypt,
+               .decrypt                = crypto_chacha_crypt,
+       }, {
+@@ -106,7 +94,7 @@ static struct skcipher_alg algs[] = {
+               .max_keysize            = CHACHA_KEY_SIZE,
+               .ivsize                 = XCHACHA_IV_SIZE,
+               .chunksize              = CHACHA_BLOCK_SIZE,
+-              .setkey                 = crypto_chacha20_setkey,
++              .setkey                 = chacha20_setkey,
+               .encrypt                = crypto_xchacha_crypt,
+               .decrypt                = crypto_xchacha_crypt,
+       }, {
+@@ -121,7 +109,7 @@ static struct skcipher_alg algs[] = {
+               .max_keysize            = CHACHA_KEY_SIZE,
+               .ivsize                 = XCHACHA_IV_SIZE,
+               .chunksize              = CHACHA_BLOCK_SIZE,
+-              .setkey                 = crypto_chacha12_setkey,
++              .setkey                 = chacha12_setkey,
+               .encrypt                = crypto_xchacha_crypt,
+               .decrypt                = crypto_xchacha_crypt,
+       }
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0035-crypto-x86-chacha-only-unregister-algorithms-if-regi.patch b/target/linux/generic/backport-5.4/080-wireguard-0035-crypto-x86-chacha-only-unregister-algorithms-if-regi.patch

new file mode 100644 (file)

index 0000000..f423acb
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0035-crypto-x86-chacha-only-unregister-algorithms-if-regi.patch
@@ -0,0 +1,32 @@
+From 4fa6b436d97e44deef404676d150ed4c13d63bba Mon Sep 17 00:00:00 2001
+From: Eric Biggers <ebiggers@google.com>
+Date: Sun, 17 Nov 2019 23:21:58 -0800
+Subject: [PATCH 035/124] crypto: x86/chacha - only unregister algorithms if
+ registered
+
+commit b62755aed3a3f5ca9edd2718339ccea3b6bbbe57 upstream.
+
+It's not valid to call crypto_unregister_skciphers() without a prior
+call to crypto_register_skciphers().
+
+Fixes: 84e03fa39fbe ("crypto: x86/chacha - expose SIMD ChaCha routine as library function")
+Signed-off-by: Eric Biggers <ebiggers@google.com>
+Acked-by: Ard Biesheuvel <ardb@kernel.org>
+Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ arch/x86/crypto/chacha_glue.c | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+--- a/arch/x86/crypto/chacha_glue.c
++++ b/arch/x86/crypto/chacha_glue.c
+@@ -304,7 +304,8 @@ static int __init chacha_simd_mod_init(v
+ 
+ static void __exit chacha_simd_mod_fini(void)
+ {
+-      crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
++      if (boot_cpu_has(X86_FEATURE_SSSE3))
++              crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
+ }
+ 
+ module_init(chacha_simd_mod_init);
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0036-crypto-lib-chacha20poly1305-use-chacha20_crypt.patch b/target/linux/generic/backport-5.4/080-wireguard-0036-crypto-lib-chacha20poly1305-use-chacha20_crypt.patch

new file mode 100644 (file)

index 0000000..1f6d22e
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0036-crypto-lib-chacha20poly1305-use-chacha20_crypt.patch
@@ -0,0 +1,83 @@
+From 41d7b5227dcad70f5bd6471e9620fe3c8b3db300 Mon Sep 17 00:00:00 2001
+From: Eric Biggers <ebiggers@google.com>
+Date: Sun, 17 Nov 2019 23:22:16 -0800
+Subject: [PATCH 036/124] crypto: lib/chacha20poly1305 - use chacha20_crypt()
+
+commit 413808b71e6204b0cc1eeaa77960f7c3cd381d33 upstream.
+
+Use chacha20_crypt() instead of chacha_crypt(), since it's not really
+appropriate for users of the ChaCha library API to be passing the number
+of rounds as an argument.
+
+Signed-off-by: Eric Biggers <ebiggers@google.com>
+Acked-by: Ard Biesheuvel <ardb@kernel.org>
+Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ lib/crypto/chacha20poly1305.c | 16 ++++++++--------
+ 1 file changed, 8 insertions(+), 8 deletions(-)
+
+--- a/lib/crypto/chacha20poly1305.c
++++ b/lib/crypto/chacha20poly1305.c
+@@ -66,14 +66,14 @@ __chacha20poly1305_encrypt(u8 *dst, cons
+               __le64 lens[2];
+       } b;
+ 
+-      chacha_crypt(chacha_state, b.block0, pad0, sizeof(b.block0), 20);
++      chacha20_crypt(chacha_state, b.block0, pad0, sizeof(b.block0));
+       poly1305_init(&poly1305_state, b.block0);
+ 
+       poly1305_update(&poly1305_state, ad, ad_len);
+       if (ad_len & 0xf)
+               poly1305_update(&poly1305_state, pad0, 0x10 - (ad_len & 0xf));
+ 
+-      chacha_crypt(chacha_state, dst, src, src_len, 20);
++      chacha20_crypt(chacha_state, dst, src, src_len);
+ 
+       poly1305_update(&poly1305_state, dst, src_len);
+       if (src_len & 0xf)
+@@ -140,7 +140,7 @@ __chacha20poly1305_decrypt(u8 *dst, cons
+       if (unlikely(src_len < POLY1305_DIGEST_SIZE))
+               return false;
+ 
+-      chacha_crypt(chacha_state, b.block0, pad0, sizeof(b.block0), 20);
++      chacha20_crypt(chacha_state, b.block0, pad0, sizeof(b.block0));
+       poly1305_init(&poly1305_state, b.block0);
+ 
+       poly1305_update(&poly1305_state, ad, ad_len);
+@@ -160,7 +160,7 @@ __chacha20poly1305_decrypt(u8 *dst, cons
+ 
+       ret = crypto_memneq(b.mac, src + dst_len, POLY1305_DIGEST_SIZE);
+       if (likely(!ret))
+-              chacha_crypt(chacha_state, dst, src, dst_len, 20);
++              chacha20_crypt(chacha_state, dst, src, dst_len);
+ 
+       memzero_explicit(&b, sizeof(b));
+ 
+@@ -241,7 +241,7 @@ bool chacha20poly1305_crypt_sg_inplace(s
+       b.iv[1] = cpu_to_le64(nonce);
+ 
+       chacha_init(chacha_state, b.k, (u8 *)b.iv);
+-      chacha_crypt(chacha_state, b.block0, pad0, sizeof(b.block0), 20);
++      chacha20_crypt(chacha_state, b.block0, pad0, sizeof(b.block0));
+       poly1305_init(&poly1305_state, b.block0);
+ 
+       if (unlikely(ad_len)) {
+@@ -278,14 +278,14 @@ bool chacha20poly1305_crypt_sg_inplace(s
+ 
+                       if (unlikely(length < sl))
+                               l &= ~(CHACHA_BLOCK_SIZE - 1);
+-                      chacha_crypt(chacha_state, addr, addr, l, 20);
++                      chacha20_crypt(chacha_state, addr, addr, l);
+                       addr += l;
+                       length -= l;
+               }
+ 
+               if (unlikely(length > 0)) {
+-                      chacha_crypt(chacha_state, b.chacha_stream, pad0,
+-                                   CHACHA_BLOCK_SIZE, 20);
++                      chacha20_crypt(chacha_state, b.chacha_stream, pad0,
++                                     CHACHA_BLOCK_SIZE);
+                       crypto_xor(addr, b.chacha_stream, length);
+                       partial = length;
+               }
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0037-crypto-arch-conditionalize-crypto-api-in-arch-glue-f.patch b/target/linux/generic/backport-5.4/080-wireguard-0037-crypto-arch-conditionalize-crypto-api-in-arch-glue-f.patch

new file mode 100644 (file)

index 0000000..ab04cec
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0037-crypto-arch-conditionalize-crypto-api-in-arch-glue-f.patch
@@ -0,0 +1,275 @@
+From f23fdc58a0a08afada84fe4910279ec3d8d085e7 Mon Sep 17 00:00:00 2001
+From: "Jason A. Donenfeld" <Jason@zx2c4.com>
+Date: Mon, 25 Nov 2019 11:31:12 +0100
+Subject: [PATCH 037/124] crypto: arch - conditionalize crypto api in arch glue
+ for lib code
+
+commit 8394bfec51e0e565556101bcc4e2fe7551104cd8 upstream.
+
+For glue code that's used by Zinc, the actual Crypto API functions might
+not necessarily exist, and don't need to exist either. Before this
+patch, there are valid build configurations that lead to an unbuildable
+kernel. This fixes it to conditionalize those symbols on the existence
+of the proper config entry.
+
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Acked-by: Ard Biesheuvel <ardb@kernel.org>
+Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ arch/arm/crypto/chacha-glue.c        | 26 ++++++++++++++++----------
+ arch/arm/crypto/curve25519-glue.c    |  5 +++--
+ arch/arm/crypto/poly1305-glue.c      |  9 ++++++---
+ arch/arm64/crypto/chacha-neon-glue.c |  5 +++--
+ arch/arm64/crypto/poly1305-glue.c    |  5 +++--
+ arch/mips/crypto/chacha-glue.c       |  6 ++++--
+ arch/mips/crypto/poly1305-glue.c     |  6 ++++--
+ arch/x86/crypto/blake2s-glue.c       |  6 ++++--
+ arch/x86/crypto/chacha_glue.c        |  5 +++--
+ arch/x86/crypto/curve25519-x86_64.c  |  7 ++++---
+ arch/x86/crypto/poly1305_glue.c      |  5 +++--
+ 11 files changed, 53 insertions(+), 32 deletions(-)
+
+--- a/arch/arm/crypto/chacha-glue.c
++++ b/arch/arm/crypto/chacha-glue.c
+@@ -286,11 +286,13 @@ static struct skcipher_alg neon_algs[] =
+ 
+ static int __init chacha_simd_mod_init(void)
+ {
+-      int err;
++      int err = 0;
+ 
+-      err = crypto_register_skciphers(arm_algs, ARRAY_SIZE(arm_algs));
+-      if (err)
+-              return err;
++      if (IS_REACHABLE(CONFIG_CRYPTO_BLKCIPHER)) {
++              err = crypto_register_skciphers(arm_algs, ARRAY_SIZE(arm_algs));
++              if (err)
++                      return err;
++      }
+ 
+       if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && (elf_hwcap & HWCAP_NEON)) {
+               int i;
+@@ -310,18 +312,22 @@ static int __init chacha_simd_mod_init(v
+                       static_branch_enable(&use_neon);
+               }
+ 
+-              err = crypto_register_skciphers(neon_algs, ARRAY_SIZE(neon_algs));
+-              if (err)
+-                      crypto_unregister_skciphers(arm_algs, ARRAY_SIZE(arm_algs));
++              if (IS_REACHABLE(CONFIG_CRYPTO_BLKCIPHER)) {
++                      err = crypto_register_skciphers(neon_algs, ARRAY_SIZE(neon_algs));
++                      if (err)
++                              crypto_unregister_skciphers(arm_algs, ARRAY_SIZE(arm_algs));
++              }
+       }
+       return err;
+ }
+ 
+ static void __exit chacha_simd_mod_fini(void)
+ {
+-      crypto_unregister_skciphers(arm_algs, ARRAY_SIZE(arm_algs));
+-      if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && (elf_hwcap & HWCAP_NEON))
+-              crypto_unregister_skciphers(neon_algs, ARRAY_SIZE(neon_algs));
++      if (IS_REACHABLE(CONFIG_CRYPTO_BLKCIPHER)) {
++              crypto_unregister_skciphers(arm_algs, ARRAY_SIZE(arm_algs));
++              if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && (elf_hwcap & HWCAP_NEON))
++                      crypto_unregister_skciphers(neon_algs, ARRAY_SIZE(neon_algs));
++      }
+ }
+ 
+ module_init(chacha_simd_mod_init);
+--- a/arch/arm/crypto/curve25519-glue.c
++++ b/arch/arm/crypto/curve25519-glue.c
+@@ -108,14 +108,15 @@ static int __init mod_init(void)
+ {
+       if (elf_hwcap & HWCAP_NEON) {
+               static_branch_enable(&have_neon);
+-              return crypto_register_kpp(&curve25519_alg);
++              return IS_REACHABLE(CONFIG_CRYPTO_KPP) ?
++                      crypto_register_kpp(&curve25519_alg) : 0;
+       }
+       return 0;
+ }
+ 
+ static void __exit mod_exit(void)
+ {
+-      if (elf_hwcap & HWCAP_NEON)
++      if (IS_REACHABLE(CONFIG_CRYPTO_KPP) && elf_hwcap & HWCAP_NEON)
+               crypto_unregister_kpp(&curve25519_alg);
+ }
+ 
+--- a/arch/arm/crypto/poly1305-glue.c
++++ b/arch/arm/crypto/poly1305-glue.c
+@@ -249,16 +249,19 @@ static int __init arm_poly1305_mod_init(
+       if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
+           (elf_hwcap & HWCAP_NEON))
+               static_branch_enable(&have_neon);
+-      else
++      else if (IS_REACHABLE(CONFIG_CRYPTO_HASH))
+               /* register only the first entry */
+               return crypto_register_shash(&arm_poly1305_algs[0]);
+ 
+-      return crypto_register_shashes(arm_poly1305_algs,
+-                                     ARRAY_SIZE(arm_poly1305_algs));
++      return IS_REACHABLE(CONFIG_CRYPTO_HASH) ?
++              crypto_register_shashes(arm_poly1305_algs,
++                                      ARRAY_SIZE(arm_poly1305_algs)) : 0;
+ }
+ 
+ static void __exit arm_poly1305_mod_exit(void)
+ {
++      if (!IS_REACHABLE(CONFIG_CRYPTO_HASH))
++              return;
+       if (!static_branch_likely(&have_neon)) {
+               crypto_unregister_shash(&arm_poly1305_algs[0]);
+               return;
+--- a/arch/arm64/crypto/chacha-neon-glue.c
++++ b/arch/arm64/crypto/chacha-neon-glue.c
+@@ -211,12 +211,13 @@ static int __init chacha_simd_mod_init(v
+ 
+       static_branch_enable(&have_neon);
+ 
+-      return crypto_register_skciphers(algs, ARRAY_SIZE(algs));
++      return IS_REACHABLE(CONFIG_CRYPTO_BLKCIPHER) ?
++              crypto_register_skciphers(algs, ARRAY_SIZE(algs)) : 0;
+ }
+ 
+ static void __exit chacha_simd_mod_fini(void)
+ {
+-      if (cpu_have_named_feature(ASIMD))
++      if (IS_REACHABLE(CONFIG_CRYPTO_BLKCIPHER) && cpu_have_named_feature(ASIMD))
+               crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
+ }
+ 
+--- a/arch/arm64/crypto/poly1305-glue.c
++++ b/arch/arm64/crypto/poly1305-glue.c
+@@ -220,12 +220,13 @@ static int __init neon_poly1305_mod_init
+ 
+       static_branch_enable(&have_neon);
+ 
+-      return crypto_register_shash(&neon_poly1305_alg);
++      return IS_REACHABLE(CONFIG_CRYPTO_HASH) ?
++              crypto_register_shash(&neon_poly1305_alg) : 0;
+ }
+ 
+ static void __exit neon_poly1305_mod_exit(void)
+ {
+-      if (cpu_have_named_feature(ASIMD))
++      if (IS_REACHABLE(CONFIG_CRYPTO_HASH) && cpu_have_named_feature(ASIMD))
+               crypto_unregister_shash(&neon_poly1305_alg);
+ }
+ 
+--- a/arch/mips/crypto/chacha-glue.c
++++ b/arch/mips/crypto/chacha-glue.c
+@@ -128,12 +128,14 @@ static struct skcipher_alg algs[] = {
+ 
+ static int __init chacha_simd_mod_init(void)
+ {
+-      return crypto_register_skciphers(algs, ARRAY_SIZE(algs));
++      return IS_REACHABLE(CONFIG_CRYPTO_BLKCIPHER) ?
++              crypto_register_skciphers(algs, ARRAY_SIZE(algs)) : 0;
+ }
+ 
+ static void __exit chacha_simd_mod_fini(void)
+ {
+-      crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
++      if (IS_REACHABLE(CONFIG_CRYPTO_BLKCIPHER))
++              crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
+ }
+ 
+ module_init(chacha_simd_mod_init);
+--- a/arch/mips/crypto/poly1305-glue.c
++++ b/arch/mips/crypto/poly1305-glue.c
+@@ -187,12 +187,14 @@ static struct shash_alg mips_poly1305_al
+ 
+ static int __init mips_poly1305_mod_init(void)
+ {
+-      return crypto_register_shash(&mips_poly1305_alg);
++      return IS_REACHABLE(CONFIG_CRYPTO_HASH) ?
++              crypto_register_shash(&mips_poly1305_alg) : 0;
+ }
+ 
+ static void __exit mips_poly1305_mod_exit(void)
+ {
+-      crypto_unregister_shash(&mips_poly1305_alg);
++      if (IS_REACHABLE(CONFIG_CRYPTO_HASH))
++              crypto_unregister_shash(&mips_poly1305_alg);
+ }
+ 
+ module_init(mips_poly1305_mod_init);
+--- a/arch/x86/crypto/blake2s-glue.c
++++ b/arch/x86/crypto/blake2s-glue.c
+@@ -210,12 +210,14 @@ static int __init blake2s_mod_init(void)
+                             XFEATURE_MASK_AVX512, NULL))
+               static_branch_enable(&blake2s_use_avx512);
+ 
+-      return crypto_register_shashes(blake2s_algs, ARRAY_SIZE(blake2s_algs));
++      return IS_REACHABLE(CONFIG_CRYPTO_HASH) ?
++              crypto_register_shashes(blake2s_algs,
++                                      ARRAY_SIZE(blake2s_algs)) : 0;
+ }
+ 
+ static void __exit blake2s_mod_exit(void)
+ {
+-      if (boot_cpu_has(X86_FEATURE_SSSE3))
++      if (IS_REACHABLE(CONFIG_CRYPTO_HASH) && boot_cpu_has(X86_FEATURE_SSSE3))
+               crypto_unregister_shashes(blake2s_algs, ARRAY_SIZE(blake2s_algs));
+ }
+ 
+--- a/arch/x86/crypto/chacha_glue.c
++++ b/arch/x86/crypto/chacha_glue.c
+@@ -299,12 +299,13 @@ static int __init chacha_simd_mod_init(v
+                   boot_cpu_has(X86_FEATURE_AVX512BW)) /* kmovq */
+                       static_branch_enable(&chacha_use_avx512vl);
+       }
+-      return crypto_register_skciphers(algs, ARRAY_SIZE(algs));
++      return IS_REACHABLE(CONFIG_CRYPTO_BLKCIPHER) ?
++              crypto_register_skciphers(algs, ARRAY_SIZE(algs)) : 0;
+ }
+ 
+ static void __exit chacha_simd_mod_fini(void)
+ {
+-      if (boot_cpu_has(X86_FEATURE_SSSE3))
++      if (IS_REACHABLE(CONFIG_CRYPTO_BLKCIPHER) && boot_cpu_has(X86_FEATURE_SSSE3))
+               crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
+ }
+ 
+--- a/arch/x86/crypto/curve25519-x86_64.c
++++ b/arch/x86/crypto/curve25519-x86_64.c
+@@ -2457,13 +2457,14 @@ static int __init curve25519_mod_init(vo
+               static_branch_enable(&curve25519_use_adx);
+       else
+               return 0;
+-      return crypto_register_kpp(&curve25519_alg);
++      return IS_REACHABLE(CONFIG_CRYPTO_KPP) ?
++              crypto_register_kpp(&curve25519_alg) : 0;
+ }
+ 
+ static void __exit curve25519_mod_exit(void)
+ {
+-      if (boot_cpu_has(X86_FEATURE_BMI2) ||
+-          boot_cpu_has(X86_FEATURE_ADX))
++      if (IS_REACHABLE(CONFIG_CRYPTO_KPP) &&
++          (boot_cpu_has(X86_FEATURE_BMI2) || boot_cpu_has(X86_FEATURE_ADX)))
+               crypto_unregister_kpp(&curve25519_alg);
+ }
+ 
+--- a/arch/x86/crypto/poly1305_glue.c
++++ b/arch/x86/crypto/poly1305_glue.c
+@@ -224,12 +224,13 @@ static int __init poly1305_simd_mod_init
+           cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL))
+               static_branch_enable(&poly1305_use_avx2);
+ 
+-      return crypto_register_shash(&alg);
++      return IS_REACHABLE(CONFIG_CRYPTO_HASH) ? crypto_register_shash(&alg) : 0;
+ }
+ 
+ static void __exit poly1305_simd_mod_exit(void)
+ {
+-      crypto_unregister_shash(&alg);
++      if (IS_REACHABLE(CONFIG_CRYPTO_HASH))
++              crypto_unregister_shash(&alg);
+ }
+ 
+ module_init(poly1305_simd_mod_init);
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0038-crypto-chacha-fix-warning-message-in-header-file.patch b/target/linux/generic/backport-5.4/080-wireguard-0038-crypto-chacha-fix-warning-message-in-header-file.patch

new file mode 100644 (file)

index 0000000..6170e7b
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0038-crypto-chacha-fix-warning-message-in-header-file.patch
@@ -0,0 +1,35 @@
+From 61ad3d7b564718b9810b8112a6d2e9ad6405b167 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Valdis=20Kl=C4=93tnieks?= <valdis.kletnieks@vt.edu>
+Date: Thu, 5 Dec 2019 20:58:36 -0500
+Subject: [PATCH 038/124] crypto: chacha - fix warning message in header file
+
+commit 579d705cd64e44f3fcda1a6cfd5f37468a5ddf63 upstream.
+
+Building with W=1 causes a warning:
+
+  CC [M]  arch/x86/crypto/chacha_glue.o
+In file included from arch/x86/crypto/chacha_glue.c:10:
+./include/crypto/internal/chacha.h:37:1: warning: 'inline' is not at beginning of declaration [-Wold-style-declaration]
+   37 | static int inline chacha12_setkey(struct crypto_skcipher *tfm, const u8 *key,
+      | ^~~~~~
+
+Straighten out the order to match the rest of the header file.
+
+Signed-off-by: Valdis Kletnieks <valdis.kletnieks@vt.edu>
+Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ include/crypto/internal/chacha.h | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/include/crypto/internal/chacha.h
++++ b/include/crypto/internal/chacha.h
+@@ -34,7 +34,7 @@ static inline int chacha20_setkey(struct
+       return chacha_setkey(tfm, key, keysize, 20);
+ }
+ 
+-static int inline chacha12_setkey(struct crypto_skcipher *tfm, const u8 *key,
++static inline int chacha12_setkey(struct crypto_skcipher *tfm, const u8 *key,
+                                 unsigned int keysize)
+ {
+       return chacha_setkey(tfm, key, keysize, 12);
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0039-crypto-arm-curve25519-add-arch-specific-key-generati.patch b/target/linux/generic/backport-5.4/080-wireguard-0039-crypto-arm-curve25519-add-arch-specific-key-generati.patch

new file mode 100644 (file)

index 0000000..19d1338
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0039-crypto-arm-curve25519-add-arch-specific-key-generati.patch
@@ -0,0 +1,38 @@
+From 610442255536492764547dddde0289d46a9566db Mon Sep 17 00:00:00 2001
+From: "Jason A. Donenfeld" <Jason@zx2c4.com>
+Date: Wed, 11 Dec 2019 10:26:39 +0100
+Subject: [PATCH 039/124] crypto: arm/curve25519 - add arch-specific key
+ generation function
+
+commit 84faa307249b341f6ad8de3e1869d77a65e26669 upstream.
+
+Somehow this was forgotten when Zinc was being split into oddly shaped
+pieces, resulting in linker errors. The x86_64 glue has a specific key
+generation implementation, but the Arm one does not. However, it can
+still receive the NEON speedups by calling the ordinary DH function
+using the base point.
+
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Acked-by: Ard Biesheuvel <ardb@kernel.org>
+Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ arch/arm/crypto/curve25519-glue.c | 7 +++++++
+ 1 file changed, 7 insertions(+)
+
+--- a/arch/arm/crypto/curve25519-glue.c
++++ b/arch/arm/crypto/curve25519-glue.c
+@@ -38,6 +38,13 @@ void curve25519_arch(u8 out[CURVE25519_K
+ }
+ EXPORT_SYMBOL(curve25519_arch);
+ 
++void curve25519_base_arch(u8 pub[CURVE25519_KEY_SIZE],
++                        const u8 secret[CURVE25519_KEY_SIZE])
++{
++      curve25519_arch(pub, secret, curve25519_base_point); /* DH against the base point */
++}
++EXPORT_SYMBOL(curve25519_base_arch);
++
+ static int curve25519_set_secret(struct crypto_kpp *tfm, const void *buf,
+                                unsigned int len)
+ {
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0040-crypto-lib-curve25519-re-add-selftests.patch b/target/linux/generic/backport-5.4/080-wireguard-0040-crypto-lib-curve25519-re-add-selftests.patch

new file mode 100644 (file)

index 0000000..e4de170
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0040-crypto-lib-curve25519-re-add-selftests.patch
@@ -0,0 +1,1387 @@
+From 63b5e3c85a71705225aa3eab04127b3449a4ab5a Mon Sep 17 00:00:00 2001
+From: "Jason A. Donenfeld" <Jason@zx2c4.com>
+Date: Mon, 16 Dec 2019 19:53:26 +0100
+Subject: [PATCH 040/124] crypto: lib/curve25519 - re-add selftests
+
+commit aa127963f1cab2b93c74c9b128a84610203fb674 upstream.
+
+Somehow these were dropped when Zinc was being integrated, which is
+problematic, because testing the library interface for Curve25519 is
+important. This commit simply adds them back and wires them in in the
+same way that the blake2s selftests are wired in.
+
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ lib/crypto/Makefile              |    1 +
+ lib/crypto/curve25519-selftest.c | 1321 ++++++++++++++++++++++++++++++
+ lib/crypto/curve25519.c          |   17 +
+ 3 files changed, 1339 insertions(+)
+ create mode 100644 lib/crypto/curve25519-selftest.c
+
+--- a/lib/crypto/Makefile
++++ b/lib/crypto/Makefile
+@@ -36,4 +36,5 @@ libsha256-y                                  := sha256.o
+ ifneq ($(CONFIG_CRYPTO_MANAGER_DISABLE_TESTS),y)
+ libblake2s-y                                  += blake2s-selftest.o
+ libchacha20poly1305-y                         += chacha20poly1305-selftest.o
++libcurve25519-y                                       += curve25519-selftest.o
+ endif
+--- /dev/null
++++ b/lib/crypto/curve25519-selftest.c
+@@ -0,0 +1,1321 @@
++// SPDX-License-Identifier: GPL-2.0 OR MIT
++/*
++ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
++ */
++
++#include <crypto/curve25519.h>
++
++struct curve25519_test_vector {
++      u8 private[CURVE25519_KEY_SIZE];
++      u8 public[CURVE25519_KEY_SIZE];
++      u8 result[CURVE25519_KEY_SIZE];
++      bool valid;
++};
++static const struct curve25519_test_vector curve25519_test_vectors[] __initconst = {
++      {
++              .private = { 0x77, 0x07, 0x6d, 0x0a, 0x73, 0x18, 0xa5, 0x7d,
++                           0x3c, 0x16, 0xc1, 0x72, 0x51, 0xb2, 0x66, 0x45,
++                           0xdf, 0x4c, 0x2f, 0x87, 0xeb, 0xc0, 0x99, 0x2a,
++                           0xb1, 0x77, 0xfb, 0xa5, 0x1d, 0xb9, 0x2c, 0x2a },
++              .public = { 0xde, 0x9e, 0xdb, 0x7d, 0x7b, 0x7d, 0xc1, 0xb4,
++                          0xd3, 0x5b, 0x61, 0xc2, 0xec, 0xe4, 0x35, 0x37,
++                          0x3f, 0x83, 0x43, 0xc8, 0x5b, 0x78, 0x67, 0x4d,
++                          0xad, 0xfc, 0x7e, 0x14, 0x6f, 0x88, 0x2b, 0x4f },
++              .result = { 0x4a, 0x5d, 0x9d, 0x5b, 0xa4, 0xce, 0x2d, 0xe1,
++                          0x72, 0x8e, 0x3b, 0xf4, 0x80, 0x35, 0x0f, 0x25,
++                          0xe0, 0x7e, 0x21, 0xc9, 0x47, 0xd1, 0x9e, 0x33,
++                          0x76, 0xf0, 0x9b, 0x3c, 0x1e, 0x16, 0x17, 0x42 },
++              .valid = true
++      },
++      {
++              .private = { 0x5d, 0xab, 0x08, 0x7e, 0x62, 0x4a, 0x8a, 0x4b,
++                           0x79, 0xe1, 0x7f, 0x8b, 0x83, 0x80, 0x0e, 0xe6,
++                           0x6f, 0x3b, 0xb1, 0x29, 0x26, 0x18, 0xb6, 0xfd,
++                           0x1c, 0x2f, 0x8b, 0x27, 0xff, 0x88, 0xe0, 0xeb },
++              .public = { 0x85, 0x20, 0xf0, 0x09, 0x89, 0x30, 0xa7, 0x54,
++                          0x74, 0x8b, 0x7d, 0xdc, 0xb4, 0x3e, 0xf7, 0x5a,
++                          0x0d, 0xbf, 0x3a, 0x0d, 0x26, 0x38, 0x1a, 0xf4,
++                          0xeb, 0xa4, 0xa9, 0x8e, 0xaa, 0x9b, 0x4e, 0x6a },
++              .result = { 0x4a, 0x5d, 0x9d, 0x5b, 0xa4, 0xce, 0x2d, 0xe1,
++                          0x72, 0x8e, 0x3b, 0xf4, 0x80, 0x35, 0x0f, 0x25,
++                          0xe0, 0x7e, 0x21, 0xc9, 0x47, 0xd1, 0x9e, 0x33,
++                          0x76, 0xf0, 0x9b, 0x3c, 0x1e, 0x16, 0x17, 0x42 },
++              .valid = true
++      },
++      {
++              .private = { 1 },
++              .public = { 0x25, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
++              .result = { 0x3c, 0x77, 0x77, 0xca, 0xf9, 0x97, 0xb2, 0x64,
++                          0x41, 0x60, 0x77, 0x66, 0x5b, 0x4e, 0x22, 0x9d,
++                          0x0b, 0x95, 0x48, 0xdc, 0x0c, 0xd8, 0x19, 0x98,
++                          0xdd, 0xcd, 0xc5, 0xc8, 0x53, 0x3c, 0x79, 0x7f },
++              .valid = true
++      },
++      {
++              .private = { 1 },
++              .public = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
++              .result = { 0xb3, 0x2d, 0x13, 0x62, 0xc2, 0x48, 0xd6, 0x2f,
++                          0xe6, 0x26, 0x19, 0xcf, 0xf0, 0x4d, 0xd4, 0x3d,
++                          0xb7, 0x3f, 0xfc, 0x1b, 0x63, 0x08, 0xed, 0xe3,
++                          0x0b, 0x78, 0xd8, 0x73, 0x80, 0xf1, 0xe8, 0x34 },
++              .valid = true
++      },
++      {
++              .private = { 0xa5, 0x46, 0xe3, 0x6b, 0xf0, 0x52, 0x7c, 0x9d,
++                           0x3b, 0x16, 0x15, 0x4b, 0x82, 0x46, 0x5e, 0xdd,
++                           0x62, 0x14, 0x4c, 0x0a, 0xc1, 0xfc, 0x5a, 0x18,
++                           0x50, 0x6a, 0x22, 0x44, 0xba, 0x44, 0x9a, 0xc4 },
++              .public = { 0xe6, 0xdb, 0x68, 0x67, 0x58, 0x30, 0x30, 0xdb,
++                          0x35, 0x94, 0xc1, 0xa4, 0x24, 0xb1, 0x5f, 0x7c,
++                          0x72, 0x66, 0x24, 0xec, 0x26, 0xb3, 0x35, 0x3b,
++                          0x10, 0xa9, 0x03, 0xa6, 0xd0, 0xab, 0x1c, 0x4c },
++              .result = { 0xc3, 0xda, 0x55, 0x37, 0x9d, 0xe9, 0xc6, 0x90,
++                          0x8e, 0x94, 0xea, 0x4d, 0xf2, 0x8d, 0x08, 0x4f,
++                          0x32, 0xec, 0xcf, 0x03, 0x49, 0x1c, 0x71, 0xf7,
++                          0x54, 0xb4, 0x07, 0x55, 0x77, 0xa2, 0x85, 0x52 },
++              .valid = true
++      },
++      {
++              .private = { 1, 2, 3, 4 },
++              .public = { 0 },
++              .result = { 0 },
++              .valid = false
++      },
++      {
++              .private = { 2, 4, 6, 8 },
++              .public = { 0xe0, 0xeb, 0x7a, 0x7c, 0x3b, 0x41, 0xb8, 0xae,
++                          0x16, 0x56, 0xe3, 0xfa, 0xf1, 0x9f, 0xc4, 0x6a,
++                          0xda, 0x09, 0x8d, 0xeb, 0x9c, 0x32, 0xb1, 0xfd,
++                          0x86, 0x62, 0x05, 0x16, 0x5f, 0x49, 0xb8 },
++              .result = { 0 },
++              .valid = false
++      },
++      {
++              .private = { 0xff, 0xff, 0xff, 0xff, 0x0a, 0xff, 0xff, 0xff,
++                           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
++              .public = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0x0a, 0x00, 0xfb, 0x9f },
++              .result = { 0x77, 0x52, 0xb6, 0x18, 0xc1, 0x2d, 0x48, 0xd2,
++                          0xc6, 0x93, 0x46, 0x83, 0x81, 0x7c, 0xc6, 0x57,
++                          0xf3, 0x31, 0x03, 0x19, 0x49, 0x48, 0x20, 0x05,
++                          0x42, 0x2b, 0x4e, 0xae, 0x8d, 0x1d, 0x43, 0x23 },
++              .valid = true
++      },
++      {
++              .private = { 0x8e, 0x0a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                           0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                           0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                           0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
++              .public = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8e, 0x06 },
++              .result = { 0x5a, 0xdf, 0xaa, 0x25, 0x86, 0x8e, 0x32, 0x3d,
++                          0xae, 0x49, 0x62, 0xc1, 0x01, 0x5c, 0xb3, 0x12,
++                          0xe1, 0xc5, 0xc7, 0x9e, 0x95, 0x3f, 0x03, 0x99,
++                          0xb0, 0xba, 0x16, 0x22, 0xf3, 0xb6, 0xf7, 0x0c },
++              .valid = true
++      },
++      /* wycheproof - normal case */
++      {
++              .private = { 0x48, 0x52, 0x83, 0x4d, 0x9d, 0x6b, 0x77, 0xda,
++                           0xde, 0xab, 0xaa, 0xf2, 0xe1, 0x1d, 0xca, 0x66,
++                           0xd1, 0x9f, 0xe7, 0x49, 0x93, 0xa7, 0xbe, 0xc3,
++                           0x6c, 0x6e, 0x16, 0xa0, 0x98, 0x3f, 0xea, 0xba },
++              .public = { 0x9c, 0x64, 0x7d, 0x9a, 0xe5, 0x89, 0xb9, 0xf5,
++                          0x8f, 0xdc, 0x3c, 0xa4, 0x94, 0x7e, 0xfb, 0xc9,
++                          0x15, 0xc4, 0xb2, 0xe0, 0x8e, 0x74, 0x4a, 0x0e,
++                          0xdf, 0x46, 0x9d, 0xac, 0x59, 0xc8, 0xf8, 0x5a },
++              .result = { 0x87, 0xb7, 0xf2, 0x12, 0xb6, 0x27, 0xf7, 0xa5,
++                          0x4c, 0xa5, 0xe0, 0xbc, 0xda, 0xdd, 0xd5, 0x38,
++                          0x9d, 0x9d, 0xe6, 0x15, 0x6c, 0xdb, 0xcf, 0x8e,
++                          0xbe, 0x14, 0xff, 0xbc, 0xfb, 0x43, 0x65, 0x51 },
++              .valid = true
++      },
++      /* wycheproof - public key on twist */
++      {
++              .private = { 0x58, 0x8c, 0x06, 0x1a, 0x50, 0x80, 0x4a, 0xc4,
++                           0x88, 0xad, 0x77, 0x4a, 0xc7, 0x16, 0xc3, 0xf5,
++                           0xba, 0x71, 0x4b, 0x27, 0x12, 0xe0, 0x48, 0x49,
++                           0x13, 0x79, 0xa5, 0x00, 0x21, 0x19, 0x98, 0xa8 },
++              .public = { 0x63, 0xaa, 0x40, 0xc6, 0xe3, 0x83, 0x46, 0xc5,
++                          0xca, 0xf2, 0x3a, 0x6d, 0xf0, 0xa5, 0xe6, 0xc8,
++                          0x08, 0x89, 0xa0, 0x86, 0x47, 0xe5, 0x51, 0xb3,
++                          0x56, 0x34, 0x49, 0xbe, 0xfc, 0xfc, 0x97, 0x33 },
++              .result = { 0xb1, 0xa7, 0x07, 0x51, 0x94, 0x95, 0xff, 0xff,
++                          0xb2, 0x98, 0xff, 0x94, 0x17, 0x16, 0xb0, 0x6d,
++                          0xfa, 0xb8, 0x7c, 0xf8, 0xd9, 0x11, 0x23, 0xfe,
++                          0x2b, 0xe9, 0xa2, 0x33, 0xdd, 0xa2, 0x22, 0x12 },
++              .valid = true
++      },
++      /* wycheproof - public key on twist */
++      {
++              .private = { 0xb0, 0x5b, 0xfd, 0x32, 0xe5, 0x53, 0x25, 0xd9,
++                           0xfd, 0x64, 0x8c, 0xb3, 0x02, 0x84, 0x80, 0x39,
++                           0x00, 0x0b, 0x39, 0x0e, 0x44, 0xd5, 0x21, 0xe5,
++                           0x8a, 0xab, 0x3b, 0x29, 0xa6, 0x96, 0x0b, 0xa8 },
++              .public = { 0x0f, 0x83, 0xc3, 0x6f, 0xde, 0xd9, 0xd3, 0x2f,
++                          0xad, 0xf4, 0xef, 0xa3, 0xae, 0x93, 0xa9, 0x0b,
++                          0xb5, 0xcf, 0xa6, 0x68, 0x93, 0xbc, 0x41, 0x2c,
++                          0x43, 0xfa, 0x72, 0x87, 0xdb, 0xb9, 0x97, 0x79 },
++              .result = { 0x67, 0xdd, 0x4a, 0x6e, 0x16, 0x55, 0x33, 0x53,
++                          0x4c, 0x0e, 0x3f, 0x17, 0x2e, 0x4a, 0xb8, 0x57,
++                          0x6b, 0xca, 0x92, 0x3a, 0x5f, 0x07, 0xb2, 0xc0,
++                          0x69, 0xb4, 0xc3, 0x10, 0xff, 0x2e, 0x93, 0x5b },
++              .valid = true
++      },
++      /* wycheproof - public key on twist */
++      {
++              .private = { 0x70, 0xe3, 0x4b, 0xcb, 0xe1, 0xf4, 0x7f, 0xbc,
++                           0x0f, 0xdd, 0xfd, 0x7c, 0x1e, 0x1a, 0xa5, 0x3d,
++                           0x57, 0xbf, 0xe0, 0xf6, 0x6d, 0x24, 0x30, 0x67,
++                           0xb4, 0x24, 0xbb, 0x62, 0x10, 0xbe, 0xd1, 0x9c },
++              .public = { 0x0b, 0x82, 0x11, 0xa2, 0xb6, 0x04, 0x90, 0x97,
++                          0xf6, 0x87, 0x1c, 0x6c, 0x05, 0x2d, 0x3c, 0x5f,
++                          0xc1, 0xba, 0x17, 0xda, 0x9e, 0x32, 0xae, 0x45,
++                          0x84, 0x03, 0xb0, 0x5b, 0xb2, 0x83, 0x09, 0x2a },
++              .result = { 0x4a, 0x06, 0x38, 0xcf, 0xaa, 0x9e, 0xf1, 0x93,
++                          0x3b, 0x47, 0xf8, 0x93, 0x92, 0x96, 0xa6, 0xb2,
++                          0x5b, 0xe5, 0x41, 0xef, 0x7f, 0x70, 0xe8, 0x44,
++                          0xc0, 0xbc, 0xc0, 0x0b, 0x13, 0x4d, 0xe6, 0x4a },
++              .valid = true
++      },
++      /* wycheproof - public key on twist */
++      {
++              .private = { 0x68, 0xc1, 0xf3, 0xa6, 0x53, 0xa4, 0xcd, 0xb1,
++                           0xd3, 0x7b, 0xba, 0x94, 0x73, 0x8f, 0x8b, 0x95,
++                           0x7a, 0x57, 0xbe, 0xb2, 0x4d, 0x64, 0x6e, 0x99,
++                           0x4d, 0xc2, 0x9a, 0x27, 0x6a, 0xad, 0x45, 0x8d },
++              .public = { 0x34, 0x3a, 0xc2, 0x0a, 0x3b, 0x9c, 0x6a, 0x27,
++                          0xb1, 0x00, 0x81, 0x76, 0x50, 0x9a, 0xd3, 0x07,
++                          0x35, 0x85, 0x6e, 0xc1, 0xc8, 0xd8, 0xfc, 0xae,
++                          0x13, 0x91, 0x2d, 0x08, 0xd1, 0x52, 0xf4, 0x6c },
++              .result = { 0x39, 0x94, 0x91, 0xfc, 0xe8, 0xdf, 0xab, 0x73,
++                          0xb4, 0xf9, 0xf6, 0x11, 0xde, 0x8e, 0xa0, 0xb2,
++                          0x7b, 0x28, 0xf8, 0x59, 0x94, 0x25, 0x0b, 0x0f,
++                          0x47, 0x5d, 0x58, 0x5d, 0x04, 0x2a, 0xc2, 0x07 },
++              .valid = true
++      },
++      /* wycheproof - public key on twist */
++      {
++              .private = { 0xd8, 0x77, 0xb2, 0x6d, 0x06, 0xdf, 0xf9, 0xd9,
++                           0xf7, 0xfd, 0x4c, 0x5b, 0x37, 0x69, 0xf8, 0xcd,
++                           0xd5, 0xb3, 0x05, 0x16, 0xa5, 0xab, 0x80, 0x6b,
++                           0xe3, 0x24, 0xff, 0x3e, 0xb6, 0x9e, 0xa0, 0xb2 },
++              .public = { 0xfa, 0x69, 0x5f, 0xc7, 0xbe, 0x8d, 0x1b, 0xe5,
++                          0xbf, 0x70, 0x48, 0x98, 0xf3, 0x88, 0xc4, 0x52,
++                          0xba, 0xfd, 0xd3, 0xb8, 0xea, 0xe8, 0x05, 0xf8,
++                          0x68, 0x1a, 0x8d, 0x15, 0xc2, 0xd4, 0xe1, 0x42 },
++              .result = { 0x2c, 0x4f, 0xe1, 0x1d, 0x49, 0x0a, 0x53, 0x86,
++                          0x17, 0x76, 0xb1, 0x3b, 0x43, 0x54, 0xab, 0xd4,
++                          0xcf, 0x5a, 0x97, 0x69, 0x9d, 0xb6, 0xe6, 0xc6,
++                          0x8c, 0x16, 0x26, 0xd0, 0x76, 0x62, 0xf7, 0x58 },
++              .valid = true
++      },
++      /* wycheproof - public key = 0 */
++      {
++              .private = { 0x20, 0x74, 0x94, 0x03, 0x8f, 0x2b, 0xb8, 0x11,
++                           0xd4, 0x78, 0x05, 0xbc, 0xdf, 0x04, 0xa2, 0xac,
++                           0x58, 0x5a, 0xda, 0x7f, 0x2f, 0x23, 0x38, 0x9b,
++                           0xfd, 0x46, 0x58, 0xf9, 0xdd, 0xd4, 0xde, 0xbc },
++              .public = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
++              .result = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
++              .valid = false
++      },
++      /* wycheproof - public key = 1 */
++      {
++              .private = { 0x20, 0x2e, 0x89, 0x72, 0xb6, 0x1c, 0x7e, 0x61,
++                           0x93, 0x0e, 0xb9, 0x45, 0x0b, 0x50, 0x70, 0xea,
++                           0xe1, 0xc6, 0x70, 0x47, 0x56, 0x85, 0x54, 0x1f,
++                           0x04, 0x76, 0x21, 0x7e, 0x48, 0x18, 0xcf, 0xab },
++              .public = { 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
++              .result = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
++              .valid = false
++      },
++      /* wycheproof - edge case on twist */
++      {
++              .private = { 0x38, 0xdd, 0xe9, 0xf3, 0xe7, 0xb7, 0x99, 0x04,
++                           0x5f, 0x9a, 0xc3, 0x79, 0x3d, 0x4a, 0x92, 0x77,
++                           0xda, 0xde, 0xad, 0xc4, 0x1b, 0xec, 0x02, 0x90,
++                           0xf8, 0x1f, 0x74, 0x4f, 0x73, 0x77, 0x5f, 0x84 },
++              .public = { 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
++              .result = { 0x9a, 0x2c, 0xfe, 0x84, 0xff, 0x9c, 0x4a, 0x97,
++                          0x39, 0x62, 0x5c, 0xae, 0x4a, 0x3b, 0x82, 0xa9,
++                          0x06, 0x87, 0x7a, 0x44, 0x19, 0x46, 0xf8, 0xd7,
++                          0xb3, 0xd7, 0x95, 0xfe, 0x8f, 0x5d, 0x16, 0x39 },
++              .valid = true
++      },
++      /* wycheproof - edge case on twist */
++      {
++              .private = { 0x98, 0x57, 0xa9, 0x14, 0xe3, 0xc2, 0x90, 0x36,
++                           0xfd, 0x9a, 0x44, 0x2b, 0xa5, 0x26, 0xb5, 0xcd,
++                           0xcd, 0xf2, 0x82, 0x16, 0x15, 0x3e, 0x63, 0x6c,
++                           0x10, 0x67, 0x7a, 0xca, 0xb6, 0xbd, 0x6a, 0xa5 },
++              .public = { 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
++              .result = { 0x4d, 0xa4, 0xe0, 0xaa, 0x07, 0x2c, 0x23, 0x2e,
++                          0xe2, 0xf0, 0xfa, 0x4e, 0x51, 0x9a, 0xe5, 0x0b,
++                          0x52, 0xc1, 0xed, 0xd0, 0x8a, 0x53, 0x4d, 0x4e,
++                          0xf3, 0x46, 0xc2, 0xe1, 0x06, 0xd2, 0x1d, 0x60 },
++              .valid = true
++      },
++      /* wycheproof - edge case on twist */
++      {
++              .private = { 0x48, 0xe2, 0x13, 0x0d, 0x72, 0x33, 0x05, 0xed,
++                           0x05, 0xe6, 0xe5, 0x89, 0x4d, 0x39, 0x8a, 0x5e,
++                           0x33, 0x36, 0x7a, 0x8c, 0x6a, 0xac, 0x8f, 0xcd,
++                           0xf0, 0xa8, 0x8e, 0x4b, 0x42, 0x82, 0x0d, 0xb7 },
++              .public = { 0xff, 0xff, 0xff, 0x03, 0x00, 0x00, 0xf8, 0xff,
++                          0xff, 0x1f, 0x00, 0x00, 0xc0, 0xff, 0xff, 0xff,
++                          0x00, 0x00, 0x00, 0xfe, 0xff, 0xff, 0x07, 0x00,
++                          0x00, 0xf0, 0xff, 0xff, 0x3f, 0x00, 0x00, 0x00 },
++              .result = { 0x9e, 0xd1, 0x0c, 0x53, 0x74, 0x7f, 0x64, 0x7f,
++                          0x82, 0xf4, 0x51, 0x25, 0xd3, 0xde, 0x15, 0xa1,
++                          0xe6, 0xb8, 0x24, 0x49, 0x6a, 0xb4, 0x04, 0x10,
++                          0xff, 0xcc, 0x3c, 0xfe, 0x95, 0x76, 0x0f, 0x3b },
++              .valid = true
++      },
++      /* wycheproof - edge case on twist */
++      {
++              .private = { 0x28, 0xf4, 0x10, 0x11, 0x69, 0x18, 0x51, 0xb3,
++                           0xa6, 0x2b, 0x64, 0x15, 0x53, 0xb3, 0x0d, 0x0d,
++                           0xfd, 0xdc, 0xb8, 0xff, 0xfc, 0xf5, 0x37, 0x00,
++                           0xa7, 0xbe, 0x2f, 0x6a, 0x87, 0x2e, 0x9f, 0xb0 },
++              .public = { 0x00, 0x00, 0x00, 0xfc, 0xff, 0xff, 0x07, 0x00,
++                          0x00, 0xe0, 0xff, 0xff, 0x3f, 0x00, 0x00, 0x00,
++                          0xff, 0xff, 0xff, 0x01, 0x00, 0x00, 0xf8, 0xff,
++                          0xff, 0x0f, 0x00, 0x00, 0xc0, 0xff, 0xff, 0x7f },
++              .result = { 0xcf, 0x72, 0xb4, 0xaa, 0x6a, 0xa1, 0xc9, 0xf8,
++                          0x94, 0xf4, 0x16, 0x5b, 0x86, 0x10, 0x9a, 0xa4,
++                          0x68, 0x51, 0x76, 0x48, 0xe1, 0xf0, 0xcc, 0x70,
++                          0xe1, 0xab, 0x08, 0x46, 0x01, 0x76, 0x50, 0x6b },
++              .valid = true
++      },
++      /* wycheproof - edge case on twist */
++      {
++              .private = { 0x18, 0xa9, 0x3b, 0x64, 0x99, 0xb9, 0xf6, 0xb3,
++                           0x22, 0x5c, 0xa0, 0x2f, 0xef, 0x41, 0x0e, 0x0a,
++                           0xde, 0xc2, 0x35, 0x32, 0x32, 0x1d, 0x2d, 0x8e,
++                           0xf1, 0xa6, 0xd6, 0x02, 0xa8, 0xc6, 0x5b, 0x83 },
++              .public = { 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
++                          0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
++                          0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
++                          0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0x7f },
++              .result = { 0x5d, 0x50, 0xb6, 0x28, 0x36, 0xbb, 0x69, 0x57,
++                          0x94, 0x10, 0x38, 0x6c, 0xf7, 0xbb, 0x81, 0x1c,
++                          0x14, 0xbf, 0x85, 0xb1, 0xc7, 0xb1, 0x7e, 0x59,
++                          0x24, 0xc7, 0xff, 0xea, 0x91, 0xef, 0x9e, 0x12 },
++              .valid = true
++      },
++      /* wycheproof - edge case on twist */
++      {
++              .private = { 0xc0, 0x1d, 0x13, 0x05, 0xa1, 0x33, 0x8a, 0x1f,
++                           0xca, 0xc2, 0xba, 0x7e, 0x2e, 0x03, 0x2b, 0x42,
++                           0x7e, 0x0b, 0x04, 0x90, 0x31, 0x65, 0xac, 0xa9,
++                           0x57, 0xd8, 0xd0, 0x55, 0x3d, 0x87, 0x17, 0xb0 },
++              .public = { 0xea, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f },
++              .result = { 0x19, 0x23, 0x0e, 0xb1, 0x48, 0xd5, 0xd6, 0x7c,
++                          0x3c, 0x22, 0xab, 0x1d, 0xae, 0xff, 0x80, 0xa5,
++                          0x7e, 0xae, 0x42, 0x65, 0xce, 0x28, 0x72, 0x65,
++                          0x7b, 0x2c, 0x80, 0x99, 0xfc, 0x69, 0x8e, 0x50 },
++              .valid = true
++      },
++      /* wycheproof - edge case for public key */
++      {
++              .private = { 0x38, 0x6f, 0x7f, 0x16, 0xc5, 0x07, 0x31, 0xd6,
++                           0x4f, 0x82, 0xe6, 0xa1, 0x70, 0xb1, 0x42, 0xa4,
++                           0xe3, 0x4f, 0x31, 0xfd, 0x77, 0x68, 0xfc, 0xb8,
++                           0x90, 0x29, 0x25, 0xe7, 0xd1, 0xe2, 0x1a, 0xbe },
++              .public = { 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
++              .result = { 0x0f, 0xca, 0xb5, 0xd8, 0x42, 0xa0, 0x78, 0xd7,
++                          0xa7, 0x1f, 0xc5, 0x9b, 0x57, 0xbf, 0xb4, 0xca,
++                          0x0b, 0xe6, 0x87, 0x3b, 0x49, 0xdc, 0xdb, 0x9f,
++                          0x44, 0xe1, 0x4a, 0xe8, 0xfb, 0xdf, 0xa5, 0x42 },
++              .valid = true
++      },
++      /* wycheproof - edge case for public key */
++      {
++              .private = { 0xe0, 0x23, 0xa2, 0x89, 0xbd, 0x5e, 0x90, 0xfa,
++                           0x28, 0x04, 0xdd, 0xc0, 0x19, 0xa0, 0x5e, 0xf3,
++                           0xe7, 0x9d, 0x43, 0x4b, 0xb6, 0xea, 0x2f, 0x52,
++                           0x2e, 0xcb, 0x64, 0x3a, 0x75, 0x29, 0x6e, 0x95 },
++              .public = { 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00,
++                          0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00,
++                          0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00,
++                          0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00 },
++              .result = { 0x54, 0xce, 0x8f, 0x22, 0x75, 0xc0, 0x77, 0xe3,
++                          0xb1, 0x30, 0x6a, 0x39, 0x39, 0xc5, 0xe0, 0x3e,
++                          0xef, 0x6b, 0xbb, 0x88, 0x06, 0x05, 0x44, 0x75,
++                          0x8d, 0x9f, 0xef, 0x59, 0xb0, 0xbc, 0x3e, 0x4f },
++              .valid = true
++      },
++      /* wycheproof - edge case for public key */
++      {
++              .private = { 0x68, 0xf0, 0x10, 0xd6, 0x2e, 0xe8, 0xd9, 0x26,
++                           0x05, 0x3a, 0x36, 0x1c, 0x3a, 0x75, 0xc6, 0xea,
++                           0x4e, 0xbd, 0xc8, 0x60, 0x6a, 0xb2, 0x85, 0x00,
++                           0x3a, 0x6f, 0x8f, 0x40, 0x76, 0xb0, 0x1e, 0x83 },
++              .public = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x03 },
++              .result = { 0xf1, 0x36, 0x77, 0x5c, 0x5b, 0xeb, 0x0a, 0xf8,
++                          0x11, 0x0a, 0xf1, 0x0b, 0x20, 0x37, 0x23, 0x32,
++                          0x04, 0x3c, 0xab, 0x75, 0x24, 0x19, 0x67, 0x87,
++                          0x75, 0xa2, 0x23, 0xdf, 0x57, 0xc9, 0xd3, 0x0d },
++              .valid = true
++      },
++      /* wycheproof - edge case for public key */
++      {
++              .private = { 0x58, 0xeb, 0xcb, 0x35, 0xb0, 0xf8, 0x84, 0x5c,
++                           0xaf, 0x1e, 0xc6, 0x30, 0xf9, 0x65, 0x76, 0xb6,
++                           0x2c, 0x4b, 0x7b, 0x6c, 0x36, 0xb2, 0x9d, 0xeb,
++                           0x2c, 0xb0, 0x08, 0x46, 0x51, 0x75, 0x5c, 0x96 },
++              .public = { 0xff, 0xff, 0xff, 0xfb, 0xff, 0xff, 0xfb, 0xff,
++                          0xff, 0xdf, 0xff, 0xff, 0xdf, 0xff, 0xff, 0xff,
++                          0xfe, 0xff, 0xff, 0xfe, 0xff, 0xff, 0xf7, 0xff,
++                          0xff, 0xf7, 0xff, 0xff, 0xbf, 0xff, 0xff, 0x3f },
++              .result = { 0xbf, 0x9a, 0xff, 0xd0, 0x6b, 0x84, 0x40, 0x85,
++                          0x58, 0x64, 0x60, 0x96, 0x2e, 0xf2, 0x14, 0x6f,
++                          0xf3, 0xd4, 0x53, 0x3d, 0x94, 0x44, 0xaa, 0xb0,
++                          0x06, 0xeb, 0x88, 0xcc, 0x30, 0x54, 0x40, 0x7d },
++              .valid = true
++      },
++      /* wycheproof - edge case for public key */
++      {
++              .private = { 0x18, 0x8c, 0x4b, 0xc5, 0xb9, 0xc4, 0x4b, 0x38,
++                           0xbb, 0x65, 0x8b, 0x9b, 0x2a, 0xe8, 0x2d, 0x5b,
++                           0x01, 0x01, 0x5e, 0x09, 0x31, 0x84, 0xb1, 0x7c,
++                           0xb7, 0x86, 0x35, 0x03, 0xa7, 0x83, 0xe1, 0xbb },
++              .public = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x3f },
++              .result = { 0xd4, 0x80, 0xde, 0x04, 0xf6, 0x99, 0xcb, 0x3b,
++                          0xe0, 0x68, 0x4a, 0x9c, 0xc2, 0xe3, 0x12, 0x81,
++                          0xea, 0x0b, 0xc5, 0xa9, 0xdc, 0xc1, 0x57, 0xd3,
++                          0xd2, 0x01, 0x58, 0xd4, 0x6c, 0xa5, 0x24, 0x6d },
++              .valid = true
++      },
++      /* wycheproof - edge case for public key */
++      {
++              .private = { 0xe0, 0x6c, 0x11, 0xbb, 0x2e, 0x13, 0xce, 0x3d,
++                           0xc7, 0x67, 0x3f, 0x67, 0xf5, 0x48, 0x22, 0x42,
++                           0x90, 0x94, 0x23, 0xa9, 0xae, 0x95, 0xee, 0x98,
++                           0x6a, 0x98, 0x8d, 0x98, 0xfa, 0xee, 0x23, 0xa2 },
++              .public = { 0xff, 0xff, 0xff, 0xff, 0xfe, 0xff, 0xff, 0x7f,
++                          0xff, 0xff, 0xff, 0xff, 0xfe, 0xff, 0xff, 0x7f,
++                          0xff, 0xff, 0xff, 0xff, 0xfe, 0xff, 0xff, 0x7f,
++                          0xff, 0xff, 0xff, 0xff, 0xfe, 0xff, 0xff, 0x7f },
++              .result = { 0x4c, 0x44, 0x01, 0xcc, 0xe6, 0xb5, 0x1e, 0x4c,
++                          0xb1, 0x8f, 0x27, 0x90, 0x24, 0x6c, 0x9b, 0xf9,
++                          0x14, 0xdb, 0x66, 0x77, 0x50, 0xa1, 0xcb, 0x89,
++                          0x06, 0x90, 0x92, 0xaf, 0x07, 0x29, 0x22, 0x76 },
++              .valid = true
++      },
++      /* wycheproof - edge case for public key */
++      {
++              .private = { 0xc0, 0x65, 0x8c, 0x46, 0xdd, 0xe1, 0x81, 0x29,
++                           0x29, 0x38, 0x77, 0x53, 0x5b, 0x11, 0x62, 0xb6,
++                           0xf9, 0xf5, 0x41, 0x4a, 0x23, 0xcf, 0x4d, 0x2c,
++                           0xbc, 0x14, 0x0a, 0x4d, 0x99, 0xda, 0x2b, 0x8f },
++              .public = { 0xeb, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f },
++              .result = { 0x57, 0x8b, 0xa8, 0xcc, 0x2d, 0xbd, 0xc5, 0x75,
++                          0xaf, 0xcf, 0x9d, 0xf2, 0xb3, 0xee, 0x61, 0x89,
++                          0xf5, 0x33, 0x7d, 0x68, 0x54, 0xc7, 0x9b, 0x4c,
++                          0xe1, 0x65, 0xea, 0x12, 0x29, 0x3b, 0x3a, 0x0f },
++              .valid = true
++      },
++      /* wycheproof - public key with low order */
++      {
++              .private = { 0x10, 0x25, 0x5c, 0x92, 0x30, 0xa9, 0x7a, 0x30,
++                           0xa4, 0x58, 0xca, 0x28, 0x4a, 0x62, 0x96, 0x69,
++                           0x29, 0x3a, 0x31, 0x89, 0x0c, 0xda, 0x9d, 0x14,
++                           0x7f, 0xeb, 0xc7, 0xd1, 0xe2, 0x2d, 0x6b, 0xb1 },
++              .public = { 0xe0, 0xeb, 0x7a, 0x7c, 0x3b, 0x41, 0xb8, 0xae,
++                          0x16, 0x56, 0xe3, 0xfa, 0xf1, 0x9f, 0xc4, 0x6a,
++                          0xda, 0x09, 0x8d, 0xeb, 0x9c, 0x32, 0xb1, 0xfd,
++                          0x86, 0x62, 0x05, 0x16, 0x5f, 0x49, 0xb8, 0x00 },
++              .result = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
++              .valid = false
++      },
++      /* wycheproof - public key with low order */
++      {
++              .private = { 0x78, 0xf1, 0xe8, 0xed, 0xf1, 0x44, 0x81, 0xb3,
++                           0x89, 0x44, 0x8d, 0xac, 0x8f, 0x59, 0xc7, 0x0b,
++                           0x03, 0x8e, 0x7c, 0xf9, 0x2e, 0xf2, 0xc7, 0xef,
++                           0xf5, 0x7a, 0x72, 0x46, 0x6e, 0x11, 0x52, 0x96 },
++              .public = { 0x5f, 0x9c, 0x95, 0xbc, 0xa3, 0x50, 0x8c, 0x24,
++                          0xb1, 0xd0, 0xb1, 0x55, 0x9c, 0x83, 0xef, 0x5b,
++                          0x04, 0x44, 0x5c, 0xc4, 0x58, 0x1c, 0x8e, 0x86,
++                          0xd8, 0x22, 0x4e, 0xdd, 0xd0, 0x9f, 0x11, 0x57 },
++              .result = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
++              .valid = false
++      },
++      /* wycheproof - public key with low order */
++      {
++              .private = { 0xa0, 0xa0, 0x5a, 0x3e, 0x8f, 0x9f, 0x44, 0x20,
++                           0x4d, 0x5f, 0x80, 0x59, 0xa9, 0x4a, 0xc7, 0xdf,
++                           0xc3, 0x9a, 0x49, 0xac, 0x01, 0x6d, 0xd7, 0x43,
++                           0xdb, 0xfa, 0x43, 0xc5, 0xd6, 0x71, 0xfd, 0x88 },
++              .public = { 0xec, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f },
++              .result = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
++              .valid = false
++      },
++      /* wycheproof - public key with low order */
++      {
++              .private = { 0xd0, 0xdb, 0xb3, 0xed, 0x19, 0x06, 0x66, 0x3f,
++                           0x15, 0x42, 0x0a, 0xf3, 0x1f, 0x4e, 0xaf, 0x65,
++                           0x09, 0xd9, 0xa9, 0x94, 0x97, 0x23, 0x50, 0x06,
++                           0x05, 0xad, 0x7c, 0x1c, 0x6e, 0x74, 0x50, 0xa9 },
++              .public = { 0xed, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f },
++              .result = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
++              .valid = false
++      },
++      /* wycheproof - public key with low order */
++      {
++              .private = { 0xc0, 0xb1, 0xd0, 0xeb, 0x22, 0xb2, 0x44, 0xfe,
++                           0x32, 0x91, 0x14, 0x00, 0x72, 0xcd, 0xd9, 0xd9,
++                           0x89, 0xb5, 0xf0, 0xec, 0xd9, 0x6c, 0x10, 0x0f,
++                           0xeb, 0x5b, 0xca, 0x24, 0x1c, 0x1d, 0x9f, 0x8f },
++              .public = { 0xee, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f },
++              .result = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
++              .valid = false
++      },
++      /* wycheproof - public key with low order */
++      {
++              .private = { 0x48, 0x0b, 0xf4, 0x5f, 0x59, 0x49, 0x42, 0xa8,
++                           0xbc, 0x0f, 0x33, 0x53, 0xc6, 0xe8, 0xb8, 0x85,
++                           0x3d, 0x77, 0xf3, 0x51, 0xf1, 0xc2, 0xca, 0x6c,
++                           0x2d, 0x1a, 0xbf, 0x8a, 0x00, 0xb4, 0x22, 0x9c },
++              .public = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80 },
++              .result = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
++              .valid = false
++      },
++      /* wycheproof - public key with low order */
++      {
++              .private = { 0x30, 0xf9, 0x93, 0xfc, 0xf8, 0x51, 0x4f, 0xc8,
++                           0x9b, 0xd8, 0xdb, 0x14, 0xcd, 0x43, 0xba, 0x0d,
++                           0x4b, 0x25, 0x30, 0xe7, 0x3c, 0x42, 0x76, 0xa0,
++                           0x5e, 0x1b, 0x14, 0x5d, 0x42, 0x0c, 0xed, 0xb4 },
++              .public = { 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80 },
++              .result = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
++              .valid = false
++      },
++      /* wycheproof - public key with low order */
++      {
++              .private = { 0xc0, 0x49, 0x74, 0xb7, 0x58, 0x38, 0x0e, 0x2a,
++                           0x5b, 0x5d, 0xf6, 0xeb, 0x09, 0xbb, 0x2f, 0x6b,
++                           0x34, 0x34, 0xf9, 0x82, 0x72, 0x2a, 0x8e, 0x67,
++                           0x6d, 0x3d, 0xa2, 0x51, 0xd1, 0xb3, 0xde, 0x83 },
++              .public = { 0xe0, 0xeb, 0x7a, 0x7c, 0x3b, 0x41, 0xb8, 0xae,
++                          0x16, 0x56, 0xe3, 0xfa, 0xf1, 0x9f, 0xc4, 0x6a,
++                          0xda, 0x09, 0x8d, 0xeb, 0x9c, 0x32, 0xb1, 0xfd,
++                          0x86, 0x62, 0x05, 0x16, 0x5f, 0x49, 0xb8, 0x80 },
++              .result = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
++              .valid = false
++      },
++      /* wycheproof - public key with low order */
++      {
++              .private = { 0x50, 0x2a, 0x31, 0x37, 0x3d, 0xb3, 0x24, 0x46,
++                           0x84, 0x2f, 0xe5, 0xad, 0xd3, 0xe0, 0x24, 0x02,
++                           0x2e, 0xa5, 0x4f, 0x27, 0x41, 0x82, 0xaf, 0xc3,
++                           0xd9, 0xf1, 0xbb, 0x3d, 0x39, 0x53, 0x4e, 0xb5 },
++              .public = { 0x5f, 0x9c, 0x95, 0xbc, 0xa3, 0x50, 0x8c, 0x24,
++                          0xb1, 0xd0, 0xb1, 0x55, 0x9c, 0x83, 0xef, 0x5b,
++                          0x04, 0x44, 0x5c, 0xc4, 0x58, 0x1c, 0x8e, 0x86,
++                          0xd8, 0x22, 0x4e, 0xdd, 0xd0, 0x9f, 0x11, 0xd7 },
++              .result = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
++              .valid = false
++      },
++      /* wycheproof - public key with low order */
++      {
++              .private = { 0x90, 0xfa, 0x64, 0x17, 0xb0, 0xe3, 0x70, 0x30,
++                           0xfd, 0x6e, 0x43, 0xef, 0xf2, 0xab, 0xae, 0xf1,
++                           0x4c, 0x67, 0x93, 0x11, 0x7a, 0x03, 0x9c, 0xf6,
++                           0x21, 0x31, 0x8b, 0xa9, 0x0f, 0x4e, 0x98, 0xbe },
++              .public = { 0xec, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
++              .result = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
++              .valid = false
++      },
++      /* wycheproof - public key with low order */
++      {
++              .private = { 0x78, 0xad, 0x3f, 0x26, 0x02, 0x7f, 0x1c, 0x9f,
++                           0xdd, 0x97, 0x5a, 0x16, 0x13, 0xb9, 0x47, 0x77,
++                           0x9b, 0xad, 0x2c, 0xf2, 0xb7, 0x41, 0xad, 0xe0,
++                           0x18, 0x40, 0x88, 0x5a, 0x30, 0xbb, 0x97, 0x9c },
++              .public = { 0xed, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
++              .result = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
++              .valid = false
++      },
++      /* wycheproof - public key with low order */
++      {
++              .private = { 0x98, 0xe2, 0x3d, 0xe7, 0xb1, 0xe0, 0x92, 0x6e,
++                           0xd9, 0xc8, 0x7e, 0x7b, 0x14, 0xba, 0xf5, 0x5f,
++                           0x49, 0x7a, 0x1d, 0x70, 0x96, 0xf9, 0x39, 0x77,
++                           0x68, 0x0e, 0x44, 0xdc, 0x1c, 0x7b, 0x7b, 0x8b },
++              .public = { 0xee, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
++              .result = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
++              .valid = false
++      },
++      /* wycheproof - public key >= p */
++      {
++              .private = { 0xf0, 0x1e, 0x48, 0xda, 0xfa, 0xc9, 0xd7, 0xbc,
++                           0xf5, 0x89, 0xcb, 0xc3, 0x82, 0xc8, 0x78, 0xd1,
++                           0x8b, 0xda, 0x35, 0x50, 0x58, 0x9f, 0xfb, 0x5d,
++                           0x50, 0xb5, 0x23, 0xbe, 0xbe, 0x32, 0x9d, 0xae },
++              .public = { 0xef, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f },
++              .result = { 0xbd, 0x36, 0xa0, 0x79, 0x0e, 0xb8, 0x83, 0x09,
++                          0x8c, 0x98, 0x8b, 0x21, 0x78, 0x67, 0x73, 0xde,
++                          0x0b, 0x3a, 0x4d, 0xf1, 0x62, 0x28, 0x2c, 0xf1,
++                          0x10, 0xde, 0x18, 0xdd, 0x48, 0x4c, 0xe7, 0x4b },
++              .valid = true
++      },
++      /* wycheproof - public key >= p */
++      {
++              .private = { 0x28, 0x87, 0x96, 0xbc, 0x5a, 0xff, 0x4b, 0x81,
++                           0xa3, 0x75, 0x01, 0x75, 0x7b, 0xc0, 0x75, 0x3a,
++                           0x3c, 0x21, 0x96, 0x47, 0x90, 0xd3, 0x86, 0x99,
++                           0x30, 0x8d, 0xeb, 0xc1, 0x7a, 0x6e, 0xaf, 0x8d },
++              .public = { 0xf0, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f },
++              .result = { 0xb4, 0xe0, 0xdd, 0x76, 0xda, 0x7b, 0x07, 0x17,
++                          0x28, 0xb6, 0x1f, 0x85, 0x67, 0x71, 0xaa, 0x35,
++                          0x6e, 0x57, 0xed, 0xa7, 0x8a, 0x5b, 0x16, 0x55,
++                          0xcc, 0x38, 0x20, 0xfb, 0x5f, 0x85, 0x4c, 0x5c },
++              .valid = true
++      },
++      /* wycheproof - public key >= p */
++      {
++              .private = { 0x98, 0xdf, 0x84, 0x5f, 0x66, 0x51, 0xbf, 0x11,
++                           0x38, 0x22, 0x1f, 0x11, 0x90, 0x41, 0xf7, 0x2b,
++                           0x6d, 0xbc, 0x3c, 0x4a, 0xce, 0x71, 0x43, 0xd9,
++                           0x9f, 0xd5, 0x5a, 0xd8, 0x67, 0x48, 0x0d, 0xa8 },
++              .public = { 0xf1, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f },
++              .result = { 0x6f, 0xdf, 0x6c, 0x37, 0x61, 0x1d, 0xbd, 0x53,
++                          0x04, 0xdc, 0x0f, 0x2e, 0xb7, 0xc9, 0x51, 0x7e,
++                          0xb3, 0xc5, 0x0e, 0x12, 0xfd, 0x05, 0x0a, 0xc6,
++                          0xde, 0xc2, 0x70, 0x71, 0xd4, 0xbf, 0xc0, 0x34 },
++              .valid = true
++      },
++      /* wycheproof - public key >= p */
++      {
++              .private = { 0xf0, 0x94, 0x98, 0xe4, 0x6f, 0x02, 0xf8, 0x78,
++                           0x82, 0x9e, 0x78, 0xb8, 0x03, 0xd3, 0x16, 0xa2,
++                           0xed, 0x69, 0x5d, 0x04, 0x98, 0xa0, 0x8a, 0xbd,
++                           0xf8, 0x27, 0x69, 0x30, 0xe2, 0x4e, 0xdc, 0xb0 },
++              .public = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f },
++              .result = { 0x4c, 0x8f, 0xc4, 0xb1, 0xc6, 0xab, 0x88, 0xfb,
++                          0x21, 0xf1, 0x8f, 0x6d, 0x4c, 0x81, 0x02, 0x40,
++                          0xd4, 0xe9, 0x46, 0x51, 0xba, 0x44, 0xf7, 0xa2,
++                          0xc8, 0x63, 0xce, 0xc7, 0xdc, 0x56, 0x60, 0x2d },
++              .valid = true
++      },
++      /* wycheproof - public key >= p */
++      {
++              .private = { 0x18, 0x13, 0xc1, 0x0a, 0x5c, 0x7f, 0x21, 0xf9,
++                           0x6e, 0x17, 0xf2, 0x88, 0xc0, 0xcc, 0x37, 0x60,
++                           0x7c, 0x04, 0xc5, 0xf5, 0xae, 0xa2, 0xdb, 0x13,
++                           0x4f, 0x9e, 0x2f, 0xfc, 0x66, 0xbd, 0x9d, 0xb8 },
++              .public = { 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80 },
++              .result = { 0x1c, 0xd0, 0xb2, 0x82, 0x67, 0xdc, 0x54, 0x1c,
++                          0x64, 0x2d, 0x6d, 0x7d, 0xca, 0x44, 0xa8, 0xb3,
++                          0x8a, 0x63, 0x73, 0x6e, 0xef, 0x5c, 0x4e, 0x65,
++                          0x01, 0xff, 0xbb, 0xb1, 0x78, 0x0c, 0x03, 0x3c },
++              .valid = true
++      },
++      /* wycheproof - public key >= p */
++      {
++              .private = { 0x78, 0x57, 0xfb, 0x80, 0x86, 0x53, 0x64, 0x5a,
++                           0x0b, 0xeb, 0x13, 0x8a, 0x64, 0xf5, 0xf4, 0xd7,
++                           0x33, 0xa4, 0x5e, 0xa8, 0x4c, 0x3c, 0xda, 0x11,
++                           0xa9, 0xc0, 0x6f, 0x7e, 0x71, 0x39, 0x14, 0x9e },
++              .public = { 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80 },
++              .result = { 0x87, 0x55, 0xbe, 0x01, 0xc6, 0x0a, 0x7e, 0x82,
++                          0x5c, 0xff, 0x3e, 0x0e, 0x78, 0xcb, 0x3a, 0xa4,
++                          0x33, 0x38, 0x61, 0x51, 0x6a, 0xa5, 0x9b, 0x1c,
++                          0x51, 0xa8, 0xb2, 0xa5, 0x43, 0xdf, 0xa8, 0x22 },
++              .valid = true
++      },
++      /* wycheproof - public key >= p */
++      {
++              .private = { 0xe0, 0x3a, 0xa8, 0x42, 0xe2, 0xab, 0xc5, 0x6e,
++                           0x81, 0xe8, 0x7b, 0x8b, 0x9f, 0x41, 0x7b, 0x2a,
++                           0x1e, 0x59, 0x13, 0xc7, 0x23, 0xee, 0xd2, 0x8d,
++                           0x75, 0x2f, 0x8d, 0x47, 0xa5, 0x9f, 0x49, 0x8f },
++              .public = { 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80 },
++              .result = { 0x54, 0xc9, 0xa1, 0xed, 0x95, 0xe5, 0x46, 0xd2,
++                          0x78, 0x22, 0xa3, 0x60, 0x93, 0x1d, 0xda, 0x60,
++                          0xa1, 0xdf, 0x04, 0x9d, 0xa6, 0xf9, 0x04, 0x25,
++                          0x3c, 0x06, 0x12, 0xbb, 0xdc, 0x08, 0x74, 0x76 },
++              .valid = true
++      },
++      /* wycheproof - public key >= p */
++      {
++              .private = { 0xf8, 0xf7, 0x07, 0xb7, 0x99, 0x9b, 0x18, 0xcb,
++                           0x0d, 0x6b, 0x96, 0x12, 0x4f, 0x20, 0x45, 0x97,
++                           0x2c, 0xa2, 0x74, 0xbf, 0xc1, 0x54, 0xad, 0x0c,
++                           0x87, 0x03, 0x8c, 0x24, 0xc6, 0xd0, 0xd4, 0xb2 },
++              .public = { 0xda, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
++              .result = { 0xcc, 0x1f, 0x40, 0xd7, 0x43, 0xcd, 0xc2, 0x23,
++                          0x0e, 0x10, 0x43, 0xda, 0xba, 0x8b, 0x75, 0xe8,
++                          0x10, 0xf1, 0xfb, 0xab, 0x7f, 0x25, 0x52, 0x69,
++                          0xbd, 0x9e, 0xbb, 0x29, 0xe6, 0xbf, 0x49, 0x4f },
++              .valid = true
++      },
++      /* wycheproof - public key >= p */
++      {
++              .private = { 0xa0, 0x34, 0xf6, 0x84, 0xfa, 0x63, 0x1e, 0x1a,
++                           0x34, 0x81, 0x18, 0xc1, 0xce, 0x4c, 0x98, 0x23,
++                           0x1f, 0x2d, 0x9e, 0xec, 0x9b, 0xa5, 0x36, 0x5b,
++                           0x4a, 0x05, 0xd6, 0x9a, 0x78, 0x5b, 0x07, 0x96 },
++              .public = { 0xdb, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
++              .result = { 0x54, 0x99, 0x8e, 0xe4, 0x3a, 0x5b, 0x00, 0x7b,
++                          0xf4, 0x99, 0xf0, 0x78, 0xe7, 0x36, 0x52, 0x44,
++                          0x00, 0xa8, 0xb5, 0xc7, 0xe9, 0xb9, 0xb4, 0x37,
++                          0x71, 0x74, 0x8c, 0x7c, 0xdf, 0x88, 0x04, 0x12 },
++              .valid = true
++      },
++      /* wycheproof - public key >= p */
++      {
++              .private = { 0x30, 0xb6, 0xc6, 0xa0, 0xf2, 0xff, 0xa6, 0x80,
++                           0x76, 0x8f, 0x99, 0x2b, 0xa8, 0x9e, 0x15, 0x2d,
++                           0x5b, 0xc9, 0x89, 0x3d, 0x38, 0xc9, 0x11, 0x9b,
++                           0xe4, 0xf7, 0x67, 0xbf, 0xab, 0x6e, 0x0c, 0xa5 },
++              .public = { 0xdc, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
++              .result = { 0xea, 0xd9, 0xb3, 0x8e, 0xfd, 0xd7, 0x23, 0x63,
++                          0x79, 0x34, 0xe5, 0x5a, 0xb7, 0x17, 0xa7, 0xae,
++                          0x09, 0xeb, 0x86, 0xa2, 0x1d, 0xc3, 0x6a, 0x3f,
++                          0xee, 0xb8, 0x8b, 0x75, 0x9e, 0x39, 0x1e, 0x09 },
++              .valid = true
++      },
++      /* wycheproof - public key >= p */
++      {
++              .private = { 0x90, 0x1b, 0x9d, 0xcf, 0x88, 0x1e, 0x01, 0xe0,
++                           0x27, 0x57, 0x50, 0x35, 0xd4, 0x0b, 0x43, 0xbd,
++                           0xc1, 0xc5, 0x24, 0x2e, 0x03, 0x08, 0x47, 0x49,
++                           0x5b, 0x0c, 0x72, 0x86, 0x46, 0x9b, 0x65, 0x91 },
++              .public = { 0xea, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
++              .result = { 0x60, 0x2f, 0xf4, 0x07, 0x89, 0xb5, 0x4b, 0x41,
++                          0x80, 0x59, 0x15, 0xfe, 0x2a, 0x62, 0x21, 0xf0,
++                          0x7a, 0x50, 0xff, 0xc2, 0xc3, 0xfc, 0x94, 0xcf,
++                          0x61, 0xf1, 0x3d, 0x79, 0x04, 0xe8, 0x8e, 0x0e },
++              .valid = true
++      },
++      /* wycheproof - public key >= p */
++      {
++              .private = { 0x80, 0x46, 0x67, 0x7c, 0x28, 0xfd, 0x82, 0xc9,
++                           0xa1, 0xbd, 0xb7, 0x1a, 0x1a, 0x1a, 0x34, 0xfa,
++                           0xba, 0x12, 0x25, 0xe2, 0x50, 0x7f, 0xe3, 0xf5,
++                           0x4d, 0x10, 0xbd, 0x5b, 0x0d, 0x86, 0x5f, 0x8e },
++              .public = { 0xeb, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
++              .result = { 0xe0, 0x0a, 0xe8, 0xb1, 0x43, 0x47, 0x12, 0x47,
++                          0xba, 0x24, 0xf1, 0x2c, 0x88, 0x55, 0x36, 0xc3,
++                          0xcb, 0x98, 0x1b, 0x58, 0xe1, 0xe5, 0x6b, 0x2b,
++                          0xaf, 0x35, 0xc1, 0x2a, 0xe1, 0xf7, 0x9c, 0x26 },
++              .valid = true
++      },
++      /* wycheproof - public key >= p */
++      {
++              .private = { 0x60, 0x2f, 0x7e, 0x2f, 0x68, 0xa8, 0x46, 0xb8,
++                           0x2c, 0xc2, 0x69, 0xb1, 0xd4, 0x8e, 0x93, 0x98,
++                           0x86, 0xae, 0x54, 0xfd, 0x63, 0x6c, 0x1f, 0xe0,
++                           0x74, 0xd7, 0x10, 0x12, 0x7d, 0x47, 0x24, 0x91 },
++              .public = { 0xef, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
++              .result = { 0x98, 0xcb, 0x9b, 0x50, 0xdd, 0x3f, 0xc2, 0xb0,
++                          0xd4, 0xf2, 0xd2, 0xbf, 0x7c, 0x5c, 0xfd, 0xd1,
++                          0x0c, 0x8f, 0xcd, 0x31, 0xfc, 0x40, 0xaf, 0x1a,
++                          0xd4, 0x4f, 0x47, 0xc1, 0x31, 0x37, 0x63, 0x62 },
++              .valid = true
++      },
++      /* wycheproof - public key >= p */
++      {
++              .private = { 0x60, 0x88, 0x7b, 0x3d, 0xc7, 0x24, 0x43, 0x02,
++                           0x6e, 0xbe, 0xdb, 0xbb, 0xb7, 0x06, 0x65, 0xf4,
++                           0x2b, 0x87, 0xad, 0xd1, 0x44, 0x0e, 0x77, 0x68,
++                           0xfb, 0xd7, 0xe8, 0xe2, 0xce, 0x5f, 0x63, 0x9d },
++              .public = { 0xf0, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
++              .result = { 0x38, 0xd6, 0x30, 0x4c, 0x4a, 0x7e, 0x6d, 0x9f,
++                          0x79, 0x59, 0x33, 0x4f, 0xb5, 0x24, 0x5b, 0xd2,
++                          0xc7, 0x54, 0x52, 0x5d, 0x4c, 0x91, 0xdb, 0x95,
++                          0x02, 0x06, 0x92, 0x62, 0x34, 0xc1, 0xf6, 0x33 },
++              .valid = true
++      },
++      /* wycheproof - public key >= p */
++      {
++              .private = { 0x78, 0xd3, 0x1d, 0xfa, 0x85, 0x44, 0x97, 0xd7,
++                           0x2d, 0x8d, 0xef, 0x8a, 0x1b, 0x7f, 0xb0, 0x06,
++                           0xce, 0xc2, 0xd8, 0xc4, 0x92, 0x46, 0x47, 0xc9,
++                           0x38, 0x14, 0xae, 0x56, 0xfa, 0xed, 0xa4, 0x95 },
++              .public = { 0xf1, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
++              .result = { 0x78, 0x6c, 0xd5, 0x49, 0x96, 0xf0, 0x14, 0xa5,
++                          0xa0, 0x31, 0xec, 0x14, 0xdb, 0x81, 0x2e, 0xd0,
++                          0x83, 0x55, 0x06, 0x1f, 0xdb, 0x5d, 0xe6, 0x80,
++                          0xa8, 0x00, 0xac, 0x52, 0x1f, 0x31, 0x8e, 0x23 },
++              .valid = true
++      },
++      /* wycheproof - public key >= p */
++      {
++              .private = { 0xc0, 0x4c, 0x5b, 0xae, 0xfa, 0x83, 0x02, 0xdd,
++                           0xde, 0xd6, 0xa4, 0xbb, 0x95, 0x77, 0x61, 0xb4,
++                           0xeb, 0x97, 0xae, 0xfa, 0x4f, 0xc3, 0xb8, 0x04,
++                           0x30, 0x85, 0xf9, 0x6a, 0x56, 0x59, 0xb3, 0xa5 },
++              .public = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
++              .result = { 0x29, 0xae, 0x8b, 0xc7, 0x3e, 0x9b, 0x10, 0xa0,
++                          0x8b, 0x4f, 0x68, 0x1c, 0x43, 0xc3, 0xe0, 0xac,
++                          0x1a, 0x17, 0x1d, 0x31, 0xb3, 0x8f, 0x1a, 0x48,
++                          0xef, 0xba, 0x29, 0xae, 0x63, 0x9e, 0xa1, 0x34 },
++              .valid = true
++      },
++      /* wycheproof - RFC 7748 */
++      {
++              .private = { 0xa0, 0x46, 0xe3, 0x6b, 0xf0, 0x52, 0x7c, 0x9d,
++                           0x3b, 0x16, 0x15, 0x4b, 0x82, 0x46, 0x5e, 0xdd,
++                           0x62, 0x14, 0x4c, 0x0a, 0xc1, 0xfc, 0x5a, 0x18,
++                           0x50, 0x6a, 0x22, 0x44, 0xba, 0x44, 0x9a, 0x44 },
++              .public = { 0xe6, 0xdb, 0x68, 0x67, 0x58, 0x30, 0x30, 0xdb,
++                          0x35, 0x94, 0xc1, 0xa4, 0x24, 0xb1, 0x5f, 0x7c,
++                          0x72, 0x66, 0x24, 0xec, 0x26, 0xb3, 0x35, 0x3b,
++                          0x10, 0xa9, 0x03, 0xa6, 0xd0, 0xab, 0x1c, 0x4c },
++              .result = { 0xc3, 0xda, 0x55, 0x37, 0x9d, 0xe9, 0xc6, 0x90,
++                          0x8e, 0x94, 0xea, 0x4d, 0xf2, 0x8d, 0x08, 0x4f,
++                          0x32, 0xec, 0xcf, 0x03, 0x49, 0x1c, 0x71, 0xf7,
++                          0x54, 0xb4, 0x07, 0x55, 0x77, 0xa2, 0x85, 0x52 },
++              .valid = true
++      },
++      /* wycheproof - RFC 7748 */
++      {
++              .private = { 0x48, 0x66, 0xe9, 0xd4, 0xd1, 0xb4, 0x67, 0x3c,
++                           0x5a, 0xd2, 0x26, 0x91, 0x95, 0x7d, 0x6a, 0xf5,
++                           0xc1, 0x1b, 0x64, 0x21, 0xe0, 0xea, 0x01, 0xd4,
++                           0x2c, 0xa4, 0x16, 0x9e, 0x79, 0x18, 0xba, 0x4d },
++              .public = { 0xe5, 0x21, 0x0f, 0x12, 0x78, 0x68, 0x11, 0xd3,
++                          0xf4, 0xb7, 0x95, 0x9d, 0x05, 0x38, 0xae, 0x2c,
++                          0x31, 0xdb, 0xe7, 0x10, 0x6f, 0xc0, 0x3c, 0x3e,
++                          0xfc, 0x4c, 0xd5, 0x49, 0xc7, 0x15, 0xa4, 0x13 },
++              .result = { 0x95, 0xcb, 0xde, 0x94, 0x76, 0xe8, 0x90, 0x7d,
++                          0x7a, 0xad, 0xe4, 0x5c, 0xb4, 0xb8, 0x73, 0xf8,
++                          0x8b, 0x59, 0x5a, 0x68, 0x79, 0x9f, 0xa1, 0x52,
++                          0xe6, 0xf8, 0xf7, 0x64, 0x7a, 0xac, 0x79, 0x57 },
++              .valid = true
++      },
++      /* wycheproof - edge case for shared secret */
++      {
++              .private = { 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4,
++                           0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3,
++                           0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc,
++                           0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 },
++              .public = { 0x0a, 0xb4, 0xe7, 0x63, 0x80, 0xd8, 0x4d, 0xde,
++                          0x4f, 0x68, 0x33, 0xc5, 0x8f, 0x2a, 0x9f, 0xb8,
++                          0xf8, 0x3b, 0xb0, 0x16, 0x9b, 0x17, 0x2b, 0xe4,
++                          0xb6, 0xe0, 0x59, 0x28, 0x87, 0x74, 0x1a, 0x36 },
++              .result = { 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
++              .valid = true
++      },
++      /* wycheproof - edge case for shared secret */
++      {
++              .private = { 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4,
++                           0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3,
++                           0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc,
++                           0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 },
++              .public = { 0x89, 0xe1, 0x0d, 0x57, 0x01, 0xb4, 0x33, 0x7d,
++                          0x2d, 0x03, 0x21, 0x81, 0x53, 0x8b, 0x10, 0x64,
++                          0xbd, 0x40, 0x84, 0x40, 0x1c, 0xec, 0xa1, 0xfd,
++                          0x12, 0x66, 0x3a, 0x19, 0x59, 0x38, 0x80, 0x00 },
++              .result = { 0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
++              .valid = true
++      },
++      /* wycheproof - edge case for shared secret */
++      {
++              .private = { 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4,
++                           0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3,
++                           0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc,
++                           0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 },
++              .public = { 0x2b, 0x55, 0xd3, 0xaa, 0x4a, 0x8f, 0x80, 0xc8,
++                          0xc0, 0xb2, 0xae, 0x5f, 0x93, 0x3e, 0x85, 0xaf,
++                          0x49, 0xbe, 0xac, 0x36, 0xc2, 0xfa, 0x73, 0x94,
++                          0xba, 0xb7, 0x6c, 0x89, 0x33, 0xf8, 0xf8, 0x1d },
++              .result = { 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
++              .valid = true
++      },
++      /* wycheproof - edge case for shared secret */
++      {
++              .private = { 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4,
++                           0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3,
++                           0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc,
++                           0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 },
++              .public = { 0x63, 0xe5, 0xb1, 0xfe, 0x96, 0x01, 0xfe, 0x84,
++                          0x38, 0x5d, 0x88, 0x66, 0xb0, 0x42, 0x12, 0x62,
++                          0xf7, 0x8f, 0xbf, 0xa5, 0xaf, 0xf9, 0x58, 0x5e,
++                          0x62, 0x66, 0x79, 0xb1, 0x85, 0x47, 0xd9, 0x59 },
++              .result = { 0xfe, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x3f },
++              .valid = true
++      },
++      /* wycheproof - edge case for shared secret */
++      {
++              .private = { 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4,
++                           0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3,
++                           0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc,
++                           0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 },
++              .public = { 0xe4, 0x28, 0xf3, 0xda, 0xc1, 0x78, 0x09, 0xf8,
++                          0x27, 0xa5, 0x22, 0xce, 0x32, 0x35, 0x50, 0x58,
++                          0xd0, 0x73, 0x69, 0x36, 0x4a, 0xa7, 0x89, 0x02,
++                          0xee, 0x10, 0x13, 0x9b, 0x9f, 0x9d, 0xd6, 0x53 },
++              .result = { 0xfc, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x3f },
++              .valid = true
++      },
++      /* wycheproof - edge case for shared secret */
++      {
++              .private = { 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4,
++                           0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3,
++                           0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc,
++                           0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 },
++              .public = { 0xb3, 0xb5, 0x0e, 0x3e, 0xd3, 0xa4, 0x07, 0xb9,
++                          0x5d, 0xe9, 0x42, 0xef, 0x74, 0x57, 0x5b, 0x5a,
++                          0xb8, 0xa1, 0x0c, 0x09, 0xee, 0x10, 0x35, 0x44,
++                          0xd6, 0x0b, 0xdf, 0xed, 0x81, 0x38, 0xab, 0x2b },
++              .result = { 0xf9, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x3f },
++              .valid = true
++      },
++      /* wycheproof - edge case for shared secret */
++      {
++              .private = { 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4,
++                           0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3,
++                           0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc,
++                           0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 },
++              .public = { 0x21, 0x3f, 0xff, 0xe9, 0x3d, 0x5e, 0xa8, 0xcd,
++                          0x24, 0x2e, 0x46, 0x28, 0x44, 0x02, 0x99, 0x22,
++                          0xc4, 0x3c, 0x77, 0xc9, 0xe3, 0xe4, 0x2f, 0x56,
++                          0x2f, 0x48, 0x5d, 0x24, 0xc5, 0x01, 0xa2, 0x0b },
++              .result = { 0xf3, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x3f },
++              .valid = true
++      },
++      /* wycheproof - edge case for shared secret */
++      {
++              .private = { 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4,
++                           0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3,
++                           0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc,
++                           0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 },
++              .public = { 0x91, 0xb2, 0x32, 0xa1, 0x78, 0xb3, 0xcd, 0x53,
++                          0x09, 0x32, 0x44, 0x1e, 0x61, 0x39, 0x41, 0x8f,
++                          0x72, 0x17, 0x22, 0x92, 0xf1, 0xda, 0x4c, 0x18,
++                          0x34, 0xfc, 0x5e, 0xbf, 0xef, 0xb5, 0x1e, 0x3f },
++              .result = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x03 },
++              .valid = true
++      },
++      /* wycheproof - edge case for shared secret */
++      {
++              .private = { 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4,
++                           0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3,
++                           0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc,
++                           0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 },
++              .public = { 0x04, 0x5c, 0x6e, 0x11, 0xc5, 0xd3, 0x32, 0x55,
++                          0x6c, 0x78, 0x22, 0xfe, 0x94, 0xeb, 0xf8, 0x9b,
++                          0x56, 0xa3, 0x87, 0x8d, 0xc2, 0x7c, 0xa0, 0x79,
++                          0x10, 0x30, 0x58, 0x84, 0x9f, 0xab, 0xcb, 0x4f },
++              .result = { 0xe5, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f },
++              .valid = true
++      },
++      /* wycheproof - edge case for shared secret */
++      {
++              .private = { 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4,
++                           0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3,
++                           0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc,
++                           0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 },
++              .public = { 0x1c, 0xa2, 0x19, 0x0b, 0x71, 0x16, 0x35, 0x39,
++                          0x06, 0x3c, 0x35, 0x77, 0x3b, 0xda, 0x0c, 0x9c,
++                          0x92, 0x8e, 0x91, 0x36, 0xf0, 0x62, 0x0a, 0xeb,
++                          0x09, 0x3f, 0x09, 0x91, 0x97, 0xb7, 0xf7, 0x4e },
++              .result = { 0xe3, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f },
++              .valid = true
++      },
++      /* wycheproof - edge case for shared secret */
++      {
++              .private = { 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4,
++                           0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3,
++                           0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc,
++                           0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 },
++              .public = { 0xf7, 0x6e, 0x90, 0x10, 0xac, 0x33, 0xc5, 0x04,
++                          0x3b, 0x2d, 0x3b, 0x76, 0xa8, 0x42, 0x17, 0x10,
++                          0x00, 0xc4, 0x91, 0x62, 0x22, 0xe9, 0xe8, 0x58,
++                          0x97, 0xa0, 0xae, 0xc7, 0xf6, 0x35, 0x0b, 0x3c },
++              .result = { 0xdd, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f },
++              .valid = true
++      },
++      /* wycheproof - edge case for shared secret */
++      {
++              .private = { 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4,
++                           0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3,
++                           0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc,
++                           0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 },
++              .public = { 0xbb, 0x72, 0x68, 0x8d, 0x8f, 0x8a, 0xa7, 0xa3,
++                          0x9c, 0xd6, 0x06, 0x0c, 0xd5, 0xc8, 0x09, 0x3c,
++                          0xde, 0xc6, 0xfe, 0x34, 0x19, 0x37, 0xc3, 0x88,
++                          0x6a, 0x99, 0x34, 0x6c, 0xd0, 0x7f, 0xaa, 0x55 },
++              .result = { 0xdb, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f },
++              .valid = true
++      },
++      /* wycheproof - edge case for shared secret */
++      {
++              .private = { 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4,
++                           0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3,
++                           0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc,
++                           0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 },
++              .public = { 0x88, 0xfd, 0xde, 0xa1, 0x93, 0x39, 0x1c, 0x6a,
++                          0x59, 0x33, 0xef, 0x9b, 0x71, 0x90, 0x15, 0x49,
++                          0x44, 0x72, 0x05, 0xaa, 0xe9, 0xda, 0x92, 0x8a,
++                          0x6b, 0x91, 0xa3, 0x52, 0xba, 0x10, 0xf4, 0x1f },
++              .result = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02 },
++              .valid = true
++      },
++      /* wycheproof - edge case for shared secret */
++      {
++              .private = { 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4,
++                           0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3,
++                           0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc,
++                           0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 },
++              .public = { 0x30, 0x3b, 0x39, 0x2f, 0x15, 0x31, 0x16, 0xca,
++                          0xd9, 0xcc, 0x68, 0x2a, 0x00, 0xcc, 0xc4, 0x4c,
++                          0x95, 0xff, 0x0d, 0x3b, 0xbe, 0x56, 0x8b, 0xeb,
++                          0x6c, 0x4e, 0x73, 0x9b, 0xaf, 0xdc, 0x2c, 0x68 },
++              .result = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00 },
++              .valid = true
++      },
++      /* wycheproof - checking for overflow */
++      {
++              .private = { 0xc8, 0x17, 0x24, 0x70, 0x40, 0x00, 0xb2, 0x6d,
++                           0x31, 0x70, 0x3c, 0xc9, 0x7e, 0x3a, 0x37, 0x8d,
++                           0x56, 0xfa, 0xd8, 0x21, 0x93, 0x61, 0xc8, 0x8c,
++                           0xca, 0x8b, 0xd7, 0xc5, 0x71, 0x9b, 0x12, 0xb2 },
++              .public = { 0xfd, 0x30, 0x0a, 0xeb, 0x40, 0xe1, 0xfa, 0x58,
++                          0x25, 0x18, 0x41, 0x2b, 0x49, 0xb2, 0x08, 0xa7,
++                          0x84, 0x2b, 0x1e, 0x1f, 0x05, 0x6a, 0x04, 0x01,
++                          0x78, 0xea, 0x41, 0x41, 0x53, 0x4f, 0x65, 0x2d },
++              .result = { 0xb7, 0x34, 0x10, 0x5d, 0xc2, 0x57, 0x58, 0x5d,
++                          0x73, 0xb5, 0x66, 0xcc, 0xb7, 0x6f, 0x06, 0x27,
++                          0x95, 0xcc, 0xbe, 0xc8, 0x91, 0x28, 0xe5, 0x2b,
++                          0x02, 0xf3, 0xe5, 0x96, 0x39, 0xf1, 0x3c, 0x46 },
++              .valid = true
++      },
++      /* wycheproof - checking for overflow */
++      {
++              .private = { 0xc8, 0x17, 0x24, 0x70, 0x40, 0x00, 0xb2, 0x6d,
++                           0x31, 0x70, 0x3c, 0xc9, 0x7e, 0x3a, 0x37, 0x8d,
++                           0x56, 0xfa, 0xd8, 0x21, 0x93, 0x61, 0xc8, 0x8c,
++                           0xca, 0x8b, 0xd7, 0xc5, 0x71, 0x9b, 0x12, 0xb2 },
++              .public = { 0xc8, 0xef, 0x79, 0xb5, 0x14, 0xd7, 0x68, 0x26,
++                          0x77, 0xbc, 0x79, 0x31, 0xe0, 0x6e, 0xe5, 0xc2,
++                          0x7c, 0x9b, 0x39, 0x2b, 0x4a, 0xe9, 0x48, 0x44,
++                          0x73, 0xf5, 0x54, 0xe6, 0x67, 0x8e, 0xcc, 0x2e },
++              .result = { 0x64, 0x7a, 0x46, 0xb6, 0xfc, 0x3f, 0x40, 0xd6,
++                          0x21, 0x41, 0xee, 0x3c, 0xee, 0x70, 0x6b, 0x4d,
++                          0x7a, 0x92, 0x71, 0x59, 0x3a, 0x7b, 0x14, 0x3e,
++                          0x8e, 0x2e, 0x22, 0x79, 0x88, 0x3e, 0x45, 0x50 },
++              .valid = true
++      },
++      /* wycheproof - checking for overflow */
++      {
++              .private = { 0xc8, 0x17, 0x24, 0x70, 0x40, 0x00, 0xb2, 0x6d,
++                           0x31, 0x70, 0x3c, 0xc9, 0x7e, 0x3a, 0x37, 0x8d,
++                           0x56, 0xfa, 0xd8, 0x21, 0x93, 0x61, 0xc8, 0x8c,
++                           0xca, 0x8b, 0xd7, 0xc5, 0x71, 0x9b, 0x12, 0xb2 },
++              .public = { 0x64, 0xae, 0xac, 0x25, 0x04, 0x14, 0x48, 0x61,
++                          0x53, 0x2b, 0x7b, 0xbc, 0xb6, 0xc8, 0x7d, 0x67,
++                          0xdd, 0x4c, 0x1f, 0x07, 0xeb, 0xc2, 0xe0, 0x6e,
++                          0xff, 0xb9, 0x5a, 0xec, 0xc6, 0x17, 0x0b, 0x2c },
++              .result = { 0x4f, 0xf0, 0x3d, 0x5f, 0xb4, 0x3c, 0xd8, 0x65,
++                          0x7a, 0x3c, 0xf3, 0x7c, 0x13, 0x8c, 0xad, 0xce,
++                          0xcc, 0xe5, 0x09, 0xe4, 0xeb, 0xa0, 0x89, 0xd0,
++                          0xef, 0x40, 0xb4, 0xe4, 0xfb, 0x94, 0x61, 0x55 },
++              .valid = true
++      },
++      /* wycheproof - checking for overflow */
++      {
++              .private = { 0xc8, 0x17, 0x24, 0x70, 0x40, 0x00, 0xb2, 0x6d,
++                           0x31, 0x70, 0x3c, 0xc9, 0x7e, 0x3a, 0x37, 0x8d,
++                           0x56, 0xfa, 0xd8, 0x21, 0x93, 0x61, 0xc8, 0x8c,
++                           0xca, 0x8b, 0xd7, 0xc5, 0x71, 0x9b, 0x12, 0xb2 },
++              .public = { 0xbf, 0x68, 0xe3, 0x5e, 0x9b, 0xdb, 0x7e, 0xee,
++                          0x1b, 0x50, 0x57, 0x02, 0x21, 0x86, 0x0f, 0x5d,
++                          0xcd, 0xad, 0x8a, 0xcb, 0xab, 0x03, 0x1b, 0x14,
++                          0x97, 0x4c, 0xc4, 0x90, 0x13, 0xc4, 0x98, 0x31 },
++              .result = { 0x21, 0xce, 0xe5, 0x2e, 0xfd, 0xbc, 0x81, 0x2e,
++                          0x1d, 0x02, 0x1a, 0x4a, 0xf1, 0xe1, 0xd8, 0xbc,
++                          0x4d, 0xb3, 0xc4, 0x00, 0xe4, 0xd2, 0xa2, 0xc5,
++                          0x6a, 0x39, 0x26, 0xdb, 0x4d, 0x99, 0xc6, 0x5b },
++              .valid = true
++      },
++      /* wycheproof - checking for overflow */
++      {
++              .private = { 0xc8, 0x17, 0x24, 0x70, 0x40, 0x00, 0xb2, 0x6d,
++                           0x31, 0x70, 0x3c, 0xc9, 0x7e, 0x3a, 0x37, 0x8d,
++                           0x56, 0xfa, 0xd8, 0x21, 0x93, 0x61, 0xc8, 0x8c,
++                           0xca, 0x8b, 0xd7, 0xc5, 0x71, 0x9b, 0x12, 0xb2 },
++              .public = { 0x53, 0x47, 0xc4, 0x91, 0x33, 0x1a, 0x64, 0xb4,
++                          0x3d, 0xdc, 0x68, 0x30, 0x34, 0xe6, 0x77, 0xf5,
++                          0x3d, 0xc3, 0x2b, 0x52, 0xa5, 0x2a, 0x57, 0x7c,
++                          0x15, 0xa8, 0x3b, 0xf2, 0x98, 0xe9, 0x9f, 0x19 },
++              .result = { 0x18, 0xcb, 0x89, 0xe4, 0xe2, 0x0c, 0x0c, 0x2b,
++                          0xd3, 0x24, 0x30, 0x52, 0x45, 0x26, 0x6c, 0x93,
++                          0x27, 0x69, 0x0b, 0xbe, 0x79, 0xac, 0xb8, 0x8f,
++                          0x5b, 0x8f, 0xb3, 0xf7, 0x4e, 0xca, 0x3e, 0x52 },
++              .valid = true
++      },
++      /* wycheproof - private key == -1 (mod order) */
++      {
++              .private = { 0xa0, 0x23, 0xcd, 0xd0, 0x83, 0xef, 0x5b, 0xb8,
++                           0x2f, 0x10, 0xd6, 0x2e, 0x59, 0xe1, 0x5a, 0x68,
++                           0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++                           0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x50 },
++              .public = { 0x25, 0x8e, 0x04, 0x52, 0x3b, 0x8d, 0x25, 0x3e,
++                          0xe6, 0x57, 0x19, 0xfc, 0x69, 0x06, 0xc6, 0x57,
++                          0x19, 0x2d, 0x80, 0x71, 0x7e, 0xdc, 0x82, 0x8f,
++                          0xa0, 0xaf, 0x21, 0x68, 0x6e, 0x2f, 0xaa, 0x75 },
++              .result = { 0x25, 0x8e, 0x04, 0x52, 0x3b, 0x8d, 0x25, 0x3e,
++                          0xe6, 0x57, 0x19, 0xfc, 0x69, 0x06, 0xc6, 0x57,
++                          0x19, 0x2d, 0x80, 0x71, 0x7e, 0xdc, 0x82, 0x8f,
++                          0xa0, 0xaf, 0x21, 0x68, 0x6e, 0x2f, 0xaa, 0x75 },
++              .valid = true
++      },
++      /* wycheproof - private key == 1 (mod order) on twist */
++      {
++              .private = { 0x58, 0x08, 0x3d, 0xd2, 0x61, 0xad, 0x91, 0xef,
++                           0xf9, 0x52, 0x32, 0x2e, 0xc8, 0x24, 0xc6, 0x82,
++                           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++                           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x5f },
++              .public = { 0x2e, 0xae, 0x5e, 0xc3, 0xdd, 0x49, 0x4e, 0x9f,
++                          0x2d, 0x37, 0xd2, 0x58, 0xf8, 0x73, 0xa8, 0xe6,
++                          0xe9, 0xd0, 0xdb, 0xd1, 0xe3, 0x83, 0xef, 0x64,
++                          0xd9, 0x8b, 0xb9, 0x1b, 0x3e, 0x0b, 0xe0, 0x35 },
++              .result = { 0x2e, 0xae, 0x5e, 0xc3, 0xdd, 0x49, 0x4e, 0x9f,
++                          0x2d, 0x37, 0xd2, 0x58, 0xf8, 0x73, 0xa8, 0xe6,
++                          0xe9, 0xd0, 0xdb, 0xd1, 0xe3, 0x83, 0xef, 0x64,
++                          0xd9, 0x8b, 0xb9, 0x1b, 0x3e, 0x0b, 0xe0, 0x35 },
++              .valid = true
++      }
++};
++
++bool __init curve25519_selftest(void) /* true iff every check below passes */
++{
++      bool success = true, ret, ret2;
++      size_t i = 0, j;
++      u8 in[CURVE25519_KEY_SIZE];
++      u8 out[CURVE25519_KEY_SIZE], out2[CURVE25519_KEY_SIZE],
++         out3[CURVE25519_KEY_SIZE];
++
++      for (i = 0; i < ARRAY_SIZE(curve25519_test_vectors); ++i) {
++              memset(out, 0, CURVE25519_KEY_SIZE);    /* zero out before each vector */
++              ret = curve25519(out, curve25519_test_vectors[i].private,
++                               curve25519_test_vectors[i].public);
++              if (ret != curve25519_test_vectors[i].valid ||  /* flag and output must both match */
++                  memcmp(out, curve25519_test_vectors[i].result,
++                         CURVE25519_KEY_SIZE)) {
++                      pr_err("curve25519 self-test %zu: FAIL\n", i + 1); /* 1-based index */
++                      success = false;        /* keep going so every failure is logged */
++              }
++      }
++
++      for (i = 0; i < 5; ++i) {       /* 5 rounds with a random private key */
++              get_random_bytes(in, sizeof(in));
++              ret = curve25519_generate_public(out, in); /* compared with both calls below */
++              ret2 = curve25519(out2, in, (u8[CURVE25519_KEY_SIZE]){ 9 });
++              curve25519_generic(out3, in, (u8[CURVE25519_KEY_SIZE]){ 9 });
++              if (ret != ret2 ||      /* any disagreement between the three is a failure */
++                  memcmp(out, out2, CURVE25519_KEY_SIZE) ||
++                  memcmp(out, out3, CURVE25519_KEY_SIZE)) {
++                      pr_err("curve25519 basepoint self-test %zu: FAIL: input - 0x",
++                             i + 1);
++                      for (j = CURVE25519_KEY_SIZE; j-- > 0;) /* dump key, last byte first */
++                              printk(KERN_CONT "%02x", in[j]);
++                      printk(KERN_CONT "\n");
++                      success = false;
++              }
++      }
++
++      return success;
++}
+--- a/lib/crypto/curve25519.c
++++ b/lib/crypto/curve25519.c
+@@ -13,6 +13,8 @@
+ #include <linux/module.h>
+ #include <linux/init.h>
+ 
++bool curve25519_selftest(void);
++
+ const u8 curve25519_null_point[CURVE25519_KEY_SIZE] __aligned(32) = { 0 };
+ const u8 curve25519_base_point[CURVE25519_KEY_SIZE] __aligned(32) = { 9 };
+ 
+@@ -20,6 +22,21 @@ EXPORT_SYMBOL(curve25519_null_point);
+ EXPORT_SYMBOL(curve25519_base_point);
+ EXPORT_SYMBOL(curve25519_generic);
+ 
++static int __init mod_init(void) /* runs the curve25519 self-test at init */
++{
++      if (!IS_ENABLED(CONFIG_CRYPTO_MANAGER_DISABLE_TESTS) && /* skipped when that option is set */
++          WARN_ON(!curve25519_selftest()))    /* warn when the self-test reports failure */
++              return -ENODEV;         /* init fails if the self-test failed */
++      return 0;
++}
++
++static void __exit mod_exit(void)
++{      /* empty: mod_init() only runs the self-test, so there is nothing to undo */
++}
++
++module_init(mod_init);
++module_exit(mod_exit);
++
+ MODULE_LICENSE("GPL v2");
+ MODULE_DESCRIPTION("Curve25519 scalar multiplication");
+ MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>");
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0041-crypto-poly1305-add-new-32-and-64-bit-generic-versio.patch b/target/linux/generic/backport-5.4/080-wireguard-0041-crypto-poly1305-add-new-32-and-64-bit-generic-versio.patch

new file mode 100644 (file)

index 0000000..d843ea1
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0041-crypto-poly1305-add-new-32-and-64-bit-generic-versio.patch
@@ -0,0 +1,1165 @@
+From a9f240ba1206fb080c1b3f727dfba1512035a82b Mon Sep 17 00:00:00 2001
+From: "Jason A. Donenfeld" <Jason@zx2c4.com>
+Date: Sun, 5 Jan 2020 22:40:46 -0500
+Subject: [PATCH 041/124] crypto: poly1305 - add new 32 and 64-bit generic
+ versions
+
+commit 1c08a104360f3e18f4ee6346c21cc3923efb952e upstream.
+
+These two C implementations from Zinc -- a 32x32 one and a 64x64 one,
+depending on the platform -- come from Andrew Moon's public domain
+poly1305-donna portable code, modified for usage in the kernel. The
+precomputation in the 32-bit version and the use of 64x64 multiplies in
+the 64-bit version make these perform better than the code it replaces.
+Moon's code is also very widespread and has received many eyeballs of
+scrutiny.
+
+There's a bit of interference between the x86 implementation, which
+relies on internal details of the old scalar implementation. In the next
+commit, the x86 implementation will be replaced with a faster one that
+doesn't rely on this, so none of this matters much. But for now, to keep
+this passing the tests, we inline the bits of the old implementation
+that the x86 implementation relied on. Also, since we now support a
+slightly larger key space, via the union, some offsets had to be fixed
+up.
+
+Nonce calculation was folded in with the emit function, to take
+advantage of 64x64 arithmetic. However, Adiantum appeared to rely on no
+nonce handling in emit, so this path was conditionalized. We also
+introduced a new struct, poly1305_core_key, to represent the precise
+amount of space that particular implementation uses.
+
+Testing with kbench9000, depending on the CPU, the update function for
+the 32x32 version has been improved by 4%-7%, and for the 64x64 by
+19%-30%. The 32x32 gains are small, but I think there's great value in
+having a parallel implementation to the 64x64 one so that the two can be
+compared side-by-side as nice stand-alone units.
+
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ arch/x86/crypto/poly1305-avx2-x86_64.S |  20 +--
+ arch/x86/crypto/poly1305_glue.c        | 215 +++++++++++++++++++++++--
+ crypto/adiantum.c                      |   4 +-
+ crypto/nhpoly1305.c                    |   2 +-
+ crypto/poly1305_generic.c              |  25 ++-
+ include/crypto/internal/poly1305.h     |  45 ++----
+ include/crypto/nhpoly1305.h            |   4 +-
+ include/crypto/poly1305.h              |  26 ++-
+ lib/crypto/Makefile                    |   4 +-
+ lib/crypto/poly1305-donna32.c          | 204 +++++++++++++++++++++++
+ lib/crypto/poly1305-donna64.c          | 185 +++++++++++++++++++++
+ lib/crypto/poly1305.c                  | 169 +------------------
+ 12 files changed, 675 insertions(+), 228 deletions(-)
+ create mode 100644 lib/crypto/poly1305-donna32.c
+ create mode 100644 lib/crypto/poly1305-donna64.c
+
+--- a/arch/x86/crypto/poly1305-avx2-x86_64.S
++++ b/arch/x86/crypto/poly1305-avx2-x86_64.S
+@@ -34,16 +34,16 @@ ORMASK:    .octa 0x000000000100000000000000
+ #define u2 0x08(%r8)
+ #define u3 0x0c(%r8)
+ #define u4 0x10(%r8)
+-#define w0 0x14(%r8)
+-#define w1 0x18(%r8)
+-#define w2 0x1c(%r8)
+-#define w3 0x20(%r8)
+-#define w4 0x24(%r8)
+-#define y0 0x28(%r8)
+-#define y1 0x2c(%r8)
+-#define y2 0x30(%r8)
+-#define y3 0x34(%r8)
+-#define y4 0x38(%r8)
++#define w0 0x18(%r8)
++#define w1 0x1c(%r8)
++#define w2 0x20(%r8)
++#define w3 0x24(%r8)
++#define w4 0x28(%r8)
++#define y0 0x30(%r8)
++#define y1 0x34(%r8)
++#define y2 0x38(%r8)
++#define y3 0x3c(%r8)
++#define y4 0x40(%r8)
+ #define m %rsi
+ #define hc0 %ymm0
+ #define hc1 %ymm1
+--- a/arch/x86/crypto/poly1305_glue.c
++++ b/arch/x86/crypto/poly1305_glue.c
+@@ -25,6 +25,21 @@ asmlinkage void poly1305_4block_avx2(u32
+ static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_simd);
+ static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx2);
+ 
++static inline u64 mlt(u64 a, u64 b)
++{
++      return a * b;   /* operands are widened to u64 by the parameter types */
++}
++
++static inline u32 sr(u64 v, u_char n)
++{
++      return v >> n;  /* shifted value is truncated to u32 on return */
++}
++
++static inline u32 and(u32 v, u32 mask)
++{
++      return v & mask;        /* a u64 argument is truncated to u32 before masking */
++}
++
+ static void poly1305_simd_mult(u32 *a, const u32 *b)
+ {
+       u8 m[POLY1305_BLOCK_SIZE];
+@@ -36,6 +51,168 @@ static void poly1305_simd_mult(u32 *a, c
+       poly1305_block_sse2(a, m, b, 1);
+ }
+ 
++static void poly1305_integer_setkey(struct poly1305_key *key, const u8 *raw_key)
++{
++      /* r &= 0xffffffc0ffffffc0ffffffc0fffffff, split into limbs at 26-bit boundaries */
++      key->r[0] = (get_unaligned_le32(raw_key +  0) >> 0) & 0x3ffffff;
++      key->r[1] = (get_unaligned_le32(raw_key +  3) >> 2) & 0x3ffff03;
++      key->r[2] = (get_unaligned_le32(raw_key +  6) >> 4) & 0x3ffc0ff;
++      key->r[3] = (get_unaligned_le32(raw_key +  9) >> 6) & 0x3f03fff;
++      key->r[4] = (get_unaligned_le32(raw_key + 12) >> 8) & 0x00fffff;
++}
++
++static void poly1305_integer_blocks(struct poly1305_state *state,
++                                  const struct poly1305_key *key,
++                                  const void *src,
++                                  unsigned int nblocks, u32 hibit)
++{      /* Folds nblocks blocks read from src into state->h, multiplying by key->r. */
++      u32 r0, r1, r2, r3, r4;
++      u32 s1, s2, s3, s4;
++      u32 h0, h1, h2, h3, h4;
++      u64 d0, d1, d2, d3, d4;
++
++      if (!nblocks)   /* the do-while below always runs at least once */
++              return;
++
++      r0 = key->r[0];
++      r1 = key->r[1];
++      r2 = key->r[2];
++      r3 = key->r[3];
++      r4 = key->r[4];
++
++      s1 = r1 * 5;    /* s[i] = 5 * r[i], used by the wrapped terms of h *= r */
++      s2 = r2 * 5;
++      s3 = r3 * 5;
++      s4 = r4 * 5;
++
++      h0 = state->h[0];       /* work on local copies; stored back after the loop */
++      h1 = state->h[1];
++      h2 = state->h[2];
++      h3 = state->h[3];
++      h4 = state->h[4];
++
++      do {
++              /* h += m[i], read as 26-bit limbs; hibit lands at bit 24 of the top limb */
++              h0 += (get_unaligned_le32(src +  0) >> 0) & 0x3ffffff;
++              h1 += (get_unaligned_le32(src +  3) >> 2) & 0x3ffffff;
++              h2 += (get_unaligned_le32(src +  6) >> 4) & 0x3ffffff;
++              h3 += (get_unaligned_le32(src +  9) >> 6) & 0x3ffffff;
++              h4 += (get_unaligned_le32(src + 12) >> 8) | (hibit << 24);
++
++              /* h *= r, accumulated as five 64-bit column sums d0..d4 */
++              d0 = mlt(h0, r0) + mlt(h1, s4) + mlt(h2, s3) +
++                   mlt(h3, s2) + mlt(h4, s1);
++              d1 = mlt(h0, r1) + mlt(h1, r0) + mlt(h2, s4) +
++                   mlt(h3, s3) + mlt(h4, s2);
++              d2 = mlt(h0, r2) + mlt(h1, r1) + mlt(h2, r0) +
++                   mlt(h3, s4) + mlt(h4, s3);
++              d3 = mlt(h0, r3) + mlt(h1, r2) + mlt(h2, r1) +
++                   mlt(h3, r0) + mlt(h4, s4);
++              d4 = mlt(h0, r4) + mlt(h1, r3) + mlt(h2, r2) +
++                   mlt(h3, r1) + mlt(h4, r0);
++
++              /* (partial) h %= p: the carry out of the top limb re-enters h0 times 5 */
++              d1 += sr(d0, 26);     h0 = and(d0, 0x3ffffff);
++              d2 += sr(d1, 26);     h1 = and(d1, 0x3ffffff);
++              d3 += sr(d2, 26);     h2 = and(d2, 0x3ffffff);
++              d4 += sr(d3, 26);     h3 = and(d3, 0x3ffffff);
++              h0 += sr(d4, 26) * 5; h4 = and(d4, 0x3ffffff);
++              h1 += h0 >> 26;       h0 = h0 & 0x3ffffff;
++
++              src += POLY1305_BLOCK_SIZE;     /* advance to the next block */
++      } while (--nblocks);
++
++      state->h[0] = h0;       /* store the partially reduced accumulator */
++      state->h[1] = h1;
++      state->h[2] = h2;
++      state->h[3] = h3;
++      state->h[4] = h4;
++}
++
++static void poly1305_integer_emit(const struct poly1305_state *state, void *dst)
++{
++      u32 h0, h1, h2, h3, h4;
++      u32 g0, g1, g2, g3, g4;
++      u32 mask;
++
++      /* fully carry h */
++      h0 = state->h[0];
++      h1 = state->h[1];
++      h2 = state->h[2];
++      h3 = state->h[3];
++      h4 = state->h[4];
++
++      h2 += (h1 >> 26);     h1 = h1 & 0x3ffffff;
++      h3 += (h2 >> 26);     h2 = h2 & 0x3ffffff;
++      h4 += (h3 >> 26);     h3 = h3 & 0x3ffffff;
++      h0 += (h4 >> 26) * 5; h4 = h4 & 0x3ffffff;
++      h1 += (h0 >> 26);     h0 = h0 & 0x3ffffff;
++
++      /* compute h + -p */
++      g0 = h0 + 5;
++      g1 = h1 + (g0 >> 26);             g0 &= 0x3ffffff;
++      g2 = h2 + (g1 >> 26);             g1 &= 0x3ffffff;
++      g3 = h3 + (g2 >> 26);             g2 &= 0x3ffffff;
++      g4 = h4 + (g3 >> 26) - (1 << 26); g3 &= 0x3ffffff;
++
++      /* select h if h < p, or h + -p if h >= p */
++      mask = (g4 >> ((sizeof(u32) * 8) - 1)) - 1;
++      g0 &= mask;
++      g1 &= mask;
++      g2 &= mask;
++      g3 &= mask;
++      g4 &= mask;
++      mask = ~mask;
++      h0 = (h0 & mask) | g0;
++      h1 = (h1 & mask) | g1;
++      h2 = (h2 & mask) | g2;
++      h3 = (h3 & mask) | g3;
++      h4 = (h4 & mask) | g4;
++
++      /* h = h % (2^128) */
++      put_unaligned_le32((h0 >>  0) | (h1 << 26), dst +  0);
++      put_unaligned_le32((h1 >>  6) | (h2 << 20), dst +  4);
++      put_unaligned_le32((h2 >> 12) | (h3 << 14), dst +  8);
++      put_unaligned_le32((h3 >> 18) | (h4 <<  8), dst + 12);
++}
++
++void poly1305_init_arch(struct poly1305_desc_ctx *desc, const u8 *key)
++{
++      poly1305_integer_setkey(desc->opaque_r, key);
++      desc->s[0] = get_unaligned_le32(key + 16);
++      desc->s[1] = get_unaligned_le32(key + 20);
++      desc->s[2] = get_unaligned_le32(key + 24);
++      desc->s[3] = get_unaligned_le32(key + 28);
++      poly1305_core_init(&desc->h);
++      desc->buflen = 0;
++      desc->sset = true;
++      desc->rset = 1;
++}
++EXPORT_SYMBOL_GPL(poly1305_init_arch);
++
++static unsigned int crypto_poly1305_setdesckey(struct poly1305_desc_ctx *dctx,
++                                             const u8 *src, unsigned int srclen)
++{
++      if (!dctx->sset) {
++              if (!dctx->rset && srclen >= POLY1305_BLOCK_SIZE) {
++                      poly1305_integer_setkey(dctx->r, src);
++                      src += POLY1305_BLOCK_SIZE;
++                      srclen -= POLY1305_BLOCK_SIZE;
++                      dctx->rset = 1;
++              }
++              if (srclen >= POLY1305_BLOCK_SIZE) {
++                      dctx->s[0] = get_unaligned_le32(src +  0);
++                      dctx->s[1] = get_unaligned_le32(src +  4);
++                      dctx->s[2] = get_unaligned_le32(src +  8);
++                      dctx->s[3] = get_unaligned_le32(src + 12);
++                      src += POLY1305_BLOCK_SIZE;
++                      srclen -= POLY1305_BLOCK_SIZE;
++                      dctx->sset = true;
++              }
++      }
++      return srclen;
++}
++
+ static unsigned int poly1305_scalar_blocks(struct poly1305_desc_ctx *dctx,
+                                          const u8 *src, unsigned int srclen)
+ {
+@@ -47,8 +224,8 @@ static unsigned int poly1305_scalar_bloc
+               srclen = datalen;
+       }
+       if (srclen >= POLY1305_BLOCK_SIZE) {
+-              poly1305_core_blocks(&dctx->h, dctx->r, src,
+-                                   srclen / POLY1305_BLOCK_SIZE, 1);
++              poly1305_integer_blocks(&dctx->h, dctx->opaque_r, src,
++                                      srclen / POLY1305_BLOCK_SIZE, 1);
+               srclen %= POLY1305_BLOCK_SIZE;
+       }
+       return srclen;
+@@ -105,12 +282,6 @@ static unsigned int poly1305_simd_blocks
+       return srclen;
+ }
+ 
+-void poly1305_init_arch(struct poly1305_desc_ctx *desc, const u8 *key)
+-{
+-      poly1305_init_generic(desc, key);
+-}
+-EXPORT_SYMBOL(poly1305_init_arch);
+-
+ void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src,
+                         unsigned int srclen)
+ {
+@@ -158,9 +329,31 @@ void poly1305_update_arch(struct poly130
+ }
+ EXPORT_SYMBOL(poly1305_update_arch);
+ 
+-void poly1305_final_arch(struct poly1305_desc_ctx *desc, u8 *digest)
++void poly1305_final_arch(struct poly1305_desc_ctx *desc, u8 *dst)
+ {
+-      poly1305_final_generic(desc, digest);
++      __le32 digest[4];
++      u64 f = 0;
++
++      if (unlikely(desc->buflen)) {
++              desc->buf[desc->buflen++] = 1;
++              memset(desc->buf + desc->buflen, 0,
++                     POLY1305_BLOCK_SIZE - desc->buflen);
++              poly1305_integer_blocks(&desc->h, desc->opaque_r, desc->buf, 1, 0);
++      }
++
++      poly1305_integer_emit(&desc->h, digest);
++
++      /* mac = (h + s) % (2^128) */
++      f = (f >> 32) + le32_to_cpu(digest[0]) + desc->s[0];
++      put_unaligned_le32(f, dst + 0);
++      f = (f >> 32) + le32_to_cpu(digest[1]) + desc->s[1];
++      put_unaligned_le32(f, dst + 4);
++      f = (f >> 32) + le32_to_cpu(digest[2]) + desc->s[2];
++      put_unaligned_le32(f, dst + 8);
++      f = (f >> 32) + le32_to_cpu(digest[3]) + desc->s[3];
++      put_unaligned_le32(f, dst + 12);
++
++      *desc = (struct poly1305_desc_ctx){};
+ }
+ EXPORT_SYMBOL(poly1305_final_arch);
+ 
+@@ -183,7 +376,7 @@ static int crypto_poly1305_final(struct
+       if (unlikely(!dctx->sset))
+               return -ENOKEY;
+ 
+-      poly1305_final_generic(dctx, dst);
++      poly1305_final_arch(dctx, dst);
+       return 0;
+ }
+ 
+--- a/crypto/adiantum.c
++++ b/crypto/adiantum.c
+@@ -72,7 +72,7 @@ struct adiantum_tfm_ctx {
+       struct crypto_skcipher *streamcipher;
+       struct crypto_cipher *blockcipher;
+       struct crypto_shash *hash;
+-      struct poly1305_key header_hash_key;
++      struct poly1305_core_key header_hash_key;
+ };
+ 
+ struct adiantum_request_ctx {
+@@ -249,7 +249,7 @@ static void adiantum_hash_header(struct
+       poly1305_core_blocks(&state, &tctx->header_hash_key, req->iv,
+                            TWEAK_SIZE / POLY1305_BLOCK_SIZE, 1);
+ 
+-      poly1305_core_emit(&state, &rctx->header_hash);
++      poly1305_core_emit(&state, NULL, &rctx->header_hash);
+ }
+ 
+ /* Hash the left-hand part (the "bulk") of the message using NHPoly1305 */
+--- a/crypto/nhpoly1305.c
++++ b/crypto/nhpoly1305.c
+@@ -210,7 +210,7 @@ int crypto_nhpoly1305_final_helper(struc
+       if (state->nh_remaining)
+               process_nh_hash_value(state, key);
+ 
+-      poly1305_core_emit(&state->poly_state, dst);
++      poly1305_core_emit(&state->poly_state, NULL, dst);
+       return 0;
+ }
+ EXPORT_SYMBOL(crypto_nhpoly1305_final_helper);
+--- a/crypto/poly1305_generic.c
++++ b/crypto/poly1305_generic.c
+@@ -31,6 +31,29 @@ static int crypto_poly1305_init(struct s
+       return 0;
+ }
+ 
++static unsigned int crypto_poly1305_setdesckey(struct poly1305_desc_ctx *dctx,
++                                             const u8 *src, unsigned int srclen)
++{
++      if (!dctx->sset) {
++              if (!dctx->rset && srclen >= POLY1305_BLOCK_SIZE) {
++                      poly1305_core_setkey(&dctx->core_r, src);
++                      src += POLY1305_BLOCK_SIZE;
++                      srclen -= POLY1305_BLOCK_SIZE;
++                      dctx->rset = 2;
++              }
++              if (srclen >= POLY1305_BLOCK_SIZE) {
++                      dctx->s[0] = get_unaligned_le32(src +  0);
++                      dctx->s[1] = get_unaligned_le32(src +  4);
++                      dctx->s[2] = get_unaligned_le32(src +  8);
++                      dctx->s[3] = get_unaligned_le32(src + 12);
++                      src += POLY1305_BLOCK_SIZE;
++                      srclen -= POLY1305_BLOCK_SIZE;
++                      dctx->sset = true;
++              }
++      }
++      return srclen;
++}
++
+ static void poly1305_blocks(struct poly1305_desc_ctx *dctx, const u8 *src,
+                           unsigned int srclen)
+ {
+@@ -42,7 +65,7 @@ static void poly1305_blocks(struct poly1
+               srclen = datalen;
+       }
+ 
+-      poly1305_core_blocks(&dctx->h, dctx->r, src,
++      poly1305_core_blocks(&dctx->h, &dctx->core_r, src,
+                            srclen / POLY1305_BLOCK_SIZE, 1);
+ }
+ 
+--- a/include/crypto/internal/poly1305.h
++++ b/include/crypto/internal/poly1305.h
+@@ -11,48 +11,23 @@
+ #include <crypto/poly1305.h>
+ 
+ /*
+- * Poly1305 core functions.  These implement the ε-almost-∆-universal hash
+- * function underlying the Poly1305 MAC, i.e. they don't add an encrypted nonce
+- * ("s key") at the end.  They also only support block-aligned inputs.
++ * Poly1305 core functions.  These only accept whole blocks; the caller must
++ * handle any needed block buffering and padding.  'hibit' must be 1 for any
++ * full blocks, or 0 for the final block if it had to be padded.  If 'nonce' is
++ * non-NULL, then it's added at the end to compute the Poly1305 MAC.  Otherwise,
++ * only the ε-almost-∆-universal hash function (not the full MAC) is computed.
+  */
+-void poly1305_core_setkey(struct poly1305_key *key, const u8 *raw_key);
++
++void poly1305_core_setkey(struct poly1305_core_key *key, const u8 *raw_key);
+ static inline void poly1305_core_init(struct poly1305_state *state)
+ {
+       *state = (struct poly1305_state){};
+ }
+ 
+ void poly1305_core_blocks(struct poly1305_state *state,
+-                        const struct poly1305_key *key, const void *src,
++                        const struct poly1305_core_key *key, const void *src,
+                         unsigned int nblocks, u32 hibit);
+-void poly1305_core_emit(const struct poly1305_state *state, void *dst);
+-
+-/*
+- * Poly1305 requires a unique key for each tag, which implies that we can't set
+- * it on the tfm that gets accessed by multiple users simultaneously. Instead we
+- * expect the key as the first 32 bytes in the update() call.
+- */
+-static inline
+-unsigned int crypto_poly1305_setdesckey(struct poly1305_desc_ctx *dctx,
+-                                      const u8 *src, unsigned int srclen)
+-{
+-      if (!dctx->sset) {
+-              if (!dctx->rset && srclen >= POLY1305_BLOCK_SIZE) {
+-                      poly1305_core_setkey(dctx->r, src);
+-                      src += POLY1305_BLOCK_SIZE;
+-                      srclen -= POLY1305_BLOCK_SIZE;
+-                      dctx->rset = 1;
+-              }
+-              if (srclen >= POLY1305_BLOCK_SIZE) {
+-                      dctx->s[0] = get_unaligned_le32(src +  0);
+-                      dctx->s[1] = get_unaligned_le32(src +  4);
+-                      dctx->s[2] = get_unaligned_le32(src +  8);
+-                      dctx->s[3] = get_unaligned_le32(src + 12);
+-                      src += POLY1305_BLOCK_SIZE;
+-                      srclen -= POLY1305_BLOCK_SIZE;
+-                      dctx->sset = true;
+-              }
+-      }
+-      return srclen;
+-}
++void poly1305_core_emit(const struct poly1305_state *state, const u32 nonce[4],
++                      void *dst);
+ 
+ #endif
+--- a/include/crypto/nhpoly1305.h
++++ b/include/crypto/nhpoly1305.h
+@@ -7,7 +7,7 @@
+ #define _NHPOLY1305_H
+ 
+ #include <crypto/hash.h>
+-#include <crypto/poly1305.h>
++#include <crypto/internal/poly1305.h>
+ 
+ /* NH parameterization: */
+ 
+@@ -33,7 +33,7 @@
+ #define NHPOLY1305_KEY_SIZE   (POLY1305_BLOCK_SIZE + NH_KEY_BYTES)
+ 
+ struct nhpoly1305_key {
+-      struct poly1305_key poly_key;
++      struct poly1305_core_key poly_key;
+       u32 nh_key[NH_KEY_WORDS];
+ };
+ 
+--- a/include/crypto/poly1305.h
++++ b/include/crypto/poly1305.h
+@@ -13,12 +13,29 @@
+ #define POLY1305_KEY_SIZE     32
+ #define POLY1305_DIGEST_SIZE  16
+ 
++/* The poly1305_key and poly1305_state types are mostly opaque and
++ * implementation-defined. Limbs might be in base 2^64 or base 2^26, or
++ * different yet. The union type provided keeps these 64-bit aligned for the
++ * case in which this is implemented using 64x64 multiplies.
++ */
++
+ struct poly1305_key {
+-      u32 r[5];       /* key, base 2^26 */
++      union {
++              u32 r[5];
++              u64 r64[3];
++      };
++};
++
++struct poly1305_core_key {
++      struct poly1305_key key;
++      struct poly1305_key precomputed_s;
+ };
+ 
+ struct poly1305_state {
+-      u32 h[5];       /* accumulator, base 2^26 */
++      union {
++              u32 h[5];
++              u64 h64[3];
++      };
+ };
+ 
+ struct poly1305_desc_ctx {
+@@ -35,7 +52,10 @@ struct poly1305_desc_ctx {
+       /* accumulator */
+       struct poly1305_state h;
+       /* key */
+-      struct poly1305_key r[CONFIG_CRYPTO_LIB_POLY1305_RSIZE];
++      union {
++              struct poly1305_key opaque_r[CONFIG_CRYPTO_LIB_POLY1305_RSIZE];
++              struct poly1305_core_key core_r;
++      };
+ };
+ 
+ void poly1305_init_arch(struct poly1305_desc_ctx *desc, const u8 *key);
+--- a/lib/crypto/Makefile
++++ b/lib/crypto/Makefile
+@@ -28,7 +28,9 @@ obj-$(CONFIG_CRYPTO_LIB_DES)                 += libdes
+ libdes-y                                      := des.o
+ 
+ obj-$(CONFIG_CRYPTO_LIB_POLY1305_GENERIC)     += libpoly1305.o
+-libpoly1305-y                                 := poly1305.o
++libpoly1305-y                                 := poly1305-donna32.o
++libpoly1305-$(CONFIG_ARCH_SUPPORTS_INT128)    := poly1305-donna64.o
++libpoly1305-y                                 += poly1305.o
+ 
+ obj-$(CONFIG_CRYPTO_LIB_SHA256)                       += libsha256.o
+ libsha256-y                                   := sha256.o
+--- /dev/null
++++ b/lib/crypto/poly1305-donna32.c
+@@ -0,0 +1,204 @@
++// SPDX-License-Identifier: GPL-2.0 OR MIT
++/*
++ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
++ *
++ * This is based in part on Andrew Moon's poly1305-donna, which is in the
++ * public domain.
++ */
++
++#include <linux/kernel.h>
++#include <asm/unaligned.h>
++#include <crypto/internal/poly1305.h>
++
++void poly1305_core_setkey(struct poly1305_core_key *key, const u8 raw_key[16])
++{
++      /* r &= 0xffffffc0ffffffc0ffffffc0fffffff, as five 26-bit limbs */
++      key->key.r[0] = (get_unaligned_le32(&raw_key[0])) & 0x3ffffff;
++      key->key.r[1] = (get_unaligned_le32(&raw_key[3]) >> 2) & 0x3ffff03;
++      key->key.r[2] = (get_unaligned_le32(&raw_key[6]) >> 4) & 0x3ffc0ff;
++      key->key.r[3] = (get_unaligned_le32(&raw_key[9]) >> 6) & 0x3f03fff;
++      key->key.r[4] = (get_unaligned_le32(&raw_key[12]) >> 8) & 0x00fffff;
++
++      /* s = 5*r for r[1..4] only; read back by poly1305_core_blocks() */
++      key->precomputed_s.r[0] = key->key.r[1] * 5;
++      key->precomputed_s.r[1] = key->key.r[2] * 5;
++      key->precomputed_s.r[2] = key->key.r[3] * 5;
++      key->precomputed_s.r[3] = key->key.r[4] * 5;
++}
++EXPORT_SYMBOL(poly1305_core_setkey);
++
++void poly1305_core_blocks(struct poly1305_state *state,
++                        const struct poly1305_core_key *key, const void *src,
++                        unsigned int nblocks, u32 hibit)
++{
++      const u8 *input = src;
++      u32 r0, r1, r2, r3, r4;
++      u32 s1, s2, s3, s4;
++      u32 h0, h1, h2, h3, h4;
++      u64 d0, d1, d2, d3, d4;
++      u32 c;
++
++      if (!nblocks)
++              return;
++
++      hibit <<= 24; /* limb 4 starts at bit 104: hibit lands on bit 128 */
++
++      r0 = key->key.r[0];
++      r1 = key->key.r[1];
++      r2 = key->key.r[2];
++      r3 = key->key.r[3];
++      r4 = key->key.r[4];
++
++      s1 = key->precomputed_s.r[0];
++      s2 = key->precomputed_s.r[1];
++      s3 = key->precomputed_s.r[2];
++      s4 = key->precomputed_s.r[3];
++
++      h0 = state->h[0];
++      h1 = state->h[1];
++      h2 = state->h[2];
++      h3 = state->h[3];
++      h4 = state->h[4];
++
++      do {
++              /* h += m[i], read as five 26-bit limbs, hibit on top */
++              h0 += (get_unaligned_le32(&input[0])) & 0x3ffffff;
++              h1 += (get_unaligned_le32(&input[3]) >> 2) & 0x3ffffff;
++              h2 += (get_unaligned_le32(&input[6]) >> 4) & 0x3ffffff;
++              h3 += (get_unaligned_le32(&input[9]) >> 6) & 0x3ffffff;
++              h4 += (get_unaligned_le32(&input[12]) >> 8) | hibit;
++
++              /* h *= r; products that overflow limb 4 use s (5*r) instead */
++              d0 = ((u64)h0 * r0) + ((u64)h1 * s4) +
++                   ((u64)h2 * s3) + ((u64)h3 * s2) +
++                   ((u64)h4 * s1);
++              d1 = ((u64)h0 * r1) + ((u64)h1 * r0) +
++                   ((u64)h2 * s4) + ((u64)h3 * s3) +
++                   ((u64)h4 * s2);
++              d2 = ((u64)h0 * r2) + ((u64)h1 * r1) +
++                   ((u64)h2 * r0) + ((u64)h3 * s4) +
++                   ((u64)h4 * s3);
++              d3 = ((u64)h0 * r3) + ((u64)h1 * r2) +
++                   ((u64)h2 * r1) + ((u64)h3 * r0) +
++                   ((u64)h4 * s4);
++              d4 = ((u64)h0 * r4) + ((u64)h1 * r3) +
++                   ((u64)h2 * r2) + ((u64)h3 * r1) +
++                   ((u64)h4 * r0);
++
++              /* (partial) h %= p: carry 26 bits up, top carry wraps as *5 */
++              c = (u32)(d0 >> 26);
++              h0 = (u32)d0 & 0x3ffffff;
++              d1 += c;
++              c = (u32)(d1 >> 26);
++              h1 = (u32)d1 & 0x3ffffff;
++              d2 += c;
++              c = (u32)(d2 >> 26);
++              h2 = (u32)d2 & 0x3ffffff;
++              d3 += c;
++              c = (u32)(d3 >> 26);
++              h3 = (u32)d3 & 0x3ffffff;
++              d4 += c;
++              c = (u32)(d4 >> 26);
++              h4 = (u32)d4 & 0x3ffffff;
++              h0 += c * 5;
++              c = (h0 >> 26);
++              h0 = h0 & 0x3ffffff;
++              h1 += c;
++
++              input += POLY1305_BLOCK_SIZE;
++      } while (--nblocks);
++
++      state->h[0] = h0;
++      state->h[1] = h1;
++      state->h[2] = h2;
++      state->h[3] = h3;
++      state->h[4] = h4;
++}
++EXPORT_SYMBOL(poly1305_core_blocks);
++
++void poly1305_core_emit(const struct poly1305_state *state, const u32 nonce[4],
++                      void *dst)
++{
++      u8 *mac = dst;
++      u32 h0, h1, h2, h3, h4, c;
++      u32 g0, g1, g2, g3, g4;
++      u64 f;
++      u32 mask;
++
++      /* fully carry h */
++      h0 = state->h[0];
++      h1 = state->h[1];
++      h2 = state->h[2];
++      h3 = state->h[3];
++      h4 = state->h[4];
++
++      c = h1 >> 26;
++      h1 = h1 & 0x3ffffff;
++      h2 += c;
++      c = h2 >> 26;
++      h2 = h2 & 0x3ffffff;
++      h3 += c;
++      c = h3 >> 26;
++      h3 = h3 & 0x3ffffff;
++      h4 += c;
++      c = h4 >> 26;
++      h4 = h4 & 0x3ffffff;
++      h0 += c * 5;
++      c = h0 >> 26;
++      h0 = h0 & 0x3ffffff;
++      h1 += c;
++
++      /* compute g = h + -p, i.e. h + 5 - 2^130 */
++      g0 = h0 + 5;
++      c = g0 >> 26;
++      g0 &= 0x3ffffff;
++      g1 = h1 + c;
++      c = g1 >> 26;
++      g1 &= 0x3ffffff;
++      g2 = h2 + c;
++      c = g2 >> 26;
++      g2 &= 0x3ffffff;
++      g3 = h3 + c;
++      c = g3 >> 26;
++      g3 &= 0x3ffffff;
++      g4 = h4 + c - (1UL << 26);
++
++      /* select h if h < p, or h + -p if h >= p (mask is 0 or ~0) */
++      mask = (g4 >> ((sizeof(u32) * 8) - 1)) - 1;
++      g0 &= mask;
++      g1 &= mask;
++      g2 &= mask;
++      g3 &= mask;
++      g4 &= mask;
++      mask = ~mask;
++
++      h0 = (h0 & mask) | g0;
++      h1 = (h1 & mask) | g1;
++      h2 = (h2 & mask) | g2;
++      h3 = (h3 & mask) | g3;
++      h4 = (h4 & mask) | g4;
++
++      /* h = h % (2^128), repacked into four 32-bit words */
++      h0 = ((h0) | (h1 << 26)) & 0xffffffff;
++      h1 = ((h1 >> 6) | (h2 << 20)) & 0xffffffff;
++      h2 = ((h2 >> 12) | (h3 << 14)) & 0xffffffff;
++      h3 = ((h3 >> 18) | (h4 << 8)) & 0xffffffff;
++
++      if (likely(nonce)) {
++              /* mac = (h + nonce) % (2^128); f >> 32 is the carry */
++              f = (u64)h0 + nonce[0];
++              h0 = (u32)f;
++              f = (u64)h1 + nonce[1] + (f >> 32);
++              h1 = (u32)f;
++              f = (u64)h2 + nonce[2] + (f >> 32);
++              h2 = (u32)f;
++              f = (u64)h3 + nonce[3] + (f >> 32);
++              h3 = (u32)f;
++      }
++
++      put_unaligned_le32(h0, &mac[0]);
++      put_unaligned_le32(h1, &mac[4]);
++      put_unaligned_le32(h2, &mac[8]);
++      put_unaligned_le32(h3, &mac[12]);
++}
++EXPORT_SYMBOL(poly1305_core_emit);
+--- /dev/null
++++ b/lib/crypto/poly1305-donna64.c
+@@ -0,0 +1,185 @@
++// SPDX-License-Identifier: GPL-2.0 OR MIT
++/*
++ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
++ *
++ * This is based in part on Andrew Moon's poly1305-donna, which is in the
++ * public domain.
++ */
++
++#include <linux/kernel.h>
++#include <asm/unaligned.h>
++#include <crypto/internal/poly1305.h>
++
++typedef __uint128_t u128;
++
++void poly1305_core_setkey(struct poly1305_core_key *key, const u8 raw_key[16])
++{
++      u64 t0, t1;
++
++      /* r &= 0xffffffc0ffffffc0ffffffc0fffffff, limbs at bits 0/44/88 */
++      t0 = get_unaligned_le64(&raw_key[0]);
++      t1 = get_unaligned_le64(&raw_key[8]);
++
++      key->key.r64[0] = t0 & 0xffc0fffffffULL;
++      key->key.r64[1] = ((t0 >> 44) | (t1 << 20)) & 0xfffffc0ffffULL;
++      key->key.r64[2] = ((t1 >> 24)) & 0x00ffffffc0fULL;
++
++      /* s = 20*r for r64[1..2] only; read by poly1305_core_blocks() */
++      key->precomputed_s.r64[0] = key->key.r64[1] * 20;
++      key->precomputed_s.r64[1] = key->key.r64[2] * 20;
++}
++EXPORT_SYMBOL(poly1305_core_setkey);
++
++void poly1305_core_blocks(struct poly1305_state *state,
++                        const struct poly1305_core_key *key, const void *src,
++                        unsigned int nblocks, u32 hibit)
++{
++      const u8 *input = src;
++      u64 hibit64;
++      u64 r0, r1, r2;
++      u64 s1, s2;
++      u64 h0, h1, h2;
++      u64 c;
++      u128 d0, d1, d2, d;
++
++      if (!nblocks)
++              return;
++
++      hibit64 = ((u64)hibit) << 40; /* limb 2 starts at bit 88: bit 128 */
++
++      r0 = key->key.r64[0];
++      r1 = key->key.r64[1];
++      r2 = key->key.r64[2];
++
++      h0 = state->h64[0];
++      h1 = state->h64[1];
++      h2 = state->h64[2];
++
++      s1 = key->precomputed_s.r64[0];
++      s2 = key->precomputed_s.r64[1];
++
++      do {
++              u64 t0, t1;
++
++              /* h += m[i], read as 44/44/42-bit limbs, hibit64 on top */
++              t0 = get_unaligned_le64(&input[0]);
++              t1 = get_unaligned_le64(&input[8]);
++
++              h0 += t0 & 0xfffffffffffULL;
++              h1 += ((t0 >> 44) | (t1 << 20)) & 0xfffffffffffULL;
++              h2 += (((t1 >> 24)) & 0x3ffffffffffULL) | hibit64;
++
++              /* h *= r in 128 bits; wrapped products use s (20*r) */
++              d0 = (u128)h0 * r0;
++              d = (u128)h1 * s2;
++              d0 += d;
++              d = (u128)h2 * s1;
++              d0 += d;
++              d1 = (u128)h0 * r1;
++              d = (u128)h1 * r0;
++              d1 += d;
++              d = (u128)h2 * s2;
++              d1 += d;
++              d2 = (u128)h0 * r2;
++              d = (u128)h1 * r1;
++              d2 += d;
++              d = (u128)h2 * r0;
++              d2 += d;
++
++              /* (partial) h %= p: carry 44, 44, 42 bits; top wraps as *5 */
++              c = (u64)(d0 >> 44);
++              h0 = (u64)d0 & 0xfffffffffffULL;
++              d1 += c;
++              c = (u64)(d1 >> 44);
++              h1 = (u64)d1 & 0xfffffffffffULL;
++              d2 += c;
++              c = (u64)(d2 >> 42);
++              h2 = (u64)d2 & 0x3ffffffffffULL;
++              h0 += c * 5;
++              c = h0 >> 44;
++              h0 = h0 & 0xfffffffffffULL;
++              h1 += c;
++
++              input += POLY1305_BLOCK_SIZE;
++      } while (--nblocks);
++
++      state->h64[0] = h0;
++      state->h64[1] = h1;
++      state->h64[2] = h2;
++}
++EXPORT_SYMBOL(poly1305_core_blocks);
++
++void poly1305_core_emit(const struct poly1305_state *state, const u32 nonce[4],
++                      void *dst)
++{
++      u8 *mac = dst;
++      u64 h0, h1, h2, c;
++      u64 g0, g1, g2;
++      u64 t0, t1;
++
++      /* fully carry h */
++      h0 = state->h64[0];
++      h1 = state->h64[1];
++      h2 = state->h64[2];
++
++      c = h1 >> 44;
++      h1 &= 0xfffffffffffULL;
++      h2 += c;
++      c = h2 >> 42;
++      h2 &= 0x3ffffffffffULL;
++      h0 += c * 5;
++      c = h0 >> 44;
++      h0 &= 0xfffffffffffULL;
++      h1 += c;
++      c = h1 >> 44;
++      h1 &= 0xfffffffffffULL;
++      h2 += c;
++      c = h2 >> 42;
++      h2 &= 0x3ffffffffffULL;
++      h0 += c * 5;
++      c = h0 >> 44;
++      h0 &= 0xfffffffffffULL;
++      h1 += c;
++
++      /* compute g = h + -p, i.e. h + 5 - 2^130 */
++      g0 = h0 + 5;
++      c  = g0 >> 44;
++      g0 &= 0xfffffffffffULL;
++      g1 = h1 + c;
++      c  = g1 >> 44;
++      g1 &= 0xfffffffffffULL;
++      g2 = h2 + c - (1ULL << 42);
++
++      /* select h if h < p, or h + -p if h >= p (c is 0 or ~0) */
++      c = (g2 >> ((sizeof(u64) * 8) - 1)) - 1;
++      g0 &= c;
++      g1 &= c;
++      g2 &= c;
++      c  = ~c;
++      h0 = (h0 & c) | g0;
++      h1 = (h1 & c) | g1;
++      h2 = (h2 & c) | g2;
++
++      if (likely(nonce)) {
++              /* h = (h + nonce), nonce split into 44/44/42-bit limbs */
++              t0 = ((u64)nonce[1] << 32) | nonce[0];
++              t1 = ((u64)nonce[3] << 32) | nonce[2];
++
++              h0 += t0 & 0xfffffffffffULL;
++              c = h0 >> 44;
++              h0 &= 0xfffffffffffULL;
++              h1 += (((t0 >> 44) | (t1 << 20)) & 0xfffffffffffULL) + c;
++              c = h1 >> 44;
++              h1 &= 0xfffffffffffULL;
++              h2 += (((t1 >> 24)) & 0x3ffffffffffULL) + c;
++              h2 &= 0x3ffffffffffULL;
++      }
++
++      /* mac = h % (2^128), repacked into two 64-bit words */
++      h0 = h0 | (h1 << 44);
++      h1 = (h1 >> 20) | (h2 << 24);
++
++      put_unaligned_le64(h0, &mac[0]);
++      put_unaligned_le64(h1, &mac[8]);
++}
++EXPORT_SYMBOL(poly1305_core_emit);
+--- a/lib/crypto/poly1305.c
++++ b/lib/crypto/poly1305.c
+@@ -12,151 +12,9 @@
+ #include <linux/module.h>
+ #include <asm/unaligned.h>
+ 
+-static inline u64 mlt(u64 a, u64 b)
+-{
+-      return a * b;
+-}
+-
+-static inline u32 sr(u64 v, u_char n)
+-{
+-      return v >> n;
+-}
+-
+-static inline u32 and(u32 v, u32 mask)
+-{
+-      return v & mask;
+-}
+-
+-void poly1305_core_setkey(struct poly1305_key *key, const u8 *raw_key)
+-{
+-      /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */
+-      key->r[0] = (get_unaligned_le32(raw_key +  0) >> 0) & 0x3ffffff;
+-      key->r[1] = (get_unaligned_le32(raw_key +  3) >> 2) & 0x3ffff03;
+-      key->r[2] = (get_unaligned_le32(raw_key +  6) >> 4) & 0x3ffc0ff;
+-      key->r[3] = (get_unaligned_le32(raw_key +  9) >> 6) & 0x3f03fff;
+-      key->r[4] = (get_unaligned_le32(raw_key + 12) >> 8) & 0x00fffff;
+-}
+-EXPORT_SYMBOL_GPL(poly1305_core_setkey);
+-
+-void poly1305_core_blocks(struct poly1305_state *state,
+-                        const struct poly1305_key *key, const void *src,
+-                        unsigned int nblocks, u32 hibit)
+-{
+-      u32 r0, r1, r2, r3, r4;
+-      u32 s1, s2, s3, s4;
+-      u32 h0, h1, h2, h3, h4;
+-      u64 d0, d1, d2, d3, d4;
+-
+-      if (!nblocks)
+-              return;
+-
+-      r0 = key->r[0];
+-      r1 = key->r[1];
+-      r2 = key->r[2];
+-      r3 = key->r[3];
+-      r4 = key->r[4];
+-
+-      s1 = r1 * 5;
+-      s2 = r2 * 5;
+-      s3 = r3 * 5;
+-      s4 = r4 * 5;
+-
+-      h0 = state->h[0];
+-      h1 = state->h[1];
+-      h2 = state->h[2];
+-      h3 = state->h[3];
+-      h4 = state->h[4];
+-
+-      do {
+-              /* h += m[i] */
+-              h0 += (get_unaligned_le32(src +  0) >> 0) & 0x3ffffff;
+-              h1 += (get_unaligned_le32(src +  3) >> 2) & 0x3ffffff;
+-              h2 += (get_unaligned_le32(src +  6) >> 4) & 0x3ffffff;
+-              h3 += (get_unaligned_le32(src +  9) >> 6) & 0x3ffffff;
+-              h4 += (get_unaligned_le32(src + 12) >> 8) | (hibit << 24);
+-
+-              /* h *= r */
+-              d0 = mlt(h0, r0) + mlt(h1, s4) + mlt(h2, s3) +
+-                   mlt(h3, s2) + mlt(h4, s1);
+-              d1 = mlt(h0, r1) + mlt(h1, r0) + mlt(h2, s4) +
+-                   mlt(h3, s3) + mlt(h4, s2);
+-              d2 = mlt(h0, r2) + mlt(h1, r1) + mlt(h2, r0) +
+-                   mlt(h3, s4) + mlt(h4, s3);
+-              d3 = mlt(h0, r3) + mlt(h1, r2) + mlt(h2, r1) +
+-                   mlt(h3, r0) + mlt(h4, s4);
+-              d4 = mlt(h0, r4) + mlt(h1, r3) + mlt(h2, r2) +
+-                   mlt(h3, r1) + mlt(h4, r0);
+-
+-              /* (partial) h %= p */
+-              d1 += sr(d0, 26);     h0 = and(d0, 0x3ffffff);
+-              d2 += sr(d1, 26);     h1 = and(d1, 0x3ffffff);
+-              d3 += sr(d2, 26);     h2 = and(d2, 0x3ffffff);
+-              d4 += sr(d3, 26);     h3 = and(d3, 0x3ffffff);
+-              h0 += sr(d4, 26) * 5; h4 = and(d4, 0x3ffffff);
+-              h1 += h0 >> 26;       h0 = h0 & 0x3ffffff;
+-
+-              src += POLY1305_BLOCK_SIZE;
+-      } while (--nblocks);
+-
+-      state->h[0] = h0;
+-      state->h[1] = h1;
+-      state->h[2] = h2;
+-      state->h[3] = h3;
+-      state->h[4] = h4;
+-}
+-EXPORT_SYMBOL_GPL(poly1305_core_blocks);
+-
+-void poly1305_core_emit(const struct poly1305_state *state, void *dst)
+-{
+-      u32 h0, h1, h2, h3, h4;
+-      u32 g0, g1, g2, g3, g4;
+-      u32 mask;
+-
+-      /* fully carry h */
+-      h0 = state->h[0];
+-      h1 = state->h[1];
+-      h2 = state->h[2];
+-      h3 = state->h[3];
+-      h4 = state->h[4];
+-
+-      h2 += (h1 >> 26);     h1 = h1 & 0x3ffffff;
+-      h3 += (h2 >> 26);     h2 = h2 & 0x3ffffff;
+-      h4 += (h3 >> 26);     h3 = h3 & 0x3ffffff;
+-      h0 += (h4 >> 26) * 5; h4 = h4 & 0x3ffffff;
+-      h1 += (h0 >> 26);     h0 = h0 & 0x3ffffff;
+-
+-      /* compute h + -p */
+-      g0 = h0 + 5;
+-      g1 = h1 + (g0 >> 26);             g0 &= 0x3ffffff;
+-      g2 = h2 + (g1 >> 26);             g1 &= 0x3ffffff;
+-      g3 = h3 + (g2 >> 26);             g2 &= 0x3ffffff;
+-      g4 = h4 + (g3 >> 26) - (1 << 26); g3 &= 0x3ffffff;
+-
+-      /* select h if h < p, or h + -p if h >= p */
+-      mask = (g4 >> ((sizeof(u32) * 8) - 1)) - 1;
+-      g0 &= mask;
+-      g1 &= mask;
+-      g2 &= mask;
+-      g3 &= mask;
+-      g4 &= mask;
+-      mask = ~mask;
+-      h0 = (h0 & mask) | g0;
+-      h1 = (h1 & mask) | g1;
+-      h2 = (h2 & mask) | g2;
+-      h3 = (h3 & mask) | g3;
+-      h4 = (h4 & mask) | g4;
+-
+-      /* h = h % (2^128) */
+-      put_unaligned_le32((h0 >>  0) | (h1 << 26), dst +  0);
+-      put_unaligned_le32((h1 >>  6) | (h2 << 20), dst +  4);
+-      put_unaligned_le32((h2 >> 12) | (h3 << 14), dst +  8);
+-      put_unaligned_le32((h3 >> 18) | (h4 <<  8), dst + 12);
+-}
+-EXPORT_SYMBOL_GPL(poly1305_core_emit);
+-
+ void poly1305_init_generic(struct poly1305_desc_ctx *desc, const u8 *key)
+ {
+-      poly1305_core_setkey(desc->r, key);
++      poly1305_core_setkey(&desc->core_r, key);
+       desc->s[0] = get_unaligned_le32(key + 16);
+       desc->s[1] = get_unaligned_le32(key + 20);
+       desc->s[2] = get_unaligned_le32(key + 24);
+@@ -164,7 +22,7 @@ void poly1305_init_generic(struct poly13
+       poly1305_core_init(&desc->h);
+       desc->buflen = 0;
+       desc->sset = true;
+-      desc->rset = 1;
++      desc->rset = 2;
+ }
+ EXPORT_SYMBOL_GPL(poly1305_init_generic);
+ 
+@@ -181,13 +39,14 @@ void poly1305_update_generic(struct poly
+               desc->buflen += bytes;
+ 
+               if (desc->buflen == POLY1305_BLOCK_SIZE) {
+-                      poly1305_core_blocks(&desc->h, desc->r, desc->buf, 1, 1);
++                      poly1305_core_blocks(&desc->h, &desc->core_r, desc->buf,
++                                           1, 1);
+                       desc->buflen = 0;
+               }
+       }
+ 
+       if (likely(nbytes >= POLY1305_BLOCK_SIZE)) {
+-              poly1305_core_blocks(&desc->h, desc->r, src,
++              poly1305_core_blocks(&desc->h, &desc->core_r, src,
+                                    nbytes / POLY1305_BLOCK_SIZE, 1);
+               src += nbytes - (nbytes % POLY1305_BLOCK_SIZE);
+               nbytes %= POLY1305_BLOCK_SIZE;
+@@ -202,28 +61,14 @@ EXPORT_SYMBOL_GPL(poly1305_update_generi
+ 
+ void poly1305_final_generic(struct poly1305_desc_ctx *desc, u8 *dst)
+ {
+-      __le32 digest[4];
+-      u64 f = 0;
+-
+       if (unlikely(desc->buflen)) {
+               desc->buf[desc->buflen++] = 1;
+               memset(desc->buf + desc->buflen, 0,
+                      POLY1305_BLOCK_SIZE - desc->buflen);
+-              poly1305_core_blocks(&desc->h, desc->r, desc->buf, 1, 0);
++              poly1305_core_blocks(&desc->h, &desc->core_r, desc->buf, 1, 0);
+       }
+ 
+-      poly1305_core_emit(&desc->h, digest);
+-
+-      /* mac = (h + s) % (2^128) */
+-      f = (f >> 32) + le32_to_cpu(digest[0]) + desc->s[0];
+-      put_unaligned_le32(f, dst + 0);
+-      f = (f >> 32) + le32_to_cpu(digest[1]) + desc->s[1];
+-      put_unaligned_le32(f, dst + 4);
+-      f = (f >> 32) + le32_to_cpu(digest[2]) + desc->s[2];
+-      put_unaligned_le32(f, dst + 8);
+-      f = (f >> 32) + le32_to_cpu(digest[3]) + desc->s[3];
+-      put_unaligned_le32(f, dst + 12);
+-
++      poly1305_core_emit(&desc->h, desc->s, dst);
+       *desc = (struct poly1305_desc_ctx){};
+ }
+ EXPORT_SYMBOL_GPL(poly1305_final_generic);
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0042-crypto-x86-poly1305-import-unmodified-cryptogams-imp.patch b/target/linux/generic/backport-5.4/080-wireguard-0042-crypto-x86-poly1305-import-unmodified-cryptogams-imp.patch

new file mode 100644 (file)

index 0000000..7c80309
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0042-crypto-x86-poly1305-import-unmodified-cryptogams-imp.patch
@@ -0,0 +1,4183 @@
+From 6dbd1094c7b9897a3264418cd6543fae1a0bcade Mon Sep 17 00:00:00 2001
+From: "Jason A. Donenfeld" <Jason@zx2c4.com>
+Date: Sun, 5 Jan 2020 22:40:47 -0500
+Subject: [PATCH 042/124] crypto: x86/poly1305 - import unmodified cryptogams
+ implementation
+
+commit 0896ca2a0cb6127e8a129f1f2a680d49b6b0f65c upstream.
+
+These x86_64 vectorized implementations come from Andy Polyakov's
+CRYPTOGAMS implementation, and are included here in raw form without
+modification, so that subsequent commits that fix these up for the
+kernel can see how it has changed.
+
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ arch/x86/crypto/poly1305-x86_64-cryptogams.pl | 4159 +++++++++++++++++
+ 1 file changed, 4159 insertions(+)
+ create mode 100644 arch/x86/crypto/poly1305-x86_64-cryptogams.pl
+
+--- /dev/null
++++ b/arch/x86/crypto/poly1305-x86_64-cryptogams.pl
+@@ -0,0 +1,4159 @@
++#! /usr/bin/env perl
++# Copyright 2016-2018 The OpenSSL Project Authors. All Rights Reserved.
++#
++# Licensed under the OpenSSL license (the "License").  You may not use
++# this file except in compliance with the License.  You can obtain a copy
++# in the file LICENSE in the source distribution or at
++# https://www.openssl.org/source/license.html
++
++#
++# ====================================================================
++# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
++# project. The module is, however, dual licensed under OpenSSL and
++# CRYPTOGAMS licenses depending on where you obtain it. For further
++# details see http://www.openssl.org/~appro/cryptogams/.
++# ====================================================================
++#
++# This module implements Poly1305 hash for x86_64.
++#
++# March 2015
++#
++# Initial release.
++#
++# December 2016
++#
++# Add AVX512F+VL+BW code path.
++#
++# November 2017
++#
++# Convert AVX512F+VL+BW code path to pure AVX512F, so that it can be
++# executed even on Knights Landing. Trigger for modification was
++# observation that AVX512 code paths can negatively affect overall
++# Skylake-X system performance. Since we are likely to suppress
++# AVX512F capability flag [at least on Skylake-X], conversion serves
++# as kind of "investment protection". Note that next *lake processor,
++# Cannonlake, has AVX512IFMA code path to execute...
++#
++# Numbers are cycles per processed byte with poly1305_blocks alone,
++# measured with rdtsc at fixed clock frequency.
++#
++#             IALU/gcc-4.8(*) AVX(**)         AVX2    AVX-512
++# P4          4.46/+120%      -
++# Core 2      2.41/+90%       -
++# Westmere    1.88/+120%      -
++# Sandy Bridge        1.39/+140%      1.10
++# Haswell     1.14/+175%      1.11            0.65
++# Skylake[-X] 1.13/+120%      0.96            0.51    [0.35]
++# Silvermont  2.83/+95%       -
++# Knights L   3.60/?          1.65            1.10    0.41(***)
++# Goldmont    1.70/+180%      -
++# VIA Nano    1.82/+150%      -
++# Sledgehammer        1.38/+160%      -
++# Bulldozer   2.30/+130%      0.97
++# Ryzen               1.15/+200%      1.08            1.18
++#
++# (*) improvement coefficients relative to clang are more modest and
++#     are ~50% on most processors, in both cases we are comparing to
++#     __int128 code;
++# (**)        SSE2 implementation was attempted, but among non-AVX processors
++#     it was faster than integer-only code only on older Intel P4 and
++#     Core processors, 50-30%, less newer processor is, but slower on
++#     contemporary ones, for example almost 2x slower on Atom, and as
++#     former are naturally disappearing, SSE2 is deemed unnecessary;
++# (***)       strangely enough performance seems to vary from core to core,
++#     listed result is best case;
++
++$flavour = shift;                      # first argument, later handed to x86_64-xlate.pl
++$output  = shift;                      # second argument: output file name
++if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } # a lone argument containing a dot is the output
++
++$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
++
++$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; # directory part of this script's own path
++( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
++( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
++die "can't locate x86_64-xlate.pl";
++
++if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
++              =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
++      $avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25) + ($1>=2.26); # level 0..4: nonzero emits AVX code, >1 AVX2, >3 the base 2^44 check in poly1305_init
++}
++
++if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
++         `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
++      $avx = ($1>=2.09) + ($1>=2.10) + 2 * ($1>=2.12); # same 0..4 scale, derived from the NASM version
++      $avx += 2 if ($1==2.11 && $2>=8); # 2.11 with patch level 8 or higher also reaches level 4
++}
++
++if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
++         `ml64 2>&1` =~ /Version ([0-9]+)\./) {
++      $avx = ($1>=10) + ($1>=12); # ml64 yields at most level 2 here
++}
++
++if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
++      $avx = ($2>=3.0) + ($2>3.0); # LLVM 3.0 gives level 1, anything newer level 2
++}
++
++open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; # run the translator script as a filter
++*STDOUT=*OUT; # everything printed from here on goes through that filter
++
++my ($ctx,$inp,$len,$padbit)=("%rdi","%rsi","%rdx","%rcx"); # the four argument registers of the blocks functions
++my ($mac,$nonce)=($inp,$len); # *_emit arguments
++my ($d1,$d2,$d3, $r0,$r1,$s1)=map("%r$_",(8..13)); # %r8 .. %r13, in this order
++my ($h0,$h1,$h2)=("%r14","%rbx","%rbp"); # the three hash accumulator words
++
++sub poly1305_iteration {
++# input:      copy of $r1 in %rax, $h0-$h2, $r0-$r1, and $s1 = $r1 + ($r1 >> 2)
++# output:     $h0-$h2 *= $r0-$r1, high bits folded back in; clobbers %rax, %rdx, $d1-$d3
++$code.=<<___;
++      mulq    $h0                     # h0*r1
++      mov     %rax,$d2                # d2 = low half of h0*r1
++       mov    $r0,%rax
++      mov     %rdx,$d3                # d3 = high half of h0*r1
++
++      mulq    $h0                     # h0*r0
++      mov     %rax,$h0                # future $h0
++       mov    $r0,%rax
++      mov     %rdx,$d1                # d1 = high half of h0*r0
++
++      mulq    $h1                     # h1*r0
++      add     %rax,$d2
++       mov    $s1,%rax
++      adc     %rdx,$d3                # d3:d2 += h1*r0
++
++      mulq    $h1                     # h1*s1
++       mov    $h2,$h1                 # borrow $h1
++      add     %rax,$h0
++      adc     %rdx,$d1                # d1:h0 += h1*s1
++
++      imulq   $s1,$h1                 # h2*s1
++      add     $h1,$d2
++       mov    $d1,$h1                 # mov leaves the carry of the add above intact
++      adc     \$0,$d3
++
++      imulq   $r0,$h2                 # h2*r0
++      add     $d2,$h1                 # middle word = d1 + d2
++      mov     \$-4,%rax               # mask value
++      adc     $h2,$d3                 # top word = d3 + h2*r0 + carry
++
++      and     $d3,%rax                # last reduction step
++      mov     $d3,$h2
++      shr     \$2,$d3
++      and     \$3,$h2                 # new top word keeps only the low two bits
++      add     $d3,%rax                # (top & -4) + (top >> 2) = 5 * (top >> 2)
++      add     %rax,$h0
++      adc     \$0,$h1
++      adc     \$0,$h2                 # ripple the carry up through all three words
++___
++}
++
++########################################################################
++# Layout of opaque area is following.
++#
++#     unsigned __int64 h[3];          # current hash value base 2^64
++#     unsigned __int64 r[2];          # key value base 2^64
++
++$code.=<<___;
++.text
++
++.extern       OPENSSL_ia32cap_P
++
++.globl        poly1305_init
++.hidden       poly1305_init
++.globl        poly1305_blocks
++.hidden       poly1305_blocks
++.globl        poly1305_emit
++.hidden       poly1305_emit
++
++.type poly1305_init,\@function,3
++.align        32
++poly1305_init:
++      xor     %rax,%rax               # zero; doubles as the return value on the .Lno_key path
++      mov     %rax,0($ctx)            # initialize hash value
++      mov     %rax,8($ctx)
++      mov     %rax,16($ctx)
++
++      cmp     \$0,$inp                # second argument (key pointer) NULL?
++      je      .Lno_key                # then only the hash is cleared and 0 is returned
++
++      lea     poly1305_blocks(%rip),%r10      # default pair: the scalar routines
++      lea     poly1305_emit(%rip),%r11
++___
++$code.=<<___  if ($avx);
++      mov     OPENSSL_ia32cap_P+4(%rip),%r9   # 8 bytes starting at the second 32-bit capability word
++      lea     poly1305_blocks_avx(%rip),%rax
++      lea     poly1305_emit_avx(%rip),%rcx
++      bt      \$`60-32`,%r9           # AVX?
++      cmovc   %rax,%r10               # if so, switch both pointers to the AVX routines
++      cmovc   %rcx,%r11
++___
++$code.=<<___  if ($avx>1);
++      lea     poly1305_blocks_avx2(%rip),%rax
++      bt      \$`5+32`,%r9            # AVX2?
++      cmovc   %rax,%r10               # if so, only the blocks pointer changes
++___
++$code.=<<___  if ($avx>3);
++      mov     \$`(1<<31|1<<21|1<<16)`,%rax    # three bits of the third capability word (presumably AVX512 VL/IFMA/F - confirm)
++      shr     \$32,%r9                # bring that third word down to bits 0..31
++      and     %rax,%r9
++      cmp     %rax,%r9                # all three bits set?
++      je      .Linit_base2_44         # label is defined outside this part of the file
++___
++$code.=<<___;
++      mov     \$0x0ffffffc0fffffff,%rax       # masks applied to the two key words before they are stored
++      mov     \$0x0ffffffc0ffffffc,%rcx
++      and     0($inp),%rax
++      and     8($inp),%rcx
++      mov     %rax,24($ctx)           # masked key goes to offsets 24 and 32, right after h[3]
++      mov     %rcx,32($ctx)
++___
++$code.=<<___  if ($flavour !~ /elf32/);
++      mov     %r10,0(%rdx)            # third argument: table receiving the blocks and emit pointers
++      mov     %r11,8(%rdx)
++___
++$code.=<<___  if ($flavour =~ /elf32/);
++      mov     %r10d,0(%rdx)           # same table, but with 32-bit pointer slots
++      mov     %r11d,4(%rdx)
++___
++$code.=<<___;
++      mov     \$1,%eax                # return 1 once the pointer table has been filled in
++.Lno_key:
++      ret
++.size poly1305_init,.-poly1305_init
++
++.type poly1305_blocks,\@function,4
++.align        32
++poly1305_blocks:
++.cfi_startproc
++.Lblocks:
++      shr     \$4,$len                # byte count -> number of whole 16-byte blocks
++      jz      .Lno_data               # too short
++
++      push    %rbx                    # six registers are saved; all are reloaded before ret
++.cfi_push     %rbx
++      push    %rbp
++.cfi_push     %rbp
++      push    %r12
++.cfi_push     %r12
++      push    %r13
++.cfi_push     %r13
++      push    %r14
++.cfi_push     %r14
++      push    %r15
++.cfi_push     %r15
++.Lblocks_body:
++
++      mov     $len,%r15               # reassign $len
++
++      mov     24($ctx),$r0            # load r
++      mov     32($ctx),$s1            # second key word, turned into s1 below
++
++      mov     0($ctx),$h0             # load hash value
++      mov     8($ctx),$h1
++      mov     16($ctx),$h2
++
++      mov     $s1,$r1
++      shr     \$2,$s1
++      mov     $r1,%rax                # the multiply sequence expects a copy of r1 in rax
++      add     $r1,$s1                 # s1 = r1 + (r1 >> 2)
++      jmp     .Loop
++
++.align        32
++.Loop:
++      add     0($inp),$h0             # accumulate input
++      adc     8($inp),$h1
++      lea     16($inp),$inp           # advance the input pointer; lea keeps the carry for the adc below
++      adc     $padbit,$h2             # fourth argument is added into the top word
++___
++      &poly1305_iteration();          # emit the multiply-and-reduce sequence inline
++$code.=<<___;
++      mov     $r1,%rax                # rax was clobbered; reload r1 for the next round
++      dec     %r15                    # one 16-byte block consumed (len-=16)
++      jnz     .Loop
++
++      mov     $h0,0($ctx)             # store hash value
++      mov     $h1,8($ctx)
++      mov     $h2,16($ctx)
++
++      mov     0(%rsp),%r15            # reload the saved registers, last pushed first
++.cfi_restore  %r15
++      mov     8(%rsp),%r14
++.cfi_restore  %r14
++      mov     16(%rsp),%r13
++.cfi_restore  %r13
++      mov     24(%rsp),%r12
++.cfi_restore  %r12
++      mov     32(%rsp),%rbp
++.cfi_restore  %rbp
++      mov     40(%rsp),%rbx
++.cfi_restore  %rbx
++      lea     48(%rsp),%rsp           # release the six 8-byte save slots
++.cfi_adjust_cfa_offset        -48
++.Lno_data:
++.Lblocks_epilogue:
++      ret
++.cfi_endproc
++.size poly1305_blocks,.-poly1305_blocks
++
++.type poly1305_emit,\@function,3
++.align        32
++poly1305_emit:
++.Lemit:
++      mov     0($ctx),%r8     # load hash value
++      mov     8($ctx),%r9
++      mov     16($ctx),%r10
++
++      mov     %r8,%rax        # rax:rcx keep the unmodified low 128 bits
++      add     \$5,%r8         # compare to modulus
++      mov     %r9,%rcx
++      adc     \$0,%r9
++      adc     \$0,%r10
++      shr     \$2,%r10        # did 130-bit value overflow?
++      cmovnz  %r8,%rax        # if so, use the low 128 bits of hash + 5 instead
++      cmovnz  %r9,%rcx
++
++      add     0($nonce),%rax  # accumulate nonce
++      adc     8($nonce),%rcx  # any carry out of the upper word is discarded
++      mov     %rax,0($mac)    # write result
++      mov     %rcx,8($mac)
++
++      ret
++.size poly1305_emit,.-poly1305_emit
++___
++if ($avx) {
++
++########################################################################
++# Layout of opaque area is following.
++#
++#     unsigned __int32 h[5];          # current hash value base 2^26
++#     unsigned __int32 is_base2_26;
++#     unsigned __int64 r[2];          # key value base 2^64
++#     unsigned __int64 pad;
++#     struct { unsigned __int32 r^2, r^1, r^4, r^3; } r[9];
++#
++# where r^n are base 2^26 digits of degrees of multiplier key. There are
++# 5 digits, but last four are interleaved with multiples of 5, totalling
++# in 9 elements: r0, r1, 5*r1, r2, 5*r2, r3, 5*r3, r4, 5*r4.
++
++my ($H0,$H1,$H2,$H3,$H4, $T0,$T1,$T2,$T3,$T4, $D0,$D1,$D2,$D3,$D4, $MASK) =
++    map("%xmm$_",(0..15));
++
++$code.=<<___;
++.type __poly1305_block,\@abi-omnipotent
++.align        32
++__poly1305_block:
++___
++      &poly1305_iteration();          # the whole body is the shared multiply-and-reduce sequence
++$code.=<<___;
++      ret                             # callable form of that sequence; operands and result stay in registers
++.size __poly1305_block,.-__poly1305_block
++
++.type __poly1305_init_avx,\@abi-omnipotent
++.align        32
++__poly1305_init_avx:
++      mov     $r0,$h0                 # start from h = r so the first multiply gives r^2
++      mov     $r1,$h1
++      xor     $h2,$h2
++
++      lea     48+64($ctx),$ctx        # size optimization: the table starts at offset 48, addressed with a 64-byte bias
++
++      mov     $r1,%rax
++      call    __poly1305_block        # r^2
++
++      mov     \$0x3ffffff,%eax        # save interleaved r^2 and r base 2^26
++      mov     \$0x3ffffff,%edx
++      mov     $h0,$d1
++      and     $h0#d,%eax
++      mov     $r0,$d2
++      and     $r0#d,%edx
++      mov     %eax,`16*0+0-64`($ctx)  # digit 0 of r^2 (slot +0 of each 16-byte row)
++      shr     \$26,$d1
++      mov     %edx,`16*0+4-64`($ctx)  # digit 0 of r (slot +4 of each row)
++      shr     \$26,$d2
++
++      mov     \$0x3ffffff,%eax
++      mov     \$0x3ffffff,%edx
++      and     $d1#d,%eax
++      and     $d2#d,%edx
++      mov     %eax,`16*1+0-64`($ctx)  # digit 1
++      lea     (%rax,%rax,4),%eax      # *5
++      mov     %edx,`16*1+4-64`($ctx)
++      lea     (%rdx,%rdx,4),%edx      # *5
++      mov     %eax,`16*2+0-64`($ctx)  # 5 * digit 1
++      shr     \$26,$d1
++      mov     %edx,`16*2+4-64`($ctx)
++      shr     \$26,$d2
++
++      mov     $h1,%rax
++      mov     $r1,%rdx
++      shl     \$12,%rax               # make room for the 12 bits left over from the low word
++      shl     \$12,%rdx
++      or      $d1,%rax
++      or      $d2,%rdx
++      and     \$0x3ffffff,%eax
++      and     \$0x3ffffff,%edx
++      mov     %eax,`16*3+0-64`($ctx)  # digit 2
++      lea     (%rax,%rax,4),%eax      # *5
++      mov     %edx,`16*3+4-64`($ctx)
++      lea     (%rdx,%rdx,4),%edx      # *5
++      mov     %eax,`16*4+0-64`($ctx)  # 5 * digit 2
++      mov     $h1,$d1
++      mov     %edx,`16*4+4-64`($ctx)
++      mov     $r1,$d2
++
++      mov     \$0x3ffffff,%eax
++      mov     \$0x3ffffff,%edx
++      shr     \$14,$d1                # digit 3 starts at bit 14 of the second word
++      shr     \$14,$d2
++      and     $d1#d,%eax
++      and     $d2#d,%edx
++      mov     %eax,`16*5+0-64`($ctx)  # digit 3
++      lea     (%rax,%rax,4),%eax      # *5
++      mov     %edx,`16*5+4-64`($ctx)
++      lea     (%rdx,%rdx,4),%edx      # *5
++      mov     %eax,`16*6+0-64`($ctx)  # 5 * digit 3
++      shr     \$26,$d1
++      mov     %edx,`16*6+4-64`($ctx)
++      shr     \$26,$d2
++
++      mov     $h2,%rax
++      shl     \$24,%rax               # third word of r^2 lands above the 24 bits left in d1
++      or      %rax,$d1
++      mov     $d1#d,`16*7+0-64`($ctx)
++      lea     ($d1,$d1,4),$d1         # *5
++      mov     $d2#d,`16*7+4-64`($ctx)
++      lea     ($d2,$d2,4),$d2         # *5
++      mov     $d1#d,`16*8+0-64`($ctx)
++      mov     $d2#d,`16*8+4-64`($ctx)
++
++      mov     $r1,%rax
++      call    __poly1305_block        # r^3
++
++      mov     \$0x3ffffff,%eax        # save r^3 base 2^26
++      mov     $h0,$d1
++      and     $h0#d,%eax
++      shr     \$26,$d1
++      mov     %eax,`16*0+12-64`($ctx) # r^3 digits use slot +12 of each row
++
++      mov     \$0x3ffffff,%edx
++      and     $d1#d,%edx
++      mov     %edx,`16*1+12-64`($ctx)
++      lea     (%rdx,%rdx,4),%edx      # *5
++      shr     \$26,$d1
++      mov     %edx,`16*2+12-64`($ctx)
++
++      mov     $h1,%rax
++      shl     \$12,%rax
++      or      $d1,%rax
++      and     \$0x3ffffff,%eax
++      mov     %eax,`16*3+12-64`($ctx)
++      lea     (%rax,%rax,4),%eax      # *5
++      mov     $h1,$d1
++      mov     %eax,`16*4+12-64`($ctx)
++
++      mov     \$0x3ffffff,%edx
++      shr     \$14,$d1
++      and     $d1#d,%edx
++      mov     %edx,`16*5+12-64`($ctx)
++      lea     (%rdx,%rdx,4),%edx      # *5
++      shr     \$26,$d1
++      mov     %edx,`16*6+12-64`($ctx)
++
++      mov     $h2,%rax
++      shl     \$24,%rax
++      or      %rax,$d1
++      mov     $d1#d,`16*7+12-64`($ctx)
++      lea     ($d1,$d1,4),$d1         # *5
++      mov     $d1#d,`16*8+12-64`($ctx)
++
++      mov     $r1,%rax
++      call    __poly1305_block        # r^4
++
++      mov     \$0x3ffffff,%eax        # save r^4 base 2^26
++      mov     $h0,$d1
++      and     $h0#d,%eax
++      shr     \$26,$d1
++      mov     %eax,`16*0+8-64`($ctx)  # r^4 digits use slot +8 of each row
++
++      mov     \$0x3ffffff,%edx
++      and     $d1#d,%edx
++      mov     %edx,`16*1+8-64`($ctx)
++      lea     (%rdx,%rdx,4),%edx      # *5
++      shr     \$26,$d1
++      mov     %edx,`16*2+8-64`($ctx)
++
++      mov     $h1,%rax
++      shl     \$12,%rax
++      or      $d1,%rax
++      and     \$0x3ffffff,%eax
++      mov     %eax,`16*3+8-64`($ctx)
++      lea     (%rax,%rax,4),%eax      # *5
++      mov     $h1,$d1
++      mov     %eax,`16*4+8-64`($ctx)
++
++      mov     \$0x3ffffff,%edx
++      shr     \$14,$d1
++      and     $d1#d,%edx
++      mov     %edx,`16*5+8-64`($ctx)
++      lea     (%rdx,%rdx,4),%edx      # *5
++      shr     \$26,$d1
++      mov     %edx,`16*6+8-64`($ctx)
++
++      mov     $h2,%rax
++      shl     \$24,%rax
++      or      %rax,$d1
++      mov     $d1#d,`16*7+8-64`($ctx)
++      lea     ($d1,$d1,4),$d1         # *5
++      mov     $d1#d,`16*8+8-64`($ctx)
++
++      lea     -48-64($ctx),$ctx       # size [de-]optimization
++      ret
++.size __poly1305_init_avx,.-__poly1305_init_avx
++
++.type poly1305_blocks_avx,\@function,4
++.align        32
++poly1305_blocks_avx:
++.cfi_startproc
++      mov     20($ctx),%r8d           # is_base2_26
++      cmp     \$128,$len
++      jae     .Lblocks_avx
++      test    %r8d,%r8d
++      jz      .Lblocks
++
++.Lblocks_avx:
++      and     \$-16,$len
++      jz      .Lno_data_avx
++
++      vzeroupper
++
++      test    %r8d,%r8d
++      jz      .Lbase2_64_avx
++
++      test    \$31,$len
++      jz      .Leven_avx
++
++      push    %rbx
++.cfi_push     %rbx
++      push    %rbp
++.cfi_push     %rbp
++      push    %r12
++.cfi_push     %r12
++      push    %r13
++.cfi_push     %r13
++      push    %r14
++.cfi_push     %r14
++      push    %r15
++.cfi_push     %r15
++.Lblocks_avx_body:
++
++      mov     $len,%r15               # reassign $len
++
++      mov     0($ctx),$d1             # load hash value
++      mov     8($ctx),$d2
++      mov     16($ctx),$h2#d
++
++      mov     24($ctx),$r0            # load r
++      mov     32($ctx),$s1
++
++      ################################# base 2^26 -> base 2^64
++      mov     $d1#d,$h0#d
++      and     \$`-1*(1<<31)`,$d1
++      mov     $d2,$r1                 # borrow $r1
++      mov     $d2#d,$h1#d
++      and     \$`-1*(1<<31)`,$d2
++
++      shr     \$6,$d1
++      shl     \$52,$r1
++      add     $d1,$h0
++      shr     \$12,$h1
++      shr     \$18,$d2
++      add     $r1,$h0
++      adc     $d2,$h1
++
++      mov     $h2,$d1
++      shl     \$40,$d1
++      shr     \$24,$h2
++      add     $d1,$h1
++      adc     \$0,$h2                 # can be partially reduced...
++
++      mov     \$-4,$d2                # ... so reduce
++      mov     $h2,$d1
++      and     $h2,$d2
++      shr     \$2,$d1
++      and     \$3,$h2
++      add     $d2,$d1                 # =*5
++      add     $d1,$h0
++      adc     \$0,$h1
++      adc     \$0,$h2
++
++      mov     $s1,$r1
++      mov     $s1,%rax
++      shr     \$2,$s1
++      add     $r1,$s1                 # s1 = r1 + (r1 >> 2)
++
++      add     0($inp),$h0             # accumulate input
++      adc     8($inp),$h1
++      lea     16($inp),$inp
++      adc     $padbit,$h2
++
++      call    __poly1305_block
++
++      test    $padbit,$padbit         # if $padbit is zero,
++      jz      .Lstore_base2_64_avx    # store hash in base 2^64 format
++
++      ################################# base 2^64 -> base 2^26
++      mov     $h0,%rax
++      mov     $h0,%rdx
++      shr     \$52,$h0
++      mov     $h1,$r0
++      mov     $h1,$r1
++      shr     \$26,%rdx
++      and     \$0x3ffffff,%rax        # h[0]
++      shl     \$12,$r0
++      and     \$0x3ffffff,%rdx        # h[1]
++      shr     \$14,$h1
++      or      $r0,$h0
++      shl     \$24,$h2
++      and     \$0x3ffffff,$h0         # h[2]
++      shr     \$40,$r1
++      and     \$0x3ffffff,$h1         # h[3]
++      or      $r1,$h2                 # h[4]
++
++      sub     \$16,%r15
++      jz      .Lstore_base2_26_avx
++
++      vmovd   %rax#d,$H0
++      vmovd   %rdx#d,$H1
++      vmovd   $h0#d,$H2
++      vmovd   $h1#d,$H3
++      vmovd   $h2#d,$H4
++      jmp     .Lproceed_avx
++
++.align        32
++.Lstore_base2_64_avx:
++      mov     $h0,0($ctx)
++      mov     $h1,8($ctx)
++      mov     $h2,16($ctx)            # note that is_base2_26 is zeroed
++      jmp     .Ldone_avx
++
++.align        16
++.Lstore_base2_26_avx:
++      mov     %rax#d,0($ctx)          # store hash value base 2^26
++      mov     %rdx#d,4($ctx)
++      mov     $h0#d,8($ctx)
++      mov     $h1#d,12($ctx)
++      mov     $h2#d,16($ctx)
++.align        16
++.Ldone_avx:
++      mov     0(%rsp),%r15
++.cfi_restore  %r15
++      mov     8(%rsp),%r14
++.cfi_restore  %r14
++      mov     16(%rsp),%r13
++.cfi_restore  %r13
++      mov     24(%rsp),%r12
++.cfi_restore  %r12
++      mov     32(%rsp),%rbp
++.cfi_restore  %rbp
++      mov     40(%rsp),%rbx
++.cfi_restore  %rbx
++      lea     48(%rsp),%rsp
++.cfi_adjust_cfa_offset        -48
++.Lno_data_avx:
++.Lblocks_avx_epilogue:
++      ret
++.cfi_endproc
++
++.align        32
++.Lbase2_64_avx:
++.cfi_startproc
++      push    %rbx
++.cfi_push     %rbx
++      push    %rbp
++.cfi_push     %rbp
++      push    %r12
++.cfi_push     %r12
++      push    %r13
++.cfi_push     %r13
++      push    %r14
++.cfi_push     %r14
++      push    %r15
++.cfi_push     %r15
++.Lbase2_64_avx_body:
++
++      mov     $len,%r15               # reassign $len
++
++      mov     24($ctx),$r0            # load r
++      mov     32($ctx),$s1
++
++      mov     0($ctx),$h0             # load hash value
++      mov     8($ctx),$h1
++      mov     16($ctx),$h2#d
++
++      mov     $s1,$r1
++      mov     $s1,%rax
++      shr     \$2,$s1
++      add     $r1,$s1                 # s1 = r1 + (r1 >> 2)
++
++      test    \$31,$len
++      jz      .Linit_avx
++
++      add     0($inp),$h0             # accumulate input
++      adc     8($inp),$h1
++      lea     16($inp),$inp
++      adc     $padbit,$h2
++      sub     \$16,%r15
++
++      call    __poly1305_block
++
++.Linit_avx:
++      ################################# base 2^64 -> base 2^26
++      mov     $h0,%rax
++      mov     $h0,%rdx
++      shr     \$52,$h0
++      mov     $h1,$d1
++      mov     $h1,$d2
++      shr     \$26,%rdx
++      and     \$0x3ffffff,%rax        # h[0]
++      shl     \$12,$d1
++      and     \$0x3ffffff,%rdx        # h[1]
++      shr     \$14,$h1
++      or      $d1,$h0
++      shl     \$24,$h2
++      and     \$0x3ffffff,$h0         # h[2]
++      shr     \$40,$d2
++      and     \$0x3ffffff,$h1         # h[3]
++      or      $d2,$h2                 # h[4]
++
++      vmovd   %rax#d,$H0
++      vmovd   %rdx#d,$H1
++      vmovd   $h0#d,$H2
++      vmovd   $h1#d,$H3
++      vmovd   $h2#d,$H4
++      movl    \$1,20($ctx)            # set is_base2_26
++
++      call    __poly1305_init_avx
++
++.Lproceed_avx:
++      mov     %r15,$len
++
++      mov     0(%rsp),%r15
++.cfi_restore  %r15
++      mov     8(%rsp),%r14
++.cfi_restore  %r14
++      mov     16(%rsp),%r13
++.cfi_restore  %r13
++      mov     24(%rsp),%r12
++.cfi_restore  %r12
++      mov     32(%rsp),%rbp
++.cfi_restore  %rbp
++      mov     40(%rsp),%rbx
++.cfi_restore  %rbx
++      lea     48(%rsp),%rax
++      lea     48(%rsp),%rsp
++.cfi_adjust_cfa_offset        -48
++.Lbase2_64_avx_epilogue:
++      jmp     .Ldo_avx
++.cfi_endproc
++
++.align        32
++.Leven_avx:
++.cfi_startproc
++      vmovd           4*0($ctx),$H0           # load hash value
++      vmovd           4*1($ctx),$H1
++      vmovd           4*2($ctx),$H2
++      vmovd           4*3($ctx),$H3
++      vmovd           4*4($ctx),$H4
++
++.Ldo_avx:
++___
++$code.=<<___  if (!$win64);
++      lea             -0x58(%rsp),%r11
++.cfi_def_cfa          %r11,0x60
++      sub             \$0x178,%rsp
++___
++$code.=<<___  if ($win64);
++      lea             -0xf8(%rsp),%r11
++      sub             \$0x218,%rsp
++      vmovdqa         %xmm6,0x50(%r11)
++      vmovdqa         %xmm7,0x60(%r11)
++      vmovdqa         %xmm8,0x70(%r11)
++      vmovdqa         %xmm9,0x80(%r11)
++      vmovdqa         %xmm10,0x90(%r11)
++      vmovdqa         %xmm11,0xa0(%r11)
++      vmovdqa         %xmm12,0xb0(%r11)
++      vmovdqa         %xmm13,0xc0(%r11)
++      vmovdqa         %xmm14,0xd0(%r11)
++      vmovdqa         %xmm15,0xe0(%r11)
++.Ldo_avx_body:
++___
++$code.=<<___;
++      sub             \$64,$len
++      lea             -32($inp),%rax
++      cmovc           %rax,$inp
++
++      vmovdqu         `16*3`($ctx),$D4        # preload r0^2
++      lea             `16*3+64`($ctx),$ctx    # size optimization
++      lea             .Lconst(%rip),%rcx
++
++      ################################################################
++      # load input
++      vmovdqu         16*2($inp),$T0
++      vmovdqu         16*3($inp),$T1
++      vmovdqa         64(%rcx),$MASK          # .Lmask26
++
++      vpsrldq         \$6,$T0,$T2             # splat input
++      vpsrldq         \$6,$T1,$T3
++      vpunpckhqdq     $T1,$T0,$T4             # 4
++      vpunpcklqdq     $T1,$T0,$T0             # 0:1
++      vpunpcklqdq     $T3,$T2,$T3             # 2:3
++
++      vpsrlq          \$40,$T4,$T4            # 4
++      vpsrlq          \$26,$T0,$T1
++      vpand           $MASK,$T0,$T0           # 0
++      vpsrlq          \$4,$T3,$T2
++      vpand           $MASK,$T1,$T1           # 1
++      vpsrlq          \$30,$T3,$T3
++      vpand           $MASK,$T2,$T2           # 2
++      vpand           $MASK,$T3,$T3           # 3
++      vpor            32(%rcx),$T4,$T4        # padbit, yes, always
++
++      jbe             .Lskip_loop_avx
++
++      # expand and copy pre-calculated table to stack
++      vmovdqu         `16*1-64`($ctx),$D1
++      vmovdqu         `16*2-64`($ctx),$D2
++      vpshufd         \$0xEE,$D4,$D3          # 34xx -> 3434
++      vpshufd         \$0x44,$D4,$D0          # xx12 -> 1212
++      vmovdqa         $D3,-0x90(%r11)
++      vmovdqa         $D0,0x00(%rsp)
++      vpshufd         \$0xEE,$D1,$D4
++      vmovdqu         `16*3-64`($ctx),$D0
++      vpshufd         \$0x44,$D1,$D1
++      vmovdqa         $D4,-0x80(%r11)
++      vmovdqa         $D1,0x10(%rsp)
++      vpshufd         \$0xEE,$D2,$D3
++      vmovdqu         `16*4-64`($ctx),$D1
++      vpshufd         \$0x44,$D2,$D2
++      vmovdqa         $D3,-0x70(%r11)
++      vmovdqa         $D2,0x20(%rsp)
++      vpshufd         \$0xEE,$D0,$D4
++      vmovdqu         `16*5-64`($ctx),$D2
++      vpshufd         \$0x44,$D0,$D0
++      vmovdqa         $D4,-0x60(%r11)
++      vmovdqa         $D0,0x30(%rsp)
++      vpshufd         \$0xEE,$D1,$D3
++      vmovdqu         `16*6-64`($ctx),$D0
++      vpshufd         \$0x44,$D1,$D1
++      vmovdqa         $D3,-0x50(%r11)
++      vmovdqa         $D1,0x40(%rsp)
++      vpshufd         \$0xEE,$D2,$D4
++      vmovdqu         `16*7-64`($ctx),$D1
++      vpshufd         \$0x44,$D2,$D2
++      vmovdqa         $D4,-0x40(%r11)
++      vmovdqa         $D2,0x50(%rsp)
++      vpshufd         \$0xEE,$D0,$D3
++      vmovdqu         `16*8-64`($ctx),$D2
++      vpshufd         \$0x44,$D0,$D0
++      vmovdqa         $D3,-0x30(%r11)
++      vmovdqa         $D0,0x60(%rsp)
++      vpshufd         \$0xEE,$D1,$D4
++      vpshufd         \$0x44,$D1,$D1
++      vmovdqa         $D4,-0x20(%r11)
++      vmovdqa         $D1,0x70(%rsp)
++      vpshufd         \$0xEE,$D2,$D3
++       vmovdqa        0x00(%rsp),$D4          # preload r0^2
++      vpshufd         \$0x44,$D2,$D2
++      vmovdqa         $D3,-0x10(%r11)
++      vmovdqa         $D2,0x80(%rsp)
++
++      jmp             .Loop_avx
++
++.align        32
++.Loop_avx:
++      ################################################################
++      # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
++      # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
++      #   \___________________/
++      # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
++      # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
++      #   \___________________/ \____________________/
++      #
++      # Note that we start with inp[2:3]*r^2. This is because it
++      # doesn't depend on reduction in previous iteration.
++      ################################################################
++      # d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
++      # d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
++      # d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
++      # d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
++      # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
++      #
++      # though note that $Tx and $Hx are "reversed" in this section,
++      # and $D4 is preloaded with r0^2...
++
++      vpmuludq        $T0,$D4,$D0             # d0 = h0*r0
++      vpmuludq        $T1,$D4,$D1             # d1 = h1*r0
++        vmovdqa       $H2,0x20(%r11)                          # offload hash
++      vpmuludq        $T2,$D4,$D2             # d2 = h2*r0
++       vmovdqa        0x10(%rsp),$H2          # r1^2
++      vpmuludq        $T3,$D4,$D3             # d3 = h3*r0
++      vpmuludq        $T4,$D4,$D4             # d4 = h4*r0
++
++        vmovdqa       $H0,0x00(%r11)                          #
++      vpmuludq        0x20(%rsp),$T4,$H0      # h4*s1
++        vmovdqa       $H1,0x10(%r11)                          #
++      vpmuludq        $T3,$H2,$H1             # h3*r1
++      vpaddq          $H0,$D0,$D0             # d0 += h4*s1
++      vpaddq          $H1,$D4,$D4             # d4 += h3*r1
++        vmovdqa       $H3,0x30(%r11)                          #
++      vpmuludq        $T2,$H2,$H0             # h2*r1
++      vpmuludq        $T1,$H2,$H1             # h1*r1
++      vpaddq          $H0,$D3,$D3             # d3 += h2*r1
++       vmovdqa        0x30(%rsp),$H3          # r2^2
++      vpaddq          $H1,$D2,$D2             # d2 += h1*r1
++        vmovdqa       $H4,0x40(%r11)                          #
++      vpmuludq        $T0,$H2,$H2             # h0*r1
++       vpmuludq       $T2,$H3,$H0             # h2*r2
++      vpaddq          $H2,$D1,$D1             # d1 += h0*r1
++
++       vmovdqa        0x40(%rsp),$H4          # s2^2
++      vpaddq          $H0,$D4,$D4             # d4 += h2*r2
++      vpmuludq        $T1,$H3,$H1             # h1*r2
++      vpmuludq        $T0,$H3,$H3             # h0*r2
++      vpaddq          $H1,$D3,$D3             # d3 += h1*r2
++       vmovdqa        0x50(%rsp),$H2          # r3^2
++      vpaddq          $H3,$D2,$D2             # d2 += h0*r2
++      vpmuludq        $T4,$H4,$H0             # h4*s2
++      vpmuludq        $T3,$H4,$H4             # h3*s2
++      vpaddq          $H0,$D1,$D1             # d1 += h4*s2
++       vmovdqa        0x60(%rsp),$H3          # s3^2
++      vpaddq          $H4,$D0,$D0             # d0 += h3*s2
++
++       vmovdqa        0x80(%rsp),$H4          # s4^2
++      vpmuludq        $T1,$H2,$H1             # h1*r3
++      vpmuludq        $T0,$H2,$H2             # h0*r3
++      vpaddq          $H1,$D4,$D4             # d4 += h1*r3
++      vpaddq          $H2,$D3,$D3             # d3 += h0*r3
++      vpmuludq        $T4,$H3,$H0             # h4*s3
++      vpmuludq        $T3,$H3,$H1             # h3*s3
++      vpaddq          $H0,$D2,$D2             # d2 += h4*s3
++       vmovdqu        16*0($inp),$H0                          # load input
++      vpaddq          $H1,$D1,$D1             # d1 += h3*s3
++      vpmuludq        $T2,$H3,$H3             # h2*s3
++       vpmuludq       $T2,$H4,$T2             # h2*s4
++      vpaddq          $H3,$D0,$D0             # d0 += h2*s3
++
++       vmovdqu        16*1($inp),$H1                          #
++      vpaddq          $T2,$D1,$D1             # d1 += h2*s4
++      vpmuludq        $T3,$H4,$T3             # h3*s4
++      vpmuludq        $T4,$H4,$T4             # h4*s4
++       vpsrldq        \$6,$H0,$H2                             # splat input
++      vpaddq          $T3,$D2,$D2             # d2 += h3*s4
++      vpaddq          $T4,$D3,$D3             # d3 += h4*s4
++       vpsrldq        \$6,$H1,$H3                             #
++      vpmuludq        0x70(%rsp),$T0,$T4      # h0*r4
++      vpmuludq        $T1,$H4,$T0             # h1*s4
++       vpunpckhqdq    $H1,$H0,$H4             # 4
++      vpaddq          $T4,$D4,$D4             # d4 += h0*r4
++       vmovdqa        -0x90(%r11),$T4         # r0^4
++      vpaddq          $T0,$D0,$D0             # d0 += h1*s4
++
++      vpunpcklqdq     $H1,$H0,$H0             # 0:1
++      vpunpcklqdq     $H3,$H2,$H3             # 2:3
++
++      #vpsrlq         \$40,$H4,$H4            # 4
++      vpsrldq         \$`40/8`,$H4,$H4        # 4
++      vpsrlq          \$26,$H0,$H1
++      vpand           $MASK,$H0,$H0           # 0
++      vpsrlq          \$4,$H3,$H2
++      vpand           $MASK,$H1,$H1           # 1
++      vpand           0(%rcx),$H4,$H4         # .Lmask24
++      vpsrlq          \$30,$H3,$H3
++      vpand           $MASK,$H2,$H2           # 2
++      vpand           $MASK,$H3,$H3           # 3
++      vpor            32(%rcx),$H4,$H4        # padbit, yes, always
++
++      vpaddq          0x00(%r11),$H0,$H0      # add hash value
++      vpaddq          0x10(%r11),$H1,$H1
++      vpaddq          0x20(%r11),$H2,$H2
++      vpaddq          0x30(%r11),$H3,$H3
++      vpaddq          0x40(%r11),$H4,$H4
++
++      lea             16*2($inp),%rax
++      lea             16*4($inp),$inp
++      sub             \$64,$len
++      cmovc           %rax,$inp
++
++      ################################################################
++      # Now we accumulate (inp[0:1]+hash)*r^4
++      ################################################################
++      # d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
++      # d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
++      # d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
++      # d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
++      # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
++
++      vpmuludq        $H0,$T4,$T0             # h0*r0
++      vpmuludq        $H1,$T4,$T1             # h1*r0
++      vpaddq          $T0,$D0,$D0
++      vpaddq          $T1,$D1,$D1
++       vmovdqa        -0x80(%r11),$T2         # r1^4
++      vpmuludq        $H2,$T4,$T0             # h2*r0
++      vpmuludq        $H3,$T4,$T1             # h3*r0
++      vpaddq          $T0,$D2,$D2
++      vpaddq          $T1,$D3,$D3
++      vpmuludq        $H4,$T4,$T4             # h4*r0
++       vpmuludq       -0x70(%r11),$H4,$T0     # h4*s1
++      vpaddq          $T4,$D4,$D4
++
++      vpaddq          $T0,$D0,$D0             # d0 += h4*s1
++      vpmuludq        $H2,$T2,$T1             # h2*r1
++      vpmuludq        $H3,$T2,$T0             # h3*r1
++      vpaddq          $T1,$D3,$D3             # d3 += h2*r1
++       vmovdqa        -0x60(%r11),$T3         # r2^4
++      vpaddq          $T0,$D4,$D4             # d4 += h3*r1
++      vpmuludq        $H1,$T2,$T1             # h1*r1
++      vpmuludq        $H0,$T2,$T2             # h0*r1
++      vpaddq          $T1,$D2,$D2             # d2 += h1*r1
++      vpaddq          $T2,$D1,$D1             # d1 += h0*r1
++
++       vmovdqa        -0x50(%r11),$T4         # s2^4
++      vpmuludq        $H2,$T3,$T0             # h2*r2
++      vpmuludq        $H1,$T3,$T1             # h1*r2
++      vpaddq          $T0,$D4,$D4             # d4 += h2*r2
++      vpaddq          $T1,$D3,$D3             # d3 += h1*r2
++       vmovdqa        -0x40(%r11),$T2         # r3^4
++      vpmuludq        $H0,$T3,$T3             # h0*r2
++      vpmuludq        $H4,$T4,$T0             # h4*s2
++      vpaddq          $T3,$D2,$D2             # d2 += h0*r2
++      vpaddq          $T0,$D1,$D1             # d1 += h4*s2
++       vmovdqa        -0x30(%r11),$T3         # s3^4
++      vpmuludq        $H3,$T4,$T4             # h3*s2
++       vpmuludq       $H1,$T2,$T1             # h1*r3
++      vpaddq          $T4,$D0,$D0             # d0 += h3*s2
++
++       vmovdqa        -0x10(%r11),$T4         # s4^4
++      vpaddq          $T1,$D4,$D4             # d4 += h1*r3
++      vpmuludq        $H0,$T2,$T2             # h0*r3
++      vpmuludq        $H4,$T3,$T0             # h4*s3
++      vpaddq          $T2,$D3,$D3             # d3 += h0*r3
++      vpaddq          $T0,$D2,$D2             # d2 += h4*s3
++       vmovdqu        16*2($inp),$T0                          # load input
++      vpmuludq        $H3,$T3,$T2             # h3*s3
++      vpmuludq        $H2,$T3,$T3             # h2*s3
++      vpaddq          $T2,$D1,$D1             # d1 += h3*s3
++       vmovdqu        16*3($inp),$T1                          #
++      vpaddq          $T3,$D0,$D0             # d0 += h2*s3
++
++      vpmuludq        $H2,$T4,$H2             # h2*s4
++      vpmuludq        $H3,$T4,$H3             # h3*s4
++       vpsrldq        \$6,$T0,$T2                             # splat input
++      vpaddq          $H2,$D1,$D1             # d1 += h2*s4
++      vpmuludq        $H4,$T4,$H4             # h4*s4
++       vpsrldq        \$6,$T1,$T3                             #
++      vpaddq          $H3,$D2,$H2             # h2 = d2 + h3*s4
++      vpaddq          $H4,$D3,$H3             # h3 = d3 + h4*s4
++      vpmuludq        -0x20(%r11),$H0,$H4     # h0*r4
++      vpmuludq        $H1,$T4,$H0
++       vpunpckhqdq    $T1,$T0,$T4             # 4
++      vpaddq          $H4,$D4,$H4             # h4 = d4 + h0*r4
++      vpaddq          $H0,$D0,$H0             # h0 = d0 + h1*s4
++
++      vpunpcklqdq     $T1,$T0,$T0             # 0:1
++      vpunpcklqdq     $T3,$T2,$T3             # 2:3
++
++      #vpsrlq         \$40,$T4,$T4            # 4
++      vpsrldq         \$`40/8`,$T4,$T4        # 4
++      vpsrlq          \$26,$T0,$T1
++       vmovdqa        0x00(%rsp),$D4          # preload r0^2
++      vpand           $MASK,$T0,$T0           # 0
++      vpsrlq          \$4,$T3,$T2
++      vpand           $MASK,$T1,$T1           # 1
++      vpand           0(%rcx),$T4,$T4         # .Lmask24
++      vpsrlq          \$30,$T3,$T3
++      vpand           $MASK,$T2,$T2           # 2
++      vpand           $MASK,$T3,$T3           # 3
++      vpor            32(%rcx),$T4,$T4        # padbit, yes, always
++
++      ################################################################
++      # lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
++      # and P. Schwabe
++
++      vpsrlq          \$26,$H3,$D3
++      vpand           $MASK,$H3,$H3
++      vpaddq          $D3,$H4,$H4             # h3 -> h4
++
++      vpsrlq          \$26,$H0,$D0
++      vpand           $MASK,$H0,$H0
++      vpaddq          $D0,$D1,$H1             # h0 -> h1
++
++      vpsrlq          \$26,$H4,$D0
++      vpand           $MASK,$H4,$H4
++
++      vpsrlq          \$26,$H1,$D1
++      vpand           $MASK,$H1,$H1
++      vpaddq          $D1,$H2,$H2             # h1 -> h2
++
++      vpaddq          $D0,$H0,$H0
++      vpsllq          \$2,$D0,$D0
++      vpaddq          $D0,$H0,$H0             # h4 -> h0
++
++      vpsrlq          \$26,$H2,$D2
++      vpand           $MASK,$H2,$H2
++      vpaddq          $D2,$H3,$H3             # h2 -> h3
++
++      vpsrlq          \$26,$H0,$D0
++      vpand           $MASK,$H0,$H0
++      vpaddq          $D0,$H1,$H1             # h0 -> h1
++
++      vpsrlq          \$26,$H3,$D3
++      vpand           $MASK,$H3,$H3
++      vpaddq          $D3,$H4,$H4             # h3 -> h4
++
++      ja              .Loop_avx
++
++.Lskip_loop_avx:
++      ################################################################
++      # multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
++
++      vpshufd         \$0x10,$D4,$D4          # r0^n, xx12 -> x1x2
++      add             \$32,$len
++      jnz             .Long_tail_avx
++
++      vpaddq          $H2,$T2,$T2
++      vpaddq          $H0,$T0,$T0
++      vpaddq          $H1,$T1,$T1
++      vpaddq          $H3,$T3,$T3
++      vpaddq          $H4,$T4,$T4
++
++.Long_tail_avx:
++      vmovdqa         $H2,0x20(%r11)
++      vmovdqa         $H0,0x00(%r11)
++      vmovdqa         $H1,0x10(%r11)
++      vmovdqa         $H3,0x30(%r11)
++      vmovdqa         $H4,0x40(%r11)
++
++      # d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
++      # d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
++      # d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
++      # d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
++      # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
++
++      vpmuludq        $T2,$D4,$D2             # d2 = h2*r0
++      vpmuludq        $T0,$D4,$D0             # d0 = h0*r0
++       vpshufd        \$0x10,`16*1-64`($ctx),$H2              # r1^n
++      vpmuludq        $T1,$D4,$D1             # d1 = h1*r0
++      vpmuludq        $T3,$D4,$D3             # d3 = h3*r0
++      vpmuludq        $T4,$D4,$D4             # d4 = h4*r0
++
++      vpmuludq        $T3,$H2,$H0             # h3*r1
++      vpaddq          $H0,$D4,$D4             # d4 += h3*r1
++       vpshufd        \$0x10,`16*2-64`($ctx),$H3              # s1^n
++      vpmuludq        $T2,$H2,$H1             # h2*r1
++      vpaddq          $H1,$D3,$D3             # d3 += h2*r1
++       vpshufd        \$0x10,`16*3-64`($ctx),$H4              # r2^n
++      vpmuludq        $T1,$H2,$H0             # h1*r1
++      vpaddq          $H0,$D2,$D2             # d2 += h1*r1
++      vpmuludq        $T0,$H2,$H2             # h0*r1
++      vpaddq          $H2,$D1,$D1             # d1 += h0*r1
++      vpmuludq        $T4,$H3,$H3             # h4*s1
++      vpaddq          $H3,$D0,$D0             # d0 += h4*s1
++
++       vpshufd        \$0x10,`16*4-64`($ctx),$H2              # s2^n
++      vpmuludq        $T2,$H4,$H1             # h2*r2
++      vpaddq          $H1,$D4,$D4             # d4 += h2*r2
++      vpmuludq        $T1,$H4,$H0             # h1*r2
++      vpaddq          $H0,$D3,$D3             # d3 += h1*r2
++       vpshufd        \$0x10,`16*5-64`($ctx),$H3              # r3^n
++      vpmuludq        $T0,$H4,$H4             # h0*r2
++      vpaddq          $H4,$D2,$D2             # d2 += h0*r2
++      vpmuludq        $T4,$H2,$H1             # h4*s2
++      vpaddq          $H1,$D1,$D1             # d1 += h4*s2
++       vpshufd        \$0x10,`16*6-64`($ctx),$H4              # s3^n
++      vpmuludq        $T3,$H2,$H2             # h3*s2
++      vpaddq          $H2,$D0,$D0             # d0 += h3*s2
++
++      vpmuludq        $T1,$H3,$H0             # h1*r3
++      vpaddq          $H0,$D4,$D4             # d4 += h1*r3
++      vpmuludq        $T0,$H3,$H3             # h0*r3
++      vpaddq          $H3,$D3,$D3             # d3 += h0*r3
++       vpshufd        \$0x10,`16*7-64`($ctx),$H2              # r4^n
++      vpmuludq        $T4,$H4,$H1             # h4*s3
++      vpaddq          $H1,$D2,$D2             # d2 += h4*s3
++       vpshufd        \$0x10,`16*8-64`($ctx),$H3              # s4^n
++      vpmuludq        $T3,$H4,$H0             # h3*s3
++      vpaddq          $H0,$D1,$D1             # d1 += h3*s3
++      vpmuludq        $T2,$H4,$H4             # h2*s3
++      vpaddq          $H4,$D0,$D0             # d0 += h2*s3
++
++      vpmuludq        $T0,$H2,$H2             # h0*r4
++      vpaddq          $H2,$D4,$D4             # h4 = d4 + h0*r4
++      vpmuludq        $T4,$H3,$H1             # h4*s4
++      vpaddq          $H1,$D3,$D3             # h3 = d3 + h4*s4
++      vpmuludq        $T3,$H3,$H0             # h3*s4
++      vpaddq          $H0,$D2,$D2             # h2 = d2 + h3*s4
++      vpmuludq        $T2,$H3,$H1             # h2*s4
++      vpaddq          $H1,$D1,$D1             # h1 = d1 + h2*s4
++      vpmuludq        $T1,$H3,$H3             # h1*s4
++      vpaddq          $H3,$D0,$D0             # h0 = d0 + h1*s4
++
++      jz              .Lshort_tail_avx
++
++      vmovdqu         16*0($inp),$H0          # load input
++      vmovdqu         16*1($inp),$H1
++
++      vpsrldq         \$6,$H0,$H2             # splat input
++      vpsrldq         \$6,$H1,$H3
++      vpunpckhqdq     $H1,$H0,$H4             # 4
++      vpunpcklqdq     $H1,$H0,$H0             # 0:1
++      vpunpcklqdq     $H3,$H2,$H3             # 2:3
++
++      vpsrlq          \$40,$H4,$H4            # 4
++      vpsrlq          \$26,$H0,$H1
++      vpand           $MASK,$H0,$H0           # 0
++      vpsrlq          \$4,$H3,$H2
++      vpand           $MASK,$H1,$H1           # 1
++      vpsrlq          \$30,$H3,$H3
++      vpand           $MASK,$H2,$H2           # 2
++      vpand           $MASK,$H3,$H3           # 3
++      vpor            32(%rcx),$H4,$H4        # padbit, yes, always
++
++      vpshufd         \$0x32,`16*0-64`($ctx),$T4      # r0^n, 34xx -> x3x4
++      vpaddq          0x00(%r11),$H0,$H0
++      vpaddq          0x10(%r11),$H1,$H1
++      vpaddq          0x20(%r11),$H2,$H2
++      vpaddq          0x30(%r11),$H3,$H3
++      vpaddq          0x40(%r11),$H4,$H4
++
++      ################################################################
++      # multiply (inp[0:1]+hash) by r^4:r^3 and accumulate
++
++      vpmuludq        $H0,$T4,$T0             # h0*r0
++      vpaddq          $T0,$D0,$D0             # d0 += h0*r0
++      vpmuludq        $H1,$T4,$T1             # h1*r0
++      vpaddq          $T1,$D1,$D1             # d1 += h1*r0
++      vpmuludq        $H2,$T4,$T0             # h2*r0
++      vpaddq          $T0,$D2,$D2             # d2 += h2*r0
++       vpshufd        \$0x32,`16*1-64`($ctx),$T2              # r1^n
++      vpmuludq        $H3,$T4,$T1             # h3*r0
++      vpaddq          $T1,$D3,$D3             # d3 += h3*r0
++      vpmuludq        $H4,$T4,$T4             # h4*r0
++      vpaddq          $T4,$D4,$D4             # d4 += h4*r0
++
++      vpmuludq        $H3,$T2,$T0             # h3*r1
++      vpaddq          $T0,$D4,$D4             # d4 += h3*r1
++       vpshufd        \$0x32,`16*2-64`($ctx),$T3              # s1
++      vpmuludq        $H2,$T2,$T1             # h2*r1
++      vpaddq          $T1,$D3,$D3             # d3 += h2*r1
++       vpshufd        \$0x32,`16*3-64`($ctx),$T4              # r2
++      vpmuludq        $H1,$T2,$T0             # h1*r1
++      vpaddq          $T0,$D2,$D2             # d2 += h1*r1
++      vpmuludq        $H0,$T2,$T2             # h0*r1
++      vpaddq          $T2,$D1,$D1             # d1 += h0*r1
++      vpmuludq        $H4,$T3,$T3             # h4*s1
++      vpaddq          $T3,$D0,$D0             # d0 += h4*s1
++
++       vpshufd        \$0x32,`16*4-64`($ctx),$T2              # s2
++      vpmuludq        $H2,$T4,$T1             # h2*r2
++      vpaddq          $T1,$D4,$D4             # d4 += h2*r2
++      vpmuludq        $H1,$T4,$T0             # h1*r2
++      vpaddq          $T0,$D3,$D3             # d3 += h1*r2
++       vpshufd        \$0x32,`16*5-64`($ctx),$T3              # r3
++      vpmuludq        $H0,$T4,$T4             # h0*r2
++      vpaddq          $T4,$D2,$D2             # d2 += h0*r2
++      vpmuludq        $H4,$T2,$T1             # h4*s2
++      vpaddq          $T1,$D1,$D1             # d1 += h4*s2
++       vpshufd        \$0x32,`16*6-64`($ctx),$T4              # s3
++      vpmuludq        $H3,$T2,$T2             # h3*s2
++      vpaddq          $T2,$D0,$D0             # d0 += h3*s2
++
++      vpmuludq        $H1,$T3,$T0             # h1*r3
++      vpaddq          $T0,$D4,$D4             # d4 += h1*r3
++      vpmuludq        $H0,$T3,$T3             # h0*r3
++      vpaddq          $T3,$D3,$D3             # d3 += h0*r3
++       vpshufd        \$0x32,`16*7-64`($ctx),$T2              # r4
++      vpmuludq        $H4,$T4,$T1             # h4*s3
++      vpaddq          $T1,$D2,$D2             # d2 += h4*s3
++       vpshufd        \$0x32,`16*8-64`($ctx),$T3              # s4
++      vpmuludq        $H3,$T4,$T0             # h3*s3
++      vpaddq          $T0,$D1,$D1             # d1 += h3*s3
++      vpmuludq        $H2,$T4,$T4             # h2*s3
++      vpaddq          $T4,$D0,$D0             # d0 += h2*s3
++
++      vpmuludq        $H0,$T2,$T2             # h0*r4
++      vpaddq          $T2,$D4,$D4             # d4 += h0*r4
++      vpmuludq        $H4,$T3,$T1             # h4*s4
++      vpaddq          $T1,$D3,$D3             # d3 += h4*s4
++      vpmuludq        $H3,$T3,$T0             # h3*s4
++      vpaddq          $T0,$D2,$D2             # d2 += h3*s4
++      vpmuludq        $H2,$T3,$T1             # h2*s4
++      vpaddq          $T1,$D1,$D1             # d1 += h2*s4
++      vpmuludq        $H1,$T3,$T3             # h1*s4
++      vpaddq          $T3,$D0,$D0             # d0 += h1*s4
++
++.Lshort_tail_avx:
++      ################################################################
++      # horizontal addition
++
++      vpsrldq         \$8,$D4,$T4
++      vpsrldq         \$8,$D3,$T3
++      vpsrldq         \$8,$D1,$T1
++      vpsrldq         \$8,$D0,$T0
++      vpsrldq         \$8,$D2,$T2
++      vpaddq          $T3,$D3,$D3
++      vpaddq          $T4,$D4,$D4
++      vpaddq          $T0,$D0,$D0
++      vpaddq          $T1,$D1,$D1
++      vpaddq          $T2,$D2,$D2
++
++      ################################################################
++      # lazy reduction
++
++      vpsrlq          \$26,$D3,$H3
++      vpand           $MASK,$D3,$D3
++      vpaddq          $H3,$D4,$D4             # h3 -> h4
++
++      vpsrlq          \$26,$D0,$H0
++      vpand           $MASK,$D0,$D0
++      vpaddq          $H0,$D1,$D1             # h0 -> h1
++
++      vpsrlq          \$26,$D4,$H4
++      vpand           $MASK,$D4,$D4
++
++      vpsrlq          \$26,$D1,$H1
++      vpand           $MASK,$D1,$D1
++      vpaddq          $H1,$D2,$D2             # h1 -> h2
++
++      vpaddq          $H4,$D0,$D0
++      vpsllq          \$2,$H4,$H4
++      vpaddq          $H4,$D0,$D0             # h4 -> h0
++
++      vpsrlq          \$26,$D2,$H2
++      vpand           $MASK,$D2,$D2
++      vpaddq          $H2,$D3,$D3             # h2 -> h3
++
++      vpsrlq          \$26,$D0,$H0
++      vpand           $MASK,$D0,$D0
++      vpaddq          $H0,$D1,$D1             # h0 -> h1
++
++      vpsrlq          \$26,$D3,$H3
++      vpand           $MASK,$D3,$D3
++      vpaddq          $H3,$D4,$D4             # h3 -> h4
++
++      vmovd           $D0,`4*0-48-64`($ctx)   # save partially reduced
++      vmovd           $D1,`4*1-48-64`($ctx)
++      vmovd           $D2,`4*2-48-64`($ctx)
++      vmovd           $D3,`4*3-48-64`($ctx)
++      vmovd           $D4,`4*4-48-64`($ctx)
++___
++$code.=<<___  if ($win64);
++      vmovdqa         0x50(%r11),%xmm6
++      vmovdqa         0x60(%r11),%xmm7
++      vmovdqa         0x70(%r11),%xmm8
++      vmovdqa         0x80(%r11),%xmm9
++      vmovdqa         0x90(%r11),%xmm10
++      vmovdqa         0xa0(%r11),%xmm11
++      vmovdqa         0xb0(%r11),%xmm12
++      vmovdqa         0xc0(%r11),%xmm13
++      vmovdqa         0xd0(%r11),%xmm14
++      vmovdqa         0xe0(%r11),%xmm15
++      lea             0xf8(%r11),%rsp
++.Ldo_avx_epilogue:
++___
++$code.=<<___  if (!$win64);
++      lea             0x58(%r11),%rsp
++.cfi_def_cfa          %rsp,8
++___
++$code.=<<___;
++      vzeroupper
++      ret
++.cfi_endproc
++.size poly1305_blocks_avx,.-poly1305_blocks_avx
++
++.type poly1305_emit_avx,\@function,3
++.align        32
++poly1305_emit_avx:      # finalize: repack base 2^26 hash into 64-bit words, reduce, add nonce, store 16-byte result
++      cmpl    \$0,20($ctx)    # is_base2_26?
++      je      .Lemit          # flag is zero: hand over to .Lemit (defined outside this block)
++
++      mov     0($ctx),%eax    # load hash value base 2^26
++      mov     4($ctx),%ecx    # five 32-bit limbs; 32-bit loads zero-extend into the full registers
++      mov     8($ctx),%r8d
++      mov     12($ctx),%r11d
++      mov     16($ctx),%r10d
++
++      shl     \$26,%rcx       # base 2^26 -> base 2^64
++      mov     %r8,%r9         # copy limb 2: it straddles h0 and h1
++      shl     \$52,%r8        # low 12 bits of limb 2 land in bits 52..63 of h0
++      add     %rcx,%rax       # limb 0 + (limb 1 << 26)
++      shr     \$12,%r9        # remaining high bits of limb 2 start h1
++      add     %rax,%r8        # h0
++      adc     \$0,%r9         # carry out of h0 goes into h1
++
++      shl     \$14,%r11       # limb 3 << 14
++      mov     %r10,%rax       # copy limb 4: it straddles h1 and h2
++      shr     \$24,%r10       # bits of limb 4 from position 24 upward start h2
++      add     %r11,%r9
++      shl     \$40,%rax       # low 24 bits of limb 4 land in bits 40..63 of h1
++      add     %rax,%r9        # h1
++      adc     \$0,%r10        # h2
++
++      mov     %r10,%rax       # could be partially reduced, so reduce
++      mov     %r10,%rcx
++      and     \$3,%r10        # h2 keeps only its low 2 bits
++      shr     \$2,%rax        # rax = h2 >> 2
++      and     \$-4,%rcx       # rcx = h2 with low 2 bits cleared = 4*(h2 >> 2)
++      add     %rcx,%rax       # rax = 5*(h2 >> 2)
++      add     %rax,%r8        # fold the excess above bit 130 back into h0
++      adc     \$0,%r9         # ripple the carry through h1 and h2
++      adc     \$0,%r10
++
++      mov     %r8,%rax        # rax = h0, kept in case no final subtraction is needed
++      add     \$5,%r8         # compare to modulus
++      mov     %r9,%rcx        # rcx = h1, likewise (mov leaves the carry flag intact)
++      adc     \$0,%r9
++      adc     \$0,%r10        # r10:r9:r8 = h + 5
++      shr     \$2,%r10        # did 130-bit value overflow?
++      cmovnz  %r8,%rax        # overflowed: take the low 128 bits of h + 5 instead of h
++      cmovnz  %r9,%rcx
++
++      add     0($nonce),%rax  # accumulate nonce
++      adc     8($nonce),%rcx  # high half plus carry; carry out of bit 127 is discarded
++      mov     %rax,0($mac)    # write result
++      mov     %rcx,8($mac)    # upper 8 bytes of the 16-byte result
++
++      ret
++.size poly1305_emit_avx,.-poly1305_emit_avx
++___
++
++if ($avx>1) {
++my ($H0,$H1,$H2,$H3,$H4, $MASK, $T4,$T0,$T1,$T2,$T3, $D0,$D1,$D2,$D3,$D4) =
++    map("%ymm$_",(0..15));
++my $S4=$MASK;
++
++$code.=<<___;
++.type poly1305_blocks_avx2,\@function,4
++.align        32
++poly1305_blocks_avx2:
++.cfi_startproc
++      mov     20($ctx),%r8d           # is_base2_26
++      cmp     \$128,$len
++      jae     .Lblocks_avx2
++      test    %r8d,%r8d
++      jz      .Lblocks
++
++.Lblocks_avx2:
++      and     \$-16,$len
++      jz      .Lno_data_avx2
++
++      vzeroupper
++
++      test    %r8d,%r8d
++      jz      .Lbase2_64_avx2
++
++      test    \$63,$len
++      jz      .Leven_avx2
++
++      push    %rbx
++.cfi_push     %rbx
++      push    %rbp
++.cfi_push     %rbp
++      push    %r12
++.cfi_push     %r12
++      push    %r13
++.cfi_push     %r13
++      push    %r14
++.cfi_push     %r14
++      push    %r15
++.cfi_push     %r15
++.Lblocks_avx2_body:
++
++      mov     $len,%r15               # reassign $len
++
++      mov     0($ctx),$d1             # load hash value
++      mov     8($ctx),$d2
++      mov     16($ctx),$h2#d
++
++      mov     24($ctx),$r0            # load r
++      mov     32($ctx),$s1
++
++      ################################# base 2^26 -> base 2^64
++      mov     $d1#d,$h0#d
++      and     \$`-1*(1<<31)`,$d1
++      mov     $d2,$r1                 # borrow $r1
++      mov     $d2#d,$h1#d
++      and     \$`-1*(1<<31)`,$d2
++
++      shr     \$6,$d1
++      shl     \$52,$r1
++      add     $d1,$h0
++      shr     \$12,$h1
++      shr     \$18,$d2
++      add     $r1,$h0
++      adc     $d2,$h1
++
++      mov     $h2,$d1
++      shl     \$40,$d1
++      shr     \$24,$h2
++      add     $d1,$h1
++      adc     \$0,$h2                 # can be partially reduced...
++
++      mov     \$-4,$d2                # ... so reduce
++      mov     $h2,$d1
++      and     $h2,$d2
++      shr     \$2,$d1
++      and     \$3,$h2
++      add     $d2,$d1                 # =*5
++      add     $d1,$h0
++      adc     \$0,$h1
++      adc     \$0,$h2
++
++      mov     $s1,$r1
++      mov     $s1,%rax
++      shr     \$2,$s1
++      add     $r1,$s1                 # s1 = r1 + (r1 >> 2)
++
++.Lbase2_26_pre_avx2:
++      add     0($inp),$h0             # accumulate input
++      adc     8($inp),$h1
++      lea     16($inp),$inp
++      adc     $padbit,$h2
++      sub     \$16,%r15
++
++      call    __poly1305_block
++      mov     $r1,%rax
++
++      test    \$63,%r15
++      jnz     .Lbase2_26_pre_avx2
++
++      test    $padbit,$padbit         # if $padbit is zero,
++      jz      .Lstore_base2_64_avx2   # store hash in base 2^64 format
++
++      ################################# base 2^64 -> base 2^26
++      mov     $h0,%rax
++      mov     $h0,%rdx
++      shr     \$52,$h0
++      mov     $h1,$r0
++      mov     $h1,$r1
++      shr     \$26,%rdx
++      and     \$0x3ffffff,%rax        # h[0]
++      shl     \$12,$r0
++      and     \$0x3ffffff,%rdx        # h[1]
++      shr     \$14,$h1
++      or      $r0,$h0
++      shl     \$24,$h2
++      and     \$0x3ffffff,$h0         # h[2]
++      shr     \$40,$r1
++      and     \$0x3ffffff,$h1         # h[3]
++      or      $r1,$h2                 # h[4]
++
++      test    %r15,%r15
++      jz      .Lstore_base2_26_avx2
++
++      vmovd   %rax#d,%x#$H0
++      vmovd   %rdx#d,%x#$H1
++      vmovd   $h0#d,%x#$H2
++      vmovd   $h1#d,%x#$H3
++      vmovd   $h2#d,%x#$H4
++      jmp     .Lproceed_avx2
++
++.align        32
++.Lstore_base2_64_avx2:
++      mov     $h0,0($ctx)
++      mov     $h1,8($ctx)
++      mov     $h2,16($ctx)            # note that is_base2_26 is zeroed
++      jmp     .Ldone_avx2
++
++.align        16
++.Lstore_base2_26_avx2:
++      mov     %rax#d,0($ctx)          # store hash value base 2^26
++      mov     %rdx#d,4($ctx)
++      mov     $h0#d,8($ctx)
++      mov     $h1#d,12($ctx)
++      mov     $h2#d,16($ctx)
++.align        16
++.Ldone_avx2:
++      mov     0(%rsp),%r15
++.cfi_restore  %r15
++      mov     8(%rsp),%r14
++.cfi_restore  %r14
++      mov     16(%rsp),%r13
++.cfi_restore  %r13
++      mov     24(%rsp),%r12
++.cfi_restore  %r12
++      mov     32(%rsp),%rbp
++.cfi_restore  %rbp
++      mov     40(%rsp),%rbx
++.cfi_restore  %rbx
++      lea     48(%rsp),%rsp
++.cfi_adjust_cfa_offset        -48
++.Lno_data_avx2:
++.Lblocks_avx2_epilogue:
++      ret
++.cfi_endproc
++
++.align        32
++.Lbase2_64_avx2:
++.cfi_startproc
++      push    %rbx
++.cfi_push     %rbx
++      push    %rbp
++.cfi_push     %rbp
++      push    %r12
++.cfi_push     %r12
++      push    %r13
++.cfi_push     %r13
++      push    %r14
++.cfi_push     %r14
++      push    %r15
++.cfi_push     %r15
++.Lbase2_64_avx2_body:
++
++      mov     $len,%r15               # reassign $len
++
++      mov     24($ctx),$r0            # load r
++      mov     32($ctx),$s1
++
++      mov     0($ctx),$h0             # load hash value
++      mov     8($ctx),$h1
++      mov     16($ctx),$h2#d
++
++      mov     $s1,$r1
++      mov     $s1,%rax
++      shr     \$2,$s1
++      add     $r1,$s1                 # s1 = r1 + (r1 >> 2)
++
++      test    \$63,$len
++      jz      .Linit_avx2
++
++.Lbase2_64_pre_avx2:
++      add     0($inp),$h0             # accumulate input
++      adc     8($inp),$h1
++      lea     16($inp),$inp
++      adc     $padbit,$h2
++      sub     \$16,%r15
++
++      call    __poly1305_block
++      mov     $r1,%rax
++
++      test    \$63,%r15
++      jnz     .Lbase2_64_pre_avx2
++
++.Linit_avx2:
++      ################################# base 2^64 -> base 2^26
++      mov     $h0,%rax
++      mov     $h0,%rdx
++      shr     \$52,$h0
++      mov     $h1,$d1
++      mov     $h1,$d2
++      shr     \$26,%rdx
++      and     \$0x3ffffff,%rax        # h[0]
++      shl     \$12,$d1
++      and     \$0x3ffffff,%rdx        # h[1]
++      shr     \$14,$h1
++      or      $d1,$h0
++      shl     \$24,$h2
++      and     \$0x3ffffff,$h0         # h[2]
++      shr     \$40,$d2
++      and     \$0x3ffffff,$h1         # h[3]
++      or      $d2,$h2                 # h[4]
++
++      vmovd   %rax#d,%x#$H0
++      vmovd   %rdx#d,%x#$H1
++      vmovd   $h0#d,%x#$H2
++      vmovd   $h1#d,%x#$H3
++      vmovd   $h2#d,%x#$H4
++      movl    \$1,20($ctx)            # set is_base2_26
++
++      call    __poly1305_init_avx
++
++.Lproceed_avx2:
++      mov     %r15,$len                       # restore $len
++      mov     OPENSSL_ia32cap_P+8(%rip),%r10d
++      mov     \$`(1<<31|1<<30|1<<16)`,%r11d
++
++      mov     0(%rsp),%r15
++.cfi_restore  %r15
++      mov     8(%rsp),%r14
++.cfi_restore  %r14
++      mov     16(%rsp),%r13
++.cfi_restore  %r13
++      mov     24(%rsp),%r12
++.cfi_restore  %r12
++      mov     32(%rsp),%rbp
++.cfi_restore  %rbp
++      mov     40(%rsp),%rbx
++.cfi_restore  %rbx
++      lea     48(%rsp),%rax
++      lea     48(%rsp),%rsp
++.cfi_adjust_cfa_offset        -48
++.Lbase2_64_avx2_epilogue:
++      jmp     .Ldo_avx2
++.cfi_endproc
++
++.align        32
++.Leven_avx2:
++.cfi_startproc
++      mov             OPENSSL_ia32cap_P+8(%rip),%r10d
++      vmovd           4*0($ctx),%x#$H0        # load hash value base 2^26
++      vmovd           4*1($ctx),%x#$H1
++      vmovd           4*2($ctx),%x#$H2
++      vmovd           4*3($ctx),%x#$H3
++      vmovd           4*4($ctx),%x#$H4
++
++.Ldo_avx2:
++___
++$code.=<<___          if ($avx>2);
++      cmp             \$512,$len
++      jb              .Lskip_avx512
++      and             %r11d,%r10d
++      test            \$`1<<16`,%r10d         # check for AVX512F
++      jnz             .Lblocks_avx512
++.Lskip_avx512:
++___
++$code.=<<___  if (!$win64);
++      lea             -8(%rsp),%r11
++.cfi_def_cfa          %r11,16
++      sub             \$0x128,%rsp
++___
++$code.=<<___  if ($win64);
++      lea             -0xf8(%rsp),%r11
++      sub             \$0x1c8,%rsp
++      vmovdqa         %xmm6,0x50(%r11)
++      vmovdqa         %xmm7,0x60(%r11)
++      vmovdqa         %xmm8,0x70(%r11)
++      vmovdqa         %xmm9,0x80(%r11)
++      vmovdqa         %xmm10,0x90(%r11)
++      vmovdqa         %xmm11,0xa0(%r11)
++      vmovdqa         %xmm12,0xb0(%r11)
++      vmovdqa         %xmm13,0xc0(%r11)
++      vmovdqa         %xmm14,0xd0(%r11)
++      vmovdqa         %xmm15,0xe0(%r11)
++.Ldo_avx2_body:
++___
++$code.=<<___;
++      lea             .Lconst(%rip),%rcx
++      lea             48+64($ctx),$ctx        # size optimization
++      vmovdqa         96(%rcx),$T0            # .Lpermd_avx2
++
++      # expand and copy pre-calculated table to stack
++      vmovdqu         `16*0-64`($ctx),%x#$T2
++      and             \$-512,%rsp
++      vmovdqu         `16*1-64`($ctx),%x#$T3
++      vmovdqu         `16*2-64`($ctx),%x#$T4
++      vmovdqu         `16*3-64`($ctx),%x#$D0
++      vmovdqu         `16*4-64`($ctx),%x#$D1
++      vmovdqu         `16*5-64`($ctx),%x#$D2
++      lea             0x90(%rsp),%rax         # size optimization
++      vmovdqu         `16*6-64`($ctx),%x#$D3
++      vpermd          $T2,$T0,$T2             # 00003412 -> 14243444
++      vmovdqu         `16*7-64`($ctx),%x#$D4
++      vpermd          $T3,$T0,$T3
++      vmovdqu         `16*8-64`($ctx),%x#$MASK
++      vpermd          $T4,$T0,$T4
++      vmovdqa         $T2,0x00(%rsp)
++      vpermd          $D0,$T0,$D0
++      vmovdqa         $T3,0x20-0x90(%rax)
++      vpermd          $D1,$T0,$D1
++      vmovdqa         $T4,0x40-0x90(%rax)
++      vpermd          $D2,$T0,$D2
++      vmovdqa         $D0,0x60-0x90(%rax)
++      vpermd          $D3,$T0,$D3
++      vmovdqa         $D1,0x80-0x90(%rax)
++      vpermd          $D4,$T0,$D4
++      vmovdqa         $D2,0xa0-0x90(%rax)
++      vpermd          $MASK,$T0,$MASK
++      vmovdqa         $D3,0xc0-0x90(%rax)
++      vmovdqa         $D4,0xe0-0x90(%rax)
++      vmovdqa         $MASK,0x100-0x90(%rax)
++      vmovdqa         64(%rcx),$MASK          # .Lmask26
++
++      ################################################################
++      # load input
++      vmovdqu         16*0($inp),%x#$T0
++      vmovdqu         16*1($inp),%x#$T1
++      vinserti128     \$1,16*2($inp),$T0,$T0
++      vinserti128     \$1,16*3($inp),$T1,$T1
++      lea             16*4($inp),$inp
++
++      vpsrldq         \$6,$T0,$T2             # splat input
++      vpsrldq         \$6,$T1,$T3
++      vpunpckhqdq     $T1,$T0,$T4             # 4
++      vpunpcklqdq     $T3,$T2,$T2             # 2:3
++      vpunpcklqdq     $T1,$T0,$T0             # 0:1
++
++      vpsrlq          \$30,$T2,$T3
++      vpsrlq          \$4,$T2,$T2
++      vpsrlq          \$26,$T0,$T1
++      vpsrlq          \$40,$T4,$T4            # 4
++      vpand           $MASK,$T2,$T2           # 2
++      vpand           $MASK,$T0,$T0           # 0
++      vpand           $MASK,$T1,$T1           # 1
++      vpand           $MASK,$T3,$T3           # 3
++      vpor            32(%rcx),$T4,$T4        # padbit, yes, always
++
++      vpaddq          $H2,$T2,$H2             # accumulate input
++      sub             \$64,$len
++      jz              .Ltail_avx2
++      jmp             .Loop_avx2
++
++.align        32
++.Loop_avx2:
++      ################################################################
++      # ((inp[0]*r^4+inp[4])*r^4+inp[ 8])*r^4
++      # ((inp[1]*r^4+inp[5])*r^4+inp[ 9])*r^3
++      # ((inp[2]*r^4+inp[6])*r^4+inp[10])*r^2
++      # ((inp[3]*r^4+inp[7])*r^4+inp[11])*r^1
++      #   \________/\__________/
++      ################################################################
++      #vpaddq         $H2,$T2,$H2             # accumulate input
++      vpaddq          $H0,$T0,$H0
++      vmovdqa         `32*0`(%rsp),$T0        # r0^4
++      vpaddq          $H1,$T1,$H1
++      vmovdqa         `32*1`(%rsp),$T1        # r1^4
++      vpaddq          $H3,$T3,$H3
++      vmovdqa         `32*3`(%rsp),$T2        # r2^4
++      vpaddq          $H4,$T4,$H4
++      vmovdqa         `32*6-0x90`(%rax),$T3   # s3^4
++      vmovdqa         `32*8-0x90`(%rax),$S4   # s4^4
++
++      # d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
++      # d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
++      # d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
++      # d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
++      # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
++      #
++      # however, as h2 is "chronologically" first one available pull
++      # corresponding operations up, so it's
++      #
++      # d4 = h2*r2   + h4*r0 + h3*r1             + h1*r3   + h0*r4
++      # d3 = h2*r1   + h3*r0           + h1*r2   + h0*r3   + h4*5*r4
++      # d2 = h2*r0           + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
++      # d1 = h2*5*r4 + h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3
++      # d0 = h2*5*r3 + h0*r0 + h4*5*r1 + h3*5*r2           + h1*5*r4
++
++      vpmuludq        $H2,$T0,$D2             # d2 = h2*r0
++      vpmuludq        $H2,$T1,$D3             # d3 = h2*r1
++      vpmuludq        $H2,$T2,$D4             # d4 = h2*r2
++      vpmuludq        $H2,$T3,$D0             # d0 = h2*s3
++      vpmuludq        $H2,$S4,$D1             # d1 = h2*s4
++
++      vpmuludq        $H0,$T1,$T4             # h0*r1
++      vpmuludq        $H1,$T1,$H2             # h1*r1, borrow $H2 as temp
++      vpaddq          $T4,$D1,$D1             # d1 += h0*r1
++      vpaddq          $H2,$D2,$D2             # d2 += h1*r1
++      vpmuludq        $H3,$T1,$T4             # h3*r1
++      vpmuludq        `32*2`(%rsp),$H4,$H2    # h4*s1
++      vpaddq          $T4,$D4,$D4             # d4 += h3*r1
++      vpaddq          $H2,$D0,$D0             # d0 += h4*s1
++       vmovdqa        `32*4-0x90`(%rax),$T1   # s2
++
++      vpmuludq        $H0,$T0,$T4             # h0*r0
++      vpmuludq        $H1,$T0,$H2             # h1*r0
++      vpaddq          $T4,$D0,$D0             # d0 += h0*r0
++      vpaddq          $H2,$D1,$D1             # d1 += h1*r0
++      vpmuludq        $H3,$T0,$T4             # h3*r0
++      vpmuludq        $H4,$T0,$H2             # h4*r0
++       vmovdqu        16*0($inp),%x#$T0       # load input
++      vpaddq          $T4,$D3,$D3             # d3 += h3*r0
++      vpaddq          $H2,$D4,$D4             # d4 += h4*r0
++       vinserti128    \$1,16*2($inp),$T0,$T0
++
++      vpmuludq        $H3,$T1,$T4             # h3*s2
++      vpmuludq        $H4,$T1,$H2             # h4*s2
++       vmovdqu        16*1($inp),%x#$T1
++      vpaddq          $T4,$D0,$D0             # d0 += h3*s2
++      vpaddq          $H2,$D1,$D1             # d1 += h4*s2
++       vmovdqa        `32*5-0x90`(%rax),$H2   # r3
++      vpmuludq        $H1,$T2,$T4             # h1*r2
++      vpmuludq        $H0,$T2,$T2             # h0*r2
++      vpaddq          $T4,$D3,$D3             # d3 += h1*r2
++      vpaddq          $T2,$D2,$D2             # d2 += h0*r2
++       vinserti128    \$1,16*3($inp),$T1,$T1
++       lea            16*4($inp),$inp
++
++      vpmuludq        $H1,$H2,$T4             # h1*r3
++      vpmuludq        $H0,$H2,$H2             # h0*r3
++       vpsrldq        \$6,$T0,$T2             # splat input
++      vpaddq          $T4,$D4,$D4             # d4 += h1*r3
++      vpaddq          $H2,$D3,$D3             # d3 += h0*r3
++      vpmuludq        $H3,$T3,$T4             # h3*s3
++      vpmuludq        $H4,$T3,$H2             # h4*s3
++       vpsrldq        \$6,$T1,$T3
++      vpaddq          $T4,$D1,$D1             # d1 += h3*s3
++      vpaddq          $H2,$D2,$D2             # d2 += h4*s3
++       vpunpckhqdq    $T1,$T0,$T4             # 4
++
++      vpmuludq        $H3,$S4,$H3             # h3*s4
++      vpmuludq        $H4,$S4,$H4             # h4*s4
++       vpunpcklqdq    $T1,$T0,$T0             # 0:1
++      vpaddq          $H3,$D2,$H2             # h2 = d2 + h3*s4
++      vpaddq          $H4,$D3,$H3             # h3 = d3 + h4*s4
++       vpunpcklqdq    $T3,$T2,$T3             # 2:3
++      vpmuludq        `32*7-0x90`(%rax),$H0,$H4       # h0*r4
++      vpmuludq        $H1,$S4,$H0             # h1*s4
++      vmovdqa         64(%rcx),$MASK          # .Lmask26
++      vpaddq          $H4,$D4,$H4             # h4 = d4 + h0*r4
++      vpaddq          $H0,$D0,$H0             # h0 = d0 + h1*s4
++
++      ################################################################
++      # lazy reduction (interleaved with tail of input splat)
++
++      vpsrlq          \$26,$H3,$D3
++      vpand           $MASK,$H3,$H3
++      vpaddq          $D3,$H4,$H4             # h3 -> h4
++
++      vpsrlq          \$26,$H0,$D0
++      vpand           $MASK,$H0,$H0
++      vpaddq          $D0,$D1,$H1             # h0 -> h1
++
++      vpsrlq          \$26,$H4,$D4
++      vpand           $MASK,$H4,$H4
++
++       vpsrlq         \$4,$T3,$T2
++
++      vpsrlq          \$26,$H1,$D1
++      vpand           $MASK,$H1,$H1
++      vpaddq          $D1,$H2,$H2             # h1 -> h2
++
++      vpaddq          $D4,$H0,$H0
++      vpsllq          \$2,$D4,$D4
++      vpaddq          $D4,$H0,$H0             # h4 -> h0
++
++       vpand          $MASK,$T2,$T2           # 2
++       vpsrlq         \$26,$T0,$T1
++
++      vpsrlq          \$26,$H2,$D2
++      vpand           $MASK,$H2,$H2
++      vpaddq          $D2,$H3,$H3             # h2 -> h3
++
++       vpaddq         $T2,$H2,$H2             # modulo-scheduled
++       vpsrlq         \$30,$T3,$T3
++
++      vpsrlq          \$26,$H0,$D0
++      vpand           $MASK,$H0,$H0
++      vpaddq          $D0,$H1,$H1             # h0 -> h1
++
++       vpsrlq         \$40,$T4,$T4            # 4
++
++      vpsrlq          \$26,$H3,$D3
++      vpand           $MASK,$H3,$H3
++      vpaddq          $D3,$H4,$H4             # h3 -> h4
++
++       vpand          $MASK,$T0,$T0           # 0
++       vpand          $MASK,$T1,$T1           # 1
++       vpand          $MASK,$T3,$T3           # 3
++       vpor           32(%rcx),$T4,$T4        # padbit, yes, always
++
++      sub             \$64,$len
++      jnz             .Loop_avx2
++
++      .byte           0x66,0x90
++.Ltail_avx2:
++      ################################################################
++      # while above multiplications were by r^4 in all lanes, in last
++      # iteration we multiply least significant lane by r^4 and most
++      # significant one by r, so copy of above except that references
++      # to the precomputed table are displaced by 4...
++
++      #vpaddq         $H2,$T2,$H2             # accumulate input
++      vpaddq          $H0,$T0,$H0
++      vmovdqu         `32*0+4`(%rsp),$T0      # r0^4
++      vpaddq          $H1,$T1,$H1
++      vmovdqu         `32*1+4`(%rsp),$T1      # r1^4
++      vpaddq          $H3,$T3,$H3
++      vmovdqu         `32*3+4`(%rsp),$T2      # r2^4
++      vpaddq          $H4,$T4,$H4
++      vmovdqu         `32*6+4-0x90`(%rax),$T3 # s3^4
++      vmovdqu         `32*8+4-0x90`(%rax),$S4 # s4^4
++
++      vpmuludq        $H2,$T0,$D2             # d2 = h2*r0
++      vpmuludq        $H2,$T1,$D3             # d3 = h2*r1
++      vpmuludq        $H2,$T2,$D4             # d4 = h2*r2
++      vpmuludq        $H2,$T3,$D0             # d0 = h2*s3
++      vpmuludq        $H2,$S4,$D1             # d1 = h2*s4
++
++      vpmuludq        $H0,$T1,$T4             # h0*r1
++      vpmuludq        $H1,$T1,$H2             # h1*r1
++      vpaddq          $T4,$D1,$D1             # d1 += h0*r1
++      vpaddq          $H2,$D2,$D2             # d2 += h1*r1
++      vpmuludq        $H3,$T1,$T4             # h3*r1
++      vpmuludq        `32*2+4`(%rsp),$H4,$H2  # h4*s1
++      vpaddq          $T4,$D4,$D4             # d4 += h3*r1
++      vpaddq          $H2,$D0,$D0             # d0 += h4*s1
++
++      vpmuludq        $H0,$T0,$T4             # h0*r0
++      vpmuludq        $H1,$T0,$H2             # h1*r0
++      vpaddq          $T4,$D0,$D0             # d0 += h0*r0
++       vmovdqu        `32*4+4-0x90`(%rax),$T1 # s2
++      vpaddq          $H2,$D1,$D1             # d1 += h1*r0
++      vpmuludq        $H3,$T0,$T4             # h3*r0
++      vpmuludq        $H4,$T0,$H2             # h4*r0
++      vpaddq          $T4,$D3,$D3             # d3 += h3*r0
++      vpaddq          $H2,$D4,$D4             # d4 += h4*r0
++
++      vpmuludq        $H3,$T1,$T4             # h3*s2
++      vpmuludq        $H4,$T1,$H2             # h4*s2
++      vpaddq          $T4,$D0,$D0             # d0 += h3*s2
++      vpaddq          $H2,$D1,$D1             # d1 += h4*s2
++       vmovdqu        `32*5+4-0x90`(%rax),$H2 # r3
++      vpmuludq        $H1,$T2,$T4             # h1*r2
++      vpmuludq        $H0,$T2,$T2             # h0*r2
++      vpaddq          $T4,$D3,$D3             # d3 += h1*r2
++      vpaddq          $T2,$D2,$D2             # d2 += h0*r2
++
++      vpmuludq        $H1,$H2,$T4             # h1*r3
++      vpmuludq        $H0,$H2,$H2             # h0*r3
++      vpaddq          $T4,$D4,$D4             # d4 += h1*r3
++      vpaddq          $H2,$D3,$D3             # d3 += h0*r3
++      vpmuludq        $H3,$T3,$T4             # h3*s3
++      vpmuludq        $H4,$T3,$H2             # h4*s3
++      vpaddq          $T4,$D1,$D1             # d1 += h3*s3
++      vpaddq          $H2,$D2,$D2             # d2 += h4*s3
++
++      vpmuludq        $H3,$S4,$H3             # h3*s4
++      vpmuludq        $H4,$S4,$H4             # h4*s4
++      vpaddq          $H3,$D2,$H2             # h2 = d2 + h3*s4
++      vpaddq          $H4,$D3,$H3             # h3 = d3 + h4*s4
++      vpmuludq        `32*7+4-0x90`(%rax),$H0,$H4             # h0*r4
++      vpmuludq        $H1,$S4,$H0             # h1*s4
++      vmovdqa         64(%rcx),$MASK          # .Lmask26
++      vpaddq          $H4,$D4,$H4             # h4 = d4 + h0*r4
++      vpaddq          $H0,$D0,$H0             # h0 = d0 + h1*s4
++
++      ################################################################
++      # horizontal addition
++
++      vpsrldq         \$8,$D1,$T1
++      vpsrldq         \$8,$H2,$T2
++      vpsrldq         \$8,$H3,$T3
++      vpsrldq         \$8,$H4,$T4
++      vpsrldq         \$8,$H0,$T0
++      vpaddq          $T1,$D1,$D1
++      vpaddq          $T2,$H2,$H2
++      vpaddq          $T3,$H3,$H3
++      vpaddq          $T4,$H4,$H4
++      vpaddq          $T0,$H0,$H0
++
++      vpermq          \$0x2,$H3,$T3
++      vpermq          \$0x2,$H4,$T4
++      vpermq          \$0x2,$H0,$T0
++      vpermq          \$0x2,$D1,$T1
++      vpermq          \$0x2,$H2,$T2
++      vpaddq          $T3,$H3,$H3
++      vpaddq          $T4,$H4,$H4
++      vpaddq          $T0,$H0,$H0
++      vpaddq          $T1,$D1,$D1
++      vpaddq          $T2,$H2,$H2
++
++      ################################################################
++      # lazy reduction
++
++      vpsrlq          \$26,$H3,$D3
++      vpand           $MASK,$H3,$H3
++      vpaddq          $D3,$H4,$H4             # h3 -> h4
++
++      vpsrlq          \$26,$H0,$D0
++      vpand           $MASK,$H0,$H0
++      vpaddq          $D0,$D1,$H1             # h0 -> h1
++
++      vpsrlq          \$26,$H4,$D4
++      vpand           $MASK,$H4,$H4
++
++      vpsrlq          \$26,$H1,$D1
++      vpand           $MASK,$H1,$H1
++      vpaddq          $D1,$H2,$H2             # h1 -> h2
++
++      vpaddq          $D4,$H0,$H0
++      vpsllq          \$2,$D4,$D4
++      vpaddq          $D4,$H0,$H0             # h4 -> h0
++
++      vpsrlq          \$26,$H2,$D2
++      vpand           $MASK,$H2,$H2
++      vpaddq          $D2,$H3,$H3             # h2 -> h3
++
++      vpsrlq          \$26,$H0,$D0
++      vpand           $MASK,$H0,$H0
++      vpaddq          $D0,$H1,$H1             # h0 -> h1
++
++      vpsrlq          \$26,$H3,$D3
++      vpand           $MASK,$H3,$H3
++      vpaddq          $D3,$H4,$H4             # h3 -> h4
++
++      vmovd           %x#$H0,`4*0-48-64`($ctx)# save partially reduced
++      vmovd           %x#$H1,`4*1-48-64`($ctx)
++      vmovd           %x#$H2,`4*2-48-64`($ctx)
++      vmovd           %x#$H3,`4*3-48-64`($ctx)
++      vmovd           %x#$H4,`4*4-48-64`($ctx)
++___
++$code.=<<___  if ($win64);
++      vmovdqa         0x50(%r11),%xmm6
++      vmovdqa         0x60(%r11),%xmm7
++      vmovdqa         0x70(%r11),%xmm8
++      vmovdqa         0x80(%r11),%xmm9
++      vmovdqa         0x90(%r11),%xmm10
++      vmovdqa         0xa0(%r11),%xmm11
++      vmovdqa         0xb0(%r11),%xmm12
++      vmovdqa         0xc0(%r11),%xmm13
++      vmovdqa         0xd0(%r11),%xmm14
++      vmovdqa         0xe0(%r11),%xmm15
++      lea             0xf8(%r11),%rsp
++.Ldo_avx2_epilogue:
++___
++$code.=<<___  if (!$win64);
++      lea             8(%r11),%rsp
++.cfi_def_cfa          %rsp,8
++___
++$code.=<<___;
++      vzeroupper
++      ret
++.cfi_endproc
++.size poly1305_blocks_avx2,.-poly1305_blocks_avx2
++___
++#######################################################################
++if ($avx>2) {
++# On entry the input length is divisible by 64. But since the inner loop
++# processes 128 bytes per iteration, lengths that are not divisible by
++# 128 are handled by passing the trailing 64 bytes to .Ltail_avx2. For
++# this reason the stack layout is kept identical to poly1305_blocks_avx2.
++# If not for this tail, we wouldn't even have to allocate a stack frame...
++
++my ($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4) = map("%zmm$_",(16..24));
++my ($M0,$M1,$M2,$M3,$M4) = map("%zmm$_",(25..29));
++my $PADBIT="%zmm30";
++
++map(s/%y/%z/,($T4,$T0,$T1,$T2,$T3));          # switch to %zmm domain
++map(s/%y/%z/,($D0,$D1,$D2,$D3,$D4));
++map(s/%y/%z/,($H0,$H1,$H2,$H3,$H4));
++map(s/%y/%z/,($MASK));
++
++$code.=<<___;
++.type poly1305_blocks_avx512,\@function,4
++.align        32
++poly1305_blocks_avx512:
++.cfi_startproc
++.Lblocks_avx512:
++      mov             \$15,%eax
++      kmovw           %eax,%k2
++___
++$code.=<<___  if (!$win64);
++      lea             -8(%rsp),%r11
++.cfi_def_cfa          %r11,16
++      sub             \$0x128,%rsp
++___
++$code.=<<___  if ($win64);
++      lea             -0xf8(%rsp),%r11
++      sub             \$0x1c8,%rsp
++      vmovdqa         %xmm6,0x50(%r11)
++      vmovdqa         %xmm7,0x60(%r11)
++      vmovdqa         %xmm8,0x70(%r11)
++      vmovdqa         %xmm9,0x80(%r11)
++      vmovdqa         %xmm10,0x90(%r11)
++      vmovdqa         %xmm11,0xa0(%r11)
++      vmovdqa         %xmm12,0xb0(%r11)
++      vmovdqa         %xmm13,0xc0(%r11)
++      vmovdqa         %xmm14,0xd0(%r11)
++      vmovdqa         %xmm15,0xe0(%r11)
++.Ldo_avx512_body:
++___
++$code.=<<___;
++      lea             .Lconst(%rip),%rcx
++      lea             48+64($ctx),$ctx        # size optimization
++      vmovdqa         96(%rcx),%y#$T2         # .Lpermd_avx2
++
++      # expand pre-calculated table
++      vmovdqu         `16*0-64`($ctx),%x#$D0  # will become expanded ${R0}
++      and             \$-512,%rsp
++      vmovdqu         `16*1-64`($ctx),%x#$D1  # will become ... ${R1}
++      mov             \$0x20,%rax
++      vmovdqu         `16*2-64`($ctx),%x#$T0  # ... ${S1}
++      vmovdqu         `16*3-64`($ctx),%x#$D2  # ... ${R2}
++      vmovdqu         `16*4-64`($ctx),%x#$T1  # ... ${S2}
++      vmovdqu         `16*5-64`($ctx),%x#$D3  # ... ${R3}
++      vmovdqu         `16*6-64`($ctx),%x#$T3  # ... ${S3}
++      vmovdqu         `16*7-64`($ctx),%x#$D4  # ... ${R4}
++      vmovdqu         `16*8-64`($ctx),%x#$T4  # ... ${S4}
++      vpermd          $D0,$T2,$R0             # 00003412 -> 14243444
++      vpbroadcastq    64(%rcx),$MASK          # .Lmask26
++      vpermd          $D1,$T2,$R1
++      vpermd          $T0,$T2,$S1
++      vpermd          $D2,$T2,$R2
++      vmovdqa64       $R0,0x00(%rsp){%k2}     # save in case $len%128 != 0
++       vpsrlq         \$32,$R0,$T0            # 14243444 -> 01020304
++      vpermd          $T1,$T2,$S2
++      vmovdqu64       $R1,0x00(%rsp,%rax){%k2}
++       vpsrlq         \$32,$R1,$T1
++      vpermd          $D3,$T2,$R3
++      vmovdqa64       $S1,0x40(%rsp){%k2}
++      vpermd          $T3,$T2,$S3
++      vpermd          $D4,$T2,$R4
++      vmovdqu64       $R2,0x40(%rsp,%rax){%k2}
++      vpermd          $T4,$T2,$S4
++      vmovdqa64       $S2,0x80(%rsp){%k2}
++      vmovdqu64       $R3,0x80(%rsp,%rax){%k2}
++      vmovdqa64       $S3,0xc0(%rsp){%k2}
++      vmovdqu64       $R4,0xc0(%rsp,%rax){%k2}
++      vmovdqa64       $S4,0x100(%rsp){%k2}
++
++      ################################################################
++      # calculate 5th through 8th powers of the key
++      #
++      # d0 = r0'*r0 + r1'*5*r4 + r2'*5*r3 + r3'*5*r2 + r4'*5*r1
++      # d1 = r0'*r1 + r1'*r0   + r2'*5*r4 + r3'*5*r3 + r4'*5*r2
++      # d2 = r0'*r2 + r1'*r1   + r2'*r0   + r3'*5*r4 + r4'*5*r3
++      # d3 = r0'*r3 + r1'*r2   + r2'*r1   + r3'*r0   + r4'*5*r4
++      # d4 = r0'*r4 + r1'*r3   + r2'*r2   + r3'*r1   + r4'*r0
++
++      vpmuludq        $T0,$R0,$D0             # d0 = r0'*r0
++      vpmuludq        $T0,$R1,$D1             # d1 = r0'*r1
++      vpmuludq        $T0,$R2,$D2             # d2 = r0'*r2
++      vpmuludq        $T0,$R3,$D3             # d3 = r0'*r3
++      vpmuludq        $T0,$R4,$D4             # d4 = r0'*r4
++       vpsrlq         \$32,$R2,$T2
++
++      vpmuludq        $T1,$S4,$M0
++      vpmuludq        $T1,$R0,$M1
++      vpmuludq        $T1,$R1,$M2
++      vpmuludq        $T1,$R2,$M3
++      vpmuludq        $T1,$R3,$M4
++       vpsrlq         \$32,$R3,$T3
++      vpaddq          $M0,$D0,$D0             # d0 += r1'*5*r4
++      vpaddq          $M1,$D1,$D1             # d1 += r1'*r0
++      vpaddq          $M2,$D2,$D2             # d2 += r1'*r1
++      vpaddq          $M3,$D3,$D3             # d3 += r1'*r2
++      vpaddq          $M4,$D4,$D4             # d4 += r1'*r3
++
++      vpmuludq        $T2,$S3,$M0
++      vpmuludq        $T2,$S4,$M1
++      vpmuludq        $T2,$R1,$M3
++      vpmuludq        $T2,$R2,$M4
++      vpmuludq        $T2,$R0,$M2
++       vpsrlq         \$32,$R4,$T4
++      vpaddq          $M0,$D0,$D0             # d0 += r2'*5*r3
++      vpaddq          $M1,$D1,$D1             # d1 += r2'*5*r4
++      vpaddq          $M3,$D3,$D3             # d3 += r2'*r1
++      vpaddq          $M4,$D4,$D4             # d4 += r2'*r2
++      vpaddq          $M2,$D2,$D2             # d2 += r2'*r0
++
++      vpmuludq        $T3,$S2,$M0
++      vpmuludq        $T3,$R0,$M3
++      vpmuludq        $T3,$R1,$M4
++      vpmuludq        $T3,$S3,$M1
++      vpmuludq        $T3,$S4,$M2
++      vpaddq          $M0,$D0,$D0             # d0 += r3'*5*r2
++      vpaddq          $M3,$D3,$D3             # d3 += r3'*r0
++      vpaddq          $M4,$D4,$D4             # d4 += r3'*r1
++      vpaddq          $M1,$D1,$D1             # d1 += r3'*5*r3
++      vpaddq          $M2,$D2,$D2             # d2 += r3'*5*r4
++
++      vpmuludq        $T4,$S4,$M3
++      vpmuludq        $T4,$R0,$M4
++      vpmuludq        $T4,$S1,$M0
++      vpmuludq        $T4,$S2,$M1
++      vpmuludq        $T4,$S3,$M2
++      vpaddq          $M3,$D3,$D3             # d3 += r4'*5*r4
++      vpaddq          $M4,$D4,$D4             # d4 += r4'*r0
++      vpaddq          $M0,$D0,$D0             # d0 += r4'*5*r1
++      vpaddq          $M1,$D1,$D1             # d1 += r4'*5*r2
++      vpaddq          $M2,$D2,$D2             # d2 += r4'*5*r3
++
++      ################################################################
++      # load input
++      vmovdqu64       16*0($inp),%z#$T3
++      vmovdqu64       16*4($inp),%z#$T4
++      lea             16*8($inp),$inp
++
++      ################################################################
++      # lazy reduction
++
++      vpsrlq          \$26,$D3,$M3
++      vpandq          $MASK,$D3,$D3
++      vpaddq          $M3,$D4,$D4             # d3 -> d4
++
++      vpsrlq          \$26,$D0,$M0
++      vpandq          $MASK,$D0,$D0
++      vpaddq          $M0,$D1,$D1             # d0 -> d1
++
++      vpsrlq          \$26,$D4,$M4
++      vpandq          $MASK,$D4,$D4
++
++      vpsrlq          \$26,$D1,$M1
++      vpandq          $MASK,$D1,$D1
++      vpaddq          $M1,$D2,$D2             # d1 -> d2
++
++      vpaddq          $M4,$D0,$D0
++      vpsllq          \$2,$M4,$M4
++      vpaddq          $M4,$D0,$D0             # d4 -> d0
++
++      vpsrlq          \$26,$D2,$M2
++      vpandq          $MASK,$D2,$D2
++      vpaddq          $M2,$D3,$D3             # d2 -> d3
++
++      vpsrlq          \$26,$D0,$M0
++      vpandq          $MASK,$D0,$D0
++      vpaddq          $M0,$D1,$D1             # d0 -> d1
++
++      vpsrlq          \$26,$D3,$M3
++      vpandq          $MASK,$D3,$D3
++      vpaddq          $M3,$D4,$D4             # d3 -> d4
++
++      ################################################################
++      # at this point we have 14243444 in $R0-$S4 and 05060708 in
++      # $D0-$D4, ...
++
++      vpunpcklqdq     $T4,$T3,$T0     # transpose input
++      vpunpckhqdq     $T4,$T3,$T4
++
++      # ... since input 64-bit lanes are ordered as 73625140, we could
++      # "vperm" it to 76543210 (here and in each loop iteration), *or*
++      # we could just flow along, hence the goal for $R0-$S4 is
++      # 1858286838784888 ...
++
++      vmovdqa32       128(%rcx),$M0           # .Lpermd_avx512:
++      mov             \$0x7777,%eax
++      kmovw           %eax,%k1
++
++      vpermd          $R0,$M0,$R0             # 14243444 -> 1---2---3---4---
++      vpermd          $R1,$M0,$R1
++      vpermd          $R2,$M0,$R2
++      vpermd          $R3,$M0,$R3
++      vpermd          $R4,$M0,$R4
++
++      vpermd          $D0,$M0,${R0}{%k1}      # 05060708 -> 1858286838784888
++      vpermd          $D1,$M0,${R1}{%k1}
++      vpermd          $D2,$M0,${R2}{%k1}
++      vpermd          $D3,$M0,${R3}{%k1}
++      vpermd          $D4,$M0,${R4}{%k1}
++
++      vpslld          \$2,$R1,$S1             # *5
++      vpslld          \$2,$R2,$S2
++      vpslld          \$2,$R3,$S3
++      vpslld          \$2,$R4,$S4
++      vpaddd          $R1,$S1,$S1
++      vpaddd          $R2,$S2,$S2
++      vpaddd          $R3,$S3,$S3
++      vpaddd          $R4,$S4,$S4
++
++      vpbroadcastq    32(%rcx),$PADBIT        # .L129
++
++      vpsrlq          \$52,$T0,$T2            # splat input
++      vpsllq          \$12,$T4,$T3
++      vporq           $T3,$T2,$T2
++      vpsrlq          \$26,$T0,$T1
++      vpsrlq          \$14,$T4,$T3
++      vpsrlq          \$40,$T4,$T4            # 4
++      vpandq          $MASK,$T2,$T2           # 2
++      vpandq          $MASK,$T0,$T0           # 0
++      #vpandq         $MASK,$T1,$T1           # 1
++      #vpandq         $MASK,$T3,$T3           # 3
++      #vporq          $PADBIT,$T4,$T4         # padbit, yes, always
++
++      vpaddq          $H2,$T2,$H2             # accumulate input
++      sub             \$192,$len
++      jbe             .Ltail_avx512
++      jmp             .Loop_avx512
++
++.align        32
++.Loop_avx512:
++      ################################################################
++      # ((inp[0]*r^8+inp[ 8])*r^8+inp[16])*r^8
++      # ((inp[1]*r^8+inp[ 9])*r^8+inp[17])*r^7
++      # ((inp[2]*r^8+inp[10])*r^8+inp[18])*r^6
++      # ((inp[3]*r^8+inp[11])*r^8+inp[19])*r^5
++      # ((inp[4]*r^8+inp[12])*r^8+inp[20])*r^4
++      # ((inp[5]*r^8+inp[13])*r^8+inp[21])*r^3
++      # ((inp[6]*r^8+inp[14])*r^8+inp[22])*r^2
++      # ((inp[7]*r^8+inp[15])*r^8+inp[23])*r^1
++      #   \________/\___________/
++      ################################################################
++      #vpaddq         $H2,$T2,$H2             # accumulate input
++
++      # d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
++      # d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
++      # d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
++      # d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
++      # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
++      #
++      # however, as h2 is "chronologically" first one available pull
++      # corresponding operations up, so it's
++      #
++      # d3 = h2*r1   + h0*r3 + h1*r2   + h3*r0 + h4*5*r4
++      # d4 = h2*r2   + h0*r4 + h1*r3   + h3*r1 + h4*r0
++      # d0 = h2*5*r3 + h0*r0 + h1*5*r4         + h3*5*r2 + h4*5*r1
++      # d1 = h2*5*r4 + h0*r1           + h1*r0 + h3*5*r3 + h4*5*r2
++      # d2 = h2*r0           + h0*r2   + h1*r1 + h3*5*r4 + h4*5*r3
++
++      vpmuludq        $H2,$R1,$D3             # d3 = h2*r1
++       vpaddq         $H0,$T0,$H0
++      vpmuludq        $H2,$R2,$D4             # d4 = h2*r2
++       vpandq         $MASK,$T1,$T1           # 1
++      vpmuludq        $H2,$S3,$D0             # d0 = h2*s3
++       vpandq         $MASK,$T3,$T3           # 3
++      vpmuludq        $H2,$S4,$D1             # d1 = h2*s4
++       vporq          $PADBIT,$T4,$T4         # padbit, yes, always
++      vpmuludq        $H2,$R0,$D2             # d2 = h2*r0
++       vpaddq         $H1,$T1,$H1             # accumulate input
++       vpaddq         $H3,$T3,$H3
++       vpaddq         $H4,$T4,$H4
++
++        vmovdqu64     16*0($inp),$T3          # load input
++        vmovdqu64     16*4($inp),$T4
++        lea           16*8($inp),$inp
++      vpmuludq        $H0,$R3,$M3
++      vpmuludq        $H0,$R4,$M4
++      vpmuludq        $H0,$R0,$M0
++      vpmuludq        $H0,$R1,$M1
++      vpaddq          $M3,$D3,$D3             # d3 += h0*r3
++      vpaddq          $M4,$D4,$D4             # d4 += h0*r4
++      vpaddq          $M0,$D0,$D0             # d0 += h0*r0
++      vpaddq          $M1,$D1,$D1             # d1 += h0*r1
++
++      vpmuludq        $H1,$R2,$M3
++      vpmuludq        $H1,$R3,$M4
++      vpmuludq        $H1,$S4,$M0
++      vpmuludq        $H0,$R2,$M2
++      vpaddq          $M3,$D3,$D3             # d3 += h1*r2
++      vpaddq          $M4,$D4,$D4             # d4 += h1*r3
++      vpaddq          $M0,$D0,$D0             # d0 += h1*s4
++      vpaddq          $M2,$D2,$D2             # d2 += h0*r2
++
++        vpunpcklqdq   $T4,$T3,$T0             # transpose input
++        vpunpckhqdq   $T4,$T3,$T4
++
++      vpmuludq        $H3,$R0,$M3
++      vpmuludq        $H3,$R1,$M4
++      vpmuludq        $H1,$R0,$M1
++      vpmuludq        $H1,$R1,$M2
++      vpaddq          $M3,$D3,$D3             # d3 += h3*r0
++      vpaddq          $M4,$D4,$D4             # d4 += h3*r1
++      vpaddq          $M1,$D1,$D1             # d1 += h1*r0
++      vpaddq          $M2,$D2,$D2             # d2 += h1*r1
++
++      vpmuludq        $H4,$S4,$M3
++      vpmuludq        $H4,$R0,$M4
++      vpmuludq        $H3,$S2,$M0
++      vpmuludq        $H3,$S3,$M1
++      vpaddq          $M3,$D3,$D3             # d3 += h4*s4
++      vpmuludq        $H3,$S4,$M2
++      vpaddq          $M4,$D4,$D4             # d4 += h4*r0
++      vpaddq          $M0,$D0,$D0             # d0 += h3*s2
++      vpaddq          $M1,$D1,$D1             # d1 += h3*s3
++      vpaddq          $M2,$D2,$D2             # d2 += h3*s4
++
++      vpmuludq        $H4,$S1,$M0
++      vpmuludq        $H4,$S2,$M1
++      vpmuludq        $H4,$S3,$M2
++      vpaddq          $M0,$D0,$H0             # h0 = d0 + h4*s1
++      vpaddq          $M1,$D1,$H1             # h1 = d1 + h4*s2
++      vpaddq          $M2,$D2,$H2             # h2 = d2 + h4*s3
++
++      ################################################################
++      # lazy reduction (interleaved with input splat)
++
++       vpsrlq         \$52,$T0,$T2            # splat input
++       vpsllq         \$12,$T4,$T3
++
++      vpsrlq          \$26,$D3,$H3
++      vpandq          $MASK,$D3,$D3
++      vpaddq          $H3,$D4,$H4             # h3 -> h4
++
++       vporq          $T3,$T2,$T2
++
++      vpsrlq          \$26,$H0,$D0
++      vpandq          $MASK,$H0,$H0
++      vpaddq          $D0,$H1,$H1             # h0 -> h1
++
++       vpandq         $MASK,$T2,$T2           # 2
++
++      vpsrlq          \$26,$H4,$D4
++      vpandq          $MASK,$H4,$H4
++
++      vpsrlq          \$26,$H1,$D1
++      vpandq          $MASK,$H1,$H1
++      vpaddq          $D1,$H2,$H2             # h1 -> h2
++
++      vpaddq          $D4,$H0,$H0
++      vpsllq          \$2,$D4,$D4
++      vpaddq          $D4,$H0,$H0             # h4 -> h0
++
++       vpaddq         $T2,$H2,$H2             # modulo-scheduled
++       vpsrlq         \$26,$T0,$T1
++
++      vpsrlq          \$26,$H2,$D2
++      vpandq          $MASK,$H2,$H2
++      vpaddq          $D2,$D3,$H3             # h2 -> h3
++
++       vpsrlq         \$14,$T4,$T3
++
++      vpsrlq          \$26,$H0,$D0
++      vpandq          $MASK,$H0,$H0
++      vpaddq          $D0,$H1,$H1             # h0 -> h1
++
++       vpsrlq         \$40,$T4,$T4            # 4
++
++      vpsrlq          \$26,$H3,$D3
++      vpandq          $MASK,$H3,$H3
++      vpaddq          $D3,$H4,$H4             # h3 -> h4
++
++       vpandq         $MASK,$T0,$T0           # 0
++       #vpandq        $MASK,$T1,$T1           # 1
++       #vpandq        $MASK,$T3,$T3           # 3
++       #vporq         $PADBIT,$T4,$T4         # padbit, yes, always
++
++      sub             \$128,$len
++      ja              .Loop_avx512
++
++.Ltail_avx512:
++      ################################################################
++      # while above multiplications were by r^8 in all lanes, in last
++      # iteration we multiply least significant lane by r^8 and most
++      # significant one by r, that's why table gets shifted...
++
++      vpsrlq          \$32,$R0,$R0            # 0105020603070408
++      vpsrlq          \$32,$R1,$R1
++      vpsrlq          \$32,$R2,$R2
++      vpsrlq          \$32,$S3,$S3
++      vpsrlq          \$32,$S4,$S4
++      vpsrlq          \$32,$R3,$R3
++      vpsrlq          \$32,$R4,$R4
++      vpsrlq          \$32,$S1,$S1
++      vpsrlq          \$32,$S2,$S2
++
++      ################################################################
++      # load either next or last 64 byte of input
++      lea             ($inp,$len),$inp
++
++      #vpaddq         $H2,$T2,$H2             # accumulate input
++      vpaddq          $H0,$T0,$H0
++
++      vpmuludq        $H2,$R1,$D3             # d3 = h2*r1
++      vpmuludq        $H2,$R2,$D4             # d4 = h2*r2
++      vpmuludq        $H2,$S3,$D0             # d0 = h2*s3
++       vpandq         $MASK,$T1,$T1           # 1
++      vpmuludq        $H2,$S4,$D1             # d1 = h2*s4
++       vpandq         $MASK,$T3,$T3           # 3
++      vpmuludq        $H2,$R0,$D2             # d2 = h2*r0
++       vporq          $PADBIT,$T4,$T4         # padbit, yes, always
++       vpaddq         $H1,$T1,$H1             # accumulate input
++       vpaddq         $H3,$T3,$H3
++       vpaddq         $H4,$T4,$H4
++
++        vmovdqu       16*0($inp),%x#$T0
++      vpmuludq        $H0,$R3,$M3
++      vpmuludq        $H0,$R4,$M4
++      vpmuludq        $H0,$R0,$M0
++      vpmuludq        $H0,$R1,$M1
++      vpaddq          $M3,$D3,$D3             # d3 += h0*r3
++      vpaddq          $M4,$D4,$D4             # d4 += h0*r4
++      vpaddq          $M0,$D0,$D0             # d0 += h0*r0
++      vpaddq          $M1,$D1,$D1             # d1 += h0*r1
++
++        vmovdqu       16*1($inp),%x#$T1
++      vpmuludq        $H1,$R2,$M3
++      vpmuludq        $H1,$R3,$M4
++      vpmuludq        $H1,$S4,$M0
++      vpmuludq        $H0,$R2,$M2
++      vpaddq          $M3,$D3,$D3             # d3 += h1*r2
++      vpaddq          $M4,$D4,$D4             # d4 += h1*r3
++      vpaddq          $M0,$D0,$D0             # d0 += h1*s4
++      vpaddq          $M2,$D2,$D2             # d2 += h0*r2
++
++        vinserti128   \$1,16*2($inp),%y#$T0,%y#$T0
++      vpmuludq        $H3,$R0,$M3
++      vpmuludq        $H3,$R1,$M4
++      vpmuludq        $H1,$R0,$M1
++      vpmuludq        $H1,$R1,$M2
++      vpaddq          $M3,$D3,$D3             # d3 += h3*r0
++      vpaddq          $M4,$D4,$D4             # d4 += h3*r1
++      vpaddq          $M1,$D1,$D1             # d1 += h1*r0
++      vpaddq          $M2,$D2,$D2             # d2 += h1*r1
++
++        vinserti128   \$1,16*3($inp),%y#$T1,%y#$T1
++      vpmuludq        $H4,$S4,$M3
++      vpmuludq        $H4,$R0,$M4
++      vpmuludq        $H3,$S2,$M0
++      vpmuludq        $H3,$S3,$M1
++      vpmuludq        $H3,$S4,$M2
++      vpaddq          $M3,$D3,$H3             # h3 = d3 + h4*s4
++      vpaddq          $M4,$D4,$D4             # d4 += h4*r0
++      vpaddq          $M0,$D0,$D0             # d0 += h3*s2
++      vpaddq          $M1,$D1,$D1             # d1 += h3*s3
++      vpaddq          $M2,$D2,$D2             # d2 += h3*s4
++
++      vpmuludq        $H4,$S1,$M0
++      vpmuludq        $H4,$S2,$M1
++      vpmuludq        $H4,$S3,$M2
++      vpaddq          $M0,$D0,$H0             # h0 = d0 + h4*s1
++      vpaddq          $M1,$D1,$H1             # h1 = d1 + h4*s2
++      vpaddq          $M2,$D2,$H2             # h2 = d2 + h4*s3
++
++      ################################################################
++      # horizontal addition
++
++      mov             \$1,%eax
++      vpermq          \$0xb1,$H3,$D3
++      vpermq          \$0xb1,$D4,$H4
++      vpermq          \$0xb1,$H0,$D0
++      vpermq          \$0xb1,$H1,$D1
++      vpermq          \$0xb1,$H2,$D2
++      vpaddq          $D3,$H3,$H3
++      vpaddq          $D4,$H4,$H4
++      vpaddq          $D0,$H0,$H0
++      vpaddq          $D1,$H1,$H1
++      vpaddq          $D2,$H2,$H2
++
++      kmovw           %eax,%k3
++      vpermq          \$0x2,$H3,$D3
++      vpermq          \$0x2,$H4,$D4
++      vpermq          \$0x2,$H0,$D0
++      vpermq          \$0x2,$H1,$D1
++      vpermq          \$0x2,$H2,$D2
++      vpaddq          $D3,$H3,$H3
++      vpaddq          $D4,$H4,$H4
++      vpaddq          $D0,$H0,$H0
++      vpaddq          $D1,$H1,$H1
++      vpaddq          $D2,$H2,$H2
++
++      vextracti64x4   \$0x1,$H3,%y#$D3
++      vextracti64x4   \$0x1,$H4,%y#$D4
++      vextracti64x4   \$0x1,$H0,%y#$D0
++      vextracti64x4   \$0x1,$H1,%y#$D1
++      vextracti64x4   \$0x1,$H2,%y#$D2
++      vpaddq          $D3,$H3,${H3}{%k3}{z}   # keep single qword in case
++      vpaddq          $D4,$H4,${H4}{%k3}{z}   # it's passed to .Ltail_avx2
++      vpaddq          $D0,$H0,${H0}{%k3}{z}
++      vpaddq          $D1,$H1,${H1}{%k3}{z}
++      vpaddq          $D2,$H2,${H2}{%k3}{z}
++___
++map(s/%z/%y/,($T0,$T1,$T2,$T3,$T4, $PADBIT));
++map(s/%z/%y/,($H0,$H1,$H2,$H3,$H4, $D0,$D1,$D2,$D3,$D4, $MASK));
++$code.=<<___;
++      ################################################################
++      # lazy reduction (interleaved with input splat)
++
++      vpsrlq          \$26,$H3,$D3
++      vpand           $MASK,$H3,$H3
++       vpsrldq        \$6,$T0,$T2             # splat input
++       vpsrldq        \$6,$T1,$T3
++       vpunpckhqdq    $T1,$T0,$T4             # 4
++      vpaddq          $D3,$H4,$H4             # h3 -> h4
++
++      vpsrlq          \$26,$H0,$D0
++      vpand           $MASK,$H0,$H0
++       vpunpcklqdq    $T3,$T2,$T2             # 2:3
++       vpunpcklqdq    $T1,$T0,$T0             # 0:1
++      vpaddq          $D0,$H1,$H1             # h0 -> h1
++
++      vpsrlq          \$26,$H4,$D4
++      vpand           $MASK,$H4,$H4
++
++      vpsrlq          \$26,$H1,$D1
++      vpand           $MASK,$H1,$H1
++       vpsrlq         \$30,$T2,$T3
++       vpsrlq         \$4,$T2,$T2
++      vpaddq          $D1,$H2,$H2             # h1 -> h2
++
++      vpaddq          $D4,$H0,$H0
++      vpsllq          \$2,$D4,$D4
++       vpsrlq         \$26,$T0,$T1
++       vpsrlq         \$40,$T4,$T4            # 4
++      vpaddq          $D4,$H0,$H0             # h4 -> h0
++
++      vpsrlq          \$26,$H2,$D2
++      vpand           $MASK,$H2,$H2
++       vpand          $MASK,$T2,$T2           # 2
++       vpand          $MASK,$T0,$T0           # 0
++      vpaddq          $D2,$H3,$H3             # h2 -> h3
++
++      vpsrlq          \$26,$H0,$D0
++      vpand           $MASK,$H0,$H0
++       vpaddq         $H2,$T2,$H2             # accumulate input for .Ltail_avx2
++       vpand          $MASK,$T1,$T1           # 1
++      vpaddq          $D0,$H1,$H1             # h0 -> h1
++
++      vpsrlq          \$26,$H3,$D3
++      vpand           $MASK,$H3,$H3
++       vpand          $MASK,$T3,$T3           # 3
++       vpor           32(%rcx),$T4,$T4        # padbit, yes, always
++      vpaddq          $D3,$H4,$H4             # h3 -> h4
++
++      lea             0x90(%rsp),%rax         # size optimization for .Ltail_avx2
++      add             \$64,$len
++      jnz             .Ltail_avx2
++
++      vpsubq          $T2,$H2,$H2             # undo input accumulation
++      vmovd           %x#$H0,`4*0-48-64`($ctx)# save partially reduced
++      vmovd           %x#$H1,`4*1-48-64`($ctx)
++      vmovd           %x#$H2,`4*2-48-64`($ctx)
++      vmovd           %x#$H3,`4*3-48-64`($ctx)
++      vmovd           %x#$H4,`4*4-48-64`($ctx)
++      vzeroall
++___
++$code.=<<___  if ($win64);
++      movdqa          0x50(%r11),%xmm6
++      movdqa          0x60(%r11),%xmm7
++      movdqa          0x70(%r11),%xmm8
++      movdqa          0x80(%r11),%xmm9
++      movdqa          0x90(%r11),%xmm10
++      movdqa          0xa0(%r11),%xmm11
++      movdqa          0xb0(%r11),%xmm12
++      movdqa          0xc0(%r11),%xmm13
++      movdqa          0xd0(%r11),%xmm14
++      movdqa          0xe0(%r11),%xmm15
++      lea             0xf8(%r11),%rsp
++.Ldo_avx512_epilogue:
++___
++$code.=<<___  if (!$win64);
++      lea             8(%r11),%rsp
++.cfi_def_cfa          %rsp,8
++___
++$code.=<<___;
++      ret
++.cfi_endproc
++.size poly1305_blocks_avx512,.-poly1305_blocks_avx512
++___
++if ($avx>3) {
++########################################################################
++# VPMADD52 version using 2^44 radix.
++#
++# One can argue that base 2^52 would be more natural. Well, even though
++# some operations would be more natural, one has to recognize a couple
++# of things. First, base 2^52 doesn't provide an advantage over base 2^44
++# in the number of multiply-n-accumulate operations. Secondly, it makes
++# it impossible to pre-compute multiples of 5 [referred to as s[]/sN in
++# reference implementations], which means that more such operations
++# would have to be performed in the inner loop, which in turn makes the
++# critical path longer. In other words, even though base 2^44 reduction
++# might look less elegant, the overall critical path is actually shorter...
++
++########################################################################
++# Layout of opaque area is as follows.
++#
++#     unsigned __int64 h[3];          # current hash value base 2^44
++#     unsigned __int64 s[2];          # key value*20 base 2^44
++#     unsigned __int64 r[3];          # key value base 2^44
++#     struct { unsigned __int64 r^1, r^3, r^2, r^4; } R[4];
++#                                     # r^n positions reflect
++#                                     # placement in register, not
++#                                     # memory, R[3] is R[1]*20
++
++$code.=<<___;        # key setup for the base 2^44 (VPMADD52) code path
++.type poly1305_init_base2_44,\@function,3
++.align        32
++poly1305_init_base2_44:
++      xor     %rax,%rax
++      mov     %rax,0($ctx)            # initialize hash value h[0..2] to zero
++      mov     %rax,8($ctx)
++      mov     %rax,16($ctx)
++
++.Linit_base2_44:
++      lea     poly1305_blocks_vpmadd52(%rip),%r10     # blocks entry point
++      lea     poly1305_emit_base2_44(%rip),%r11       # emit entry point
++
++      mov     \$0x0ffffffc0fffffff,%rax       # clamp mask, low key qword
++      mov     \$0x0ffffffc0ffffffc,%rcx       # clamp mask, high key qword
++      and     0($inp),%rax
++      mov     \$0x00000fffffffffff,%r8        # 2^44-1
++      and     8($inp),%rcx
++      mov     \$0x00000fffffffffff,%r9        # 2^44-1
++      and     %rax,%r8
++      shrd    \$44,%rcx,%rax          # rax = clamped key bits 44..107
++      mov     %r8,40($ctx)            # r0 = key bits 0..43
++      and     %r9,%rax
++      shr     \$24,%rcx               # rcx = clamped key bits 88..127
++      mov     %rax,48($ctx)           # r1 = key bits 44..87
++      lea     (%rax,%rax,4),%rax      # *5
++      mov     %rcx,56($ctx)           # r2
++      shl     \$2,%rax                # *4 on top of *5, i.e. r1*20
++      lea     (%rcx,%rcx,4),%rcx      # *5
++      shl     \$2,%rcx                # *4 on top of *5, i.e. r2*20
++      mov     %rax,24($ctx)           # s1 = r1*20
++      mov     %rcx,32($ctx)           # s2 = r2*20
++      movq    \$-1,64($ctx)           # negative marker: key powers not computed yet
++___
++$code.=<<___  if ($flavour !~ /elf32/);
++      mov     %r10,0(%rdx)            # store the two entry points as 64-bit pointers
++      mov     %r11,8(%rdx)
++___
++$code.=<<___  if ($flavour =~ /elf32/);
++      mov     %r10d,0(%rdx)           # elf32 flavour: store them as 32-bit pointers
++      mov     %r11d,4(%rdx)
++___
++$code.=<<___;
++      mov     \$1,%eax                # return 1
++      ret
++.size poly1305_init_base2_44,.-poly1305_init_base2_44
++___
++{
++my ($H0,$H1,$H2,$r2r1r0,$r1r0s2,$r0s2s1,$Dlo,$Dhi) = map("%ymm$_",(0..5,16,17));
++my ($T0,$inp_permd,$inp_shift,$PAD) = map("%ymm$_",(18..21));
++my ($reduc_mask,$reduc_rght,$reduc_left) = map("%ymm$_",(22..25));   # 4 names generated, only the first 3 are bound
++
++$code.=<<___;        # one-block-per-iteration routine; hands the remaining blocks to the 4x code
++.type poly1305_blocks_vpmadd52,\@function,4
++.align        32
++poly1305_blocks_vpmadd52:
++      shr     \$4,$len                        # byte count -> 16-byte block count
++      jz      .Lno_data_vpmadd52              # too short
++
++      shl     \$40,$padbit                    # padbit goes to bit 40 of the top limb
++      mov     64($ctx),%r8                    # peek on power of the key
++
++      # if powers of the key are not calculated yet, process up to 3
++      # blocks with this single-block subroutine, otherwise ensure that
++      # length is divisible by 2 blocks and pass the rest down to next
++      # subroutine...
++
++      mov     \$3,%rax
++      mov     \$1,%r10
++      cmp     \$4,$len                        # is input long
++      cmovae  %r10,%rax                       # 4 or more blocks: peel at most one here
++      test    %r8,%r8                         # is power value impossible?
++      cmovns  %r10,%rax                       # powers already computed: likewise
++
++      and     $len,%rax                       # is input of favourable length?
++      jz      .Lblocks_vpmadd52_4x
++
++      sub             %rax,$len               # blocks left over for the 4x code
++      mov             \$7,%r10d               # k7 = 0b111, three low qwords
++      mov             \$1,%r11d               # k1 = 0b001, lowest qword only
++      kmovw           %r10d,%k7
++      lea             .L2_44_inp_permd(%rip),%r10
++      kmovw           %r11d,%k1
++
++      vmovq           $padbit,%x#$PAD
++      vmovdqa64       0(%r10),$inp_permd      # .L2_44_inp_permd
++      vmovdqa64       32(%r10),$inp_shift     # .L2_44_inp_shift
++      vpermq          \$0xcf,$PAD,$PAD        # padbit to qword 2, zeros elsewhere
++      vmovdqa64       64(%r10),$reduc_mask    # .L2_44_mask
++
++      vmovdqu64       0($ctx),${Dlo}{%k7}{z}          # load hash value
++      vmovdqu64       40($ctx),${r2r1r0}{%k7}{z}      # load keys
++      vmovdqu64       32($ctx),${r1r0s2}{%k7}{z}
++      vmovdqu64       24($ctx),${r0s2s1}{%k7}{z}
++
++      vmovdqa64       96(%r10),$reduc_rght    # .L2_44_shift_rgt
++      vmovdqa64       128(%r10),$reduc_left   # .L2_44_shift_lft
++
++      jmp             .Loop_vpmadd52
++
++.align        32
++.Loop_vpmadd52:
++      vmovdqu32       0($inp),%x#$T0          # load input as ----3210
++      lea             16($inp),$inp
++
++      vpermd          $T0,$inp_permd,$T0      # ----3210 -> --322110
++      vpsrlvq         $inp_shift,$T0,$T0
++      vpandq          $reduc_mask,$T0,$T0
++      vporq           $PAD,$T0,$T0
++
++      vpaddq          $T0,$Dlo,$Dlo           # accumulate input
++
++      vpermq          \$0,$Dlo,${H0}{%k7}{z}  # smash hash value
++      vpermq          \$0b01010101,$Dlo,${H1}{%k7}{z}
++      vpermq          \$0b10101010,$Dlo,${H2}{%k7}{z}
++
++      vpxord          $Dlo,$Dlo,$Dlo
++      vpxord          $Dhi,$Dhi,$Dhi
++
++      vpmadd52luq     $r2r1r0,$H0,$Dlo
++      vpmadd52huq     $r2r1r0,$H0,$Dhi
++
++      vpmadd52luq     $r1r0s2,$H1,$Dlo
++      vpmadd52huq     $r1r0s2,$H1,$Dhi
++
++      vpmadd52luq     $r0s2s1,$H2,$Dlo
++      vpmadd52huq     $r0s2s1,$H2,$Dhi
++
++      vpsrlvq         $reduc_rght,$Dlo,$T0    # 0 in topmost qword
++      vpsllvq         $reduc_left,$Dhi,$Dhi   # 0 in topmost qword
++      vpandq          $reduc_mask,$Dlo,$Dlo
++
++      vpaddq          $T0,$Dhi,$Dhi
++
++      vpermq          \$0b10010011,$Dhi,$Dhi  # 0 in lowest qword
++
++      vpaddq          $Dhi,$Dlo,$Dlo          # note topmost qword :-)
++
++      vpsrlvq         $reduc_rght,$Dlo,$T0    # 0 in topmost qword
++      vpandq          $reduc_mask,$Dlo,$Dlo
++
++      vpermq          \$0b10010011,$T0,$T0
++
++      vpaddq          $T0,$Dlo,$Dlo
++
++      vpermq          \$0b10010011,$Dlo,${T0}{%k1}{z}
++
++      vpaddq          $T0,$Dlo,$Dlo
++      vpsllq          \$2,$T0,$T0
++
++      vpaddq          $T0,$Dlo,$Dlo
++
++      dec             %rax                    # one 16-byte block consumed
++      jnz             .Loop_vpmadd52
++
++      vmovdqu64       $Dlo,0($ctx){%k7}       # store hash value
++
++      test            $len,$len               # any blocks left for the 4x code?
++      jnz             .Lblocks_vpmadd52_4x
++
++.Lno_data_vpmadd52:
++      ret
++.size poly1305_blocks_vpmadd52,.-poly1305_blocks_vpmadd52
++___
++}
++{
++########################################################################
++# As implied by its name 4x subroutine processes 4 blocks in parallel
++# (but also handles 4*n+2 block lengths). It takes up to 4th key power
++# and is handled in 256-bit %ymm registers.
++
++my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17));
++my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23));
++my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31));
++
++$code.=<<___;
++.type poly1305_blocks_vpmadd52_4x,\@function,4
++.align        32
++poly1305_blocks_vpmadd52_4x:
++      shr     \$4,$len                        # byte count -> 16-byte block count
++      jz      .Lno_data_vpmadd52_4x           # too short
++
++      shl     \$40,$padbit                    # padbit goes to bit 40 of the top limb
++      mov     64($ctx),%r8                    # peek on power of the key
++
++.Lblocks_vpmadd52_4x:                         # also jumped to from poly1305_blocks_vpmadd52
++      vpbroadcastq    $padbit,$PAD
++
++      vmovdqa64       .Lx_mask44(%rip),$mask44
++      mov             \$5,%eax                # k1 = 0b0101, qwords 0 and 2
++      vmovdqa64       .Lx_mask42(%rip),$mask42
++      kmovw           %eax,%k1                # used in 2x path
++
++      test            %r8,%r8                 # is power value impossible?
++      js              .Linit_vpmadd52         # if it is, then init R[4]
++
++      vmovq           0($ctx),%x#$H0          # load current hash value
++      vmovq           8($ctx),%x#$H1
++      vmovq           16($ctx),%x#$H2
++
++      test            \$3,$len                # is length 4*n+2?
++      jnz             .Lblocks_vpmadd52_2x_do
++
++.Lblocks_vpmadd52_4x_do:
++      vpbroadcastq    64($ctx),$R0            # load 4th power of the key
++      vpbroadcastq    96($ctx),$R1
++      vpbroadcastq    128($ctx),$R2
++      vpbroadcastq    160($ctx),$S1
++
++.Lblocks_vpmadd52_4x_key_loaded:
++      vpsllq          \$2,$R2,$S2             # S2 = R2*5*4
++      vpaddq          $R2,$S2,$S2
++      vpsllq          \$2,$S2,$S2
++
++      test            \$7,$len                # is len 8*n?
++      jz              .Lblocks_vpmadd52_8x
++
++      vmovdqu64       16*0($inp),$T2          # load data
++      vmovdqu64       16*2($inp),$T3
++      lea             16*4($inp),$inp
++
++      vpunpcklqdq     $T3,$T2,$T1             # transpose data
++      vpunpckhqdq     $T3,$T2,$T3
++
++      # at this point 64-bit lanes are ordered as 3-1-2-0
++
++      vpsrlq          \$24,$T3,$T2            # splat the data
++      vporq           $PAD,$T2,$T2
++       vpaddq         $T2,$H2,$H2             # accumulate input
++      vpandq          $mask44,$T1,$T0
++      vpsrlq          \$44,$T1,$T1
++      vpsllq          \$20,$T3,$T3
++      vporq           $T3,$T1,$T1
++      vpandq          $mask44,$T1,$T1
++
++      sub             \$4,$len
++      jz              .Ltail_vpmadd52_4x
++      jmp             .Loop_vpmadd52_4x
++      ud2                                     # not reached, follows unconditional jmp
++
++.align        32
++.Linit_vpmadd52:
++      vmovq           24($ctx),%x#$S1         # load key
++      vmovq           56($ctx),%x#$H2
++      vmovq           32($ctx),%x#$S2
++      vmovq           40($ctx),%x#$R0
++      vmovq           48($ctx),%x#$R1
++
++      vmovdqa         $R0,$H0
++      vmovdqa         $R1,$H1
++      vmovdqa         $H2,$R2
++
++      mov             \$2,%eax                # two passes through the multiplication below
++
++.Lmul_init_vpmadd52:
++      vpxorq          $D0lo,$D0lo,$D0lo
++      vpmadd52luq     $H2,$S1,$D0lo
++      vpxorq          $D0hi,$D0hi,$D0hi
++      vpmadd52huq     $H2,$S1,$D0hi
++      vpxorq          $D1lo,$D1lo,$D1lo
++      vpmadd52luq     $H2,$S2,$D1lo
++      vpxorq          $D1hi,$D1hi,$D1hi
++      vpmadd52huq     $H2,$S2,$D1hi
++      vpxorq          $D2lo,$D2lo,$D2lo
++      vpmadd52luq     $H2,$R0,$D2lo
++      vpxorq          $D2hi,$D2hi,$D2hi
++      vpmadd52huq     $H2,$R0,$D2hi
++
++      vpmadd52luq     $H0,$R0,$D0lo
++      vpmadd52huq     $H0,$R0,$D0hi
++      vpmadd52luq     $H0,$R1,$D1lo
++      vpmadd52huq     $H0,$R1,$D1hi
++      vpmadd52luq     $H0,$R2,$D2lo
++      vpmadd52huq     $H0,$R2,$D2hi
++
++      vpmadd52luq     $H1,$S2,$D0lo
++      vpmadd52huq     $H1,$S2,$D0hi
++      vpmadd52luq     $H1,$R0,$D1lo
++      vpmadd52huq     $H1,$R0,$D1hi
++      vpmadd52luq     $H1,$R1,$D2lo
++      vpmadd52huq     $H1,$R1,$D2hi
++
++      ################################################################
++      # partial reduction
++      vpsrlq          \$44,$D0lo,$tmp
++      vpsllq          \$8,$D0hi,$D0hi
++      vpandq          $mask44,$D0lo,$H0
++      vpaddq          $tmp,$D0hi,$D0hi
++
++      vpaddq          $D0hi,$D1lo,$D1lo
++
++      vpsrlq          \$44,$D1lo,$tmp
++      vpsllq          \$8,$D1hi,$D1hi
++      vpandq          $mask44,$D1lo,$H1
++      vpaddq          $tmp,$D1hi,$D1hi
++
++      vpaddq          $D1hi,$D2lo,$D2lo
++
++      vpsrlq          \$42,$D2lo,$tmp
++      vpsllq          \$10,$D2hi,$D2hi
++      vpandq          $mask42,$D2lo,$H2
++      vpaddq          $tmp,$D2hi,$D2hi
++
++      vpaddq          $D2hi,$H0,$H0
++      vpsllq          \$2,$D2hi,$D2hi
++
++      vpaddq          $D2hi,$H0,$H0
++
++      vpsrlq          \$44,$H0,$tmp           # additional step
++      vpandq          $mask44,$H0,$H0
++
++      vpaddq          $tmp,$H1,$H1
++
++      dec             %eax                    # second pass finished?
++      jz              .Ldone_init_vpmadd52
++
++      vpunpcklqdq     $R1,$H1,$R1             # 1,2
++      vpbroadcastq    %x#$H1,%x#$H1           # 2,2
++      vpunpcklqdq     $R2,$H2,$R2
++      vpbroadcastq    %x#$H2,%x#$H2
++      vpunpcklqdq     $R0,$H0,$R0
++      vpbroadcastq    %x#$H0,%x#$H0
++
++      vpsllq          \$2,$R1,$S1             # S1 = R1*5*4
++      vpsllq          \$2,$R2,$S2             # S2 = R2*5*4
++      vpaddq          $R1,$S1,$S1
++      vpaddq          $R2,$S2,$S2
++      vpsllq          \$2,$S1,$S1
++      vpsllq          \$2,$S2,$S2
++
++      jmp             .Lmul_init_vpmadd52
++      ud2                                     # not reached, follows unconditional jmp
++
++.align        32
++.Ldone_init_vpmadd52:
++      vinserti128     \$1,%x#$R1,$H1,$R1      # 1,2,3,4
++      vinserti128     \$1,%x#$R2,$H2,$R2
++      vinserti128     \$1,%x#$R0,$H0,$R0
++
++      vpermq          \$0b11011000,$R1,$R1    # 1,3,2,4
++      vpermq          \$0b11011000,$R2,$R2
++      vpermq          \$0b11011000,$R0,$R0
++
++      vpsllq          \$2,$R1,$S1             # S1 = R1*5*4
++      vpaddq          $R1,$S1,$S1
++      vpsllq          \$2,$S1,$S1
++
++      vmovq           0($ctx),%x#$H0          # load current hash value
++      vmovq           8($ctx),%x#$H1
++      vmovq           16($ctx),%x#$H2
++
++      test            \$3,$len                # is length 4*n+2?
++      jnz             .Ldone_init_vpmadd52_2x
++
++      vmovdqu64       $R0,64($ctx)            # save key powers
++      vpbroadcastq    %x#$R0,$R0              # broadcast 4th power
++      vmovdqu64       $R1,96($ctx)
++      vpbroadcastq    %x#$R1,$R1
++      vmovdqu64       $R2,128($ctx)
++      vpbroadcastq    %x#$R2,$R2
++      vmovdqu64       $S1,160($ctx)
++      vpbroadcastq    %x#$S1,$S1
++
++      jmp             .Lblocks_vpmadd52_4x_key_loaded
++      ud2                                     # not reached, follows unconditional jmp
++
++.align        32
++.Ldone_init_vpmadd52_2x:
++      vmovdqu64       $R0,64($ctx)            # save key powers
++      vpsrldq         \$8,$R0,$R0             # 0-1-0-2
++      vmovdqu64       $R1,96($ctx)
++      vpsrldq         \$8,$R1,$R1
++      vmovdqu64       $R2,128($ctx)
++      vpsrldq         \$8,$R2,$R2
++      vmovdqu64       $S1,160($ctx)
++      vpsrldq         \$8,$S1,$S1
++      jmp             .Lblocks_vpmadd52_2x_key_loaded
++      ud2                                     # not reached, follows unconditional jmp
++
++.align        32
++.Lblocks_vpmadd52_2x_do:
++      vmovdqu64       128+8($ctx),${R2}{%k1}{z}# load 2nd and 1st key powers
++      vmovdqu64       160+8($ctx),${S1}{%k1}{z}
++      vmovdqu64       64+8($ctx),${R0}{%k1}{z}
++      vmovdqu64       96+8($ctx),${R1}{%k1}{z}
++
++.Lblocks_vpmadd52_2x_key_loaded:
++      vmovdqu64       16*0($inp),$T2          # load data
++      vpxorq          $T3,$T3,$T3
++      lea             16*2($inp),$inp
++
++      vpunpcklqdq     $T3,$T2,$T1             # transpose data
++      vpunpckhqdq     $T3,$T2,$T3
++
++      # at this point 64-bit lanes are ordered as x-1-x-0
++
++      vpsrlq          \$24,$T3,$T2            # splat the data
++      vporq           $PAD,$T2,$T2
++       vpaddq         $T2,$H2,$H2             # accumulate input
++      vpandq          $mask44,$T1,$T0
++      vpsrlq          \$44,$T1,$T1
++      vpsllq          \$20,$T3,$T3
++      vporq           $T3,$T1,$T1
++      vpandq          $mask44,$T1,$T1
++
++      jmp             .Ltail_vpmadd52_2x
++      ud2                                     # not reached, follows unconditional jmp
++
++.align        32
++.Loop_vpmadd52_4x:
++      #vpaddq         $T2,$H2,$H2             # accumulate input
++      vpaddq          $T0,$H0,$H0
++      vpaddq          $T1,$H1,$H1
++
++      vpxorq          $D0lo,$D0lo,$D0lo
++      vpmadd52luq     $H2,$S1,$D0lo
++      vpxorq          $D0hi,$D0hi,$D0hi
++      vpmadd52huq     $H2,$S1,$D0hi
++      vpxorq          $D1lo,$D1lo,$D1lo
++      vpmadd52luq     $H2,$S2,$D1lo
++      vpxorq          $D1hi,$D1hi,$D1hi
++      vpmadd52huq     $H2,$S2,$D1hi
++      vpxorq          $D2lo,$D2lo,$D2lo
++      vpmadd52luq     $H2,$R0,$D2lo
++      vpxorq          $D2hi,$D2hi,$D2hi
++      vpmadd52huq     $H2,$R0,$D2hi
++
++       vmovdqu64      16*0($inp),$T2          # load data
++       vmovdqu64      16*2($inp),$T3
++       lea            16*4($inp),$inp
++      vpmadd52luq     $H0,$R0,$D0lo
++      vpmadd52huq     $H0,$R0,$D0hi
++      vpmadd52luq     $H0,$R1,$D1lo
++      vpmadd52huq     $H0,$R1,$D1hi
++      vpmadd52luq     $H0,$R2,$D2lo
++      vpmadd52huq     $H0,$R2,$D2hi
++
++       vpunpcklqdq    $T3,$T2,$T1             # transpose data
++       vpunpckhqdq    $T3,$T2,$T3
++      vpmadd52luq     $H1,$S2,$D0lo
++      vpmadd52huq     $H1,$S2,$D0hi
++      vpmadd52luq     $H1,$R0,$D1lo
++      vpmadd52huq     $H1,$R0,$D1hi
++      vpmadd52luq     $H1,$R1,$D2lo
++      vpmadd52huq     $H1,$R1,$D2hi
++
++      ################################################################
++      # partial reduction (interleaved with data splat)
++      vpsrlq          \$44,$D0lo,$tmp
++      vpsllq          \$8,$D0hi,$D0hi
++      vpandq          $mask44,$D0lo,$H0
++      vpaddq          $tmp,$D0hi,$D0hi
++
++       vpsrlq         \$24,$T3,$T2
++       vporq          $PAD,$T2,$T2
++      vpaddq          $D0hi,$D1lo,$D1lo
++
++      vpsrlq          \$44,$D1lo,$tmp
++      vpsllq          \$8,$D1hi,$D1hi
++      vpandq          $mask44,$D1lo,$H1
++      vpaddq          $tmp,$D1hi,$D1hi
++
++       vpandq         $mask44,$T1,$T0
++       vpsrlq         \$44,$T1,$T1
++       vpsllq         \$20,$T3,$T3
++      vpaddq          $D1hi,$D2lo,$D2lo
++
++      vpsrlq          \$42,$D2lo,$tmp
++      vpsllq          \$10,$D2hi,$D2hi
++      vpandq          $mask42,$D2lo,$H2
++      vpaddq          $tmp,$D2hi,$D2hi
++
++        vpaddq        $T2,$H2,$H2             # accumulate input
++      vpaddq          $D2hi,$H0,$H0
++      vpsllq          \$2,$D2hi,$D2hi
++
++      vpaddq          $D2hi,$H0,$H0
++       vporq          $T3,$T1,$T1
++       vpandq         $mask44,$T1,$T1
++
++      vpsrlq          \$44,$H0,$tmp           # additional step
++      vpandq          $mask44,$H0,$H0
++
++      vpaddq          $tmp,$H1,$H1
++
++      sub             \$4,$len                # len-=64
++      jnz             .Loop_vpmadd52_4x
++
++.Ltail_vpmadd52_4x:
++      vmovdqu64       128($ctx),$R2           # load all key powers
++      vmovdqu64       160($ctx),$S1
++      vmovdqu64       64($ctx),$R0
++      vmovdqu64       96($ctx),$R1
++
++.Ltail_vpmadd52_2x:
++      vpsllq          \$2,$R2,$S2             # S2 = R2*5*4
++      vpaddq          $R2,$S2,$S2
++      vpsllq          \$2,$S2,$S2
++
++      #vpaddq         $T2,$H2,$H2             # accumulate input
++      vpaddq          $T0,$H0,$H0
++      vpaddq          $T1,$H1,$H1
++
++      vpxorq          $D0lo,$D0lo,$D0lo
++      vpmadd52luq     $H2,$S1,$D0lo
++      vpxorq          $D0hi,$D0hi,$D0hi
++      vpmadd52huq     $H2,$S1,$D0hi
++      vpxorq          $D1lo,$D1lo,$D1lo
++      vpmadd52luq     $H2,$S2,$D1lo
++      vpxorq          $D1hi,$D1hi,$D1hi
++      vpmadd52huq     $H2,$S2,$D1hi
++      vpxorq          $D2lo,$D2lo,$D2lo
++      vpmadd52luq     $H2,$R0,$D2lo
++      vpxorq          $D2hi,$D2hi,$D2hi
++      vpmadd52huq     $H2,$R0,$D2hi
++
++      vpmadd52luq     $H0,$R0,$D0lo
++      vpmadd52huq     $H0,$R0,$D0hi
++      vpmadd52luq     $H0,$R1,$D1lo
++      vpmadd52huq     $H0,$R1,$D1hi
++      vpmadd52luq     $H0,$R2,$D2lo
++      vpmadd52huq     $H0,$R2,$D2hi
++
++      vpmadd52luq     $H1,$S2,$D0lo
++      vpmadd52huq     $H1,$S2,$D0hi
++      vpmadd52luq     $H1,$R0,$D1lo
++      vpmadd52huq     $H1,$R0,$D1hi
++      vpmadd52luq     $H1,$R1,$D2lo
++      vpmadd52huq     $H1,$R1,$D2hi
++
++      ################################################################
++      # horizontal addition
++
++      mov             \$1,%eax                # k1 = 0b0001, lowest qword only
++      kmovw           %eax,%k1
++      vpsrldq         \$8,$D0lo,$T0
++      vpsrldq         \$8,$D0hi,$H0
++      vpsrldq         \$8,$D1lo,$T1
++      vpsrldq         \$8,$D1hi,$H1
++      vpaddq          $T0,$D0lo,$D0lo
++      vpaddq          $H0,$D0hi,$D0hi
++      vpsrldq         \$8,$D2lo,$T2
++      vpsrldq         \$8,$D2hi,$H2
++      vpaddq          $T1,$D1lo,$D1lo
++      vpaddq          $H1,$D1hi,$D1hi
++       vpermq         \$0x2,$D0lo,$T0
++       vpermq         \$0x2,$D0hi,$H0
++      vpaddq          $T2,$D2lo,$D2lo
++      vpaddq          $H2,$D2hi,$D2hi
++
++      vpermq          \$0x2,$D1lo,$T1
++      vpermq          \$0x2,$D1hi,$H1
++      vpaddq          $T0,$D0lo,${D0lo}{%k1}{z}
++      vpaddq          $H0,$D0hi,${D0hi}{%k1}{z}
++      vpermq          \$0x2,$D2lo,$T2
++      vpermq          \$0x2,$D2hi,$H2
++      vpaddq          $T1,$D1lo,${D1lo}{%k1}{z}
++      vpaddq          $H1,$D1hi,${D1hi}{%k1}{z}
++      vpaddq          $T2,$D2lo,${D2lo}{%k1}{z}
++      vpaddq          $H2,$D2hi,${D2hi}{%k1}{z}
++
++      ################################################################
++      # partial reduction
++      vpsrlq          \$44,$D0lo,$tmp
++      vpsllq          \$8,$D0hi,$D0hi
++      vpandq          $mask44,$D0lo,$H0
++      vpaddq          $tmp,$D0hi,$D0hi
++
++      vpaddq          $D0hi,$D1lo,$D1lo
++
++      vpsrlq          \$44,$D1lo,$tmp
++      vpsllq          \$8,$D1hi,$D1hi
++      vpandq          $mask44,$D1lo,$H1
++      vpaddq          $tmp,$D1hi,$D1hi
++
++      vpaddq          $D1hi,$D2lo,$D2lo
++
++      vpsrlq          \$42,$D2lo,$tmp
++      vpsllq          \$10,$D2hi,$D2hi
++      vpandq          $mask42,$D2lo,$H2
++      vpaddq          $tmp,$D2hi,$D2hi
++
++      vpaddq          $D2hi,$H0,$H0
++      vpsllq          \$2,$D2hi,$D2hi
++
++      vpaddq          $D2hi,$H0,$H0
++
++      vpsrlq          \$44,$H0,$tmp           # additional step
++      vpandq          $mask44,$H0,$H0
++
++      vpaddq          $tmp,$H1,$H1
++                                              # at this point $len is
++                                              # either 4*n+2 or 0...
++      sub             \$2,$len                # len-=32
++      ja              .Lblocks_vpmadd52_4x_do
++
++      vmovq           %x#$H0,0($ctx)          # store hash value
++      vmovq           %x#$H1,8($ctx)
++      vmovq           %x#$H2,16($ctx)
++      vzeroall
++
++.Lno_data_vpmadd52_4x:
++      ret
++.size poly1305_blocks_vpmadd52_4x,.-poly1305_blocks_vpmadd52_4x
++___
++}
++{
++########################################################################
++# As implied by its name 8x subroutine processes 8 blocks in parallel...
++# This is intermediate version, as it's used only in cases when input
++# length is either 8*n, 8*n+1 or 8*n+2...
++
++my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17));
++my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23));
++my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31));
++my ($RR0,$RR1,$RR2,$SS1,$SS2) = map("%ymm$_",(6..10));
++
++$code.=<<___;
++.type poly1305_blocks_vpmadd52_8x,\@function,4
++.align        32
++poly1305_blocks_vpmadd52_8x:
++      shr     \$4,$len
++      jz      .Lno_data_vpmadd52_8x           # too short
++
++      shl     \$40,$padbit
++      mov     64($ctx),%r8                    # peek on power of the key
++
++      vmovdqa64       .Lx_mask44(%rip),$mask44
++      vmovdqa64       .Lx_mask42(%rip),$mask42
++
++      test    %r8,%r8                         # is power value impossible?
++      js      .Linit_vpmadd52                 # if it is, then init R[4]
++
++      vmovq   0($ctx),%x#$H0                  # load current hash value
++      vmovq   8($ctx),%x#$H1
++      vmovq   16($ctx),%x#$H2
++
++.Lblocks_vpmadd52_8x:
++      ################################################################
++      # first we calculate more key powers
++
++      vmovdqu64       128($ctx),$R2           # load 1-3-2-4 powers
++      vmovdqu64       160($ctx),$S1
++      vmovdqu64       64($ctx),$R0
++      vmovdqu64       96($ctx),$R1
++
++      vpsllq          \$2,$R2,$S2             # S2 = R2*5*4
++      vpaddq          $R2,$S2,$S2
++      vpsllq          \$2,$S2,$S2
++
++      vpbroadcastq    %x#$R2,$RR2             # broadcast 4th power
++      vpbroadcastq    %x#$R0,$RR0
++      vpbroadcastq    %x#$R1,$RR1
++
++      vpxorq          $D0lo,$D0lo,$D0lo
++      vpmadd52luq     $RR2,$S1,$D0lo
++      vpxorq          $D0hi,$D0hi,$D0hi
++      vpmadd52huq     $RR2,$S1,$D0hi
++      vpxorq          $D1lo,$D1lo,$D1lo
++      vpmadd52luq     $RR2,$S2,$D1lo
++      vpxorq          $D1hi,$D1hi,$D1hi
++      vpmadd52huq     $RR2,$S2,$D1hi
++      vpxorq          $D2lo,$D2lo,$D2lo
++      vpmadd52luq     $RR2,$R0,$D2lo
++      vpxorq          $D2hi,$D2hi,$D2hi
++      vpmadd52huq     $RR2,$R0,$D2hi
++
++      vpmadd52luq     $RR0,$R0,$D0lo
++      vpmadd52huq     $RR0,$R0,$D0hi
++      vpmadd52luq     $RR0,$R1,$D1lo
++      vpmadd52huq     $RR0,$R1,$D1hi
++      vpmadd52luq     $RR0,$R2,$D2lo
++      vpmadd52huq     $RR0,$R2,$D2hi
++
++      vpmadd52luq     $RR1,$S2,$D0lo
++      vpmadd52huq     $RR1,$S2,$D0hi
++      vpmadd52luq     $RR1,$R0,$D1lo
++      vpmadd52huq     $RR1,$R0,$D1hi
++      vpmadd52luq     $RR1,$R1,$D2lo
++      vpmadd52huq     $RR1,$R1,$D2hi
++
++      ################################################################
++      # partial reduction
++      vpsrlq          \$44,$D0lo,$tmp
++      vpsllq          \$8,$D0hi,$D0hi
++      vpandq          $mask44,$D0lo,$RR0
++      vpaddq          $tmp,$D0hi,$D0hi
++
++      vpaddq          $D0hi,$D1lo,$D1lo
++
++      vpsrlq          \$44,$D1lo,$tmp
++      vpsllq          \$8,$D1hi,$D1hi
++      vpandq          $mask44,$D1lo,$RR1
++      vpaddq          $tmp,$D1hi,$D1hi
++
++      vpaddq          $D1hi,$D2lo,$D2lo
++
++      vpsrlq          \$42,$D2lo,$tmp
++      vpsllq          \$10,$D2hi,$D2hi
++      vpandq          $mask42,$D2lo,$RR2
++      vpaddq          $tmp,$D2hi,$D2hi
++
++      vpaddq          $D2hi,$RR0,$RR0
++      vpsllq          \$2,$D2hi,$D2hi
++
++      vpaddq          $D2hi,$RR0,$RR0
++
++      vpsrlq          \$44,$RR0,$tmp          # additional step
++      vpandq          $mask44,$RR0,$RR0
++
++      vpaddq          $tmp,$RR1,$RR1
++
++      ################################################################
++      # At this point Rx holds 1324 powers, RRx - 5768, and the goal
++      # is 15263748, which reflects how data is loaded...
++
++      vpunpcklqdq     $R2,$RR2,$T2            # 3748
++      vpunpckhqdq     $R2,$RR2,$R2            # 1526
++      vpunpcklqdq     $R0,$RR0,$T0
++      vpunpckhqdq     $R0,$RR0,$R0
++      vpunpcklqdq     $R1,$RR1,$T1
++      vpunpckhqdq     $R1,$RR1,$R1
++___
++######## switch to %zmm
++map(s/%y/%z/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2);
++map(s/%y/%z/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi);
++map(s/%y/%z/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD);
++map(s/%y/%z/, $RR0,$RR1,$RR2,$SS1,$SS2);
++
++$code.=<<___;
++      vshufi64x2      \$0x44,$R2,$T2,$RR2     # 15263748
++      vshufi64x2      \$0x44,$R0,$T0,$RR0
++      vshufi64x2      \$0x44,$R1,$T1,$RR1
++
++      vmovdqu64       16*0($inp),$T2          # load data
++      vmovdqu64       16*4($inp),$T3
++      lea             16*8($inp),$inp
++
++      vpsllq          \$2,$RR2,$SS2           # S2 = R2*5*4
++      vpsllq          \$2,$RR1,$SS1           # S1 = R1*5*4
++      vpaddq          $RR2,$SS2,$SS2
++      vpaddq          $RR1,$SS1,$SS1
++      vpsllq          \$2,$SS2,$SS2
++      vpsllq          \$2,$SS1,$SS1
++
++      vpbroadcastq    $padbit,$PAD
++      vpbroadcastq    %x#$mask44,$mask44
++      vpbroadcastq    %x#$mask42,$mask42
++
++      vpbroadcastq    %x#$SS1,$S1             # broadcast 8th power
++      vpbroadcastq    %x#$SS2,$S2
++      vpbroadcastq    %x#$RR0,$R0
++      vpbroadcastq    %x#$RR1,$R1
++      vpbroadcastq    %x#$RR2,$R2
++
++      vpunpcklqdq     $T3,$T2,$T1             # transpose data
++      vpunpckhqdq     $T3,$T2,$T3
++
++      # at this point 64-bit lanes are ordered as 73625140
++
++      vpsrlq          \$24,$T3,$T2            # splat the data
++      vporq           $PAD,$T2,$T2
++       vpaddq         $T2,$H2,$H2             # accumulate input
++      vpandq          $mask44,$T1,$T0
++      vpsrlq          \$44,$T1,$T1
++      vpsllq          \$20,$T3,$T3
++      vporq           $T3,$T1,$T1
++      vpandq          $mask44,$T1,$T1
++
++      sub             \$8,$len
++      jz              .Ltail_vpmadd52_8x
++      jmp             .Loop_vpmadd52_8x
++
++.align        32
++.Loop_vpmadd52_8x:
++      #vpaddq         $T2,$H2,$H2             # accumulate input
++      vpaddq          $T0,$H0,$H0
++      vpaddq          $T1,$H1,$H1
++
++      vpxorq          $D0lo,$D0lo,$D0lo
++      vpmadd52luq     $H2,$S1,$D0lo
++      vpxorq          $D0hi,$D0hi,$D0hi
++      vpmadd52huq     $H2,$S1,$D0hi
++      vpxorq          $D1lo,$D1lo,$D1lo
++      vpmadd52luq     $H2,$S2,$D1lo
++      vpxorq          $D1hi,$D1hi,$D1hi
++      vpmadd52huq     $H2,$S2,$D1hi
++      vpxorq          $D2lo,$D2lo,$D2lo
++      vpmadd52luq     $H2,$R0,$D2lo
++      vpxorq          $D2hi,$D2hi,$D2hi
++      vpmadd52huq     $H2,$R0,$D2hi
++
++       vmovdqu64      16*0($inp),$T2          # load data
++       vmovdqu64      16*4($inp),$T3
++       lea            16*8($inp),$inp
++      vpmadd52luq     $H0,$R0,$D0lo
++      vpmadd52huq     $H0,$R0,$D0hi
++      vpmadd52luq     $H0,$R1,$D1lo
++      vpmadd52huq     $H0,$R1,$D1hi
++      vpmadd52luq     $H0,$R2,$D2lo
++      vpmadd52huq     $H0,$R2,$D2hi
++
++       vpunpcklqdq    $T3,$T2,$T1             # transpose data
++       vpunpckhqdq    $T3,$T2,$T3
++      vpmadd52luq     $H1,$S2,$D0lo
++      vpmadd52huq     $H1,$S2,$D0hi
++      vpmadd52luq     $H1,$R0,$D1lo
++      vpmadd52huq     $H1,$R0,$D1hi
++      vpmadd52luq     $H1,$R1,$D2lo
++      vpmadd52huq     $H1,$R1,$D2hi
++
++      ################################################################
++      # partial reduction (interleaved with data splat)
++      vpsrlq          \$44,$D0lo,$tmp
++      vpsllq          \$8,$D0hi,$D0hi
++      vpandq          $mask44,$D0lo,$H0
++      vpaddq          $tmp,$D0hi,$D0hi
++
++       vpsrlq         \$24,$T3,$T2
++       vporq          $PAD,$T2,$T2
++      vpaddq          $D0hi,$D1lo,$D1lo
++
++      vpsrlq          \$44,$D1lo,$tmp
++      vpsllq          \$8,$D1hi,$D1hi
++      vpandq          $mask44,$D1lo,$H1
++      vpaddq          $tmp,$D1hi,$D1hi
++
++       vpandq         $mask44,$T1,$T0
++       vpsrlq         \$44,$T1,$T1
++       vpsllq         \$20,$T3,$T3
++      vpaddq          $D1hi,$D2lo,$D2lo
++
++      vpsrlq          \$42,$D2lo,$tmp
++      vpsllq          \$10,$D2hi,$D2hi
++      vpandq          $mask42,$D2lo,$H2
++      vpaddq          $tmp,$D2hi,$D2hi
++
++        vpaddq        $T2,$H2,$H2             # accumulate input
++      vpaddq          $D2hi,$H0,$H0
++      vpsllq          \$2,$D2hi,$D2hi
++
++      vpaddq          $D2hi,$H0,$H0
++       vporq          $T3,$T1,$T1
++       vpandq         $mask44,$T1,$T1
++
++      vpsrlq          \$44,$H0,$tmp           # additional step
++      vpandq          $mask44,$H0,$H0
++
++      vpaddq          $tmp,$H1,$H1
++
++      sub             \$8,$len                # len-=128
++      jnz             .Loop_vpmadd52_8x
++
++.Ltail_vpmadd52_8x:
++      #vpaddq         $T2,$H2,$H2             # accumulate input
++      vpaddq          $T0,$H0,$H0
++      vpaddq          $T1,$H1,$H1
++
++      vpxorq          $D0lo,$D0lo,$D0lo
++      vpmadd52luq     $H2,$SS1,$D0lo
++      vpxorq          $D0hi,$D0hi,$D0hi
++      vpmadd52huq     $H2,$SS1,$D0hi
++      vpxorq          $D1lo,$D1lo,$D1lo
++      vpmadd52luq     $H2,$SS2,$D1lo
++      vpxorq          $D1hi,$D1hi,$D1hi
++      vpmadd52huq     $H2,$SS2,$D1hi
++      vpxorq          $D2lo,$D2lo,$D2lo
++      vpmadd52luq     $H2,$RR0,$D2lo
++      vpxorq          $D2hi,$D2hi,$D2hi
++      vpmadd52huq     $H2,$RR0,$D2hi
++
++      vpmadd52luq     $H0,$RR0,$D0lo
++      vpmadd52huq     $H0,$RR0,$D0hi
++      vpmadd52luq     $H0,$RR1,$D1lo
++      vpmadd52huq     $H0,$RR1,$D1hi
++      vpmadd52luq     $H0,$RR2,$D2lo
++      vpmadd52huq     $H0,$RR2,$D2hi
++
++      vpmadd52luq     $H1,$SS2,$D0lo
++      vpmadd52huq     $H1,$SS2,$D0hi
++      vpmadd52luq     $H1,$RR0,$D1lo
++      vpmadd52huq     $H1,$RR0,$D1hi
++      vpmadd52luq     $H1,$RR1,$D2lo
++      vpmadd52huq     $H1,$RR1,$D2hi
++
++      ################################################################
++      # horizontal addition
++
++      mov             \$1,%eax
++      kmovw           %eax,%k1
++      vpsrldq         \$8,$D0lo,$T0
++      vpsrldq         \$8,$D0hi,$H0
++      vpsrldq         \$8,$D1lo,$T1
++      vpsrldq         \$8,$D1hi,$H1
++      vpaddq          $T0,$D0lo,$D0lo
++      vpaddq          $H0,$D0hi,$D0hi
++      vpsrldq         \$8,$D2lo,$T2
++      vpsrldq         \$8,$D2hi,$H2
++      vpaddq          $T1,$D1lo,$D1lo
++      vpaddq          $H1,$D1hi,$D1hi
++       vpermq         \$0x2,$D0lo,$T0
++       vpermq         \$0x2,$D0hi,$H0
++      vpaddq          $T2,$D2lo,$D2lo
++      vpaddq          $H2,$D2hi,$D2hi
++
++      vpermq          \$0x2,$D1lo,$T1
++      vpermq          \$0x2,$D1hi,$H1
++      vpaddq          $T0,$D0lo,$D0lo
++      vpaddq          $H0,$D0hi,$D0hi
++      vpermq          \$0x2,$D2lo,$T2
++      vpermq          \$0x2,$D2hi,$H2
++      vpaddq          $T1,$D1lo,$D1lo
++      vpaddq          $H1,$D1hi,$D1hi
++       vextracti64x4  \$1,$D0lo,%y#$T0
++       vextracti64x4  \$1,$D0hi,%y#$H0
++      vpaddq          $T2,$D2lo,$D2lo
++      vpaddq          $H2,$D2hi,$D2hi
++
++      vextracti64x4   \$1,$D1lo,%y#$T1
++      vextracti64x4   \$1,$D1hi,%y#$H1
++      vextracti64x4   \$1,$D2lo,%y#$T2
++      vextracti64x4   \$1,$D2hi,%y#$H2
++___
++######## switch back to %ymm
++map(s/%z/%y/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2);
++map(s/%z/%y/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi);
++map(s/%z/%y/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD);
++
++$code.=<<___;
++      vpaddq          $T0,$D0lo,${D0lo}{%k1}{z}
++      vpaddq          $H0,$D0hi,${D0hi}{%k1}{z}
++      vpaddq          $T1,$D1lo,${D1lo}{%k1}{z}
++      vpaddq          $H1,$D1hi,${D1hi}{%k1}{z}
++      vpaddq          $T2,$D2lo,${D2lo}{%k1}{z}
++      vpaddq          $H2,$D2hi,${D2hi}{%k1}{z}
++
++      ################################################################
++      # partial reduction
++      vpsrlq          \$44,$D0lo,$tmp
++      vpsllq          \$8,$D0hi,$D0hi
++      vpandq          $mask44,$D0lo,$H0
++      vpaddq          $tmp,$D0hi,$D0hi
++
++      vpaddq          $D0hi,$D1lo,$D1lo
++
++      vpsrlq          \$44,$D1lo,$tmp
++      vpsllq          \$8,$D1hi,$D1hi
++      vpandq          $mask44,$D1lo,$H1
++      vpaddq          $tmp,$D1hi,$D1hi
++
++      vpaddq          $D1hi,$D2lo,$D2lo
++
++      vpsrlq          \$42,$D2lo,$tmp
++      vpsllq          \$10,$D2hi,$D2hi
++      vpandq          $mask42,$D2lo,$H2
++      vpaddq          $tmp,$D2hi,$D2hi
++
++      vpaddq          $D2hi,$H0,$H0
++      vpsllq          \$2,$D2hi,$D2hi
++
++      vpaddq          $D2hi,$H0,$H0
++
++      vpsrlq          \$44,$H0,$tmp           # additional step
++      vpandq          $mask44,$H0,$H0
++
++      vpaddq          $tmp,$H1,$H1
++
++      ################################################################
++
++      vmovq           %x#$H0,0($ctx)
++      vmovq           %x#$H1,8($ctx)
++      vmovq           %x#$H2,16($ctx)
++      vzeroall
++
++.Lno_data_vpmadd52_8x:
++      ret
++.size poly1305_blocks_vpmadd52_8x,.-poly1305_blocks_vpmadd52_8x
++___
++}
++$code.=<<___;
++.type poly1305_emit_base2_44,\@function,3
++.align        32
++poly1305_emit_base2_44:
++      mov     0($ctx),%r8     # load hash value: limb 0 (weight 2^0)
++      mov     8($ctx),%r9     # limb 1 (weight 2^44)
++      mov     16($ctx),%r10   # limb 2 (weight 2^88)
++
++      mov     %r9,%rax
++      shr     \$20,%r9        # part of limb 1 at/above 2^64 -> word 1
++      shl     \$44,%rax       # part of limb 1 below 2^64 -> word 0
++      mov     %r10,%rcx
++      shr     \$40,%r10       # part of limb 2 at/above 2^128 -> word 2
++      shl     \$24,%rcx       # part of limb 2 below 2^128 -> word 1 (88-64=24)
++
++      add     %rax,%r8        # r10:r9:r8 = hash value in base 2^64
++      adc     %rcx,%r9
++      adc     \$0,%r10
++
++      mov     %r8,%rax        # rcx:rax keep h, r10:r9:r8 become h+5
++      add     \$5,%r8         # compare to modulus
++      mov     %r9,%rcx
++      adc     \$0,%r9
++      adc     \$0,%r10
++      shr     \$2,%r10        # did 130-bit value overflow?
++      cmovnz  %r8,%rax        # if so, use low 128 bits of h+5 instead of h
++      cmovnz  %r9,%rcx
++
++      add     0($nonce),%rax  # accumulate nonce
++      adc     8($nonce),%rcx  # carry out of bit 127 is discarded
++      mov     %rax,0($mac)    # write result
++      mov     %rcx,8($mac)
++
++      ret
++.size poly1305_emit_base2_44,.-poly1305_emit_base2_44
++___
++}     }       }
++$code.=<<___;
++.align        64
++.Lconst:
++.Lmask24:
++.long 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0    # 2^24-1 in each of 4 qwords
++.L129:
++.long `1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0    # 2^24 in each of 4 qwords; presumably bit 128 seen from the top 26-bit limb (4*26+24) - confirm at use site
++.Lmask26:
++.long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0    # 2^26-1 in each of 4 qwords
++.Lpermd_avx2:
++.long 2,2,2,3,2,0,2,1
++.Lpermd_avx512:
++.long 0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7
++
++.L2_44_inp_permd:
++.long 0,1,1,2,2,3,7,7
++.L2_44_inp_shift:
++.quad 0,12,24,64
++.L2_44_mask:
++.quad 0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff    # 2^44-1, 2^44-1, 2^42-1, all ones
++.L2_44_shift_rgt:
++.quad 44,44,42,64
++.L2_44_shift_lft:
++.quad 8,8,10,64
++
++.align        64
++.Lx_mask44:
++.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff    # 2^44-1 in all 8 qword lanes
++.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
++.Lx_mask42:
++.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff    # 2^42-1 in all 8 qword lanes
++.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
++___
++}
++$code.=<<___;
++.asciz        "Poly1305 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
++.align        16
++___
++
++{     # chacha20-poly1305 helpers
++my ($out,$inp,$otp,$len)=$win64 ? ("%rcx","%rdx","%r8", "%r9") :  # Win64 order
++                                  ("%rdi","%rsi","%rdx","%rcx");  # Unix order
++$code.=<<___;
++.globl        xor128_encrypt_n_pad
++.type xor128_encrypt_n_pad,\@abi-omnipotent
++.align        16
++xor128_encrypt_n_pad:
++      sub     $otp,$inp               # make inp relative to otp, so only otp is advanced
++      sub     $otp,$out               # same for out
++      mov     $len,%r10               # put len aside
++      shr     \$4,$len                # len / 16
++      jz      .Ltail_enc              # fewer than 16 bytes; NOTE: len == 0 also lands here and is not handled (dec of r10 wraps in the byte loop)
++      nop
++.Loop_enc_xmm:
++      movdqu  ($inp,$otp),%xmm0       # 16 input bytes, any alignment
++      pxor    ($otp),%xmm0            # SSE memory operand: otp must be 16-byte aligned
++      movdqu  %xmm0,($out,$otp)       # out = inp ^ otp
++      movdqa  %xmm0,($otp)            # otp buffer is overwritten with the same result
++      lea     16($otp),$otp
++      dec     $len
++      jnz     .Loop_enc_xmm
++
++      and     \$15,%r10               # len % 16
++      jz      .Ldone_enc
++
++.Ltail_enc:
++      mov     \$16,$len
++      sub     %r10,$len               # len = 16 - (bytes left) = number of pad bytes
++      xor     %eax,%eax
++.Loop_enc_byte:
++      mov     ($inp,$otp),%al
++      xor     ($otp),%al
++      mov     %al,($out,$otp)         # out = inp ^ otp, one byte at a time
++      mov     %al,($otp)              # and the result replaces the otp byte
++      lea     1($otp),$otp
++      dec     %r10
++      jnz     .Loop_enc_byte
++
++      xor     %eax,%eax
++.Loop_enc_pad:
++      mov     %al,($otp)              # zero-fill otp up to the next 16-byte boundary
++      lea     1($otp),$otp
++      dec     $len
++      jnz     .Loop_enc_pad
++
++.Ldone_enc:
++      mov     $otp,%rax               # return the advanced otp pointer
++      ret
++.size xor128_encrypt_n_pad,.-xor128_encrypt_n_pad
++
++.globl        xor128_decrypt_n_pad
++.type xor128_decrypt_n_pad,\@abi-omnipotent
++.align        16
++xor128_decrypt_n_pad:
++      sub     $otp,$inp               # make inp relative to otp, so only otp is advanced
++      sub     $otp,$out               # same for out
++      mov     $len,%r10               # put len aside
++      shr     \$4,$len                # len / 16
++      jz      .Ltail_dec              # fewer than 16 bytes; NOTE: len == 0 also lands here and is not handled (dec of r10 wraps in the byte loop)
++      nop
++.Loop_dec_xmm:
++      movdqu  ($inp,$otp),%xmm0       # 16 input bytes, any alignment
++      movdqa  ($otp),%xmm1            # aligned load: otp must be 16-byte aligned
++      pxor    %xmm0,%xmm1
++      movdqu  %xmm1,($out,$otp)       # out = inp ^ otp
++      movdqa  %xmm0,($otp)            # otp buffer is overwritten with the input block, not the output
++      lea     16($otp),$otp
++      dec     $len
++      jnz     .Loop_dec_xmm
++
++      pxor    %xmm1,%xmm1             # zero xmm1, which held the last output block
++      and     \$15,%r10               # len % 16
++      jz      .Ldone_dec
++
++.Ltail_dec:
++      mov     \$16,$len
++      sub     %r10,$len               # len = 16 - (bytes left) = number of pad bytes
++      xor     %eax,%eax
++      xor     %r11,%r11
++.Loop_dec_byte:
++      mov     ($inp,$otp),%r11b
++      mov     ($otp),%al
++      xor     %r11b,%al
++      mov     %al,($out,$otp)         # out = inp ^ otp, one byte at a time
++      mov     %r11b,($otp)            # the input byte replaces the otp byte
++      lea     1($otp),$otp
++      dec     %r10
++      jnz     .Loop_dec_byte
++
++      xor     %eax,%eax
++.Loop_dec_pad:
++      mov     %al,($otp)              # zero-fill otp up to the next 16-byte boundary
++      lea     1($otp),$otp
++      dec     $len
++      jnz     .Loop_dec_pad
++
++.Ldone_dec:
++      mov     $otp,%rax               # return the advanced otp pointer
++      ret
++.size xor128_decrypt_n_pad,.-xor128_decrypt_n_pad
++___
++}
++
++# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
++#             CONTEXT *context,DISPATCHER_CONTEXT *disp)
++if ($win64) {
++$rec="%rcx";
++$frame="%rdx";
++$context="%r8";
++$disp="%r9";
++
++$code.=<<___;
++.extern       __imp_RtlVirtualUnwind
++.type se_handler,\@abi-omnipotent
++.align        16
++se_handler:
++      push    %rsi
++      push    %rdi
++      push    %rbx
++      push    %rbp
++      push    %r12
++      push    %r13
++      push    %r14
++      push    %r15
++      pushfq
++      sub     \$64,%rsp               # 32-byte shadow space + 4 stack args (arg5..arg8)
++
++      mov     120($context),%rax      # pull context->Rax
++      mov     248($context),%rbx      # pull context->Rip
++
++      mov     8($disp),%rsi           # disp->ImageBase
++      mov     56($disp),%r11          # disp->HandlerData
++
++      mov     0(%r11),%r10d           # HandlerData[0]
++      lea     (%rsi,%r10),%r10        # prologue label
++      cmp     %r10,%rbx               # context->Rip<.Lprologue
++      jb      .Lcommon_seh_tail
++
++      mov     152($context),%rax      # pull context->Rsp
++
++      mov     4(%r11),%r10d           # HandlerData[1]
++      lea     (%rsi,%r10),%r10        # epilogue label
++      cmp     %r10,%rbx               # context->Rip>=.Lepilogue
++      jae     .Lcommon_seh_tail
++
++      lea     48(%rax),%rax           # step over six saved 8-byte registers
++
++      mov     -8(%rax),%rbx
++      mov     -16(%rax),%rbp
++      mov     -24(%rax),%r12
++      mov     -32(%rax),%r13
++      mov     -40(%rax),%r14
++      mov     -48(%rax),%r15
++      mov     %rbx,144($context)      # restore context->Rbx
++      mov     %rbp,160($context)      # restore context->Rbp
++      mov     %r12,216($context)      # restore context->R12
++      mov     %r13,224($context)      # restore context->R13
++      mov     %r14,232($context)      # restore context->R14
++      mov     %r15,240($context)      # restore context->R15
++
++      jmp     .Lcommon_seh_tail       # shared tail is located inside avx_handler
++.size se_handler,.-se_handler
++
++.type avx_handler,\@abi-omnipotent
++.align        16
++avx_handler:
++      push    %rsi
++      push    %rdi
++      push    %rbx
++      push    %rbp
++      push    %r12
++      push    %r13
++      push    %r14
++      push    %r15
++      pushfq
++      sub     \$64,%rsp               # 32-byte shadow space + 4 stack args (arg5..arg8)
++
++      mov     120($context),%rax      # pull context->Rax
++      mov     248($context),%rbx      # pull context->Rip
++
++      mov     8($disp),%rsi           # disp->ImageBase
++      mov     56($disp),%r11          # disp->HandlerData
++
++      mov     0(%r11),%r10d           # HandlerData[0]
++      lea     (%rsi,%r10),%r10        # prologue label
++      cmp     %r10,%rbx               # context->Rip<prologue label
++      jb      .Lcommon_seh_tail
++
++      mov     152($context),%rax      # pull context->Rsp
++
++      mov     4(%r11),%r10d           # HandlerData[1]
++      lea     (%rsi,%r10),%r10        # epilogue label
++      cmp     %r10,%rbx               # context->Rip>=epilogue label
++      jae     .Lcommon_seh_tail
++
++      mov     208($context),%rax      # pull context->R11
++
++      lea     0x50(%rax),%rsi         # copy source: R11+0x50
++      lea     0xf8(%rax),%rax         # rax = R11+0xf8, stored as context->Rsp by the tail
++      lea     512($context),%rdi      # &context.Xmm6
++      mov     \$20,%ecx               # 20 qwords = 10 xmm registers (Xmm6..Xmm15)
++      .long   0xa548f3fc              # cld; rep movsq
++
++.Lcommon_seh_tail:                    # also entered from se_handler; expects rax = value for context->Rsp
++      mov     8(%rax),%rdi
++      mov     16(%rax),%rsi
++      mov     %rax,152($context)      # restore context->Rsp
++      mov     %rsi,168($context)      # restore context->Rsi
++      mov     %rdi,176($context)      # restore context->Rdi
++
++      mov     40($disp),%rdi          # disp->ContextRecord
++      mov     $context,%rsi           # context
++      mov     \$154,%ecx              # sizeof(CONTEXT) in qwords (154*8 bytes)
++      .long   0xa548f3fc              # cld; rep movsq
++
++      mov     $disp,%rsi
++      xor     %rcx,%rcx               # arg1, UNW_FLAG_NHANDLER
++      mov     8(%rsi),%rdx            # arg2, disp->ImageBase
++      mov     0(%rsi),%r8             # arg3, disp->ControlPc
++      mov     16(%rsi),%r9            # arg4, disp->FunctionEntry
++      mov     40(%rsi),%r10           # disp->ContextRecord
++      lea     56(%rsi),%r11           # &disp->HandlerData
++      lea     24(%rsi),%r12           # &disp->EstablisherFrame
++      mov     %r10,32(%rsp)           # arg5
++      mov     %r11,40(%rsp)           # arg6
++      mov     %r12,48(%rsp)           # arg7
++      mov     %rcx,56(%rsp)           # arg8, (NULL)
++      call    *__imp_RtlVirtualUnwind(%rip)
++
++      mov     \$1,%eax                # ExceptionContinueSearch
++      add     \$64,%rsp
++      popfq
++      pop     %r15
++      pop     %r14
++      pop     %r13
++      pop     %r12
++      pop     %rbp
++      pop     %rbx
++      pop     %rdi
++      pop     %rsi
++      ret
++.size avx_handler,.-avx_handler
++
++.section      .pdata
++.align        4
++      .rva    .LSEH_begin_poly1305_init
++      .rva    .LSEH_end_poly1305_init
++      .rva    .LSEH_info_poly1305_init
++
++      .rva    .LSEH_begin_poly1305_blocks
++      .rva    .LSEH_end_poly1305_blocks
++      .rva    .LSEH_info_poly1305_blocks
++
++      .rva    .LSEH_begin_poly1305_emit
++      .rva    .LSEH_end_poly1305_emit
++      .rva    .LSEH_info_poly1305_emit
++___
++$code.=<<___ if ($avx);
++      .rva    .LSEH_begin_poly1305_blocks_avx
++      .rva    .Lbase2_64_avx
++      .rva    .LSEH_info_poly1305_blocks_avx_1
++
++      .rva    .Lbase2_64_avx
++      .rva    .Leven_avx
++      .rva    .LSEH_info_poly1305_blocks_avx_2
++
++      .rva    .Leven_avx
++      .rva    .LSEH_end_poly1305_blocks_avx
++      .rva    .LSEH_info_poly1305_blocks_avx_3
++
++      .rva    .LSEH_begin_poly1305_emit_avx
++      .rva    .LSEH_end_poly1305_emit_avx
++      .rva    .LSEH_info_poly1305_emit_avx
++___
++$code.=<<___ if ($avx>1);
++      .rva    .LSEH_begin_poly1305_blocks_avx2
++      .rva    .Lbase2_64_avx2
++      .rva    .LSEH_info_poly1305_blocks_avx2_1
++
++      .rva    .Lbase2_64_avx2
++      .rva    .Leven_avx2
++      .rva    .LSEH_info_poly1305_blocks_avx2_2
++
++      .rva    .Leven_avx2
++      .rva    .LSEH_end_poly1305_blocks_avx2
++      .rva    .LSEH_info_poly1305_blocks_avx2_3
++___
++$code.=<<___ if ($avx>2);
++      .rva    .LSEH_begin_poly1305_blocks_avx512
++      .rva    .LSEH_end_poly1305_blocks_avx512
++      .rva    .LSEH_info_poly1305_blocks_avx512
++___
++$code.=<<___;
++.section      .xdata
++.align        8
++.LSEH_info_poly1305_init:
++      .byte   9,0,0,0                 # UNWIND_INFO header: version 1, UNW_FLAG_EHANDLER, no prologue bytes, no unwind codes
++      .rva    se_handler
++      .rva    .LSEH_begin_poly1305_init,.LSEH_begin_poly1305_init     # HandlerData[0]==HandlerData[1]: se_handler always takes its Rip>=epilogue branch
++
++.LSEH_info_poly1305_blocks:
++      .byte   9,0,0,0
++      .rva    se_handler
++      .rva    .Lblocks_body,.Lblocks_epilogue                         # HandlerData[]
++
++.LSEH_info_poly1305_emit:
++      .byte   9,0,0,0
++      .rva    se_handler
++      .rva    .LSEH_begin_poly1305_emit,.LSEH_begin_poly1305_emit     # HandlerData[0]==HandlerData[1], as for poly1305_init
++___
++$code.=<<___ if ($avx);
++.LSEH_info_poly1305_blocks_avx_1:
++      .byte   9,0,0,0
++      .rva    se_handler
++      .rva    .Lblocks_avx_body,.Lblocks_avx_epilogue         # HandlerData[]
++
++.LSEH_info_poly1305_blocks_avx_2:
++      .byte   9,0,0,0
++      .rva    se_handler
++      .rva    .Lbase2_64_avx_body,.Lbase2_64_avx_epilogue     # HandlerData[]
++
++.LSEH_info_poly1305_blocks_avx_3:
++      .byte   9,0,0,0
++      .rva    avx_handler
++      .rva    .Ldo_avx_body,.Ldo_avx_epilogue                 # HandlerData[]
++
++.LSEH_info_poly1305_emit_avx:
++      .byte   9,0,0,0
++      .rva    se_handler
++      .rva    .LSEH_begin_poly1305_emit_avx,.LSEH_begin_poly1305_emit_avx
++___
++$code.=<<___ if ($avx>1);
++.LSEH_info_poly1305_blocks_avx2_1:
++      .byte   9,0,0,0
++      .rva    se_handler
++      .rva    .Lblocks_avx2_body,.Lblocks_avx2_epilogue       # HandlerData[]
++
++.LSEH_info_poly1305_blocks_avx2_2:
++      .byte   9,0,0,0
++      .rva    se_handler
++      .rva    .Lbase2_64_avx2_body,.Lbase2_64_avx2_epilogue   # HandlerData[]
++
++.LSEH_info_poly1305_blocks_avx2_3:
++      .byte   9,0,0,0
++      .rva    avx_handler
++      .rva    .Ldo_avx2_body,.Ldo_avx2_epilogue               # HandlerData[]
++___
++$code.=<<___ if ($avx>2);
++.LSEH_info_poly1305_blocks_avx512:
++      .byte   9,0,0,0
++      .rva    avx_handler
++      .rva    .Ldo_avx512_body,.Ldo_avx512_epilogue           # HandlerData[]
++___
++}
++
++foreach (split('\n',$code)) {         # post-process the accumulated code one line at a time
++      s/\`([^\`]*)\`/eval($1)/ge;     # replace backtick-quoted Perl expressions with their value, e.g. 1<<24
++      s/%r([a-z]+)#d/%e$1/g;          # %rax#d -> %eax
++      s/%r([0-9]+)#d/%r$1d/g;         # %r8#d  -> %r8d
++      s/%x#%[yz]/%x/g or s/%y#%z/%y/g or s/%z#%[yz]/%z/g;     # %x#%ymm5 -> %xmm5 etc.: rewrite the vector register width prefix
++
++      print $_,"\n";
++}
++close STDOUT;                         # flush and close the output stream
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0043-crypto-x86-poly1305-wire-up-faster-implementations-f.patch b/target/linux/generic/backport-5.4/080-wireguard-0043-crypto-x86-poly1305-wire-up-faster-implementations-f.patch

new file mode 100644 (file)

index 0000000..307c9b6
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0043-crypto-x86-poly1305-wire-up-faster-implementations-f.patch
@@ -0,0 +1,2927 @@
+From a81b2f8bd42fe51705d7102e9d9a2a40c2a9d624 Mon Sep 17 00:00:00 2001
+From: "Jason A. Donenfeld" <Jason@zx2c4.com>
+Date: Sun, 5 Jan 2020 22:40:48 -0500
+Subject: [PATCH 043/124] crypto: x86/poly1305 - wire up faster implementations
+ for kernel
+
+commit d7d7b853566254648df59f7ea27ea05952a6cfa8 upstream.
+
+These x86_64 vectorized implementations support AVX, AVX-2, and AVX512F.
+The AVX-512F implementation is disabled on Skylake, due to throttling,
+but it is quite fast on >= Cannonlake.
+
+On the left are cycle counts on a Core i7 6700HQ using the AVX-2
+codepath, comparing this implementation ("new") to the implementation in
+the current crypto api ("old"). On the right are benchmarks on a Xeon
+Gold 5120 using the AVX-512 codepath. The new implementation is faster
+on all benchmarks.
+
+        AVX-2                  AVX-512
+      ---------              -----------
+
+    size    old     new      size   old     new
+    ----    ----    ----     ----   ----    ----
+    0       70      68       0      74      70
+    16      92      90       16     96      92
+    32      134     104      32     136     106
+    48      172     120      48     184     124
+    64      218     136      64     218     138
+    80      254     158      80     260     160
+    96      298     174      96     300     176
+    112     342     192      112    342     194
+    128     388     212      128    384     212
+    144     428     228      144    420     226
+    160     466     246      160    464     248
+    176     510     264      176    504     264
+    192     550     282      192    544     282
+    208     594     302      208    582     300
+    224     628     316      224    624     318
+    240     676     334      240    662     338
+    256     716     354      256    708     358
+    272     764     374      272    748     372
+    288     802     352      288    788     358
+    304     420     366      304    422     370
+    320     428     360      320    432     364
+    336     484     378      336    486     380
+    352     426     384      352    434     390
+    368     478     400      368    480     408
+    384     488     394      384    490     398
+    400     542     408      400    542     412
+    416     486     416      416    492     426
+    432     534     430      432    538     436
+    448     544     422      448    546     432
+    464     600     438      464    600     448
+    480     540     448      480    548     456
+    496     594     464      496    594     476
+    512     602     456      512    606     470
+    528     656     476      528    656     480
+    544     600     480      544    606     498
+    560     650     494      560    652     512
+    576     664     490      576    662     508
+    592     714     508      592    716     522
+    608     656     514      608    664     538
+    624     708     532      624    710     552
+    640     716     524      640    720     516
+    656     770     536      656    772     526
+    672     716     548      672    722     544
+    688     770     562      688    768     556
+    704     774     552      704    778     556
+    720     826     568      720    832     568
+    736     768     574      736    780     584
+    752     822     592      752    826     600
+    768     830     584      768    836     560
+    784     884     602      784    888     572
+    800     828     610      800    838     588
+    816     884     628      816    884     604
+    832     888     618      832    894     598
+    848     942     632      848    946     612
+    864     884     644      864    896     628
+    880     936     660      880    942     644
+    896     948     652      896    952     608
+    912     1000    664      912    1004    616
+    928     942     676      928    954     634
+    944     994     690      944    1000    646
+    960     1002    680      960    1008    646
+    976     1054    694      976    1062    658
+    992     1002    706      992    1012    674
+    1008    1052    720      1008   1058    690
+
+This commit wires in the prior implementation from Andy, and makes the
+following changes to be suitable for kernel land.
+
+  - Some cosmetic and structural changes, like renaming labels to
+    .Lname, constants, and other Linux conventions, as well as making
+    the code easy for us to maintain moving forward.
+
+  - CPU feature checking is done in C by the glue code.
+
+  - We avoid jumping into the middle of functions, to appease objtool,
+    and instead parameterize shared code.
+
+  - We maintain frame pointers so that stack traces make sense.
+
+  - We remove the dependency on the perl xlate code, which transforms
+    the output into things that assemblers we don't care about use.
+
+Importantly, none of our changes affect the arithmetic or core code, but
+just involve the differing environment of kernel space.
+
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Signed-off-by: Samuel Neves <sneves@dei.uc.pt>
+Co-developed-by: Samuel Neves <sneves@dei.uc.pt>
+Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ arch/x86/crypto/.gitignore                    |   1 +
+ arch/x86/crypto/Makefile                      |  11 +-
+ arch/x86/crypto/poly1305-avx2-x86_64.S        | 390 ----------
+ arch/x86/crypto/poly1305-sse2-x86_64.S        | 590 ---------------
+ arch/x86/crypto/poly1305-x86_64-cryptogams.pl | 682 ++++++++++--------
+ arch/x86/crypto/poly1305_glue.c               | 473 +++++-------
+ lib/crypto/Kconfig                            |   2 +-
+ 7 files changed, 572 insertions(+), 1577 deletions(-)
+ create mode 100644 arch/x86/crypto/.gitignore
+ delete mode 100644 arch/x86/crypto/poly1305-avx2-x86_64.S
+ delete mode 100644 arch/x86/crypto/poly1305-sse2-x86_64.S
+
+--- /dev/null
++++ b/arch/x86/crypto/.gitignore
+@@ -0,0 +1 @@
++poly1305-x86_64.S
+--- a/arch/x86/crypto/Makefile
++++ b/arch/x86/crypto/Makefile
+@@ -73,6 +73,10 @@ aegis128-aesni-y := aegis128-aesni-asm.o
+ 
+ nhpoly1305-sse2-y := nh-sse2-x86_64.o nhpoly1305-sse2-glue.o
+ blake2s-x86_64-y := blake2s-core.o blake2s-glue.o
++poly1305-x86_64-y := poly1305-x86_64-cryptogams.o poly1305_glue.o
++ifneq ($(CONFIG_CRYPTO_POLY1305_X86_64),)
++targets += poly1305-x86_64-cryptogams.S
++endif
+ 
+ ifeq ($(avx_supported),yes)
+       camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o \
+@@ -101,10 +105,8 @@ aesni-intel-y := aesni-intel_asm.o aesni
+ aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o aes_ctrby8_avx-x86_64.o
+ ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
+ sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o
+-poly1305-x86_64-y := poly1305-sse2-x86_64.o poly1305_glue.o
+ ifeq ($(avx2_supported),yes)
+ sha1-ssse3-y += sha1_avx2_x86_64_asm.o
+-poly1305-x86_64-y += poly1305-avx2-x86_64.o
+ endif
+ ifeq ($(sha1_ni_supported),yes)
+ sha1-ssse3-y += sha1_ni_asm.o
+@@ -118,3 +120,8 @@ sha256-ssse3-y += sha256_ni_asm.o
+ endif
+ sha512-ssse3-y := sha512-ssse3-asm.o sha512-avx-asm.o sha512-avx2-asm.o sha512_ssse3_glue.o
+ crct10dif-pclmul-y := crct10dif-pcl-asm_64.o crct10dif-pclmul_glue.o
++
++quiet_cmd_perlasm = PERLASM $@
++      cmd_perlasm = $(PERL) $< > $@
++$(obj)/%.S: $(src)/%.pl FORCE
++      $(call if_changed,perlasm)
+--- a/arch/x86/crypto/poly1305-avx2-x86_64.S
++++ /dev/null
+@@ -1,390 +0,0 @@
+-/* SPDX-License-Identifier: GPL-2.0-or-later */
+-/*
+- * Poly1305 authenticator algorithm, RFC7539, x64 AVX2 functions
+- *
+- * Copyright (C) 2015 Martin Willi
+- */
+-
+-#include <linux/linkage.h>
+-
+-.section      .rodata.cst32.ANMASK, "aM", @progbits, 32
+-.align 32
+-ANMASK:       .octa 0x0000000003ffffff0000000003ffffff
+-      .octa 0x0000000003ffffff0000000003ffffff
+-
+-.section      .rodata.cst32.ORMASK, "aM", @progbits, 32
+-.align 32
+-ORMASK:       .octa 0x00000000010000000000000001000000
+-      .octa 0x00000000010000000000000001000000
+-
+-.text
+-
+-#define h0 0x00(%rdi)
+-#define h1 0x04(%rdi)
+-#define h2 0x08(%rdi)
+-#define h3 0x0c(%rdi)
+-#define h4 0x10(%rdi)
+-#define r0 0x00(%rdx)
+-#define r1 0x04(%rdx)
+-#define r2 0x08(%rdx)
+-#define r3 0x0c(%rdx)
+-#define r4 0x10(%rdx)
+-#define u0 0x00(%r8)
+-#define u1 0x04(%r8)
+-#define u2 0x08(%r8)
+-#define u3 0x0c(%r8)
+-#define u4 0x10(%r8)
+-#define w0 0x18(%r8)
+-#define w1 0x1c(%r8)
+-#define w2 0x20(%r8)
+-#define w3 0x24(%r8)
+-#define w4 0x28(%r8)
+-#define y0 0x30(%r8)
+-#define y1 0x34(%r8)
+-#define y2 0x38(%r8)
+-#define y3 0x3c(%r8)
+-#define y4 0x40(%r8)
+-#define m %rsi
+-#define hc0 %ymm0
+-#define hc1 %ymm1
+-#define hc2 %ymm2
+-#define hc3 %ymm3
+-#define hc4 %ymm4
+-#define hc0x %xmm0
+-#define hc1x %xmm1
+-#define hc2x %xmm2
+-#define hc3x %xmm3
+-#define hc4x %xmm4
+-#define t1 %ymm5
+-#define t2 %ymm6
+-#define t1x %xmm5
+-#define t2x %xmm6
+-#define ruwy0 %ymm7
+-#define ruwy1 %ymm8
+-#define ruwy2 %ymm9
+-#define ruwy3 %ymm10
+-#define ruwy4 %ymm11
+-#define ruwy0x %xmm7
+-#define ruwy1x %xmm8
+-#define ruwy2x %xmm9
+-#define ruwy3x %xmm10
+-#define ruwy4x %xmm11
+-#define svxz1 %ymm12
+-#define svxz2 %ymm13
+-#define svxz3 %ymm14
+-#define svxz4 %ymm15
+-#define d0 %r9
+-#define d1 %r10
+-#define d2 %r11
+-#define d3 %r12
+-#define d4 %r13
+-
+-ENTRY(poly1305_4block_avx2)
+-      # %rdi: Accumulator h[5]
+-      # %rsi: 64 byte input block m
+-      # %rdx: Poly1305 key r[5]
+-      # %rcx: Quadblock count
+-      # %r8:  Poly1305 derived key r^2 u[5], r^3 w[5], r^4 y[5],
+-
+-      # This four-block variant uses loop unrolled block processing. It
+-      # requires 4 Poly1305 keys: r, r^2, r^3 and r^4:
+-      # h = (h + m) * r  =>  h = (h + m1) * r^4 + m2 * r^3 + m3 * r^2 + m4 * r
+-
+-      vzeroupper
+-      push            %rbx
+-      push            %r12
+-      push            %r13
+-
+-      # combine r0,u0,w0,y0
+-      vmovd           y0,ruwy0x
+-      vmovd           w0,t1x
+-      vpunpcklqdq     t1,ruwy0,ruwy0
+-      vmovd           u0,t1x
+-      vmovd           r0,t2x
+-      vpunpcklqdq     t2,t1,t1
+-      vperm2i128      $0x20,t1,ruwy0,ruwy0
+-
+-      # combine r1,u1,w1,y1 and s1=r1*5,v1=u1*5,x1=w1*5,z1=y1*5
+-      vmovd           y1,ruwy1x
+-      vmovd           w1,t1x
+-      vpunpcklqdq     t1,ruwy1,ruwy1
+-      vmovd           u1,t1x
+-      vmovd           r1,t2x
+-      vpunpcklqdq     t2,t1,t1
+-      vperm2i128      $0x20,t1,ruwy1,ruwy1
+-      vpslld          $2,ruwy1,svxz1
+-      vpaddd          ruwy1,svxz1,svxz1
+-
+-      # combine r2,u2,w2,y2 and s2=r2*5,v2=u2*5,x2=w2*5,z2=y2*5
+-      vmovd           y2,ruwy2x
+-      vmovd           w2,t1x
+-      vpunpcklqdq     t1,ruwy2,ruwy2
+-      vmovd           u2,t1x
+-      vmovd           r2,t2x
+-      vpunpcklqdq     t2,t1,t1
+-      vperm2i128      $0x20,t1,ruwy2,ruwy2
+-      vpslld          $2,ruwy2,svxz2
+-      vpaddd          ruwy2,svxz2,svxz2
+-
+-      # combine r3,u3,w3,y3 and s3=r3*5,v3=u3*5,x3=w3*5,z3=y3*5
+-      vmovd           y3,ruwy3x
+-      vmovd           w3,t1x
+-      vpunpcklqdq     t1,ruwy3,ruwy3
+-      vmovd           u3,t1x
+-      vmovd           r3,t2x
+-      vpunpcklqdq     t2,t1,t1
+-      vperm2i128      $0x20,t1,ruwy3,ruwy3
+-      vpslld          $2,ruwy3,svxz3
+-      vpaddd          ruwy3,svxz3,svxz3
+-
+-      # combine r4,u4,w4,y4 and s4=r4*5,v4=u4*5,x4=w4*5,z4=y4*5
+-      vmovd           y4,ruwy4x
+-      vmovd           w4,t1x
+-      vpunpcklqdq     t1,ruwy4,ruwy4
+-      vmovd           u4,t1x
+-      vmovd           r4,t2x
+-      vpunpcklqdq     t2,t1,t1
+-      vperm2i128      $0x20,t1,ruwy4,ruwy4
+-      vpslld          $2,ruwy4,svxz4
+-      vpaddd          ruwy4,svxz4,svxz4
+-
+-.Ldoblock4:
+-      # hc0 = [m[48-51] & 0x3ffffff, m[32-35] & 0x3ffffff,
+-      #        m[16-19] & 0x3ffffff, m[ 0- 3] & 0x3ffffff + h0]
+-      vmovd           0x00(m),hc0x
+-      vmovd           0x10(m),t1x
+-      vpunpcklqdq     t1,hc0,hc0
+-      vmovd           0x20(m),t1x
+-      vmovd           0x30(m),t2x
+-      vpunpcklqdq     t2,t1,t1
+-      vperm2i128      $0x20,t1,hc0,hc0
+-      vpand           ANMASK(%rip),hc0,hc0
+-      vmovd           h0,t1x
+-      vpaddd          t1,hc0,hc0
+-      # hc1 = [(m[51-54] >> 2) & 0x3ffffff, (m[35-38] >> 2) & 0x3ffffff,
+-      #        (m[19-22] >> 2) & 0x3ffffff, (m[ 3- 6] >> 2) & 0x3ffffff + h1]
+-      vmovd           0x03(m),hc1x
+-      vmovd           0x13(m),t1x
+-      vpunpcklqdq     t1,hc1,hc1
+-      vmovd           0x23(m),t1x
+-      vmovd           0x33(m),t2x
+-      vpunpcklqdq     t2,t1,t1
+-      vperm2i128      $0x20,t1,hc1,hc1
+-      vpsrld          $2,hc1,hc1
+-      vpand           ANMASK(%rip),hc1,hc1
+-      vmovd           h1,t1x
+-      vpaddd          t1,hc1,hc1
+-      # hc2 = [(m[54-57] >> 4) & 0x3ffffff, (m[38-41] >> 4) & 0x3ffffff,
+-      #        (m[22-25] >> 4) & 0x3ffffff, (m[ 6- 9] >> 4) & 0x3ffffff + h2]
+-      vmovd           0x06(m),hc2x
+-      vmovd           0x16(m),t1x
+-      vpunpcklqdq     t1,hc2,hc2
+-      vmovd           0x26(m),t1x
+-      vmovd           0x36(m),t2x
+-      vpunpcklqdq     t2,t1,t1
+-      vperm2i128      $0x20,t1,hc2,hc2
+-      vpsrld          $4,hc2,hc2
+-      vpand           ANMASK(%rip),hc2,hc2
+-      vmovd           h2,t1x
+-      vpaddd          t1,hc2,hc2
+-      # hc3 = [(m[57-60] >> 6) & 0x3ffffff, (m[41-44] >> 6) & 0x3ffffff,
+-      #        (m[25-28] >> 6) & 0x3ffffff, (m[ 9-12] >> 6) & 0x3ffffff + h3]
+-      vmovd           0x09(m),hc3x
+-      vmovd           0x19(m),t1x
+-      vpunpcklqdq     t1,hc3,hc3
+-      vmovd           0x29(m),t1x
+-      vmovd           0x39(m),t2x
+-      vpunpcklqdq     t2,t1,t1
+-      vperm2i128      $0x20,t1,hc3,hc3
+-      vpsrld          $6,hc3,hc3
+-      vpand           ANMASK(%rip),hc3,hc3
+-      vmovd           h3,t1x
+-      vpaddd          t1,hc3,hc3
+-      # hc4 = [(m[60-63] >> 8) | (1<<24), (m[44-47] >> 8) | (1<<24),
+-      #        (m[28-31] >> 8) | (1<<24), (m[12-15] >> 8) | (1<<24) + h4]
+-      vmovd           0x0c(m),hc4x
+-      vmovd           0x1c(m),t1x
+-      vpunpcklqdq     t1,hc4,hc4
+-      vmovd           0x2c(m),t1x
+-      vmovd           0x3c(m),t2x
+-      vpunpcklqdq     t2,t1,t1
+-      vperm2i128      $0x20,t1,hc4,hc4
+-      vpsrld          $8,hc4,hc4
+-      vpor            ORMASK(%rip),hc4,hc4
+-      vmovd           h4,t1x
+-      vpaddd          t1,hc4,hc4
+-
+-      # t1 = [ hc0[3] * r0, hc0[2] * u0, hc0[1] * w0, hc0[0] * y0 ]
+-      vpmuludq        hc0,ruwy0,t1
+-      # t1 += [ hc1[3] * s4, hc1[2] * v4, hc1[1] * x4, hc1[0] * z4 ]
+-      vpmuludq        hc1,svxz4,t2
+-      vpaddq          t2,t1,t1
+-      # t1 += [ hc2[3] * s3, hc2[2] * v3, hc2[1] * x3, hc2[0] * z3 ]
+-      vpmuludq        hc2,svxz3,t2
+-      vpaddq          t2,t1,t1
+-      # t1 += [ hc3[3] * s2, hc3[2] * v2, hc3[1] * x2, hc3[0] * z2 ]
+-      vpmuludq        hc3,svxz2,t2
+-      vpaddq          t2,t1,t1
+-      # t1 += [ hc4[3] * s1, hc4[2] * v1, hc4[1] * x1, hc4[0] * z1 ]
+-      vpmuludq        hc4,svxz1,t2
+-      vpaddq          t2,t1,t1
+-      # d0 = t1[0] + t1[1] + t[2] + t[3]
+-      vpermq          $0xee,t1,t2
+-      vpaddq          t2,t1,t1
+-      vpsrldq         $8,t1,t2
+-      vpaddq          t2,t1,t1
+-      vmovq           t1x,d0
+-
+-      # t1 = [ hc0[3] * r1, hc0[2] * u1,hc0[1] * w1, hc0[0] * y1 ]
+-      vpmuludq        hc0,ruwy1,t1
+-      # t1 += [ hc1[3] * r0, hc1[2] * u0, hc1[1] * w0, hc1[0] * y0 ]
+-      vpmuludq        hc1,ruwy0,t2
+-      vpaddq          t2,t1,t1
+-      # t1 += [ hc2[3] * s4, hc2[2] * v4, hc2[1] * x4, hc2[0] * z4 ]
+-      vpmuludq        hc2,svxz4,t2
+-      vpaddq          t2,t1,t1
+-      # t1 += [ hc3[3] * s3, hc3[2] * v3, hc3[1] * x3, hc3[0] * z3 ]
+-      vpmuludq        hc3,svxz3,t2
+-      vpaddq          t2,t1,t1
+-      # t1 += [ hc4[3] * s2, hc4[2] * v2, hc4[1] * x2, hc4[0] * z2 ]
+-      vpmuludq        hc4,svxz2,t2
+-      vpaddq          t2,t1,t1
+-      # d1 = t1[0] + t1[1] + t1[3] + t1[4]
+-      vpermq          $0xee,t1,t2
+-      vpaddq          t2,t1,t1
+-      vpsrldq         $8,t1,t2
+-      vpaddq          t2,t1,t1
+-      vmovq           t1x,d1
+-
+-      # t1 = [ hc0[3] * r2, hc0[2] * u2, hc0[1] * w2, hc0[0] * y2 ]
+-      vpmuludq        hc0,ruwy2,t1
+-      # t1 += [ hc1[3] * r1, hc1[2] * u1, hc1[1] * w1, hc1[0] * y1 ]
+-      vpmuludq        hc1,ruwy1,t2
+-      vpaddq          t2,t1,t1
+-      # t1 += [ hc2[3] * r0, hc2[2] * u0, hc2[1] * w0, hc2[0] * y0 ]
+-      vpmuludq        hc2,ruwy0,t2
+-      vpaddq          t2,t1,t1
+-      # t1 += [ hc3[3] * s4, hc3[2] * v4, hc3[1] * x4, hc3[0] * z4 ]
+-      vpmuludq        hc3,svxz4,t2
+-      vpaddq          t2,t1,t1
+-      # t1 += [ hc4[3] * s3, hc4[2] * v3, hc4[1] * x3, hc4[0] * z3 ]
+-      vpmuludq        hc4,svxz3,t2
+-      vpaddq          t2,t1,t1
+-      # d2 = t1[0] + t1[1] + t1[2] + t1[3]
+-      vpermq          $0xee,t1,t2
+-      vpaddq          t2,t1,t1
+-      vpsrldq         $8,t1,t2
+-      vpaddq          t2,t1,t1
+-      vmovq           t1x,d2
+-
+-      # t1 = [ hc0[3] * r3, hc0[2] * u3, hc0[1] * w3, hc0[0] * y3 ]
+-      vpmuludq        hc0,ruwy3,t1
+-      # t1 += [ hc1[3] * r2, hc1[2] * u2, hc1[1] * w2, hc1[0] * y2 ]
+-      vpmuludq        hc1,ruwy2,t2
+-      vpaddq          t2,t1,t1
+-      # t1 += [ hc2[3] * r1, hc2[2] * u1, hc2[1] * w1, hc2[0] * y1 ]
+-      vpmuludq        hc2,ruwy1,t2
+-      vpaddq          t2,t1,t1
+-      # t1 += [ hc3[3] * r0, hc3[2] * u0, hc3[1] * w0, hc3[0] * y0 ]
+-      vpmuludq        hc3,ruwy0,t2
+-      vpaddq          t2,t1,t1
+-      # t1 += [ hc4[3] * s4, hc4[2] * v4, hc4[1] * x4, hc4[0] * z4 ]
+-      vpmuludq        hc4,svxz4,t2
+-      vpaddq          t2,t1,t1
+-      # d3 = t1[0] + t1[1] + t1[2] + t1[3]
+-      vpermq          $0xee,t1,t2
+-      vpaddq          t2,t1,t1
+-      vpsrldq         $8,t1,t2
+-      vpaddq          t2,t1,t1
+-      vmovq           t1x,d3
+-
+-      # t1 = [ hc0[3] * r4, hc0[2] * u4, hc0[1] * w4, hc0[0] * y4 ]
+-      vpmuludq        hc0,ruwy4,t1
+-      # t1 += [ hc1[3] * r3, hc1[2] * u3, hc1[1] * w3, hc1[0] * y3 ]
+-      vpmuludq        hc1,ruwy3,t2
+-      vpaddq          t2,t1,t1
+-      # t1 += [ hc2[3] * r2, hc2[2] * u2, hc2[1] * w2, hc2[0] * y2 ]
+-      vpmuludq        hc2,ruwy2,t2
+-      vpaddq          t2,t1,t1
+-      # t1 += [ hc3[3] * r1, hc3[2] * u1, hc3[1] * w1, hc3[0] * y1 ]
+-      vpmuludq        hc3,ruwy1,t2
+-      vpaddq          t2,t1,t1
+-      # t1 += [ hc4[3] * r0, hc4[2] * u0, hc4[1] * w0, hc4[0] * y0 ]
+-      vpmuludq        hc4,ruwy0,t2
+-      vpaddq          t2,t1,t1
+-      # d4 = t1[0] + t1[1] + t1[2] + t1[3]
+-      vpermq          $0xee,t1,t2
+-      vpaddq          t2,t1,t1
+-      vpsrldq         $8,t1,t2
+-      vpaddq          t2,t1,t1
+-      vmovq           t1x,d4
+-
+-      # Now do a partial reduction mod (2^130)-5, carrying h0 -> h1 -> h2 ->
+-      # h3 -> h4 -> h0 -> h1 to get h0,h2,h3,h4 < 2^26 and h1 < 2^26 + a small
+-      # amount.  Careful: we must not assume the carry bits 'd0 >> 26',
+-      # 'd1 >> 26', 'd2 >> 26', 'd3 >> 26', and '(d4 >> 26) * 5' fit in 32-bit
+-      # integers.  It's true in a single-block implementation, but not here.
+-
+-      # d1 += d0 >> 26
+-      mov             d0,%rax
+-      shr             $26,%rax
+-      add             %rax,d1
+-      # h0 = d0 & 0x3ffffff
+-      mov             d0,%rbx
+-      and             $0x3ffffff,%ebx
+-
+-      # d2 += d1 >> 26
+-      mov             d1,%rax
+-      shr             $26,%rax
+-      add             %rax,d2
+-      # h1 = d1 & 0x3ffffff
+-      mov             d1,%rax
+-      and             $0x3ffffff,%eax
+-      mov             %eax,h1
+-
+-      # d3 += d2 >> 26
+-      mov             d2,%rax
+-      shr             $26,%rax
+-      add             %rax,d3
+-      # h2 = d2 & 0x3ffffff
+-      mov             d2,%rax
+-      and             $0x3ffffff,%eax
+-      mov             %eax,h2
+-
+-      # d4 += d3 >> 26
+-      mov             d3,%rax
+-      shr             $26,%rax
+-      add             %rax,d4
+-      # h3 = d3 & 0x3ffffff
+-      mov             d3,%rax
+-      and             $0x3ffffff,%eax
+-      mov             %eax,h3
+-
+-      # h0 += (d4 >> 26) * 5
+-      mov             d4,%rax
+-      shr             $26,%rax
+-      lea             (%rax,%rax,4),%rax
+-      add             %rax,%rbx
+-      # h4 = d4 & 0x3ffffff
+-      mov             d4,%rax
+-      and             $0x3ffffff,%eax
+-      mov             %eax,h4
+-
+-      # h1 += h0 >> 26
+-      mov             %rbx,%rax
+-      shr             $26,%rax
+-      add             %eax,h1
+-      # h0 = h0 & 0x3ffffff
+-      andl            $0x3ffffff,%ebx
+-      mov             %ebx,h0
+-
+-      add             $0x40,m
+-      dec             %rcx
+-      jnz             .Ldoblock4
+-
+-      vzeroupper
+-      pop             %r13
+-      pop             %r12
+-      pop             %rbx
+-      ret
+-ENDPROC(poly1305_4block_avx2)
+--- a/arch/x86/crypto/poly1305-sse2-x86_64.S
++++ /dev/null
+@@ -1,590 +0,0 @@
+-/* SPDX-License-Identifier: GPL-2.0-or-later */
+-/*
+- * Poly1305 authenticator algorithm, RFC7539, x64 SSE2 functions
+- *
+- * Copyright (C) 2015 Martin Willi
+- */
+-
+-#include <linux/linkage.h>
+-
+-.section      .rodata.cst16.ANMASK, "aM", @progbits, 16
+-.align 16
+-ANMASK:       .octa 0x0000000003ffffff0000000003ffffff
+-
+-.section      .rodata.cst16.ORMASK, "aM", @progbits, 16
+-.align 16
+-ORMASK:       .octa 0x00000000010000000000000001000000
+-
+-.text
+-
+-#define h0 0x00(%rdi)
+-#define h1 0x04(%rdi)
+-#define h2 0x08(%rdi)
+-#define h3 0x0c(%rdi)
+-#define h4 0x10(%rdi)
+-#define r0 0x00(%rdx)
+-#define r1 0x04(%rdx)
+-#define r2 0x08(%rdx)
+-#define r3 0x0c(%rdx)
+-#define r4 0x10(%rdx)
+-#define s1 0x00(%rsp)
+-#define s2 0x04(%rsp)
+-#define s3 0x08(%rsp)
+-#define s4 0x0c(%rsp)
+-#define m %rsi
+-#define h01 %xmm0
+-#define h23 %xmm1
+-#define h44 %xmm2
+-#define t1 %xmm3
+-#define t2 %xmm4
+-#define t3 %xmm5
+-#define t4 %xmm6
+-#define mask %xmm7
+-#define d0 %r8
+-#define d1 %r9
+-#define d2 %r10
+-#define d3 %r11
+-#define d4 %r12
+-
+-ENTRY(poly1305_block_sse2)
+-      # %rdi: Accumulator h[5]
+-      # %rsi: 16 byte input block m
+-      # %rdx: Poly1305 key r[5]
+-      # %rcx: Block count
+-
+-      # This single block variant tries to improve performance by doing two
+-      # multiplications in parallel using SSE instructions. There is quite
+-      # some quardword packing involved, hence the speedup is marginal.
+-
+-      push            %rbx
+-      push            %r12
+-      sub             $0x10,%rsp
+-
+-      # s1..s4 = r1..r4 * 5
+-      mov             r1,%eax
+-      lea             (%eax,%eax,4),%eax
+-      mov             %eax,s1
+-      mov             r2,%eax
+-      lea             (%eax,%eax,4),%eax
+-      mov             %eax,s2
+-      mov             r3,%eax
+-      lea             (%eax,%eax,4),%eax
+-      mov             %eax,s3
+-      mov             r4,%eax
+-      lea             (%eax,%eax,4),%eax
+-      mov             %eax,s4
+-
+-      movdqa          ANMASK(%rip),mask
+-
+-.Ldoblock:
+-      # h01 = [0, h1, 0, h0]
+-      # h23 = [0, h3, 0, h2]
+-      # h44 = [0, h4, 0, h4]
+-      movd            h0,h01
+-      movd            h1,t1
+-      movd            h2,h23
+-      movd            h3,t2
+-      movd            h4,h44
+-      punpcklqdq      t1,h01
+-      punpcklqdq      t2,h23
+-      punpcklqdq      h44,h44
+-
+-      # h01 += [ (m[3-6] >> 2) & 0x3ffffff, m[0-3] & 0x3ffffff ]
+-      movd            0x00(m),t1
+-      movd            0x03(m),t2
+-      psrld           $2,t2
+-      punpcklqdq      t2,t1
+-      pand            mask,t1
+-      paddd           t1,h01
+-      # h23 += [ (m[9-12] >> 6) & 0x3ffffff, (m[6-9] >> 4) & 0x3ffffff ]
+-      movd            0x06(m),t1
+-      movd            0x09(m),t2
+-      psrld           $4,t1
+-      psrld           $6,t2
+-      punpcklqdq      t2,t1
+-      pand            mask,t1
+-      paddd           t1,h23
+-      # h44 += [ (m[12-15] >> 8) | (1 << 24), (m[12-15] >> 8) | (1 << 24) ]
+-      mov             0x0c(m),%eax
+-      shr             $8,%eax
+-      or              $0x01000000,%eax
+-      movd            %eax,t1
+-      pshufd          $0xc4,t1,t1
+-      paddd           t1,h44
+-
+-      # t1[0] = h0 * r0 + h2 * s3
+-      # t1[1] = h1 * s4 + h3 * s2
+-      movd            r0,t1
+-      movd            s4,t2
+-      punpcklqdq      t2,t1
+-      pmuludq         h01,t1
+-      movd            s3,t2
+-      movd            s2,t3
+-      punpcklqdq      t3,t2
+-      pmuludq         h23,t2
+-      paddq           t2,t1
+-      # t2[0] = h0 * r1 + h2 * s4
+-      # t2[1] = h1 * r0 + h3 * s3
+-      movd            r1,t2
+-      movd            r0,t3
+-      punpcklqdq      t3,t2
+-      pmuludq         h01,t2
+-      movd            s4,t3
+-      movd            s3,t4
+-      punpcklqdq      t4,t3
+-      pmuludq         h23,t3
+-      paddq           t3,t2
+-      # t3[0] = h4 * s1
+-      # t3[1] = h4 * s2
+-      movd            s1,t3
+-      movd            s2,t4
+-      punpcklqdq      t4,t3
+-      pmuludq         h44,t3
+-      # d0 = t1[0] + t1[1] + t3[0]
+-      # d1 = t2[0] + t2[1] + t3[1]
+-      movdqa          t1,t4
+-      punpcklqdq      t2,t4
+-      punpckhqdq      t2,t1
+-      paddq           t4,t1
+-      paddq           t3,t1
+-      movq            t1,d0
+-      psrldq          $8,t1
+-      movq            t1,d1
+-
+-      # t1[0] = h0 * r2 + h2 * r0
+-      # t1[1] = h1 * r1 + h3 * s4
+-      movd            r2,t1
+-      movd            r1,t2
+-      punpcklqdq      t2,t1
+-      pmuludq         h01,t1
+-      movd            r0,t2
+-      movd            s4,t3
+-      punpcklqdq      t3,t2
+-      pmuludq         h23,t2
+-      paddq           t2,t1
+-      # t2[0] = h0 * r3 + h2 * r1
+-      # t2[1] = h1 * r2 + h3 * r0
+-      movd            r3,t2
+-      movd            r2,t3
+-      punpcklqdq      t3,t2
+-      pmuludq         h01,t2
+-      movd            r1,t3
+-      movd            r0,t4
+-      punpcklqdq      t4,t3
+-      pmuludq         h23,t3
+-      paddq           t3,t2
+-      # t3[0] = h4 * s3
+-      # t3[1] = h4 * s4
+-      movd            s3,t3
+-      movd            s4,t4
+-      punpcklqdq      t4,t3
+-      pmuludq         h44,t3
+-      # d2 = t1[0] + t1[1] + t3[0]
+-      # d3 = t2[0] + t2[1] + t3[1]
+-      movdqa          t1,t4
+-      punpcklqdq      t2,t4
+-      punpckhqdq      t2,t1
+-      paddq           t4,t1
+-      paddq           t3,t1
+-      movq            t1,d2
+-      psrldq          $8,t1
+-      movq            t1,d3
+-
+-      # t1[0] = h0 * r4 + h2 * r2
+-      # t1[1] = h1 * r3 + h3 * r1
+-      movd            r4,t1
+-      movd            r3,t2
+-      punpcklqdq      t2,t1
+-      pmuludq         h01,t1
+-      movd            r2,t2
+-      movd            r1,t3
+-      punpcklqdq      t3,t2
+-      pmuludq         h23,t2
+-      paddq           t2,t1
+-      # t3[0] = h4 * r0
+-      movd            r0,t3
+-      pmuludq         h44,t3
+-      # d4 = t1[0] + t1[1] + t3[0]
+-      movdqa          t1,t4
+-      psrldq          $8,t4
+-      paddq           t4,t1
+-      paddq           t3,t1
+-      movq            t1,d4
+-
+-      # d1 += d0 >> 26
+-      mov             d0,%rax
+-      shr             $26,%rax
+-      add             %rax,d1
+-      # h0 = d0 & 0x3ffffff
+-      mov             d0,%rbx
+-      and             $0x3ffffff,%ebx
+-
+-      # d2 += d1 >> 26
+-      mov             d1,%rax
+-      shr             $26,%rax
+-      add             %rax,d2
+-      # h1 = d1 & 0x3ffffff
+-      mov             d1,%rax
+-      and             $0x3ffffff,%eax
+-      mov             %eax,h1
+-
+-      # d3 += d2 >> 26
+-      mov             d2,%rax
+-      shr             $26,%rax
+-      add             %rax,d3
+-      # h2 = d2 & 0x3ffffff
+-      mov             d2,%rax
+-      and             $0x3ffffff,%eax
+-      mov             %eax,h2
+-
+-      # d4 += d3 >> 26
+-      mov             d3,%rax
+-      shr             $26,%rax
+-      add             %rax,d4
+-      # h3 = d3 & 0x3ffffff
+-      mov             d3,%rax
+-      and             $0x3ffffff,%eax
+-      mov             %eax,h3
+-
+-      # h0 += (d4 >> 26) * 5
+-      mov             d4,%rax
+-      shr             $26,%rax
+-      lea             (%rax,%rax,4),%rax
+-      add             %rax,%rbx
+-      # h4 = d4 & 0x3ffffff
+-      mov             d4,%rax
+-      and             $0x3ffffff,%eax
+-      mov             %eax,h4
+-
+-      # h1 += h0 >> 26
+-      mov             %rbx,%rax
+-      shr             $26,%rax
+-      add             %eax,h1
+-      # h0 = h0 & 0x3ffffff
+-      andl            $0x3ffffff,%ebx
+-      mov             %ebx,h0
+-
+-      add             $0x10,m
+-      dec             %rcx
+-      jnz             .Ldoblock
+-
+-      # Zeroing of key material
+-      mov             %rcx,0x00(%rsp)
+-      mov             %rcx,0x08(%rsp)
+-
+-      add             $0x10,%rsp
+-      pop             %r12
+-      pop             %rbx
+-      ret
+-ENDPROC(poly1305_block_sse2)
+-
+-
+-#define u0 0x00(%r8)
+-#define u1 0x04(%r8)
+-#define u2 0x08(%r8)
+-#define u3 0x0c(%r8)
+-#define u4 0x10(%r8)
+-#define hc0 %xmm0
+-#define hc1 %xmm1
+-#define hc2 %xmm2
+-#define hc3 %xmm5
+-#define hc4 %xmm6
+-#define ru0 %xmm7
+-#define ru1 %xmm8
+-#define ru2 %xmm9
+-#define ru3 %xmm10
+-#define ru4 %xmm11
+-#define sv1 %xmm12
+-#define sv2 %xmm13
+-#define sv3 %xmm14
+-#define sv4 %xmm15
+-#undef d0
+-#define d0 %r13
+-
+-ENTRY(poly1305_2block_sse2)
+-      # %rdi: Accumulator h[5]
+-      # %rsi: 16 byte input block m
+-      # %rdx: Poly1305 key r[5]
+-      # %rcx: Doubleblock count
+-      # %r8:  Poly1305 derived key r^2 u[5]
+-
+-      # This two-block variant further improves performance by using loop
+-      # unrolled block processing. This is more straight forward and does
+-      # less byte shuffling, but requires a second Poly1305 key r^2:
+-      # h = (h + m) * r    =>    h = (h + m1) * r^2 + m2 * r
+-
+-      push            %rbx
+-      push            %r12
+-      push            %r13
+-
+-      # combine r0,u0
+-      movd            u0,ru0
+-      movd            r0,t1
+-      punpcklqdq      t1,ru0
+-
+-      # combine r1,u1 and s1=r1*5,v1=u1*5
+-      movd            u1,ru1
+-      movd            r1,t1
+-      punpcklqdq      t1,ru1
+-      movdqa          ru1,sv1
+-      pslld           $2,sv1
+-      paddd           ru1,sv1
+-
+-      # combine r2,u2 and s2=r2*5,v2=u2*5
+-      movd            u2,ru2
+-      movd            r2,t1
+-      punpcklqdq      t1,ru2
+-      movdqa          ru2,sv2
+-      pslld           $2,sv2
+-      paddd           ru2,sv2
+-
+-      # combine r3,u3 and s3=r3*5,v3=u3*5
+-      movd            u3,ru3
+-      movd            r3,t1
+-      punpcklqdq      t1,ru3
+-      movdqa          ru3,sv3
+-      pslld           $2,sv3
+-      paddd           ru3,sv3
+-
+-      # combine r4,u4 and s4=r4*5,v4=u4*5
+-      movd            u4,ru4
+-      movd            r4,t1
+-      punpcklqdq      t1,ru4
+-      movdqa          ru4,sv4
+-      pslld           $2,sv4
+-      paddd           ru4,sv4
+-
+-.Ldoblock2:
+-      # hc0 = [ m[16-19] & 0x3ffffff, h0 + m[0-3] & 0x3ffffff ]
+-      movd            0x00(m),hc0
+-      movd            0x10(m),t1
+-      punpcklqdq      t1,hc0
+-      pand            ANMASK(%rip),hc0
+-      movd            h0,t1
+-      paddd           t1,hc0
+-      # hc1 = [ (m[19-22] >> 2) & 0x3ffffff, h1 + (m[3-6] >> 2) & 0x3ffffff ]
+-      movd            0x03(m),hc1
+-      movd            0x13(m),t1
+-      punpcklqdq      t1,hc1
+-      psrld           $2,hc1
+-      pand            ANMASK(%rip),hc1
+-      movd            h1,t1
+-      paddd           t1,hc1
+-      # hc2 = [ (m[22-25] >> 4) & 0x3ffffff, h2 + (m[6-9] >> 4) & 0x3ffffff ]
+-      movd            0x06(m),hc2
+-      movd            0x16(m),t1
+-      punpcklqdq      t1,hc2
+-      psrld           $4,hc2
+-      pand            ANMASK(%rip),hc2
+-      movd            h2,t1
+-      paddd           t1,hc2
+-      # hc3 = [ (m[25-28] >> 6) & 0x3ffffff, h3 + (m[9-12] >> 6) & 0x3ffffff ]
+-      movd            0x09(m),hc3
+-      movd            0x19(m),t1
+-      punpcklqdq      t1,hc3
+-      psrld           $6,hc3
+-      pand            ANMASK(%rip),hc3
+-      movd            h3,t1
+-      paddd           t1,hc3
+-      # hc4 = [ (m[28-31] >> 8) | (1<<24), h4 + (m[12-15] >> 8) | (1<<24) ]
+-      movd            0x0c(m),hc4
+-      movd            0x1c(m),t1
+-      punpcklqdq      t1,hc4
+-      psrld           $8,hc4
+-      por             ORMASK(%rip),hc4
+-      movd            h4,t1
+-      paddd           t1,hc4
+-
+-      # t1 = [ hc0[1] * r0, hc0[0] * u0 ]
+-      movdqa          ru0,t1
+-      pmuludq         hc0,t1
+-      # t1 += [ hc1[1] * s4, hc1[0] * v4 ]
+-      movdqa          sv4,t2
+-      pmuludq         hc1,t2
+-      paddq           t2,t1
+-      # t1 += [ hc2[1] * s3, hc2[0] * v3 ]
+-      movdqa          sv3,t2
+-      pmuludq         hc2,t2
+-      paddq           t2,t1
+-      # t1 += [ hc3[1] * s2, hc3[0] * v2 ]
+-      movdqa          sv2,t2
+-      pmuludq         hc3,t2
+-      paddq           t2,t1
+-      # t1 += [ hc4[1] * s1, hc4[0] * v1 ]
+-      movdqa          sv1,t2
+-      pmuludq         hc4,t2
+-      paddq           t2,t1
+-      # d0 = t1[0] + t1[1]
+-      movdqa          t1,t2
+-      psrldq          $8,t2
+-      paddq           t2,t1
+-      movq            t1,d0
+-
+-      # t1 = [ hc0[1] * r1, hc0[0] * u1 ]
+-      movdqa          ru1,t1
+-      pmuludq         hc0,t1
+-      # t1 += [ hc1[1] * r0, hc1[0] * u0 ]
+-      movdqa          ru0,t2
+-      pmuludq         hc1,t2
+-      paddq           t2,t1
+-      # t1 += [ hc2[1] * s4, hc2[0] * v4 ]
+-      movdqa          sv4,t2
+-      pmuludq         hc2,t2
+-      paddq           t2,t1
+-      # t1 += [ hc3[1] * s3, hc3[0] * v3 ]
+-      movdqa          sv3,t2
+-      pmuludq         hc3,t2
+-      paddq           t2,t1
+-      # t1 += [ hc4[1] * s2, hc4[0] * v2 ]
+-      movdqa          sv2,t2
+-      pmuludq         hc4,t2
+-      paddq           t2,t1
+-      # d1 = t1[0] + t1[1]
+-      movdqa          t1,t2
+-      psrldq          $8,t2
+-      paddq           t2,t1
+-      movq            t1,d1
+-
+-      # t1 = [ hc0[1] * r2, hc0[0] * u2 ]
+-      movdqa          ru2,t1
+-      pmuludq         hc0,t1
+-      # t1 += [ hc1[1] * r1, hc1[0] * u1 ]
+-      movdqa          ru1,t2
+-      pmuludq         hc1,t2
+-      paddq           t2,t1
+-      # t1 += [ hc2[1] * r0, hc2[0] * u0 ]
+-      movdqa          ru0,t2
+-      pmuludq         hc2,t2
+-      paddq           t2,t1
+-      # t1 += [ hc3[1] * s4, hc3[0] * v4 ]
+-      movdqa          sv4,t2
+-      pmuludq         hc3,t2
+-      paddq           t2,t1
+-      # t1 += [ hc4[1] * s3, hc4[0] * v3 ]
+-      movdqa          sv3,t2
+-      pmuludq         hc4,t2
+-      paddq           t2,t1
+-      # d2 = t1[0] + t1[1]
+-      movdqa          t1,t2
+-      psrldq          $8,t2
+-      paddq           t2,t1
+-      movq            t1,d2
+-
+-      # t1 = [ hc0[1] * r3, hc0[0] * u3 ]
+-      movdqa          ru3,t1
+-      pmuludq         hc0,t1
+-      # t1 += [ hc1[1] * r2, hc1[0] * u2 ]
+-      movdqa          ru2,t2
+-      pmuludq         hc1,t2
+-      paddq           t2,t1
+-      # t1 += [ hc2[1] * r1, hc2[0] * u1 ]
+-      movdqa          ru1,t2
+-      pmuludq         hc2,t2
+-      paddq           t2,t1
+-      # t1 += [ hc3[1] * r0, hc3[0] * u0 ]
+-      movdqa          ru0,t2
+-      pmuludq         hc3,t2
+-      paddq           t2,t1
+-      # t1 += [ hc4[1] * s4, hc4[0] * v4 ]
+-      movdqa          sv4,t2
+-      pmuludq         hc4,t2
+-      paddq           t2,t1
+-      # d3 = t1[0] + t1[1]
+-      movdqa          t1,t2
+-      psrldq          $8,t2
+-      paddq           t2,t1
+-      movq            t1,d3
+-
+-      # t1 = [ hc0[1] * r4, hc0[0] * u4 ]
+-      movdqa          ru4,t1
+-      pmuludq         hc0,t1
+-      # t1 += [ hc1[1] * r3, hc1[0] * u3 ]
+-      movdqa          ru3,t2
+-      pmuludq         hc1,t2
+-      paddq           t2,t1
+-      # t1 += [ hc2[1] * r2, hc2[0] * u2 ]
+-      movdqa          ru2,t2
+-      pmuludq         hc2,t2
+-      paddq           t2,t1
+-      # t1 += [ hc3[1] * r1, hc3[0] * u1 ]
+-      movdqa          ru1,t2
+-      pmuludq         hc3,t2
+-      paddq           t2,t1
+-      # t1 += [ hc4[1] * r0, hc4[0] * u0 ]
+-      movdqa          ru0,t2
+-      pmuludq         hc4,t2
+-      paddq           t2,t1
+-      # d4 = t1[0] + t1[1]
+-      movdqa          t1,t2
+-      psrldq          $8,t2
+-      paddq           t2,t1
+-      movq            t1,d4
+-
+-      # Now do a partial reduction mod (2^130)-5, carrying h0 -> h1 -> h2 ->
+-      # h3 -> h4 -> h0 -> h1 to get h0,h2,h3,h4 < 2^26 and h1 < 2^26 + a small
+-      # amount.  Careful: we must not assume the carry bits 'd0 >> 26',
+-      # 'd1 >> 26', 'd2 >> 26', 'd3 >> 26', and '(d4 >> 26) * 5' fit in 32-bit
+-      # integers.  It's true in a single-block implementation, but not here.
+-
+-      # d1 += d0 >> 26
+-      mov             d0,%rax
+-      shr             $26,%rax
+-      add             %rax,d1
+-      # h0 = d0 & 0x3ffffff
+-      mov             d0,%rbx
+-      and             $0x3ffffff,%ebx
+-
+-      # d2 += d1 >> 26
+-      mov             d1,%rax
+-      shr             $26,%rax
+-      add             %rax,d2
+-      # h1 = d1 & 0x3ffffff
+-      mov             d1,%rax
+-      and             $0x3ffffff,%eax
+-      mov             %eax,h1
+-
+-      # d3 += d2 >> 26
+-      mov             d2,%rax
+-      shr             $26,%rax
+-      add             %rax,d3
+-      # h2 = d2 & 0x3ffffff
+-      mov             d2,%rax
+-      and             $0x3ffffff,%eax
+-      mov             %eax,h2
+-
+-      # d4 += d3 >> 26
+-      mov             d3,%rax
+-      shr             $26,%rax
+-      add             %rax,d4
+-      # h3 = d3 & 0x3ffffff
+-      mov             d3,%rax
+-      and             $0x3ffffff,%eax
+-      mov             %eax,h3
+-
+-      # h0 += (d4 >> 26) * 5
+-      mov             d4,%rax
+-      shr             $26,%rax
+-      lea             (%rax,%rax,4),%rax
+-      add             %rax,%rbx
+-      # h4 = d4 & 0x3ffffff
+-      mov             d4,%rax
+-      and             $0x3ffffff,%eax
+-      mov             %eax,h4
+-
+-      # h1 += h0 >> 26
+-      mov             %rbx,%rax
+-      shr             $26,%rax
+-      add             %eax,h1
+-      # h0 = h0 & 0x3ffffff
+-      andl            $0x3ffffff,%ebx
+-      mov             %ebx,h0
+-
+-      add             $0x20,m
+-      dec             %rcx
+-      jnz             .Ldoblock2
+-
+-      pop             %r13
+-      pop             %r12
+-      pop             %rbx
+-      ret
+-ENDPROC(poly1305_2block_sse2)
+--- a/arch/x86/crypto/poly1305-x86_64-cryptogams.pl
++++ b/arch/x86/crypto/poly1305-x86_64-cryptogams.pl
+@@ -1,11 +1,14 @@
+-#! /usr/bin/env perl
+-# Copyright 2016-2018 The OpenSSL Project Authors. All Rights Reserved.
++#!/usr/bin/env perl
++# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
+ #
+-# Licensed under the OpenSSL license (the "License").  You may not use
+-# this file except in compliance with the License.  You can obtain a copy
+-# in the file LICENSE in the source distribution or at
+-# https://www.openssl.org/source/license.html
+-
++# Copyright (C) 2017-2018 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
++# Copyright (C) 2017-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
++# Copyright (C) 2006-2017 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved.
++#
++# This code is taken from the OpenSSL project but the author, Andy Polyakov,
++# has relicensed it under the licenses specified in the SPDX header above.
++# The original headers, including the original license headers, are
++# included below for completeness.
+ #
+ # ====================================================================
+ # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+@@ -32,7 +35,7 @@
+ # Skylake-X system performance. Since we are likely to suppress
+ # AVX512F capability flag [at least on Skylake-X], conversion serves
+ # as kind of "investment protection". Note that next *lake processor,
+-# Cannolake, has AVX512IFMA code path to execute...
++# Cannonlake, has AVX512IFMA code path to execute...
+ #
+ # Numbers are cycles per processed byte with poly1305_blocks alone,
+ # measured with rdtsc at fixed clock frequency.
+@@ -68,39 +71,114 @@ $output  = shift;
+ if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+ 
+ $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
++$kernel=0; $kernel=1 if (!$flavour && !$output);
+ 
+-$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+-( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+-( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+-die "can't locate x86_64-xlate.pl";
+-
+-if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+-              =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
+-      $avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25) + ($1>=2.26);
++if (!$kernel) {       # standalone perlasm build: locate the translator, pipe output through it, probe the assembler
++      $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
++      ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
++      ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
++      die "can't locate x86_64-xlate.pl";
++
++      open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";   # everything printed later is fed to x86_64-xlate.pl
++      *STDOUT=*OUT;
++
++      if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
++          =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
++              $avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25);    # one level per GNU as version threshold reached (0..3)
++      }
++
++      if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
++          `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
++              $avx = ($1>=2.09) + ($1>=2.10) + ($1>=2.12);
++              $avx += 1 if ($1==2.11 && $2>=8);       # NASM 2.11.8 and later 2.11.x get one extra level
++      }
++
++      if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
++          `ml64 2>&1` =~ /Version ([0-9]+)\./) {
++              $avx = ($1>=10) + ($1>=11);
++      }
++
++      if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
++              $avx = ($2>=3.0) + ($2>3.0);    # last resort: derive the level from the clang/LLVM version
++      }
++} else {       # kernel build: no translator pipe and no toolchain probing
++      $avx = 4; # The kernel uses ifdefs for this.
+ }
+ 
+-if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
+-         `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
+-      $avx = ($1>=2.09) + ($1>=2.10) + 2 * ($1>=2.12);
+-      $avx += 2 if ($1==2.11 && $2>=8);
++sub declare_function {        # args: ($name, $align, $nargs); appends the function entry sequence to $code
++      my ($name, $align, $nargs) = @_;
++      if($kernel) {   # kernel build: ENTRY() from linux/linkage.h; $nargs is not used here
++              $code .= ".align $align\n";
++              $code .= "ENTRY($name)\n";
++              $code .= ".L$name:\n";  # .L-prefixed label placed directly at the entry point
++      } else {        # standalone build: symbol, type (with argument count) and alignment directives
++              $code .= ".globl        $name\n";
++              $code .= ".type $name,\@function,$nargs\n";
++              $code .= ".align        $align\n";
++              $code .= "$name:\n";
++      }
+ }
+ 
+-if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
+-         `ml64 2>&1` =~ /Version ([0-9]+)\./) {
+-      $avx = ($1>=10) + ($1>=12);
++sub end_function {    # args: ($name); appends the function end marker to $code
++      my ($name) = @_;
++      if($kernel) {   # kernel build: counterpart of the ENTRY() line emitted by declare_function
++              $code .= "ENDPROC($name)\n";
++      } else {        # standalone build: record the symbol size
++              $code .= ".size   $name,.-$name\n";
++      }
+ }
+ 
+-if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
+-      $avx = ($2>=3.0) + ($2>3.0);
+-}
++$code.=<<___ if $kernel;
++#include <linux/linkage.h>
++___
+ 
+-open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
+-*STDOUT=*OUT;
++if ($avx) {
++$code.=<<___ if $kernel;
++.section .rodata
++___
++$code.=<<___;
++.align        64
++.Lconst:
++.Lmask24:
++.long 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
++.L129:
++.long `1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0
++.Lmask26:
++.long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
++.Lpermd_avx2:
++.long 2,2,2,3,2,0,2,1
++.Lpermd_avx512:
++.long 0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7
++
++.L2_44_inp_permd:
++.long 0,1,1,2,2,3,7,7
++.L2_44_inp_shift:
++.quad 0,12,24,64
++.L2_44_mask:
++.quad 0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff
++.L2_44_shift_rgt:
++.quad 44,44,42,64
++.L2_44_shift_lft:
++.quad 8,8,10,64
++
++.align        64
++.Lx_mask44:
++.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
++.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
++.Lx_mask42:
++.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
++.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
++___
++}
++$code.=<<___ if (!$kernel);
++.asciz        "Poly1305 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
++.align        16
++___
+ 
+ my ($ctx,$inp,$len,$padbit)=("%rdi","%rsi","%rdx","%rcx");
+ my ($mac,$nonce)=($inp,$len); # *_emit arguments
+-my ($d1,$d2,$d3, $r0,$r1,$s1)=map("%r$_",(8..13));
+-my ($h0,$h1,$h2)=("%r14","%rbx","%rbp");
++my ($d1,$d2,$d3, $r0,$r1,$s1)=("%r8","%r9","%rdi","%r11","%r12","%r13");
++my ($h0,$h1,$h2)=("%r14","%rbx","%r10");
+ 
+ sub poly1305_iteration {
+ # input:      copy of $r1 in %rax, $h0-$h2, $r0-$r1
+@@ -155,19 +233,19 @@ ___
+ 
+ $code.=<<___;
+ .text
+-
++___
++$code.=<<___ if (!$kernel);
+ .extern       OPENSSL_ia32cap_P
+ 
+-.globl        poly1305_init
+-.hidden       poly1305_init
+-.globl        poly1305_blocks
+-.hidden       poly1305_blocks
+-.globl        poly1305_emit
+-.hidden       poly1305_emit
+-
+-.type poly1305_init,\@function,3
+-.align        32
+-poly1305_init:
++.globl        poly1305_init_x86_64
++.hidden       poly1305_init_x86_64
++.globl        poly1305_blocks_x86_64
++.hidden       poly1305_blocks_x86_64
++.globl        poly1305_emit_x86_64
++.hidden       poly1305_emit_x86_64
++___
++&declare_function("poly1305_init_x86_64", 32, 3);
++$code.=<<___;
+       xor     %rax,%rax
+       mov     %rax,0($ctx)            # initialize hash value
+       mov     %rax,8($ctx)
+@@ -175,11 +253,12 @@ poly1305_init:
+ 
+       cmp     \$0,$inp
+       je      .Lno_key
+-
+-      lea     poly1305_blocks(%rip),%r10
+-      lea     poly1305_emit(%rip),%r11
+ ___
+-$code.=<<___  if ($avx);
++$code.=<<___ if (!$kernel);
++      lea     poly1305_blocks_x86_64(%rip),%r10
++      lea     poly1305_emit_x86_64(%rip),%r11
++___
++$code.=<<___  if (!$kernel && $avx);
+       mov     OPENSSL_ia32cap_P+4(%rip),%r9
+       lea     poly1305_blocks_avx(%rip),%rax
+       lea     poly1305_emit_avx(%rip),%rcx
+@@ -187,12 +266,12 @@ $code.=<<___     if ($avx);
+       cmovc   %rax,%r10
+       cmovc   %rcx,%r11
+ ___
+-$code.=<<___  if ($avx>1);
++$code.=<<___  if (!$kernel && $avx>1);
+       lea     poly1305_blocks_avx2(%rip),%rax
+       bt      \$`5+32`,%r9            # AVX2?
+       cmovc   %rax,%r10
+ ___
+-$code.=<<___  if ($avx>3);
++$code.=<<___  if (!$kernel && $avx>3);
+       mov     \$`(1<<31|1<<21|1<<16)`,%rax
+       shr     \$32,%r9
+       and     %rax,%r9
+@@ -207,11 +286,11 @@ $code.=<<___;
+       mov     %rax,24($ctx)
+       mov     %rcx,32($ctx)
+ ___
+-$code.=<<___  if ($flavour !~ /elf32/);
++$code.=<<___  if (!$kernel && $flavour !~ /elf32/);
+       mov     %r10,0(%rdx)
+       mov     %r11,8(%rdx)
+ ___
+-$code.=<<___  if ($flavour =~ /elf32/);
++$code.=<<___  if (!$kernel && $flavour =~ /elf32/);
+       mov     %r10d,0(%rdx)
+       mov     %r11d,4(%rdx)
+ ___
+@@ -219,11 +298,11 @@ $code.=<<___;
+       mov     \$1,%eax
+ .Lno_key:
+       ret
+-.size poly1305_init,.-poly1305_init
++___
++&end_function("poly1305_init_x86_64");
+ 
+-.type poly1305_blocks,\@function,4
+-.align        32
+-poly1305_blocks:
++&declare_function("poly1305_blocks_x86_64", 32, 4);
++$code.=<<___;
+ .cfi_startproc
+ .Lblocks:
+       shr     \$4,$len
+@@ -231,8 +310,6 @@ poly1305_blocks:
+ 
+       push    %rbx
+ .cfi_push     %rbx
+-      push    %rbp
+-.cfi_push     %rbp
+       push    %r12
+ .cfi_push     %r12
+       push    %r13
+@@ -241,6 +318,8 @@ poly1305_blocks:
+ .cfi_push     %r14
+       push    %r15
+ .cfi_push     %r15
++      push    $ctx
++.cfi_push     $ctx
+ .Lblocks_body:
+ 
+       mov     $len,%r15               # reassign $len
+@@ -265,26 +344,29 @@ poly1305_blocks:
+       lea     16($inp),$inp
+       adc     $padbit,$h2
+ ___
++
+       &poly1305_iteration();
++
+ $code.=<<___;
+       mov     $r1,%rax
+       dec     %r15                    # len-=16
+       jnz     .Loop
+ 
++      mov     0(%rsp),$ctx
++.cfi_restore  $ctx
++
+       mov     $h0,0($ctx)             # store hash value
+       mov     $h1,8($ctx)
+       mov     $h2,16($ctx)
+ 
+-      mov     0(%rsp),%r15
++      mov     8(%rsp),%r15
+ .cfi_restore  %r15
+-      mov     8(%rsp),%r14
++      mov     16(%rsp),%r14
+ .cfi_restore  %r14
+-      mov     16(%rsp),%r13
++      mov     24(%rsp),%r13
+ .cfi_restore  %r13
+-      mov     24(%rsp),%r12
++      mov     32(%rsp),%r12
+ .cfi_restore  %r12
+-      mov     32(%rsp),%rbp
+-.cfi_restore  %rbp
+       mov     40(%rsp),%rbx
+ .cfi_restore  %rbx
+       lea     48(%rsp),%rsp
+@@ -293,11 +375,11 @@ $code.=<<___;
+ .Lblocks_epilogue:
+       ret
+ .cfi_endproc
+-.size poly1305_blocks,.-poly1305_blocks
++___
++&end_function("poly1305_blocks_x86_64");
+ 
+-.type poly1305_emit,\@function,3
+-.align        32
+-poly1305_emit:
++&declare_function("poly1305_emit_x86_64", 32, 3);
++$code.=<<___;
+ .Lemit:
+       mov     0($ctx),%r8     # load hash value
+       mov     8($ctx),%r9
+@@ -318,10 +400,14 @@ poly1305_emit:
+       mov     %rcx,8($mac)
+ 
+       ret
+-.size poly1305_emit,.-poly1305_emit
+ ___
++&end_function("poly1305_emit_x86_64");
+ if ($avx) {
+ 
++if($kernel) {
++      $code .= "#ifdef CONFIG_AS_AVX\n";
++}
++
+ ########################################################################
+ # Layout of opaque area is following.
+ #
+@@ -342,15 +428,19 @@ $code.=<<___;
+ .type __poly1305_block,\@abi-omnipotent
+ .align        32
+ __poly1305_block:
++      push $ctx
+ ___
+       &poly1305_iteration();
+ $code.=<<___;
++      pop $ctx
+       ret
+ .size __poly1305_block,.-__poly1305_block
+ 
+ .type __poly1305_init_avx,\@abi-omnipotent
+ .align        32
+ __poly1305_init_avx:
++      push %rbp
++      mov %rsp,%rbp
+       mov     $r0,$h0
+       mov     $r1,$h1
+       xor     $h2,$h2
+@@ -507,12 +597,13 @@ __poly1305_init_avx:
+       mov     $d1#d,`16*8+8-64`($ctx)
+ 
+       lea     -48-64($ctx),$ctx       # size [de-]optimization
++      pop %rbp
+       ret
+ .size __poly1305_init_avx,.-__poly1305_init_avx
++___
+ 
+-.type poly1305_blocks_avx,\@function,4
+-.align        32
+-poly1305_blocks_avx:
++&declare_function("poly1305_blocks_avx", 32, 4);
++$code.=<<___;
+ .cfi_startproc
+       mov     20($ctx),%r8d           # is_base2_26
+       cmp     \$128,$len
+@@ -532,10 +623,11 @@ poly1305_blocks_avx:
+       test    \$31,$len
+       jz      .Leven_avx
+ 
+-      push    %rbx
+-.cfi_push     %rbx
+       push    %rbp
+ .cfi_push     %rbp
++      mov     %rsp,%rbp
++      push    %rbx
++.cfi_push     %rbx
+       push    %r12
+ .cfi_push     %r12
+       push    %r13
+@@ -645,20 +737,18 @@ poly1305_blocks_avx:
+       mov     $h2#d,16($ctx)
+ .align        16
+ .Ldone_avx:
+-      mov     0(%rsp),%r15
++      pop             %r15
+ .cfi_restore  %r15
+-      mov     8(%rsp),%r14
++      pop             %r14
+ .cfi_restore  %r14
+-      mov     16(%rsp),%r13
++      pop             %r13
+ .cfi_restore  %r13
+-      mov     24(%rsp),%r12
++      pop             %r12
+ .cfi_restore  %r12
+-      mov     32(%rsp),%rbp
+-.cfi_restore  %rbp
+-      mov     40(%rsp),%rbx
++      pop             %rbx
+ .cfi_restore  %rbx
+-      lea     48(%rsp),%rsp
+-.cfi_adjust_cfa_offset        -48
++      pop             %rbp
++.cfi_restore  %rbp
+ .Lno_data_avx:
+ .Lblocks_avx_epilogue:
+       ret
+@@ -667,10 +757,11 @@ poly1305_blocks_avx:
+ .align        32
+ .Lbase2_64_avx:
+ .cfi_startproc
+-      push    %rbx
+-.cfi_push     %rbx
+       push    %rbp
+ .cfi_push     %rbp
++      mov     %rsp,%rbp
++      push    %rbx
++.cfi_push     %rbx
+       push    %r12
+ .cfi_push     %r12
+       push    %r13
+@@ -736,22 +827,18 @@ poly1305_blocks_avx:
+ 
+ .Lproceed_avx:
+       mov     %r15,$len
+-
+-      mov     0(%rsp),%r15
++      pop             %r15
+ .cfi_restore  %r15
+-      mov     8(%rsp),%r14
++      pop             %r14
+ .cfi_restore  %r14
+-      mov     16(%rsp),%r13
++      pop             %r13
+ .cfi_restore  %r13
+-      mov     24(%rsp),%r12
++      pop             %r12
+ .cfi_restore  %r12
+-      mov     32(%rsp),%rbp
+-.cfi_restore  %rbp
+-      mov     40(%rsp),%rbx
++      pop             %rbx
+ .cfi_restore  %rbx
+-      lea     48(%rsp),%rax
+-      lea     48(%rsp),%rsp
+-.cfi_adjust_cfa_offset        -48
++      pop             %rbp
++.cfi_restore  %rbp
+ .Lbase2_64_avx_epilogue:
+       jmp     .Ldo_avx
+ .cfi_endproc
+@@ -768,8 +855,11 @@ poly1305_blocks_avx:
+ .Ldo_avx:
+ ___
+ $code.=<<___  if (!$win64);
++      lea             8(%rsp),%r10
++.cfi_def_cfa_register %r10
++      and             \$-32,%rsp
++      sub             \$-8,%rsp
+       lea             -0x58(%rsp),%r11
+-.cfi_def_cfa          %r11,0x60
+       sub             \$0x178,%rsp
+ ___
+ $code.=<<___  if ($win64);
+@@ -1361,18 +1451,18 @@ $code.=<<___   if ($win64);
+ .Ldo_avx_epilogue:
+ ___
+ $code.=<<___  if (!$win64);
+-      lea             0x58(%r11),%rsp
+-.cfi_def_cfa          %rsp,8
++      lea             -8(%r10),%rsp
++.cfi_def_cfa_register %rsp
+ ___
+ $code.=<<___;
+       vzeroupper
+       ret
+ .cfi_endproc
+-.size poly1305_blocks_avx,.-poly1305_blocks_avx
++___
++&end_function("poly1305_blocks_avx");
+ 
+-.type poly1305_emit_avx,\@function,3
+-.align        32
+-poly1305_emit_avx:
++&declare_function("poly1305_emit_avx", 32, 3);
++$code.=<<___;
+       cmpl    \$0,20($ctx)    # is_base2_26?
+       je      .Lemit
+ 
+@@ -1423,41 +1513,51 @@ poly1305_emit_avx:
+       mov     %rcx,8($mac)
+ 
+       ret
+-.size poly1305_emit_avx,.-poly1305_emit_avx
+ ___
++&end_function("poly1305_emit_avx");
++
++if ($kernel) {
++      $code .= "#endif\n";
++}
+ 
+ if ($avx>1) {
++
++if ($kernel) {
++      $code .= "#ifdef CONFIG_AS_AVX2\n";
++}
++
+ my ($H0,$H1,$H2,$H3,$H4, $MASK, $T4,$T0,$T1,$T2,$T3, $D0,$D1,$D2,$D3,$D4) =
+     map("%ymm$_",(0..15));
+ my $S4=$MASK;
+ 
++sub poly1305_blocks_avxN {
++      my ($avx512) = @_;
++      my $suffix = $avx512 ? "_avx512" : "";
+ $code.=<<___;
+-.type poly1305_blocks_avx2,\@function,4
+-.align        32
+-poly1305_blocks_avx2:
+ .cfi_startproc
+       mov     20($ctx),%r8d           # is_base2_26
+       cmp     \$128,$len
+-      jae     .Lblocks_avx2
++      jae     .Lblocks_avx2$suffix
+       test    %r8d,%r8d
+       jz      .Lblocks
+ 
+-.Lblocks_avx2:
++.Lblocks_avx2$suffix:
+       and     \$-16,$len
+-      jz      .Lno_data_avx2
++      jz      .Lno_data_avx2$suffix
+ 
+       vzeroupper
+ 
+       test    %r8d,%r8d
+-      jz      .Lbase2_64_avx2
++      jz      .Lbase2_64_avx2$suffix
+ 
+       test    \$63,$len
+-      jz      .Leven_avx2
++      jz      .Leven_avx2$suffix
+ 
+-      push    %rbx
+-.cfi_push     %rbx
+       push    %rbp
+ .cfi_push     %rbp
++      mov     %rsp,%rbp
++      push    %rbx
++.cfi_push     %rbx
+       push    %r12
+ .cfi_push     %r12
+       push    %r13
+@@ -1466,7 +1566,7 @@ poly1305_blocks_avx2:
+ .cfi_push     %r14
+       push    %r15
+ .cfi_push     %r15
+-.Lblocks_avx2_body:
++.Lblocks_avx2_body$suffix:
+ 
+       mov     $len,%r15               # reassign $len
+ 
+@@ -1513,7 +1613,7 @@ poly1305_blocks_avx2:
+       shr     \$2,$s1
+       add     $r1,$s1                 # s1 = r1 + (r1 >> 2)
+ 
+-.Lbase2_26_pre_avx2:
++.Lbase2_26_pre_avx2$suffix:
+       add     0($inp),$h0             # accumulate input
+       adc     8($inp),$h1
+       lea     16($inp),$inp
+@@ -1524,10 +1624,10 @@ poly1305_blocks_avx2:
+       mov     $r1,%rax
+ 
+       test    \$63,%r15
+-      jnz     .Lbase2_26_pre_avx2
++      jnz     .Lbase2_26_pre_avx2$suffix
+ 
+       test    $padbit,$padbit         # if $padbit is zero,
+-      jz      .Lstore_base2_64_avx2   # store hash in base 2^64 format
++      jz      .Lstore_base2_64_avx2$suffix    # store hash in base 2^64 format
+ 
+       ################################# base 2^64 -> base 2^26
+       mov     $h0,%rax
+@@ -1548,57 +1648,56 @@ poly1305_blocks_avx2:
+       or      $r1,$h2                 # h[4]
+ 
+       test    %r15,%r15
+-      jz      .Lstore_base2_26_avx2
++      jz      .Lstore_base2_26_avx2$suffix
+ 
+       vmovd   %rax#d,%x#$H0
+       vmovd   %rdx#d,%x#$H1
+       vmovd   $h0#d,%x#$H2
+       vmovd   $h1#d,%x#$H3
+       vmovd   $h2#d,%x#$H4
+-      jmp     .Lproceed_avx2
++      jmp     .Lproceed_avx2$suffix
+ 
+ .align        32
+-.Lstore_base2_64_avx2:
++.Lstore_base2_64_avx2$suffix:
+       mov     $h0,0($ctx)
+       mov     $h1,8($ctx)
+       mov     $h2,16($ctx)            # note that is_base2_26 is zeroed
+-      jmp     .Ldone_avx2
++      jmp     .Ldone_avx2$suffix
+ 
+ .align        16
+-.Lstore_base2_26_avx2:
++.Lstore_base2_26_avx2$suffix:
+       mov     %rax#d,0($ctx)          # store hash value base 2^26
+       mov     %rdx#d,4($ctx)
+       mov     $h0#d,8($ctx)
+       mov     $h1#d,12($ctx)
+       mov     $h2#d,16($ctx)
+ .align        16
+-.Ldone_avx2:
+-      mov     0(%rsp),%r15
++.Ldone_avx2$suffix:
++      pop             %r15
+ .cfi_restore  %r15
+-      mov     8(%rsp),%r14
++      pop             %r14
+ .cfi_restore  %r14
+-      mov     16(%rsp),%r13
++      pop             %r13
+ .cfi_restore  %r13
+-      mov     24(%rsp),%r12
++      pop             %r12
+ .cfi_restore  %r12
+-      mov     32(%rsp),%rbp
+-.cfi_restore  %rbp
+-      mov     40(%rsp),%rbx
++      pop             %rbx
+ .cfi_restore  %rbx
+-      lea     48(%rsp),%rsp
+-.cfi_adjust_cfa_offset        -48
+-.Lno_data_avx2:
+-.Lblocks_avx2_epilogue:
++      pop             %rbp
++.cfi_restore  %rbp
++.Lno_data_avx2$suffix:
++.Lblocks_avx2_epilogue$suffix:
+       ret
+ .cfi_endproc
+ 
+ .align        32
+-.Lbase2_64_avx2:
++.Lbase2_64_avx2$suffix:
+ .cfi_startproc
+-      push    %rbx
+-.cfi_push     %rbx
+       push    %rbp
+ .cfi_push     %rbp
++      mov     %rsp,%rbp
++      push    %rbx
++.cfi_push     %rbx
+       push    %r12
+ .cfi_push     %r12
+       push    %r13
+@@ -1607,7 +1706,7 @@ poly1305_blocks_avx2:
+ .cfi_push     %r14
+       push    %r15
+ .cfi_push     %r15
+-.Lbase2_64_avx2_body:
++.Lbase2_64_avx2_body$suffix:
+ 
+       mov     $len,%r15               # reassign $len
+ 
+@@ -1624,9 +1723,9 @@ poly1305_blocks_avx2:
+       add     $r1,$s1                 # s1 = r1 + (r1 >> 2)
+ 
+       test    \$63,$len
+-      jz      .Linit_avx2
++      jz      .Linit_avx2$suffix
+ 
+-.Lbase2_64_pre_avx2:
++.Lbase2_64_pre_avx2$suffix:
+       add     0($inp),$h0             # accumulate input
+       adc     8($inp),$h1
+       lea     16($inp),$inp
+@@ -1637,9 +1736,9 @@ poly1305_blocks_avx2:
+       mov     $r1,%rax
+ 
+       test    \$63,%r15
+-      jnz     .Lbase2_64_pre_avx2
++      jnz     .Lbase2_64_pre_avx2$suffix
+ 
+-.Linit_avx2:
++.Linit_avx2$suffix:
+       ################################# base 2^64 -> base 2^26
+       mov     $h0,%rax
+       mov     $h0,%rdx
+@@ -1667,69 +1766,77 @@ poly1305_blocks_avx2:
+ 
+       call    __poly1305_init_avx
+ 
+-.Lproceed_avx2:
++.Lproceed_avx2$suffix:
+       mov     %r15,$len                       # restore $len
+-      mov     OPENSSL_ia32cap_P+8(%rip),%r10d
++___
++$code.=<<___ if (!$kernel);
++      mov     OPENSSL_ia32cap_P+8(%rip),%r9d
+       mov     \$`(1<<31|1<<30|1<<16)`,%r11d
+-
+-      mov     0(%rsp),%r15
++___
++$code.=<<___;
++      pop             %r15
+ .cfi_restore  %r15
+-      mov     8(%rsp),%r14
++      pop             %r14
+ .cfi_restore  %r14
+-      mov     16(%rsp),%r13
++      pop             %r13
+ .cfi_restore  %r13
+-      mov     24(%rsp),%r12
++      pop             %r12
+ .cfi_restore  %r12
+-      mov     32(%rsp),%rbp
+-.cfi_restore  %rbp
+-      mov     40(%rsp),%rbx
++      pop             %rbx
+ .cfi_restore  %rbx
+-      lea     48(%rsp),%rax
+-      lea     48(%rsp),%rsp
+-.cfi_adjust_cfa_offset        -48
+-.Lbase2_64_avx2_epilogue:
+-      jmp     .Ldo_avx2
++      pop             %rbp
++.cfi_restore  %rbp
++.Lbase2_64_avx2_epilogue$suffix:
++      jmp     .Ldo_avx2$suffix
+ .cfi_endproc
+ 
+ .align        32
+-.Leven_avx2:
++.Leven_avx2$suffix:
+ .cfi_startproc
+-      mov             OPENSSL_ia32cap_P+8(%rip),%r10d
++___
++$code.=<<___ if (!$kernel);
++      mov             OPENSSL_ia32cap_P+8(%rip),%r9d
++___
++$code.=<<___;
+       vmovd           4*0($ctx),%x#$H0        # load hash value base 2^26
+       vmovd           4*1($ctx),%x#$H1
+       vmovd           4*2($ctx),%x#$H2
+       vmovd           4*3($ctx),%x#$H3
+       vmovd           4*4($ctx),%x#$H4
+ 
+-.Ldo_avx2:
++.Ldo_avx2$suffix:
+ ___
+-$code.=<<___          if ($avx>2);
++$code.=<<___          if (!$kernel && $avx>2);
+       cmp             \$512,$len
+       jb              .Lskip_avx512
+-      and             %r11d,%r10d
+-      test            \$`1<<16`,%r10d         # check for AVX512F
++      and             %r11d,%r9d
++      test            \$`1<<16`,%r9d          # check for AVX512F
+       jnz             .Lblocks_avx512
+-.Lskip_avx512:
++.Lskip_avx512$suffix:
++___
++$code.=<<___ if ($avx > 2 && $avx512 && $kernel);
++      cmp             \$512,$len
++      jae             .Lblocks_avx512
+ ___
+ $code.=<<___  if (!$win64);
+-      lea             -8(%rsp),%r11
+-.cfi_def_cfa          %r11,16
++      lea             8(%rsp),%r10
++.cfi_def_cfa_register %r10
+       sub             \$0x128,%rsp
+ ___
+ $code.=<<___  if ($win64);
+-      lea             -0xf8(%rsp),%r11
++      lea             8(%rsp),%r10
+       sub             \$0x1c8,%rsp
+-      vmovdqa         %xmm6,0x50(%r11)
+-      vmovdqa         %xmm7,0x60(%r11)
+-      vmovdqa         %xmm8,0x70(%r11)
+-      vmovdqa         %xmm9,0x80(%r11)
+-      vmovdqa         %xmm10,0x90(%r11)
+-      vmovdqa         %xmm11,0xa0(%r11)
+-      vmovdqa         %xmm12,0xb0(%r11)
+-      vmovdqa         %xmm13,0xc0(%r11)
+-      vmovdqa         %xmm14,0xd0(%r11)
+-      vmovdqa         %xmm15,0xe0(%r11)
+-.Ldo_avx2_body:
++      vmovdqa         %xmm6,-0xb0(%r10)
++      vmovdqa         %xmm7,-0xa0(%r10)
++      vmovdqa         %xmm8,-0x90(%r10)
++      vmovdqa         %xmm9,-0x80(%r10)
++      vmovdqa         %xmm10,-0x70(%r10)
++      vmovdqa         %xmm11,-0x60(%r10)
++      vmovdqa         %xmm12,-0x50(%r10)
++      vmovdqa         %xmm13,-0x40(%r10)
++      vmovdqa         %xmm14,-0x30(%r10)
++      vmovdqa         %xmm15,-0x20(%r10)
++.Ldo_avx2_body$suffix:
+ ___
+ $code.=<<___;
+       lea             .Lconst(%rip),%rcx
+@@ -1794,11 +1901,11 @@ $code.=<<___;
+ 
+       vpaddq          $H2,$T2,$H2             # accumulate input
+       sub             \$64,$len
+-      jz              .Ltail_avx2
+-      jmp             .Loop_avx2
++      jz              .Ltail_avx2$suffix
++      jmp             .Loop_avx2$suffix
+ 
+ .align        32
+-.Loop_avx2:
++.Loop_avx2$suffix:
+       ################################################################
+       # ((inp[0]*r^4+inp[4])*r^4+inp[ 8])*r^4
+       # ((inp[1]*r^4+inp[5])*r^4+inp[ 9])*r^3
+@@ -1946,10 +2053,10 @@ $code.=<<___;
+        vpor           32(%rcx),$T4,$T4        # padbit, yes, always
+ 
+       sub             \$64,$len
+-      jnz             .Loop_avx2
++      jnz             .Loop_avx2$suffix
+ 
+       .byte           0x66,0x90
+-.Ltail_avx2:
++.Ltail_avx2$suffix:
+       ################################################################
+       # while above multiplications were by r^4 in all lanes, in last
+       # iteration we multiply least significant lane by r^4 and most
+@@ -2087,37 +2194,29 @@ $code.=<<___;
+       vmovd           %x#$H4,`4*4-48-64`($ctx)
+ ___
+ $code.=<<___  if ($win64);
+-      vmovdqa         0x50(%r11),%xmm6
+-      vmovdqa         0x60(%r11),%xmm7
+-      vmovdqa         0x70(%r11),%xmm8
+-      vmovdqa         0x80(%r11),%xmm9
+-      vmovdqa         0x90(%r11),%xmm10
+-      vmovdqa         0xa0(%r11),%xmm11
+-      vmovdqa         0xb0(%r11),%xmm12
+-      vmovdqa         0xc0(%r11),%xmm13
+-      vmovdqa         0xd0(%r11),%xmm14
+-      vmovdqa         0xe0(%r11),%xmm15
+-      lea             0xf8(%r11),%rsp
+-.Ldo_avx2_epilogue:
++      vmovdqa         -0xb0(%r10),%xmm6
++      vmovdqa         -0xa0(%r10),%xmm7
++      vmovdqa         -0x90(%r10),%xmm8
++      vmovdqa         -0x80(%r10),%xmm9
++      vmovdqa         -0x70(%r10),%xmm10
++      vmovdqa         -0x60(%r10),%xmm11
++      vmovdqa         -0x50(%r10),%xmm12
++      vmovdqa         -0x40(%r10),%xmm13
++      vmovdqa         -0x30(%r10),%xmm14
++      vmovdqa         -0x20(%r10),%xmm15
++      lea             -8(%r10),%rsp
++.Ldo_avx2_epilogue$suffix:
+ ___
+ $code.=<<___  if (!$win64);
+-      lea             8(%r11),%rsp
+-.cfi_def_cfa          %rsp,8
++      lea             -8(%r10),%rsp
++.cfi_def_cfa_register %rsp
+ ___
+ $code.=<<___;
+       vzeroupper
+       ret
+ .cfi_endproc
+-.size poly1305_blocks_avx2,.-poly1305_blocks_avx2
+ ___
+-#######################################################################
+-if ($avx>2) {
+-# On entry we have input length divisible by 64. But since inner loop
+-# processes 128 bytes per iteration, cases when length is not divisible
+-# by 128 are handled by passing tail 64 bytes to .Ltail_avx2. For this
+-# reason stack layout is kept identical to poly1305_blocks_avx2. If not
+-# for this tail, we wouldn't have to even allocate stack frame...
+-
++if($avx > 2 && $avx512) {
+ my ($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4) = map("%zmm$_",(16..24));
+ my ($M0,$M1,$M2,$M3,$M4) = map("%zmm$_",(25..29));
+ my $PADBIT="%zmm30";
+@@ -2128,32 +2227,29 @@ map(s/%y/%z/,($H0,$H1,$H2,$H3,$H4));
+ map(s/%y/%z/,($MASK));
+ 
+ $code.=<<___;
+-.type poly1305_blocks_avx512,\@function,4
+-.align        32
+-poly1305_blocks_avx512:
+ .cfi_startproc
+ .Lblocks_avx512:
+       mov             \$15,%eax
+       kmovw           %eax,%k2
+ ___
+ $code.=<<___  if (!$win64);
+-      lea             -8(%rsp),%r11
+-.cfi_def_cfa          %r11,16
++      lea             8(%rsp),%r10
++.cfi_def_cfa_register %r10
+       sub             \$0x128,%rsp
+ ___
+ $code.=<<___  if ($win64);
+-      lea             -0xf8(%rsp),%r11
++      lea             8(%rsp),%r10
+       sub             \$0x1c8,%rsp
+-      vmovdqa         %xmm6,0x50(%r11)
+-      vmovdqa         %xmm7,0x60(%r11)
+-      vmovdqa         %xmm8,0x70(%r11)
+-      vmovdqa         %xmm9,0x80(%r11)
+-      vmovdqa         %xmm10,0x90(%r11)
+-      vmovdqa         %xmm11,0xa0(%r11)
+-      vmovdqa         %xmm12,0xb0(%r11)
+-      vmovdqa         %xmm13,0xc0(%r11)
+-      vmovdqa         %xmm14,0xd0(%r11)
+-      vmovdqa         %xmm15,0xe0(%r11)
++      vmovdqa         %xmm6,-0xb0(%r10)
++      vmovdqa         %xmm7,-0xa0(%r10)
++      vmovdqa         %xmm8,-0x90(%r10)
++      vmovdqa         %xmm9,-0x80(%r10)
++      vmovdqa         %xmm10,-0x70(%r10)
++      vmovdqa         %xmm11,-0x60(%r10)
++      vmovdqa         %xmm12,-0x50(%r10)
++      vmovdqa         %xmm13,-0x40(%r10)
++      vmovdqa         %xmm14,-0x30(%r10)
++      vmovdqa         %xmm15,-0x20(%r10)
+ .Ldo_avx512_body:
+ ___
+ $code.=<<___;
+@@ -2679,7 +2775,7 @@ $code.=<<___;
+ 
+       lea             0x90(%rsp),%rax         # size optimization for .Ltail_avx2
+       add             \$64,$len
+-      jnz             .Ltail_avx2
++      jnz             .Ltail_avx2$suffix
+ 
+       vpsubq          $T2,$H2,$H2             # undo input accumulation
+       vmovd           %x#$H0,`4*0-48-64`($ctx)# save partially reduced
+@@ -2690,29 +2786,61 @@ $code.=<<___;
+       vzeroall
+ ___
+ $code.=<<___  if ($win64);
+-      movdqa          0x50(%r11),%xmm6
+-      movdqa          0x60(%r11),%xmm7
+-      movdqa          0x70(%r11),%xmm8
+-      movdqa          0x80(%r11),%xmm9
+-      movdqa          0x90(%r11),%xmm10
+-      movdqa          0xa0(%r11),%xmm11
+-      movdqa          0xb0(%r11),%xmm12
+-      movdqa          0xc0(%r11),%xmm13
+-      movdqa          0xd0(%r11),%xmm14
+-      movdqa          0xe0(%r11),%xmm15
+-      lea             0xf8(%r11),%rsp
++      movdqa          -0xb0(%r10),%xmm6
++      movdqa          -0xa0(%r10),%xmm7
++      movdqa          -0x90(%r10),%xmm8
++      movdqa          -0x80(%r10),%xmm9
++      movdqa          -0x70(%r10),%xmm10
++      movdqa          -0x60(%r10),%xmm11
++      movdqa          -0x50(%r10),%xmm12
++      movdqa          -0x40(%r10),%xmm13
++      movdqa          -0x30(%r10),%xmm14
++      movdqa          -0x20(%r10),%xmm15
++      lea             -8(%r10),%rsp
+ .Ldo_avx512_epilogue:
+ ___
+ $code.=<<___  if (!$win64);
+-      lea             8(%r11),%rsp
+-.cfi_def_cfa          %rsp,8
++      lea             -8(%r10),%rsp
++.cfi_def_cfa_register %rsp
+ ___
+ $code.=<<___;
+       ret
+ .cfi_endproc
+-.size poly1305_blocks_avx512,.-poly1305_blocks_avx512
+ ___
+-if ($avx>3) {
++
++}
++
++}
++
++&declare_function("poly1305_blocks_avx2", 32, 4);
++poly1305_blocks_avxN(0);
++&end_function("poly1305_blocks_avx2");
++
++if($kernel) {
++      $code .= "#endif\n";
++}
++
++#######################################################################
++if ($avx>2) {
++# On entry we have input length divisible by 64. But since inner loop
++# processes 128 bytes per iteration, cases when length is not divisible
++# by 128 are handled by passing tail 64 bytes to .Ltail_avx2. For this
++# reason stack layout is kept identical to poly1305_blocks_avx2. If not
++# for this tail, we wouldn't have to even allocate stack frame...
++
++if($kernel) {
++      $code .= "#ifdef CONFIG_AS_AVX512\n";
++}
++
++&declare_function("poly1305_blocks_avx512", 32, 4);
++poly1305_blocks_avxN(1);
++&end_function("poly1305_blocks_avx512");
++
++if ($kernel) {
++      $code .= "#endif\n";
++}
++
++if (!$kernel && $avx>3) {
+ ########################################################################
+ # VPMADD52 version using 2^44 radix.
+ #
+@@ -3753,45 +3881,9 @@ poly1305_emit_base2_44:
+ .size poly1305_emit_base2_44,.-poly1305_emit_base2_44
+ ___
+ }     }       }
+-$code.=<<___;
+-.align        64
+-.Lconst:
+-.Lmask24:
+-.long 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
+-.L129:
+-.long `1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0
+-.Lmask26:
+-.long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
+-.Lpermd_avx2:
+-.long 2,2,2,3,2,0,2,1
+-.Lpermd_avx512:
+-.long 0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7
+-
+-.L2_44_inp_permd:
+-.long 0,1,1,2,2,3,7,7
+-.L2_44_inp_shift:
+-.quad 0,12,24,64
+-.L2_44_mask:
+-.quad 0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff
+-.L2_44_shift_rgt:
+-.quad 44,44,42,64
+-.L2_44_shift_lft:
+-.quad 8,8,10,64
+-
+-.align        64
+-.Lx_mask44:
+-.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
+-.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
+-.Lx_mask42:
+-.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
+-.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
+-___
+ }
+-$code.=<<___;
+-.asciz        "Poly1305 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
+-.align        16
+-___
+ 
++if (!$kernel)
+ {     # chacha20-poly1305 helpers
+ my ($out,$inp,$otp,$len)=$win64 ? ("%rcx","%rdx","%r8", "%r9") :  # Win64 order
+                                   ("%rdi","%rsi","%rdx","%rcx");  # Unix order
+@@ -4038,17 +4130,17 @@ avx_handler:
+ 
+ .section      .pdata
+ .align        4
+-      .rva    .LSEH_begin_poly1305_init
+-      .rva    .LSEH_end_poly1305_init
+-      .rva    .LSEH_info_poly1305_init
+-
+-      .rva    .LSEH_begin_poly1305_blocks
+-      .rva    .LSEH_end_poly1305_blocks
+-      .rva    .LSEH_info_poly1305_blocks
+-
+-      .rva    .LSEH_begin_poly1305_emit
+-      .rva    .LSEH_end_poly1305_emit
+-      .rva    .LSEH_info_poly1305_emit
++      .rva    .LSEH_begin_poly1305_init_x86_64
++      .rva    .LSEH_end_poly1305_init_x86_64
++      .rva    .LSEH_info_poly1305_init_x86_64
++
++      .rva    .LSEH_begin_poly1305_blocks_x86_64
++      .rva    .LSEH_end_poly1305_blocks_x86_64
++      .rva    .LSEH_info_poly1305_blocks_x86_64
++
++      .rva    .LSEH_begin_poly1305_emit_x86_64
++      .rva    .LSEH_end_poly1305_emit_x86_64
++      .rva    .LSEH_info_poly1305_emit_x86_64
+ ___
+ $code.=<<___ if ($avx);
+       .rva    .LSEH_begin_poly1305_blocks_avx
+@@ -4088,20 +4180,20 @@ ___
+ $code.=<<___;
+ .section      .xdata
+ .align        8
+-.LSEH_info_poly1305_init:
++.LSEH_info_poly1305_init_x86_64:
+       .byte   9,0,0,0
+       .rva    se_handler
+-      .rva    .LSEH_begin_poly1305_init,.LSEH_begin_poly1305_init
++      .rva    .LSEH_begin_poly1305_init_x86_64,.LSEH_begin_poly1305_init_x86_64
+ 
+-.LSEH_info_poly1305_blocks:
++.LSEH_info_poly1305_blocks_x86_64:
+       .byte   9,0,0,0
+       .rva    se_handler
+       .rva    .Lblocks_body,.Lblocks_epilogue
+ 
+-.LSEH_info_poly1305_emit:
++.LSEH_info_poly1305_emit_x86_64:
+       .byte   9,0,0,0
+       .rva    se_handler
+-      .rva    .LSEH_begin_poly1305_emit,.LSEH_begin_poly1305_emit
++      .rva    .LSEH_begin_poly1305_emit_x86_64,.LSEH_begin_poly1305_emit_x86_64
+ ___
+ $code.=<<___ if ($avx);
+ .LSEH_info_poly1305_blocks_avx_1:
+@@ -4148,12 +4240,26 @@ $code.=<<___ if ($avx>2);
+ ___
+ }
+ 
++open SELF,$0;
++while(<SELF>) {
++      next if (/^#!/);
++      last if (!s/^#/\/\// and !/^$/);
++      print;
++}
++close SELF;
++
+ foreach (split('\n',$code)) {
+       s/\`([^\`]*)\`/eval($1)/ge;
+       s/%r([a-z]+)#d/%e$1/g;
+       s/%r([0-9]+)#d/%r$1d/g;
+       s/%x#%[yz]/%x/g or s/%y#%z/%y/g or s/%z#%[yz]/%z/g;
+ 
++      if ($kernel) {
++              s/(^\.type.*),[0-9]+$/\1/;
++              s/(^\.type.*),\@abi-omnipotent+$/\1,\@function/;
++              next if /^\.cfi.*/;
++      }
++
+       print $_,"\n";
+ }
+ close STDOUT;
+--- a/arch/x86/crypto/poly1305_glue.c
++++ b/arch/x86/crypto/poly1305_glue.c
+@@ -1,8 +1,6 @@
+-// SPDX-License-Identifier: GPL-2.0-or-later
++// SPDX-License-Identifier: GPL-2.0 OR MIT
+ /*
+- * Poly1305 authenticator algorithm, RFC7539, SIMD glue code
+- *
+- * Copyright (C) 2015 Martin Willi
++ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+  */
+ 
+ #include <crypto/algapi.h>
+@@ -13,279 +11,170 @@
+ #include <linux/jump_label.h>
+ #include <linux/kernel.h>
+ #include <linux/module.h>
++#include <asm/intel-family.h>
+ #include <asm/simd.h>
+ 
+-asmlinkage void poly1305_block_sse2(u32 *h, const u8 *src,
+-                                  const u32 *r, unsigned int blocks);
+-asmlinkage void poly1305_2block_sse2(u32 *h, const u8 *src, const u32 *r,
+-                                   unsigned int blocks, const u32 *u);
+-asmlinkage void poly1305_4block_avx2(u32 *h, const u8 *src, const u32 *r,
+-                                   unsigned int blocks, const u32 *u);
++asmlinkage void poly1305_init_x86_64(void *ctx,
++                                   const u8 key[POLY1305_KEY_SIZE]);
++asmlinkage void poly1305_blocks_x86_64(void *ctx, const u8 *inp,
++                                     const size_t len, const u32 padbit);
++asmlinkage void poly1305_emit_x86_64(void *ctx, u8 mac[POLY1305_DIGEST_SIZE],
++                                   const u32 nonce[4]);
++asmlinkage void poly1305_emit_avx(void *ctx, u8 mac[POLY1305_DIGEST_SIZE],
++                                const u32 nonce[4]);
++asmlinkage void poly1305_blocks_avx(void *ctx, const u8 *inp, const size_t len,
++                                  const u32 padbit);
++asmlinkage void poly1305_blocks_avx2(void *ctx, const u8 *inp, const size_t len,
++                                   const u32 padbit);
++asmlinkage void poly1305_blocks_avx512(void *ctx, const u8 *inp,
++                                     const size_t len, const u32 padbit);
+ 
+-static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_simd);
++static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx);
+ static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx2);
++static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx512);
+ 
+-static inline u64 mlt(u64 a, u64 b)
+-{
+-      return a * b;
+-}
+-
+-static inline u32 sr(u64 v, u_char n)
+-{
+-      return v >> n;
+-}
+-
+-static inline u32 and(u32 v, u32 mask)
+-{
+-      return v & mask;
+-}
+-
+-static void poly1305_simd_mult(u32 *a, const u32 *b)
+-{
+-      u8 m[POLY1305_BLOCK_SIZE];
+-
+-      memset(m, 0, sizeof(m));
+-      /* The poly1305 block function adds a hi-bit to the accumulator which
+-       * we don't need for key multiplication; compensate for it. */
+-      a[4] -= 1 << 24;
+-      poly1305_block_sse2(a, m, b, 1);
+-}
+-
+-static void poly1305_integer_setkey(struct poly1305_key *key, const u8 *raw_key)
+-{
+-      /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */
+-      key->r[0] = (get_unaligned_le32(raw_key +  0) >> 0) & 0x3ffffff;
+-      key->r[1] = (get_unaligned_le32(raw_key +  3) >> 2) & 0x3ffff03;
+-      key->r[2] = (get_unaligned_le32(raw_key +  6) >> 4) & 0x3ffc0ff;
+-      key->r[3] = (get_unaligned_le32(raw_key +  9) >> 6) & 0x3f03fff;
+-      key->r[4] = (get_unaligned_le32(raw_key + 12) >> 8) & 0x00fffff;
+-}
++struct poly1305_arch_internal {
++      union {
++              struct {
++                      u32 h[5];
++                      u32 is_base2_26;
++              };
++              u64 hs[3];
++      };
++      u64 r[2];
++      u64 pad;
++      struct { u32 r2, r1, r4, r3; } rn[9];
++};
+ 
+-static void poly1305_integer_blocks(struct poly1305_state *state,
+-                                  const struct poly1305_key *key,
+-                                  const void *src,
+-                                  unsigned int nblocks, u32 hibit)
++/* The AVX code uses base 2^26, while the scalar code uses base 2^64. If we hit
++ * the unfortunate situation of using AVX and then having to go back to scalar
++ * -- because the user is silly and has called the update function from two
++ * separate contexts -- then we need to convert back to the original base before
++ * proceeding. It is possible to reason that the initial reduction below is
++ * sufficient given the implementation invariants. However, for an avoidance of
++ * doubt and because this is not performance critical, we do the full reduction
++ * anyway. Z3 proof of below function: https://xn--4db.cc/ltPtHCKN/py
++ */
++static void convert_to_base2_64(void *ctx)
+ {
+-      u32 r0, r1, r2, r3, r4;
+-      u32 s1, s2, s3, s4;
+-      u32 h0, h1, h2, h3, h4;
+-      u64 d0, d1, d2, d3, d4;
++      struct poly1305_arch_internal *state = ctx;
++      u32 cy;
+ 
+-      if (!nblocks)
++      if (!state->is_base2_26)
+               return;
+ 
+-      r0 = key->r[0];
+-      r1 = key->r[1];
+-      r2 = key->r[2];
+-      r3 = key->r[3];
+-      r4 = key->r[4];
+-
+-      s1 = r1 * 5;
+-      s2 = r2 * 5;
+-      s3 = r3 * 5;
+-      s4 = r4 * 5;
+-
+-      h0 = state->h[0];
+-      h1 = state->h[1];
+-      h2 = state->h[2];
+-      h3 = state->h[3];
+-      h4 = state->h[4];
+-
+-      do {
+-              /* h += m[i] */
+-              h0 += (get_unaligned_le32(src +  0) >> 0) & 0x3ffffff;
+-              h1 += (get_unaligned_le32(src +  3) >> 2) & 0x3ffffff;
+-              h2 += (get_unaligned_le32(src +  6) >> 4) & 0x3ffffff;
+-              h3 += (get_unaligned_le32(src +  9) >> 6) & 0x3ffffff;
+-              h4 += (get_unaligned_le32(src + 12) >> 8) | (hibit << 24);
+-
+-              /* h *= r */
+-              d0 = mlt(h0, r0) + mlt(h1, s4) + mlt(h2, s3) +
+-                   mlt(h3, s2) + mlt(h4, s1);
+-              d1 = mlt(h0, r1) + mlt(h1, r0) + mlt(h2, s4) +
+-                   mlt(h3, s3) + mlt(h4, s2);
+-              d2 = mlt(h0, r2) + mlt(h1, r1) + mlt(h2, r0) +
+-                   mlt(h3, s4) + mlt(h4, s3);
+-              d3 = mlt(h0, r3) + mlt(h1, r2) + mlt(h2, r1) +
+-                   mlt(h3, r0) + mlt(h4, s4);
+-              d4 = mlt(h0, r4) + mlt(h1, r3) + mlt(h2, r2) +
+-                   mlt(h3, r1) + mlt(h4, r0);
+-
+-              /* (partial) h %= p */
+-              d1 += sr(d0, 26);     h0 = and(d0, 0x3ffffff);
+-              d2 += sr(d1, 26);     h1 = and(d1, 0x3ffffff);
+-              d3 += sr(d2, 26);     h2 = and(d2, 0x3ffffff);
+-              d4 += sr(d3, 26);     h3 = and(d3, 0x3ffffff);
+-              h0 += sr(d4, 26) * 5; h4 = and(d4, 0x3ffffff);
+-              h1 += h0 >> 26;       h0 = h0 & 0x3ffffff;
+-
+-              src += POLY1305_BLOCK_SIZE;
+-      } while (--nblocks);
+-
+-      state->h[0] = h0;
+-      state->h[1] = h1;
+-      state->h[2] = h2;
+-      state->h[3] = h3;
+-      state->h[4] = h4;
+-}
+-
+-static void poly1305_integer_emit(const struct poly1305_state *state, void *dst)
+-{
+-      u32 h0, h1, h2, h3, h4;
+-      u32 g0, g1, g2, g3, g4;
+-      u32 mask;
+-
+-      /* fully carry h */
+-      h0 = state->h[0];
+-      h1 = state->h[1];
+-      h2 = state->h[2];
+-      h3 = state->h[3];
+-      h4 = state->h[4];
+-
+-      h2 += (h1 >> 26);     h1 = h1 & 0x3ffffff;
+-      h3 += (h2 >> 26);     h2 = h2 & 0x3ffffff;
+-      h4 += (h3 >> 26);     h3 = h3 & 0x3ffffff;
+-      h0 += (h4 >> 26) * 5; h4 = h4 & 0x3ffffff;
+-      h1 += (h0 >> 26);     h0 = h0 & 0x3ffffff;
+-
+-      /* compute h + -p */
+-      g0 = h0 + 5;
+-      g1 = h1 + (g0 >> 26);             g0 &= 0x3ffffff;
+-      g2 = h2 + (g1 >> 26);             g1 &= 0x3ffffff;
+-      g3 = h3 + (g2 >> 26);             g2 &= 0x3ffffff;
+-      g4 = h4 + (g3 >> 26) - (1 << 26); g3 &= 0x3ffffff;
+-
+-      /* select h if h < p, or h + -p if h >= p */
+-      mask = (g4 >> ((sizeof(u32) * 8) - 1)) - 1;
+-      g0 &= mask;
+-      g1 &= mask;
+-      g2 &= mask;
+-      g3 &= mask;
+-      g4 &= mask;
+-      mask = ~mask;
+-      h0 = (h0 & mask) | g0;
+-      h1 = (h1 & mask) | g1;
+-      h2 = (h2 & mask) | g2;
+-      h3 = (h3 & mask) | g3;
+-      h4 = (h4 & mask) | g4;
+-
+-      /* h = h % (2^128) */
+-      put_unaligned_le32((h0 >>  0) | (h1 << 26), dst +  0);
+-      put_unaligned_le32((h1 >>  6) | (h2 << 20), dst +  4);
+-      put_unaligned_le32((h2 >> 12) | (h3 << 14), dst +  8);
+-      put_unaligned_le32((h3 >> 18) | (h4 <<  8), dst + 12);
+-}
+-
+-void poly1305_init_arch(struct poly1305_desc_ctx *desc, const u8 *key)
+-{
+-      poly1305_integer_setkey(desc->opaque_r, key);
+-      desc->s[0] = get_unaligned_le32(key + 16);
+-      desc->s[1] = get_unaligned_le32(key + 20);
+-      desc->s[2] = get_unaligned_le32(key + 24);
+-      desc->s[3] = get_unaligned_le32(key + 28);
+-      poly1305_core_init(&desc->h);
+-      desc->buflen = 0;
+-      desc->sset = true;
+-      desc->rset = 1;
+-}
+-EXPORT_SYMBOL_GPL(poly1305_init_arch);
+-
+-static unsigned int crypto_poly1305_setdesckey(struct poly1305_desc_ctx *dctx,
+-                                             const u8 *src, unsigned int srclen)
+-{
+-      if (!dctx->sset) {
+-              if (!dctx->rset && srclen >= POLY1305_BLOCK_SIZE) {
+-                      poly1305_integer_setkey(dctx->r, src);
+-                      src += POLY1305_BLOCK_SIZE;
+-                      srclen -= POLY1305_BLOCK_SIZE;
+-                      dctx->rset = 1;
+-              }
+-              if (srclen >= POLY1305_BLOCK_SIZE) {
+-                      dctx->s[0] = get_unaligned_le32(src +  0);
+-                      dctx->s[1] = get_unaligned_le32(src +  4);
+-                      dctx->s[2] = get_unaligned_le32(src +  8);
+-                      dctx->s[3] = get_unaligned_le32(src + 12);
+-                      src += POLY1305_BLOCK_SIZE;
+-                      srclen -= POLY1305_BLOCK_SIZE;
+-                      dctx->sset = true;
+-              }
++      cy = state->h[0] >> 26; state->h[0] &= 0x3ffffff; state->h[1] += cy;
++      cy = state->h[1] >> 26; state->h[1] &= 0x3ffffff; state->h[2] += cy;
++      cy = state->h[2] >> 26; state->h[2] &= 0x3ffffff; state->h[3] += cy;
++      cy = state->h[3] >> 26; state->h[3] &= 0x3ffffff; state->h[4] += cy;
++      state->hs[0] = ((u64)state->h[2] << 52) | ((u64)state->h[1] << 26) | state->h[0];
++      state->hs[1] = ((u64)state->h[4] << 40) | ((u64)state->h[3] << 14) | (state->h[2] >> 12);
++      state->hs[2] = state->h[4] >> 24;
++#define ULT(a, b) ((a ^ ((a ^ b) | ((a - b) ^ b))) >> (sizeof(a) * 8 - 1))
++      cy = (state->hs[2] >> 2) + (state->hs[2] & ~3ULL);
++      state->hs[2] &= 3;
++      state->hs[0] += cy;
++      state->hs[1] += (cy = ULT(state->hs[0], cy));
++      state->hs[2] += ULT(state->hs[1], cy);
++#undef ULT
++      state->is_base2_26 = 0;
++}
++
++static void poly1305_simd_init(void *ctx, const u8 key[POLY1305_KEY_SIZE])
++{
++      poly1305_init_x86_64(ctx, key);
++}
++
++static void poly1305_simd_blocks(void *ctx, const u8 *inp, size_t len,
++                               const u32 padbit)
++{
++      struct poly1305_arch_internal *state = ctx;
++
++      /* SIMD disables preemption, so relax after processing each page. */
++      BUILD_BUG_ON(PAGE_SIZE < POLY1305_BLOCK_SIZE ||
++                   PAGE_SIZE % POLY1305_BLOCK_SIZE);
++
++      if (!IS_ENABLED(CONFIG_AS_AVX) || !static_branch_likely(&poly1305_use_avx) ||
++          (len < (POLY1305_BLOCK_SIZE * 18) && !state->is_base2_26) ||
++          !crypto_simd_usable()) {
++              convert_to_base2_64(ctx);
++              poly1305_blocks_x86_64(ctx, inp, len, padbit);
++              return;
+       }
+-      return srclen;
+-}
+ 
+-static unsigned int poly1305_scalar_blocks(struct poly1305_desc_ctx *dctx,
+-                                         const u8 *src, unsigned int srclen)
+-{
+-      unsigned int datalen;
++      for (;;) {
++              const size_t bytes = min_t(size_t, len, PAGE_SIZE);
+ 
+-      if (unlikely(!dctx->sset)) {
+-              datalen = crypto_poly1305_setdesckey(dctx, src, srclen);
+-              src += srclen - datalen;
+-              srclen = datalen;
+-      }
+-      if (srclen >= POLY1305_BLOCK_SIZE) {
+-              poly1305_integer_blocks(&dctx->h, dctx->opaque_r, src,
+-                                      srclen / POLY1305_BLOCK_SIZE, 1);
+-              srclen %= POLY1305_BLOCK_SIZE;
++              kernel_fpu_begin();
++              if (IS_ENABLED(CONFIG_AS_AVX512) && static_branch_likely(&poly1305_use_avx512))
++                      poly1305_blocks_avx512(ctx, inp, bytes, padbit);
++              else if (IS_ENABLED(CONFIG_AS_AVX2) && static_branch_likely(&poly1305_use_avx2))
++                      poly1305_blocks_avx2(ctx, inp, bytes, padbit);
++              else
++                      poly1305_blocks_avx(ctx, inp, bytes, padbit);
++              kernel_fpu_end();
++              len -= bytes;
++              if (!len)
++                      break;
++              inp += bytes;
+       }
+-      return srclen;
+ }
+ 
+-static unsigned int poly1305_simd_blocks(struct poly1305_desc_ctx *dctx,
+-                                       const u8 *src, unsigned int srclen)
+-{
+-      unsigned int blocks, datalen;
++static void poly1305_simd_emit(void *ctx, u8 mac[POLY1305_DIGEST_SIZE],
++                             const u32 nonce[4])
++{
++      struct poly1305_arch_internal *state = ctx;
++
++      if (!IS_ENABLED(CONFIG_AS_AVX) || !static_branch_likely(&poly1305_use_avx) ||
++          !state->is_base2_26 || !crypto_simd_usable()) {
++              convert_to_base2_64(ctx);
++              poly1305_emit_x86_64(ctx, mac, nonce);
++      } else
++              poly1305_emit_avx(ctx, mac, nonce);
++}
++
++void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 *key)
++{
++      poly1305_simd_init(&dctx->h, key);
++      dctx->s[0] = get_unaligned_le32(&key[16]);
++      dctx->s[1] = get_unaligned_le32(&key[20]);
++      dctx->s[2] = get_unaligned_le32(&key[24]);
++      dctx->s[3] = get_unaligned_le32(&key[28]);
++      dctx->buflen = 0;
++      dctx->sset = true;
++}
++EXPORT_SYMBOL(poly1305_init_arch);
+ 
++static unsigned int crypto_poly1305_setdctxkey(struct poly1305_desc_ctx *dctx,
++                                             const u8 *inp, unsigned int len)
++{
++      unsigned int acc = 0;
+       if (unlikely(!dctx->sset)) {
+-              datalen = crypto_poly1305_setdesckey(dctx, src, srclen);
+-              src += srclen - datalen;
+-              srclen = datalen;
+-      }
+-
+-      if (IS_ENABLED(CONFIG_AS_AVX2) &&
+-          static_branch_likely(&poly1305_use_avx2) &&
+-          srclen >= POLY1305_BLOCK_SIZE * 4) {
+-              if (unlikely(dctx->rset < 4)) {
+-                      if (dctx->rset < 2) {
+-                              dctx->r[1] = dctx->r[0];
+-                              poly1305_simd_mult(dctx->r[1].r, dctx->r[0].r);
+-                      }
+-                      dctx->r[2] = dctx->r[1];
+-                      poly1305_simd_mult(dctx->r[2].r, dctx->r[0].r);
+-                      dctx->r[3] = dctx->r[2];
+-                      poly1305_simd_mult(dctx->r[3].r, dctx->r[0].r);
+-                      dctx->rset = 4;
++              if (!dctx->rset && len >= POLY1305_BLOCK_SIZE) {
++                      poly1305_simd_init(&dctx->h, inp);
++                      inp += POLY1305_BLOCK_SIZE;
++                      len -= POLY1305_BLOCK_SIZE;
++                      acc += POLY1305_BLOCK_SIZE;
++                      dctx->rset = 1;
+               }
+-              blocks = srclen / (POLY1305_BLOCK_SIZE * 4);
+-              poly1305_4block_avx2(dctx->h.h, src, dctx->r[0].r, blocks,
+-                                   dctx->r[1].r);
+-              src += POLY1305_BLOCK_SIZE * 4 * blocks;
+-              srclen -= POLY1305_BLOCK_SIZE * 4 * blocks;
+-      }
+-
+-      if (likely(srclen >= POLY1305_BLOCK_SIZE * 2)) {
+-              if (unlikely(dctx->rset < 2)) {
+-                      dctx->r[1] = dctx->r[0];
+-                      poly1305_simd_mult(dctx->r[1].r, dctx->r[0].r);
+-                      dctx->rset = 2;
++              if (len >= POLY1305_BLOCK_SIZE) {
++                      dctx->s[0] = get_unaligned_le32(&inp[0]);
++                      dctx->s[1] = get_unaligned_le32(&inp[4]);
++                      dctx->s[2] = get_unaligned_le32(&inp[8]);
++                      dctx->s[3] = get_unaligned_le32(&inp[12]);
++                      inp += POLY1305_BLOCK_SIZE;
++                      len -= POLY1305_BLOCK_SIZE;
++                      acc += POLY1305_BLOCK_SIZE;
++                      dctx->sset = true;
+               }
+-              blocks = srclen / (POLY1305_BLOCK_SIZE * 2);
+-              poly1305_2block_sse2(dctx->h.h, src, dctx->r[0].r,
+-                                   blocks, dctx->r[1].r);
+-              src += POLY1305_BLOCK_SIZE * 2 * blocks;
+-              srclen -= POLY1305_BLOCK_SIZE * 2 * blocks;
+-      }
+-      if (srclen >= POLY1305_BLOCK_SIZE) {
+-              poly1305_block_sse2(dctx->h.h, src, dctx->r[0].r, 1);
+-              srclen -= POLY1305_BLOCK_SIZE;
+       }
+-      return srclen;
++      return acc;
+ }
+ 
+ void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src,
+                         unsigned int srclen)
+ {
+-      unsigned int bytes;
++      unsigned int bytes, used;
+ 
+       if (unlikely(dctx->buflen)) {
+               bytes = min(srclen, POLY1305_BLOCK_SIZE - dctx->buflen);
+@@ -295,31 +184,19 @@ void poly1305_update_arch(struct poly130
+               dctx->buflen += bytes;
+ 
+               if (dctx->buflen == POLY1305_BLOCK_SIZE) {
+-                      if (static_branch_likely(&poly1305_use_simd) &&
+-                          likely(crypto_simd_usable())) {
+-                              kernel_fpu_begin();
+-                              poly1305_simd_blocks(dctx, dctx->buf,
+-                                                   POLY1305_BLOCK_SIZE);
+-                              kernel_fpu_end();
+-                      } else {
+-                              poly1305_scalar_blocks(dctx, dctx->buf,
+-                                                     POLY1305_BLOCK_SIZE);
+-                      }
++                      if (likely(!crypto_poly1305_setdctxkey(dctx, dctx->buf, POLY1305_BLOCK_SIZE)))
++                              poly1305_simd_blocks(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 1);
+                       dctx->buflen = 0;
+               }
+       }
+ 
+       if (likely(srclen >= POLY1305_BLOCK_SIZE)) {
+-              if (static_branch_likely(&poly1305_use_simd) &&
+-                  likely(crypto_simd_usable())) {
+-                      kernel_fpu_begin();
+-                      bytes = poly1305_simd_blocks(dctx, src, srclen);
+-                      kernel_fpu_end();
+-              } else {
+-                      bytes = poly1305_scalar_blocks(dctx, src, srclen);
+-              }
+-              src += srclen - bytes;
+-              srclen = bytes;
++              bytes = round_down(srclen, POLY1305_BLOCK_SIZE);
++              srclen -= bytes;
++              used = crypto_poly1305_setdctxkey(dctx, src, bytes);
++              if (likely(bytes - used))
++                      poly1305_simd_blocks(&dctx->h, src + used, bytes - used, 1);
++              src += bytes;
+       }
+ 
+       if (unlikely(srclen)) {
+@@ -329,31 +206,17 @@ void poly1305_update_arch(struct poly130
+ }
+ EXPORT_SYMBOL(poly1305_update_arch);
+ 
+-void poly1305_final_arch(struct poly1305_desc_ctx *desc, u8 *dst)
++void poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst)
+ {
+-      __le32 digest[4];
+-      u64 f = 0;
+-
+-      if (unlikely(desc->buflen)) {
+-              desc->buf[desc->buflen++] = 1;
+-              memset(desc->buf + desc->buflen, 0,
+-                     POLY1305_BLOCK_SIZE - desc->buflen);
+-              poly1305_integer_blocks(&desc->h, desc->opaque_r, desc->buf, 1, 0);
++      if (unlikely(dctx->buflen)) {
++              dctx->buf[dctx->buflen++] = 1;
++              memset(dctx->buf + dctx->buflen, 0,
++                     POLY1305_BLOCK_SIZE - dctx->buflen);
++              poly1305_simd_blocks(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 0);
+       }
+ 
+-      poly1305_integer_emit(&desc->h, digest);
+-
+-      /* mac = (h + s) % (2^128) */
+-      f = (f >> 32) + le32_to_cpu(digest[0]) + desc->s[0];
+-      put_unaligned_le32(f, dst + 0);
+-      f = (f >> 32) + le32_to_cpu(digest[1]) + desc->s[1];
+-      put_unaligned_le32(f, dst + 4);
+-      f = (f >> 32) + le32_to_cpu(digest[2]) + desc->s[2];
+-      put_unaligned_le32(f, dst + 8);
+-      f = (f >> 32) + le32_to_cpu(digest[3]) + desc->s[3];
+-      put_unaligned_le32(f, dst + 12);
+-
+-      *desc = (struct poly1305_desc_ctx){};
++      poly1305_simd_emit(&dctx->h, dst, dctx->s);
++      *dctx = (struct poly1305_desc_ctx){};
+ }
+ EXPORT_SYMBOL(poly1305_final_arch);
+ 
+@@ -361,38 +224,34 @@ static int crypto_poly1305_init(struct s
+ {
+       struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
+ 
+-      poly1305_core_init(&dctx->h);
+-      dctx->buflen = 0;
+-      dctx->rset = 0;
+-      dctx->sset = false;
+-
++      *dctx = (struct poly1305_desc_ctx){};
+       return 0;
+ }
+ 
+-static int crypto_poly1305_final(struct shash_desc *desc, u8 *dst)
++static int crypto_poly1305_update(struct shash_desc *desc,
++                                const u8 *src, unsigned int srclen)
+ {
+       struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
+ 
+-      if (unlikely(!dctx->sset))
+-              return -ENOKEY;
+-
+-      poly1305_final_arch(dctx, dst);
++      poly1305_update_arch(dctx, src, srclen);
+       return 0;
+ }
+ 
+-static int poly1305_simd_update(struct shash_desc *desc,
+-                              const u8 *src, unsigned int srclen)
++static int crypto_poly1305_final(struct shash_desc *desc, u8 *dst)
+ {
+       struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
+ 
+-      poly1305_update_arch(dctx, src, srclen);
++      if (unlikely(!dctx->sset))
++              return -ENOKEY;
++
++      poly1305_final_arch(dctx, dst);
+       return 0;
+ }
+ 
+ static struct shash_alg alg = {
+       .digestsize     = POLY1305_DIGEST_SIZE,
+       .init           = crypto_poly1305_init,
+-      .update         = poly1305_simd_update,
++      .update         = crypto_poly1305_update,
+       .final          = crypto_poly1305_final,
+       .descsize       = sizeof(struct poly1305_desc_ctx),
+       .base           = {
+@@ -406,17 +265,19 @@ static struct shash_alg alg = {
+ 
+ static int __init poly1305_simd_mod_init(void)
+ {
+-      if (!boot_cpu_has(X86_FEATURE_XMM2))
+-              return 0;
+-
+-      static_branch_enable(&poly1305_use_simd);
+-
+-      if (IS_ENABLED(CONFIG_AS_AVX2) &&
+-          boot_cpu_has(X86_FEATURE_AVX) &&
++      if (IS_ENABLED(CONFIG_AS_AVX) && boot_cpu_has(X86_FEATURE_AVX) &&
++          cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL))
++              static_branch_enable(&poly1305_use_avx);
++      if (IS_ENABLED(CONFIG_AS_AVX2) && boot_cpu_has(X86_FEATURE_AVX) &&
+           boot_cpu_has(X86_FEATURE_AVX2) &&
+           cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL))
+               static_branch_enable(&poly1305_use_avx2);
+-
++      if (IS_ENABLED(CONFIG_AS_AVX512) && boot_cpu_has(X86_FEATURE_AVX) &&
++          boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_AVX512F) &&
++          cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | XFEATURE_MASK_AVX512, NULL) &&
++          /* Skylake downclocks unacceptably much when using zmm, but later generations are fast. */
++          boot_cpu_data.x86_model != INTEL_FAM6_SKYLAKE_X)
++              static_branch_enable(&poly1305_use_avx512);
+       return IS_REACHABLE(CONFIG_CRYPTO_HASH) ? crypto_register_shash(&alg) : 0;
+ }
+ 
+@@ -430,7 +291,7 @@ module_init(poly1305_simd_mod_init);
+ module_exit(poly1305_simd_mod_exit);
+ 
+ MODULE_LICENSE("GPL");
+-MODULE_AUTHOR("Martin Willi <martin@strongswan.org>");
++MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>");
+ MODULE_DESCRIPTION("Poly1305 authenticator");
+ MODULE_ALIAS_CRYPTO("poly1305");
+ MODULE_ALIAS_CRYPTO("poly1305-simd");
+--- a/lib/crypto/Kconfig
++++ b/lib/crypto/Kconfig
+@@ -90,7 +90,7 @@ config CRYPTO_LIB_DES
+ config CRYPTO_LIB_POLY1305_RSIZE
+       int
+       default 2 if MIPS
+-      default 4 if X86_64
++      default 11 if X86_64
+       default 9 if ARM || ARM64
+       default 1
+ 
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0044-crypto-arm-arm64-mips-poly1305-remove-redundant-non-.patch b/target/linux/generic/backport-5.4/080-wireguard-0044-crypto-arm-arm64-mips-poly1305-remove-redundant-non-.patch

new file mode 100644 (file)

index 0000000..e1c719f
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0044-crypto-arm-arm64-mips-poly1305-remove-redundant-non-.patch
@@ -0,0 +1,171 @@
+From 3b1cffd5e47b394b8c0a92583e26acf599022364 Mon Sep 17 00:00:00 2001
+From: "Jason A. Donenfeld" <Jason@zx2c4.com>
+Date: Sun, 5 Jan 2020 22:40:49 -0500
+Subject: [PATCH 044/124] crypto: {arm,arm64,mips}/poly1305 - remove redundant
+ non-reduction from emit
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+commit 31899908a0d248b030b4464425b86c717e0007d4 upstream.
+
+This appears to be some kind of copy and paste error, and is actually
+dead code.
+
+Pre: f = 0 ⇒ (f >> 32) = 0
+    f = (f >> 32) + le32_to_cpu(digest[0]);
+Post: 0 ≤ f < 2³²
+    put_unaligned_le32(f, dst);
+
+Pre: 0 ≤ f < 2³² ⇒ (f >> 32) = 0
+    f = (f >> 32) + le32_to_cpu(digest[1]);
+Post: 0 ≤ f < 2³²
+    put_unaligned_le32(f, dst + 4);
+
+Pre: 0 ≤ f < 2³² ⇒ (f >> 32) = 0
+    f = (f >> 32) + le32_to_cpu(digest[2]);
+Post: 0 ≤ f < 2³²
+    put_unaligned_le32(f, dst + 8);
+
+Pre: 0 ≤ f < 2³² ⇒ (f >> 32) = 0
+    f = (f >> 32) + le32_to_cpu(digest[3]);
+Post: 0 ≤ f < 2³²
+    put_unaligned_le32(f, dst + 12);
+
+Therefore this sequence is redundant. And Andy's code appears to handle
+misalignment acceptably.
+
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Tested-by: Ard Biesheuvel <ardb@kernel.org>
+Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
+Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ arch/arm/crypto/poly1305-glue.c   | 18 ++----------------
+ arch/arm64/crypto/poly1305-glue.c | 18 ++----------------
+ arch/mips/crypto/poly1305-glue.c  | 18 ++----------------
+ 3 files changed, 6 insertions(+), 48 deletions(-)
+
+--- a/arch/arm/crypto/poly1305-glue.c
++++ b/arch/arm/crypto/poly1305-glue.c
+@@ -20,7 +20,7 @@
+ 
+ void poly1305_init_arm(void *state, const u8 *key);
+ void poly1305_blocks_arm(void *state, const u8 *src, u32 len, u32 hibit);
+-void poly1305_emit_arm(void *state, __le32 *digest, const u32 *nonce);
++void poly1305_emit_arm(void *state, u8 *digest, const u32 *nonce);
+ 
+ void __weak poly1305_blocks_neon(void *state, const u8 *src, u32 len, u32 hibit)
+ {
+@@ -179,9 +179,6 @@ EXPORT_SYMBOL(poly1305_update_arch);
+ 
+ void poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst)
+ {
+-      __le32 digest[4];
+-      u64 f = 0;
+-
+       if (unlikely(dctx->buflen)) {
+               dctx->buf[dctx->buflen++] = 1;
+               memset(dctx->buf + dctx->buflen, 0,
+@@ -189,18 +186,7 @@ void poly1305_final_arch(struct poly1305
+               poly1305_blocks_arm(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 0);
+       }
+ 
+-      poly1305_emit_arm(&dctx->h, digest, dctx->s);
+-
+-      /* mac = (h + s) % (2^128) */
+-      f = (f >> 32) + le32_to_cpu(digest[0]);
+-      put_unaligned_le32(f, dst);
+-      f = (f >> 32) + le32_to_cpu(digest[1]);
+-      put_unaligned_le32(f, dst + 4);
+-      f = (f >> 32) + le32_to_cpu(digest[2]);
+-      put_unaligned_le32(f, dst + 8);
+-      f = (f >> 32) + le32_to_cpu(digest[3]);
+-      put_unaligned_le32(f, dst + 12);
+-
++      poly1305_emit_arm(&dctx->h, dst, dctx->s);
+       *dctx = (struct poly1305_desc_ctx){};
+ }
+ EXPORT_SYMBOL(poly1305_final_arch);
+--- a/arch/arm64/crypto/poly1305-glue.c
++++ b/arch/arm64/crypto/poly1305-glue.c
+@@ -21,7 +21,7 @@
+ asmlinkage void poly1305_init_arm64(void *state, const u8 *key);
+ asmlinkage void poly1305_blocks(void *state, const u8 *src, u32 len, u32 hibit);
+ asmlinkage void poly1305_blocks_neon(void *state, const u8 *src, u32 len, u32 hibit);
+-asmlinkage void poly1305_emit(void *state, __le32 *digest, const u32 *nonce);
++asmlinkage void poly1305_emit(void *state, u8 *digest, const u32 *nonce);
+ 
+ static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
+ 
+@@ -162,9 +162,6 @@ EXPORT_SYMBOL(poly1305_update_arch);
+ 
+ void poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst)
+ {
+-      __le32 digest[4];
+-      u64 f = 0;
+-
+       if (unlikely(dctx->buflen)) {
+               dctx->buf[dctx->buflen++] = 1;
+               memset(dctx->buf + dctx->buflen, 0,
+@@ -172,18 +169,7 @@ void poly1305_final_arch(struct poly1305
+               poly1305_blocks(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 0);
+       }
+ 
+-      poly1305_emit(&dctx->h, digest, dctx->s);
+-
+-      /* mac = (h + s) % (2^128) */
+-      f = (f >> 32) + le32_to_cpu(digest[0]);
+-      put_unaligned_le32(f, dst);
+-      f = (f >> 32) + le32_to_cpu(digest[1]);
+-      put_unaligned_le32(f, dst + 4);
+-      f = (f >> 32) + le32_to_cpu(digest[2]);
+-      put_unaligned_le32(f, dst + 8);
+-      f = (f >> 32) + le32_to_cpu(digest[3]);
+-      put_unaligned_le32(f, dst + 12);
+-
++      poly1305_emit(&dctx->h, dst, dctx->s);
+       *dctx = (struct poly1305_desc_ctx){};
+ }
+ EXPORT_SYMBOL(poly1305_final_arch);
+--- a/arch/mips/crypto/poly1305-glue.c
++++ b/arch/mips/crypto/poly1305-glue.c
+@@ -15,7 +15,7 @@
+ 
+ asmlinkage void poly1305_init_mips(void *state, const u8 *key);
+ asmlinkage void poly1305_blocks_mips(void *state, const u8 *src, u32 len, u32 hibit);
+-asmlinkage void poly1305_emit_mips(void *state, __le32 *digest, const u32 *nonce);
++asmlinkage void poly1305_emit_mips(void *state, u8 *digest, const u32 *nonce);
+ 
+ void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 *key)
+ {
+@@ -134,9 +134,6 @@ EXPORT_SYMBOL(poly1305_update_arch);
+ 
+ void poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst)
+ {
+-      __le32 digest[4];
+-      u64 f = 0;
+-
+       if (unlikely(dctx->buflen)) {
+               dctx->buf[dctx->buflen++] = 1;
+               memset(dctx->buf + dctx->buflen, 0,
+@@ -144,18 +141,7 @@ void poly1305_final_arch(struct poly1305
+               poly1305_blocks_mips(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 0);
+       }
+ 
+-      poly1305_emit_mips(&dctx->h, digest, dctx->s);
+-
+-      /* mac = (h + s) % (2^128) */
+-      f = (f >> 32) + le32_to_cpu(digest[0]);
+-      put_unaligned_le32(f, dst);
+-      f = (f >> 32) + le32_to_cpu(digest[1]);
+-      put_unaligned_le32(f, dst + 4);
+-      f = (f >> 32) + le32_to_cpu(digest[2]);
+-      put_unaligned_le32(f, dst + 8);
+-      f = (f >> 32) + le32_to_cpu(digest[3]);
+-      put_unaligned_le32(f, dst + 12);
+-
++      poly1305_emit_mips(&dctx->h, dst, dctx->s);
+       *dctx = (struct poly1305_desc_ctx){};
+ }
+ EXPORT_SYMBOL(poly1305_final_arch);
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0045-crypto-curve25519-Fix-selftest-build-error.patch b/target/linux/generic/backport-5.4/080-wireguard-0045-crypto-curve25519-Fix-selftest-build-error.patch

new file mode 100644 (file)

index 0000000..6247d99
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0045-crypto-curve25519-Fix-selftest-build-error.patch
@@ -0,0 +1,102 @@
+From a7e800af9c95490f8b42934eccc88d02d0af6d2a Mon Sep 17 00:00:00 2001
+From: Herbert Xu <herbert@gondor.apana.org.au>
+Date: Wed, 8 Jan 2020 12:37:35 +0800
+Subject: [PATCH 045/124] crypto: curve25519 - Fix selftest build error
+
+commit a8bdf2c42ee4d1ee42af1f3601f85de94e70a421 upstream.
+
+If CRYPTO_CURVE25519 is y, CRYPTO_LIB_CURVE25519_GENERIC will be
+y, but CRYPTO_LIB_CURVE25519 may be set to m, this causes build
+errors:
+
+lib/crypto/curve25519-selftest.o: In function `curve25519':
+curve25519-selftest.c:(.text.unlikely+0xc): undefined reference to `curve25519_arch'
+lib/crypto/curve25519-selftest.o: In function `curve25519_selftest':
+curve25519-selftest.c:(.init.text+0x17e): undefined reference to `curve25519_base_arch'
+
+This is because the curve25519 self-test code is being controlled
+by the GENERIC option rather than the overall CURVE25519 option,
+as is the case with blake2s.  To recap, the GENERIC and ARCH options
+for CURVE25519 are internal only and selected by users such as
+the Crypto API, or the externally visible CURVE25519 option which
+in turn is selected by wireguard.  The self-test is specific to the
+external CURVE25519 option and should not be enabled by the
+Crypto API.
+
+This patch fixes this by splitting the GENERIC module from the
+CURVE25519 module with the latter now containing just the self-test.
+
+Reported-by: Hulk Robot <hulkci@huawei.com>
+Fixes: aa127963f1ca ("crypto: lib/curve25519 - re-add selftests")
+Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
+Reviewed-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ lib/crypto/Makefile             |  9 ++++++---
+ lib/crypto/curve25519-generic.c | 24 ++++++++++++++++++++++++
+ lib/crypto/curve25519.c         |  7 -------
+ 3 files changed, 30 insertions(+), 10 deletions(-)
+ create mode 100644 lib/crypto/curve25519-generic.c
+
+--- a/lib/crypto/Makefile
++++ b/lib/crypto/Makefile
+@@ -19,9 +19,12 @@ libblake2s-y                                        += blake2s.o
+ obj-$(CONFIG_CRYPTO_LIB_CHACHA20POLY1305)     += libchacha20poly1305.o
+ libchacha20poly1305-y                         += chacha20poly1305.o
+ 
+-obj-$(CONFIG_CRYPTO_LIB_CURVE25519_GENERIC)   += libcurve25519.o
+-libcurve25519-y                                       := curve25519-fiat32.o
+-libcurve25519-$(CONFIG_ARCH_SUPPORTS_INT128)  := curve25519-hacl64.o
++obj-$(CONFIG_CRYPTO_LIB_CURVE25519_GENERIC)   += libcurve25519-generic.o
++libcurve25519-generic-y                               := curve25519-fiat32.o
++libcurve25519-generic-$(CONFIG_ARCH_SUPPORTS_INT128)  := curve25519-hacl64.o
++libcurve25519-generic-y                               += curve25519-generic.o
++
++obj-$(CONFIG_CRYPTO_LIB_CURVE25519)           += libcurve25519.o
+ libcurve25519-y                                       += curve25519.o
+ 
+ obj-$(CONFIG_CRYPTO_LIB_DES)                  += libdes.o
+--- /dev/null
++++ b/lib/crypto/curve25519-generic.c
+@@ -0,0 +1,24 @@
++// SPDX-License-Identifier: GPL-2.0 OR MIT
++/*
++ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
++ *
++ * This is an implementation of the Curve25519 ECDH algorithm, using either
++ * a 32-bit implementation or a 64-bit implementation with 128-bit integers,
++ * depending on what is supported by the target compiler.
++ *
++ * Information: https://cr.yp.to/ecdh.html
++ */
++
++#include <crypto/curve25519.h>
++#include <linux/module.h>
++
++const u8 curve25519_null_point[CURVE25519_KEY_SIZE] __aligned(32) = { 0 };
++const u8 curve25519_base_point[CURVE25519_KEY_SIZE] __aligned(32) = { 9 };
++
++EXPORT_SYMBOL(curve25519_null_point);
++EXPORT_SYMBOL(curve25519_base_point);
++EXPORT_SYMBOL(curve25519_generic);
++
++MODULE_LICENSE("GPL v2");
++MODULE_DESCRIPTION("Curve25519 scalar multiplication");
++MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>");
+--- a/lib/crypto/curve25519.c
++++ b/lib/crypto/curve25519.c
+@@ -15,13 +15,6 @@
+ 
+ bool curve25519_selftest(void);
+ 
+-const u8 curve25519_null_point[CURVE25519_KEY_SIZE] __aligned(32) = { 0 };
+-const u8 curve25519_base_point[CURVE25519_KEY_SIZE] __aligned(32) = { 9 };
+-
+-EXPORT_SYMBOL(curve25519_null_point);
+-EXPORT_SYMBOL(curve25519_base_point);
+-EXPORT_SYMBOL(curve25519_generic);
+-
+ static int __init mod_init(void)
+ {
+       if (!IS_ENABLED(CONFIG_CRYPTO_MANAGER_DISABLE_TESTS) &&
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0046-crypto-x86-poly1305-fix-.gitignore-typo.patch b/target/linux/generic/backport-5.4/080-wireguard-0046-crypto-x86-poly1305-fix-.gitignore-typo.patch

new file mode 100644 (file)

index 0000000..f5a7c21
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0046-crypto-x86-poly1305-fix-.gitignore-typo.patch
@@ -0,0 +1,23 @@
+From cd86f0664c2e42b6406cb56ac8d5182a65764e93 Mon Sep 17 00:00:00 2001
+From: "Jason A. Donenfeld" <Jason@zx2c4.com>
+Date: Thu, 16 Jan 2020 18:23:55 +0100
+Subject: [PATCH 046/124] crypto: x86/poly1305 - fix .gitignore typo
+
+commit 1f6868995326cc82102049e349d8dbd116bdb656 upstream.
+
+Amidst the kbuild robot induced changes, the .gitignore file for the
+generated file wasn't updated with the non-clashing filename. This
+commit adjusts that.
+
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ arch/x86/crypto/.gitignore | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/arch/x86/crypto/.gitignore
++++ b/arch/x86/crypto/.gitignore
+@@ -1 +1 @@
+-poly1305-x86_64.S
++poly1305-x86_64-cryptogams.S
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0047-crypto-chacha20poly1305-add-back-missing-test-vector.patch b/target/linux/generic/backport-5.4/080-wireguard-0047-crypto-chacha20poly1305-add-back-missing-test-vector.patch

new file mode 100644 (file)

index 0000000..9e10334
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0047-crypto-chacha20poly1305-add-back-missing-test-vector.patch
@@ -0,0 +1,1858 @@
+From 956c2d9a4e69f7458c9b7cb81db98ec1be75ea49 Mon Sep 17 00:00:00 2001
+From: "Jason A. Donenfeld" <Jason@zx2c4.com>
+Date: Thu, 16 Jan 2020 21:26:34 +0100
+Subject: [PATCH 047/124] crypto: chacha20poly1305 - add back missing test
+ vectors and test chunking
+
+commit 72c7943792c9e7788ddd182337bcf8f650cf56f5 upstream.
+
+When this was originally ported, the 12-byte nonce vectors were left out
+to keep things simple. I agree that we don't need nor want a library
+interface for 12-byte nonces. But these test vectors were specially
+crafted to look at issues in the underlying primitives and related
+interactions.  Therefore, we actually want to keep around all of the
+test vectors, and simply have a helper function to test them with.
+
+Secondly, the sglist-based chunking code in the library interface is
+rather complicated, so this adds a developer-only test for ensuring that
+all the bookkeeping is correct, across a wide array of possibilities.
+
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ lib/crypto/chacha20poly1305-selftest.c | 1712 +++++++++++++++++++++++-
+ 1 file changed, 1698 insertions(+), 14 deletions(-)
+
+--- a/lib/crypto/chacha20poly1305-selftest.c
++++ b/lib/crypto/chacha20poly1305-selftest.c
+@@ -4,6 +4,7 @@
+  */
+ 
+ #include <crypto/chacha20poly1305.h>
++#include <crypto/chacha.h>
+ #include <crypto/poly1305.h>
+ 
+ #include <asm/unaligned.h>
+@@ -1926,6 +1927,1104 @@ static const u8 enc_key012[] __initconst
+       0x65, 0x91, 0x6e, 0x2a, 0x79, 0x22, 0xda, 0x64
+ };
+ 
++/* wycheproof - rfc7539 */
++static const u8 enc_input013[] __initconst = {
++      0x4c, 0x61, 0x64, 0x69, 0x65, 0x73, 0x20, 0x61,
++      0x6e, 0x64, 0x20, 0x47, 0x65, 0x6e, 0x74, 0x6c,
++      0x65, 0x6d, 0x65, 0x6e, 0x20, 0x6f, 0x66, 0x20,
++      0x74, 0x68, 0x65, 0x20, 0x63, 0x6c, 0x61, 0x73,
++      0x73, 0x20, 0x6f, 0x66, 0x20, 0x27, 0x39, 0x39,
++      0x3a, 0x20, 0x49, 0x66, 0x20, 0x49, 0x20, 0x63,
++      0x6f, 0x75, 0x6c, 0x64, 0x20, 0x6f, 0x66, 0x66,
++      0x65, 0x72, 0x20, 0x79, 0x6f, 0x75, 0x20, 0x6f,
++      0x6e, 0x6c, 0x79, 0x20, 0x6f, 0x6e, 0x65, 0x20,
++      0x74, 0x69, 0x70, 0x20, 0x66, 0x6f, 0x72, 0x20,
++      0x74, 0x68, 0x65, 0x20, 0x66, 0x75, 0x74, 0x75,
++      0x72, 0x65, 0x2c, 0x20, 0x73, 0x75, 0x6e, 0x73,
++      0x63, 0x72, 0x65, 0x65, 0x6e, 0x20, 0x77, 0x6f,
++      0x75, 0x6c, 0x64, 0x20, 0x62, 0x65, 0x20, 0x69,
++      0x74, 0x2e
++};
++static const u8 enc_output013[] __initconst = {
++      0xd3, 0x1a, 0x8d, 0x34, 0x64, 0x8e, 0x60, 0xdb,
++      0x7b, 0x86, 0xaf, 0xbc, 0x53, 0xef, 0x7e, 0xc2,
++      0xa4, 0xad, 0xed, 0x51, 0x29, 0x6e, 0x08, 0xfe,
++      0xa9, 0xe2, 0xb5, 0xa7, 0x36, 0xee, 0x62, 0xd6,
++      0x3d, 0xbe, 0xa4, 0x5e, 0x8c, 0xa9, 0x67, 0x12,
++      0x82, 0xfa, 0xfb, 0x69, 0xda, 0x92, 0x72, 0x8b,
++      0x1a, 0x71, 0xde, 0x0a, 0x9e, 0x06, 0x0b, 0x29,
++      0x05, 0xd6, 0xa5, 0xb6, 0x7e, 0xcd, 0x3b, 0x36,
++      0x92, 0xdd, 0xbd, 0x7f, 0x2d, 0x77, 0x8b, 0x8c,
++      0x98, 0x03, 0xae, 0xe3, 0x28, 0x09, 0x1b, 0x58,
++      0xfa, 0xb3, 0x24, 0xe4, 0xfa, 0xd6, 0x75, 0x94,
++      0x55, 0x85, 0x80, 0x8b, 0x48, 0x31, 0xd7, 0xbc,
++      0x3f, 0xf4, 0xde, 0xf0, 0x8e, 0x4b, 0x7a, 0x9d,
++      0xe5, 0x76, 0xd2, 0x65, 0x86, 0xce, 0xc6, 0x4b,
++      0x61, 0x16, 0x1a, 0xe1, 0x0b, 0x59, 0x4f, 0x09,
++      0xe2, 0x6a, 0x7e, 0x90, 0x2e, 0xcb, 0xd0, 0x60,
++      0x06, 0x91
++};
++static const u8 enc_assoc013[] __initconst = {
++      0x50, 0x51, 0x52, 0x53, 0xc0, 0xc1, 0xc2, 0xc3,
++      0xc4, 0xc5, 0xc6, 0xc7
++};
++static const u8 enc_nonce013[] __initconst = {
++      0x07, 0x00, 0x00, 0x00, 0x40, 0x41, 0x42, 0x43,
++      0x44, 0x45, 0x46, 0x47
++};
++static const u8 enc_key013[] __initconst = {
++      0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
++      0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
++      0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
++      0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f
++};
++
++/* wycheproof - misc */
++static const u8 enc_input014[] __initconst = { };
++static const u8 enc_output014[] __initconst = {
++      0x76, 0xac, 0xb3, 0x42, 0xcf, 0x31, 0x66, 0xa5,
++      0xb6, 0x3c, 0x0c, 0x0e, 0xa1, 0x38, 0x3c, 0x8d
++};
++static const u8 enc_assoc014[] __initconst = { };
++static const u8 enc_nonce014[] __initconst = {
++      0x4d, 0xa5, 0xbf, 0x8d, 0xfd, 0x58, 0x52, 0xc1,
++      0xea, 0x12, 0x37, 0x9d
++};
++static const u8 enc_key014[] __initconst = {
++      0x80, 0xba, 0x31, 0x92, 0xc8, 0x03, 0xce, 0x96,
++      0x5e, 0xa3, 0x71, 0xd5, 0xff, 0x07, 0x3c, 0xf0,
++      0xf4, 0x3b, 0x6a, 0x2a, 0xb5, 0x76, 0xb2, 0x08,
++      0x42, 0x6e, 0x11, 0x40, 0x9c, 0x09, 0xb9, 0xb0
++};
++
++/* wycheproof - misc */
++static const u8 enc_input015[] __initconst = { };
++static const u8 enc_output015[] __initconst = {
++      0x90, 0x6f, 0xa6, 0x28, 0x4b, 0x52, 0xf8, 0x7b,
++      0x73, 0x59, 0xcb, 0xaa, 0x75, 0x63, 0xc7, 0x09
++};
++static const u8 enc_assoc015[] __initconst = {
++      0xbd, 0x50, 0x67, 0x64, 0xf2, 0xd2, 0xc4, 0x10
++};
++static const u8 enc_nonce015[] __initconst = {
++      0xa9, 0x2e, 0xf0, 0xac, 0x99, 0x1d, 0xd5, 0x16,
++      0xa3, 0xc6, 0xf6, 0x89
++};
++static const u8 enc_key015[] __initconst = {
++      0x7a, 0x4c, 0xd7, 0x59, 0x17, 0x2e, 0x02, 0xeb,
++      0x20, 0x4d, 0xb2, 0xc3, 0xf5, 0xc7, 0x46, 0x22,
++      0x7d, 0xf5, 0x84, 0xfc, 0x13, 0x45, 0x19, 0x63,
++      0x91, 0xdb, 0xb9, 0x57, 0x7a, 0x25, 0x07, 0x42
++};
++
++/* wycheproof - misc */
++static const u8 enc_input016[] __initconst = {
++      0x2a
++};
++static const u8 enc_output016[] __initconst = {
++      0x3a, 0xca, 0xc2, 0x7d, 0xec, 0x09, 0x68, 0x80,
++      0x1e, 0x9f, 0x6e, 0xde, 0xd6, 0x9d, 0x80, 0x75,
++      0x22
++};
++static const u8 enc_assoc016[] __initconst = { };
++static const u8 enc_nonce016[] __initconst = {
++      0x99, 0xe2, 0x3e, 0xc4, 0x89, 0x85, 0xbc, 0xcd,
++      0xee, 0xab, 0x60, 0xf1
++};
++static const u8 enc_key016[] __initconst = {
++      0xcc, 0x56, 0xb6, 0x80, 0x55, 0x2e, 0xb7, 0x50,
++      0x08, 0xf5, 0x48, 0x4b, 0x4c, 0xb8, 0x03, 0xfa,
++      0x50, 0x63, 0xeb, 0xd6, 0xea, 0xb9, 0x1f, 0x6a,
++      0xb6, 0xae, 0xf4, 0x91, 0x6a, 0x76, 0x62, 0x73
++};
++
++/* wycheproof - misc */
++static const u8 enc_input017[] __initconst = {
++      0x51
++};
++static const u8 enc_output017[] __initconst = {
++      0xc4, 0x16, 0x83, 0x10, 0xca, 0x45, 0xb1, 0xf7,
++      0xc6, 0x6c, 0xad, 0x4e, 0x99, 0xe4, 0x3f, 0x72,
++      0xb9
++};
++static const u8 enc_assoc017[] __initconst = {
++      0x91, 0xca, 0x6c, 0x59, 0x2c, 0xbc, 0xca, 0x53
++};
++static const u8 enc_nonce017[] __initconst = {
++      0xab, 0x0d, 0xca, 0x71, 0x6e, 0xe0, 0x51, 0xd2,
++      0x78, 0x2f, 0x44, 0x03
++};
++static const u8 enc_key017[] __initconst = {
++      0x46, 0xf0, 0x25, 0x49, 0x65, 0xf7, 0x69, 0xd5,
++      0x2b, 0xdb, 0x4a, 0x70, 0xb4, 0x43, 0x19, 0x9f,
++      0x8e, 0xf2, 0x07, 0x52, 0x0d, 0x12, 0x20, 0xc5,
++      0x5e, 0x4b, 0x70, 0xf0, 0xfd, 0xa6, 0x20, 0xee
++};
++
++/* wycheproof - misc */
++static const u8 enc_input018[] __initconst = {
++      0x5c, 0x60
++};
++static const u8 enc_output018[] __initconst = {
++      0x4d, 0x13, 0x91, 0xe8, 0xb6, 0x1e, 0xfb, 0x39,
++      0xc1, 0x22, 0x19, 0x54, 0x53, 0x07, 0x7b, 0x22,
++      0xe5, 0xe2
++};
++static const u8 enc_assoc018[] __initconst = { };
++static const u8 enc_nonce018[] __initconst = {
++      0x46, 0x1a, 0xf1, 0x22, 0xe9, 0xf2, 0xe0, 0x34,
++      0x7e, 0x03, 0xf2, 0xdb
++};
++static const u8 enc_key018[] __initconst = {
++      0x2f, 0x7f, 0x7e, 0x4f, 0x59, 0x2b, 0xb3, 0x89,
++      0x19, 0x49, 0x89, 0x74, 0x35, 0x07, 0xbf, 0x3e,
++      0xe9, 0xcb, 0xde, 0x17, 0x86, 0xb6, 0x69, 0x5f,
++      0xe6, 0xc0, 0x25, 0xfd, 0x9b, 0xa4, 0xc1, 0x00
++};
++
++/* wycheproof - misc */
++static const u8 enc_input019[] __initconst = {
++      0xdd, 0xf2
++};
++static const u8 enc_output019[] __initconst = {
++      0xb6, 0x0d, 0xea, 0xd0, 0xfd, 0x46, 0x97, 0xec,
++      0x2e, 0x55, 0x58, 0x23, 0x77, 0x19, 0xd0, 0x24,
++      0x37, 0xa2
++};
++static const u8 enc_assoc019[] __initconst = {
++      0x88, 0x36, 0x4f, 0xc8, 0x06, 0x05, 0x18, 0xbf
++};
++static const u8 enc_nonce019[] __initconst = {
++      0x61, 0x54, 0x6b, 0xa5, 0xf1, 0x72, 0x05, 0x90,
++      0xb6, 0x04, 0x0a, 0xc6
++};
++static const u8 enc_key019[] __initconst = {
++      0xc8, 0x83, 0x3d, 0xce, 0x5e, 0xa9, 0xf2, 0x48,
++      0xaa, 0x20, 0x30, 0xea, 0xcf, 0xe7, 0x2b, 0xff,
++      0xe6, 0x9a, 0x62, 0x0c, 0xaf, 0x79, 0x33, 0x44,
++      0xe5, 0x71, 0x8f, 0xe0, 0xd7, 0xab, 0x1a, 0x58
++};
++
++/* wycheproof - misc */
++static const u8 enc_input020[] __initconst = {
++      0xab, 0x85, 0xe9, 0xc1, 0x57, 0x17, 0x31
++};
++static const u8 enc_output020[] __initconst = {
++      0x5d, 0xfe, 0x34, 0x40, 0xdb, 0xb3, 0xc3, 0xed,
++      0x7a, 0x43, 0x4e, 0x26, 0x02, 0xd3, 0x94, 0x28,
++      0x1e, 0x0a, 0xfa, 0x9f, 0xb7, 0xaa, 0x42
++};
++static const u8 enc_assoc020[] __initconst = { };
++static const u8 enc_nonce020[] __initconst = {
++      0x3c, 0x4e, 0x65, 0x4d, 0x66, 0x3f, 0xa4, 0x59,
++      0x6d, 0xc5, 0x5b, 0xb7
++};
++static const u8 enc_key020[] __initconst = {
++      0x55, 0x56, 0x81, 0x58, 0xd3, 0xa6, 0x48, 0x3f,
++      0x1f, 0x70, 0x21, 0xea, 0xb6, 0x9b, 0x70, 0x3f,
++      0x61, 0x42, 0x51, 0xca, 0xdc, 0x1a, 0xf5, 0xd3,
++      0x4a, 0x37, 0x4f, 0xdb, 0xfc, 0x5a, 0xda, 0xc7
++};
++
++/* wycheproof - misc */
++static const u8 enc_input021[] __initconst = {
++      0x4e, 0xe5, 0xcd, 0xa2, 0x0d, 0x42, 0x90
++};
++static const u8 enc_output021[] __initconst = {
++      0x4b, 0xd4, 0x72, 0x12, 0x94, 0x1c, 0xe3, 0x18,
++      0x5f, 0x14, 0x08, 0xee, 0x7f, 0xbf, 0x18, 0xf5,
++      0xab, 0xad, 0x6e, 0x22, 0x53, 0xa1, 0xba
++};
++static const u8 enc_assoc021[] __initconst = {
++      0x84, 0xe4, 0x6b, 0xe8, 0xc0, 0x91, 0x90, 0x53
++};
++static const u8 enc_nonce021[] __initconst = {
++      0x58, 0x38, 0x93, 0x75, 0xc6, 0x9e, 0xe3, 0x98,
++      0xde, 0x94, 0x83, 0x96
++};
++static const u8 enc_key021[] __initconst = {
++      0xe3, 0xc0, 0x9e, 0x7f, 0xab, 0x1a, 0xef, 0xb5,
++      0x16, 0xda, 0x6a, 0x33, 0x02, 0x2a, 0x1d, 0xd4,
++      0xeb, 0x27, 0x2c, 0x80, 0xd5, 0x40, 0xc5, 0xda,
++      0x52, 0xa7, 0x30, 0xf3, 0x4d, 0x84, 0x0d, 0x7f
++};
++
++/* wycheproof - misc */
++static const u8 enc_input022[] __initconst = {
++      0xbe, 0x33, 0x08, 0xf7, 0x2a, 0x2c, 0x6a, 0xed
++};
++static const u8 enc_output022[] __initconst = {
++      0x8e, 0x94, 0x39, 0xa5, 0x6e, 0xee, 0xc8, 0x17,
++      0xfb, 0xe8, 0xa6, 0xed, 0x8f, 0xab, 0xb1, 0x93,
++      0x75, 0x39, 0xdd, 0x6c, 0x00, 0xe9, 0x00, 0x21
++};
++static const u8 enc_assoc022[] __initconst = { };
++static const u8 enc_nonce022[] __initconst = {
++      0x4f, 0x07, 0xaf, 0xed, 0xfd, 0xc3, 0xb6, 0xc2,
++      0x36, 0x18, 0x23, 0xd3
++};
++static const u8 enc_key022[] __initconst = {
++      0x51, 0xe4, 0xbf, 0x2b, 0xad, 0x92, 0xb7, 0xaf,
++      0xf1, 0xa4, 0xbc, 0x05, 0x55, 0x0b, 0xa8, 0x1d,
++      0xf4, 0xb9, 0x6f, 0xab, 0xf4, 0x1c, 0x12, 0xc7,
++      0xb0, 0x0e, 0x60, 0xe4, 0x8d, 0xb7, 0xe1, 0x52
++};
++
++/* wycheproof - misc */
++static const u8 enc_input023[] __initconst = {
++      0xa4, 0xc9, 0xc2, 0x80, 0x1b, 0x71, 0xf7, 0xdf
++};
++static const u8 enc_output023[] __initconst = {
++      0xb9, 0xb9, 0x10, 0x43, 0x3a, 0xf0, 0x52, 0xb0,
++      0x45, 0x30, 0xf5, 0x1a, 0xee, 0xe0, 0x24, 0xe0,
++      0xa4, 0x45, 0xa6, 0x32, 0x8f, 0xa6, 0x7a, 0x18
++};
++static const u8 enc_assoc023[] __initconst = {
++      0x66, 0xc0, 0xae, 0x70, 0x07, 0x6c, 0xb1, 0x4d
++};
++static const u8 enc_nonce023[] __initconst = {
++      0xb4, 0xea, 0x66, 0x6e, 0xe1, 0x19, 0x56, 0x33,
++      0x66, 0x48, 0x4a, 0x78
++};
++static const u8 enc_key023[] __initconst = {
++      0x11, 0x31, 0xc1, 0x41, 0x85, 0x77, 0xa0, 0x54,
++      0xde, 0x7a, 0x4a, 0xc5, 0x51, 0x95, 0x0f, 0x1a,
++      0x05, 0x3f, 0x9a, 0xe4, 0x6e, 0x5b, 0x75, 0xfe,
++      0x4a, 0xbd, 0x56, 0x08, 0xd7, 0xcd, 0xda, 0xdd
++};
++
++/* wycheproof - misc */
++static const u8 enc_input024[] __initconst = {
++      0x42, 0xba, 0xae, 0x59, 0x78, 0xfe, 0xaf, 0x5c,
++      0x36, 0x8d, 0x14, 0xe0
++};
++static const u8 enc_output024[] __initconst = {
++      0xff, 0x7d, 0xc2, 0x03, 0xb2, 0x6c, 0x46, 0x7a,
++      0x6b, 0x50, 0xdb, 0x33, 0x57, 0x8c, 0x0f, 0x27,
++      0x58, 0xc2, 0xe1, 0x4e, 0x36, 0xd4, 0xfc, 0x10,
++      0x6d, 0xcb, 0x29, 0xb4
++};
++static const u8 enc_assoc024[] __initconst = { };
++static const u8 enc_nonce024[] __initconst = {
++      0x9a, 0x59, 0xfc, 0xe2, 0x6d, 0xf0, 0x00, 0x5e,
++      0x07, 0x53, 0x86, 0x56
++};
++static const u8 enc_key024[] __initconst = {
++      0x99, 0xb6, 0x2b, 0xd5, 0xaf, 0xbe, 0x3f, 0xb0,
++      0x15, 0xbd, 0xe9, 0x3f, 0x0a, 0xbf, 0x48, 0x39,
++      0x57, 0xa1, 0xc3, 0xeb, 0x3c, 0xa5, 0x9c, 0xb5,
++      0x0b, 0x39, 0xf7, 0xf8, 0xa9, 0xcc, 0x51, 0xbe
++};
++
++/* wycheproof - misc */
++static const u8 enc_input025[] __initconst = {
++      0xfd, 0xc8, 0x5b, 0x94, 0xa4, 0xb2, 0xa6, 0xb7,
++      0x59, 0xb1, 0xa0, 0xda
++};
++static const u8 enc_output025[] __initconst = {
++      0x9f, 0x88, 0x16, 0xde, 0x09, 0x94, 0xe9, 0x38,
++      0xd9, 0xe5, 0x3f, 0x95, 0xd0, 0x86, 0xfc, 0x6c,
++      0x9d, 0x8f, 0xa9, 0x15, 0xfd, 0x84, 0x23, 0xa7,
++      0xcf, 0x05, 0x07, 0x2f
++};
++static const u8 enc_assoc025[] __initconst = {
++      0xa5, 0x06, 0xe1, 0xa5, 0xc6, 0x90, 0x93, 0xf9
++};
++static const u8 enc_nonce025[] __initconst = {
++      0x58, 0xdb, 0xd4, 0xad, 0x2c, 0x4a, 0xd3, 0x5d,
++      0xd9, 0x06, 0xe9, 0xce
++};
++static const u8 enc_key025[] __initconst = {
++      0x85, 0xf3, 0x5b, 0x62, 0x82, 0xcf, 0xf4, 0x40,
++      0xbc, 0x10, 0x20, 0xc8, 0x13, 0x6f, 0xf2, 0x70,
++      0x31, 0x11, 0x0f, 0xa6, 0x3e, 0xc1, 0x6f, 0x1e,
++      0x82, 0x51, 0x18, 0xb0, 0x06, 0xb9, 0x12, 0x57
++};
++
++/* wycheproof - misc */
++static const u8 enc_input026[] __initconst = {
++      0x51, 0xf8, 0xc1, 0xf7, 0x31, 0xea, 0x14, 0xac,
++      0xdb, 0x21, 0x0a, 0x6d, 0x97, 0x3e, 0x07
++};
++static const u8 enc_output026[] __initconst = {
++      0x0b, 0x29, 0x63, 0x8e, 0x1f, 0xbd, 0xd6, 0xdf,
++      0x53, 0x97, 0x0b, 0xe2, 0x21, 0x00, 0x42, 0x2a,
++      0x91, 0x34, 0x08, 0x7d, 0x67, 0xa4, 0x6e, 0x79,
++      0x17, 0x8d, 0x0a, 0x93, 0xf5, 0xe1, 0xd2
++};
++static const u8 enc_assoc026[] __initconst = { };
++static const u8 enc_nonce026[] __initconst = {
++      0x68, 0xab, 0x7f, 0xdb, 0xf6, 0x19, 0x01, 0xda,
++      0xd4, 0x61, 0xd2, 0x3c
++};
++static const u8 enc_key026[] __initconst = {
++      0x67, 0x11, 0x96, 0x27, 0xbd, 0x98, 0x8e, 0xda,
++      0x90, 0x62, 0x19, 0xe0, 0x8c, 0x0d, 0x0d, 0x77,
++      0x9a, 0x07, 0xd2, 0x08, 0xce, 0x8a, 0x4f, 0xe0,
++      0x70, 0x9a, 0xf7, 0x55, 0xee, 0xec, 0x6d, 0xcb
++};
++
++/* wycheproof - misc */
++static const u8 enc_input027[] __initconst = {
++      0x97, 0x46, 0x9d, 0xa6, 0x67, 0xd6, 0x11, 0x0f,
++      0x9c, 0xbd, 0xa1, 0xd1, 0xa2, 0x06, 0x73
++};
++static const u8 enc_output027[] __initconst = {
++      0x32, 0xdb, 0x66, 0xc4, 0xa3, 0x81, 0x9d, 0x81,
++      0x55, 0x74, 0x55, 0xe5, 0x98, 0x0f, 0xed, 0xfe,
++      0xae, 0x30, 0xde, 0xc9, 0x4e, 0x6a, 0xd3, 0xa9,
++      0xee, 0xa0, 0x6a, 0x0d, 0x70, 0x39, 0x17
++};
++static const u8 enc_assoc027[] __initconst = {
++      0x64, 0x53, 0xa5, 0x33, 0x84, 0x63, 0x22, 0x12
++};
++static const u8 enc_nonce027[] __initconst = {
++      0xd9, 0x5b, 0x32, 0x43, 0xaf, 0xae, 0xf7, 0x14,
++      0xc5, 0x03, 0x5b, 0x6a
++};
++static const u8 enc_key027[] __initconst = {
++      0xe6, 0xf1, 0x11, 0x8d, 0x41, 0xe4, 0xb4, 0x3f,
++      0xb5, 0x82, 0x21, 0xb7, 0xed, 0x79, 0x67, 0x38,
++      0x34, 0xe0, 0xd8, 0xac, 0x5c, 0x4f, 0xa6, 0x0b,
++      0xbc, 0x8b, 0xc4, 0x89, 0x3a, 0x58, 0x89, 0x4d
++};
++
++/* wycheproof - misc */
++static const u8 enc_input028[] __initconst = {
++      0x54, 0x9b, 0x36, 0x5a, 0xf9, 0x13, 0xf3, 0xb0,
++      0x81, 0x13, 0x1c, 0xcb, 0x6b, 0x82, 0x55, 0x88
++};
++static const u8 enc_output028[] __initconst = {
++      0xe9, 0x11, 0x0e, 0x9f, 0x56, 0xab, 0x3c, 0xa4,
++      0x83, 0x50, 0x0c, 0xea, 0xba, 0xb6, 0x7a, 0x13,
++      0x83, 0x6c, 0xca, 0xbf, 0x15, 0xa6, 0xa2, 0x2a,
++      0x51, 0xc1, 0x07, 0x1c, 0xfa, 0x68, 0xfa, 0x0c
++};
++static const u8 enc_assoc028[] __initconst = { };
++static const u8 enc_nonce028[] __initconst = {
++      0x2f, 0xcb, 0x1b, 0x38, 0xa9, 0x9e, 0x71, 0xb8,
++      0x47, 0x40, 0xad, 0x9b
++};
++static const u8 enc_key028[] __initconst = {
++      0x59, 0xd4, 0xea, 0xfb, 0x4d, 0xe0, 0xcf, 0xc7,
++      0xd3, 0xdb, 0x99, 0xa8, 0xf5, 0x4b, 0x15, 0xd7,
++      0xb3, 0x9f, 0x0a, 0xcc, 0x8d, 0xa6, 0x97, 0x63,
++      0xb0, 0x19, 0xc1, 0x69, 0x9f, 0x87, 0x67, 0x4a
++};
++
++/* wycheproof - misc */
++static const u8 enc_input029[] __initconst = {
++      0x55, 0xa4, 0x65, 0x64, 0x4f, 0x5b, 0x65, 0x09,
++      0x28, 0xcb, 0xee, 0x7c, 0x06, 0x32, 0x14, 0xd6
++};
++static const u8 enc_output029[] __initconst = {
++      0xe4, 0xb1, 0x13, 0xcb, 0x77, 0x59, 0x45, 0xf3,
++      0xd3, 0xa8, 0xae, 0x9e, 0xc1, 0x41, 0xc0, 0x0c,
++      0x7c, 0x43, 0xf1, 0x6c, 0xe0, 0x96, 0xd0, 0xdc,
++      0x27, 0xc9, 0x58, 0x49, 0xdc, 0x38, 0x3b, 0x7d
++};
++static const u8 enc_assoc029[] __initconst = {
++      0x03, 0x45, 0x85, 0x62, 0x1a, 0xf8, 0xd7, 0xff
++};
++static const u8 enc_nonce029[] __initconst = {
++      0x11, 0x8a, 0x69, 0x64, 0xc2, 0xd3, 0xe3, 0x80,
++      0x07, 0x1f, 0x52, 0x66
++};
++static const u8 enc_key029[] __initconst = {
++      0xb9, 0x07, 0xa4, 0x50, 0x75, 0x51, 0x3f, 0xe8,
++      0xa8, 0x01, 0x9e, 0xde, 0xe3, 0xf2, 0x59, 0x14,
++      0x87, 0xb2, 0xa0, 0x30, 0xb0, 0x3c, 0x6e, 0x1d,
++      0x77, 0x1c, 0x86, 0x25, 0x71, 0xd2, 0xea, 0x1e
++};
++
++/* wycheproof - misc */
++static const u8 enc_input030[] __initconst = {
++      0x3f, 0xf1, 0x51, 0x4b, 0x1c, 0x50, 0x39, 0x15,
++      0x91, 0x8f, 0x0c, 0x0c, 0x31, 0x09, 0x4a, 0x6e,
++      0x1f
++};
++static const u8 enc_output030[] __initconst = {
++      0x02, 0xcc, 0x3a, 0xcb, 0x5e, 0xe1, 0xfc, 0xdd,
++      0x12, 0xa0, 0x3b, 0xb8, 0x57, 0x97, 0x64, 0x74,
++      0xd3, 0xd8, 0x3b, 0x74, 0x63, 0xa2, 0xc3, 0x80,
++      0x0f, 0xe9, 0x58, 0xc2, 0x8e, 0xaa, 0x29, 0x08,
++      0x13
++};
++static const u8 enc_assoc030[] __initconst = { };
++static const u8 enc_nonce030[] __initconst = {
++      0x45, 0xaa, 0xa3, 0xe5, 0xd1, 0x6d, 0x2d, 0x42,
++      0xdc, 0x03, 0x44, 0x5d
++};
++static const u8 enc_key030[] __initconst = {
++      0x3b, 0x24, 0x58, 0xd8, 0x17, 0x6e, 0x16, 0x21,
++      0xc0, 0xcc, 0x24, 0xc0, 0xc0, 0xe2, 0x4c, 0x1e,
++      0x80, 0xd7, 0x2f, 0x7e, 0xe9, 0x14, 0x9a, 0x4b,
++      0x16, 0x61, 0x76, 0x62, 0x96, 0x16, 0xd0, 0x11
++};
++
++/* wycheproof - misc */
++static const u8 enc_input031[] __initconst = {
++      0x63, 0x85, 0x8c, 0xa3, 0xe2, 0xce, 0x69, 0x88,
++      0x7b, 0x57, 0x8a, 0x3c, 0x16, 0x7b, 0x42, 0x1c,
++      0x9c
++};
++static const u8 enc_output031[] __initconst = {
++      0x35, 0x76, 0x64, 0x88, 0xd2, 0xbc, 0x7c, 0x2b,
++      0x8d, 0x17, 0xcb, 0xbb, 0x9a, 0xbf, 0xad, 0x9e,
++      0x6d, 0x1f, 0x39, 0x1e, 0x65, 0x7b, 0x27, 0x38,
++      0xdd, 0xa0, 0x84, 0x48, 0xcb, 0xa2, 0x81, 0x1c,
++      0xeb
++};
++static const u8 enc_assoc031[] __initconst = {
++      0x9a, 0xaf, 0x29, 0x9e, 0xee, 0xa7, 0x8f, 0x79
++};
++static const u8 enc_nonce031[] __initconst = {
++      0xf0, 0x38, 0x4f, 0xb8, 0x76, 0x12, 0x14, 0x10,
++      0x63, 0x3d, 0x99, 0x3d
++};
++static const u8 enc_key031[] __initconst = {
++      0xf6, 0x0c, 0x6a, 0x1b, 0x62, 0x57, 0x25, 0xf7,
++      0x6c, 0x70, 0x37, 0xb4, 0x8f, 0xe3, 0x57, 0x7f,
++      0xa7, 0xf7, 0xb8, 0x7b, 0x1b, 0xd5, 0xa9, 0x82,
++      0x17, 0x6d, 0x18, 0x23, 0x06, 0xff, 0xb8, 0x70
++};
++
++/* wycheproof - misc */
++static const u8 enc_input032[] __initconst = {
++      0x10, 0xf1, 0xec, 0xf9, 0xc6, 0x05, 0x84, 0x66,
++      0x5d, 0x9a, 0xe5, 0xef, 0xe2, 0x79, 0xe7, 0xf7,
++      0x37, 0x7e, 0xea, 0x69, 0x16, 0xd2, 0xb1, 0x11
++};
++static const u8 enc_output032[] __initconst = {
++      0x42, 0xf2, 0x6c, 0x56, 0xcb, 0x4b, 0xe2, 0x1d,
++      0x9d, 0x8d, 0x0c, 0x80, 0xfc, 0x99, 0xdd, 0xe0,
++      0x0d, 0x75, 0xf3, 0x80, 0x74, 0xbf, 0xe7, 0x64,
++      0x54, 0xaa, 0x7e, 0x13, 0xd4, 0x8f, 0xff, 0x7d,
++      0x75, 0x57, 0x03, 0x94, 0x57, 0x04, 0x0a, 0x3a
++};
++static const u8 enc_assoc032[] __initconst = { };
++static const u8 enc_nonce032[] __initconst = {
++      0xe6, 0xb1, 0xad, 0xf2, 0xfd, 0x58, 0xa8, 0x76,
++      0x2c, 0x65, 0xf3, 0x1b
++};
++static const u8 enc_key032[] __initconst = {
++      0x02, 0x12, 0xa8, 0xde, 0x50, 0x07, 0xed, 0x87,
++      0xb3, 0x3f, 0x1a, 0x70, 0x90, 0xb6, 0x11, 0x4f,
++      0x9e, 0x08, 0xce, 0xfd, 0x96, 0x07, 0xf2, 0xc2,
++      0x76, 0xbd, 0xcf, 0xdb, 0xc5, 0xce, 0x9c, 0xd7
++};
++
++/* wycheproof - misc */
++static const u8 enc_input033[] __initconst = {
++      0x92, 0x22, 0xf9, 0x01, 0x8e, 0x54, 0xfd, 0x6d,
++      0xe1, 0x20, 0x08, 0x06, 0xa9, 0xee, 0x8e, 0x4c,
++      0xc9, 0x04, 0xd2, 0x9f, 0x25, 0xcb, 0xa1, 0x93
++};
++static const u8 enc_output033[] __initconst = {
++      0x12, 0x30, 0x32, 0x43, 0x7b, 0x4b, 0xfd, 0x69,
++      0x20, 0xe8, 0xf7, 0xe7, 0xe0, 0x08, 0x7a, 0xe4,
++      0x88, 0x9e, 0xbe, 0x7a, 0x0a, 0xd0, 0xe9, 0x00,
++      0x3c, 0xf6, 0x8f, 0x17, 0x95, 0x50, 0xda, 0x63,
++      0xd3, 0xb9, 0x6c, 0x2d, 0x55, 0x41, 0x18, 0x65
++};
++static const u8 enc_assoc033[] __initconst = {
++      0x3e, 0x8b, 0xc5, 0xad, 0xe1, 0x82, 0xff, 0x08
++};
++static const u8 enc_nonce033[] __initconst = {
++      0x6b, 0x28, 0x2e, 0xbe, 0xcc, 0x54, 0x1b, 0xcd,
++      0x78, 0x34, 0xed, 0x55
++};
++static const u8 enc_key033[] __initconst = {
++      0xc5, 0xbc, 0x09, 0x56, 0x56, 0x46, 0xe7, 0xed,
++      0xda, 0x95, 0x4f, 0x1f, 0x73, 0x92, 0x23, 0xda,
++      0xda, 0x20, 0xb9, 0x5c, 0x44, 0xab, 0x03, 0x3d,
++      0x0f, 0xae, 0x4b, 0x02, 0x83, 0xd1, 0x8b, 0xe3
++};
++
++/* wycheproof - misc */
++static const u8 enc_input034[] __initconst = {
++      0xb0, 0x53, 0x99, 0x92, 0x86, 0xa2, 0x82, 0x4f,
++      0x42, 0xcc, 0x8c, 0x20, 0x3a, 0xb2, 0x4e, 0x2c,
++      0x97, 0xa6, 0x85, 0xad, 0xcc, 0x2a, 0xd3, 0x26,
++      0x62, 0x55, 0x8e, 0x55, 0xa5, 0xc7, 0x29
++};
++static const u8 enc_output034[] __initconst = {
++      0x45, 0xc7, 0xd6, 0xb5, 0x3a, 0xca, 0xd4, 0xab,
++      0xb6, 0x88, 0x76, 0xa6, 0xe9, 0x6a, 0x48, 0xfb,
++      0x59, 0x52, 0x4d, 0x2c, 0x92, 0xc9, 0xd8, 0xa1,
++      0x89, 0xc9, 0xfd, 0x2d, 0xb9, 0x17, 0x46, 0x56,
++      0x6d, 0x3c, 0xa1, 0x0e, 0x31, 0x1b, 0x69, 0x5f,
++      0x3e, 0xae, 0x15, 0x51, 0x65, 0x24, 0x93
++};
++static const u8 enc_assoc034[] __initconst = { };
++static const u8 enc_nonce034[] __initconst = {
++      0x04, 0xa9, 0xbe, 0x03, 0x50, 0x8a, 0x5f, 0x31,
++      0x37, 0x1a, 0x6f, 0xd2
++};
++static const u8 enc_key034[] __initconst = {
++      0x2e, 0xb5, 0x1c, 0x46, 0x9a, 0xa8, 0xeb, 0x9e,
++      0x6c, 0x54, 0xa8, 0x34, 0x9b, 0xae, 0x50, 0xa2,
++      0x0f, 0x0e, 0x38, 0x27, 0x11, 0xbb, 0xa1, 0x15,
++      0x2c, 0x42, 0x4f, 0x03, 0xb6, 0x67, 0x1d, 0x71
++};
++
++/* wycheproof - misc */
++static const u8 enc_input035[] __initconst = {
++      0xf4, 0x52, 0x06, 0xab, 0xc2, 0x55, 0x52, 0xb2,
++      0xab, 0xc9, 0xab, 0x7f, 0xa2, 0x43, 0x03, 0x5f,
++      0xed, 0xaa, 0xdd, 0xc3, 0xb2, 0x29, 0x39, 0x56,
++      0xf1, 0xea, 0x6e, 0x71, 0x56, 0xe7, 0xeb
++};
++static const u8 enc_output035[] __initconst = {
++      0x46, 0xa8, 0x0c, 0x41, 0x87, 0x02, 0x47, 0x20,
++      0x08, 0x46, 0x27, 0x58, 0x00, 0x80, 0xdd, 0xe5,
++      0xa3, 0xf4, 0xa1, 0x10, 0x93, 0xa7, 0x07, 0x6e,
++      0xd6, 0xf3, 0xd3, 0x26, 0xbc, 0x7b, 0x70, 0x53,
++      0x4d, 0x4a, 0xa2, 0x83, 0x5a, 0x52, 0xe7, 0x2d,
++      0x14, 0xdf, 0x0e, 0x4f, 0x47, 0xf2, 0x5f
++};
++static const u8 enc_assoc035[] __initconst = {
++      0x37, 0x46, 0x18, 0xa0, 0x6e, 0xa9, 0x8a, 0x48
++};
++static const u8 enc_nonce035[] __initconst = {
++      0x47, 0x0a, 0x33, 0x9e, 0xcb, 0x32, 0x19, 0xb8,
++      0xb8, 0x1a, 0x1f, 0x8b
++};
++static const u8 enc_key035[] __initconst = {
++      0x7f, 0x5b, 0x74, 0xc0, 0x7e, 0xd1, 0xb4, 0x0f,
++      0xd1, 0x43, 0x58, 0xfe, 0x2f, 0xf2, 0xa7, 0x40,
++      0xc1, 0x16, 0xc7, 0x70, 0x65, 0x10, 0xe6, 0xa4,
++      0x37, 0xf1, 0x9e, 0xa4, 0x99, 0x11, 0xce, 0xc4
++};
++
++/* wycheproof - misc */
++static const u8 enc_input036[] __initconst = {
++      0xb9, 0xc5, 0x54, 0xcb, 0xc3, 0x6a, 0xc1, 0x8a,
++      0xe8, 0x97, 0xdf, 0x7b, 0xee, 0xca, 0xc1, 0xdb,
++      0xeb, 0x4e, 0xaf, 0xa1, 0x56, 0xbb, 0x60, 0xce,
++      0x2e, 0x5d, 0x48, 0xf0, 0x57, 0x15, 0xe6, 0x78
++};
++static const u8 enc_output036[] __initconst = {
++      0xea, 0x29, 0xaf, 0xa4, 0x9d, 0x36, 0xe8, 0x76,
++      0x0f, 0x5f, 0xe1, 0x97, 0x23, 0xb9, 0x81, 0x1e,
++      0xd5, 0xd5, 0x19, 0x93, 0x4a, 0x44, 0x0f, 0x50,
++      0x81, 0xac, 0x43, 0x0b, 0x95, 0x3b, 0x0e, 0x21,
++      0x22, 0x25, 0x41, 0xaf, 0x46, 0xb8, 0x65, 0x33,
++      0xc6, 0xb6, 0x8d, 0x2f, 0xf1, 0x08, 0xa7, 0xea
++};
++static const u8 enc_assoc036[] __initconst = { };
++static const u8 enc_nonce036[] __initconst = {
++      0x72, 0xcf, 0xd9, 0x0e, 0xf3, 0x02, 0x6c, 0xa2,
++      0x2b, 0x7e, 0x6e, 0x6a
++};
++static const u8 enc_key036[] __initconst = {
++      0xe1, 0x73, 0x1d, 0x58, 0x54, 0xe1, 0xb7, 0x0c,
++      0xb3, 0xff, 0xe8, 0xb7, 0x86, 0xa2, 0xb3, 0xeb,
++      0xf0, 0x99, 0x43, 0x70, 0x95, 0x47, 0x57, 0xb9,
++      0xdc, 0x8c, 0x7b, 0xc5, 0x35, 0x46, 0x34, 0xa3
++};
++
++/* wycheproof - misc */
++static const u8 enc_input037[] __initconst = {
++      0x6b, 0x26, 0x04, 0x99, 0x6c, 0xd3, 0x0c, 0x14,
++      0xa1, 0x3a, 0x52, 0x57, 0xed, 0x6c, 0xff, 0xd3,
++      0xbc, 0x5e, 0x29, 0xd6, 0xb9, 0x7e, 0xb1, 0x79,
++      0x9e, 0xb3, 0x35, 0xe2, 0x81, 0xea, 0x45, 0x1e
++};
++static const u8 enc_output037[] __initconst = {
++      0x6d, 0xad, 0x63, 0x78, 0x97, 0x54, 0x4d, 0x8b,
++      0xf6, 0xbe, 0x95, 0x07, 0xed, 0x4d, 0x1b, 0xb2,
++      0xe9, 0x54, 0xbc, 0x42, 0x7e, 0x5d, 0xe7, 0x29,
++      0xda, 0xf5, 0x07, 0x62, 0x84, 0x6f, 0xf2, 0xf4,
++      0x7b, 0x99, 0x7d, 0x93, 0xc9, 0x82, 0x18, 0x9d,
++      0x70, 0x95, 0xdc, 0x79, 0x4c, 0x74, 0x62, 0x32
++};
++static const u8 enc_assoc037[] __initconst = {
++      0x23, 0x33, 0xe5, 0xce, 0x0f, 0x93, 0xb0, 0x59
++};
++static const u8 enc_nonce037[] __initconst = {
++      0x26, 0x28, 0x80, 0xd4, 0x75, 0xf3, 0xda, 0xc5,
++      0x34, 0x0d, 0xd1, 0xb8
++};
++static const u8 enc_key037[] __initconst = {
++      0x27, 0xd8, 0x60, 0x63, 0x1b, 0x04, 0x85, 0xa4,
++      0x10, 0x70, 0x2f, 0xea, 0x61, 0xbc, 0x87, 0x3f,
++      0x34, 0x42, 0x26, 0x0c, 0xad, 0xed, 0x4a, 0xbd,
++      0xe2, 0x5b, 0x78, 0x6a, 0x2d, 0x97, 0xf1, 0x45
++};
++
++/* wycheproof - misc */
++static const u8 enc_input038[] __initconst = {
++      0x97, 0x3d, 0x0c, 0x75, 0x38, 0x26, 0xba, 0xe4,
++      0x66, 0xcf, 0x9a, 0xbb, 0x34, 0x93, 0x15, 0x2e,
++      0x9d, 0xe7, 0x81, 0x9e, 0x2b, 0xd0, 0xc7, 0x11,
++      0x71, 0x34, 0x6b, 0x4d, 0x2c, 0xeb, 0xf8, 0x04,
++      0x1a, 0xa3, 0xce, 0xdc, 0x0d, 0xfd, 0x7b, 0x46,
++      0x7e, 0x26, 0x22, 0x8b, 0xc8, 0x6c, 0x9a
++};
++static const u8 enc_output038[] __initconst = {
++      0xfb, 0xa7, 0x8a, 0xe4, 0xf9, 0xd8, 0x08, 0xa6,
++      0x2e, 0x3d, 0xa4, 0x0b, 0xe2, 0xcb, 0x77, 0x00,
++      0xc3, 0x61, 0x3d, 0x9e, 0xb2, 0xc5, 0x29, 0xc6,
++      0x52, 0xe7, 0x6a, 0x43, 0x2c, 0x65, 0x8d, 0x27,
++      0x09, 0x5f, 0x0e, 0xb8, 0xf9, 0x40, 0xc3, 0x24,
++      0x98, 0x1e, 0xa9, 0x35, 0xe5, 0x07, 0xf9, 0x8f,
++      0x04, 0x69, 0x56, 0xdb, 0x3a, 0x51, 0x29, 0x08,
++      0xbd, 0x7a, 0xfc, 0x8f, 0x2a, 0xb0, 0xa9
++};
++static const u8 enc_assoc038[] __initconst = { };
++static const u8 enc_nonce038[] __initconst = {
++      0xe7, 0x4a, 0x51, 0x5e, 0x7e, 0x21, 0x02, 0xb9,
++      0x0b, 0xef, 0x55, 0xd2
++};
++static const u8 enc_key038[] __initconst = {
++      0xcf, 0x0d, 0x40, 0xa4, 0x64, 0x4e, 0x5f, 0x51,
++      0x81, 0x51, 0x65, 0xd5, 0x30, 0x1b, 0x22, 0x63,
++      0x1f, 0x45, 0x44, 0xc4, 0x9a, 0x18, 0x78, 0xe3,
++      0xa0, 0xa5, 0xe8, 0xe1, 0xaa, 0xe0, 0xf2, 0x64
++};
++
++/* wycheproof - misc */
++static const u8 enc_input039[] __initconst = {
++      0xa9, 0x89, 0x95, 0x50, 0x4d, 0xf1, 0x6f, 0x74,
++      0x8b, 0xfb, 0x77, 0x85, 0xff, 0x91, 0xee, 0xb3,
++      0xb6, 0x60, 0xea, 0x9e, 0xd3, 0x45, 0x0c, 0x3d,
++      0x5e, 0x7b, 0x0e, 0x79, 0xef, 0x65, 0x36, 0x59,
++      0xa9, 0x97, 0x8d, 0x75, 0x54, 0x2e, 0xf9, 0x1c,
++      0x45, 0x67, 0x62, 0x21, 0x56, 0x40, 0xb9
++};
++static const u8 enc_output039[] __initconst = {
++      0xa1, 0xff, 0xed, 0x80, 0x76, 0x18, 0x29, 0xec,
++      0xce, 0x24, 0x2e, 0x0e, 0x88, 0xb1, 0x38, 0x04,
++      0x90, 0x16, 0xbc, 0xa0, 0x18, 0xda, 0x2b, 0x6e,
++      0x19, 0x98, 0x6b, 0x3e, 0x31, 0x8c, 0xae, 0x8d,
++      0x80, 0x61, 0x98, 0xfb, 0x4c, 0x52, 0x7c, 0xc3,
++      0x93, 0x50, 0xeb, 0xdd, 0xea, 0xc5, 0x73, 0xc4,
++      0xcb, 0xf0, 0xbe, 0xfd, 0xa0, 0xb7, 0x02, 0x42,
++      0xc6, 0x40, 0xd7, 0xcd, 0x02, 0xd7, 0xa3
++};
++static const u8 enc_assoc039[] __initconst = {
++      0xb3, 0xe4, 0x06, 0x46, 0x83, 0xb0, 0x2d, 0x84
++};
++static const u8 enc_nonce039[] __initconst = {
++      0xd4, 0xd8, 0x07, 0x34, 0x16, 0x83, 0x82, 0x5b,
++      0x31, 0xcd, 0x4d, 0x95
++};
++static const u8 enc_key039[] __initconst = {
++      0x6c, 0xbf, 0xd7, 0x1c, 0x64, 0x5d, 0x18, 0x4c,
++      0xf5, 0xd2, 0x3c, 0x40, 0x2b, 0xdb, 0x0d, 0x25,
++      0xec, 0x54, 0x89, 0x8c, 0x8a, 0x02, 0x73, 0xd4,
++      0x2e, 0xb5, 0xbe, 0x10, 0x9f, 0xdc, 0xb2, 0xac
++};
++
++/* wycheproof - misc */
++static const u8 enc_input040[] __initconst = {
++      0xd0, 0x96, 0x80, 0x31, 0x81, 0xbe, 0xef, 0x9e,
++      0x00, 0x8f, 0xf8, 0x5d, 0x5d, 0xdc, 0x38, 0xdd,
++      0xac, 0xf0, 0xf0, 0x9e, 0xe5, 0xf7, 0xe0, 0x7f,
++      0x1e, 0x40, 0x79, 0xcb, 0x64, 0xd0, 0xdc, 0x8f,
++      0x5e, 0x67, 0x11, 0xcd, 0x49, 0x21, 0xa7, 0x88,
++      0x7d, 0xe7, 0x6e, 0x26, 0x78, 0xfd, 0xc6, 0x76,
++      0x18, 0xf1, 0x18, 0x55, 0x86, 0xbf, 0xea, 0x9d,
++      0x4c, 0x68, 0x5d, 0x50, 0xe4, 0xbb, 0x9a, 0x82
++};
++static const u8 enc_output040[] __initconst = {
++      0x9a, 0x4e, 0xf2, 0x2b, 0x18, 0x16, 0x77, 0xb5,
++      0x75, 0x5c, 0x08, 0xf7, 0x47, 0xc0, 0xf8, 0xd8,
++      0xe8, 0xd4, 0xc1, 0x8a, 0x9c, 0xc2, 0x40, 0x5c,
++      0x12, 0xbb, 0x51, 0xbb, 0x18, 0x72, 0xc8, 0xe8,
++      0xb8, 0x77, 0x67, 0x8b, 0xec, 0x44, 0x2c, 0xfc,
++      0xbb, 0x0f, 0xf4, 0x64, 0xa6, 0x4b, 0x74, 0x33,
++      0x2c, 0xf0, 0x72, 0x89, 0x8c, 0x7e, 0x0e, 0xdd,
++      0xf6, 0x23, 0x2e, 0xa6, 0xe2, 0x7e, 0xfe, 0x50,
++      0x9f, 0xf3, 0x42, 0x7a, 0x0f, 0x32, 0xfa, 0x56,
++      0x6d, 0x9c, 0xa0, 0xa7, 0x8a, 0xef, 0xc0, 0x13
++};
++static const u8 enc_assoc040[] __initconst = { };
++static const u8 enc_nonce040[] __initconst = {
++      0xd6, 0x10, 0x40, 0xa3, 0x13, 0xed, 0x49, 0x28,
++      0x23, 0xcc, 0x06, 0x5b
++};
++static const u8 enc_key040[] __initconst = {
++      0x5b, 0x1d, 0x10, 0x35, 0xc0, 0xb1, 0x7e, 0xe0,
++      0xb0, 0x44, 0x47, 0x67, 0xf8, 0x0a, 0x25, 0xb8,
++      0xc1, 0xb7, 0x41, 0xf4, 0xb5, 0x0a, 0x4d, 0x30,
++      0x52, 0x22, 0x6b, 0xaa, 0x1c, 0x6f, 0xb7, 0x01
++};
++
++/* wycheproof - misc */
++static const u8 enc_input041[] __initconst = {
++      0x94, 0xee, 0x16, 0x6d, 0x6d, 0x6e, 0xcf, 0x88,
++      0x32, 0x43, 0x71, 0x36, 0xb4, 0xae, 0x80, 0x5d,
++      0x42, 0x88, 0x64, 0x35, 0x95, 0x86, 0xd9, 0x19,
++      0x3a, 0x25, 0x01, 0x62, 0x93, 0xed, 0xba, 0x44,
++      0x3c, 0x58, 0xe0, 0x7e, 0x7b, 0x71, 0x95, 0xec,
++      0x5b, 0xd8, 0x45, 0x82, 0xa9, 0xd5, 0x6c, 0x8d,
++      0x4a, 0x10, 0x8c, 0x7d, 0x7c, 0xe3, 0x4e, 0x6c,
++      0x6f, 0x8e, 0xa1, 0xbe, 0xc0, 0x56, 0x73, 0x17
++};
++static const u8 enc_output041[] __initconst = {
++      0x5f, 0xbb, 0xde, 0xcc, 0x34, 0xbe, 0x20, 0x16,
++      0x14, 0xf6, 0x36, 0x03, 0x1e, 0xeb, 0x42, 0xf1,
++      0xca, 0xce, 0x3c, 0x79, 0xa1, 0x2c, 0xff, 0xd8,
++      0x71, 0xee, 0x8e, 0x73, 0x82, 0x0c, 0x82, 0x97,
++      0x49, 0xf1, 0xab, 0xb4, 0x29, 0x43, 0x67, 0x84,
++      0x9f, 0xb6, 0xc2, 0xaa, 0x56, 0xbd, 0xa8, 0xa3,
++      0x07, 0x8f, 0x72, 0x3d, 0x7c, 0x1c, 0x85, 0x20,
++      0x24, 0xb0, 0x17, 0xb5, 0x89, 0x73, 0xfb, 0x1e,
++      0x09, 0x26, 0x3d, 0xa7, 0xb4, 0xcb, 0x92, 0x14,
++      0x52, 0xf9, 0x7d, 0xca, 0x40, 0xf5, 0x80, 0xec
++};
++static const u8 enc_assoc041[] __initconst = {
++      0x71, 0x93, 0xf6, 0x23, 0x66, 0x33, 0x21, 0xa2
++};
++static const u8 enc_nonce041[] __initconst = {
++      0xd3, 0x1c, 0x21, 0xab, 0xa1, 0x75, 0xb7, 0x0d,
++      0xe4, 0xeb, 0xb1, 0x9c
++};
++static const u8 enc_key041[] __initconst = {
++      0x97, 0xd6, 0x35, 0xc4, 0xf4, 0x75, 0x74, 0xd9,
++      0x99, 0x8a, 0x90, 0x87, 0x5d, 0xa1, 0xd3, 0xa2,
++      0x84, 0xb7, 0x55, 0xb2, 0xd3, 0x92, 0x97, 0xa5,
++      0x72, 0x52, 0x35, 0x19, 0x0e, 0x10, 0xa9, 0x7e
++};
++
++/* wycheproof - misc */
++static const u8 enc_input042[] __initconst = {
++      0xb4, 0x29, 0xeb, 0x80, 0xfb, 0x8f, 0xe8, 0xba,
++      0xed, 0xa0, 0xc8, 0x5b, 0x9c, 0x33, 0x34, 0x58,
++      0xe7, 0xc2, 0x99, 0x2e, 0x55, 0x84, 0x75, 0x06,
++      0x9d, 0x12, 0xd4, 0x5c, 0x22, 0x21, 0x75, 0x64,
++      0x12, 0x15, 0x88, 0x03, 0x22, 0x97, 0xef, 0xf5,
++      0x67, 0x83, 0x74, 0x2a, 0x5f, 0xc2, 0x2d, 0x74,
++      0x10, 0xff, 0xb2, 0x9d, 0x66, 0x09, 0x86, 0x61,
++      0xd7, 0x6f, 0x12, 0x6c, 0x3c, 0x27, 0x68, 0x9e,
++      0x43, 0xb3, 0x72, 0x67, 0xca, 0xc5, 0xa3, 0xa6,
++      0xd3, 0xab, 0x49, 0xe3, 0x91, 0xda, 0x29, 0xcd,
++      0x30, 0x54, 0xa5, 0x69, 0x2e, 0x28, 0x07, 0xe4,
++      0xc3, 0xea, 0x46, 0xc8, 0x76, 0x1d, 0x50, 0xf5,
++      0x92
++};
++static const u8 enc_output042[] __initconst = {
++      0xd0, 0x10, 0x2f, 0x6c, 0x25, 0x8b, 0xf4, 0x97,
++      0x42, 0xce, 0xc3, 0x4c, 0xf2, 0xd0, 0xfe, 0xdf,
++      0x23, 0xd1, 0x05, 0xfb, 0x4c, 0x84, 0xcf, 0x98,
++      0x51, 0x5e, 0x1b, 0xc9, 0xa6, 0x4f, 0x8a, 0xd5,
++      0xbe, 0x8f, 0x07, 0x21, 0xbd, 0xe5, 0x06, 0x45,
++      0xd0, 0x00, 0x83, 0xc3, 0xa2, 0x63, 0xa3, 0x10,
++      0x53, 0xb7, 0x60, 0x24, 0x5f, 0x52, 0xae, 0x28,
++      0x66, 0xa5, 0xec, 0x83, 0xb1, 0x9f, 0x61, 0xbe,
++      0x1d, 0x30, 0xd5, 0xc5, 0xd9, 0xfe, 0xcc, 0x4c,
++      0xbb, 0xe0, 0x8f, 0xd3, 0x85, 0x81, 0x3a, 0x2a,
++      0xa3, 0x9a, 0x00, 0xff, 0x9c, 0x10, 0xf7, 0xf2,
++      0x37, 0x02, 0xad, 0xd1, 0xe4, 0xb2, 0xff, 0xa3,
++      0x1c, 0x41, 0x86, 0x5f, 0xc7, 0x1d, 0xe1, 0x2b,
++      0x19, 0x61, 0x21, 0x27, 0xce, 0x49, 0x99, 0x3b,
++      0xb0
++};
++static const u8 enc_assoc042[] __initconst = { };
++static const u8 enc_nonce042[] __initconst = {
++      0x17, 0xc8, 0x6a, 0x8a, 0xbb, 0xb7, 0xe0, 0x03,
++      0xac, 0xde, 0x27, 0x99
++};
++static const u8 enc_key042[] __initconst = {
++      0xfe, 0x6e, 0x55, 0xbd, 0xae, 0xd1, 0xf7, 0x28,
++      0x4c, 0xa5, 0xfc, 0x0f, 0x8c, 0x5f, 0x2b, 0x8d,
++      0xf5, 0x6d, 0xc0, 0xf4, 0x9e, 0x8c, 0xa6, 0x6a,
++      0x41, 0x99, 0x5e, 0x78, 0x33, 0x51, 0xf9, 0x01
++};
++
++/* wycheproof - misc */
++static const u8 enc_input043[] __initconst = {
++      0xce, 0xb5, 0x34, 0xce, 0x50, 0xdc, 0x23, 0xff,
++      0x63, 0x8a, 0xce, 0x3e, 0xf6, 0x3a, 0xb2, 0xcc,
++      0x29, 0x73, 0xee, 0xad, 0xa8, 0x07, 0x85, 0xfc,
++      0x16, 0x5d, 0x06, 0xc2, 0xf5, 0x10, 0x0f, 0xf5,
++      0xe8, 0xab, 0x28, 0x82, 0xc4, 0x75, 0xaf, 0xcd,
++      0x05, 0xcc, 0xd4, 0x9f, 0x2e, 0x7d, 0x8f, 0x55,
++      0xef, 0x3a, 0x72, 0xe3, 0xdc, 0x51, 0xd6, 0x85,
++      0x2b, 0x8e, 0x6b, 0x9e, 0x7a, 0xec, 0xe5, 0x7b,
++      0xe6, 0x55, 0x6b, 0x0b, 0x6d, 0x94, 0x13, 0xe3,
++      0x3f, 0xc5, 0xfc, 0x24, 0xa9, 0xa2, 0x05, 0xad,
++      0x59, 0x57, 0x4b, 0xb3, 0x9d, 0x94, 0x4a, 0x92,
++      0xdc, 0x47, 0x97, 0x0d, 0x84, 0xa6, 0xad, 0x31,
++      0x76
++};
++static const u8 enc_output043[] __initconst = {
++      0x75, 0x45, 0x39, 0x1b, 0x51, 0xde, 0x01, 0xd5,
++      0xc5, 0x3d, 0xfa, 0xca, 0x77, 0x79, 0x09, 0x06,
++      0x3e, 0x58, 0xed, 0xee, 0x4b, 0xb1, 0x22, 0x7e,
++      0x71, 0x10, 0xac, 0x4d, 0x26, 0x20, 0xc2, 0xae,
++      0xc2, 0xf8, 0x48, 0xf5, 0x6d, 0xee, 0xb0, 0x37,
++      0xa8, 0xdc, 0xed, 0x75, 0xaf, 0xa8, 0xa6, 0xc8,
++      0x90, 0xe2, 0xde, 0xe4, 0x2f, 0x95, 0x0b, 0xb3,
++      0x3d, 0x9e, 0x24, 0x24, 0xd0, 0x8a, 0x50, 0x5d,
++      0x89, 0x95, 0x63, 0x97, 0x3e, 0xd3, 0x88, 0x70,
++      0xf3, 0xde, 0x6e, 0xe2, 0xad, 0xc7, 0xfe, 0x07,
++      0x2c, 0x36, 0x6c, 0x14, 0xe2, 0xcf, 0x7c, 0xa6,
++      0x2f, 0xb3, 0xd3, 0x6b, 0xee, 0x11, 0x68, 0x54,
++      0x61, 0xb7, 0x0d, 0x44, 0xef, 0x8c, 0x66, 0xc5,
++      0xc7, 0xbb, 0xf1, 0x0d, 0xca, 0xdd, 0x7f, 0xac,
++      0xf6
++};
++static const u8 enc_assoc043[] __initconst = {
++      0xa1, 0x1c, 0x40, 0xb6, 0x03, 0x76, 0x73, 0x30
++};
++static const u8 enc_nonce043[] __initconst = {
++      0x46, 0x36, 0x2f, 0x45, 0xd6, 0x37, 0x9e, 0x63,
++      0xe5, 0x22, 0x94, 0x60
++};
++static const u8 enc_key043[] __initconst = {
++      0xaa, 0xbc, 0x06, 0x34, 0x74, 0xe6, 0x5c, 0x4c,
++      0x3e, 0x9b, 0xdc, 0x48, 0x0d, 0xea, 0x97, 0xb4,
++      0x51, 0x10, 0xc8, 0x61, 0x88, 0x46, 0xff, 0x6b,
++      0x15, 0xbd, 0xd2, 0xa4, 0xa5, 0x68, 0x2c, 0x4e
++};
++
++/* wycheproof - misc */
++static const u8 enc_input044[] __initconst = {
++      0xe5, 0xcc, 0xaa, 0x44, 0x1b, 0xc8, 0x14, 0x68,
++      0x8f, 0x8f, 0x6e, 0x8f, 0x28, 0xb5, 0x00, 0xb2
++};
++static const u8 enc_output044[] __initconst = {
++      0x7e, 0x72, 0xf5, 0xa1, 0x85, 0xaf, 0x16, 0xa6,
++      0x11, 0x92, 0x1b, 0x43, 0x8f, 0x74, 0x9f, 0x0b,
++      0x12, 0x42, 0xc6, 0x70, 0x73, 0x23, 0x34, 0x02,
++      0x9a, 0xdf, 0xe1, 0xc5, 0x00, 0x16, 0x51, 0xe4
++};
++static const u8 enc_assoc044[] __initconst = {
++      0x02
++};
++static const u8 enc_nonce044[] __initconst = {
++      0x87, 0x34, 0x5f, 0x10, 0x55, 0xfd, 0x9e, 0x21,
++      0x02, 0xd5, 0x06, 0x56
++};
++static const u8 enc_key044[] __initconst = {
++      0x7d, 0x00, 0xb4, 0x80, 0x95, 0xad, 0xfa, 0x32,
++      0x72, 0x05, 0x06, 0x07, 0xb2, 0x64, 0x18, 0x50,
++      0x02, 0xba, 0x99, 0x95, 0x7c, 0x49, 0x8b, 0xe0,
++      0x22, 0x77, 0x0f, 0x2c, 0xe2, 0xf3, 0x14, 0x3c
++};
++
++/* wycheproof - misc */
++static const u8 enc_input045[] __initconst = {
++      0x02, 0xcd, 0xe1, 0x68, 0xfb, 0xa3, 0xf5, 0x44,
++      0xbb, 0xd0, 0x33, 0x2f, 0x7a, 0xde, 0xad, 0xa8
++};
++static const u8 enc_output045[] __initconst = {
++      0x85, 0xf2, 0x9a, 0x71, 0x95, 0x57, 0xcd, 0xd1,
++      0x4d, 0x1f, 0x8f, 0xff, 0xab, 0x6d, 0x9e, 0x60,
++      0x73, 0x2c, 0xa3, 0x2b, 0xec, 0xd5, 0x15, 0xa1,
++      0xed, 0x35, 0x3f, 0x54, 0x2e, 0x99, 0x98, 0x58
++};
++static const u8 enc_assoc045[] __initconst = {
++      0xb6, 0x48
++};
++static const u8 enc_nonce045[] __initconst = {
++      0x87, 0xa3, 0x16, 0x3e, 0xc0, 0x59, 0x8a, 0xd9,
++      0x5b, 0x3a, 0xa7, 0x13
++};
++static const u8 enc_key045[] __initconst = {
++      0x64, 0x32, 0x71, 0x7f, 0x1d, 0xb8, 0x5e, 0x41,
++      0xac, 0x78, 0x36, 0xbc, 0xe2, 0x51, 0x85, 0xa0,
++      0x80, 0xd5, 0x76, 0x2b, 0x9e, 0x2b, 0x18, 0x44,
++      0x4b, 0x6e, 0xc7, 0x2c, 0x3b, 0xd8, 0xe4, 0xdc
++};
++
++/* wycheproof - misc */
++static const u8 enc_input046[] __initconst = {
++      0x16, 0xdd, 0xd2, 0x3f, 0xf5, 0x3f, 0x3d, 0x23,
++      0xc0, 0x63, 0x34, 0x48, 0x70, 0x40, 0xeb, 0x47
++};
++static const u8 enc_output046[] __initconst = {
++      0xc1, 0xb2, 0x95, 0x93, 0x6d, 0x56, 0xfa, 0xda,
++      0xc0, 0x3e, 0x5f, 0x74, 0x2b, 0xff, 0x73, 0xa1,
++      0x39, 0xc4, 0x57, 0xdb, 0xab, 0x66, 0x38, 0x2b,
++      0xab, 0xb3, 0xb5, 0x58, 0x00, 0xcd, 0xa5, 0xb8
++};
++static const u8 enc_assoc046[] __initconst = {
++      0xbd, 0x4c, 0xd0, 0x2f, 0xc7, 0x50, 0x2b, 0xbd,
++      0xbd, 0xf6, 0xc9, 0xa3, 0xcb, 0xe8, 0xf0
++};
++static const u8 enc_nonce046[] __initconst = {
++      0x6f, 0x57, 0x3a, 0xa8, 0x6b, 0xaa, 0x49, 0x2b,
++      0xa4, 0x65, 0x96, 0xdf
++};
++static const u8 enc_key046[] __initconst = {
++      0x8e, 0x34, 0xcf, 0x73, 0xd2, 0x45, 0xa1, 0x08,
++      0x2a, 0x92, 0x0b, 0x86, 0x36, 0x4e, 0xb8, 0x96,
++      0xc4, 0x94, 0x64, 0x67, 0xbc, 0xb3, 0xd5, 0x89,
++      0x29, 0xfc, 0xb3, 0x66, 0x90, 0xe6, 0x39, 0x4f
++};
++
++/* wycheproof - misc */
++static const u8 enc_input047[] __initconst = {
++      0x62, 0x3b, 0x78, 0x50, 0xc3, 0x21, 0xe2, 0xcf,
++      0x0c, 0x6f, 0xbc, 0xc8, 0xdf, 0xd1, 0xaf, 0xf2
++};
++static const u8 enc_output047[] __initconst = {
++      0xc8, 0x4c, 0x9b, 0xb7, 0xc6, 0x1c, 0x1b, 0xcb,
++      0x17, 0x77, 0x2a, 0x1c, 0x50, 0x0c, 0x50, 0x95,
++      0xdb, 0xad, 0xf7, 0xa5, 0x13, 0x8c, 0xa0, 0x34,
++      0x59, 0xa2, 0xcd, 0x65, 0x83, 0x1e, 0x09, 0x2f
++};
++static const u8 enc_assoc047[] __initconst = {
++      0x89, 0xcc, 0xe9, 0xfb, 0x47, 0x44, 0x1d, 0x07,
++      0xe0, 0x24, 0x5a, 0x66, 0xfe, 0x8b, 0x77, 0x8b
++};
++static const u8 enc_nonce047[] __initconst = {
++      0x1a, 0x65, 0x18, 0xf0, 0x2e, 0xde, 0x1d, 0xa6,
++      0x80, 0x92, 0x66, 0xd9
++};
++static const u8 enc_key047[] __initconst = {
++      0xcb, 0x55, 0x75, 0xf5, 0xc7, 0xc4, 0x5c, 0x91,
++      0xcf, 0x32, 0x0b, 0x13, 0x9f, 0xb5, 0x94, 0x23,
++      0x75, 0x60, 0xd0, 0xa3, 0xe6, 0xf8, 0x65, 0xa6,
++      0x7d, 0x4f, 0x63, 0x3f, 0x2c, 0x08, 0xf0, 0x16
++};
++
++/* wycheproof - misc */
++static const u8 enc_input048[] __initconst = {
++      0x87, 0xb3, 0xa4, 0xd7, 0xb2, 0x6d, 0x8d, 0x32,
++      0x03, 0xa0, 0xde, 0x1d, 0x64, 0xef, 0x82, 0xe3
++};
++static const u8 enc_output048[] __initconst = {
++      0x94, 0xbc, 0x80, 0x62, 0x1e, 0xd1, 0xe7, 0x1b,
++      0x1f, 0xd2, 0xb5, 0xc3, 0xa1, 0x5e, 0x35, 0x68,
++      0x33, 0x35, 0x11, 0x86, 0x17, 0x96, 0x97, 0x84,
++      0x01, 0x59, 0x8b, 0x96, 0x37, 0x22, 0xf5, 0xb3
++};
++static const u8 enc_assoc048[] __initconst = {
++      0xd1, 0x9f, 0x2d, 0x98, 0x90, 0x95, 0xf7, 0xab,
++      0x03, 0xa5, 0xfd, 0xe8, 0x44, 0x16, 0xe0, 0x0c,
++      0x0e
++};
++static const u8 enc_nonce048[] __initconst = {
++      0x56, 0x4d, 0xee, 0x49, 0xab, 0x00, 0xd2, 0x40,
++      0xfc, 0x10, 0x68, 0xc3
++};
++static const u8 enc_key048[] __initconst = {
++      0xa5, 0x56, 0x9e, 0x72, 0x9a, 0x69, 0xb2, 0x4b,
++      0xa6, 0xe0, 0xff, 0x15, 0xc4, 0x62, 0x78, 0x97,
++      0x43, 0x68, 0x24, 0xc9, 0x41, 0xe9, 0xd0, 0x0b,
++      0x2e, 0x93, 0xfd, 0xdc, 0x4b, 0xa7, 0x76, 0x57
++};
++
++/* wycheproof - misc */
++static const u8 enc_input049[] __initconst = {
++      0xe6, 0x01, 0xb3, 0x85, 0x57, 0x79, 0x7d, 0xa2,
++      0xf8, 0xa4, 0x10, 0x6a, 0x08, 0x9d, 0x1d, 0xa6
++};
++static const u8 enc_output049[] __initconst = {
++      0x29, 0x9b, 0x5d, 0x3f, 0x3d, 0x03, 0xc0, 0x87,
++      0x20, 0x9a, 0x16, 0xe2, 0x85, 0x14, 0x31, 0x11,
++      0x4b, 0x45, 0x4e, 0xd1, 0x98, 0xde, 0x11, 0x7e,
++      0x83, 0xec, 0x49, 0xfa, 0x8d, 0x85, 0x08, 0xd6
++};
++static const u8 enc_assoc049[] __initconst = {
++      0x5e, 0x64, 0x70, 0xfa, 0xcd, 0x99, 0xc1, 0xd8,
++      0x1e, 0x37, 0xcd, 0x44, 0x01, 0x5f, 0xe1, 0x94,
++      0x80, 0xa2, 0xa4, 0xd3, 0x35, 0x2a, 0x4f, 0xf5,
++      0x60, 0xc0, 0x64, 0x0f, 0xdb, 0xda
++};
++static const u8 enc_nonce049[] __initconst = {
++      0xdf, 0x87, 0x13, 0xe8, 0x7e, 0xc3, 0xdb, 0xcf,
++      0xad, 0x14, 0xd5, 0x3e
++};
++static const u8 enc_key049[] __initconst = {
++      0x56, 0x20, 0x74, 0x65, 0xb4, 0xe4, 0x8e, 0x6d,
++      0x04, 0x63, 0x0f, 0x4a, 0x42, 0xf3, 0x5c, 0xfc,
++      0x16, 0x3a, 0xb2, 0x89, 0xc2, 0x2a, 0x2b, 0x47,
++      0x84, 0xf6, 0xf9, 0x29, 0x03, 0x30, 0xbe, 0xe0
++};
++
++/* wycheproof - misc */
++static const u8 enc_input050[] __initconst = {
++      0xdc, 0x9e, 0x9e, 0xaf, 0x11, 0xe3, 0x14, 0x18,
++      0x2d, 0xf6, 0xa4, 0xeb, 0xa1, 0x7a, 0xec, 0x9c
++};
++static const u8 enc_output050[] __initconst = {
++      0x60, 0x5b, 0xbf, 0x90, 0xae, 0xb9, 0x74, 0xf6,
++      0x60, 0x2b, 0xc7, 0x78, 0x05, 0x6f, 0x0d, 0xca,
++      0x38, 0xea, 0x23, 0xd9, 0x90, 0x54, 0xb4, 0x6b,
++      0x42, 0xff, 0xe0, 0x04, 0x12, 0x9d, 0x22, 0x04
++};
++static const u8 enc_assoc050[] __initconst = {
++      0xba, 0x44, 0x6f, 0x6f, 0x9a, 0x0c, 0xed, 0x22,
++      0x45, 0x0f, 0xeb, 0x10, 0x73, 0x7d, 0x90, 0x07,
++      0xfd, 0x69, 0xab, 0xc1, 0x9b, 0x1d, 0x4d, 0x90,
++      0x49, 0xa5, 0x55, 0x1e, 0x86, 0xec, 0x2b, 0x37
++};
++static const u8 enc_nonce050[] __initconst = {
++      0x8d, 0xf4, 0xb1, 0x5a, 0x88, 0x8c, 0x33, 0x28,
++      0x6a, 0x7b, 0x76, 0x51
++};
++static const u8 enc_key050[] __initconst = {
++      0x39, 0x37, 0x98, 0x6a, 0xf8, 0x6d, 0xaf, 0xc1,
++      0xba, 0x0c, 0x46, 0x72, 0xd8, 0xab, 0xc4, 0x6c,
++      0x20, 0x70, 0x62, 0x68, 0x2d, 0x9c, 0x26, 0x4a,
++      0xb0, 0x6d, 0x6c, 0x58, 0x07, 0x20, 0x51, 0x30
++};
++
++/* wycheproof - misc */
++static const u8 enc_input051[] __initconst = {
++      0x81, 0xce, 0x84, 0xed, 0xe9, 0xb3, 0x58, 0x59,
++      0xcc, 0x8c, 0x49, 0xa8, 0xf6, 0xbe, 0x7d, 0xc6
++};
++static const u8 enc_output051[] __initconst = {
++      0x7b, 0x7c, 0xe0, 0xd8, 0x24, 0x80, 0x9a, 0x70,
++      0xde, 0x32, 0x56, 0x2c, 0xcf, 0x2c, 0x2b, 0xbd,
++      0x15, 0xd4, 0x4a, 0x00, 0xce, 0x0d, 0x19, 0xb4,
++      0x23, 0x1f, 0x92, 0x1e, 0x22, 0xbc, 0x0a, 0x43
++};
++static const u8 enc_assoc051[] __initconst = {
++      0xd4, 0x1a, 0x82, 0x8d, 0x5e, 0x71, 0x82, 0x92,
++      0x47, 0x02, 0x19, 0x05, 0x40, 0x2e, 0xa2, 0x57,
++      0xdc, 0xcb, 0xc3, 0xb8, 0x0f, 0xcd, 0x56, 0x75,
++      0x05, 0x6b, 0x68, 0xbb, 0x59, 0xe6, 0x2e, 0x88,
++      0x73
++};
++static const u8 enc_nonce051[] __initconst = {
++      0xbe, 0x40, 0xe5, 0xf1, 0xa1, 0x18, 0x17, 0xa0,
++      0xa8, 0xfa, 0x89, 0x49
++};
++static const u8 enc_key051[] __initconst = {
++      0x36, 0x37, 0x2a, 0xbc, 0xdb, 0x78, 0xe0, 0x27,
++      0x96, 0x46, 0xac, 0x3d, 0x17, 0x6b, 0x96, 0x74,
++      0xe9, 0x15, 0x4e, 0xec, 0xf0, 0xd5, 0x46, 0x9c,
++      0x65, 0x1e, 0xc7, 0xe1, 0x6b, 0x4c, 0x11, 0x99
++};
++
++/* wycheproof - misc */
++static const u8 enc_input052[] __initconst = {
++      0xa6, 0x67, 0x47, 0xc8, 0x9e, 0x85, 0x7a, 0xf3,
++      0xa1, 0x8e, 0x2c, 0x79, 0x50, 0x00, 0x87, 0xed
++};
++static const u8 enc_output052[] __initconst = {
++      0xca, 0x82, 0xbf, 0xf3, 0xe2, 0xf3, 0x10, 0xcc,
++      0xc9, 0x76, 0x67, 0x2c, 0x44, 0x15, 0xe6, 0x9b,
++      0x57, 0x63, 0x8c, 0x62, 0xa5, 0xd8, 0x5d, 0xed,
++      0x77, 0x4f, 0x91, 0x3c, 0x81, 0x3e, 0xa0, 0x32
++};
++static const u8 enc_assoc052[] __initconst = {
++      0x3f, 0x2d, 0xd4, 0x9b, 0xbf, 0x09, 0xd6, 0x9a,
++      0x78, 0xa3, 0xd8, 0x0e, 0xa2, 0x56, 0x66, 0x14,
++      0xfc, 0x37, 0x94, 0x74, 0x19, 0x6c, 0x1a, 0xae,
++      0x84, 0x58, 0x3d, 0xa7, 0x3d, 0x7f, 0xf8, 0x5c,
++      0x6f, 0x42, 0xca, 0x42, 0x05, 0x6a, 0x97, 0x92,
++      0xcc, 0x1b, 0x9f, 0xb3, 0xc7, 0xd2, 0x61
++};
++static const u8 enc_nonce052[] __initconst = {
++      0x84, 0xc8, 0x7d, 0xae, 0x4e, 0xee, 0x27, 0x73,
++      0x0e, 0xc3, 0x5d, 0x12
++};
++static const u8 enc_key052[] __initconst = {
++      0x9f, 0x14, 0x79, 0xed, 0x09, 0x7d, 0x7f, 0xe5,
++      0x29, 0xc1, 0x1f, 0x2f, 0x5a, 0xdd, 0x9a, 0xaf,
++      0xf4, 0xa1, 0xca, 0x0b, 0x68, 0x99, 0x7a, 0x2c,
++      0xb7, 0xf7, 0x97, 0x49, 0xbd, 0x90, 0xaa, 0xf4
++};
++
+ /* wycheproof - misc */
+ static const u8 enc_input053[] __initconst = {
+       0x25, 0x6d, 0x40, 0x88, 0x80, 0x94, 0x17, 0x83,
+@@ -2760,6 +3859,126 @@ static const u8 enc_key073[] __initconst
+ };
+ 
+ /* wycheproof - checking for int overflows */
++static const u8 enc_input074[] __initconst = {
++      0xd4, 0x50, 0x0b, 0xf0, 0x09, 0x49, 0x35, 0x51,
++      0xc3, 0x80, 0xad, 0xf5, 0x2c, 0x57, 0x3a, 0x69,
++      0xdf, 0x7e, 0x8b, 0x76, 0x24, 0x63, 0x33, 0x0f,
++      0xac, 0xc1, 0x6a, 0x57, 0x26, 0xbe, 0x71, 0x90,
++      0xc6, 0x3c, 0x5a, 0x1c, 0x92, 0x65, 0x84, 0xa0,
++      0x96, 0x75, 0x68, 0x28, 0xdc, 0xdc, 0x64, 0xac,
++      0xdf, 0x96, 0x3d, 0x93, 0x1b, 0xf1, 0xda, 0xe2,
++      0x38, 0xf3, 0xf1, 0x57, 0x22, 0x4a, 0xc4, 0xb5,
++      0x42, 0xd7, 0x85, 0xb0, 0xdd, 0x84, 0xdb, 0x6b,
++      0xe3, 0xbc, 0x5a, 0x36, 0x63, 0xe8, 0x41, 0x49,
++      0xff, 0xbe, 0xd0, 0x9e, 0x54, 0xf7, 0x8f, 0x16,
++      0xa8, 0x22, 0x3b, 0x24, 0xcb, 0x01, 0x9f, 0x58,
++      0xb2, 0x1b, 0x0e, 0x55, 0x1e, 0x7a, 0xa0, 0x73,
++      0x27, 0x62, 0x95, 0x51, 0x37, 0x6c, 0xcb, 0xc3,
++      0x93, 0x76, 0x71, 0xa0, 0x62, 0x9b, 0xd9, 0x5c,
++      0x99, 0x15, 0xc7, 0x85, 0x55, 0x77, 0x1e, 0x7a
++};
++static const u8 enc_output074[] __initconst = {
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0x0b, 0x30, 0x0d, 0x8d, 0xa5, 0x6c, 0x21, 0x85,
++      0x75, 0x52, 0x79, 0x55, 0x3c, 0x4c, 0x82, 0xca
++};
++static const u8 enc_assoc074[] __initconst = {
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
++};
++static const u8 enc_nonce074[] __initconst = {
++      0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30,
++      0x00, 0x02, 0x50, 0x6e
++};
++static const u8 enc_key074[] __initconst = {
++      0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30,
++      0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30,
++      0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30,
++      0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30
++};
++
++/* wycheproof - checking for int overflows */
++static const u8 enc_input075[] __initconst = {
++      0x7d, 0xe8, 0x7f, 0x67, 0x29, 0x94, 0x52, 0x75,
++      0xd0, 0x65, 0x5d, 0xa4, 0xc7, 0xfd, 0xe4, 0x56,
++      0x9e, 0x16, 0xf1, 0x11, 0xb5, 0xeb, 0x26, 0xc2,
++      0x2d, 0x85, 0x9e, 0x3f, 0xf8, 0x22, 0xec, 0xed,
++      0x3a, 0x6d, 0xd9, 0xa6, 0x0f, 0x22, 0x95, 0x7f,
++      0x7b, 0x7c, 0x85, 0x7e, 0x88, 0x22, 0xeb, 0x9f,
++      0xe0, 0xb8, 0xd7, 0x02, 0x21, 0x41, 0xf2, 0xd0,
++      0xb4, 0x8f, 0x4b, 0x56, 0x12, 0xd3, 0x22, 0xa8,
++      0x8d, 0xd0, 0xfe, 0x0b, 0x4d, 0x91, 0x79, 0x32,
++      0x4f, 0x7c, 0x6c, 0x9e, 0x99, 0x0e, 0xfb, 0xd8,
++      0x0e, 0x5e, 0xd6, 0x77, 0x58, 0x26, 0x49, 0x8b,
++      0x1e, 0xfe, 0x0f, 0x71, 0xa0, 0xf3, 0xec, 0x5b,
++      0x29, 0xcb, 0x28, 0xc2, 0x54, 0x0a, 0x7d, 0xcd,
++      0x51, 0xb7, 0xda, 0xae, 0xe0, 0xff, 0x4a, 0x7f,
++      0x3a, 0xc1, 0xee, 0x54, 0xc2, 0x9e, 0xe4, 0xc1,
++      0x70, 0xde, 0x40, 0x8f, 0x66, 0x69, 0x21, 0x94
++};
++static const u8 enc_output075[] __initconst = {
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xc5, 0x78, 0xe2, 0xaa, 0x44, 0xd3, 0x09, 0xb7,
++      0xb6, 0xa5, 0x19, 0x3b, 0xdc, 0x61, 0x18, 0xf5
++};
++static const u8 enc_assoc075[] __initconst = {
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
++};
++static const u8 enc_nonce075[] __initconst = {
++      0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30,
++      0x00, 0x03, 0x18, 0xa5
++};
++static const u8 enc_key075[] __initconst = {
++      0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30,
++      0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30,
++      0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30,
++      0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30
++};
++
++/* wycheproof - checking for int overflows */
+ static const u8 enc_input076[] __initconst = {
+       0x1b, 0x99, 0x6f, 0x9a, 0x3c, 0xcc, 0x67, 0x85,
+       0xde, 0x22, 0xff, 0x5b, 0x8a, 0xdd, 0x95, 0x02,
+@@ -3349,6 +4568,286 @@ static const u8 enc_key085[] __initconst
+       0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f
+ };
+ 
++/* wycheproof - special case tag */
++static const u8 enc_input086[] __initconst = {
++      0x9a, 0x49, 0xc4, 0x0f, 0x8b, 0x48, 0xd7, 0xc6,
++      0x6d, 0x1d, 0xb4, 0xe5, 0x3f, 0x20, 0xf2, 0xdd,
++      0x4a, 0xaa, 0x24, 0x1d, 0xda, 0xb2, 0x6b, 0x5b,
++      0xc0, 0xe2, 0x18, 0xb7, 0x2c, 0x33, 0x90, 0xf2,
++      0xdf, 0x3e, 0xbd, 0x01, 0x76, 0x70, 0x44, 0x19,
++      0x97, 0x2b, 0xcd, 0xbc, 0x6b, 0xbc, 0xb3, 0xe4,
++      0xe7, 0x4a, 0x71, 0x52, 0x8e, 0xf5, 0x12, 0x63,
++      0xce, 0x24, 0xe0, 0xd5, 0x75, 0xe0, 0xe4, 0x4d
++};
++static const u8 enc_output086[] __initconst = {
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
++      0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
++};
++static const u8 enc_assoc086[] __initconst = {
++      0x85, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xa6, 0x90, 0x2f, 0xcb, 0xc8, 0x83, 0xbb, 0xc1,
++      0x80, 0xb2, 0x56, 0xae, 0x34, 0xad, 0x7f, 0x00
++};
++static const u8 enc_nonce086[] __initconst = {
++      0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
++      0x08, 0x09, 0x0a, 0x0b
++};
++static const u8 enc_key086[] __initconst = {
++      0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
++      0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
++      0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
++      0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f
++};
++
++/* wycheproof - special case tag */
++static const u8 enc_input087[] __initconst = {
++      0x9a, 0x49, 0xc4, 0x0f, 0x8b, 0x48, 0xd7, 0xc6,
++      0x6d, 0x1d, 0xb4, 0xe5, 0x3f, 0x20, 0xf2, 0xdd,
++      0x4a, 0xaa, 0x24, 0x1d, 0xda, 0xb2, 0x6b, 0x5b,
++      0xc0, 0xe2, 0x18, 0xb7, 0x2c, 0x33, 0x90, 0xf2,
++      0xdf, 0x3e, 0xbd, 0x01, 0x76, 0x70, 0x44, 0x19,
++      0x97, 0x2b, 0xcd, 0xbc, 0x6b, 0xbc, 0xb3, 0xe4,
++      0xe7, 0x4a, 0x71, 0x52, 0x8e, 0xf5, 0x12, 0x63,
++      0xce, 0x24, 0xe0, 0xd5, 0x75, 0xe0, 0xe4, 0x4d
++};
++static const u8 enc_output087[] __initconst = {
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
++};
++static const u8 enc_assoc087[] __initconst = {
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0x24, 0x7e, 0x50, 0x64, 0x2a, 0x1c, 0x0a, 0x2f,
++      0x8f, 0x77, 0x21, 0x96, 0x09, 0xdb, 0xa9, 0x58
++};
++static const u8 enc_nonce087[] __initconst = {
++      0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
++      0x08, 0x09, 0x0a, 0x0b
++};
++static const u8 enc_key087[] __initconst = {
++      0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
++      0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
++      0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
++      0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f
++};
++
++/* wycheproof - special case tag */
++static const u8 enc_input088[] __initconst = {
++      0x9a, 0x49, 0xc4, 0x0f, 0x8b, 0x48, 0xd7, 0xc6,
++      0x6d, 0x1d, 0xb4, 0xe5, 0x3f, 0x20, 0xf2, 0xdd,
++      0x4a, 0xaa, 0x24, 0x1d, 0xda, 0xb2, 0x6b, 0x5b,
++      0xc0, 0xe2, 0x18, 0xb7, 0x2c, 0x33, 0x90, 0xf2,
++      0xdf, 0x3e, 0xbd, 0x01, 0x76, 0x70, 0x44, 0x19,
++      0x97, 0x2b, 0xcd, 0xbc, 0x6b, 0xbc, 0xb3, 0xe4,
++      0xe7, 0x4a, 0x71, 0x52, 0x8e, 0xf5, 0x12, 0x63,
++      0xce, 0x24, 0xe0, 0xd5, 0x75, 0xe0, 0xe4, 0x4d
++};
++static const u8 enc_output088[] __initconst = {
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
++};
++static const u8 enc_assoc088[] __initconst = {
++      0x7c, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xd9, 0xe7, 0x2c, 0x06, 0x4a, 0xc8, 0x96, 0x1f,
++      0x3f, 0xa5, 0x85, 0xe0, 0xe2, 0xab, 0xd6, 0x00
++};
++static const u8 enc_nonce088[] __initconst = {
++      0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
++      0x08, 0x09, 0x0a, 0x0b
++};
++static const u8 enc_key088[] __initconst = {
++      0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
++      0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
++      0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
++      0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f
++};
++
++/* wycheproof - special case tag */
++static const u8 enc_input089[] __initconst = {
++      0x9a, 0x49, 0xc4, 0x0f, 0x8b, 0x48, 0xd7, 0xc6,
++      0x6d, 0x1d, 0xb4, 0xe5, 0x3f, 0x20, 0xf2, 0xdd,
++      0x4a, 0xaa, 0x24, 0x1d, 0xda, 0xb2, 0x6b, 0x5b,
++      0xc0, 0xe2, 0x18, 0xb7, 0x2c, 0x33, 0x90, 0xf2,
++      0xdf, 0x3e, 0xbd, 0x01, 0x76, 0x70, 0x44, 0x19,
++      0x97, 0x2b, 0xcd, 0xbc, 0x6b, 0xbc, 0xb3, 0xe4,
++      0xe7, 0x4a, 0x71, 0x52, 0x8e, 0xf5, 0x12, 0x63,
++      0xce, 0x24, 0xe0, 0xd5, 0x75, 0xe0, 0xe4, 0x4d
++};
++static const u8 enc_output089[] __initconst = {
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80,
++      0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80
++};
++static const u8 enc_assoc089[] __initconst = {
++      0x65, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0x95, 0xaf, 0x0f, 0x4d, 0x0b, 0x68, 0x6e, 0xae,
++      0xcc, 0xca, 0x43, 0x07, 0xd5, 0x96, 0xf5, 0x02
++};
++static const u8 enc_nonce089[] __initconst = {
++      0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
++      0x08, 0x09, 0x0a, 0x0b
++};
++static const u8 enc_key089[] __initconst = {
++      0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
++      0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
++      0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
++      0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f
++};
++
++/* wycheproof - special case tag */
++static const u8 enc_input090[] __initconst = {
++      0x9a, 0x49, 0xc4, 0x0f, 0x8b, 0x48, 0xd7, 0xc6,
++      0x6d, 0x1d, 0xb4, 0xe5, 0x3f, 0x20, 0xf2, 0xdd,
++      0x4a, 0xaa, 0x24, 0x1d, 0xda, 0xb2, 0x6b, 0x5b,
++      0xc0, 0xe2, 0x18, 0xb7, 0x2c, 0x33, 0x90, 0xf2,
++      0xdf, 0x3e, 0xbd, 0x01, 0x76, 0x70, 0x44, 0x19,
++      0x97, 0x2b, 0xcd, 0xbc, 0x6b, 0xbc, 0xb3, 0xe4,
++      0xe7, 0x4a, 0x71, 0x52, 0x8e, 0xf5, 0x12, 0x63,
++      0xce, 0x24, 0xe0, 0xd5, 0x75, 0xe0, 0xe4, 0x4d
++};
++static const u8 enc_output090[] __initconst = {
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0x7f, 0xff, 0xff, 0xff, 0x7f,
++      0xff, 0xff, 0xff, 0x7f, 0xff, 0xff, 0xff, 0x7f
++};
++static const u8 enc_assoc090[] __initconst = {
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0x85, 0x40, 0xb4, 0x64, 0x35, 0x77, 0x07, 0xbe,
++      0x3a, 0x39, 0xd5, 0x5c, 0x34, 0xf8, 0xbc, 0xb3
++};
++static const u8 enc_nonce090[] __initconst = {
++      0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
++      0x08, 0x09, 0x0a, 0x0b
++};
++static const u8 enc_key090[] __initconst = {
++      0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
++      0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
++      0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
++      0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f
++};
++
++/* wycheproof - special case tag */
++static const u8 enc_input091[] __initconst = {
++      0x9a, 0x49, 0xc4, 0x0f, 0x8b, 0x48, 0xd7, 0xc6,
++      0x6d, 0x1d, 0xb4, 0xe5, 0x3f, 0x20, 0xf2, 0xdd,
++      0x4a, 0xaa, 0x24, 0x1d, 0xda, 0xb2, 0x6b, 0x5b,
++      0xc0, 0xe2, 0x18, 0xb7, 0x2c, 0x33, 0x90, 0xf2,
++      0xdf, 0x3e, 0xbd, 0x01, 0x76, 0x70, 0x44, 0x19,
++      0x97, 0x2b, 0xcd, 0xbc, 0x6b, 0xbc, 0xb3, 0xe4,
++      0xe7, 0x4a, 0x71, 0x52, 0x8e, 0xf5, 0x12, 0x63,
++      0xce, 0x24, 0xe0, 0xd5, 0x75, 0xe0, 0xe4, 0x4d
++};
++static const u8 enc_output091[] __initconst = {
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
++      0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00
++};
++static const u8 enc_assoc091[] __initconst = {
++      0x4f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0x66, 0x23, 0xd9, 0x90, 0xb8, 0x98, 0xd8, 0x30,
++      0xd2, 0x12, 0xaf, 0x23, 0x83, 0x33, 0x07, 0x01
++};
++static const u8 enc_nonce091[] __initconst = {
++      0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
++      0x08, 0x09, 0x0a, 0x0b
++};
++static const u8 enc_key091[] __initconst = {
++      0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
++      0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
++      0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
++      0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f
++};
++
++/* wycheproof - special case tag */
++static const u8 enc_input092[] __initconst = {
++      0x9a, 0x49, 0xc4, 0x0f, 0x8b, 0x48, 0xd7, 0xc6,
++      0x6d, 0x1d, 0xb4, 0xe5, 0x3f, 0x20, 0xf2, 0xdd,
++      0x4a, 0xaa, 0x24, 0x1d, 0xda, 0xb2, 0x6b, 0x5b,
++      0xc0, 0xe2, 0x18, 0xb7, 0x2c, 0x33, 0x90, 0xf2,
++      0xdf, 0x3e, 0xbd, 0x01, 0x76, 0x70, 0x44, 0x19,
++      0x97, 0x2b, 0xcd, 0xbc, 0x6b, 0xbc, 0xb3, 0xe4,
++      0xe7, 0x4a, 0x71, 0x52, 0x8e, 0xf5, 0x12, 0x63,
++      0xce, 0x24, 0xe0, 0xd5, 0x75, 0xe0, 0xe4, 0x4d
++};
++static const u8 enc_output092[] __initconst = {
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00,
++      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
++};
++static const u8 enc_assoc092[] __initconst = {
++      0x83, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++      0x5f, 0x16, 0xd0, 0x9f, 0x17, 0x78, 0x72, 0x11,
++      0xb7, 0xd4, 0x84, 0xe0, 0x24, 0xf8, 0x97, 0x01
++};
++static const u8 enc_nonce092[] __initconst = {
++      0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
++      0x08, 0x09, 0x0a, 0x0b
++};
++static const u8 enc_key092[] __initconst = {
++      0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
++      0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
++      0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
++      0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f
++};
++
+ /* wycheproof - edge case intermediate sums in poly1305 */
+ static const u8 enc_input093[] __initconst = {
+       0x00, 0x52, 0x35, 0xd2, 0xa9, 0x19, 0xf2, 0x8d,
+@@ -4455,6 +5954,86 @@ chacha20poly1305_enc_vectors[] __initcon
+         sizeof(enc_input011), sizeof(enc_assoc011), sizeof(enc_nonce011) },
+       { enc_input012, enc_output012, enc_assoc012, enc_nonce012, enc_key012,
+         sizeof(enc_input012), sizeof(enc_assoc012), sizeof(enc_nonce012) },
++      { enc_input013, enc_output013, enc_assoc013, enc_nonce013, enc_key013,
++        sizeof(enc_input013), sizeof(enc_assoc013), sizeof(enc_nonce013) },
++      { enc_input014, enc_output014, enc_assoc014, enc_nonce014, enc_key014,
++        sizeof(enc_input014), sizeof(enc_assoc014), sizeof(enc_nonce014) },
++      { enc_input015, enc_output015, enc_assoc015, enc_nonce015, enc_key015,
++        sizeof(enc_input015), sizeof(enc_assoc015), sizeof(enc_nonce015) },
++      { enc_input016, enc_output016, enc_assoc016, enc_nonce016, enc_key016,
++        sizeof(enc_input016), sizeof(enc_assoc016), sizeof(enc_nonce016) },
++      { enc_input017, enc_output017, enc_assoc017, enc_nonce017, enc_key017,
++        sizeof(enc_input017), sizeof(enc_assoc017), sizeof(enc_nonce017) },
++      { enc_input018, enc_output018, enc_assoc018, enc_nonce018, enc_key018,
++        sizeof(enc_input018), sizeof(enc_assoc018), sizeof(enc_nonce018) },
++      { enc_input019, enc_output019, enc_assoc019, enc_nonce019, enc_key019,
++        sizeof(enc_input019), sizeof(enc_assoc019), sizeof(enc_nonce019) },
++      { enc_input020, enc_output020, enc_assoc020, enc_nonce020, enc_key020,
++        sizeof(enc_input020), sizeof(enc_assoc020), sizeof(enc_nonce020) },
++      { enc_input021, enc_output021, enc_assoc021, enc_nonce021, enc_key021,
++        sizeof(enc_input021), sizeof(enc_assoc021), sizeof(enc_nonce021) },
++      { enc_input022, enc_output022, enc_assoc022, enc_nonce022, enc_key022,
++        sizeof(enc_input022), sizeof(enc_assoc022), sizeof(enc_nonce022) },
++      { enc_input023, enc_output023, enc_assoc023, enc_nonce023, enc_key023,
++        sizeof(enc_input023), sizeof(enc_assoc023), sizeof(enc_nonce023) },
++      { enc_input024, enc_output024, enc_assoc024, enc_nonce024, enc_key024,
++        sizeof(enc_input024), sizeof(enc_assoc024), sizeof(enc_nonce024) },
++      { enc_input025, enc_output025, enc_assoc025, enc_nonce025, enc_key025,
++        sizeof(enc_input025), sizeof(enc_assoc025), sizeof(enc_nonce025) },
++      { enc_input026, enc_output026, enc_assoc026, enc_nonce026, enc_key026,
++        sizeof(enc_input026), sizeof(enc_assoc026), sizeof(enc_nonce026) },
++      { enc_input027, enc_output027, enc_assoc027, enc_nonce027, enc_key027,
++        sizeof(enc_input027), sizeof(enc_assoc027), sizeof(enc_nonce027) },
++      { enc_input028, enc_output028, enc_assoc028, enc_nonce028, enc_key028,
++        sizeof(enc_input028), sizeof(enc_assoc028), sizeof(enc_nonce028) },
++      { enc_input029, enc_output029, enc_assoc029, enc_nonce029, enc_key029,
++        sizeof(enc_input029), sizeof(enc_assoc029), sizeof(enc_nonce029) },
++      { enc_input030, enc_output030, enc_assoc030, enc_nonce030, enc_key030,
++        sizeof(enc_input030), sizeof(enc_assoc030), sizeof(enc_nonce030) },
++      { enc_input031, enc_output031, enc_assoc031, enc_nonce031, enc_key031,
++        sizeof(enc_input031), sizeof(enc_assoc031), sizeof(enc_nonce031) },
++      { enc_input032, enc_output032, enc_assoc032, enc_nonce032, enc_key032,
++        sizeof(enc_input032), sizeof(enc_assoc032), sizeof(enc_nonce032) },
++      { enc_input033, enc_output033, enc_assoc033, enc_nonce033, enc_key033,
++        sizeof(enc_input033), sizeof(enc_assoc033), sizeof(enc_nonce033) },
++      { enc_input034, enc_output034, enc_assoc034, enc_nonce034, enc_key034,
++        sizeof(enc_input034), sizeof(enc_assoc034), sizeof(enc_nonce034) },
++      { enc_input035, enc_output035, enc_assoc035, enc_nonce035, enc_key035,
++        sizeof(enc_input035), sizeof(enc_assoc035), sizeof(enc_nonce035) },
++      { enc_input036, enc_output036, enc_assoc036, enc_nonce036, enc_key036,
++        sizeof(enc_input036), sizeof(enc_assoc036), sizeof(enc_nonce036) },
++      { enc_input037, enc_output037, enc_assoc037, enc_nonce037, enc_key037,
++        sizeof(enc_input037), sizeof(enc_assoc037), sizeof(enc_nonce037) },
++      { enc_input038, enc_output038, enc_assoc038, enc_nonce038, enc_key038,
++        sizeof(enc_input038), sizeof(enc_assoc038), sizeof(enc_nonce038) },
++      { enc_input039, enc_output039, enc_assoc039, enc_nonce039, enc_key039,
++        sizeof(enc_input039), sizeof(enc_assoc039), sizeof(enc_nonce039) },
++      { enc_input040, enc_output040, enc_assoc040, enc_nonce040, enc_key040,
++        sizeof(enc_input040), sizeof(enc_assoc040), sizeof(enc_nonce040) },
++      { enc_input041, enc_output041, enc_assoc041, enc_nonce041, enc_key041,
++        sizeof(enc_input041), sizeof(enc_assoc041), sizeof(enc_nonce041) },
++      { enc_input042, enc_output042, enc_assoc042, enc_nonce042, enc_key042,
++        sizeof(enc_input042), sizeof(enc_assoc042), sizeof(enc_nonce042) },
++      { enc_input043, enc_output043, enc_assoc043, enc_nonce043, enc_key043,
++        sizeof(enc_input043), sizeof(enc_assoc043), sizeof(enc_nonce043) },
++      { enc_input044, enc_output044, enc_assoc044, enc_nonce044, enc_key044,
++        sizeof(enc_input044), sizeof(enc_assoc044), sizeof(enc_nonce044) },
++      { enc_input045, enc_output045, enc_assoc045, enc_nonce045, enc_key045,
++        sizeof(enc_input045), sizeof(enc_assoc045), sizeof(enc_nonce045) },
++      { enc_input046, enc_output046, enc_assoc046, enc_nonce046, enc_key046,
++        sizeof(enc_input046), sizeof(enc_assoc046), sizeof(enc_nonce046) },
++      { enc_input047, enc_output047, enc_assoc047, enc_nonce047, enc_key047,
++        sizeof(enc_input047), sizeof(enc_assoc047), sizeof(enc_nonce047) },
++      { enc_input048, enc_output048, enc_assoc048, enc_nonce048, enc_key048,
++        sizeof(enc_input048), sizeof(enc_assoc048), sizeof(enc_nonce048) },
++      { enc_input049, enc_output049, enc_assoc049, enc_nonce049, enc_key049,
++        sizeof(enc_input049), sizeof(enc_assoc049), sizeof(enc_nonce049) },
++      { enc_input050, enc_output050, enc_assoc050, enc_nonce050, enc_key050,
++        sizeof(enc_input050), sizeof(enc_assoc050), sizeof(enc_nonce050) },
++      { enc_input051, enc_output051, enc_assoc051, enc_nonce051, enc_key051,
++        sizeof(enc_input051), sizeof(enc_assoc051), sizeof(enc_nonce051) },
++      { enc_input052, enc_output052, enc_assoc052, enc_nonce052, enc_key052,
++        sizeof(enc_input052), sizeof(enc_assoc052), sizeof(enc_nonce052) },
+       { enc_input053, enc_output053, enc_assoc053, enc_nonce053, enc_key053,
+         sizeof(enc_input053), sizeof(enc_assoc053), sizeof(enc_nonce053) },
+       { enc_input054, enc_output054, enc_assoc054, enc_nonce054, enc_key054,
+@@ -4497,6 +6076,10 @@ chacha20poly1305_enc_vectors[] __initcon
+         sizeof(enc_input072), sizeof(enc_assoc072), sizeof(enc_nonce072) },
+       { enc_input073, enc_output073, enc_assoc073, enc_nonce073, enc_key073,
+         sizeof(enc_input073), sizeof(enc_assoc073), sizeof(enc_nonce073) },
++      { enc_input074, enc_output074, enc_assoc074, enc_nonce074, enc_key074,
++        sizeof(enc_input074), sizeof(enc_assoc074), sizeof(enc_nonce074) },
++      { enc_input075, enc_output075, enc_assoc075, enc_nonce075, enc_key075,
++        sizeof(enc_input075), sizeof(enc_assoc075), sizeof(enc_nonce075) },
+       { enc_input076, enc_output076, enc_assoc076, enc_nonce076, enc_key076,
+         sizeof(enc_input076), sizeof(enc_assoc076), sizeof(enc_nonce076) },
+       { enc_input077, enc_output077, enc_assoc077, enc_nonce077, enc_key077,
+@@ -4517,6 +6100,20 @@ chacha20poly1305_enc_vectors[] __initcon
+         sizeof(enc_input084), sizeof(enc_assoc084), sizeof(enc_nonce084) },
+       { enc_input085, enc_output085, enc_assoc085, enc_nonce085, enc_key085,
+         sizeof(enc_input085), sizeof(enc_assoc085), sizeof(enc_nonce085) },
++      { enc_input086, enc_output086, enc_assoc086, enc_nonce086, enc_key086,
++        sizeof(enc_input086), sizeof(enc_assoc086), sizeof(enc_nonce086) },
++      { enc_input087, enc_output087, enc_assoc087, enc_nonce087, enc_key087,
++        sizeof(enc_input087), sizeof(enc_assoc087), sizeof(enc_nonce087) },
++      { enc_input088, enc_output088, enc_assoc088, enc_nonce088, enc_key088,
++        sizeof(enc_input088), sizeof(enc_assoc088), sizeof(enc_nonce088) },
++      { enc_input089, enc_output089, enc_assoc089, enc_nonce089, enc_key089,
++        sizeof(enc_input089), sizeof(enc_assoc089), sizeof(enc_nonce089) },
++      { enc_input090, enc_output090, enc_assoc090, enc_nonce090, enc_key090,
++        sizeof(enc_input090), sizeof(enc_assoc090), sizeof(enc_nonce090) },
++      { enc_input091, enc_output091, enc_assoc091, enc_nonce091, enc_key091,
++        sizeof(enc_input091), sizeof(enc_assoc091), sizeof(enc_nonce091) },
++      { enc_input092, enc_output092, enc_assoc092, enc_nonce092, enc_key092,
++        sizeof(enc_input092), sizeof(enc_assoc092), sizeof(enc_nonce092) },
+       { enc_input093, enc_output093, enc_assoc093, enc_nonce093, enc_key093,
+         sizeof(enc_input093), sizeof(enc_assoc093), sizeof(enc_nonce093) },
+       { enc_input094, enc_output094, enc_assoc094, enc_nonce094, enc_key094,
+@@ -7224,6 +8821,43 @@ xchacha20poly1305_dec_vectors[] __initco
+         sizeof(xdec_input001), sizeof(xdec_assoc001), sizeof(xdec_nonce001) }
+ };
+ 
++/* Selftest-only encryption with a full 12-byte nonce: block0 of the ChaCha20
++ * stream keys Poly1305, which then MACs padded ad, padded dst and both lens.
++ */
++static void __init
++chacha20poly1305_encrypt_bignonce(u8 *dst, const u8 *src, const size_t src_len,
++                                const u8 *ad, const size_t ad_len,
++                                const u8 nonce[12],
++                                const u8 key[CHACHA20POLY1305_KEY_SIZE])
++{
++      const u8 *pad0 = page_address(ZERO_PAGE(0)); /* zeros for padding */
++      struct poly1305_desc_ctx poly1305_state;
++      u32 chacha20_state[CHACHA_STATE_WORDS];
++      union {
++              u8 block0[POLY1305_KEY_SIZE];
++              __le64 lens[2];
++      } b = {{ 0 }}; /* lens reuses block0's storage */
++      u8 bottom_row[16] = { 0 };
++      u32 le_key[8];
++      int i;
++
++      memcpy(&bottom_row[4], nonce, 12); /* bytes 0-3 stay zero */
++      for (i = 0; i < 8; ++i)
++              le_key[i] = get_unaligned_le32(key + sizeof(le_key[i]) * i);
++      chacha_init(chacha20_state, le_key, bottom_row);
++      chacha20_crypt(chacha20_state, b.block0, b.block0, sizeof(b.block0));
++      poly1305_init(&poly1305_state, b.block0);
++      poly1305_update(&poly1305_state, ad, ad_len);
++      poly1305_update(&poly1305_state, pad0, (0x10 - ad_len) & 0xf);
++      chacha20_crypt(chacha20_state, dst, src, src_len);
++      poly1305_update(&poly1305_state, dst, src_len);
++      poly1305_update(&poly1305_state, pad0, (0x10 - src_len) & 0xf);
++      b.lens[0] = cpu_to_le64(ad_len);
++      b.lens[1] = cpu_to_le64(src_len);
++      poly1305_update(&poly1305_state, (u8 *)b.lens, sizeof(b.lens));
++      poly1305_final(&poly1305_state, dst + src_len); /* append the tag */
++}
++
+ static void __init
+ chacha20poly1305_selftest_encrypt(u8 *dst, const u8 *src, const size_t src_len,
+                                 const u8 *ad, const size_t ad_len,
+@@ -7233,6 +8867,9 @@ chacha20poly1305_selftest_encrypt(u8 *ds
+       if (nonce_len == 8)
+               chacha20poly1305_encrypt(dst, src, src_len, ad, ad_len,
+                                        get_unaligned_le64(nonce), key);
++      else if (nonce_len == 12)
++              chacha20poly1305_encrypt_bignonce(dst, src, src_len, ad,
++                                                ad_len, nonce, key);
+       else
+               BUG();
+ }
+@@ -7248,14 +8885,14 @@ decryption_success(bool func_ret, bool e
+ bool __init chacha20poly1305_selftest(void)
+ {
+       enum { MAXIMUM_TEST_BUFFER_LEN = 1UL << 12 };
+-      size_t i;
+-      u8 *computed_output = NULL, *heap_src = NULL;
+-      struct scatterlist sg_src;
++      size_t i, j, k, total_len;
++      u8 *computed_output = NULL, *input = NULL;
+       bool success = true, ret;
++      struct scatterlist sg_src[3];
+ 
+-      heap_src = kmalloc(MAXIMUM_TEST_BUFFER_LEN, GFP_KERNEL);
+       computed_output = kmalloc(MAXIMUM_TEST_BUFFER_LEN, GFP_KERNEL);
+-      if (!heap_src || !computed_output) {
++      input = kmalloc(MAXIMUM_TEST_BUFFER_LEN, GFP_KERNEL);
++      if (!computed_output || !input) {
+               pr_err("chacha20poly1305 self-test malloc: FAIL\n");
+               success = false;
+               goto out;
+@@ -7284,17 +8921,17 @@ bool __init chacha20poly1305_selftest(vo
+       for (i = 0; i < ARRAY_SIZE(chacha20poly1305_enc_vectors); ++i) {
+               if (chacha20poly1305_enc_vectors[i].nlen != 8)
+                       continue;
+-              memcpy(heap_src, chacha20poly1305_enc_vectors[i].input,
++              memcpy(computed_output, chacha20poly1305_enc_vectors[i].input,
+                      chacha20poly1305_enc_vectors[i].ilen);
+-              sg_init_one(&sg_src, heap_src,
++              sg_init_one(sg_src, computed_output,
+                           chacha20poly1305_enc_vectors[i].ilen + POLY1305_DIGEST_SIZE);
+-              chacha20poly1305_encrypt_sg_inplace(&sg_src,
++              ret = chacha20poly1305_encrypt_sg_inplace(sg_src,
+                       chacha20poly1305_enc_vectors[i].ilen,
+                       chacha20poly1305_enc_vectors[i].assoc,
+                       chacha20poly1305_enc_vectors[i].alen,
+                       get_unaligned_le64(chacha20poly1305_enc_vectors[i].nonce),
+                       chacha20poly1305_enc_vectors[i].key);
+-              if (memcmp(heap_src,
++              if (!ret || memcmp(computed_output,
+                                  chacha20poly1305_enc_vectors[i].output,
+                                  chacha20poly1305_enc_vectors[i].ilen +
+                                                       POLY1305_DIGEST_SIZE)) {
+@@ -7326,11 +8963,11 @@ bool __init chacha20poly1305_selftest(vo
+       }
+ 
+       for (i = 0; i < ARRAY_SIZE(chacha20poly1305_dec_vectors); ++i) {
+-              memcpy(heap_src, chacha20poly1305_dec_vectors[i].input,
++              memcpy(computed_output, chacha20poly1305_dec_vectors[i].input,
+                      chacha20poly1305_dec_vectors[i].ilen);
+-              sg_init_one(&sg_src, heap_src,
++              sg_init_one(sg_src, computed_output,
+                           chacha20poly1305_dec_vectors[i].ilen);
+-              ret = chacha20poly1305_decrypt_sg_inplace(&sg_src,
++              ret = chacha20poly1305_decrypt_sg_inplace(sg_src,
+                       chacha20poly1305_dec_vectors[i].ilen,
+                       chacha20poly1305_dec_vectors[i].assoc,
+                       chacha20poly1305_dec_vectors[i].alen,
+@@ -7338,7 +8975,7 @@ bool __init chacha20poly1305_selftest(vo
+                       chacha20poly1305_dec_vectors[i].key);
+               if (!decryption_success(ret,
+                       chacha20poly1305_dec_vectors[i].failure,
+-                      memcmp(heap_src, chacha20poly1305_dec_vectors[i].output,
++                      memcmp(computed_output, chacha20poly1305_dec_vectors[i].output,
+                              chacha20poly1305_dec_vectors[i].ilen -
+                                                       POLY1305_DIGEST_SIZE))) {
+                       pr_err("chacha20poly1305 sg decryption self-test %zu: FAIL\n",
+@@ -7365,6 +9002,7 @@ bool __init chacha20poly1305_selftest(vo
+                       success = false;
+               }
+       }
++
+       for (i = 0; i < ARRAY_SIZE(xchacha20poly1305_dec_vectors); ++i) {
+               memset(computed_output, 0, MAXIMUM_TEST_BUFFER_LEN);
+               ret = xchacha20poly1305_decrypt(computed_output,
+@@ -7386,8 +9024,54 @@ bool __init chacha20poly1305_selftest(vo
+               }
+       }
+ 
++      for (total_len = POLY1305_DIGEST_SIZE; IS_ENABLED(DEBUG_CHACHA20POLY1305_SLOW_CHUNK_TEST)
++           && total_len <= 1 << 10; ++total_len) {
++              for (i = 0; i <= total_len; ++i) {
++                      for (j = i; j <= total_len; ++j) {
++                              sg_init_table(sg_src, 3);
++                              sg_set_buf(&sg_src[0], input, i);
++                              sg_set_buf(&sg_src[1], input + i, j - i);
++                              sg_set_buf(&sg_src[2], input + j, total_len - j);
++                              memset(computed_output, 0, total_len);
++                              memset(input, 0, total_len);
++
++                              if (!chacha20poly1305_encrypt_sg_inplace(sg_src,
++                                      total_len - POLY1305_DIGEST_SIZE, NULL, 0,
++                                      0, enc_key001))
++                                      goto chunkfail;
++                              chacha20poly1305_encrypt(computed_output,
++                                      computed_output,
++                                      total_len - POLY1305_DIGEST_SIZE, NULL, 0, 0,
++                                      enc_key001);
++                              if (memcmp(computed_output, input, total_len))
++                                      goto chunkfail;
++                              if (!chacha20poly1305_decrypt(computed_output,
++                                      input, total_len, NULL, 0, 0, enc_key001))
++                                      goto chunkfail;
++                              for (k = 0; k < total_len - POLY1305_DIGEST_SIZE; ++k) {
++                                      if (computed_output[k])
++                                              goto chunkfail;
++                              }
++                              if (!chacha20poly1305_decrypt_sg_inplace(sg_src,
++                                      total_len, NULL, 0, 0, enc_key001))
++                                      goto chunkfail;
++                              for (k = 0; k < total_len - POLY1305_DIGEST_SIZE; ++k) {
++                                      if (input[k])
++                                              goto chunkfail;
++                              }
++                              continue;
++
++                      chunkfail:
++                              pr_err("chacha20poly1305 chunked self-test %zu/%zu/%zu: FAIL\n",
++                                     total_len, i, j);
++                              success = false;
++                      }
++
++              }
++      }
++
+ out:
+-      kfree(heap_src);
+       kfree(computed_output);
++      kfree(input);
+       return success;
+ }
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0048-crypto-x86-poly1305-emit-does-base-conversion-itself.patch b/target/linux/generic/backport-5.4/080-wireguard-0048-crypto-x86-poly1305-emit-does-base-conversion-itself.patch

new file mode 100644 (file)

index 0000000..68af53f
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0048-crypto-x86-poly1305-emit-does-base-conversion-itself.patch
@@ -0,0 +1,37 @@
+From 722ccb5da4bab4e142e4dc1eea10406a08547c7b Mon Sep 17 00:00:00 2001
+From: "Jason A. Donenfeld" <Jason@zx2c4.com>
+Date: Fri, 17 Jan 2020 11:42:22 +0100
+Subject: [PATCH 048/124] crypto: x86/poly1305 - emit does base conversion
+ itself
+
+commit f9e7fe32a792726186301423ff63a465d63386e1 upstream.
+
+The emit code does optional base conversion itself in assembly, so we
+don't need to do that here. Also, neither one of these functions uses
+simd instructions, so checking for that doesn't make sense either.
+
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ arch/x86/crypto/poly1305_glue.c | 8 ++------
+ 1 file changed, 2 insertions(+), 6 deletions(-)
+
+--- a/arch/x86/crypto/poly1305_glue.c
++++ b/arch/x86/crypto/poly1305_glue.c
+@@ -123,13 +123,9 @@ static void poly1305_simd_blocks(void *c
+ static void poly1305_simd_emit(void *ctx, u8 mac[POLY1305_DIGEST_SIZE],
+                              const u32 nonce[4])
+ {
+-      struct poly1305_arch_internal *state = ctx;
+-
+-      if (!IS_ENABLED(CONFIG_AS_AVX) || !static_branch_likely(&poly1305_use_avx) ||
+-          !state->is_base2_26 || !crypto_simd_usable()) {
+-              convert_to_base2_64(ctx);
++      if (!IS_ENABLED(CONFIG_AS_AVX) || !static_branch_likely(&poly1305_use_avx))
+               poly1305_emit_x86_64(ctx, mac, nonce);
+-      } else
++      else
+               poly1305_emit_avx(ctx, mac, nonce);
+ }
+ 
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0049-crypto-arm-chacha-fix-build-failured-when-kernel-mod.patch b/target/linux/generic/backport-5.4/080-wireguard-0049-crypto-arm-chacha-fix-build-failured-when-kernel-mod.patch

new file mode 100644 (file)

index 0000000..392d52f
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0049-crypto-arm-chacha-fix-build-failured-when-kernel-mod.patch
@@ -0,0 +1,58 @@
+From 627e2c8313065e627fe5c8c9f82cebd765f5a65e Mon Sep 17 00:00:00 2001
+From: Ard Biesheuvel <ardb@kernel.org>
+Date: Fri, 17 Jan 2020 17:43:18 +0100
+Subject: [PATCH 049/124] crypto: arm/chacha - fix build failured when kernel
+ mode NEON is disabled
+
+commit 0bc81767c5bd9d005fae1099fb39eb3688370cb1 upstream.
+
+When the ARM accelerated ChaCha driver is built as part of a configuration
+that has kernel mode NEON disabled, we expect the compiler to propagate
+the build time constant expression IS_ENABLED(CONFIG_KERNEL_MODE_NEON) in
+a way that eliminates all the cross-object references to the actual NEON
+routines, which allows the chacha-neon-core.o object to be omitted from
+the build entirely.
+
+Unfortunately, this fails to work as expected in some cases, and we may
+end up with a build error such as
+
+  chacha-glue.c:(.text+0xc0): undefined reference to `chacha_4block_xor_neon'
+
+caused by the fact that chacha_doneon() has not been eliminated from the
+object code, even though it will never be called in practice.
+
+Let's fix this by adding some IS_ENABLED(CONFIG_KERNEL_MODE_NEON) tests
+that are not strictly needed from a logical point of view, but should
+help the compiler infer that the NEON code paths are unreachable in
+those cases.
+
+Fixes: b36d8c09e710c71f ("crypto: arm/chacha - remove dependency on generic ...")
+Reported-by: Russell King <linux@armlinux.org.uk>
+Cc: Arnd Bergmann <arnd@arndb.de>
+Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
+Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ arch/arm/crypto/chacha-glue.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/arch/arm/crypto/chacha-glue.c
++++ b/arch/arm/crypto/chacha-glue.c
+@@ -115,7 +115,7 @@ static int chacha_stream_xor(struct skci
+               if (nbytes < walk.total)
+                       nbytes = round_down(nbytes, walk.stride);
+ 
+-              if (!neon) {
++              if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon) {
+                       chacha_doarm(walk.dst.virt.addr, walk.src.virt.addr,
+                                    nbytes, state, ctx->nrounds);
+                       state[12] += DIV_ROUND_UP(nbytes, CHACHA_BLOCK_SIZE);
+@@ -159,7 +159,7 @@ static int do_xchacha(struct skcipher_re
+ 
+       chacha_init_generic(state, ctx->key, req->iv);
+ 
+-      if (!neon) {
++      if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon) {
+               hchacha_block_arm(state, subctx.key, ctx->nrounds);
+       } else {
+               kernel_neon_begin();
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0050-crypto-Kconfig-allow-tests-to-be-disabled-when-manag.patch b/target/linux/generic/backport-5.4/080-wireguard-0050-crypto-Kconfig-allow-tests-to-be-disabled-when-manag.patch

new file mode 100644 (file)

index 0000000..88ce184
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0050-crypto-Kconfig-allow-tests-to-be-disabled-when-manag.patch
@@ -0,0 +1,40 @@
+From 5e8381a3dc454813605aef01de31985f0f6bf130 Mon Sep 17 00:00:00 2001
+From: "Jason A. Donenfeld" <Jason@zx2c4.com>
+Date: Fri, 17 Jan 2020 12:01:36 +0100
+Subject: [PATCH 050/124] crypto: Kconfig - allow tests to be disabled when
+ manager is disabled
+
+commit 2343d1529aff8b552589f622c23932035ed7a05d upstream.
+
+The library code uses CRYPTO_MANAGER_DISABLE_TESTS to conditionalize its
+tests, but the library code can also exist without CRYPTO_MANAGER. That
+means on minimal configs, the test code winds up being built with no way
+to disable it.
+
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ crypto/Kconfig | 4 ----
+ 1 file changed, 4 deletions(-)
+
+--- a/crypto/Kconfig
++++ b/crypto/Kconfig
+@@ -136,8 +136,6 @@ config CRYPTO_USER
+         Userspace configuration for cryptographic instantiations such as
+         cbc(aes).
+ 
+-if CRYPTO_MANAGER2
+-
+ config CRYPTO_MANAGER_DISABLE_TESTS
+       bool "Disable run-time self tests"
+       default y
+@@ -155,8 +153,6 @@ config CRYPTO_MANAGER_EXTRA_TESTS
+         This is intended for developer use only, as these tests take much
+         longer to run than the normal self tests.
+ 
+-endif # if CRYPTO_MANAGER2
+-
+ config CRYPTO_GF128MUL
+       tristate
+ 
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0051-crypto-chacha20poly1305-prevent-integer-overflow-on-.patch b/target/linux/generic/backport-5.4/080-wireguard-0051-crypto-chacha20poly1305-prevent-integer-overflow-on-.patch

new file mode 100644 (file)

index 0000000..300420f
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0051-crypto-chacha20poly1305-prevent-integer-overflow-on-.patch
@@ -0,0 +1,40 @@
+From dceaaf068879fc228e85c482f65ebb707587f696 Mon Sep 17 00:00:00 2001
+From: "Jason A. Donenfeld" <Jason@zx2c4.com>
+Date: Thu, 6 Feb 2020 12:42:01 +0100
+Subject: [PATCH 051/124] crypto: chacha20poly1305 - prevent integer overflow
+ on large input
+
+commit c9cc0517bba9f0213f1e55172feceb99e5512daf upstream.
+
+This code assigns src_len (size_t) to sl (int), which causes problems
+when src_len is very large. Probably nobody in the kernel should be
+passing this much data to chacha20poly1305 all in one go anyway, so I
+don't think we need to change the algorithm or introduce larger types
+or anything. But we should at least error out early in this case and
+print a warning so that we get reports if this does happen and can look
+into why anybody is possibly passing it that much data or if they're
+accidentally passing -1 or similar.
+
+Fixes: d95312a3ccc0 ("crypto: lib/chacha20poly1305 - reimplement crypt_from_sg() routine")
+Cc: Ard Biesheuvel <ardb@kernel.org>
+Cc: stable@vger.kernel.org # 5.5+
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Acked-by: Ard Biesheuvel <ardb@kernel.org>
+Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ lib/crypto/chacha20poly1305.c | 3 +++
+ 1 file changed, 3 insertions(+)
+
+--- a/lib/crypto/chacha20poly1305.c
++++ b/lib/crypto/chacha20poly1305.c
+@@ -235,6 +235,9 @@ bool chacha20poly1305_crypt_sg_inplace(s
+               __le64 lens[2];
+       } b __aligned(16);
+ 
++      if (WARN_ON(src_len > INT_MAX))
++              return false;
++
+       chacha_load_key(b.k, key);
+ 
+       b.iv[0] = 0;
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0052-crypto-x86-curve25519-support-assemblers-with-no-adx.patch b/target/linux/generic/backport-5.4/080-wireguard-0052-crypto-x86-curve25519-support-assemblers-with-no-adx.patch

new file mode 100644 (file)

index 0000000..9a380d3
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0052-crypto-x86-curve25519-support-assemblers-with-no-adx.patch
@@ -0,0 +1,84 @@
+From 50af997532492b0f55bd9928743ac1f99dc1cd41 Mon Sep 17 00:00:00 2001
+From: "Jason A. Donenfeld" <Jason@zx2c4.com>
+Date: Sun, 1 Mar 2020 22:52:35 +0800
+Subject: [PATCH 052/124] crypto: x86/curve25519 - support assemblers with no
+ adx support
+
+commit 1579f1bc3b753d17a44de3457d5c6f4a5b14c752 upstream.
+
+Some older versions of GAS do not support the ADX instructions, similarly
+to how they also don't support AVX and such. This commit adds the same
+build-time detection mechanisms we use for AVX and others for ADX, and
+then makes sure that the curve25519 library dispatcher calls the right
+functions.
+
+Reported-by: Willy Tarreau <w@1wt.eu>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ arch/x86/Makefile           | 5 +++--
+ arch/x86/crypto/Makefile    | 7 ++++++-
+ include/crypto/curve25519.h | 6 ++++--
+ 3 files changed, 13 insertions(+), 5 deletions(-)
+
+--- a/arch/x86/Makefile
++++ b/arch/x86/Makefile
+@@ -197,9 +197,10 @@ avx2_instr :=$(call as-instr,vpbroadcast
+ avx512_instr :=$(call as-instr,vpmovm2b %k1$(comma)%zmm5,-DCONFIG_AS_AVX512=1)
+ sha1_ni_instr :=$(call as-instr,sha1msg1 %xmm0$(comma)%xmm1,-DCONFIG_AS_SHA1_NI=1)
+ sha256_ni_instr :=$(call as-instr,sha256msg1 %xmm0$(comma)%xmm1,-DCONFIG_AS_SHA256_NI=1)
++adx_instr := $(call as-instr,adox %r10$(comma)%r10,-DCONFIG_AS_ADX=1)
+ 
+-KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) $(avx512_instr) $(sha1_ni_instr) $(sha256_ni_instr)
+-KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) $(avx512_instr) $(sha1_ni_instr) $(sha256_ni_instr)
++KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) $(avx512_instr) $(sha1_ni_instr) $(sha256_ni_instr) $(adx_instr)
++KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) $(avx512_instr) $(sha1_ni_instr) $(sha256_ni_instr) $(adx_instr)
+ 
+ KBUILD_LDFLAGS := -m elf_$(UTS_MACHINE)
+ 
+--- a/arch/x86/crypto/Makefile
++++ b/arch/x86/crypto/Makefile
+@@ -11,6 +11,7 @@ avx2_supported := $(call as-instr,vpgath
+ avx512_supported :=$(call as-instr,vpmovm2b %k1$(comma)%zmm5,yes,no)
+ sha1_ni_supported :=$(call as-instr,sha1msg1 %xmm0$(comma)%xmm1,yes,no)
+ sha256_ni_supported :=$(call as-instr,sha256msg1 %xmm0$(comma)%xmm1,yes,no)
++adx_supported := $(call as-instr,adox %r10$(comma)%r10,yes,no)
+ 
+ obj-$(CONFIG_CRYPTO_GLUE_HELPER_X86) += glue_helper.o
+ 
+@@ -39,7 +40,11 @@ obj-$(CONFIG_CRYPTO_AEGIS128_AESNI_SSE2)
+ 
+ obj-$(CONFIG_CRYPTO_NHPOLY1305_SSE2) += nhpoly1305-sse2.o
+ obj-$(CONFIG_CRYPTO_NHPOLY1305_AVX2) += nhpoly1305-avx2.o
+-obj-$(CONFIG_CRYPTO_CURVE25519_X86) += curve25519-x86_64.o
++
++# These modules require the assembler to support ADX.
++ifeq ($(adx_supported),yes)
++      obj-$(CONFIG_CRYPTO_CURVE25519_X86) += curve25519-x86_64.o
++endif
+ 
+ # These modules require assembler to support AVX.
+ ifeq ($(avx_supported),yes)
+--- a/include/crypto/curve25519.h
++++ b/include/crypto/curve25519.h
+@@ -33,7 +33,8 @@ bool __must_check curve25519(u8 mypublic
+                            const u8 secret[CURVE25519_KEY_SIZE],
+                            const u8 basepoint[CURVE25519_KEY_SIZE])
+ {
+-      if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_CURVE25519))
++      if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_CURVE25519) &&
++          (!IS_ENABLED(CONFIG_CRYPTO_CURVE25519_X86) || IS_ENABLED(CONFIG_AS_ADX)))
+               curve25519_arch(mypublic, secret, basepoint);
+       else
+               curve25519_generic(mypublic, secret, basepoint);
+@@ -49,7 +50,8 @@ __must_check curve25519_generate_public(
+                                   CURVE25519_KEY_SIZE)))
+               return false;
+ 
+-      if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_CURVE25519))
++      if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_CURVE25519) &&
++          (!IS_ENABLED(CONFIG_CRYPTO_CURVE25519_X86) || IS_ENABLED(CONFIG_AS_ADX)))
+               curve25519_base_arch(pub, secret);
+       else
+               curve25519_generic(pub, secret, curve25519_base_point);
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0053-crypto-arm64-chacha-correctly-walk-through-blocks.patch b/target/linux/generic/backport-5.4/080-wireguard-0053-crypto-arm64-chacha-correctly-walk-through-blocks.patch

new file mode 100644 (file)

index 0000000..1c8d2df
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0053-crypto-arm64-chacha-correctly-walk-through-blocks.patch
@@ -0,0 +1,68 @@
+From ed61666f3b3fae43e872dc36a2c01794d7119165 Mon Sep 17 00:00:00 2001
+From: "Jason A. Donenfeld" <Jason@zx2c4.com>
+Date: Wed, 18 Mar 2020 20:27:32 -0600
+Subject: [PATCH 053/124] crypto: arm64/chacha - correctly walk through blocks
+
+commit c8cfcb78c65877313cda7bcbace624d3dbd1f3b3 upstream.
+
+Prior, passing in chunks of 2, 3, or 4, followed by any additional
+chunks would result in the chacha state counter getting out of sync,
+resulting in incorrect encryption/decryption, which is a pretty nasty
+crypto vuln: "why do images look weird on webpages?" WireGuard users
+never experienced this prior, because we have always, out of tree, used
+a different crypto library, until the recent Frankenzinc addition. This
+commit fixes the issue by advancing the pointers and state counter by
+the actual size processed. It also fixes up a bug in the (optional,
+costly) stride test that prevented it from running on arm64.
+
+Fixes: b3aad5bad26a ("crypto: arm64/chacha - expose arm64 ChaCha routine as library function")
+Reported-and-tested-by: Emil Renner Berthing <kernel@esmil.dk>
+Cc: Ard Biesheuvel <ardb@kernel.org>
+Cc: stable@vger.kernel.org # v5.5+
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Reviewed-by: Eric Biggers <ebiggers@google.com>
+Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ arch/arm64/crypto/chacha-neon-glue.c   |  8 ++++----
+ lib/crypto/chacha20poly1305-selftest.c | 11 ++++++++---
+ 2 files changed, 12 insertions(+), 7 deletions(-)
+
+--- a/arch/arm64/crypto/chacha-neon-glue.c
++++ b/arch/arm64/crypto/chacha-neon-glue.c
+@@ -55,10 +55,10 @@ static void chacha_doneon(u32 *state, u8
+                       break;
+               }
+               chacha_4block_xor_neon(state, dst, src, nrounds, l);
+-              bytes -= CHACHA_BLOCK_SIZE * 5;
+-              src += CHACHA_BLOCK_SIZE * 5;
+-              dst += CHACHA_BLOCK_SIZE * 5;
+-              state[12] += 5;
++              bytes -= l;
++              src += l;
++              dst += l;
++              state[12] += DIV_ROUND_UP(l, CHACHA_BLOCK_SIZE);
+       }
+ }
+ 
+--- a/lib/crypto/chacha20poly1305-selftest.c
++++ b/lib/crypto/chacha20poly1305-selftest.c
+@@ -9028,10 +9028,15 @@ bool __init chacha20poly1305_selftest(vo
+            && total_len <= 1 << 10; ++total_len) {
+               for (i = 0; i <= total_len; ++i) {
+                       for (j = i; j <= total_len; ++j) {
++                              k = 0;
+                               sg_init_table(sg_src, 3);
+-                              sg_set_buf(&sg_src[0], input, i);
+-                              sg_set_buf(&sg_src[1], input + i, j - i);
+-                              sg_set_buf(&sg_src[2], input + j, total_len - j);
++                              if (i)
++                                      sg_set_buf(&sg_src[k++], input, i);
++                              if (j - i)
++                                      sg_set_buf(&sg_src[k++], input + i, j - i);
++                              if (total_len - j)
++                                      sg_set_buf(&sg_src[k++], input + j, total_len - j);
++                              sg_init_marker(sg_src, k);
+                               memset(computed_output, 0, total_len);
+                               memset(input, 0, total_len);
+ 
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0054-crypto-x86-curve25519-replace-with-formally-verified.patch b/target/linux/generic/backport-5.4/080-wireguard-0054-crypto-x86-curve25519-replace-with-formally-verified.patch

new file mode 100644 (file)

index 0000000..46ee257
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0054-crypto-x86-curve25519-replace-with-formally-verified.patch
@@ -0,0 +1,3765 @@
+From a35b4c8928691ab2aa671aa2ca38a02d4e3cc58d Mon Sep 17 00:00:00 2001
+From: "Jason A. Donenfeld" <Jason@zx2c4.com>
+Date: Mon, 20 Jan 2020 18:18:15 +0100
+Subject: [PATCH 054/124] crypto: x86/curve25519 - replace with formally
+ verified implementation
+
+commit 07b586fe06625b0b610dc3d3a969c51913d143d4 upstream.
+
+This comes from INRIA's HACL*/Vale. It implements the same algorithm and
+implementation strategy as the code it replaces, only this code has been
+formally verified, sans the base point multiplication, which uses code
+similar to prior, only it uses the formally verified field arithmetic
+alongside reproducible ladder generation steps. This doesn't have a
+pure-bmi2 version, which means haswell no longer benefits, but the
+increased (doubled) code complexity is not worth it for a single
+generation of chips that's already old.
+
+Performance-wise, this is around 1% slower on older microarchitectures,
+and slightly faster on newer microarchitectures, mainly 10nm ones or
+backports of 10nm to 14nm. This implementation is "everest" below:
+
+Xeon E5-2680 v4 (Broadwell)
+
+     armfazh: 133340 cycles per call
+     everest: 133436 cycles per call
+
+Xeon Gold 5120 (Sky Lake Server)
+
+     armfazh: 112636 cycles per call
+     everest: 113906 cycles per call
+
+Core i5-6300U (Sky Lake Client)
+
+     armfazh: 116810 cycles per call
+     everest: 117916 cycles per call
+
+Core i7-7600U (Kaby Lake)
+
+     armfazh: 119523 cycles per call
+     everest: 119040 cycles per call
+
+Core i7-8750H (Coffee Lake)
+
+     armfazh: 113914 cycles per call
+     everest: 113650 cycles per call
+
+Core i9-9880H (Coffee Lake Refresh)
+
+     armfazh: 112616 cycles per call
+     everest: 114082 cycles per call
+
+Core i3-8121U (Cannon Lake)
+
+     armfazh: 113202 cycles per call
+     everest: 111382 cycles per call
+
+Core i7-8265U (Whiskey Lake)
+
+     armfazh: 127307 cycles per call
+     everest: 127697 cycles per call
+
+Core i7-8550U (Kaby Lake Refresh)
+
+     armfazh: 127522 cycles per call
+     everest: 127083 cycles per call
+
+Xeon Platinum 8275CL (Cascade Lake)
+
+     armfazh: 114380 cycles per call
+     everest: 114656 cycles per call
+
+Achieving these kinds of results with formally verified code is quite
+remarkable, especially considering that performance is favorable for
+newer chips.
+
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ arch/x86/crypto/curve25519-x86_64.c | 3546 ++++++++++-----------------
+ 1 file changed, 1292 insertions(+), 2254 deletions(-)
+
+--- a/arch/x86/crypto/curve25519-x86_64.c
++++ b/arch/x86/crypto/curve25519-x86_64.c
+@@ -1,8 +1,7 @@
+-// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
++// SPDX-License-Identifier: GPL-2.0 OR MIT
+ /*
+- * Copyright (c) 2017 Armando Faz <armfazh@ic.unicamp.br>. All Rights Reserved.
+- * Copyright (C) 2018-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+- * Copyright (C) 2018 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
++ * Copyright (C) 2020 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
++ * Copyright (c) 2016-2020 INRIA, CMU and Microsoft Corporation
+  */
+ 
+ #include <crypto/curve25519.h>
+@@ -16,2337 +15,1378 @@
+ #include <asm/cpufeature.h>
+ #include <asm/processor.h>
+ 
+-static __ro_after_init DEFINE_STATIC_KEY_FALSE(curve25519_use_bmi2);
+-static __ro_after_init DEFINE_STATIC_KEY_FALSE(curve25519_use_adx);
+-
+-enum { NUM_WORDS_ELTFP25519 = 4 };
+-typedef __aligned(32) u64 eltfp25519_1w[NUM_WORDS_ELTFP25519];
+-typedef __aligned(32) u64 eltfp25519_1w_buffer[2 * NUM_WORDS_ELTFP25519];
+-
+-#define mul_eltfp25519_1w_adx(c, a, b) do { \
+-      mul_256x256_integer_adx(m.buffer, a, b); \
+-      red_eltfp25519_1w_adx(c, m.buffer); \
+-} while (0)
+-
+-#define mul_eltfp25519_1w_bmi2(c, a, b) do { \
+-      mul_256x256_integer_bmi2(m.buffer, a, b); \
+-      red_eltfp25519_1w_bmi2(c, m.buffer); \
+-} while (0)
+-
+-#define sqr_eltfp25519_1w_adx(a) do { \
+-      sqr_256x256_integer_adx(m.buffer, a); \
+-      red_eltfp25519_1w_adx(a, m.buffer); \
+-} while (0)
+-
+-#define sqr_eltfp25519_1w_bmi2(a) do { \
+-      sqr_256x256_integer_bmi2(m.buffer, a); \
+-      red_eltfp25519_1w_bmi2(a, m.buffer); \
+-} while (0)
+-
+-#define mul_eltfp25519_2w_adx(c, a, b) do { \
+-      mul2_256x256_integer_adx(m.buffer, a, b); \
+-      red_eltfp25519_2w_adx(c, m.buffer); \
+-} while (0)
+-
+-#define mul_eltfp25519_2w_bmi2(c, a, b) do { \
+-      mul2_256x256_integer_bmi2(m.buffer, a, b); \
+-      red_eltfp25519_2w_bmi2(c, m.buffer); \
+-} while (0)
+-
+-#define sqr_eltfp25519_2w_adx(a) do { \
+-      sqr2_256x256_integer_adx(m.buffer, a); \
+-      red_eltfp25519_2w_adx(a, m.buffer); \
+-} while (0)
+-
+-#define sqr_eltfp25519_2w_bmi2(a) do { \
+-      sqr2_256x256_integer_bmi2(m.buffer, a); \
+-      red_eltfp25519_2w_bmi2(a, m.buffer); \
+-} while (0)
+-
+-#define sqrn_eltfp25519_1w_adx(a, times) do { \
+-      int ____counter = (times); \
+-      while (____counter-- > 0) \
+-              sqr_eltfp25519_1w_adx(a); \
+-} while (0)
+-
+-#define sqrn_eltfp25519_1w_bmi2(a, times) do { \
+-      int ____counter = (times); \
+-      while (____counter-- > 0) \
+-              sqr_eltfp25519_1w_bmi2(a); \
+-} while (0)
+-
+-#define copy_eltfp25519_1w(C, A) do { \
+-      (C)[0] = (A)[0]; \
+-      (C)[1] = (A)[1]; \
+-      (C)[2] = (A)[2]; \
+-      (C)[3] = (A)[3]; \
+-} while (0)
+-
+-#define setzero_eltfp25519_1w(C) do { \
+-      (C)[0] = 0; \
+-      (C)[1] = 0; \
+-      (C)[2] = 0; \
+-      (C)[3] = 0; \
+-} while (0)
+-
+-__aligned(32) static const u64 table_ladder_8k[252 * NUM_WORDS_ELTFP25519] = {
+-      /*   1 */ 0xfffffffffffffff3UL, 0xffffffffffffffffUL,
+-                0xffffffffffffffffUL, 0x5fffffffffffffffUL,
+-      /*   2 */ 0x6b8220f416aafe96UL, 0x82ebeb2b4f566a34UL,
+-                0xd5a9a5b075a5950fUL, 0x5142b2cf4b2488f4UL,
+-      /*   3 */ 0x6aaebc750069680cUL, 0x89cf7820a0f99c41UL,
+-                0x2a58d9183b56d0f4UL, 0x4b5aca80e36011a4UL,
+-      /*   4 */ 0x329132348c29745dUL, 0xf4a2e616e1642fd7UL,
+-                0x1e45bb03ff67bc34UL, 0x306912d0f42a9b4aUL,
+-      /*   5 */ 0xff886507e6af7154UL, 0x04f50e13dfeec82fUL,
+-                0xaa512fe82abab5ceUL, 0x174e251a68d5f222UL,
+-      /*   6 */ 0xcf96700d82028898UL, 0x1743e3370a2c02c5UL,
+-                0x379eec98b4e86eaaUL, 0x0c59888a51e0482eUL,
+-      /*   7 */ 0xfbcbf1d699b5d189UL, 0xacaef0d58e9fdc84UL,
+-                0xc1c20d06231f7614UL, 0x2938218da274f972UL,
+-      /*   8 */ 0xf6af49beff1d7f18UL, 0xcc541c22387ac9c2UL,
+-                0x96fcc9ef4015c56bUL, 0x69c1627c690913a9UL,
+-      /*   9 */ 0x7a86fd2f4733db0eUL, 0xfdb8c4f29e087de9UL,
+-                0x095e4b1a8ea2a229UL, 0x1ad7a7c829b37a79UL,
+-      /*  10 */ 0x342d89cad17ea0c0UL, 0x67bedda6cced2051UL,
+-                0x19ca31bf2bb42f74UL, 0x3df7b4c84980acbbUL,
+-      /*  11 */ 0xa8c6444dc80ad883UL, 0xb91e440366e3ab85UL,
+-                0xc215cda00164f6d8UL, 0x3d867c6ef247e668UL,
+-      /*  12 */ 0xc7dd582bcc3e658cUL, 0xfd2c4748ee0e5528UL,
+-                0xa0fd9b95cc9f4f71UL, 0x7529d871b0675ddfUL,
+-      /*  13 */ 0xb8f568b42d3cbd78UL, 0x1233011b91f3da82UL,
+-                0x2dce6ccd4a7c3b62UL, 0x75e7fc8e9e498603UL,
+-      /*  14 */ 0x2f4f13f1fcd0b6ecUL, 0xf1a8ca1f29ff7a45UL,
+-                0xc249c1a72981e29bUL, 0x6ebe0dbb8c83b56aUL,
+-      /*  15 */ 0x7114fa8d170bb222UL, 0x65a2dcd5bf93935fUL,
+-                0xbdc41f68b59c979aUL, 0x2f0eef79a2ce9289UL,
+-      /*  16 */ 0x42ecbf0c083c37ceUL, 0x2930bc09ec496322UL,
+-                0xf294b0c19cfeac0dUL, 0x3780aa4bedfabb80UL,
+-      /*  17 */ 0x56c17d3e7cead929UL, 0xe7cb4beb2e5722c5UL,
+-                0x0ce931732dbfe15aUL, 0x41b883c7621052f8UL,
+-      /*  18 */ 0xdbf75ca0c3d25350UL, 0x2936be086eb1e351UL,
+-                0xc936e03cb4a9b212UL, 0x1d45bf82322225aaUL,
+-      /*  19 */ 0xe81ab1036a024cc5UL, 0xe212201c304c9a72UL,
+-                0xc5d73fba6832b1fcUL, 0x20ffdb5a4d839581UL,
+-      /*  20 */ 0xa283d367be5d0fadUL, 0x6c2b25ca8b164475UL,
+-                0x9d4935467caaf22eUL, 0x5166408eee85ff49UL,
+-      /*  21 */ 0x3c67baa2fab4e361UL, 0xb3e433c67ef35cefUL,
+-                0x5259729241159b1cUL, 0x6a621892d5b0ab33UL,
+-      /*  22 */ 0x20b74a387555cdcbUL, 0x532aa10e1208923fUL,
+-                0xeaa17b7762281dd1UL, 0x61ab3443f05c44bfUL,
+-      /*  23 */ 0x257a6c422324def8UL, 0x131c6c1017e3cf7fUL,
+-                0x23758739f630a257UL, 0x295a407a01a78580UL,
+-      /*  24 */ 0xf8c443246d5da8d9UL, 0x19d775450c52fa5dUL,
+-                0x2afcfc92731bf83dUL, 0x7d10c8e81b2b4700UL,
+-      /*  25 */ 0xc8e0271f70baa20bUL, 0x993748867ca63957UL,
+-                0x5412efb3cb7ed4bbUL, 0x3196d36173e62975UL,
+-      /*  26 */ 0xde5bcad141c7dffcUL, 0x47cc8cd2b395c848UL,
+-                0xa34cd942e11af3cbUL, 0x0256dbf2d04ecec2UL,
+-      /*  27 */ 0x875ab7e94b0e667fUL, 0xcad4dd83c0850d10UL,
+-                0x47f12e8f4e72c79fUL, 0x5f1a87bb8c85b19bUL,
+-      /*  28 */ 0x7ae9d0b6437f51b8UL, 0x12c7ce5518879065UL,
+-                0x2ade09fe5cf77aeeUL, 0x23a05a2f7d2c5627UL,
+-      /*  29 */ 0x5908e128f17c169aUL, 0xf77498dd8ad0852dUL,
+-                0x74b4c4ceab102f64UL, 0x183abadd10139845UL,
+-      /*  30 */ 0xb165ba8daa92aaacUL, 0xd5c5ef9599386705UL,
+-                0xbe2f8f0cf8fc40d1UL, 0x2701e635ee204514UL,
+-      /*  31 */ 0x629fa80020156514UL, 0xf223868764a8c1ceUL,
+-                0x5b894fff0b3f060eUL, 0x60d9944cf708a3faUL,
+-      /*  32 */ 0xaeea001a1c7a201fUL, 0xebf16a633ee2ce63UL,
+-                0x6f7709594c7a07e1UL, 0x79b958150d0208cbUL,
+-      /*  33 */ 0x24b55e5301d410e7UL, 0xe3a34edff3fdc84dUL,
+-                0xd88768e4904032d8UL, 0x131384427b3aaeecUL,
+-      /*  34 */ 0x8405e51286234f14UL, 0x14dc4739adb4c529UL,
+-                0xb8a2b5b250634ffdUL, 0x2fe2a94ad8a7ff93UL,
+-      /*  35 */ 0xec5c57efe843faddUL, 0x2843ce40f0bb9918UL,
+-                0xa4b561d6cf3d6305UL, 0x743629bde8fb777eUL,
+-      /*  36 */ 0x343edd46bbaf738fUL, 0xed981828b101a651UL,
+-                0xa401760b882c797aUL, 0x1fc223e28dc88730UL,
+-      /*  37 */ 0x48604e91fc0fba0eUL, 0xb637f78f052c6fa4UL,
+-                0x91ccac3d09e9239cUL, 0x23f7eed4437a687cUL,
+-      /*  38 */ 0x5173b1118d9bd800UL, 0x29d641b63189d4a7UL,
+-                0xfdbf177988bbc586UL, 0x2959894fcad81df5UL,
+-      /*  39 */ 0xaebc8ef3b4bbc899UL, 0x4148995ab26992b9UL,
+-                0x24e20b0134f92cfbUL, 0x40d158894a05dee8UL,
+-      /*  40 */ 0x46b00b1185af76f6UL, 0x26bac77873187a79UL,
+-                0x3dc0bf95ab8fff5fUL, 0x2a608bd8945524d7UL,
+-      /*  41 */ 0x26449588bd446302UL, 0x7c4bc21c0388439cUL,
+-                0x8e98a4f383bd11b2UL, 0x26218d7bc9d876b9UL,
+-      /*  42 */ 0xe3081542997c178aUL, 0x3c2d29a86fb6606fUL,
+-                0x5c217736fa279374UL, 0x7dde05734afeb1faUL,
+-      /*  43 */ 0x3bf10e3906d42babUL, 0xe4f7803e1980649cUL,
+-                0xe6053bf89595bf7aUL, 0x394faf38da245530UL,
+-      /*  44 */ 0x7a8efb58896928f4UL, 0xfbc778e9cc6a113cUL,
+-                0x72670ce330af596fUL, 0x48f222a81d3d6cf7UL,
+-      /*  45 */ 0xf01fce410d72caa7UL, 0x5a20ecc7213b5595UL,
+-                0x7bc21165c1fa1483UL, 0x07f89ae31da8a741UL,
+-      /*  46 */ 0x05d2c2b4c6830ff9UL, 0xd43e330fc6316293UL,
+-                0xa5a5590a96d3a904UL, 0x705edb91a65333b6UL,
+-      /*  47 */ 0x048ee15e0bb9a5f7UL, 0x3240cfca9e0aaf5dUL,
+-                0x8f4b71ceedc4a40bUL, 0x621c0da3de544a6dUL,
+-      /*  48 */ 0x92872836a08c4091UL, 0xce8375b010c91445UL,
+-                0x8a72eb524f276394UL, 0x2667fcfa7ec83635UL,
+-      /*  49 */ 0x7f4c173345e8752aUL, 0x061b47feee7079a5UL,
+-                0x25dd9afa9f86ff34UL, 0x3780cef5425dc89cUL,
+-      /*  50 */ 0x1a46035a513bb4e9UL, 0x3e1ef379ac575adaUL,
+-                0xc78c5f1c5fa24b50UL, 0x321a967634fd9f22UL,
+-      /*  51 */ 0x946707b8826e27faUL, 0x3dca84d64c506fd0UL,
+-                0xc189218075e91436UL, 0x6d9284169b3b8484UL,
+-      /*  52 */ 0x3a67e840383f2ddfUL, 0x33eec9a30c4f9b75UL,
+-                0x3ec7c86fa783ef47UL, 0x26ec449fbac9fbc4UL,
+-      /*  53 */ 0x5c0f38cba09b9e7dUL, 0x81168cc762a3478cUL,
+-                0x3e23b0d306fc121cUL, 0x5a238aa0a5efdcddUL,
+-      /*  54 */ 0x1ba26121c4ea43ffUL, 0x36f8c77f7c8832b5UL,
+-                0x88fbea0b0adcf99aUL, 0x5ca9938ec25bebf9UL,
+-      /*  55 */ 0xd5436a5e51fccda0UL, 0x1dbc4797c2cd893bUL,
+-                0x19346a65d3224a08UL, 0x0f5034e49b9af466UL,
+-      /*  56 */ 0xf23c3967a1e0b96eUL, 0xe58b08fa867a4d88UL,
+-                0xfb2fabc6a7341679UL, 0x2a75381eb6026946UL,
+-      /*  57 */ 0xc80a3be4c19420acUL, 0x66b1f6c681f2b6dcUL,
+-                0x7cf7036761e93388UL, 0x25abbbd8a660a4c4UL,
+-      /*  58 */ 0x91ea12ba14fd5198UL, 0x684950fc4a3cffa9UL,
+-                0xf826842130f5ad28UL, 0x3ea988f75301a441UL,
+-      /*  59 */ 0xc978109a695f8c6fUL, 0x1746eb4a0530c3f3UL,
+-                0x444d6d77b4459995UL, 0x75952b8c054e5cc7UL,
+-      /*  60 */ 0xa3703f7915f4d6aaUL, 0x66c346202f2647d8UL,
+-                0xd01469df811d644bUL, 0x77fea47d81a5d71fUL,
+-      /*  61 */ 0xc5e9529ef57ca381UL, 0x6eeeb4b9ce2f881aUL,
+-                0xb6e91a28e8009bd6UL, 0x4b80be3e9afc3fecUL,
+-      /*  62 */ 0x7e3773c526aed2c5UL, 0x1b4afcb453c9a49dUL,
+-                0xa920bdd7baffb24dUL, 0x7c54699f122d400eUL,
+-      /*  63 */ 0xef46c8e14fa94bc8UL, 0xe0b074ce2952ed5eUL,
+-                0xbea450e1dbd885d5UL, 0x61b68649320f712cUL,
+-      /*  64 */ 0x8a485f7309ccbdd1UL, 0xbd06320d7d4d1a2dUL,
+-                0x25232973322dbef4UL, 0x445dc4758c17f770UL,
+-      /*  65 */ 0xdb0434177cc8933cUL, 0xed6fe82175ea059fUL,
+-                0x1efebefdc053db34UL, 0x4adbe867c65daf99UL,
+-      /*  66 */ 0x3acd71a2a90609dfUL, 0xe5e991856dd04050UL,
+-                0x1ec69b688157c23cUL, 0x697427f6885cfe4dUL,
+-      /*  67 */ 0xd7be7b9b65e1a851UL, 0xa03d28d522c536ddUL,
+-                0x28399d658fd2b645UL, 0x49e5b7e17c2641e1UL,
+-      /*  68 */ 0x6f8c3a98700457a4UL, 0x5078f0a25ebb6778UL,
+-                0xd13c3ccbc382960fUL, 0x2e003258a7df84b1UL,
+-      /*  69 */ 0x8ad1f39be6296a1cUL, 0xc1eeaa652a5fbfb2UL,
+-                0x33ee0673fd26f3cbUL, 0x59256173a69d2cccUL,
+-      /*  70 */ 0x41ea07aa4e18fc41UL, 0xd9fc19527c87a51eUL,
+-                0xbdaacb805831ca6fUL, 0x445b652dc916694fUL,
+-      /*  71 */ 0xce92a3a7f2172315UL, 0x1edc282de11b9964UL,
+-                0xa1823aafe04c314aUL, 0x790a2d94437cf586UL,
+-      /*  72 */ 0x71c447fb93f6e009UL, 0x8922a56722845276UL,
+-                0xbf70903b204f5169UL, 0x2f7a89891ba319feUL,
+-      /*  73 */ 0x02a08eb577e2140cUL, 0xed9a4ed4427bdcf4UL,
+-                0x5253ec44e4323cd1UL, 0x3e88363c14e9355bUL,
+-      /*  74 */ 0xaa66c14277110b8cUL, 0x1ae0391610a23390UL,
+-                0x2030bd12c93fc2a2UL, 0x3ee141579555c7abUL,
+-      /*  75 */ 0x9214de3a6d6e7d41UL, 0x3ccdd88607f17efeUL,
+-                0x674f1288f8e11217UL, 0x5682250f329f93d0UL,
+-      /*  76 */ 0x6cf00b136d2e396eUL, 0x6e4cf86f1014debfUL,
+-                0x5930b1b5bfcc4e83UL, 0x047069b48aba16b6UL,
+-      /*  77 */ 0x0d4ce4ab69b20793UL, 0xb24db91a97d0fb9eUL,
+-                0xcdfa50f54e00d01dUL, 0x221b1085368bddb5UL,
+-      /*  78 */ 0xe7e59468b1e3d8d2UL, 0x53c56563bd122f93UL,
+-                0xeee8a903e0663f09UL, 0x61efa662cbbe3d42UL,
+-      /*  79 */ 0x2cf8ddddde6eab2aUL, 0x9bf80ad51435f231UL,
+-                0x5deadacec9f04973UL, 0x29275b5d41d29b27UL,
+-      /*  80 */ 0xcfde0f0895ebf14fUL, 0xb9aab96b054905a7UL,
+-                0xcae80dd9a1c420fdUL, 0x0a63bf2f1673bbc7UL,
+-      /*  81 */ 0x092f6e11958fbc8cUL, 0x672a81e804822fadUL,
+-                0xcac8351560d52517UL, 0x6f3f7722c8f192f8UL,
+-      /*  82 */ 0xf8ba90ccc2e894b7UL, 0x2c7557a438ff9f0dUL,
+-                0x894d1d855ae52359UL, 0x68e122157b743d69UL,
+-      /*  83 */ 0xd87e5570cfb919f3UL, 0x3f2cdecd95798db9UL,
+-                0x2121154710c0a2ceUL, 0x3c66a115246dc5b2UL,
+-      /*  84 */ 0xcbedc562294ecb72UL, 0xba7143c36a280b16UL,
+-                0x9610c2efd4078b67UL, 0x6144735d946a4b1eUL,
+-      /*  85 */ 0x536f111ed75b3350UL, 0x0211db8c2041d81bUL,
+-                0xf93cb1000e10413cUL, 0x149dfd3c039e8876UL,
+-      /*  86 */ 0xd479dde46b63155bUL, 0xb66e15e93c837976UL,
+-                0xdafde43b1f13e038UL, 0x5fafda1a2e4b0b35UL,
+-      /*  87 */ 0x3600bbdf17197581UL, 0x3972050bbe3cd2c2UL,
+-                0x5938906dbdd5be86UL, 0x34fce5e43f9b860fUL,
+-      /*  88 */ 0x75a8a4cd42d14d02UL, 0x828dabc53441df65UL,
+-                0x33dcabedd2e131d3UL, 0x3ebad76fb814d25fUL,
+-      /*  89 */ 0xd4906f566f70e10fUL, 0x5d12f7aa51690f5aUL,
+-                0x45adb16e76cefcf2UL, 0x01f768aead232999UL,
+-      /*  90 */ 0x2b6cc77b6248febdUL, 0x3cd30628ec3aaffdUL,
+-                0xce1c0b80d4ef486aUL, 0x4c3bff2ea6f66c23UL,
+-      /*  91 */ 0x3f2ec4094aeaeb5fUL, 0x61b19b286e372ca7UL,
+-                0x5eefa966de2a701dUL, 0x23b20565de55e3efUL,
+-      /*  92 */ 0xe301ca5279d58557UL, 0x07b2d4ce27c2874fUL,
+-                0xa532cd8a9dcf1d67UL, 0x2a52fee23f2bff56UL,
+-      /*  93 */ 0x8624efb37cd8663dUL, 0xbbc7ac20ffbd7594UL,
+-                0x57b85e9c82d37445UL, 0x7b3052cb86a6ec66UL,
+-      /*  94 */ 0x3482f0ad2525e91eUL, 0x2cb68043d28edca0UL,
+-                0xaf4f6d052e1b003aUL, 0x185f8c2529781b0aUL,
+-      /*  95 */ 0xaa41de5bd80ce0d6UL, 0x9407b2416853e9d6UL,
+-                0x563ec36e357f4c3aUL, 0x4cc4b8dd0e297bceUL,
+-      /*  96 */ 0xa2fc1a52ffb8730eUL, 0x1811f16e67058e37UL,
+-                0x10f9a366cddf4ee1UL, 0x72f4a0c4a0b9f099UL,
+-      /*  97 */ 0x8c16c06f663f4ea7UL, 0x693b3af74e970fbaUL,
+-                0x2102e7f1d69ec345UL, 0x0ba53cbc968a8089UL,
+-      /*  98 */ 0xca3d9dc7fea15537UL, 0x4c6824bb51536493UL,
+-                0xb9886314844006b1UL, 0x40d2a72ab454cc60UL,
+-      /*  99 */ 0x5936a1b712570975UL, 0x91b9d648debda657UL,
+-                0x3344094bb64330eaUL, 0x006ba10d12ee51d0UL,
+-      /* 100 */ 0x19228468f5de5d58UL, 0x0eb12f4c38cc05b0UL,
+-                0xa1039f9dd5601990UL, 0x4502d4ce4fff0e0bUL,
+-      /* 101 */ 0xeb2054106837c189UL, 0xd0f6544c6dd3b93cUL,
+-                0x40727064c416d74fUL, 0x6e15c6114b502ef0UL,
+-      /* 102 */ 0x4df2a398cfb1a76bUL, 0x11256c7419f2f6b1UL,
+-                0x4a497962066e6043UL, 0x705b3aab41355b44UL,
+-      /* 103 */ 0x365ef536d797b1d8UL, 0x00076bd622ddf0dbUL,
+-                0x3bbf33b0e0575a88UL, 0x3777aa05c8e4ca4dUL,
+-      /* 104 */ 0x392745c85578db5fUL, 0x6fda4149dbae5ae2UL,
+-                0xb1f0b00b8adc9867UL, 0x09963437d36f1da3UL,
+-      /* 105 */ 0x7e824e90a5dc3853UL, 0xccb5f6641f135cbdUL,
+-                0x6736d86c87ce8fccUL, 0x625f3ce26604249fUL,
+-      /* 106 */ 0xaf8ac8059502f63fUL, 0x0c05e70a2e351469UL,
+-                0x35292e9c764b6305UL, 0x1a394360c7e23ac3UL,
+-      /* 107 */ 0xd5c6d53251183264UL, 0x62065abd43c2b74fUL,
+-                0xb5fbf5d03b973f9bUL, 0x13a3da3661206e5eUL,
+-      /* 108 */ 0xc6bd5837725d94e5UL, 0x18e30912205016c5UL,
+-                0x2088ce1570033c68UL, 0x7fba1f495c837987UL,
+-      /* 109 */ 0x5a8c7423f2f9079dUL, 0x1735157b34023fc5UL,
+-                0xe4f9b49ad2fab351UL, 0x6691ff72c878e33cUL,
+-      /* 110 */ 0x122c2adedc5eff3eUL, 0xf8dd4bf1d8956cf4UL,
+-                0xeb86205d9e9e5bdaUL, 0x049b92b9d975c743UL,
+-      /* 111 */ 0xa5379730b0f6c05aUL, 0x72a0ffacc6f3a553UL,
+-                0xb0032c34b20dcd6dUL, 0x470e9dbc88d5164aUL,
+-      /* 112 */ 0xb19cf10ca237c047UL, 0xb65466711f6c81a2UL,
+-                0xb3321bd16dd80b43UL, 0x48c14f600c5fbe8eUL,
+-      /* 113 */ 0x66451c264aa6c803UL, 0xb66e3904a4fa7da6UL,
+-                0xd45f19b0b3128395UL, 0x31602627c3c9bc10UL,
+-      /* 114 */ 0x3120dc4832e4e10dUL, 0xeb20c46756c717f7UL,
+-                0x00f52e3f67280294UL, 0x566d4fc14730c509UL,
+-      /* 115 */ 0x7e3a5d40fd837206UL, 0xc1e926dc7159547aUL,
+-                0x216730fba68d6095UL, 0x22e8c3843f69cea7UL,
+-      /* 116 */ 0x33d074e8930e4b2bUL, 0xb6e4350e84d15816UL,
+-                0x5534c26ad6ba2365UL, 0x7773c12f89f1f3f3UL,
+-      /* 117 */ 0x8cba404da57962aaUL, 0x5b9897a81999ce56UL,
+-                0x508e862f121692fcUL, 0x3a81907fa093c291UL,
+-      /* 118 */ 0x0dded0ff4725a510UL, 0x10d8cc10673fc503UL,
+-                0x5b9d151c9f1f4e89UL, 0x32a5c1d5cb09a44cUL,
+-      /* 119 */ 0x1e0aa442b90541fbUL, 0x5f85eb7cc1b485dbUL,
+-                0xbee595ce8a9df2e5UL, 0x25e496c722422236UL,
+-      /* 120 */ 0x5edf3c46cd0fe5b9UL, 0x34e75a7ed2a43388UL,
+-                0xe488de11d761e352UL, 0x0e878a01a085545cUL,
+-      /* 121 */ 0xba493c77e021bb04UL, 0x2b4d1843c7df899aUL,
+-                0x9ea37a487ae80d67UL, 0x67a9958011e41794UL,
+-      /* 122 */ 0x4b58051a6697b065UL, 0x47e33f7d8d6ba6d4UL,
+-                0xbb4da8d483ca46c1UL, 0x68becaa181c2db0dUL,
+-      /* 123 */ 0x8d8980e90b989aa5UL, 0xf95eb14a2c93c99bUL,
+-                0x51c6c7c4796e73a2UL, 0x6e228363b5efb569UL,
+-      /* 124 */ 0xc6bbc0b02dd624c8UL, 0x777eb47dec8170eeUL,
+-                0x3cde15a004cfafa9UL, 0x1dc6bc087160bf9bUL,
+-      /* 125 */ 0x2e07e043eec34002UL, 0x18e9fc677a68dc7fUL,
+-                0xd8da03188bd15b9aUL, 0x48fbc3bb00568253UL,
+-      /* 126 */ 0x57547d4cfb654ce1UL, 0xd3565b82a058e2adUL,
+-                0xf63eaf0bbf154478UL, 0x47531ef114dfbb18UL,
+-      /* 127 */ 0xe1ec630a4278c587UL, 0x5507d546ca8e83f3UL,
+-                0x85e135c63adc0c2bUL, 0x0aa7efa85682844eUL,
+-      /* 128 */ 0x72691ba8b3e1f615UL, 0x32b4e9701fbe3ffaUL,
+-                0x97b6d92e39bb7868UL, 0x2cfe53dea02e39e8UL,
+-      /* 129 */ 0x687392cd85cd52b0UL, 0x27ff66c910e29831UL,
+-                0x97134556a9832d06UL, 0x269bb0360a84f8a0UL,
+-      /* 130 */ 0x706e55457643f85cUL, 0x3734a48c9b597d1bUL,
+-                0x7aee91e8c6efa472UL, 0x5cd6abc198a9d9e0UL,
+-      /* 131 */ 0x0e04de06cb3ce41aUL, 0xd8c6eb893402e138UL,
+-                0x904659bb686e3772UL, 0x7215c371746ba8c8UL,
+-      /* 132 */ 0xfd12a97eeae4a2d9UL, 0x9514b7516394f2c5UL,
+-                0x266fd5809208f294UL, 0x5c847085619a26b9UL,
+-      /* 133 */ 0x52985410fed694eaUL, 0x3c905b934a2ed254UL,
+-                0x10bb47692d3be467UL, 0x063b3d2d69e5e9e1UL,
+-      /* 134 */ 0x472726eedda57debUL, 0xefb6c4ae10f41891UL,
+-                0x2b1641917b307614UL, 0x117c554fc4f45b7cUL,
+-      /* 135 */ 0xc07cf3118f9d8812UL, 0x01dbd82050017939UL,
+-                0xd7e803f4171b2827UL, 0x1015e87487d225eaUL,
+-      /* 136 */ 0xc58de3fed23acc4dUL, 0x50db91c294a7be2dUL,
+-                0x0b94d43d1c9cf457UL, 0x6b1640fa6e37524aUL,
+-      /* 137 */ 0x692f346c5fda0d09UL, 0x200b1c59fa4d3151UL,
+-                0xb8c46f760777a296UL, 0x4b38395f3ffdfbcfUL,
+-      /* 138 */ 0x18d25e00be54d671UL, 0x60d50582bec8aba6UL,
+-                0x87ad8f263b78b982UL, 0x50fdf64e9cda0432UL,
+-      /* 139 */ 0x90f567aac578dcf0UL, 0xef1e9b0ef2a3133bUL,
+-                0x0eebba9242d9de71UL, 0x15473c9bf03101c7UL,
+-      /* 140 */ 0x7c77e8ae56b78095UL, 0xb678e7666e6f078eUL,
+-                0x2da0b9615348ba1fUL, 0x7cf931c1ff733f0bUL,
+-      /* 141 */ 0x26b357f50a0a366cUL, 0xe9708cf42b87d732UL,
+-                0xc13aeea5f91cb2c0UL, 0x35d90c991143bb4cUL,
+-      /* 142 */ 0x47c1c404a9a0d9dcUL, 0x659e58451972d251UL,
+-                0x3875a8c473b38c31UL, 0x1fbd9ed379561f24UL,
+-      /* 143 */ 0x11fabc6fd41ec28dUL, 0x7ef8dfe3cd2a2dcaUL,
+-                0x72e73b5d8c404595UL, 0x6135fa4954b72f27UL,
+-      /* 144 */ 0xccfc32a2de24b69cUL, 0x3f55698c1f095d88UL,
+-                0xbe3350ed5ac3f929UL, 0x5e9bf806ca477eebUL,
+-      /* 145 */ 0xe9ce8fb63c309f68UL, 0x5376f63565e1f9f4UL,
+-                0xd1afcfb35a6393f1UL, 0x6632a1ede5623506UL,
+-      /* 146 */ 0x0b7d6c390c2ded4cUL, 0x56cb3281df04cb1fUL,
+-                0x66305a1249ecc3c7UL, 0x5d588b60a38ca72aUL,
+-      /* 147 */ 0xa6ecbf78e8e5f42dUL, 0x86eeb44b3c8a3eecUL,
+-                0xec219c48fbd21604UL, 0x1aaf1af517c36731UL,
+-      /* 148 */ 0xc306a2836769bde7UL, 0x208280622b1e2adbUL,
+-                0x8027f51ffbff94a6UL, 0x76cfa1ce1124f26bUL,
+-      /* 149 */ 0x18eb00562422abb6UL, 0xf377c4d58f8c29c3UL,
+-                0x4dbbc207f531561aUL, 0x0253b7f082128a27UL,
+-      /* 150 */ 0x3d1f091cb62c17e0UL, 0x4860e1abd64628a9UL,
+-                0x52d17436309d4253UL, 0x356f97e13efae576UL,
+-      /* 151 */ 0xd351e11aa150535bUL, 0x3e6b45bb1dd878ccUL,
+-                0x0c776128bed92c98UL, 0x1d34ae93032885b8UL,
+-      /* 152 */ 0x4ba0488ca85ba4c3UL, 0x985348c33c9ce6ceUL,
+-                0x66124c6f97bda770UL, 0x0f81a0290654124aUL,
+-      /* 153 */ 0x9ed09ca6569b86fdUL, 0x811009fd18af9a2dUL,
+-                0xff08d03f93d8c20aUL, 0x52a148199faef26bUL,
+-      /* 154 */ 0x3e03f9dc2d8d1b73UL, 0x4205801873961a70UL,
+-                0xc0d987f041a35970UL, 0x07aa1f15a1c0d549UL,
+-      /* 155 */ 0xdfd46ce08cd27224UL, 0x6d0a024f934e4239UL,
+-                0x808a7a6399897b59UL, 0x0a4556e9e13d95a2UL,
+-      /* 156 */ 0xd21a991fe9c13045UL, 0x9b0e8548fe7751b8UL,
+-                0x5da643cb4bf30035UL, 0x77db28d63940f721UL,
+-      /* 157 */ 0xfc5eeb614adc9011UL, 0x5229419ae8c411ebUL,
+-                0x9ec3e7787d1dcf74UL, 0x340d053e216e4cb5UL,
+-      /* 158 */ 0xcac7af39b48df2b4UL, 0xc0faec2871a10a94UL,
+-                0x140a69245ca575edUL, 0x0cf1c37134273a4cUL,
+-      /* 159 */ 0xc8ee306ac224b8a5UL, 0x57eaee7ccb4930b0UL,
+-                0xa1e806bdaacbe74fUL, 0x7d9a62742eeb657dUL,
+-      /* 160 */ 0x9eb6b6ef546c4830UL, 0x885cca1fddb36e2eUL,
+-                0xe6b9f383ef0d7105UL, 0x58654fef9d2e0412UL,
+-      /* 161 */ 0xa905c4ffbe0e8e26UL, 0x942de5df9b31816eUL,
+-                0x497d723f802e88e1UL, 0x30684dea602f408dUL,
+-      /* 162 */ 0x21e5a278a3e6cb34UL, 0xaefb6e6f5b151dc4UL,
+-                0xb30b8e049d77ca15UL, 0x28c3c9cf53b98981UL,
+-      /* 163 */ 0x287fb721556cdd2aUL, 0x0d317ca897022274UL,
+-                0x7468c7423a543258UL, 0x4a7f11464eb5642fUL,
+-      /* 164 */ 0xa237a4774d193aa6UL, 0xd865986ea92129a1UL,
+-                0x24c515ecf87c1a88UL, 0x604003575f39f5ebUL,
+-      /* 165 */ 0x47b9f189570a9b27UL, 0x2b98cede465e4b78UL,
+-                0x026df551dbb85c20UL, 0x74fcd91047e21901UL,
+-      /* 166 */ 0x13e2a90a23c1bfa3UL, 0x0cb0074e478519f6UL,
+-                0x5ff1cbbe3af6cf44UL, 0x67fe5438be812dbeUL,
+-      /* 167 */ 0xd13cf64fa40f05b0UL, 0x054dfb2f32283787UL,
+-                0x4173915b7f0d2aeaUL, 0x482f144f1f610d4eUL,
+-      /* 168 */ 0xf6210201b47f8234UL, 0x5d0ae1929e70b990UL,
+-                0xdcd7f455b049567cUL, 0x7e93d0f1f0916f01UL,
+-      /* 169 */ 0xdd79cbf18a7db4faUL, 0xbe8391bf6f74c62fUL,
+-                0x027145d14b8291bdUL, 0x585a73ea2cbf1705UL,
+-      /* 170 */ 0x485ca03e928a0db2UL, 0x10fc01a5742857e7UL,
+-                0x2f482edbd6d551a7UL, 0x0f0433b5048fdb8aUL,
+-      /* 171 */ 0x60da2e8dd7dc6247UL, 0x88b4c9d38cd4819aUL,
+-                0x13033ac001f66697UL, 0x273b24fe3b367d75UL,
+-      /* 172 */ 0xc6e8f66a31b3b9d4UL, 0x281514a494df49d5UL,
+-                0xd1726fdfc8b23da7UL, 0x4b3ae7d103dee548UL,
+-      /* 173 */ 0xc6256e19ce4b9d7eUL, 0xff5c5cf186e3c61cUL,
+-                0xacc63ca34b8ec145UL, 0x74621888fee66574UL,
+-      /* 174 */ 0x956f409645290a1eUL, 0xef0bf8e3263a962eUL,
+-                0xed6a50eb5ec2647bUL, 0x0694283a9dca7502UL,
+-      /* 175 */ 0x769b963643a2dcd1UL, 0x42b7c8ea09fc5353UL,
+-                0x4f002aee13397eabUL, 0x63005e2c19b7d63aUL,
+-      /* 176 */ 0xca6736da63023beaUL, 0x966c7f6db12a99b7UL,
+-                0xace09390c537c5e1UL, 0x0b696063a1aa89eeUL,
+-      /* 177 */ 0xebb03e97288c56e5UL, 0x432a9f9f938c8be8UL,
+-                0xa6a5a93d5b717f71UL, 0x1a5fb4c3e18f9d97UL,
+-      /* 178 */ 0x1c94e7ad1c60cdceUL, 0xee202a43fc02c4a0UL,
+-                0x8dafe4d867c46a20UL, 0x0a10263c8ac27b58UL,
+-      /* 179 */ 0xd0dea9dfe4432a4aUL, 0x856af87bbe9277c5UL,
+-                0xce8472acc212c71aUL, 0x6f151b6d9bbb1e91UL,
+-      /* 180 */ 0x26776c527ceed56aUL, 0x7d211cb7fbf8faecUL,
+-                0x37ae66a6fd4609ccUL, 0x1f81b702d2770c42UL,
+-      /* 181 */ 0x2fb0b057eac58392UL, 0xe1dd89fe29744e9dUL,
+-                0xc964f8eb17beb4f8UL, 0x29571073c9a2d41eUL,
+-      /* 182 */ 0xa948a18981c0e254UL, 0x2df6369b65b22830UL,
+-                0xa33eb2d75fcfd3c6UL, 0x078cd6ec4199a01fUL,
+-      /* 183 */ 0x4a584a41ad900d2fUL, 0x32142b78e2c74c52UL,
+-                0x68c4e8338431c978UL, 0x7f69ea9008689fc2UL,
+-      /* 184 */ 0x52f2c81e46a38265UL, 0xfd78072d04a832fdUL,
+-                0x8cd7d5fa25359e94UL, 0x4de71b7454cc29d2UL,
+-      /* 185 */ 0x42eb60ad1eda6ac9UL, 0x0aad37dfdbc09c3aUL,
+-                0x81004b71e33cc191UL, 0x44e6be345122803cUL,
+-      /* 186 */ 0x03fe8388ba1920dbUL, 0xf5d57c32150db008UL,
+-                0x49c8c4281af60c29UL, 0x21edb518de701aeeUL,
+-      /* 187 */ 0x7fb63e418f06dc99UL, 0xa4460d99c166d7b8UL,
+-                0x24dd5248ce520a83UL, 0x5ec3ad712b928358UL,
+-      /* 188 */ 0x15022a5fbd17930fUL, 0xa4f64a77d82570e3UL,
+-                0x12bc8d6915783712UL, 0x498194c0fc620abbUL,
+-      /* 189 */ 0x38a2d9d255686c82UL, 0x785c6bd9193e21f0UL,
+-                0xe4d5c81ab24a5484UL, 0x56307860b2e20989UL,
+-      /* 190 */ 0x429d55f78b4d74c4UL, 0x22f1834643350131UL,
+-                0x1e60c24598c71fffUL, 0x59f2f014979983efUL,
+-      /* 191 */ 0x46a47d56eb494a44UL, 0x3e22a854d636a18eUL,
+-                0xb346e15274491c3bUL, 0x2ceafd4e5390cde7UL,
+-      /* 192 */ 0xba8a8538be0d6675UL, 0x4b9074bb50818e23UL,
+-                0xcbdab89085d304c3UL, 0x61a24fe0e56192c4UL,
+-      /* 193 */ 0xcb7615e6db525bcbUL, 0xdd7d8c35a567e4caUL,
+-                0xe6b4153acafcdd69UL, 0x2d668e097f3c9766UL,
+-      /* 194 */ 0xa57e7e265ce55ef0UL, 0x5d9f4e527cd4b967UL,
+-                0xfbc83606492fd1e5UL, 0x090d52beb7c3f7aeUL,
+-      /* 195 */ 0x09b9515a1e7b4d7cUL, 0x1f266a2599da44c0UL,
+-                0xa1c49548e2c55504UL, 0x7ef04287126f15ccUL,
+-      /* 196 */ 0xfed1659dbd30ef15UL, 0x8b4ab9eec4e0277bUL,
+-                0x884d6236a5df3291UL, 0x1fd96ea6bf5cf788UL,
+-      /* 197 */ 0x42a161981f190d9aUL, 0x61d849507e6052c1UL,
+-                0x9fe113bf285a2cd5UL, 0x7c22d676dbad85d8UL,
+-      /* 198 */ 0x82e770ed2bfbd27dUL, 0x4c05b2ece996f5a5UL,
+-                0xcd40a9c2b0900150UL, 0x5895319213d9bf64UL,
+-      /* 199 */ 0xe7cc5d703fea2e08UL, 0xb50c491258e2188cUL,
+-                0xcce30baa48205bf0UL, 0x537c659ccfa32d62UL,
+-      /* 200 */ 0x37b6623a98cfc088UL, 0xfe9bed1fa4d6aca4UL,
+-                0x04d29b8e56a8d1b0UL, 0x725f71c40b519575UL,
+-      /* 201 */ 0x28c7f89cd0339ce6UL, 0x8367b14469ddc18bUL,
+-                0x883ada83a6a1652cUL, 0x585f1974034d6c17UL,
+-      /* 202 */ 0x89cfb266f1b19188UL, 0xe63b4863e7c35217UL,
+-                0xd88c9da6b4c0526aUL, 0x3e035c9df0954635UL,
+-      /* 203 */ 0xdd9d5412fb45de9dUL, 0xdd684532e4cff40dUL,
+-                0x4b5c999b151d671cUL, 0x2d8c2cc811e7f690UL,
+-      /* 204 */ 0x7f54be1d90055d40UL, 0xa464c5df464aaf40UL,
+-                0x33979624f0e917beUL, 0x2c018dc527356b30UL,
+-      /* 205 */ 0xa5415024e330b3d4UL, 0x73ff3d96691652d3UL,
+-                0x94ec42c4ef9b59f1UL, 0x0747201618d08e5aUL,
+-      /* 206 */ 0x4d6ca48aca411c53UL, 0x66415f2fcfa66119UL,
+-                0x9c4dd40051e227ffUL, 0x59810bc09a02f7ebUL,
+-      /* 207 */ 0x2a7eb171b3dc101dUL, 0x441c5ab99ffef68eUL,
+-                0x32025c9b93b359eaUL, 0x5e8ce0a71e9d112fUL,
+-      /* 208 */ 0xbfcccb92429503fdUL, 0xd271ba752f095d55UL,
+-                0x345ead5e972d091eUL, 0x18c8df11a83103baUL,
+-      /* 209 */ 0x90cd949a9aed0f4cUL, 0xc5d1f4cb6660e37eUL,
+-                0xb8cac52d56c52e0bUL, 0x6e42e400c5808e0dUL,
+-      /* 210 */ 0xa3b46966eeaefd23UL, 0x0c4f1f0be39ecdcaUL,
+-                0x189dc8c9d683a51dUL, 0x51f27f054c09351bUL,
+-      /* 211 */ 0x4c487ccd2a320682UL, 0x587ea95bb3df1c96UL,
+-                0xc8ccf79e555cb8e8UL, 0x547dc829a206d73dUL,
+-      /* 212 */ 0xb822a6cd80c39b06UL, 0xe96d54732000d4c6UL,
+-                0x28535b6f91463b4dUL, 0x228f4660e2486e1dUL,
+-      /* 213 */ 0x98799538de8d3abfUL, 0x8cd8330045ebca6eUL,
+-                0x79952a008221e738UL, 0x4322e1a7535cd2bbUL,
+-      /* 214 */ 0xb114c11819d1801cUL, 0x2016e4d84f3f5ec7UL,
+-                0xdd0e2df409260f4cUL, 0x5ec362c0ae5f7266UL,
+-      /* 215 */ 0xc0462b18b8b2b4eeUL, 0x7cc8d950274d1afbUL,
+-                0xf25f7105436b02d2UL, 0x43bbf8dcbff9ccd3UL,
+-      /* 216 */ 0xb6ad1767a039e9dfUL, 0xb0714da8f69d3583UL,
+-                0x5e55fa18b42931f5UL, 0x4ed5558f33c60961UL,
+-      /* 217 */ 0x1fe37901c647a5ddUL, 0x593ddf1f8081d357UL,
+-                0x0249a4fd813fd7a6UL, 0x69acca274e9caf61UL,
+-      /* 218 */ 0x047ba3ea330721c9UL, 0x83423fc20e7e1ea0UL,
+-                0x1df4c0af01314a60UL, 0x09a62dab89289527UL,
+-      /* 219 */ 0xa5b325a49cc6cb00UL, 0xe94b5dc654b56cb6UL,
+-                0x3be28779adc994a0UL, 0x4296e8f8ba3a4aadUL,
+-      /* 220 */ 0x328689761e451eabUL, 0x2e4d598bff59594aUL,
+-                0x49b96853d7a7084aUL, 0x4980a319601420a8UL,
+-      /* 221 */ 0x9565b9e12f552c42UL, 0x8a5318db7100fe96UL,
+-                0x05c90b4d43add0d7UL, 0x538b4cd66a5d4edaUL,
+-      /* 222 */ 0xf4e94fc3e89f039fUL, 0x592c9af26f618045UL,
+-                0x08a36eb5fd4b9550UL, 0x25fffaf6c2ed1419UL,
+-      /* 223 */ 0x34434459cc79d354UL, 0xeeecbfb4b1d5476bUL,
+-                0xddeb34a061615d99UL, 0x5129cecceb64b773UL,
+-      /* 224 */ 0xee43215894993520UL, 0x772f9c7cf14c0b3bUL,
+-                0xd2e2fce306bedad5UL, 0x715f42b546f06a97UL,
+-      /* 225 */ 0x434ecdceda5b5f1aUL, 0x0da17115a49741a9UL,
+-                0x680bd77c73edad2eUL, 0x487c02354edd9041UL,
+-      /* 226 */ 0xb8efeff3a70ed9c4UL, 0x56a32aa3e857e302UL,
+-                0xdf3a68bd48a2a5a0UL, 0x07f650b73176c444UL,
+-      /* 227 */ 0xe38b9b1626e0ccb1UL, 0x79e053c18b09fb36UL,
+-                0x56d90319c9f94964UL, 0x1ca941e7ac9ff5c4UL,
+-      /* 228 */ 0x49c4df29162fa0bbUL, 0x8488cf3282b33305UL,
+-                0x95dfda14cabb437dUL, 0x3391f78264d5ad86UL,
+-      /* 229 */ 0x729ae06ae2b5095dUL, 0xd58a58d73259a946UL,
+-                0xe9834262d13921edUL, 0x27fedafaa54bb592UL,
+-      /* 230 */ 0xa99dc5b829ad48bbUL, 0x5f025742499ee260UL,
+-                0x802c8ecd5d7513fdUL, 0x78ceb3ef3f6dd938UL,
+-      /* 231 */ 0xc342f44f8a135d94UL, 0x7b9edb44828cdda3UL,
+-                0x9436d11a0537cfe7UL, 0x5064b164ec1ab4c8UL,
+-      /* 232 */ 0x7020eccfd37eb2fcUL, 0x1f31ea3ed90d25fcUL,
+-                0x1b930d7bdfa1bb34UL, 0x5344467a48113044UL,
+-      /* 233 */ 0x70073170f25e6dfbUL, 0xe385dc1a50114cc8UL,
+-                0x2348698ac8fc4f00UL, 0x2a77a55284dd40d8UL,
+-      /* 234 */ 0xfe06afe0c98c6ce4UL, 0xc235df96dddfd6e4UL,
+-                0x1428d01e33bf1ed3UL, 0x785768ec9300bdafUL,
+-      /* 235 */ 0x9702e57a91deb63bUL, 0x61bdb8bfe5ce8b80UL,
+-                0x645b426f3d1d58acUL, 0x4804a82227a557bcUL,
+-      /* 236 */ 0x8e57048ab44d2601UL, 0x68d6501a4b3a6935UL,
+-                0xc39c9ec3f9e1c293UL, 0x4172f257d4de63e2UL,
+-      /* 237 */ 0xd368b450330c6401UL, 0x040d3017418f2391UL,
+-                0x2c34bb6090b7d90dUL, 0x16f649228fdfd51fUL,
+-      /* 238 */ 0xbea6818e2b928ef5UL, 0xe28ccf91cdc11e72UL,
+-                0x594aaa68e77a36cdUL, 0x313034806c7ffd0fUL,
+-      /* 239 */ 0x8a9d27ac2249bd65UL, 0x19a3b464018e9512UL,
+-                0xc26ccff352b37ec7UL, 0x056f68341d797b21UL,
+-      /* 240 */ 0x5e79d6757efd2327UL, 0xfabdbcb6553afe15UL,
+-                0xd3e7222c6eaf5a60UL, 0x7046c76d4dae743bUL,
+-      /* 241 */ 0x660be872b18d4a55UL, 0x19992518574e1496UL,
+-                0xc103053a302bdcbbUL, 0x3ed8e9800b218e8eUL,
+-      /* 242 */ 0x7b0b9239fa75e03eUL, 0xefe9fb684633c083UL,
+-                0x98a35fbe391a7793UL, 0x6065510fe2d0fe34UL,
+-      /* 243 */ 0x55cb668548abad0cUL, 0xb4584548da87e527UL,
+-                0x2c43ecea0107c1ddUL, 0x526028809372de35UL,
+-      /* 244 */ 0x3415c56af9213b1fUL, 0x5bee1a4d017e98dbUL,
+-                0x13f6b105b5cf709bUL, 0x5ff20e3482b29ab6UL,
+-      /* 245 */ 0x0aa29c75cc2e6c90UL, 0xfc7d73ca3a70e206UL,
+-                0x899fc38fc4b5c515UL, 0x250386b124ffc207UL,
+-      /* 246 */ 0x54ea28d5ae3d2b56UL, 0x9913149dd6de60ceUL,
+-                0x16694fc58f06d6c1UL, 0x46b23975eb018fc7UL,
+-      /* 247 */ 0x470a6a0fb4b7b4e2UL, 0x5d92475a8f7253deUL,
+-                0xabeee5b52fbd3adbUL, 0x7fa20801a0806968UL,
+-      /* 248 */ 0x76f3faf19f7714d2UL, 0xb3e840c12f4660c3UL,
+-                0x0fb4cd8df212744eUL, 0x4b065a251d3a2dd2UL,
+-      /* 249 */ 0x5cebde383d77cd4aUL, 0x6adf39df882c9cb1UL,
+-                0xa2dd242eb09af759UL, 0x3147c0e50e5f6422UL,
+-      /* 250 */ 0x164ca5101d1350dbUL, 0xf8d13479c33fc962UL,
+-                0xe640ce4d13e5da08UL, 0x4bdee0c45061f8baUL,
+-      /* 251 */ 0xd7c46dc1a4edb1c9UL, 0x5514d7b6437fd98aUL,
+-                0x58942f6bb2a1c00bUL, 0x2dffb2ab1d70710eUL,
+-      /* 252 */ 0xccdfcf2fc18b6d68UL, 0xa8ebcba8b7806167UL,
+-                0x980697f95e2937e3UL, 0x02fbba1cd0126e8cUL
+-};
+-
+-/* c is two 512-bit products: c0[0:7]=a0[0:3]*b0[0:3] and c1[8:15]=a1[4:7]*b1[4:7]
+- * a is two 256-bit integers: a0[0:3] and a1[4:7]
+- * b is two 256-bit integers: b0[0:3] and b1[4:7]
+- */
+-static void mul2_256x256_integer_adx(u64 *const c, const u64 *const a,
+-                                   const u64 *const b)
+-{
+-      asm volatile(
+-              "xorl %%r14d, %%r14d ;"
+-              "movq   (%1), %%rdx; "  /* A[0] */
+-              "mulx   (%2),  %%r8, %%r15; " /* A[0]*B[0] */
+-              "xorl %%r10d, %%r10d ;"
+-              "movq %%r8, (%0) ;"
+-              "mulx  8(%2), %%r10, %%rax; " /* A[0]*B[1] */
+-              "adox %%r10, %%r15 ;"
+-              "mulx 16(%2),  %%r8, %%rbx; " /* A[0]*B[2] */
+-              "adox  %%r8, %%rax ;"
+-              "mulx 24(%2), %%r10, %%rcx; " /* A[0]*B[3] */
+-              "adox %%r10, %%rbx ;"
+-              /******************************************/
+-              "adox %%r14, %%rcx ;"
+-
+-              "movq  8(%1), %%rdx; "  /* A[1] */
+-              "mulx   (%2),  %%r8,  %%r9; " /* A[1]*B[0] */
+-              "adox %%r15,  %%r8 ;"
+-              "movq  %%r8, 8(%0) ;"
+-              "mulx  8(%2), %%r10, %%r11; " /* A[1]*B[1] */
+-              "adox %%r10,  %%r9 ;"
+-              "adcx  %%r9, %%rax ;"
+-              "mulx 16(%2),  %%r8, %%r13; " /* A[1]*B[2] */
+-              "adox  %%r8, %%r11 ;"
+-              "adcx %%r11, %%rbx ;"
+-              "mulx 24(%2), %%r10, %%r15; " /* A[1]*B[3] */
+-              "adox %%r10, %%r13 ;"
+-              "adcx %%r13, %%rcx ;"
+-              /******************************************/
+-              "adox %%r14, %%r15 ;"
+-              "adcx %%r14, %%r15 ;"
+-
+-              "movq 16(%1), %%rdx; " /* A[2] */
+-              "xorl %%r10d, %%r10d ;"
+-              "mulx   (%2),  %%r8,  %%r9; " /* A[2]*B[0] */
+-              "adox %%rax,  %%r8 ;"
+-              "movq %%r8, 16(%0) ;"
+-              "mulx  8(%2), %%r10, %%r11; " /* A[2]*B[1] */
+-              "adox %%r10,  %%r9 ;"
+-              "adcx  %%r9, %%rbx ;"
+-              "mulx 16(%2),  %%r8, %%r13; " /* A[2]*B[2] */
+-              "adox  %%r8, %%r11 ;"
+-              "adcx %%r11, %%rcx ;"
+-              "mulx 24(%2), %%r10, %%rax; " /* A[2]*B[3] */
+-              "adox %%r10, %%r13 ;"
+-              "adcx %%r13, %%r15 ;"
+-              /******************************************/
+-              "adox %%r14, %%rax ;"
+-              "adcx %%r14, %%rax ;"
+-
+-              "movq 24(%1), %%rdx; " /* A[3] */
+-              "xorl %%r10d, %%r10d ;"
+-              "mulx   (%2),  %%r8,  %%r9; " /* A[3]*B[0] */
+-              "adox %%rbx,  %%r8 ;"
+-              "movq %%r8, 24(%0) ;"
+-              "mulx  8(%2), %%r10, %%r11; " /* A[3]*B[1] */
+-              "adox %%r10,  %%r9 ;"
+-              "adcx  %%r9, %%rcx ;"
+-              "movq %%rcx, 32(%0) ;"
+-              "mulx 16(%2),  %%r8, %%r13; " /* A[3]*B[2] */
+-              "adox  %%r8, %%r11 ;"
+-              "adcx %%r11, %%r15 ;"
+-              "movq %%r15, 40(%0) ;"
+-              "mulx 24(%2), %%r10, %%rbx; " /* A[3]*B[3] */
+-              "adox %%r10, %%r13 ;"
+-              "adcx %%r13, %%rax ;"
+-              "movq %%rax, 48(%0) ;"
+-              /******************************************/
+-              "adox %%r14, %%rbx ;"
+-              "adcx %%r14, %%rbx ;"
+-              "movq %%rbx, 56(%0) ;"
+-
+-              "movq 32(%1), %%rdx; "  /* C[0] */
+-              "mulx 32(%2),  %%r8, %%r15; " /* C[0]*D[0] */
+-              "xorl %%r10d, %%r10d ;"
+-              "movq %%r8, 64(%0);"
+-              "mulx 40(%2), %%r10, %%rax; " /* C[0]*D[1] */
+-              "adox %%r10, %%r15 ;"
+-              "mulx 48(%2),  %%r8, %%rbx; " /* C[0]*D[2] */
+-              "adox  %%r8, %%rax ;"
+-              "mulx 56(%2), %%r10, %%rcx; " /* C[0]*D[3] */
+-              "adox %%r10, %%rbx ;"
+-              /******************************************/
+-              "adox %%r14, %%rcx ;"
+-
+-              "movq 40(%1), %%rdx; " /* C[1] */
+-              "xorl %%r10d, %%r10d ;"
+-              "mulx 32(%2),  %%r8,  %%r9; " /* C[1]*D[0] */
+-              "adox %%r15,  %%r8 ;"
+-              "movq  %%r8, 72(%0);"
+-              "mulx 40(%2), %%r10, %%r11; " /* C[1]*D[1] */
+-              "adox %%r10,  %%r9 ;"
+-              "adcx  %%r9, %%rax ;"
+-              "mulx 48(%2),  %%r8, %%r13; " /* C[1]*D[2] */
+-              "adox  %%r8, %%r11 ;"
+-              "adcx %%r11, %%rbx ;"
+-              "mulx 56(%2), %%r10, %%r15; " /* C[1]*D[3] */
+-              "adox %%r10, %%r13 ;"
+-              "adcx %%r13, %%rcx ;"
+-              /******************************************/
+-              "adox %%r14, %%r15 ;"
+-              "adcx %%r14, %%r15 ;"
+-
+-              "movq 48(%1), %%rdx; " /* C[2] */
+-              "xorl %%r10d, %%r10d ;"
+-              "mulx 32(%2),  %%r8,  %%r9; " /* C[2]*D[0] */
+-              "adox %%rax,  %%r8 ;"
+-              "movq  %%r8, 80(%0);"
+-              "mulx 40(%2), %%r10, %%r11; " /* C[2]*D[1] */
+-              "adox %%r10,  %%r9 ;"
+-              "adcx  %%r9, %%rbx ;"
+-              "mulx 48(%2),  %%r8, %%r13; " /* C[2]*D[2] */
+-              "adox  %%r8, %%r11 ;"
+-              "adcx %%r11, %%rcx ;"
+-              "mulx 56(%2), %%r10, %%rax; " /* C[2]*D[3] */
+-              "adox %%r10, %%r13 ;"
+-              "adcx %%r13, %%r15 ;"
+-              /******************************************/
+-              "adox %%r14, %%rax ;"
+-              "adcx %%r14, %%rax ;"
+-
+-              "movq 56(%1), %%rdx; " /* C[3] */
+-              "xorl %%r10d, %%r10d ;"
+-              "mulx 32(%2),  %%r8,  %%r9; " /* C[3]*D[0] */
+-              "adox %%rbx,  %%r8 ;"
+-              "movq  %%r8, 88(%0);"
+-              "mulx 40(%2), %%r10, %%r11; " /* C[3]*D[1] */
+-              "adox %%r10,  %%r9 ;"
+-              "adcx  %%r9, %%rcx ;"
+-              "movq %%rcx,  96(%0) ;"
+-              "mulx 48(%2),  %%r8, %%r13; " /* C[3]*D[2] */
+-              "adox  %%r8, %%r11 ;"
+-              "adcx %%r11, %%r15 ;"
+-              "movq %%r15, 104(%0) ;"
+-              "mulx 56(%2), %%r10, %%rbx; " /* C[3]*D[3] */
+-              "adox %%r10, %%r13 ;"
+-              "adcx %%r13, %%rax ;"
+-              "movq %%rax, 112(%0) ;"
+-              /******************************************/
+-              "adox %%r14, %%rbx ;"
+-              "adcx %%r14, %%rbx ;"
+-              "movq %%rbx, 120(%0) ;"
+-              :
+-              : "r"(c), "r"(a), "r"(b)
+-              : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9",
+-                "%r10", "%r11", "%r13", "%r14", "%r15");
+-}
+-
+-static void mul2_256x256_integer_bmi2(u64 *const c, const u64 *const a,
+-                                    const u64 *const b)
++static __always_inline u64 eq_mask(u64 a, u64 b)
+ {
+-      asm volatile(
+-              "movq   (%1), %%rdx; "  /* A[0] */
+-              "mulx   (%2),  %%r8, %%r15; " /* A[0]*B[0] */
+-              "movq %%r8,  (%0) ;"
+-              "mulx  8(%2), %%r10, %%rax; " /* A[0]*B[1] */
+-              "addq %%r10, %%r15 ;"
+-              "mulx 16(%2),  %%r8, %%rbx; " /* A[0]*B[2] */
+-              "adcq  %%r8, %%rax ;"
+-              "mulx 24(%2), %%r10, %%rcx; " /* A[0]*B[3] */
+-              "adcq %%r10, %%rbx ;"
+-              /******************************************/
+-              "adcq    $0, %%rcx ;"
+-
+-              "movq  8(%1), %%rdx; "  /* A[1] */
+-              "mulx   (%2),  %%r8,  %%r9; " /* A[1]*B[0] */
+-              "addq %%r15,  %%r8 ;"
+-              "movq %%r8, 8(%0) ;"
+-              "mulx  8(%2), %%r10, %%r11; " /* A[1]*B[1] */
+-              "adcq %%r10,  %%r9 ;"
+-              "mulx 16(%2),  %%r8, %%r13; " /* A[1]*B[2] */
+-              "adcq  %%r8, %%r11 ;"
+-              "mulx 24(%2), %%r10, %%r15; " /* A[1]*B[3] */
+-              "adcq %%r10, %%r13 ;"
+-              /******************************************/
+-              "adcq    $0, %%r15 ;"
+-
+-              "addq  %%r9, %%rax ;"
+-              "adcq %%r11, %%rbx ;"
+-              "adcq %%r13, %%rcx ;"
+-              "adcq    $0, %%r15 ;"
+-
+-              "movq 16(%1), %%rdx; "  /* A[2] */
+-              "mulx   (%2),  %%r8,  %%r9; " /* A[2]*B[0] */
+-              "addq %%rax,  %%r8 ;"
+-              "movq %%r8, 16(%0) ;"
+-              "mulx  8(%2), %%r10, %%r11; " /* A[2]*B[1] */
+-              "adcq %%r10,  %%r9 ;"
+-              "mulx 16(%2),  %%r8, %%r13; " /* A[2]*B[2] */
+-              "adcq  %%r8, %%r11 ;"
+-              "mulx 24(%2), %%r10, %%rax; " /* A[2]*B[3] */
+-              "adcq %%r10, %%r13 ;"
+-              /******************************************/
+-              "adcq    $0, %%rax ;"
+-
+-              "addq  %%r9, %%rbx ;"
+-              "adcq %%r11, %%rcx ;"
+-              "adcq %%r13, %%r15 ;"
+-              "adcq    $0, %%rax ;"
+-
+-              "movq 24(%1), %%rdx; "  /* A[3] */
+-              "mulx   (%2),  %%r8,  %%r9; " /* A[3]*B[0] */
+-              "addq %%rbx,  %%r8 ;"
+-              "movq %%r8, 24(%0) ;"
+-              "mulx  8(%2), %%r10, %%r11; " /* A[3]*B[1] */
+-              "adcq %%r10,  %%r9 ;"
+-              "mulx 16(%2),  %%r8, %%r13; " /* A[3]*B[2] */
+-              "adcq  %%r8, %%r11 ;"
+-              "mulx 24(%2), %%r10, %%rbx; " /* A[3]*B[3] */
+-              "adcq %%r10, %%r13 ;"
+-              /******************************************/
+-              "adcq    $0, %%rbx ;"
+-
+-              "addq  %%r9, %%rcx ;"
+-              "movq %%rcx, 32(%0) ;"
+-              "adcq %%r11, %%r15 ;"
+-              "movq %%r15, 40(%0) ;"
+-              "adcq %%r13, %%rax ;"
+-              "movq %%rax, 48(%0) ;"
+-              "adcq    $0, %%rbx ;"
+-              "movq %%rbx, 56(%0) ;"
+-
+-              "movq 32(%1), %%rdx; "  /* C[0] */
+-              "mulx 32(%2),  %%r8, %%r15; " /* C[0]*D[0] */
+-              "movq %%r8, 64(%0) ;"
+-              "mulx 40(%2), %%r10, %%rax; " /* C[0]*D[1] */
+-              "addq %%r10, %%r15 ;"
+-              "mulx 48(%2),  %%r8, %%rbx; " /* C[0]*D[2] */
+-              "adcq  %%r8, %%rax ;"
+-              "mulx 56(%2), %%r10, %%rcx; " /* C[0]*D[3] */
+-              "adcq %%r10, %%rbx ;"
+-              /******************************************/
+-              "adcq    $0, %%rcx ;"
+-
+-              "movq 40(%1), %%rdx; "  /* C[1] */
+-              "mulx 32(%2),  %%r8,  %%r9; " /* C[1]*D[0] */
+-              "addq %%r15,  %%r8 ;"
+-              "movq %%r8, 72(%0) ;"
+-              "mulx 40(%2), %%r10, %%r11; " /* C[1]*D[1] */
+-              "adcq %%r10,  %%r9 ;"
+-              "mulx 48(%2),  %%r8, %%r13; " /* C[1]*D[2] */
+-              "adcq  %%r8, %%r11 ;"
+-              "mulx 56(%2), %%r10, %%r15; " /* C[1]*D[3] */
+-              "adcq %%r10, %%r13 ;"
+-              /******************************************/
+-              "adcq    $0, %%r15 ;"
+-
+-              "addq  %%r9, %%rax ;"
+-              "adcq %%r11, %%rbx ;"
+-              "adcq %%r13, %%rcx ;"
+-              "adcq    $0, %%r15 ;"
+-
+-              "movq 48(%1), %%rdx; "  /* C[2] */
+-              "mulx 32(%2),  %%r8,  %%r9; " /* C[2]*D[0] */
+-              "addq %%rax,  %%r8 ;"
+-              "movq %%r8, 80(%0) ;"
+-              "mulx 40(%2), %%r10, %%r11; " /* C[2]*D[1] */
+-              "adcq %%r10,  %%r9 ;"
+-              "mulx 48(%2),  %%r8, %%r13; " /* C[2]*D[2] */
+-              "adcq  %%r8, %%r11 ;"
+-              "mulx 56(%2), %%r10, %%rax; " /* C[2]*D[3] */
+-              "adcq %%r10, %%r13 ;"
+-              /******************************************/
+-              "adcq    $0, %%rax ;"
+-
+-              "addq  %%r9, %%rbx ;"
+-              "adcq %%r11, %%rcx ;"
+-              "adcq %%r13, %%r15 ;"
+-              "adcq    $0, %%rax ;"
+-
+-              "movq 56(%1), %%rdx; "  /* C[3] */
+-              "mulx 32(%2),  %%r8,  %%r9; " /* C[3]*D[0] */
+-              "addq %%rbx,  %%r8 ;"
+-              "movq %%r8, 88(%0) ;"
+-              "mulx 40(%2), %%r10, %%r11; " /* C[3]*D[1] */
+-              "adcq %%r10,  %%r9 ;"
+-              "mulx 48(%2),  %%r8, %%r13; " /* C[3]*D[2] */
+-              "adcq  %%r8, %%r11 ;"
+-              "mulx 56(%2), %%r10, %%rbx; " /* C[3]*D[3] */
+-              "adcq %%r10, %%r13 ;"
+-              /******************************************/
+-              "adcq    $0, %%rbx ;"
+-
+-              "addq  %%r9, %%rcx ;"
+-              "movq %%rcx,  96(%0) ;"
+-              "adcq %%r11, %%r15 ;"
+-              "movq %%r15, 104(%0) ;"
+-              "adcq %%r13, %%rax ;"
+-              "movq %%rax, 112(%0) ;"
+-              "adcq    $0, %%rbx ;"
+-              "movq %%rbx, 120(%0) ;"
+-              :
+-              : "r"(c), "r"(a), "r"(b)
+-              : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9",
+-                "%r10", "%r11", "%r13", "%r15");
++      u64 x = a ^ b;
++      u64 minus_x = ~x + (u64)1U;
++      u64 x_or_minus_x = x | minus_x;
++      u64 xnx = x_or_minus_x >> (u32)63U;
++      return xnx - (u64)1U;
+ }
+ 
+-static void sqr2_256x256_integer_adx(u64 *const c, const u64 *const a)
++static __always_inline u64 gte_mask(u64 a, u64 b)
+ {
+-      asm volatile(
+-              "movq   (%1), %%rdx        ;" /* A[0]      */
+-              "mulx  8(%1),  %%r8, %%r14 ;" /* A[1]*A[0] */
+-              "xorl %%r15d, %%r15d;"
+-              "mulx 16(%1),  %%r9, %%r10 ;" /* A[2]*A[0] */
+-              "adcx %%r14,  %%r9 ;"
+-              "mulx 24(%1), %%rax, %%rcx ;" /* A[3]*A[0] */
+-              "adcx %%rax, %%r10 ;"
+-              "movq 24(%1), %%rdx        ;" /* A[3]      */
+-              "mulx  8(%1), %%r11, %%rbx ;" /* A[1]*A[3] */
+-              "adcx %%rcx, %%r11 ;"
+-              "mulx 16(%1), %%rax, %%r13 ;" /* A[2]*A[3] */
+-              "adcx %%rax, %%rbx ;"
+-              "movq  8(%1), %%rdx        ;" /* A[1]      */
+-              "adcx %%r15, %%r13 ;"
+-              "mulx 16(%1), %%rax, %%rcx ;" /* A[2]*A[1] */
+-              "movq    $0, %%r14 ;"
+-              /******************************************/
+-              "adcx %%r15, %%r14 ;"
+-
+-              "xorl %%r15d, %%r15d;"
+-              "adox %%rax, %%r10 ;"
+-              "adcx  %%r8,  %%r8 ;"
+-              "adox %%rcx, %%r11 ;"
+-              "adcx  %%r9,  %%r9 ;"
+-              "adox %%r15, %%rbx ;"
+-              "adcx %%r10, %%r10 ;"
+-              "adox %%r15, %%r13 ;"
+-              "adcx %%r11, %%r11 ;"
+-              "adox %%r15, %%r14 ;"
+-              "adcx %%rbx, %%rbx ;"
+-              "adcx %%r13, %%r13 ;"
+-              "adcx %%r14, %%r14 ;"
+-
+-              "movq   (%1), %%rdx ;"
+-              "mulx %%rdx, %%rax, %%rcx ;" /* A[0]^2 */
+-              /*******************/
+-              "movq %%rax,  0(%0) ;"
+-              "addq %%rcx,  %%r8 ;"
+-              "movq  %%r8,  8(%0) ;"
+-              "movq  8(%1), %%rdx ;"
+-              "mulx %%rdx, %%rax, %%rcx ;" /* A[1]^2 */
+-              "adcq %%rax,  %%r9 ;"
+-              "movq  %%r9, 16(%0) ;"
+-              "adcq %%rcx, %%r10 ;"
+-              "movq %%r10, 24(%0) ;"
+-              "movq 16(%1), %%rdx ;"
+-              "mulx %%rdx, %%rax, %%rcx ;" /* A[2]^2 */
+-              "adcq %%rax, %%r11 ;"
+-              "movq %%r11, 32(%0) ;"
+-              "adcq %%rcx, %%rbx ;"
+-              "movq %%rbx, 40(%0) ;"
+-              "movq 24(%1), %%rdx ;"
+-              "mulx %%rdx, %%rax, %%rcx ;" /* A[3]^2 */
+-              "adcq %%rax, %%r13 ;"
+-              "movq %%r13, 48(%0) ;"
+-              "adcq %%rcx, %%r14 ;"
+-              "movq %%r14, 56(%0) ;"
+-
+-
+-              "movq 32(%1), %%rdx        ;" /* B[0]      */
+-              "mulx 40(%1),  %%r8, %%r14 ;" /* B[1]*B[0] */
+-              "xorl %%r15d, %%r15d;"
+-              "mulx 48(%1),  %%r9, %%r10 ;" /* B[2]*B[0] */
+-              "adcx %%r14,  %%r9 ;"
+-              "mulx 56(%1), %%rax, %%rcx ;" /* B[3]*B[0] */
+-              "adcx %%rax, %%r10 ;"
+-              "movq 56(%1), %%rdx        ;" /* B[3]      */
+-              "mulx 40(%1), %%r11, %%rbx ;" /* B[1]*B[3] */
+-              "adcx %%rcx, %%r11 ;"
+-              "mulx 48(%1), %%rax, %%r13 ;" /* B[2]*B[3] */
+-              "adcx %%rax, %%rbx ;"
+-              "movq 40(%1), %%rdx        ;" /* B[1]      */
+-              "adcx %%r15, %%r13 ;"
+-              "mulx 48(%1), %%rax, %%rcx ;" /* B[2]*B[1] */
+-              "movq    $0, %%r14 ;"
+-              /******************************************/
+-              "adcx %%r15, %%r14 ;"
+-
+-              "xorl %%r15d, %%r15d;"
+-              "adox %%rax, %%r10 ;"
+-              "adcx  %%r8,  %%r8 ;"
+-              "adox %%rcx, %%r11 ;"
+-              "adcx  %%r9,  %%r9 ;"
+-              "adox %%r15, %%rbx ;"
+-              "adcx %%r10, %%r10 ;"
+-              "adox %%r15, %%r13 ;"
+-              "adcx %%r11, %%r11 ;"
+-              "adox %%r15, %%r14 ;"
+-              "adcx %%rbx, %%rbx ;"
+-              "adcx %%r13, %%r13 ;"
+-              "adcx %%r14, %%r14 ;"
+-
+-              "movq 32(%1), %%rdx ;"
+-              "mulx %%rdx, %%rax, %%rcx ;" /* B[0]^2 */
+-              /*******************/
+-              "movq %%rax,  64(%0) ;"
+-              "addq %%rcx,  %%r8 ;"
+-              "movq  %%r8,  72(%0) ;"
+-              "movq 40(%1), %%rdx ;"
+-              "mulx %%rdx, %%rax, %%rcx ;" /* B[1]^2 */
+-              "adcq %%rax,  %%r9 ;"
+-              "movq  %%r9,  80(%0) ;"
+-              "adcq %%rcx, %%r10 ;"
+-              "movq %%r10,  88(%0) ;"
+-              "movq 48(%1), %%rdx ;"
+-              "mulx %%rdx, %%rax, %%rcx ;" /* B[2]^2 */
+-              "adcq %%rax, %%r11 ;"
+-              "movq %%r11,  96(%0) ;"
+-              "adcq %%rcx, %%rbx ;"
+-              "movq %%rbx, 104(%0) ;"
+-              "movq 56(%1), %%rdx ;"
+-              "mulx %%rdx, %%rax, %%rcx ;" /* B[3]^2 */
+-              "adcq %%rax, %%r13 ;"
+-              "movq %%r13, 112(%0) ;"
+-              "adcq %%rcx, %%r14 ;"
+-              "movq %%r14, 120(%0) ;"
+-              :
+-              : "r"(c), "r"(a)
+-              : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9",
+-                "%r10", "%r11", "%r13", "%r14", "%r15");
++      u64 x = a;
++      u64 y = b;
++      u64 x_xor_y = x ^ y;
++      u64 x_sub_y = x - y;
++      u64 x_sub_y_xor_y = x_sub_y ^ y;
++      u64 q = x_xor_y | x_sub_y_xor_y;
++      u64 x_xor_q = x ^ q;
++      u64 x_xor_q_ = x_xor_q >> (u32)63U;
++      return x_xor_q_ - (u64)1U;
+ }
+ 
+-static void sqr2_256x256_integer_bmi2(u64 *const c, const u64 *const a)
++/* Computes the addition of four-element f1 with value in f2
++ * and returns the carry (if any) */
++static inline u64 add_scalar(u64 *out, const u64 *f1, u64 f2)
+ {
+-      asm volatile(
+-              "movq  8(%1), %%rdx        ;" /* A[1]      */
+-              "mulx   (%1),  %%r8,  %%r9 ;" /* A[0]*A[1] */
+-              "mulx 16(%1), %%r10, %%r11 ;" /* A[2]*A[1] */
+-              "mulx 24(%1), %%rcx, %%r14 ;" /* A[3]*A[1] */
+-
+-              "movq 16(%1), %%rdx        ;" /* A[2]      */
+-              "mulx 24(%1), %%r15, %%r13 ;" /* A[3]*A[2] */
+-              "mulx   (%1), %%rax, %%rdx ;" /* A[0]*A[2] */
+-
+-              "addq %%rax,  %%r9 ;"
+-              "adcq %%rdx, %%r10 ;"
+-              "adcq %%rcx, %%r11 ;"
+-              "adcq %%r14, %%r15 ;"
+-              "adcq    $0, %%r13 ;"
+-              "movq    $0, %%r14 ;"
+-              "adcq    $0, %%r14 ;"
+-
+-              "movq   (%1), %%rdx        ;" /* A[0]      */
+-              "mulx 24(%1), %%rax, %%rcx ;" /* A[0]*A[3] */
+-
+-              "addq %%rax, %%r10 ;"
+-              "adcq %%rcx, %%r11 ;"
+-              "adcq    $0, %%r15 ;"
+-              "adcq    $0, %%r13 ;"
+-              "adcq    $0, %%r14 ;"
+-
+-              "shldq $1, %%r13, %%r14 ;"
+-              "shldq $1, %%r15, %%r13 ;"
+-              "shldq $1, %%r11, %%r15 ;"
+-              "shldq $1, %%r10, %%r11 ;"
+-              "shldq $1,  %%r9, %%r10 ;"
+-              "shldq $1,  %%r8,  %%r9 ;"
+-              "shlq  $1,  %%r8        ;"
+-
+-              /*******************/
+-              "mulx %%rdx, %%rax, %%rcx ; " /* A[0]^2 */
+-              /*******************/
+-              "movq %%rax,  0(%0) ;"
+-              "addq %%rcx,  %%r8 ;"
+-              "movq  %%r8,  8(%0) ;"
+-              "movq  8(%1), %%rdx ;"
+-              "mulx %%rdx, %%rax, %%rcx ; " /* A[1]^2 */
+-              "adcq %%rax,  %%r9 ;"
+-              "movq  %%r9, 16(%0) ;"
+-              "adcq %%rcx, %%r10 ;"
+-              "movq %%r10, 24(%0) ;"
+-              "movq 16(%1), %%rdx ;"
+-              "mulx %%rdx, %%rax, %%rcx ; " /* A[2]^2 */
+-              "adcq %%rax, %%r11 ;"
+-              "movq %%r11, 32(%0) ;"
+-              "adcq %%rcx, %%r15 ;"
+-              "movq %%r15, 40(%0) ;"
+-              "movq 24(%1), %%rdx ;"
+-              "mulx %%rdx, %%rax, %%rcx ; " /* A[3]^2 */
+-              "adcq %%rax, %%r13 ;"
+-              "movq %%r13, 48(%0) ;"
+-              "adcq %%rcx, %%r14 ;"
+-              "movq %%r14, 56(%0) ;"
+-
+-              "movq 40(%1), %%rdx        ;" /* B[1]      */
+-              "mulx 32(%1),  %%r8,  %%r9 ;" /* B[0]*B[1] */
+-              "mulx 48(%1), %%r10, %%r11 ;" /* B[2]*B[1] */
+-              "mulx 56(%1), %%rcx, %%r14 ;" /* B[3]*B[1] */
+-
+-              "movq 48(%1), %%rdx        ;" /* B[2]      */
+-              "mulx 56(%1), %%r15, %%r13 ;" /* B[3]*B[2] */
+-              "mulx 32(%1), %%rax, %%rdx ;" /* B[0]*B[2] */
+-
+-              "addq %%rax,  %%r9 ;"
+-              "adcq %%rdx, %%r10 ;"
+-              "adcq %%rcx, %%r11 ;"
+-              "adcq %%r14, %%r15 ;"
+-              "adcq    $0, %%r13 ;"
+-              "movq    $0, %%r14 ;"
+-              "adcq    $0, %%r14 ;"
+-
+-              "movq 32(%1), %%rdx        ;" /* B[0]      */
+-              "mulx 56(%1), %%rax, %%rcx ;" /* B[0]*B[3] */
+-
+-              "addq %%rax, %%r10 ;"
+-              "adcq %%rcx, %%r11 ;"
+-              "adcq    $0, %%r15 ;"
+-              "adcq    $0, %%r13 ;"
+-              "adcq    $0, %%r14 ;"
+-
+-              "shldq $1, %%r13, %%r14 ;"
+-              "shldq $1, %%r15, %%r13 ;"
+-              "shldq $1, %%r11, %%r15 ;"
+-              "shldq $1, %%r10, %%r11 ;"
+-              "shldq $1,  %%r9, %%r10 ;"
+-              "shldq $1,  %%r8,  %%r9 ;"
+-              "shlq  $1,  %%r8        ;"
+-
+-              /*******************/
+-              "mulx %%rdx, %%rax, %%rcx ; " /* B[0]^2 */
+-              /*******************/
+-              "movq %%rax,  64(%0) ;"
+-              "addq %%rcx,  %%r8 ;"
+-              "movq  %%r8,  72(%0) ;"
+-              "movq 40(%1), %%rdx ;"
+-              "mulx %%rdx, %%rax, %%rcx ; " /* B[1]^2 */
+-              "adcq %%rax,  %%r9 ;"
+-              "movq  %%r9,  80(%0) ;"
+-              "adcq %%rcx, %%r10 ;"
+-              "movq %%r10,  88(%0) ;"
+-              "movq 48(%1), %%rdx ;"
+-              "mulx %%rdx, %%rax, %%rcx ; " /* B[2]^2 */
+-              "adcq %%rax, %%r11 ;"
+-              "movq %%r11,  96(%0) ;"
+-              "adcq %%rcx, %%r15 ;"
+-              "movq %%r15, 104(%0) ;"
+-              "movq 56(%1), %%rdx ;"
+-              "mulx %%rdx, %%rax, %%rcx ; " /* B[3]^2 */
+-              "adcq %%rax, %%r13 ;"
+-              "movq %%r13, 112(%0) ;"
+-              "adcq %%rcx, %%r14 ;"
+-              "movq %%r14, 120(%0) ;"
+-              :
+-              : "r"(c), "r"(a)
+-              : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10",
+-                "%r11", "%r13", "%r14", "%r15");
+-}
++      u64 carry_r;
+ 
+-static void red_eltfp25519_2w_adx(u64 *const c, const u64 *const a)
+-{
+       asm volatile(
+-              "movl    $38, %%edx; "  /* 2*c = 38 = 2^256 */
+-              "mulx 32(%1),  %%r8, %%r10; " /* c*C[4] */
+-              "xorl %%ebx, %%ebx ;"
+-              "adox   (%1),  %%r8 ;"
+-              "mulx 40(%1),  %%r9, %%r11; " /* c*C[5] */
+-              "adcx %%r10,  %%r9 ;"
+-              "adox  8(%1),  %%r9 ;"
+-              "mulx 48(%1), %%r10, %%rax; " /* c*C[6] */
+-              "adcx %%r11, %%r10 ;"
+-              "adox 16(%1), %%r10 ;"
+-              "mulx 56(%1), %%r11, %%rcx; " /* c*C[7] */
+-              "adcx %%rax, %%r11 ;"
+-              "adox 24(%1), %%r11 ;"
+-              /***************************************/
+-              "adcx %%rbx, %%rcx ;"
+-              "adox  %%rbx, %%rcx ;"
+-              "imul %%rdx, %%rcx ;" /* c*C[4], cf=0, of=0 */
+-              "adcx %%rcx,  %%r8 ;"
+-              "adcx %%rbx,  %%r9 ;"
+-              "movq  %%r9,  8(%0) ;"
+-              "adcx %%rbx, %%r10 ;"
+-              "movq %%r10, 16(%0) ;"
+-              "adcx %%rbx, %%r11 ;"
+-              "movq %%r11, 24(%0) ;"
+-              "mov     $0, %%ecx ;"
+-              "cmovc %%edx, %%ecx ;"
+-              "addq %%rcx,  %%r8 ;"
+-              "movq  %%r8,   (%0) ;"
+-
+-              "mulx  96(%1),  %%r8, %%r10; " /* c*C[4] */
+-              "xorl %%ebx, %%ebx ;"
+-              "adox 64(%1),  %%r8 ;"
+-              "mulx 104(%1),  %%r9, %%r11; " /* c*C[5] */
+-              "adcx %%r10,  %%r9 ;"
+-              "adox 72(%1),  %%r9 ;"
+-              "mulx 112(%1), %%r10, %%rax; " /* c*C[6] */
+-              "adcx %%r11, %%r10 ;"
+-              "adox 80(%1), %%r10 ;"
+-              "mulx 120(%1), %%r11, %%rcx; " /* c*C[7] */
+-              "adcx %%rax, %%r11 ;"
+-              "adox 88(%1), %%r11 ;"
+-              /****************************************/
+-              "adcx %%rbx, %%rcx ;"
+-              "adox  %%rbx, %%rcx ;"
+-              "imul %%rdx, %%rcx ;" /* c*C[4], cf=0, of=0 */
+-              "adcx %%rcx,  %%r8 ;"
+-              "adcx %%rbx,  %%r9 ;"
+-              "movq  %%r9, 40(%0) ;"
+-              "adcx %%rbx, %%r10 ;"
+-              "movq %%r10, 48(%0) ;"
+-              "adcx %%rbx, %%r11 ;"
+-              "movq %%r11, 56(%0) ;"
+-              "mov     $0, %%ecx ;"
+-              "cmovc %%edx, %%ecx ;"
+-              "addq %%rcx,  %%r8 ;"
+-              "movq  %%r8, 32(%0) ;"
+-              :
+-              : "r"(c), "r"(a)
+-              : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9",
+-                "%r10", "%r11");
+-}
++              /* Clear registers to propagate the carry bit */
++              "  xor %%r8, %%r8;"
++              "  xor %%r9, %%r9;"
++              "  xor %%r10, %%r10;"
++              "  xor %%r11, %%r11;"
++              "  xor %1, %1;"
++
++              /* Begin addition chain */
++              "  addq 0(%3), %0;"
++              "  movq %0, 0(%2);"
++              "  adcxq 8(%3), %%r8;"
++              "  movq %%r8, 8(%2);"
++              "  adcxq 16(%3), %%r9;"
++              "  movq %%r9, 16(%2);"
++              "  adcxq 24(%3), %%r10;"
++              "  movq %%r10, 24(%2);"
++
++              /* Return the carry bit in a register */
++              "  adcx %%r11, %1;"
++      : "+&r" (f2), "=&r" (carry_r)
++      : "r" (out), "r" (f1)
++      : "%r8", "%r9", "%r10", "%r11", "memory", "cc"
++      );
+ 
+-static void red_eltfp25519_2w_bmi2(u64 *const c, const u64 *const a)
+-{
+-      asm volatile(
+-              "movl    $38, %%edx ; "       /* 2*c = 38 = 2^256 */
+-              "mulx 32(%1),  %%r8, %%r10 ;" /* c*C[4] */
+-              "mulx 40(%1),  %%r9, %%r11 ;" /* c*C[5] */
+-              "addq %%r10,  %%r9 ;"
+-              "mulx 48(%1), %%r10, %%rax ;" /* c*C[6] */
+-              "adcq %%r11, %%r10 ;"
+-              "mulx 56(%1), %%r11, %%rcx ;" /* c*C[7] */
+-              "adcq %%rax, %%r11 ;"
+-              /***************************************/
+-              "adcq    $0, %%rcx ;"
+-              "addq   (%1),  %%r8 ;"
+-              "adcq  8(%1),  %%r9 ;"
+-              "adcq 16(%1), %%r10 ;"
+-              "adcq 24(%1), %%r11 ;"
+-              "adcq     $0, %%rcx ;"
+-              "imul %%rdx, %%rcx ;" /* c*C[4], cf=0 */
+-              "addq %%rcx,  %%r8 ;"
+-              "adcq    $0,  %%r9 ;"
+-              "movq  %%r9,  8(%0) ;"
+-              "adcq    $0, %%r10 ;"
+-              "movq %%r10, 16(%0) ;"
+-              "adcq    $0, %%r11 ;"
+-              "movq %%r11, 24(%0) ;"
+-              "mov     $0, %%ecx ;"
+-              "cmovc %%edx, %%ecx ;"
+-              "addq %%rcx,  %%r8 ;"
+-              "movq  %%r8,   (%0) ;"
+-
+-              "mulx  96(%1),  %%r8, %%r10 ;" /* c*C[4] */
+-              "mulx 104(%1),  %%r9, %%r11 ;" /* c*C[5] */
+-              "addq %%r10,  %%r9 ;"
+-              "mulx 112(%1), %%r10, %%rax ;" /* c*C[6] */
+-              "adcq %%r11, %%r10 ;"
+-              "mulx 120(%1), %%r11, %%rcx ;" /* c*C[7] */
+-              "adcq %%rax, %%r11 ;"
+-              /****************************************/
+-              "adcq    $0, %%rcx ;"
+-              "addq 64(%1),  %%r8 ;"
+-              "adcq 72(%1),  %%r9 ;"
+-              "adcq 80(%1), %%r10 ;"
+-              "adcq 88(%1), %%r11 ;"
+-              "adcq     $0, %%rcx ;"
+-              "imul %%rdx, %%rcx ;" /* c*C[4], cf=0 */
+-              "addq %%rcx,  %%r8 ;"
+-              "adcq    $0,  %%r9 ;"
+-              "movq  %%r9, 40(%0) ;"
+-              "adcq    $0, %%r10 ;"
+-              "movq %%r10, 48(%0) ;"
+-              "adcq    $0, %%r11 ;"
+-              "movq %%r11, 56(%0) ;"
+-              "mov     $0, %%ecx ;"
+-              "cmovc %%edx, %%ecx ;"
+-              "addq %%rcx,  %%r8 ;"
+-              "movq  %%r8, 32(%0) ;"
+-              :
+-              : "r"(c), "r"(a)
+-              : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10",
+-                "%r11");
++      return carry_r;
+ }
+ 
+-static void mul_256x256_integer_adx(u64 *const c, const u64 *const a,
+-                                  const u64 *const b)
++/* Computes the field addition of two field elements */
++static inline void fadd(u64 *out, const u64 *f1, const u64 *f2)
+ {
+       asm volatile(
+-              "movq   (%1), %%rdx; "  /* A[0] */
+-              "mulx   (%2),  %%r8,  %%r9; " /* A[0]*B[0] */
+-              "xorl %%r10d, %%r10d ;"
+-              "movq  %%r8,  (%0) ;"
+-              "mulx  8(%2), %%r10, %%r11; " /* A[0]*B[1] */
+-              "adox  %%r9, %%r10 ;"
+-              "movq %%r10, 8(%0) ;"
+-              "mulx 16(%2), %%r15, %%r13; " /* A[0]*B[2] */
+-              "adox %%r11, %%r15 ;"
+-              "mulx 24(%2), %%r14, %%rdx; " /* A[0]*B[3] */
+-              "adox %%r13, %%r14 ;"
+-              "movq $0, %%rax ;"
+-              /******************************************/
+-              "adox %%rdx, %%rax ;"
+-
+-              "movq  8(%1), %%rdx; "  /* A[1] */
+-              "mulx   (%2),  %%r8,  %%r9; " /* A[1]*B[0] */
+-              "xorl %%r10d, %%r10d ;"
+-              "adcx 8(%0),  %%r8 ;"
+-              "movq  %%r8,  8(%0) ;"
+-              "mulx  8(%2), %%r10, %%r11; " /* A[1]*B[1] */
+-              "adox  %%r9, %%r10 ;"
+-              "adcx %%r15, %%r10 ;"
+-              "movq %%r10, 16(%0) ;"
+-              "mulx 16(%2), %%r15, %%r13; " /* A[1]*B[2] */
+-              "adox %%r11, %%r15 ;"
+-              "adcx %%r14, %%r15 ;"
+-              "movq $0, %%r8  ;"
+-              "mulx 24(%2), %%r14, %%rdx; " /* A[1]*B[3] */
+-              "adox %%r13, %%r14 ;"
+-              "adcx %%rax, %%r14 ;"
+-              "movq $0, %%rax ;"
+-              /******************************************/
+-              "adox %%rdx, %%rax ;"
+-              "adcx  %%r8, %%rax ;"
+-
+-              "movq 16(%1), %%rdx; "  /* A[2] */
+-              "mulx   (%2),  %%r8,  %%r9; " /* A[2]*B[0] */
+-              "xorl %%r10d, %%r10d ;"
+-              "adcx 16(%0), %%r8 ;"
+-              "movq  %%r8, 16(%0) ;"
+-              "mulx  8(%2), %%r10, %%r11; " /* A[2]*B[1] */
+-              "adox  %%r9, %%r10 ;"
+-              "adcx %%r15, %%r10 ;"
+-              "movq %%r10, 24(%0) ;"
+-              "mulx 16(%2), %%r15, %%r13; " /* A[2]*B[2] */
+-              "adox %%r11, %%r15 ;"
+-              "adcx %%r14, %%r15 ;"
+-              "movq $0, %%r8  ;"
+-              "mulx 24(%2), %%r14, %%rdx; " /* A[2]*B[3] */
+-              "adox %%r13, %%r14 ;"
+-              "adcx %%rax, %%r14 ;"
+-              "movq $0, %%rax ;"
+-              /******************************************/
+-              "adox %%rdx, %%rax ;"
+-              "adcx  %%r8, %%rax ;"
+-
+-              "movq 24(%1), %%rdx; "  /* A[3] */
+-              "mulx   (%2),  %%r8,  %%r9; " /* A[3]*B[0] */
+-              "xorl %%r10d, %%r10d ;"
+-              "adcx 24(%0), %%r8 ;"
+-              "movq  %%r8, 24(%0) ;"
+-              "mulx  8(%2), %%r10, %%r11; " /* A[3]*B[1] */
+-              "adox  %%r9, %%r10 ;"
+-              "adcx %%r15, %%r10 ;"
+-              "movq %%r10, 32(%0) ;"
+-              "mulx 16(%2), %%r15, %%r13; " /* A[3]*B[2] */
+-              "adox %%r11, %%r15 ;"
+-              "adcx %%r14, %%r15 ;"
+-              "movq %%r15, 40(%0) ;"
+-              "movq $0, %%r8  ;"
+-              "mulx 24(%2), %%r14, %%rdx; " /* A[3]*B[3] */
+-              "adox %%r13, %%r14 ;"
+-              "adcx %%rax, %%r14 ;"
+-              "movq %%r14, 48(%0) ;"
+-              "movq $0, %%rax ;"
+-              /******************************************/
+-              "adox %%rdx, %%rax ;"
+-              "adcx  %%r8, %%rax ;"
+-              "movq %%rax, 56(%0) ;"
+-              :
+-              : "r"(c), "r"(a), "r"(b)
+-              : "memory", "cc", "%rax", "%rdx", "%r8", "%r9", "%r10", "%r11",
+-                "%r13", "%r14", "%r15");
++              /* Compute the raw addition of f1 + f2 */
++              "  movq 0(%0), %%r8;"
++              "  addq 0(%2), %%r8;"
++              "  movq 8(%0), %%r9;"
++              "  adcxq 8(%2), %%r9;"
++              "  movq 16(%0), %%r10;"
++              "  adcxq 16(%2), %%r10;"
++              "  movq 24(%0), %%r11;"
++              "  adcxq 24(%2), %%r11;"
++
++              /* Wrap the result back into the field */
++
++              /* Step 1: Compute carry*38 */
++              "  mov $0, %%rax;"
++              "  mov $38, %0;"
++              "  cmovc %0, %%rax;"
++
++              /* Step 2: Add carry*38 to the original sum */
++              "  xor %%rcx, %%rcx;"
++              "  add %%rax, %%r8;"
++              "  adcx %%rcx, %%r9;"
++              "  movq %%r9, 8(%1);"
++              "  adcx %%rcx, %%r10;"
++              "  movq %%r10, 16(%1);"
++              "  adcx %%rcx, %%r11;"
++              "  movq %%r11, 24(%1);"
++
++              /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
++              "  mov $0, %%rax;"
++              "  cmovc %0, %%rax;"
++              "  add %%rax, %%r8;"
++              "  movq %%r8, 0(%1);"
++      : "+&r" (f2)
++      : "r" (out), "r" (f1)
++      : "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "memory", "cc"
++      );
+ }
+ 
+-static void mul_256x256_integer_bmi2(u64 *const c, const u64 *const a,
+-                                   const u64 *const b)
++/* Computes the field subtraction of two field elements */
++static inline void fsub(u64 *out, const u64 *f1, const u64 *f2)
+ {
+       asm volatile(
+-              "movq   (%1), %%rdx; "  /* A[0] */
+-              "mulx   (%2),  %%r8, %%r15; " /* A[0]*B[0] */
+-              "movq %%r8,  (%0) ;"
+-              "mulx  8(%2), %%r10, %%rax; " /* A[0]*B[1] */
+-              "addq %%r10, %%r15 ;"
+-              "mulx 16(%2),  %%r8, %%rbx; " /* A[0]*B[2] */
+-              "adcq  %%r8, %%rax ;"
+-              "mulx 24(%2), %%r10, %%rcx; " /* A[0]*B[3] */
+-              "adcq %%r10, %%rbx ;"
+-              /******************************************/
+-              "adcq    $0, %%rcx ;"
+-
+-              "movq  8(%1), %%rdx; "  /* A[1] */
+-              "mulx   (%2),  %%r8,  %%r9; " /* A[1]*B[0] */
+-              "addq %%r15,  %%r8 ;"
+-              "movq %%r8, 8(%0) ;"
+-              "mulx  8(%2), %%r10, %%r11; " /* A[1]*B[1] */
+-              "adcq %%r10,  %%r9 ;"
+-              "mulx 16(%2),  %%r8, %%r13; " /* A[1]*B[2] */
+-              "adcq  %%r8, %%r11 ;"
+-              "mulx 24(%2), %%r10, %%r15; " /* A[1]*B[3] */
+-              "adcq %%r10, %%r13 ;"
+-              /******************************************/
+-              "adcq    $0, %%r15 ;"
+-
+-              "addq  %%r9, %%rax ;"
+-              "adcq %%r11, %%rbx ;"
+-              "adcq %%r13, %%rcx ;"
+-              "adcq    $0, %%r15 ;"
+-
+-              "movq 16(%1), %%rdx; "  /* A[2] */
+-              "mulx   (%2),  %%r8,  %%r9; " /* A[2]*B[0] */
+-              "addq %%rax,  %%r8 ;"
+-              "movq %%r8, 16(%0) ;"
+-              "mulx  8(%2), %%r10, %%r11; " /* A[2]*B[1] */
+-              "adcq %%r10,  %%r9 ;"
+-              "mulx 16(%2),  %%r8, %%r13; " /* A[2]*B[2] */
+-              "adcq  %%r8, %%r11 ;"
+-              "mulx 24(%2), %%r10, %%rax; " /* A[2]*B[3] */
+-              "adcq %%r10, %%r13 ;"
+-              /******************************************/
+-              "adcq    $0, %%rax ;"
+-
+-              "addq  %%r9, %%rbx ;"
+-              "adcq %%r11, %%rcx ;"
+-              "adcq %%r13, %%r15 ;"
+-              "adcq    $0, %%rax ;"
+-
+-              "movq 24(%1), %%rdx; "  /* A[3] */
+-              "mulx   (%2),  %%r8,  %%r9; " /* A[3]*B[0] */
+-              "addq %%rbx,  %%r8 ;"
+-              "movq %%r8, 24(%0) ;"
+-              "mulx  8(%2), %%r10, %%r11; " /* A[3]*B[1] */
+-              "adcq %%r10,  %%r9 ;"
+-              "mulx 16(%2),  %%r8, %%r13; " /* A[3]*B[2] */
+-              "adcq  %%r8, %%r11 ;"
+-              "mulx 24(%2), %%r10, %%rbx; " /* A[3]*B[3] */
+-              "adcq %%r10, %%r13 ;"
+-              /******************************************/
+-              "adcq    $0, %%rbx ;"
+-
+-              "addq  %%r9, %%rcx ;"
+-              "movq %%rcx, 32(%0) ;"
+-              "adcq %%r11, %%r15 ;"
+-              "movq %%r15, 40(%0) ;"
+-              "adcq %%r13, %%rax ;"
+-              "movq %%rax, 48(%0) ;"
+-              "adcq    $0, %%rbx ;"
+-              "movq %%rbx, 56(%0) ;"
+-              :
+-              : "r"(c), "r"(a), "r"(b)
+-              : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9",
+-                "%r10", "%r11", "%r13", "%r15");
++              /* Compute the raw subtraction of f1-f2 */
++              "  movq 0(%1), %%r8;"
++              "  subq 0(%2), %%r8;"
++              "  movq 8(%1), %%r9;"
++              "  sbbq 8(%2), %%r9;"
++              "  movq 16(%1), %%r10;"
++              "  sbbq 16(%2), %%r10;"
++              "  movq 24(%1), %%r11;"
++              "  sbbq 24(%2), %%r11;"
++
++              /* Wrap the result back into the field */
++
++              /* Step 1: Compute carry*38 */
++              "  mov $0, %%rax;"
++              "  mov $38, %%rcx;"
++              "  cmovc %%rcx, %%rax;"
++
++              /* Step 2: Subtract carry*38 from the original difference */
++              "  sub %%rax, %%r8;"
++              "  sbb $0, %%r9;"
++              "  sbb $0, %%r10;"
++              "  sbb $0, %%r11;"
++
++              /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
++              "  mov $0, %%rax;"
++              "  cmovc %%rcx, %%rax;"
++              "  sub %%rax, %%r8;"
++
++              /* Store the result */
++              "  movq %%r8, 0(%0);"
++              "  movq %%r9, 8(%0);"
++              "  movq %%r10, 16(%0);"
++              "  movq %%r11, 24(%0);"
++      :
++      : "r" (out), "r" (f1), "r" (f2)
++      : "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "memory", "cc"
++      );
+ }
+ 
+-static void sqr_256x256_integer_adx(u64 *const c, const u64 *const a)
++/* Computes a field multiplication: out <- f1 * f2
++ * Uses the 8-element buffer tmp for intermediate results */
++static inline void fmul(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
+ {
+       asm volatile(
+-              "movq   (%1), %%rdx        ;" /* A[0]      */
+-              "mulx  8(%1),  %%r8, %%r14 ;" /* A[1]*A[0] */
+-              "xorl %%r15d, %%r15d;"
+-              "mulx 16(%1),  %%r9, %%r10 ;" /* A[2]*A[0] */
+-              "adcx %%r14,  %%r9 ;"
+-              "mulx 24(%1), %%rax, %%rcx ;" /* A[3]*A[0] */
+-              "adcx %%rax, %%r10 ;"
+-              "movq 24(%1), %%rdx        ;" /* A[3]      */
+-              "mulx  8(%1), %%r11, %%rbx ;" /* A[1]*A[3] */
+-              "adcx %%rcx, %%r11 ;"
+-              "mulx 16(%1), %%rax, %%r13 ;" /* A[2]*A[3] */
+-              "adcx %%rax, %%rbx ;"
+-              "movq  8(%1), %%rdx        ;" /* A[1]      */
+-              "adcx %%r15, %%r13 ;"
+-              "mulx 16(%1), %%rax, %%rcx ;" /* A[2]*A[1] */
+-              "movq    $0, %%r14 ;"
+-              /******************************************/
+-              "adcx %%r15, %%r14 ;"
+-
+-              "xorl %%r15d, %%r15d;"
+-              "adox %%rax, %%r10 ;"
+-              "adcx  %%r8,  %%r8 ;"
+-              "adox %%rcx, %%r11 ;"
+-              "adcx  %%r9,  %%r9 ;"
+-              "adox %%r15, %%rbx ;"
+-              "adcx %%r10, %%r10 ;"
+-              "adox %%r15, %%r13 ;"
+-              "adcx %%r11, %%r11 ;"
+-              "adox %%r15, %%r14 ;"
+-              "adcx %%rbx, %%rbx ;"
+-              "adcx %%r13, %%r13 ;"
+-              "adcx %%r14, %%r14 ;"
+-
+-              "movq   (%1), %%rdx ;"
+-              "mulx %%rdx, %%rax, %%rcx ;" /* A[0]^2 */
+-              /*******************/
+-              "movq %%rax,  0(%0) ;"
+-              "addq %%rcx,  %%r8 ;"
+-              "movq  %%r8,  8(%0) ;"
+-              "movq  8(%1), %%rdx ;"
+-              "mulx %%rdx, %%rax, %%rcx ;" /* A[1]^2 */
+-              "adcq %%rax,  %%r9 ;"
+-              "movq  %%r9, 16(%0) ;"
+-              "adcq %%rcx, %%r10 ;"
+-              "movq %%r10, 24(%0) ;"
+-              "movq 16(%1), %%rdx ;"
+-              "mulx %%rdx, %%rax, %%rcx ;" /* A[2]^2 */
+-              "adcq %%rax, %%r11 ;"
+-              "movq %%r11, 32(%0) ;"
+-              "adcq %%rcx, %%rbx ;"
+-              "movq %%rbx, 40(%0) ;"
+-              "movq 24(%1), %%rdx ;"
+-              "mulx %%rdx, %%rax, %%rcx ;" /* A[3]^2 */
+-              "adcq %%rax, %%r13 ;"
+-              "movq %%r13, 48(%0) ;"
+-              "adcq %%rcx, %%r14 ;"
+-              "movq %%r14, 56(%0) ;"
+-              :
+-              : "r"(c), "r"(a)
+-              : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9",
+-                "%r10", "%r11", "%r13", "%r14", "%r15");
+-}
++              /* Compute the raw multiplication: tmp <- src1 * src2 */
+ 
+-static void sqr_256x256_integer_bmi2(u64 *const c, const u64 *const a)
+-{
+-      asm volatile(
+-              "movq  8(%1), %%rdx        ;" /* A[1]      */
+-              "mulx   (%1),  %%r8,  %%r9 ;" /* A[0]*A[1] */
+-              "mulx 16(%1), %%r10, %%r11 ;" /* A[2]*A[1] */
+-              "mulx 24(%1), %%rcx, %%r14 ;" /* A[3]*A[1] */
+-
+-              "movq 16(%1), %%rdx        ;" /* A[2]      */
+-              "mulx 24(%1), %%r15, %%r13 ;" /* A[3]*A[2] */
+-              "mulx   (%1), %%rax, %%rdx ;" /* A[0]*A[2] */
+-
+-              "addq %%rax,  %%r9 ;"
+-              "adcq %%rdx, %%r10 ;"
+-              "adcq %%rcx, %%r11 ;"
+-              "adcq %%r14, %%r15 ;"
+-              "adcq    $0, %%r13 ;"
+-              "movq    $0, %%r14 ;"
+-              "adcq    $0, %%r14 ;"
+-
+-              "movq   (%1), %%rdx        ;" /* A[0]      */
+-              "mulx 24(%1), %%rax, %%rcx ;" /* A[0]*A[3] */
+-
+-              "addq %%rax, %%r10 ;"
+-              "adcq %%rcx, %%r11 ;"
+-              "adcq    $0, %%r15 ;"
+-              "adcq    $0, %%r13 ;"
+-              "adcq    $0, %%r14 ;"
+-
+-              "shldq $1, %%r13, %%r14 ;"
+-              "shldq $1, %%r15, %%r13 ;"
+-              "shldq $1, %%r11, %%r15 ;"
+-              "shldq $1, %%r10, %%r11 ;"
+-              "shldq $1,  %%r9, %%r10 ;"
+-              "shldq $1,  %%r8,  %%r9 ;"
+-              "shlq  $1,  %%r8        ;"
+-
+-              /*******************/
+-              "mulx %%rdx, %%rax, %%rcx ;" /* A[0]^2 */
+-              /*******************/
+-              "movq %%rax,  0(%0) ;"
+-              "addq %%rcx,  %%r8 ;"
+-              "movq  %%r8,  8(%0) ;"
+-              "movq  8(%1), %%rdx ;"
+-              "mulx %%rdx, %%rax, %%rcx ;" /* A[1]^2 */
+-              "adcq %%rax,  %%r9 ;"
+-              "movq  %%r9, 16(%0) ;"
+-              "adcq %%rcx, %%r10 ;"
+-              "movq %%r10, 24(%0) ;"
+-              "movq 16(%1), %%rdx ;"
+-              "mulx %%rdx, %%rax, %%rcx ;" /* A[2]^2 */
+-              "adcq %%rax, %%r11 ;"
+-              "movq %%r11, 32(%0) ;"
+-              "adcq %%rcx, %%r15 ;"
+-              "movq %%r15, 40(%0) ;"
+-              "movq 24(%1), %%rdx ;"
+-              "mulx %%rdx, %%rax, %%rcx ;" /* A[3]^2 */
+-              "adcq %%rax, %%r13 ;"
+-              "movq %%r13, 48(%0) ;"
+-              "adcq %%rcx, %%r14 ;"
+-              "movq %%r14, 56(%0) ;"
+-              :
+-              : "r"(c), "r"(a)
+-              : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10",
+-                "%r11", "%r13", "%r14", "%r15");
++              /* Compute src1[0] * src2 */
++              "  movq 0(%1), %%rdx;"
++              "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"     "  movq %%r8, 0(%0);"
++              "  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  movq %%r10, 8(%0);"
++              "  mulxq 16(%3), %%r12, %%r13;"    "  adox %%r11, %%r12;"
++              "  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  mov $0, %%rax;"
++                                                 "  adox %%rdx, %%rax;"
++              /* Compute src1[1] * src2 */
++              "  movq 8(%1), %%rdx;"
++              "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"     "  adcxq 8(%0), %%r8;"    "  movq %%r8, 8(%0);"
++              "  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%r12, %%r10;"    "  movq %%r10, 16(%0);"
++              "  mulxq 16(%3), %%r12, %%r13;"    "  adox %%r11, %%r12;"    "  adcx %%r14, %%r12;"    "  mov $0, %%r8;"
++              "  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
++                                                 "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"
++              /* Compute src1[2] * src2 */
++              "  movq 16(%1), %%rdx;"
++              "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"    "  adcxq 16(%0), %%r8;"    "  movq %%r8, 16(%0);"
++              "  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%r12, %%r10;"    "  movq %%r10, 24(%0);"
++              "  mulxq 16(%3), %%r12, %%r13;"    "  adox %%r11, %%r12;"    "  adcx %%r14, %%r12;"    "  mov $0, %%r8;"
++              "  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
++                                                 "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"
++              /* Compute src1[3] * src2 */
++              "  movq 24(%1), %%rdx;"
++              "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"    "  adcxq 24(%0), %%r8;"    "  movq %%r8, 24(%0);"
++              "  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%r12, %%r10;"    "  movq %%r10, 32(%0);"
++              "  mulxq 16(%3), %%r12, %%r13;"    "  adox %%r11, %%r12;"    "  adcx %%r14, %%r12;"    "  movq %%r12, 40(%0);"    "  mov $0, %%r8;"
++              "  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  movq %%r14, 48(%0);"    "  mov $0, %%rax;"
++                                                 "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"     "  movq %%rax, 56(%0);"
++              /* Line up pointers */
++              "  mov %0, %1;"
++              "  mov %2, %0;"
++
++              /* Wrap the result back into the field */
++
++              /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
++              "  mov $38, %%rdx;"
++              "  mulxq 32(%1), %%r8, %%r13;"
++              "  xor %3, %3;"
++              "  adoxq 0(%1), %%r8;"
++              "  mulxq 40(%1), %%r9, %%r12;"
++              "  adcx %%r13, %%r9;"
++              "  adoxq 8(%1), %%r9;"
++              "  mulxq 48(%1), %%r10, %%r13;"
++              "  adcx %%r12, %%r10;"
++              "  adoxq 16(%1), %%r10;"
++              "  mulxq 56(%1), %%r11, %%rax;"
++              "  adcx %%r13, %%r11;"
++              "  adoxq 24(%1), %%r11;"
++              "  adcx %3, %%rax;"
++              "  adox %3, %%rax;"
++              "  imul %%rdx, %%rax;"
++
++              /* Step 2: Fold the carry back into dst */
++              "  add %%rax, %%r8;"
++              "  adcx %3, %%r9;"
++              "  movq %%r9, 8(%0);"
++              "  adcx %3, %%r10;"
++              "  movq %%r10, 16(%0);"
++              "  adcx %3, %%r11;"
++              "  movq %%r11, 24(%0);"
++
++              /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
++              "  mov $0, %%rax;"
++              "  cmovc %%rdx, %%rax;"
++              "  add %%rax, %%r8;"
++              "  movq %%r8, 0(%0);"
++      : "+&r" (tmp), "+&r" (f1), "+&r" (out), "+&r" (f2)
++      :
++      : "%rax", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "memory", "cc"
++      );
+ }
+ 
+-static void red_eltfp25519_1w_adx(u64 *const c, const u64 *const a)
++/* Computes two field multiplications:
++ * out[0] <- f1[0] * f2[0]
++ * out[1] <- f1[1] * f2[1]
++ * Uses the 16-element buffer tmp for intermediate results. */
++static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
+ {
+       asm volatile(
+-              "movl    $38, %%edx ;"  /* 2*c = 38 = 2^256 */
+-              "mulx 32(%1),  %%r8, %%r10 ;" /* c*C[4] */
+-              "xorl %%ebx, %%ebx ;"
+-              "adox   (%1),  %%r8 ;"
+-              "mulx 40(%1),  %%r9, %%r11 ;" /* c*C[5] */
+-              "adcx %%r10,  %%r9 ;"
+-              "adox  8(%1),  %%r9 ;"
+-              "mulx 48(%1), %%r10, %%rax ;" /* c*C[6] */
+-              "adcx %%r11, %%r10 ;"
+-              "adox 16(%1), %%r10 ;"
+-              "mulx 56(%1), %%r11, %%rcx ;" /* c*C[7] */
+-              "adcx %%rax, %%r11 ;"
+-              "adox 24(%1), %%r11 ;"
+-              /***************************************/
+-              "adcx %%rbx, %%rcx ;"
+-              "adox  %%rbx, %%rcx ;"
+-              "imul %%rdx, %%rcx ;" /* c*C[4], cf=0, of=0 */
+-              "adcx %%rcx,  %%r8 ;"
+-              "adcx %%rbx,  %%r9 ;"
+-              "movq  %%r9,  8(%0) ;"
+-              "adcx %%rbx, %%r10 ;"
+-              "movq %%r10, 16(%0) ;"
+-              "adcx %%rbx, %%r11 ;"
+-              "movq %%r11, 24(%0) ;"
+-              "mov     $0, %%ecx ;"
+-              "cmovc %%edx, %%ecx ;"
+-              "addq %%rcx,  %%r8 ;"
+-              "movq  %%r8,   (%0) ;"
+-              :
+-              : "r"(c), "r"(a)
+-              : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9",
+-                "%r10", "%r11");
+-}
++              /* Compute the raw multiplication tmp[0] <- f1[0] * f2[0] */
+ 
+-static void red_eltfp25519_1w_bmi2(u64 *const c, const u64 *const a)
+-{
+-      asm volatile(
+-              "movl    $38, %%edx ;"  /* 2*c = 38 = 2^256 */
+-              "mulx 32(%1),  %%r8, %%r10 ;" /* c*C[4] */
+-              "mulx 40(%1),  %%r9, %%r11 ;" /* c*C[5] */
+-              "addq %%r10,  %%r9 ;"
+-              "mulx 48(%1), %%r10, %%rax ;" /* c*C[6] */
+-              "adcq %%r11, %%r10 ;"
+-              "mulx 56(%1), %%r11, %%rcx ;" /* c*C[7] */
+-              "adcq %%rax, %%r11 ;"
+-              /***************************************/
+-              "adcq    $0, %%rcx ;"
+-              "addq   (%1),  %%r8 ;"
+-              "adcq  8(%1),  %%r9 ;"
+-              "adcq 16(%1), %%r10 ;"
+-              "adcq 24(%1), %%r11 ;"
+-              "adcq     $0, %%rcx ;"
+-              "imul %%rdx, %%rcx ;" /* c*C[4], cf=0 */
+-              "addq %%rcx,  %%r8 ;"
+-              "adcq    $0,  %%r9 ;"
+-              "movq  %%r9,  8(%0) ;"
+-              "adcq    $0, %%r10 ;"
+-              "movq %%r10, 16(%0) ;"
+-              "adcq    $0, %%r11 ;"
+-              "movq %%r11, 24(%0) ;"
+-              "mov     $0, %%ecx ;"
+-              "cmovc %%edx, %%ecx ;"
+-              "addq %%rcx,  %%r8 ;"
+-              "movq  %%r8,   (%0) ;"
+-              :
+-              : "r"(c), "r"(a)
+-              : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10",
+-                "%r11");
++              /* Compute src1[0] * src2 */
++              "  movq 0(%1), %%rdx;"
++              "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"     "  movq %%r8, 0(%0);"
++              "  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  movq %%r10, 8(%0);"
++              "  mulxq 16(%3), %%r12, %%r13;"    "  adox %%r11, %%r12;"
++              "  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  mov $0, %%rax;"
++                                                 "  adox %%rdx, %%rax;"
++              /* Compute src1[1] * src2 */
++              "  movq 8(%1), %%rdx;"
++              "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"     "  adcxq 8(%0), %%r8;"    "  movq %%r8, 8(%0);"
++              "  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%r12, %%r10;"    "  movq %%r10, 16(%0);"
++              "  mulxq 16(%3), %%r12, %%r13;"    "  adox %%r11, %%r12;"    "  adcx %%r14, %%r12;"    "  mov $0, %%r8;"
++              "  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
++                                                 "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"
++              /* Compute src1[2] * src2 */
++              "  movq 16(%1), %%rdx;"
++              "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"    "  adcxq 16(%0), %%r8;"    "  movq %%r8, 16(%0);"
++              "  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%r12, %%r10;"    "  movq %%r10, 24(%0);"
++              "  mulxq 16(%3), %%r12, %%r13;"    "  adox %%r11, %%r12;"    "  adcx %%r14, %%r12;"    "  mov $0, %%r8;"
++              "  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
++                                                 "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"
++              /* Compute src1[3] * src2 */
++              "  movq 24(%1), %%rdx;"
++              "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"    "  adcxq 24(%0), %%r8;"    "  movq %%r8, 24(%0);"
++              "  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%r12, %%r10;"    "  movq %%r10, 32(%0);"
++              "  mulxq 16(%3), %%r12, %%r13;"    "  adox %%r11, %%r12;"    "  adcx %%r14, %%r12;"    "  movq %%r12, 40(%0);"    "  mov $0, %%r8;"
++              "  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  movq %%r14, 48(%0);"    "  mov $0, %%rax;"
++                                                 "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"     "  movq %%rax, 56(%0);"
++
++              /* Compute the raw multiplication tmp[1] <- f1[1] * f2[1] */
++
++              /* Compute src1[0] * src2 */
++              "  movq 32(%1), %%rdx;"
++              "  mulxq 32(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"     "  movq %%r8, 64(%0);"
++              "  mulxq 40(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  movq %%r10, 72(%0);"
++              "  mulxq 48(%3), %%r12, %%r13;"    "  adox %%r11, %%r12;"
++              "  mulxq 56(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  mov $0, %%rax;"
++                                                 "  adox %%rdx, %%rax;"
++              /* Compute src1[1] * src2 */
++              "  movq 40(%1), %%rdx;"
++              "  mulxq 32(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"     "  adcxq 72(%0), %%r8;"    "  movq %%r8, 72(%0);"
++              "  mulxq 40(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%r12, %%r10;"    "  movq %%r10, 80(%0);"
++              "  mulxq 48(%3), %%r12, %%r13;"    "  adox %%r11, %%r12;"    "  adcx %%r14, %%r12;"    "  mov $0, %%r8;"
++              "  mulxq 56(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
++                                                 "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"
++              /* Compute src1[2] * src2 */
++              "  movq 48(%1), %%rdx;"
++              "  mulxq 32(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"    "  adcxq 80(%0), %%r8;"    "  movq %%r8, 80(%0);"
++              "  mulxq 40(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%r12, %%r10;"    "  movq %%r10, 88(%0);"
++              "  mulxq 48(%3), %%r12, %%r13;"    "  adox %%r11, %%r12;"    "  adcx %%r14, %%r12;"    "  mov $0, %%r8;"
++              "  mulxq 56(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
++                                                 "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"
++              /* Compute src1[3] * src2 */
++              "  movq 56(%1), %%rdx;"
++              "  mulxq 32(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"    "  adcxq 88(%0), %%r8;"    "  movq %%r8, 88(%0);"
++              "  mulxq 40(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%r12, %%r10;"    "  movq %%r10, 96(%0);"
++              "  mulxq 48(%3), %%r12, %%r13;"    "  adox %%r11, %%r12;"    "  adcx %%r14, %%r12;"    "  movq %%r12, 104(%0);"    "  mov $0, %%r8;"
++              "  mulxq 56(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  movq %%r14, 112(%0);"    "  mov $0, %%rax;"
++                                                 "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"     "  movq %%rax, 120(%0);"
++              /* Line up pointers */
++              "  mov %0, %1;"
++              "  mov %2, %0;"
++
++              /* Wrap the results back into the field */
++
++              /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
++              "  mov $38, %%rdx;"
++              "  mulxq 32(%1), %%r8, %%r13;"
++              "  xor %3, %3;"
++              "  adoxq 0(%1), %%r8;"
++              "  mulxq 40(%1), %%r9, %%r12;"
++              "  adcx %%r13, %%r9;"
++              "  adoxq 8(%1), %%r9;"
++              "  mulxq 48(%1), %%r10, %%r13;"
++              "  adcx %%r12, %%r10;"
++              "  adoxq 16(%1), %%r10;"
++              "  mulxq 56(%1), %%r11, %%rax;"
++              "  adcx %%r13, %%r11;"
++              "  adoxq 24(%1), %%r11;"
++              "  adcx %3, %%rax;"
++              "  adox %3, %%rax;"
++              "  imul %%rdx, %%rax;"
++
++              /* Step 2: Fold the carry back into dst */
++              "  add %%rax, %%r8;"
++              "  adcx %3, %%r9;"
++              "  movq %%r9, 8(%0);"
++              "  adcx %3, %%r10;"
++              "  movq %%r10, 16(%0);"
++              "  adcx %3, %%r11;"
++              "  movq %%r11, 24(%0);"
++
++              /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
++              "  mov $0, %%rax;"
++              "  cmovc %%rdx, %%rax;"
++              "  add %%rax, %%r8;"
++              "  movq %%r8, 0(%0);"
++
++              /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
++              "  mov $38, %%rdx;"
++              "  mulxq 96(%1), %%r8, %%r13;"
++              "  xor %3, %3;"
++              "  adoxq 64(%1), %%r8;"
++              "  mulxq 104(%1), %%r9, %%r12;"
++              "  adcx %%r13, %%r9;"
++              "  adoxq 72(%1), %%r9;"
++              "  mulxq 112(%1), %%r10, %%r13;"
++              "  adcx %%r12, %%r10;"
++              "  adoxq 80(%1), %%r10;"
++              "  mulxq 120(%1), %%r11, %%rax;"
++              "  adcx %%r13, %%r11;"
++              "  adoxq 88(%1), %%r11;"
++              "  adcx %3, %%rax;"
++              "  adox %3, %%rax;"
++              "  imul %%rdx, %%rax;"
++
++              /* Step 2: Fold the carry back into dst */
++              "  add %%rax, %%r8;"
++              "  adcx %3, %%r9;"
++              "  movq %%r9, 40(%0);"
++              "  adcx %3, %%r10;"
++              "  movq %%r10, 48(%0);"
++              "  adcx %3, %%r11;"
++              "  movq %%r11, 56(%0);"
++
++              /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
++              "  mov $0, %%rax;"
++              "  cmovc %%rdx, %%rax;"
++              "  add %%rax, %%r8;"
++              "  movq %%r8, 32(%0);"
++      : "+&r" (tmp), "+&r" (f1), "+&r" (out), "+&r" (f2)
++      :
++      : "%rax", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "memory", "cc"
++      );
+ }
+ 
+-static __always_inline void
+-add_eltfp25519_1w_adx(u64 *const c, const u64 *const a, const u64 *const b)
++/* Computes the field multiplication of four-element f1 with value in f2 */
++static inline void fmul_scalar(u64 *out, const u64 *f1, u64 f2)
+ {
+-      asm volatile(
+-              "mov     $38, %%eax ;"
+-              "xorl  %%ecx, %%ecx ;"
+-              "movq   (%2),  %%r8 ;"
+-              "adcx   (%1),  %%r8 ;"
+-              "movq  8(%2),  %%r9 ;"
+-              "adcx  8(%1),  %%r9 ;"
+-              "movq 16(%2), %%r10 ;"
+-              "adcx 16(%1), %%r10 ;"
+-              "movq 24(%2), %%r11 ;"
+-              "adcx 24(%1), %%r11 ;"
+-              "cmovc %%eax, %%ecx ;"
+-              "xorl %%eax, %%eax  ;"
+-              "adcx %%rcx,  %%r8  ;"
+-              "adcx %%rax,  %%r9  ;"
+-              "movq  %%r9,  8(%0) ;"
+-              "adcx %%rax, %%r10  ;"
+-              "movq %%r10, 16(%0) ;"
+-              "adcx %%rax, %%r11  ;"
+-              "movq %%r11, 24(%0) ;"
+-              "mov     $38, %%ecx ;"
+-              "cmovc %%ecx, %%eax ;"
+-              "addq %%rax,  %%r8  ;"
+-              "movq  %%r8,   (%0) ;"
+-              :
+-              : "r"(c), "r"(a), "r"(b)
+-              : "memory", "cc", "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11");
+-}
++      register u64 f2_r asm("rdx") = f2;
+ 
+-static __always_inline void
+-add_eltfp25519_1w_bmi2(u64 *const c, const u64 *const a, const u64 *const b)
+-{
+       asm volatile(
+-              "mov     $38, %%eax ;"
+-              "movq   (%2),  %%r8 ;"
+-              "addq   (%1),  %%r8 ;"
+-              "movq  8(%2),  %%r9 ;"
+-              "adcq  8(%1),  %%r9 ;"
+-              "movq 16(%2), %%r10 ;"
+-              "adcq 16(%1), %%r10 ;"
+-              "movq 24(%2), %%r11 ;"
+-              "adcq 24(%1), %%r11 ;"
+-              "mov      $0, %%ecx ;"
+-              "cmovc %%eax, %%ecx ;"
+-              "addq %%rcx,  %%r8  ;"
+-              "adcq    $0,  %%r9  ;"
+-              "movq  %%r9,  8(%0) ;"
+-              "adcq    $0, %%r10  ;"
+-              "movq %%r10, 16(%0) ;"
+-              "adcq    $0, %%r11  ;"
+-              "movq %%r11, 24(%0) ;"
+-              "mov     $0, %%ecx  ;"
+-              "cmovc %%eax, %%ecx ;"
+-              "addq %%rcx,  %%r8  ;"
+-              "movq  %%r8,   (%0) ;"
+-              :
+-              : "r"(c), "r"(a), "r"(b)
+-              : "memory", "cc", "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11");
++              /* Compute the raw multiplication of f1*f2 */
++              "  mulxq 0(%2), %%r8, %%rcx;"      /* f1[0]*f2 */
++              "  mulxq 8(%2), %%r9, %%r12;"      /* f1[1]*f2 */
++              "  add %%rcx, %%r9;"
++              "  mov $0, %%rcx;"
++              "  mulxq 16(%2), %%r10, %%r13;"    /* f1[2]*f2 */
++              "  adcx %%r12, %%r10;"
++              "  mulxq 24(%2), %%r11, %%rax;"    /* f1[3]*f2 */
++              "  adcx %%r13, %%r11;"
++              "  adcx %%rcx, %%rax;"
++
++              /* Wrap the result back into the field */
++
++              /* Step 1: Compute carry*38 */
++              "  mov $38, %%rdx;"
++              "  imul %%rdx, %%rax;"
++
++              /* Step 2: Fold the carry back into dst */
++              "  add %%rax, %%r8;"
++              "  adcx %%rcx, %%r9;"
++              "  movq %%r9, 8(%1);"
++              "  adcx %%rcx, %%r10;"
++              "  movq %%r10, 16(%1);"
++              "  adcx %%rcx, %%r11;"
++              "  movq %%r11, 24(%1);"
++
++              /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
++              "  mov $0, %%rax;"
++              "  cmovc %%rdx, %%rax;"
++              "  add %%rax, %%r8;"
++              "  movq %%r8, 0(%1);"
++      : "+&r" (f2_r)
++      : "r" (out), "r" (f1)
++      : "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "memory", "cc"
++      );
+ }
+ 
+-static __always_inline void
+-sub_eltfp25519_1w(u64 *const c, const u64 *const a, const u64 *const b)
+-{
+-      asm volatile(
+-              "mov     $38, %%eax ;"
+-              "movq   (%1),  %%r8 ;"
+-              "subq   (%2),  %%r8 ;"
+-              "movq  8(%1),  %%r9 ;"
+-              "sbbq  8(%2),  %%r9 ;"
+-              "movq 16(%1), %%r10 ;"
+-              "sbbq 16(%2), %%r10 ;"
+-              "movq 24(%1), %%r11 ;"
+-              "sbbq 24(%2), %%r11 ;"
+-              "mov      $0, %%ecx ;"
+-              "cmovc %%eax, %%ecx ;"
+-              "subq %%rcx,  %%r8  ;"
+-              "sbbq    $0,  %%r9  ;"
+-              "movq  %%r9,  8(%0) ;"
+-              "sbbq    $0, %%r10  ;"
+-              "movq %%r10, 16(%0) ;"
+-              "sbbq    $0, %%r11  ;"
+-              "movq %%r11, 24(%0) ;"
+-              "mov     $0, %%ecx  ;"
+-              "cmovc %%eax, %%ecx ;"
+-              "subq %%rcx,  %%r8  ;"
+-              "movq  %%r8,   (%0) ;"
+-              :
+-              : "r"(c), "r"(a), "r"(b)
+-              : "memory", "cc", "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11");
+-}
+-
+-/* Multiplication by a24 = (A+2)/4 = (486662+2)/4 = 121666 */
+-static __always_inline void
+-mul_a24_eltfp25519_1w(u64 *const c, const u64 *const a)
++/* Swaps p1 and p2 (eight limbs each) in constant time when bit is set */
++static inline void cswap2(u64 bit, const u64 *p1, const u64 *p2)
+ {
+-      const u64 a24 = 121666;
+       asm volatile(
+-              "movq     %2, %%rdx ;"
+-              "mulx   (%1),  %%r8, %%r10 ;"
+-              "mulx  8(%1),  %%r9, %%r11 ;"
+-              "addq %%r10,  %%r9 ;"
+-              "mulx 16(%1), %%r10, %%rax ;"
+-              "adcq %%r11, %%r10 ;"
+-              "mulx 24(%1), %%r11, %%rcx ;"
+-              "adcq %%rax, %%r11 ;"
+-              /**************************/
+-              "adcq    $0, %%rcx ;"
+-              "movl   $38, %%edx ;" /* 2*c = 38 = 2^256 mod 2^255-19*/
+-              "imul %%rdx, %%rcx ;"
+-              "addq %%rcx,  %%r8 ;"
+-              "adcq    $0,  %%r9 ;"
+-              "movq  %%r9,  8(%0) ;"
+-              "adcq    $0, %%r10 ;"
+-              "movq %%r10, 16(%0) ;"
+-              "adcq    $0, %%r11 ;"
+-              "movq %%r11, 24(%0) ;"
+-              "mov     $0, %%ecx ;"
+-              "cmovc %%edx, %%ecx ;"
+-              "addq %%rcx,  %%r8 ;"
+-              "movq  %%r8,   (%0) ;"
+-              :
+-              : "r"(c), "r"(a), "r"(a24)
+-              : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10",
+-                "%r11");
+-}
+-
+-static void inv_eltfp25519_1w_adx(u64 *const c, const u64 *const a)
+-{
+-      struct {
+-              eltfp25519_1w_buffer buffer;
+-              eltfp25519_1w x0, x1, x2;
+-      } __aligned(32) m;
+-      u64 *T[4];
+-
+-      T[0] = m.x0;
+-      T[1] = c; /* x^(-1) */
+-      T[2] = m.x1;
+-      T[3] = m.x2;
+-
+-      copy_eltfp25519_1w(T[1], a);
+-      sqrn_eltfp25519_1w_adx(T[1], 1);
+-      copy_eltfp25519_1w(T[2], T[1]);
+-      sqrn_eltfp25519_1w_adx(T[2], 2);
+-      mul_eltfp25519_1w_adx(T[0], a, T[2]);
+-      mul_eltfp25519_1w_adx(T[1], T[1], T[0]);
+-      copy_eltfp25519_1w(T[2], T[1]);
+-      sqrn_eltfp25519_1w_adx(T[2], 1);
+-      mul_eltfp25519_1w_adx(T[0], T[0], T[2]);
+-      copy_eltfp25519_1w(T[2], T[0]);
+-      sqrn_eltfp25519_1w_adx(T[2], 5);
+-      mul_eltfp25519_1w_adx(T[0], T[0], T[2]);
+-      copy_eltfp25519_1w(T[2], T[0]);
+-      sqrn_eltfp25519_1w_adx(T[2], 10);
+-      mul_eltfp25519_1w_adx(T[2], T[2], T[0]);
+-      copy_eltfp25519_1w(T[3], T[2]);
+-      sqrn_eltfp25519_1w_adx(T[3], 20);
+-      mul_eltfp25519_1w_adx(T[3], T[3], T[2]);
+-      sqrn_eltfp25519_1w_adx(T[3], 10);
+-      mul_eltfp25519_1w_adx(T[3], T[3], T[0]);
+-      copy_eltfp25519_1w(T[0], T[3]);
+-      sqrn_eltfp25519_1w_adx(T[0], 50);
+-      mul_eltfp25519_1w_adx(T[0], T[0], T[3]);
+-      copy_eltfp25519_1w(T[2], T[0]);
+-      sqrn_eltfp25519_1w_adx(T[2], 100);
+-      mul_eltfp25519_1w_adx(T[2], T[2], T[0]);
+-      sqrn_eltfp25519_1w_adx(T[2], 50);
+-      mul_eltfp25519_1w_adx(T[2], T[2], T[3]);
+-      sqrn_eltfp25519_1w_adx(T[2], 5);
+-      mul_eltfp25519_1w_adx(T[1], T[1], T[2]);
+-
+-      memzero_explicit(&m, sizeof(m));
+-}
+-
+-static void inv_eltfp25519_1w_bmi2(u64 *const c, const u64 *const a)
+-{
+-      struct {
+-              eltfp25519_1w_buffer buffer;
+-              eltfp25519_1w x0, x1, x2;
+-      } __aligned(32) m;
+-      u64 *T[5];
+-
+-      T[0] = m.x0;
+-      T[1] = c; /* x^(-1) */
+-      T[2] = m.x1;
+-      T[3] = m.x2;
+-
+-      copy_eltfp25519_1w(T[1], a);
+-      sqrn_eltfp25519_1w_bmi2(T[1], 1);
+-      copy_eltfp25519_1w(T[2], T[1]);
+-      sqrn_eltfp25519_1w_bmi2(T[2], 2);
+-      mul_eltfp25519_1w_bmi2(T[0], a, T[2]);
+-      mul_eltfp25519_1w_bmi2(T[1], T[1], T[0]);
+-      copy_eltfp25519_1w(T[2], T[1]);
+-      sqrn_eltfp25519_1w_bmi2(T[2], 1);
+-      mul_eltfp25519_1w_bmi2(T[0], T[0], T[2]);
+-      copy_eltfp25519_1w(T[2], T[0]);
+-      sqrn_eltfp25519_1w_bmi2(T[2], 5);
+-      mul_eltfp25519_1w_bmi2(T[0], T[0], T[2]);
+-      copy_eltfp25519_1w(T[2], T[0]);
+-      sqrn_eltfp25519_1w_bmi2(T[2], 10);
+-      mul_eltfp25519_1w_bmi2(T[2], T[2], T[0]);
+-      copy_eltfp25519_1w(T[3], T[2]);
+-      sqrn_eltfp25519_1w_bmi2(T[3], 20);
+-      mul_eltfp25519_1w_bmi2(T[3], T[3], T[2]);
+-      sqrn_eltfp25519_1w_bmi2(T[3], 10);
+-      mul_eltfp25519_1w_bmi2(T[3], T[3], T[0]);
+-      copy_eltfp25519_1w(T[0], T[3]);
+-      sqrn_eltfp25519_1w_bmi2(T[0], 50);
+-      mul_eltfp25519_1w_bmi2(T[0], T[0], T[3]);
+-      copy_eltfp25519_1w(T[2], T[0]);
+-      sqrn_eltfp25519_1w_bmi2(T[2], 100);
+-      mul_eltfp25519_1w_bmi2(T[2], T[2], T[0]);
+-      sqrn_eltfp25519_1w_bmi2(T[2], 50);
+-      mul_eltfp25519_1w_bmi2(T[2], T[2], T[3]);
+-      sqrn_eltfp25519_1w_bmi2(T[2], 5);
+-      mul_eltfp25519_1w_bmi2(T[1], T[1], T[2]);
++              /* Transfer bit into the carry flag: adding -1 sets CF iff bit != 0 */
++              "  add $18446744073709551615, %0;"
+ 
+-      memzero_explicit(&m, sizeof(m));
++              /* cswap p1[0], p2[0] */
++              "  movq 0(%1), %%r8;"
++              "  movq 0(%2), %%r9;"
++              "  mov %%r8, %%r10;"
++              "  cmovc %%r9, %%r8;"
++              "  cmovc %%r10, %%r9;"
++              "  movq %%r8, 0(%1);"
++              "  movq %%r9, 0(%2);"
++
++              /* cswap p1[1], p2[1] */
++              "  movq 8(%1), %%r8;"
++              "  movq 8(%2), %%r9;"
++              "  mov %%r8, %%r10;"
++              "  cmovc %%r9, %%r8;"
++              "  cmovc %%r10, %%r9;"
++              "  movq %%r8, 8(%1);"
++              "  movq %%r9, 8(%2);"
++
++              /* cswap p1[2], p2[2] */
++              "  movq 16(%1), %%r8;"
++              "  movq 16(%2), %%r9;"
++              "  mov %%r8, %%r10;"
++              "  cmovc %%r9, %%r8;"
++              "  cmovc %%r10, %%r9;"
++              "  movq %%r8, 16(%1);"
++              "  movq %%r9, 16(%2);"
++
++              /* cswap p1[3], p2[3] */
++              "  movq 24(%1), %%r8;"
++              "  movq 24(%2), %%r9;"
++              "  mov %%r8, %%r10;"
++              "  cmovc %%r9, %%r8;"
++              "  cmovc %%r10, %%r9;"
++              "  movq %%r8, 24(%1);"
++              "  movq %%r9, 24(%2);"
++
++              /* cswap p1[4], p2[4] */
++              "  movq 32(%1), %%r8;"
++              "  movq 32(%2), %%r9;"
++              "  mov %%r8, %%r10;"
++              "  cmovc %%r9, %%r8;"
++              "  cmovc %%r10, %%r9;"
++              "  movq %%r8, 32(%1);"
++              "  movq %%r9, 32(%2);"
++
++              /* cswap p1[5], p2[5] */
++              "  movq 40(%1), %%r8;"
++              "  movq 40(%2), %%r9;"
++              "  mov %%r8, %%r10;"
++              "  cmovc %%r9, %%r8;"
++              "  cmovc %%r10, %%r9;"
++              "  movq %%r8, 40(%1);"
++              "  movq %%r9, 40(%2);"
++
++              /* cswap p1[6], p2[6] */
++              "  movq 48(%1), %%r8;"
++              "  movq 48(%2), %%r9;"
++              "  mov %%r8, %%r10;"
++              "  cmovc %%r9, %%r8;"
++              "  cmovc %%r10, %%r9;"
++              "  movq %%r8, 48(%1);"
++              "  movq %%r9, 48(%2);"
++
++              /* cswap p1[7], p2[7] */
++              "  movq 56(%1), %%r8;"
++              "  movq 56(%2), %%r9;"
++              "  mov %%r8, %%r10;"
++              "  cmovc %%r9, %%r8;"
++              "  cmovc %%r10, %%r9;"
++              "  movq %%r8, 56(%1);"
++              "  movq %%r9, 56(%2);"
++      : "+&r" (bit)
++      : "r" (p1), "r" (p2)
++      : "%r8", "%r9", "%r10", "memory", "cc"
++      );
+ }
+ 
+-/* Given c, a 256-bit number, fred_eltfp25519_1w updates c
+- * with a number such that 0 <= C < 2**255-19.
+- */
+-static __always_inline void fred_eltfp25519_1w(u64 *const c)
++/* Computes the square of a field element: out <- f * f
++ * Uses the 8-element buffer tmp for intermediate results */
++static inline void fsqr(u64 *out, const u64 *f, u64 *tmp)
+ {
+-      u64 tmp0 = 38, tmp1 = 19;
+       asm volatile(
+-              "btrq   $63,    %3 ;" /* Put bit 255 in carry flag and clear */
+-              "cmovncl %k5,   %k4 ;" /* c[255] ? 38 : 19 */
+-
+-              /* Add either 19 or 38 to c */
+-              "addq    %4,   %0 ;"
+-              "adcq    $0,   %1 ;"
+-              "adcq    $0,   %2 ;"
+-              "adcq    $0,   %3 ;"
+-
+-              /* Test for bit 255 again; only triggered on overflow modulo 2^255-19 */
+-              "movl    $0,  %k4 ;"
+-              "cmovnsl %k5,  %k4 ;" /* c[255] ? 0 : 19 */
+-              "btrq   $63,   %3 ;" /* Clear bit 255 */
+-
+-              /* Subtract 19 if necessary */
+-              "subq    %4,   %0 ;"
+-              "sbbq    $0,   %1 ;"
+-              "sbbq    $0,   %2 ;"
+-              "sbbq    $0,   %3 ;"
+-
+-              : "+r"(c[0]), "+r"(c[1]), "+r"(c[2]), "+r"(c[3]), "+r"(tmp0),
+-                "+r"(tmp1)
+-              :
+-              : "memory", "cc");
+-}
++              /* Compute the raw multiplication: tmp <- f * f */
+ 
+-static __always_inline void cswap(u8 bit, u64 *const px, u64 *const py)
+-{
+-      u64 temp;
+-      asm volatile(
+-              "test %9, %9 ;"
+-              "movq %0, %8 ;"
+-              "cmovnzq %4, %0 ;"
+-              "cmovnzq %8, %4 ;"
+-              "movq %1, %8 ;"
+-              "cmovnzq %5, %1 ;"
+-              "cmovnzq %8, %5 ;"
+-              "movq %2, %8 ;"
+-              "cmovnzq %6, %2 ;"
+-              "cmovnzq %8, %6 ;"
+-              "movq %3, %8 ;"
+-              "cmovnzq %7, %3 ;"
+-              "cmovnzq %8, %7 ;"
+-              : "+r"(px[0]), "+r"(px[1]), "+r"(px[2]), "+r"(px[3]),
+-                "+r"(py[0]), "+r"(py[1]), "+r"(py[2]), "+r"(py[3]),
+-                "=r"(temp)
+-              : "r"(bit)
+-              : "cc"
++              /* Step 1: Compute all partial products */
++              "  movq 0(%1), %%rdx;"                                       /* f[0] */
++              "  mulxq 8(%1), %%r8, %%r14;"      "  xor %%r15, %%r15;"     /* f[1]*f[0] */
++              "  mulxq 16(%1), %%r9, %%r10;"     "  adcx %%r14, %%r9;"     /* f[2]*f[0] */
++              "  mulxq 24(%1), %%rax, %%rcx;"    "  adcx %%rax, %%r10;"    /* f[3]*f[0] */
++              "  movq 24(%1), %%rdx;"                                      /* f[3] */
++              "  mulxq 8(%1), %%r11, %%r12;"     "  adcx %%rcx, %%r11;"    /* f[1]*f[3] */
++              "  mulxq 16(%1), %%rax, %%r13;"    "  adcx %%rax, %%r12;"    /* f[2]*f[3] */
++              "  movq 8(%1), %%rdx;"             "  adcx %%r15, %%r13;"    /* f1 */
++              "  mulxq 16(%1), %%rax, %%rcx;"    "  mov $0, %%r14;"        /* f[2]*f[1] */
++
++              /* Step 2: Compute two parallel carry chains */
++              "  xor %%r15, %%r15;"
++              "  adox %%rax, %%r10;"
++              "  adcx %%r8, %%r8;"
++              "  adox %%rcx, %%r11;"
++              "  adcx %%r9, %%r9;"
++              "  adox %%r15, %%r12;"
++              "  adcx %%r10, %%r10;"
++              "  adox %%r15, %%r13;"
++              "  adcx %%r11, %%r11;"
++              "  adox %%r15, %%r14;"
++              "  adcx %%r12, %%r12;"
++              "  adcx %%r13, %%r13;"
++              "  adcx %%r14, %%r14;"
++
++              /* Step 3: Compute intermediate squares */
++              "  movq 0(%1), %%rdx;"     "  mulx %%rdx, %%rax, %%rcx;"    /* f[0]^2 */
++                                         "  movq %%rax, 0(%0);"
++              "  add %%rcx, %%r8;"       "  movq %%r8, 8(%0);"
++              "  movq 8(%1), %%rdx;"     "  mulx %%rdx, %%rax, %%rcx;"    /* f[1]^2 */
++              "  adcx %%rax, %%r9;"      "  movq %%r9, 16(%0);"
++              "  adcx %%rcx, %%r10;"     "  movq %%r10, 24(%0);"
++              "  movq 16(%1), %%rdx;"    "  mulx %%rdx, %%rax, %%rcx;"    /* f[2]^2 */
++              "  adcx %%rax, %%r11;"     "  movq %%r11, 32(%0);"
++              "  adcx %%rcx, %%r12;"     "  movq %%r12, 40(%0);"
++              "  movq 24(%1), %%rdx;"    "  mulx %%rdx, %%rax, %%rcx;"    /* f[3]^2 */
++              "  adcx %%rax, %%r13;"     "  movq %%r13, 48(%0);"
++              "  adcx %%rcx, %%r14;"     "  movq %%r14, 56(%0);"
++
++              /* Line up pointers */
++              "  mov %0, %1;"
++              "  mov %2, %0;"
++
++              /* Wrap the result back into the field */
++
++              /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
++              "  mov $38, %%rdx;"
++              "  mulxq 32(%1), %%r8, %%r13;"
++              "  xor %%rcx, %%rcx;"
++              "  adoxq 0(%1), %%r8;"
++              "  mulxq 40(%1), %%r9, %%r12;"
++              "  adcx %%r13, %%r9;"
++              "  adoxq 8(%1), %%r9;"
++              "  mulxq 48(%1), %%r10, %%r13;"
++              "  adcx %%r12, %%r10;"
++              "  adoxq 16(%1), %%r10;"
++              "  mulxq 56(%1), %%r11, %%rax;"
++              "  adcx %%r13, %%r11;"
++              "  adoxq 24(%1), %%r11;"
++              "  adcx %%rcx, %%rax;"
++              "  adox %%rcx, %%rax;"
++              "  imul %%rdx, %%rax;"
++
++              /* Step 2: Fold the carry back into dst */
++              "  add %%rax, %%r8;"
++              "  adcx %%rcx, %%r9;"
++              "  movq %%r9, 8(%0);"
++              "  adcx %%rcx, %%r10;"
++              "  movq %%r10, 16(%0);"
++              "  adcx %%rcx, %%r11;"
++              "  movq %%r11, 24(%0);"
++
++              /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
++              "  mov $0, %%rax;"
++              "  cmovc %%rdx, %%rax;"
++              "  add %%rax, %%r8;"
++              "  movq %%r8, 0(%0);"
++      : "+&r" (tmp), "+&r" (f), "+&r" (out)
++      :
++      : "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "memory", "cc"
+       );
+ }
+ 
+-static __always_inline void cselect(u8 bit, u64 *const px, const u64 *const py)
++/* Computes two field squarings on 4-limb elements stored back to back:
++ * out[0..3] <- f[0..3]^2 and out[4..7] <- f[4..7]^2; the f[i] limb notes in
++ * the second asm pass are relative to f + 4. The 16-element buffer tmp takes
++ * both 8-limb products before their high halves are folded in times 38 */
++static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
+ {
+       asm volatile(
+-              "test %4, %4 ;"
+-              "cmovnzq %5, %0 ;"
+-              "cmovnzq %6, %1 ;"
+-              "cmovnzq %7, %2 ;"
+-              "cmovnzq %8, %3 ;"
+-              : "+r"(px[0]), "+r"(px[1]), "+r"(px[2]), "+r"(px[3])
+-              : "r"(bit), "rm"(py[0]), "rm"(py[1]), "rm"(py[2]), "rm"(py[3])
+-              : "cc"
++              /* Step 1: Compute all partial products */
++              "  movq 0(%1), %%rdx;"                                       /* f[0] */
++              "  mulxq 8(%1), %%r8, %%r14;"      "  xor %%r15, %%r15;"     /* f[1]*f[0] */
++              "  mulxq 16(%1), %%r9, %%r10;"     "  adcx %%r14, %%r9;"     /* f[2]*f[0] */
++              "  mulxq 24(%1), %%rax, %%rcx;"    "  adcx %%rax, %%r10;"    /* f[3]*f[0] */
++              "  movq 24(%1), %%rdx;"                                      /* f[3] */
++              "  mulxq 8(%1), %%r11, %%r12;"     "  adcx %%rcx, %%r11;"    /* f[1]*f[3] */
++              "  mulxq 16(%1), %%rax, %%r13;"    "  adcx %%rax, %%r12;"    /* f[2]*f[3] */
++              "  movq 8(%1), %%rdx;"             "  adcx %%r15, %%r13;"    /* f1 */
++              "  mulxq 16(%1), %%rax, %%rcx;"    "  mov $0, %%r14;"        /* f[2]*f[1] */
++
++              /* Step 2: Compute two parallel carry chains */
++              "  xor %%r15, %%r15;"
++              "  adox %%rax, %%r10;"
++              "  adcx %%r8, %%r8;"
++              "  adox %%rcx, %%r11;"
++              "  adcx %%r9, %%r9;"
++              "  adox %%r15, %%r12;"
++              "  adcx %%r10, %%r10;"
++              "  adox %%r15, %%r13;"
++              "  adcx %%r11, %%r11;"
++              "  adox %%r15, %%r14;"
++              "  adcx %%r12, %%r12;"
++              "  adcx %%r13, %%r13;"
++              "  adcx %%r14, %%r14;"
++
++              /* Step 3: Compute intermediate squares */
++              "  movq 0(%1), %%rdx;"     "  mulx %%rdx, %%rax, %%rcx;"    /* f[0]^2 */
++                                         "  movq %%rax, 0(%0);"
++              "  add %%rcx, %%r8;"       "  movq %%r8, 8(%0);"
++              "  movq 8(%1), %%rdx;"     "  mulx %%rdx, %%rax, %%rcx;"    /* f[1]^2 */
++              "  adcx %%rax, %%r9;"      "  movq %%r9, 16(%0);"
++              "  adcx %%rcx, %%r10;"     "  movq %%r10, 24(%0);"
++              "  movq 16(%1), %%rdx;"    "  mulx %%rdx, %%rax, %%rcx;"    /* f[2]^2 */
++              "  adcx %%rax, %%r11;"     "  movq %%r11, 32(%0);"
++              "  adcx %%rcx, %%r12;"     "  movq %%r12, 40(%0);"
++              "  movq 24(%1), %%rdx;"    "  mulx %%rdx, %%rax, %%rcx;"    /* f[3]^2 */
++              "  adcx %%rax, %%r13;"     "  movq %%r13, 48(%0);"
++              "  adcx %%rcx, %%r14;"     "  movq %%r14, 56(%0);"
++
++              /* Step 1: Compute all partial products */
++              "  movq 32(%1), %%rdx;"                                       /* f[0] */
++              "  mulxq 40(%1), %%r8, %%r14;"      "  xor %%r15, %%r15;"     /* f[1]*f[0] */
++              "  mulxq 48(%1), %%r9, %%r10;"     "  adcx %%r14, %%r9;"     /* f[2]*f[0] */
++              "  mulxq 56(%1), %%rax, %%rcx;"    "  adcx %%rax, %%r10;"    /* f[3]*f[0] */
++              "  movq 56(%1), %%rdx;"                                      /* f[3] */
++              "  mulxq 40(%1), %%r11, %%r12;"     "  adcx %%rcx, %%r11;"    /* f[1]*f[3] */
++              "  mulxq 48(%1), %%rax, %%r13;"    "  adcx %%rax, %%r12;"    /* f[2]*f[3] */
++              "  movq 40(%1), %%rdx;"             "  adcx %%r15, %%r13;"    /* f1 */
++              "  mulxq 48(%1), %%rax, %%rcx;"    "  mov $0, %%r14;"        /* f[2]*f[1] */
++
++              /* Step 2: Compute two parallel carry chains */
++              "  xor %%r15, %%r15;"
++              "  adox %%rax, %%r10;"
++              "  adcx %%r8, %%r8;"
++              "  adox %%rcx, %%r11;"
++              "  adcx %%r9, %%r9;"
++              "  adox %%r15, %%r12;"
++              "  adcx %%r10, %%r10;"
++              "  adox %%r15, %%r13;"
++              "  adcx %%r11, %%r11;"
++              "  adox %%r15, %%r14;"
++              "  adcx %%r12, %%r12;"
++              "  adcx %%r13, %%r13;"
++              "  adcx %%r14, %%r14;"
++
++              /* Step 3: Compute intermediate squares */
++              "  movq 32(%1), %%rdx;"     "  mulx %%rdx, %%rax, %%rcx;"    /* f[0]^2 */
++                                         "  movq %%rax, 64(%0);"
++              "  add %%rcx, %%r8;"       "  movq %%r8, 72(%0);"
++              "  movq 40(%1), %%rdx;"     "  mulx %%rdx, %%rax, %%rcx;"    /* f[1]^2 */
++              "  adcx %%rax, %%r9;"      "  movq %%r9, 80(%0);"
++              "  adcx %%rcx, %%r10;"     "  movq %%r10, 88(%0);"
++              "  movq 48(%1), %%rdx;"    "  mulx %%rdx, %%rax, %%rcx;"    /* f[2]^2 */
++              "  adcx %%rax, %%r11;"     "  movq %%r11, 96(%0);"
++              "  adcx %%rcx, %%r12;"     "  movq %%r12, 104(%0);"
++              "  movq 56(%1), %%rdx;"    "  mulx %%rdx, %%rax, %%rcx;"    /* f[3]^2 */
++              "  adcx %%rax, %%r13;"     "  movq %%r13, 112(%0);"
++              "  adcx %%rcx, %%r14;"     "  movq %%r14, 120(%0);"
++
++              /* Line up pointers */
++              "  mov %0, %1;"
++              "  mov %2, %0;"
++
++              /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
++              "  mov $38, %%rdx;"
++              "  mulxq 32(%1), %%r8, %%r13;"
++              "  xor %%rcx, %%rcx;"
++              "  adoxq 0(%1), %%r8;"
++              "  mulxq 40(%1), %%r9, %%r12;"
++              "  adcx %%r13, %%r9;"
++              "  adoxq 8(%1), %%r9;"
++              "  mulxq 48(%1), %%r10, %%r13;"
++              "  adcx %%r12, %%r10;"
++              "  adoxq 16(%1), %%r10;"
++              "  mulxq 56(%1), %%r11, %%rax;"
++              "  adcx %%r13, %%r11;"
++              "  adoxq 24(%1), %%r11;"
++              "  adcx %%rcx, %%rax;"
++              "  adox %%rcx, %%rax;"
++              "  imul %%rdx, %%rax;"
++
++              /* Step 2: Fold the carry back into dst */
++              "  add %%rax, %%r8;"
++              "  adcx %%rcx, %%r9;"
++              "  movq %%r9, 8(%0);"
++              "  adcx %%rcx, %%r10;"
++              "  movq %%r10, 16(%0);"
++              "  adcx %%rcx, %%r11;"
++              "  movq %%r11, 24(%0);"
++
++              /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
++              "  mov $0, %%rax;"
++              "  cmovc %%rdx, %%rax;"
++              "  add %%rax, %%r8;"
++              "  movq %%r8, 0(%0);"
++
++              /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
++              "  mov $38, %%rdx;"
++              "  mulxq 96(%1), %%r8, %%r13;"
++              "  xor %%rcx, %%rcx;"
++              "  adoxq 64(%1), %%r8;"
++              "  mulxq 104(%1), %%r9, %%r12;"
++              "  adcx %%r13, %%r9;"
++              "  adoxq 72(%1), %%r9;"
++              "  mulxq 112(%1), %%r10, %%r13;"
++              "  adcx %%r12, %%r10;"
++              "  adoxq 80(%1), %%r10;"
++              "  mulxq 120(%1), %%r11, %%rax;"
++              "  adcx %%r13, %%r11;"
++              "  adoxq 88(%1), %%r11;"
++              "  adcx %%rcx, %%rax;"
++              "  adox %%rcx, %%rax;"
++              "  imul %%rdx, %%rax;"
++
++              /* Step 2: Fold the carry back into dst */
++              "  add %%rax, %%r8;"
++              "  adcx %%rcx, %%r9;"
++              "  movq %%r9, 40(%0);"
++              "  adcx %%rcx, %%r10;"
++              "  movq %%r10, 48(%0);"
++              "  adcx %%rcx, %%r11;"
++              "  movq %%r11, 56(%0);"
++
++              /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
++              "  mov $0, %%rax;"
++              "  cmovc %%rdx, %%rax;"
++              "  add %%rax, %%r8;"
++              "  movq %%r8, 32(%0);"
++      : "+&r" (tmp), "+&r" (f), "+&r" (out)
++      :
++      : "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "memory", "cc"
+       );
+ }
+ 
+-static void curve25519_adx(u8 shared[CURVE25519_KEY_SIZE],
+-                         const u8 private_key[CURVE25519_KEY_SIZE],
+-                         const u8 session_key[CURVE25519_KEY_SIZE])
+-{
+-      struct {
+-              u64 buffer[4 * NUM_WORDS_ELTFP25519];
+-              u64 coordinates[4 * NUM_WORDS_ELTFP25519];
+-              u64 workspace[6 * NUM_WORDS_ELTFP25519];
+-              u8 session[CURVE25519_KEY_SIZE];
+-              u8 private[CURVE25519_KEY_SIZE];
+-      } __aligned(32) m;
+-
+-      int i = 0, j = 0;
+-      u64 prev = 0;
+-      u64 *const X1 = (u64 *)m.session;
+-      u64 *const key = (u64 *)m.private;
+-      u64 *const Px = m.coordinates + 0;
+-      u64 *const Pz = m.coordinates + 4;
+-      u64 *const Qx = m.coordinates + 8;
+-      u64 *const Qz = m.coordinates + 12;
+-      u64 *const X2 = Qx;
+-      u64 *const Z2 = Qz;
+-      u64 *const X3 = Px;
+-      u64 *const Z3 = Pz;
+-      u64 *const X2Z2 = Qx;
+-      u64 *const X3Z3 = Px;
+-
+-      u64 *const A = m.workspace + 0;
+-      u64 *const B = m.workspace + 4;
+-      u64 *const D = m.workspace + 8;
+-      u64 *const C = m.workspace + 12;
+-      u64 *const DA = m.workspace + 16;
+-      u64 *const CB = m.workspace + 20;
+-      u64 *const AB = A;
+-      u64 *const DC = D;
+-      u64 *const DACB = DA;
+-
+-      memcpy(m.private, private_key, sizeof(m.private));
+-      memcpy(m.session, session_key, sizeof(m.session));
+-
+-      curve25519_clamp_secret(m.private);
+-
+-      /* As in the draft:
+-       * When receiving such an array, implementations of curve25519
+-       * MUST mask the most-significant bit in the final byte. This
+-       * is done to preserve compatibility with point formats which
+-       * reserve the sign bit for use in other protocols and to
+-       * increase resistance to implementation fingerprinting
+-       */
+-      m.session[CURVE25519_KEY_SIZE - 1] &= (1 << (255 % 8)) - 1;
+-
+-      copy_eltfp25519_1w(Px, X1);
+-      setzero_eltfp25519_1w(Pz);
+-      setzero_eltfp25519_1w(Qx);
+-      setzero_eltfp25519_1w(Qz);
+-
+-      Pz[0] = 1;
+-      Qx[0] = 1;
+-
+-      /* main-loop */
+-      prev = 0;
+-      j = 62;
+-      for (i = 3; i >= 0; --i) {
+-              while (j >= 0) {
+-                      u64 bit = (key[i] >> j) & 0x1;
+-                      u64 swap = bit ^ prev;
+-                      prev = bit;
+-
+-                      add_eltfp25519_1w_adx(A, X2, Z2);       /* A = (X2+Z2) */
+-                      sub_eltfp25519_1w(B, X2, Z2);           /* B = (X2-Z2) */
+-                      add_eltfp25519_1w_adx(C, X3, Z3);       /* C = (X3+Z3) */
+-                      sub_eltfp25519_1w(D, X3, Z3);           /* D = (X3-Z3) */
+-                      mul_eltfp25519_2w_adx(DACB, AB, DC);    /* [DA|CB] = [A|B]*[D|C] */
+-
+-                      cselect(swap, A, C);
+-                      cselect(swap, B, D);
+-
+-                      sqr_eltfp25519_2w_adx(AB);              /* [AA|BB] = [A^2|B^2] */
+-                      add_eltfp25519_1w_adx(X3, DA, CB);      /* X3 = (DA+CB) */
+-                      sub_eltfp25519_1w(Z3, DA, CB);          /* Z3 = (DA-CB) */
+-                      sqr_eltfp25519_2w_adx(X3Z3);            /* [X3|Z3] = [(DA+CB)|(DA+CB)]^2 */
+-
+-                      copy_eltfp25519_1w(X2, B);              /* X2 = B^2 */
+-                      sub_eltfp25519_1w(Z2, A, B);            /* Z2 = E = AA-BB */
+-
+-                      mul_a24_eltfp25519_1w(B, Z2);           /* B = a24*E */
+-                      add_eltfp25519_1w_adx(B, B, X2);        /* B = a24*E+B */
+-                      mul_eltfp25519_2w_adx(X2Z2, X2Z2, AB);  /* [X2|Z2] = [B|E]*[A|a24*E+B] */
+-                      mul_eltfp25519_1w_adx(Z3, Z3, X1);      /* Z3 = Z3*X1 */
+-                      --j;
+-              }
+-              j = 63;
+-      }
+-
+-      inv_eltfp25519_1w_adx(A, Qz);
+-      mul_eltfp25519_1w_adx((u64 *)shared, Qx, A);
+-      fred_eltfp25519_1w((u64 *)shared);
+-
+-      memzero_explicit(&m, sizeof(m));
+-}
+-
+-static void curve25519_adx_base(u8 session_key[CURVE25519_KEY_SIZE],
+-                              const u8 private_key[CURVE25519_KEY_SIZE])
++static void point_add_and_double(u64 *q, u64 *p01_tmp1, u64 *tmp2)
+ {
+-      struct {
+-              u64 buffer[4 * NUM_WORDS_ELTFP25519];
+-              u64 coordinates[4 * NUM_WORDS_ELTFP25519];
+-              u64 workspace[4 * NUM_WORDS_ELTFP25519];
+-              u8 private[CURVE25519_KEY_SIZE];
+-      } __aligned(32) m;
+-
+-      const int ite[4] = { 64, 64, 64, 63 };
+-      const int q = 3;
+-      u64 swap = 1;
+-
+-      int i = 0, j = 0, k = 0;
+-      u64 *const key = (u64 *)m.private;
+-      u64 *const Ur1 = m.coordinates + 0;
+-      u64 *const Zr1 = m.coordinates + 4;
+-      u64 *const Ur2 = m.coordinates + 8;
+-      u64 *const Zr2 = m.coordinates + 12;
+-
+-      u64 *const UZr1 = m.coordinates + 0;
+-      u64 *const ZUr2 = m.coordinates + 8;
+-
+-      u64 *const A = m.workspace + 0;
+-      u64 *const B = m.workspace + 4;
+-      u64 *const C = m.workspace + 8;
+-      u64 *const D = m.workspace + 12;
+-
+-      u64 *const AB = m.workspace + 0;
+-      u64 *const CD = m.workspace + 8;
+-
+-      const u64 *const P = table_ladder_8k;
+-
+-      memcpy(m.private, private_key, sizeof(m.private));
+-
+-      curve25519_clamp_secret(m.private);
+-
+-      setzero_eltfp25519_1w(Ur1);
+-      setzero_eltfp25519_1w(Zr1);
+-      setzero_eltfp25519_1w(Zr2);
+-      Ur1[0] = 1;
+-      Zr1[0] = 1;
+-      Zr2[0] = 1;
+-
+-      /* G-S */
+-      Ur2[3] = 0x1eaecdeee27cab34UL;
+-      Ur2[2] = 0xadc7a0b9235d48e2UL;
+-      Ur2[1] = 0xbbf095ae14b2edf8UL;
+-      Ur2[0] = 0x7e94e1fec82faabdUL;
+-
+-      /* main-loop */
+-      j = q;
+-      for (i = 0; i < NUM_WORDS_ELTFP25519; ++i) {
+-              while (j < ite[i]) {
+-                      u64 bit = (key[i] >> j) & 0x1;
+-                      k = (64 * i + j - q);
+-                      swap = swap ^ bit;
+-                      cswap(swap, Ur1, Ur2);
+-                      cswap(swap, Zr1, Zr2);
+-                      swap = bit;
+-                      /* Addition */
+-                      sub_eltfp25519_1w(B, Ur1, Zr1);         /* B = Ur1-Zr1 */
+-                      add_eltfp25519_1w_adx(A, Ur1, Zr1);     /* A = Ur1+Zr1 */
+-                      mul_eltfp25519_1w_adx(C, &P[4 * k], B); /* C = M0-B */
+-                      sub_eltfp25519_1w(B, A, C);             /* B = (Ur1+Zr1) - M*(Ur1-Zr1) */
+-                      add_eltfp25519_1w_adx(A, A, C);         /* A = (Ur1+Zr1) + M*(Ur1-Zr1) */
+-                      sqr_eltfp25519_2w_adx(AB);              /* A = A^2      |  B = B^2 */
+-                      mul_eltfp25519_2w_adx(UZr1, ZUr2, AB);  /* Ur1 = Zr2*A  |  Zr1 = Ur2*B */
+-                      ++j;
++      u64 *nq = p01_tmp1;  /* p01_tmp1[0..7]: x2 at +0, z2 at +4 */
++      u64 *nq_p1 = p01_tmp1 + (u32)8U;  /* p01_tmp1[8..15]: x3 at +0, z3 at +4 */
++      u64 *tmp1 = p01_tmp1 + (u32)16U;  /* scratch starting at p01_tmp1[16] */
++      u64 *x1 = q;  /* only passed to the final fmul() */
++      u64 *x2 = nq;
++      u64 *z2 = nq + (u32)4U;
++      u64 *z3 = nq_p1 + (u32)4U;
++      u64 *a = tmp1;
++      u64 *b = tmp1 + (u32)4U;
++      u64 *ab = tmp1;  /* a and b viewed as one 8-limb pair */
++      u64 *dc = tmp1 + (u32)8U;  /* d and c viewed as one 8-limb pair */
++      u64 *x3;
++      u64 *z31;
++      u64 *d0;
++      u64 *c0;
++      u64 *a1;
++      u64 *b1;
++      u64 *d;
++      u64 *c;
++      u64 *ab1;
++      u64 *dc1;
++      fadd(a, x2, z2);  /* NOTE(review): fadd/fsub/fmul*() are defined elsewhere; presumably a = x2 + z2 */
++      fsub(b, x2, z2);  /* presumably b = x2 - z2 */
++      x3 = nq_p1;
++      z31 = nq_p1 + (u32)4U;  /* same address as z3 */
++      d0 = dc;
++      c0 = dc + (u32)4U;
++      fadd(c0, x3, z31);
++      fsub(d0, x3, z31);
++      fmul2(dc, dc, ab, tmp2);  /* dc is updated in place from dc and ab */
++      fadd(x3, d0, c0);
++      fsub(z31, d0, c0);
++      a1 = tmp1;  /* a1, b1, d, c, ab1, dc1 re-name the slots of a, b, d0, c0, ab, dc */
++      b1 = tmp1 + (u32)4U;
++      d = tmp1 + (u32)8U;
++      c = tmp1 + (u32)12U;
++      ab1 = tmp1;
++      dc1 = tmp1 + (u32)8U;
++      fsqr2(dc1, ab1, tmp2);  /* d <- a1^2, c <- b1^2 */
++      fsqr2(nq_p1, nq_p1, tmp2);  /* squares x3 and z3 in place */
++      a1[0U] = c[0U];  /* a1 <- c, limb by limb */
++      a1[1U] = c[1U];
++      a1[2U] = c[2U];
++      a1[3U] = c[3U];
++      fsub(c, d, c);
++      fmul_scalar(b1, c, (u64)121665U);  /* 121665 == (486662 - 2) / 4 */
++      fadd(b1, b1, d);
++      fmul2(nq, dc1, ab1, tmp2);  /* result lands in nq, i.e. the x2 and z2 slots */
++      fmul(z3, z3, x1, tmp2);  /* z3 is updated in place using x1 */
++}
++
++static void point_double(u64 *nq, u64 *tmp1, u64 *tmp2)
++{
++      u64 *x2 = nq;  /* nq[0..3] */
++      u64 *z2 = nq + (u32)4U;  /* nq[4..7] */
++      u64 *a = tmp1;
++      u64 *b = tmp1 + (u32)4U;
++      u64 *d = tmp1 + (u32)8U;
++      u64 *c = tmp1 + (u32)12U;
++      u64 *ab = tmp1;  /* a and b viewed as one 8-limb pair */
++      u64 *dc = tmp1 + (u32)8U;  /* d and c viewed as one 8-limb pair */
++      fadd(a, x2, z2);  /* NOTE(review): fadd/fsub/fmul*() are defined elsewhere; presumably a = x2 + z2 */
++      fsub(b, x2, z2);  /* presumably b = x2 - z2 */
++      fsqr2(dc, ab, tmp2);  /* d <- a^2, c <- b^2 */
++      a[0U] = c[0U];  /* a <- c, limb by limb */
++      a[1U] = c[1U];
++      a[2U] = c[2U];
++      a[3U] = c[3U];
++      fsub(c, d, c);
++      fmul_scalar(b, c, (u64)121665U);  /* 121665 == (486662 - 2) / 4 */
++      fadd(b, b, d);
++      fmul2(nq, dc, ab, tmp2);  /* result overwrites nq */
++}
++
++static void montgomery_ladder(u64 *out, const u8 *key, u64 *init1)
++{
++      u64 tmp2[16U] = { 0U };  /* scratch handed to point_add_and_double()/point_double() */
++      u64 p01_tmp1_swap[33U] = { 0U };  /* [0..7] nq, [8..15] nq_p1, [16..31] tmp1, [32] swap bit */
++      u64 *p0 = p01_tmp1_swap;
++      u64 *p01 = p01_tmp1_swap;
++      u64 *p03 = p01;
++      u64 *p11 = p01 + (u32)8U;
++      u64 *x0;
++      u64 *z0;
++      u64 *p01_tmp1;
++      u64 *p01_tmp11;
++      u64 *nq10;
++      u64 *nq_p11;
++      u64 *swap1;
++      u64 sw0;
++      u64 *nq1;
++      u64 *tmp1;
++      memcpy(p11, init1, (u32)8U * sizeof(init1[0U]));  /* p01_tmp1_swap[8..15] <- init1[0..7] */
++      x0 = p03;
++      z0 = p03 + (u32)4U;
++      x0[0U] = (u64)1U;  /* p01_tmp1_swap[0..7] <- (x = 1, z = 0) */
++      x0[1U] = (u64)0U;
++      x0[2U] = (u64)0U;
++      x0[3U] = (u64)0U;
++      z0[0U] = (u64)0U;
++      z0[1U] = (u64)0U;
++      z0[2U] = (u64)0U;
++      z0[3U] = (u64)0U;
++      p01_tmp1 = p01_tmp1_swap;
++      p01_tmp11 = p01_tmp1_swap;
++      nq10 = p01_tmp1_swap;
++      nq_p11 = p01_tmp1_swap + (u32)8U;
++      swap1 = p01_tmp1_swap + (u32)32U;
++      cswap2((u64)1U, nq10, nq_p11);  /* fixed swap flag of 1, applied before any key bit is read */
++      point_add_and_double(init1, p01_tmp11, tmp2);
++      swap1[0U] = (u64)1U;
++      {
++              u32 i;
++              for (i = (u32)0U; i < (u32)251U; i = i + (u32)1U) {  /* 251 steps: key bits 253 down to 3 */
++                      u64 *p01_tmp12 = p01_tmp1_swap;
++                      u64 *swap2 = p01_tmp1_swap + (u32)32U;
++                      u64 *nq2 = p01_tmp12;
++                      u64 *nq_p12 = p01_tmp12 + (u32)8U;
++                      u64 bit = (u64)(key[((u32)253U - i) / (u32)8U] >> ((u32)253U - i) % (u32)8U & (u8)1U);
++                      u64 sw = swap2[0U] ^ bit;  /* 1 when this bit differs from the stored one */
++                      cswap2(sw, nq2, nq_p12);  /* NOTE(review): presumably a conditional swap of the two points — confirm */
++                      point_add_and_double(init1, p01_tmp12, tmp2);
++                      swap2[0U] = bit;  /* remembered for the next step and the final cswap2() */
+               }
+-              j = 0;
+       }
+-
+-      /* Doubling */
+-      for (i = 0; i < q; ++i) {
+-              add_eltfp25519_1w_adx(A, Ur1, Zr1);     /*  A = Ur1+Zr1 */
+-              sub_eltfp25519_1w(B, Ur1, Zr1);         /*  B = Ur1-Zr1 */
+-              sqr_eltfp25519_2w_adx(AB);              /*  A = A**2     B = B**2 */
+-              copy_eltfp25519_1w(C, B);               /*  C = B */
+-              sub_eltfp25519_1w(B, A, B);             /*  B = A-B */
+-              mul_a24_eltfp25519_1w(D, B);            /*  D = my_a24*B */
+-              add_eltfp25519_1w_adx(D, D, C);         /*  D = D+C */
+-              mul_eltfp25519_2w_adx(UZr1, AB, CD);    /*  Ur1 = A*B   Zr1 = Zr1*A */
+-      }
+-
+-      /* Convert to affine coordinates */
+-      inv_eltfp25519_1w_adx(A, Zr1);
+-      mul_eltfp25519_1w_adx((u64 *)session_key, Ur1, A);
+-      fred_eltfp25519_1w((u64 *)session_key);
+-
+-      memzero_explicit(&m, sizeof(m));
+-}
+-
+-static void curve25519_bmi2(u8 shared[CURVE25519_KEY_SIZE],
+-                          const u8 private_key[CURVE25519_KEY_SIZE],
+-                          const u8 session_key[CURVE25519_KEY_SIZE])
+-{
+-      struct {
+-              u64 buffer[4 * NUM_WORDS_ELTFP25519];
+-              u64 coordinates[4 * NUM_WORDS_ELTFP25519];
+-              u64 workspace[6 * NUM_WORDS_ELTFP25519];
+-              u8 session[CURVE25519_KEY_SIZE];
+-              u8 private[CURVE25519_KEY_SIZE];
+-      } __aligned(32) m;
+-
+-      int i = 0, j = 0;
+-      u64 prev = 0;
+-      u64 *const X1 = (u64 *)m.session;
+-      u64 *const key = (u64 *)m.private;
+-      u64 *const Px = m.coordinates + 0;
+-      u64 *const Pz = m.coordinates + 4;
+-      u64 *const Qx = m.coordinates + 8;
+-      u64 *const Qz = m.coordinates + 12;
+-      u64 *const X2 = Qx;
+-      u64 *const Z2 = Qz;
+-      u64 *const X3 = Px;
+-      u64 *const Z3 = Pz;
+-      u64 *const X2Z2 = Qx;
+-      u64 *const X3Z3 = Px;
+-
+-      u64 *const A = m.workspace + 0;
+-      u64 *const B = m.workspace + 4;
+-      u64 *const D = m.workspace + 8;
+-      u64 *const C = m.workspace + 12;
+-      u64 *const DA = m.workspace + 16;
+-      u64 *const CB = m.workspace + 20;
+-      u64 *const AB = A;
+-      u64 *const DC = D;
+-      u64 *const DACB = DA;
+-
+-      memcpy(m.private, private_key, sizeof(m.private));
+-      memcpy(m.session, session_key, sizeof(m.session));
+-
+-      curve25519_clamp_secret(m.private);
+-
+-      /* As in the draft:
+-       * When receiving such an array, implementations of curve25519
+-       * MUST mask the most-significant bit in the final byte. This
+-       * is done to preserve compatibility with point formats which
+-       * reserve the sign bit for use in other protocols and to
+-       * increase resistance to implementation fingerprinting
+-       */
+-      m.session[CURVE25519_KEY_SIZE - 1] &= (1 << (255 % 8)) - 1;
+-
+-      copy_eltfp25519_1w(Px, X1);
+-      setzero_eltfp25519_1w(Pz);
+-      setzero_eltfp25519_1w(Qx);
+-      setzero_eltfp25519_1w(Qz);
+-
+-      Pz[0] = 1;
+-      Qx[0] = 1;
+-
+-      /* main-loop */
+-      prev = 0;
+-      j = 62;
+-      for (i = 3; i >= 0; --i) {
+-              while (j >= 0) {
+-                      u64 bit = (key[i] >> j) & 0x1;
+-                      u64 swap = bit ^ prev;
+-                      prev = bit;
+-
+-                      add_eltfp25519_1w_bmi2(A, X2, Z2);      /* A = (X2+Z2) */
+-                      sub_eltfp25519_1w(B, X2, Z2);           /* B = (X2-Z2) */
+-                      add_eltfp25519_1w_bmi2(C, X3, Z3);      /* C = (X3+Z3) */
+-                      sub_eltfp25519_1w(D, X3, Z3);           /* D = (X3-Z3) */
+-                      mul_eltfp25519_2w_bmi2(DACB, AB, DC);   /* [DA|CB] = [A|B]*[D|C] */
+-
+-                      cselect(swap, A, C);
+-                      cselect(swap, B, D);
+-
+-                      sqr_eltfp25519_2w_bmi2(AB);             /* [AA|BB] = [A^2|B^2] */
+-                      add_eltfp25519_1w_bmi2(X3, DA, CB);     /* X3 = (DA+CB) */
+-                      sub_eltfp25519_1w(Z3, DA, CB);          /* Z3 = (DA-CB) */
+-                      sqr_eltfp25519_2w_bmi2(X3Z3);           /* [X3|Z3] = [(DA+CB)|(DA+CB)]^2 */
+-
+-                      copy_eltfp25519_1w(X2, B);              /* X2 = B^2 */
+-                      sub_eltfp25519_1w(Z2, A, B);            /* Z2 = E = AA-BB */
+-
+-                      mul_a24_eltfp25519_1w(B, Z2);           /* B = a24*E */
+-                      add_eltfp25519_1w_bmi2(B, B, X2);       /* B = a24*E+B */
+-                      mul_eltfp25519_2w_bmi2(X2Z2, X2Z2, AB); /* [X2|Z2] = [B|E]*[A|a24*E+B] */
+-                      mul_eltfp25519_1w_bmi2(Z3, Z3, X1);     /* Z3 = Z3*X1 */
+-                      --j;
++      sw0 = swap1[0U];  /* last bit stored by the loop */
++      cswap2(sw0, nq10, nq_p11);
++      nq1 = p01_tmp1;
++      tmp1 = p01_tmp1 + (u32)16U;
++      point_double(nq1, tmp1, tmp2);  /* three plain doublings; key bits 2..0 are never read */
++      point_double(nq1, tmp1, tmp2);
++      point_double(nq1, tmp1, tmp2);
++      memcpy(out, p0, (u32)8U * sizeof(p0[0U]));  /* out[0..7] <- p01_tmp1_swap[0..7] */
++
++      memzero_explicit(tmp2, sizeof(tmp2));  /* wipe both key-dependent work buffers */
++      memzero_explicit(p01_tmp1_swap, sizeof(p01_tmp1_swap));
++}
++
++static void fsquare_times(u64 *o, const u64 *inp, u64 *tmp, u32 n1)
++{
++      u32 i;
++      fsqr(o, inp, tmp);  /* first squaring always runs: o <- inp^2 */
++      for (i = (u32)0U; i < n1 - (u32)1U; i = i + (u32)1U)  /* n1 - 1 more; n1 == 0 would wrap the u32 bound, so pass n1 >= 1 */
++              fsqr(o, o, tmp);  /* in place on o */
++}
++
++static void finv(u64 *o, const u64 *i, u64 *tmp)
++{
++      u64 t1[16U] = { 0U };  /* a0 | b | c | t00, four limbs each */
++      u64 *a0 = t1;
++      u64 *b = t1 + (u32)4U;
++      u64 *c = t1 + (u32)8U;
++      u64 *t00 = t1 + (u32)12U;
++      u64 *tmp1 = tmp;  /* alias of tmp */
++      u64 *a;
++      u64 *t0;
++      fsquare_times(a0, i, tmp1, (u32)1U);  /* a0 = i^2 (exponent notes assume fsqr/fmul square/multiply — TODO confirm) */
++      fsquare_times(t00, a0, tmp1, (u32)2U);  /* t00 = i^8 */
++      fmul(b, t00, i, tmp);  /* b = i^9 */
++      fmul(a0, b, a0, tmp);  /* a0 = i^11 */
++      fsquare_times(t00, a0, tmp1, (u32)1U);  /* t00 = i^22 */
++      fmul(b, t00, b, tmp);  /* b = i^31 = i^(2^5 - 1) */
++      fsquare_times(t00, b, tmp1, (u32)5U);  /* t00 = i^(2^10 - 2^5) */
++      fmul(b, t00, b, tmp);  /* b = i^(2^10 - 1) */
++      fsquare_times(t00, b, tmp1, (u32)10U);  /* t00 = i^(2^20 - 2^10) */
++      fmul(c, t00, b, tmp);  /* c = i^(2^20 - 1) */
++      fsquare_times(t00, c, tmp1, (u32)20U);  /* t00 = i^(2^40 - 2^20) */
++      fmul(t00, t00, c, tmp);  /* t00 = i^(2^40 - 1) */
++      fsquare_times(t00, t00, tmp1, (u32)10U);  /* t00 = i^(2^50 - 2^10) */
++      fmul(b, t00, b, tmp);  /* b = i^(2^50 - 1) */
++      fsquare_times(t00, b, tmp1, (u32)50U);  /* t00 = i^(2^100 - 2^50) */
++      fmul(c, t00, b, tmp);  /* c = i^(2^100 - 1) */
++      fsquare_times(t00, c, tmp1, (u32)100U);  /* t00 = i^(2^200 - 2^100) */
++      fmul(t00, t00, c, tmp);  /* t00 = i^(2^200 - 1) */
++      fsquare_times(t00, t00, tmp1, (u32)50U);  /* t00 = i^(2^250 - 2^50) */
++      fmul(t00, t00, b, tmp);  /* t00 = i^(2^250 - 1) */
++      fsquare_times(t00, t00, tmp1, (u32)5U);  /* t00 = i^(2^255 - 2^5) */
++      a = t1;  /* same slot as a0 (i^11) */
++      t0 = t1 + (u32)12U;  /* same slot as t00 */
++      fmul(o, t0, a, tmp);  /* o = i^(2^255 - 21), i.e. i^(p - 2) for p = 2^255 - 19 */
++}
++
++static void store_felem(u64 *b, u64 *f)
++{
++      u64 f30 = f[3U];
++      u64 top_bit0 = f30 >> (u32)63U;  /* bit 63 of the top limb, i.e. bit 255 of f */
++      u64 carry0;  /* assigned below but never read */
++      u64 f31;
++      u64 top_bit;
++      u64 carry;  /* assigned below but never read */
++      u64 f0;
++      u64 f1;
++      u64 f2;
++      u64 f3;
++      u64 m0;
++      u64 m1;
++      u64 m2;
++      u64 m3;
++      u64 mask;
++      u64 f0_;
++      u64 f1_;
++      u64 f2_;
++      u64 f3_;
++      u64 o0;
++      u64 o1;
++      u64 o2;
++      u64 o3;
++      f[3U] = f30 & (u64)0x7fffffffffffffffU;  /* clear bit 255 in the caller's buffer */
++      carry0 = add_scalar(f, f, (u64)19U * top_bit0);  /* adds 19 when the bit was set, 0 otherwise (add_scalar presumably does f += scalar) */
++      f31 = f[3U];
++      top_bit = f31 >> (u32)63U;  /* same folding step a second time */
++      f[3U] = f31 & (u64)0x7fffffffffffffffU;
++      carry = add_scalar(f, f, (u64)19U * top_bit);
++      f0 = f[0U];
++      f1 = f[1U];
++      f2 = f[2U];
++      f3 = f[3U];
++      m0 = gte_mask(f0, (u64)0xffffffffffffffedU);  /* NOTE(review): gte_mask/eq_mask presumably return all-ones on true, 0 otherwise — confirm */
++      m1 = eq_mask(f1, (u64)0xffffffffffffffffU);
++      m2 = eq_mask(f2, (u64)0xffffffffffffffffU);
++      m3 = eq_mask(f3, (u64)0x7fffffffffffffffU);
++      mask = ((m0 & m1) & m2) & m3;  /* set only when all four limb tests pass */
++      f0_ = f0 - (mask & (u64)0xffffffffffffffedU);  /* the four constants are the limbs of 2^255 - 19 */
++      f1_ = f1 - (mask & (u64)0xffffffffffffffffU);
++      f2_ = f2 - (mask & (u64)0xffffffffffffffffU);
++      f3_ = f3 - (mask & (u64)0x7fffffffffffffffU);
++      o0 = f0_;
++      o1 = f1_;
++      o2 = f2_;
++      o3 = f3_;
++      b[0U] = o0;  /* b[0..3] receives the result; f itself was modified above */
++      b[1U] = o1;
++      b[2U] = o2;
++      b[3U] = o3;
++}
++
++static void encode_point(u8 *o, const u64 *i)
++{
++      const u64 *x = i;  /* i[0..3] */
++      const u64 *z = i + (u32)4U;  /* i[4..7] */
++      u64 tmp[4U] = { 0U };  /* NOTE(review): tmp and tmp_w are not wiped on return, unlike montgomery_ladder() */
++      u64 tmp_w[16U] = { 0U };  /* scratch for finv() and fmul() */
++      finv(tmp, z, tmp_w);
++      fmul(tmp, tmp, x, tmp_w);  /* presumably tmp <- x * z^-1 */
++      store_felem((u64 *)o, tmp);  /* stores four u64 (32 bytes) at o through the cast pointer */
++}
++
++static void curve25519_ever64(u8 *out, const u8 *priv, const u8 *pub)
++{
++      u64 init1[8U] = { 0U };  /* x in [0..3], z in [4..7] */
++      u64 tmp[4U] = { 0U };
++      u64 tmp3;
++      u64 *x;
++      u64 *z;
++      {
++              u32 i;
++              for (i = (u32)0U; i < (u32)4U; i = i + (u32)1U) {  /* tmp[i] <- the 8 bytes at pub + 8*i */
++                      u64 *os = tmp;
++                      const u8 *bj = pub + i * (u32)8U;
++                      u64 u = *(u64 *)bj;  /* native u64 load; the cast drops const and does no alignment or byte-order handling */
++                      u64 r = u;
++                      u64 x0 = r;
++                      os[i] = x0;
+               }
+-              j = 63;
+       }
++      tmp3 = tmp[3U];
++      tmp[3U] = tmp3 & (u64)0x7fffffffffffffffU;  /* drop bit 255 of the public value */
++      x = init1;
++      z = init1 + (u32)4U;
++      z[0U] = (u64)1U;  /* z <- 1 */
++      z[1U] = (u64)0U;
++      z[2U] = (u64)0U;
++      z[3U] = (u64)0U;
++      x[0U] = tmp[0U];  /* x <- masked public value */
++      x[1U] = tmp[1U];
++      x[2U] = tmp[2U];
++      x[3U] = tmp[3U];
++      montgomery_ladder(init1, priv, init1);  /* init1 is both the start point and the output buffer */
++      encode_point(out, init1);
++}
++
++/* The below constants were generated using this sage script:
++ *
++ * #!/usr/bin/env sage
++ * import sys
++ * from sage.all import *
++ * def limbs(n):
++ *    n = int(n)
++ *    l = ((n >> 0) % 2^64, (n >> 64) % 2^64, (n >> 128) % 2^64, (n >> 192) % 2^64)
++ *    return "0x%016xULL, 0x%016xULL, 0x%016xULL, 0x%016xULL" % l
++ * ec = EllipticCurve(GF(2^255 - 19), [0, 486662, 0, 1, 0])
++ * p_minus_s = (ec.lift_x(9) - ec.lift_x(1))[0]
++ * print("static const u64 p_minus_s[] = { %s };\n" % limbs(p_minus_s))
++ * print("static const u64 table_ladder[] = {")
++ * p = ec.lift_x(9)
++ * for i in range(252):
++ *    l = (p[0] + p[2]) / (p[0] - p[2])
++ *    print(("\t%s" + ("," if i != 251 else "")) % limbs(l))
++ *    p = p * 2
++ * print("};")
++ *
++ */
+ 
+-      inv_eltfp25519_1w_bmi2(A, Qz);
+-      mul_eltfp25519_1w_bmi2((u64 *)shared, Qx, A);
+-      fred_eltfp25519_1w((u64 *)shared);
++static const u64 p_minus_s[] = { 0x816b1e0137d48290ULL, 0x440f6a51eb4d1207ULL, 0x52385f46dca2b71dULL, 0x215132111d8354cbULL }; /* least-significant limb first, per limbs() in the script above */
+ 
+-      memzero_explicit(&m, sizeof(m));
+-}
++static const u64 table_ladder[] = {
++      0xfffffffffffffff3ULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0x5fffffffffffffffULL,
++      0x6b8220f416aafe96ULL, 0x82ebeb2b4f566a34ULL, 0xd5a9a5b075a5950fULL, 0x5142b2cf4b2488f4ULL,
++      0x6aaebc750069680cULL, 0x89cf7820a0f99c41ULL, 0x2a58d9183b56d0f4ULL, 0x4b5aca80e36011a4ULL,
++      0x329132348c29745dULL, 0xf4a2e616e1642fd7ULL, 0x1e45bb03ff67bc34ULL, 0x306912d0f42a9b4aULL,
++      0xff886507e6af7154ULL, 0x04f50e13dfeec82fULL, 0xaa512fe82abab5ceULL, 0x174e251a68d5f222ULL,
++      0xcf96700d82028898ULL, 0x1743e3370a2c02c5ULL, 0x379eec98b4e86eaaULL, 0x0c59888a51e0482eULL,
++      0xfbcbf1d699b5d189ULL, 0xacaef0d58e9fdc84ULL, 0xc1c20d06231f7614ULL, 0x2938218da274f972ULL,
++      0xf6af49beff1d7f18ULL, 0xcc541c22387ac9c2ULL, 0x96fcc9ef4015c56bULL, 0x69c1627c690913a9ULL,
++      0x7a86fd2f4733db0eULL, 0xfdb8c4f29e087de9ULL, 0x095e4b1a8ea2a229ULL, 0x1ad7a7c829b37a79ULL,
++      0x342d89cad17ea0c0ULL, 0x67bedda6cced2051ULL, 0x19ca31bf2bb42f74ULL, 0x3df7b4c84980acbbULL,
++      0xa8c6444dc80ad883ULL, 0xb91e440366e3ab85ULL, 0xc215cda00164f6d8ULL, 0x3d867c6ef247e668ULL,
++      0xc7dd582bcc3e658cULL, 0xfd2c4748ee0e5528ULL, 0xa0fd9b95cc9f4f71ULL, 0x7529d871b0675ddfULL,
++      0xb8f568b42d3cbd78ULL, 0x1233011b91f3da82ULL, 0x2dce6ccd4a7c3b62ULL, 0x75e7fc8e9e498603ULL,
++      0x2f4f13f1fcd0b6ecULL, 0xf1a8ca1f29ff7a45ULL, 0xc249c1a72981e29bULL, 0x6ebe0dbb8c83b56aULL,
++      0x7114fa8d170bb222ULL, 0x65a2dcd5bf93935fULL, 0xbdc41f68b59c979aULL, 0x2f0eef79a2ce9289ULL,
++      0x42ecbf0c083c37ceULL, 0x2930bc09ec496322ULL, 0xf294b0c19cfeac0dULL, 0x3780aa4bedfabb80ULL,
++      0x56c17d3e7cead929ULL, 0xe7cb4beb2e5722c5ULL, 0x0ce931732dbfe15aULL, 0x41b883c7621052f8ULL,
++      0xdbf75ca0c3d25350ULL, 0x2936be086eb1e351ULL, 0xc936e03cb4a9b212ULL, 0x1d45bf82322225aaULL,
++      0xe81ab1036a024cc5ULL, 0xe212201c304c9a72ULL, 0xc5d73fba6832b1fcULL, 0x20ffdb5a4d839581ULL,
++      0xa283d367be5d0fadULL, 0x6c2b25ca8b164475ULL, 0x9d4935467caaf22eULL, 0x5166408eee85ff49ULL,
++      0x3c67baa2fab4e361ULL, 0xb3e433c67ef35cefULL, 0x5259729241159b1cULL, 0x6a621892d5b0ab33ULL,
++      0x20b74a387555cdcbULL, 0x532aa10e1208923fULL, 0xeaa17b7762281dd1ULL, 0x61ab3443f05c44bfULL,
++      0x257a6c422324def8ULL, 0x131c6c1017e3cf7fULL, 0x23758739f630a257ULL, 0x295a407a01a78580ULL,
++      0xf8c443246d5da8d9ULL, 0x19d775450c52fa5dULL, 0x2afcfc92731bf83dULL, 0x7d10c8e81b2b4700ULL,
++      0xc8e0271f70baa20bULL, 0x993748867ca63957ULL, 0x5412efb3cb7ed4bbULL, 0x3196d36173e62975ULL,
++      0xde5bcad141c7dffcULL, 0x47cc8cd2b395c848ULL, 0xa34cd942e11af3cbULL, 0x0256dbf2d04ecec2ULL,
++      0x875ab7e94b0e667fULL, 0xcad4dd83c0850d10ULL, 0x47f12e8f4e72c79fULL, 0x5f1a87bb8c85b19bULL,
++      0x7ae9d0b6437f51b8ULL, 0x12c7ce5518879065ULL, 0x2ade09fe5cf77aeeULL, 0x23a05a2f7d2c5627ULL,
++      0x5908e128f17c169aULL, 0xf77498dd8ad0852dULL, 0x74b4c4ceab102f64ULL, 0x183abadd10139845ULL,
++      0xb165ba8daa92aaacULL, 0xd5c5ef9599386705ULL, 0xbe2f8f0cf8fc40d1ULL, 0x2701e635ee204514ULL,
++      0x629fa80020156514ULL, 0xf223868764a8c1ceULL, 0x5b894fff0b3f060eULL, 0x60d9944cf708a3faULL,
++      0xaeea001a1c7a201fULL, 0xebf16a633ee2ce63ULL, 0x6f7709594c7a07e1ULL, 0x79b958150d0208cbULL,
++      0x24b55e5301d410e7ULL, 0xe3a34edff3fdc84dULL, 0xd88768e4904032d8ULL, 0x131384427b3aaeecULL,
++      0x8405e51286234f14ULL, 0x14dc4739adb4c529ULL, 0xb8a2b5b250634ffdULL, 0x2fe2a94ad8a7ff93ULL,
++      0xec5c57efe843faddULL, 0x2843ce40f0bb9918ULL, 0xa4b561d6cf3d6305ULL, 0x743629bde8fb777eULL,
++      0x343edd46bbaf738fULL, 0xed981828b101a651ULL, 0xa401760b882c797aULL, 0x1fc223e28dc88730ULL,
++      0x48604e91fc0fba0eULL, 0xb637f78f052c6fa4ULL, 0x91ccac3d09e9239cULL, 0x23f7eed4437a687cULL,
++      0x5173b1118d9bd800ULL, 0x29d641b63189d4a7ULL, 0xfdbf177988bbc586ULL, 0x2959894fcad81df5ULL,
++      0xaebc8ef3b4bbc899ULL, 0x4148995ab26992b9ULL, 0x24e20b0134f92cfbULL, 0x40d158894a05dee8ULL,
++      0x46b00b1185af76f6ULL, 0x26bac77873187a79ULL, 0x3dc0bf95ab8fff5fULL, 0x2a608bd8945524d7ULL,
++      0x26449588bd446302ULL, 0x7c4bc21c0388439cULL, 0x8e98a4f383bd11b2ULL, 0x26218d7bc9d876b9ULL,
++      0xe3081542997c178aULL, 0x3c2d29a86fb6606fULL, 0x5c217736fa279374ULL, 0x7dde05734afeb1faULL,
++      0x3bf10e3906d42babULL, 0xe4f7803e1980649cULL, 0xe6053bf89595bf7aULL, 0x394faf38da245530ULL,
++      0x7a8efb58896928f4ULL, 0xfbc778e9cc6a113cULL, 0x72670ce330af596fULL, 0x48f222a81d3d6cf7ULL,
++      0xf01fce410d72caa7ULL, 0x5a20ecc7213b5595ULL, 0x7bc21165c1fa1483ULL, 0x07f89ae31da8a741ULL,
++      0x05d2c2b4c6830ff9ULL, 0xd43e330fc6316293ULL, 0xa5a5590a96d3a904ULL, 0x705edb91a65333b6ULL,
++      0x048ee15e0bb9a5f7ULL, 0x3240cfca9e0aaf5dULL, 0x8f4b71ceedc4a40bULL, 0x621c0da3de544a6dULL,
++      0x92872836a08c4091ULL, 0xce8375b010c91445ULL, 0x8a72eb524f276394ULL, 0x2667fcfa7ec83635ULL,
++      0x7f4c173345e8752aULL, 0x061b47feee7079a5ULL, 0x25dd9afa9f86ff34ULL, 0x3780cef5425dc89cULL,
++      0x1a46035a513bb4e9ULL, 0x3e1ef379ac575adaULL, 0xc78c5f1c5fa24b50ULL, 0x321a967634fd9f22ULL,
++      0x946707b8826e27faULL, 0x3dca84d64c506fd0ULL, 0xc189218075e91436ULL, 0x6d9284169b3b8484ULL,
++      0x3a67e840383f2ddfULL, 0x33eec9a30c4f9b75ULL, 0x3ec7c86fa783ef47ULL, 0x26ec449fbac9fbc4ULL,
++      0x5c0f38cba09b9e7dULL, 0x81168cc762a3478cULL, 0x3e23b0d306fc121cULL, 0x5a238aa0a5efdcddULL,
++      0x1ba26121c4ea43ffULL, 0x36f8c77f7c8832b5ULL, 0x88fbea0b0adcf99aULL, 0x5ca9938ec25bebf9ULL,
++      0xd5436a5e51fccda0ULL, 0x1dbc4797c2cd893bULL, 0x19346a65d3224a08ULL, 0x0f5034e49b9af466ULL,
++      0xf23c3967a1e0b96eULL, 0xe58b08fa867a4d88ULL, 0xfb2fabc6a7341679ULL, 0x2a75381eb6026946ULL,
++      0xc80a3be4c19420acULL, 0x66b1f6c681f2b6dcULL, 0x7cf7036761e93388ULL, 0x25abbbd8a660a4c4ULL,
++      0x91ea12ba14fd5198ULL, 0x684950fc4a3cffa9ULL, 0xf826842130f5ad28ULL, 0x3ea988f75301a441ULL,
++      0xc978109a695f8c6fULL, 0x1746eb4a0530c3f3ULL, 0x444d6d77b4459995ULL, 0x75952b8c054e5cc7ULL,
++      0xa3703f7915f4d6aaULL, 0x66c346202f2647d8ULL, 0xd01469df811d644bULL, 0x77fea47d81a5d71fULL,
++      0xc5e9529ef57ca381ULL, 0x6eeeb4b9ce2f881aULL, 0xb6e91a28e8009bd6ULL, 0x4b80be3e9afc3fecULL,
++      0x7e3773c526aed2c5ULL, 0x1b4afcb453c9a49dULL, 0xa920bdd7baffb24dULL, 0x7c54699f122d400eULL,
++      0xef46c8e14fa94bc8ULL, 0xe0b074ce2952ed5eULL, 0xbea450e1dbd885d5ULL, 0x61b68649320f712cULL,
++      0x8a485f7309ccbdd1ULL, 0xbd06320d7d4d1a2dULL, 0x25232973322dbef4ULL, 0x445dc4758c17f770ULL,
++      0xdb0434177cc8933cULL, 0xed6fe82175ea059fULL, 0x1efebefdc053db34ULL, 0x4adbe867c65daf99ULL,
++      0x3acd71a2a90609dfULL, 0xe5e991856dd04050ULL, 0x1ec69b688157c23cULL, 0x697427f6885cfe4dULL,
++      0xd7be7b9b65e1a851ULL, 0xa03d28d522c536ddULL, 0x28399d658fd2b645ULL, 0x49e5b7e17c2641e1ULL,
++      0x6f8c3a98700457a4ULL, 0x5078f0a25ebb6778ULL, 0xd13c3ccbc382960fULL, 0x2e003258a7df84b1ULL,
++      0x8ad1f39be6296a1cULL, 0xc1eeaa652a5fbfb2ULL, 0x33ee0673fd26f3cbULL, 0x59256173a69d2cccULL,
++      0x41ea07aa4e18fc41ULL, 0xd9fc19527c87a51eULL, 0xbdaacb805831ca6fULL, 0x445b652dc916694fULL,
++      0xce92a3a7f2172315ULL, 0x1edc282de11b9964ULL, 0xa1823aafe04c314aULL, 0x790a2d94437cf586ULL,
++      0x71c447fb93f6e009ULL, 0x8922a56722845276ULL, 0xbf70903b204f5169ULL, 0x2f7a89891ba319feULL,
++      0x02a08eb577e2140cULL, 0xed9a4ed4427bdcf4ULL, 0x5253ec44e4323cd1ULL, 0x3e88363c14e9355bULL,
++      0xaa66c14277110b8cULL, 0x1ae0391610a23390ULL, 0x2030bd12c93fc2a2ULL, 0x3ee141579555c7abULL,
++      0x9214de3a6d6e7d41ULL, 0x3ccdd88607f17efeULL, 0x674f1288f8e11217ULL, 0x5682250f329f93d0ULL,
++      0x6cf00b136d2e396eULL, 0x6e4cf86f1014debfULL, 0x5930b1b5bfcc4e83ULL, 0x047069b48aba16b6ULL,
++      0x0d4ce4ab69b20793ULL, 0xb24db91a97d0fb9eULL, 0xcdfa50f54e00d01dULL, 0x221b1085368bddb5ULL,
++      0xe7e59468b1e3d8d2ULL, 0x53c56563bd122f93ULL, 0xeee8a903e0663f09ULL, 0x61efa662cbbe3d42ULL,
++      0x2cf8ddddde6eab2aULL, 0x9bf80ad51435f231ULL, 0x5deadacec9f04973ULL, 0x29275b5d41d29b27ULL,
++      0xcfde0f0895ebf14fULL, 0xb9aab96b054905a7ULL, 0xcae80dd9a1c420fdULL, 0x0a63bf2f1673bbc7ULL,
++      0x092f6e11958fbc8cULL, 0x672a81e804822fadULL, 0xcac8351560d52517ULL, 0x6f3f7722c8f192f8ULL,
++      0xf8ba90ccc2e894b7ULL, 0x2c7557a438ff9f0dULL, 0x894d1d855ae52359ULL, 0x68e122157b743d69ULL,
++      0xd87e5570cfb919f3ULL, 0x3f2cdecd95798db9ULL, 0x2121154710c0a2ceULL, 0x3c66a115246dc5b2ULL,
++      0xcbedc562294ecb72ULL, 0xba7143c36a280b16ULL, 0x9610c2efd4078b67ULL, 0x6144735d946a4b1eULL,
++      0x536f111ed75b3350ULL, 0x0211db8c2041d81bULL, 0xf93cb1000e10413cULL, 0x149dfd3c039e8876ULL,
++      0xd479dde46b63155bULL, 0xb66e15e93c837976ULL, 0xdafde43b1f13e038ULL, 0x5fafda1a2e4b0b35ULL,
++      0x3600bbdf17197581ULL, 0x3972050bbe3cd2c2ULL, 0x5938906dbdd5be86ULL, 0x34fce5e43f9b860fULL,
++      0x75a8a4cd42d14d02ULL, 0x828dabc53441df65ULL, 0x33dcabedd2e131d3ULL, 0x3ebad76fb814d25fULL,
++      0xd4906f566f70e10fULL, 0x5d12f7aa51690f5aULL, 0x45adb16e76cefcf2ULL, 0x01f768aead232999ULL,
++      0x2b6cc77b6248febdULL, 0x3cd30628ec3aaffdULL, 0xce1c0b80d4ef486aULL, 0x4c3bff2ea6f66c23ULL,
++      0x3f2ec4094aeaeb5fULL, 0x61b19b286e372ca7ULL, 0x5eefa966de2a701dULL, 0x23b20565de55e3efULL,
++      0xe301ca5279d58557ULL, 0x07b2d4ce27c2874fULL, 0xa532cd8a9dcf1d67ULL, 0x2a52fee23f2bff56ULL,
++      0x8624efb37cd8663dULL, 0xbbc7ac20ffbd7594ULL, 0x57b85e9c82d37445ULL, 0x7b3052cb86a6ec66ULL,
++      0x3482f0ad2525e91eULL, 0x2cb68043d28edca0ULL, 0xaf4f6d052e1b003aULL, 0x185f8c2529781b0aULL,
++      0xaa41de5bd80ce0d6ULL, 0x9407b2416853e9d6ULL, 0x563ec36e357f4c3aULL, 0x4cc4b8dd0e297bceULL,
++      0xa2fc1a52ffb8730eULL, 0x1811f16e67058e37ULL, 0x10f9a366cddf4ee1ULL, 0x72f4a0c4a0b9f099ULL,
++      0x8c16c06f663f4ea7ULL, 0x693b3af74e970fbaULL, 0x2102e7f1d69ec345ULL, 0x0ba53cbc968a8089ULL,
++      0xca3d9dc7fea15537ULL, 0x4c6824bb51536493ULL, 0xb9886314844006b1ULL, 0x40d2a72ab454cc60ULL,
++      0x5936a1b712570975ULL, 0x91b9d648debda657ULL, 0x3344094bb64330eaULL, 0x006ba10d12ee51d0ULL,
++      0x19228468f5de5d58ULL, 0x0eb12f4c38cc05b0ULL, 0xa1039f9dd5601990ULL, 0x4502d4ce4fff0e0bULL,
++      0xeb2054106837c189ULL, 0xd0f6544c6dd3b93cULL, 0x40727064c416d74fULL, 0x6e15c6114b502ef0ULL,
++      0x4df2a398cfb1a76bULL, 0x11256c7419f2f6b1ULL, 0x4a497962066e6043ULL, 0x705b3aab41355b44ULL,
++      0x365ef536d797b1d8ULL, 0x00076bd622ddf0dbULL, 0x3bbf33b0e0575a88ULL, 0x3777aa05c8e4ca4dULL,
++      0x392745c85578db5fULL, 0x6fda4149dbae5ae2ULL, 0xb1f0b00b8adc9867ULL, 0x09963437d36f1da3ULL,
++      0x7e824e90a5dc3853ULL, 0xccb5f6641f135cbdULL, 0x6736d86c87ce8fccULL, 0x625f3ce26604249fULL,
++      0xaf8ac8059502f63fULL, 0x0c05e70a2e351469ULL, 0x35292e9c764b6305ULL, 0x1a394360c7e23ac3ULL,
++      0xd5c6d53251183264ULL, 0x62065abd43c2b74fULL, 0xb5fbf5d03b973f9bULL, 0x13a3da3661206e5eULL,
++      0xc6bd5837725d94e5ULL, 0x18e30912205016c5ULL, 0x2088ce1570033c68ULL, 0x7fba1f495c837987ULL,
++      0x5a8c7423f2f9079dULL, 0x1735157b34023fc5ULL, 0xe4f9b49ad2fab351ULL, 0x6691ff72c878e33cULL,
++      0x122c2adedc5eff3eULL, 0xf8dd4bf1d8956cf4ULL, 0xeb86205d9e9e5bdaULL, 0x049b92b9d975c743ULL,
++      0xa5379730b0f6c05aULL, 0x72a0ffacc6f3a553ULL, 0xb0032c34b20dcd6dULL, 0x470e9dbc88d5164aULL,
++      0xb19cf10ca237c047ULL, 0xb65466711f6c81a2ULL, 0xb3321bd16dd80b43ULL, 0x48c14f600c5fbe8eULL,
++      0x66451c264aa6c803ULL, 0xb66e3904a4fa7da6ULL, 0xd45f19b0b3128395ULL, 0x31602627c3c9bc10ULL,
++      0x3120dc4832e4e10dULL, 0xeb20c46756c717f7ULL, 0x00f52e3f67280294ULL, 0x566d4fc14730c509ULL,
++      0x7e3a5d40fd837206ULL, 0xc1e926dc7159547aULL, 0x216730fba68d6095ULL, 0x22e8c3843f69cea7ULL,
++      0x33d074e8930e4b2bULL, 0xb6e4350e84d15816ULL, 0x5534c26ad6ba2365ULL, 0x7773c12f89f1f3f3ULL,
++      0x8cba404da57962aaULL, 0x5b9897a81999ce56ULL, 0x508e862f121692fcULL, 0x3a81907fa093c291ULL,
++      0x0dded0ff4725a510ULL, 0x10d8cc10673fc503ULL, 0x5b9d151c9f1f4e89ULL, 0x32a5c1d5cb09a44cULL,
++      0x1e0aa442b90541fbULL, 0x5f85eb7cc1b485dbULL, 0xbee595ce8a9df2e5ULL, 0x25e496c722422236ULL,
++      0x5edf3c46cd0fe5b9ULL, 0x34e75a7ed2a43388ULL, 0xe488de11d761e352ULL, 0x0e878a01a085545cULL,
++      0xba493c77e021bb04ULL, 0x2b4d1843c7df899aULL, 0x9ea37a487ae80d67ULL, 0x67a9958011e41794ULL,
++      0x4b58051a6697b065ULL, 0x47e33f7d8d6ba6d4ULL, 0xbb4da8d483ca46c1ULL, 0x68becaa181c2db0dULL,
++      0x8d8980e90b989aa5ULL, 0xf95eb14a2c93c99bULL, 0x51c6c7c4796e73a2ULL, 0x6e228363b5efb569ULL,
++      0xc6bbc0b02dd624c8ULL, 0x777eb47dec8170eeULL, 0x3cde15a004cfafa9ULL, 0x1dc6bc087160bf9bULL,
++      0x2e07e043eec34002ULL, 0x18e9fc677a68dc7fULL, 0xd8da03188bd15b9aULL, 0x48fbc3bb00568253ULL,
++      0x57547d4cfb654ce1ULL, 0xd3565b82a058e2adULL, 0xf63eaf0bbf154478ULL, 0x47531ef114dfbb18ULL,
++      0xe1ec630a4278c587ULL, 0x5507d546ca8e83f3ULL, 0x85e135c63adc0c2bULL, 0x0aa7efa85682844eULL,
++      0x72691ba8b3e1f615ULL, 0x32b4e9701fbe3ffaULL, 0x97b6d92e39bb7868ULL, 0x2cfe53dea02e39e8ULL,
++      0x687392cd85cd52b0ULL, 0x27ff66c910e29831ULL, 0x97134556a9832d06ULL, 0x269bb0360a84f8a0ULL,
++      0x706e55457643f85cULL, 0x3734a48c9b597d1bULL, 0x7aee91e8c6efa472ULL, 0x5cd6abc198a9d9e0ULL,
++      0x0e04de06cb3ce41aULL, 0xd8c6eb893402e138ULL, 0x904659bb686e3772ULL, 0x7215c371746ba8c8ULL,
++      0xfd12a97eeae4a2d9ULL, 0x9514b7516394f2c5ULL, 0x266fd5809208f294ULL, 0x5c847085619a26b9ULL,
++      0x52985410fed694eaULL, 0x3c905b934a2ed254ULL, 0x10bb47692d3be467ULL, 0x063b3d2d69e5e9e1ULL,
++      0x472726eedda57debULL, 0xefb6c4ae10f41891ULL, 0x2b1641917b307614ULL, 0x117c554fc4f45b7cULL,
++      0xc07cf3118f9d8812ULL, 0x01dbd82050017939ULL, 0xd7e803f4171b2827ULL, 0x1015e87487d225eaULL,
++      0xc58de3fed23acc4dULL, 0x50db91c294a7be2dULL, 0x0b94d43d1c9cf457ULL, 0x6b1640fa6e37524aULL,
++      0x692f346c5fda0d09ULL, 0x200b1c59fa4d3151ULL, 0xb8c46f760777a296ULL, 0x4b38395f3ffdfbcfULL,
++      0x18d25e00be54d671ULL, 0x60d50582bec8aba6ULL, 0x87ad8f263b78b982ULL, 0x50fdf64e9cda0432ULL,
++      0x90f567aac578dcf0ULL, 0xef1e9b0ef2a3133bULL, 0x0eebba9242d9de71ULL, 0x15473c9bf03101c7ULL,
++      0x7c77e8ae56b78095ULL, 0xb678e7666e6f078eULL, 0x2da0b9615348ba1fULL, 0x7cf931c1ff733f0bULL,
++      0x26b357f50a0a366cULL, 0xe9708cf42b87d732ULL, 0xc13aeea5f91cb2c0ULL, 0x35d90c991143bb4cULL,
++      0x47c1c404a9a0d9dcULL, 0x659e58451972d251ULL, 0x3875a8c473b38c31ULL, 0x1fbd9ed379561f24ULL,
++      0x11fabc6fd41ec28dULL, 0x7ef8dfe3cd2a2dcaULL, 0x72e73b5d8c404595ULL, 0x6135fa4954b72f27ULL,
++      0xccfc32a2de24b69cULL, 0x3f55698c1f095d88ULL, 0xbe3350ed5ac3f929ULL, 0x5e9bf806ca477eebULL,
++      0xe9ce8fb63c309f68ULL, 0x5376f63565e1f9f4ULL, 0xd1afcfb35a6393f1ULL, 0x6632a1ede5623506ULL,
++      0x0b7d6c390c2ded4cULL, 0x56cb3281df04cb1fULL, 0x66305a1249ecc3c7ULL, 0x5d588b60a38ca72aULL,
++      0xa6ecbf78e8e5f42dULL, 0x86eeb44b3c8a3eecULL, 0xec219c48fbd21604ULL, 0x1aaf1af517c36731ULL,
++      0xc306a2836769bde7ULL, 0x208280622b1e2adbULL, 0x8027f51ffbff94a6ULL, 0x76cfa1ce1124f26bULL,
++      0x18eb00562422abb6ULL, 0xf377c4d58f8c29c3ULL, 0x4dbbc207f531561aULL, 0x0253b7f082128a27ULL,
++      0x3d1f091cb62c17e0ULL, 0x4860e1abd64628a9ULL, 0x52d17436309d4253ULL, 0x356f97e13efae576ULL,
++      0xd351e11aa150535bULL, 0x3e6b45bb1dd878ccULL, 0x0c776128bed92c98ULL, 0x1d34ae93032885b8ULL,
++      0x4ba0488ca85ba4c3ULL, 0x985348c33c9ce6ceULL, 0x66124c6f97bda770ULL, 0x0f81a0290654124aULL,
++      0x9ed09ca6569b86fdULL, 0x811009fd18af9a2dULL, 0xff08d03f93d8c20aULL, 0x52a148199faef26bULL,
++      0x3e03f9dc2d8d1b73ULL, 0x4205801873961a70ULL, 0xc0d987f041a35970ULL, 0x07aa1f15a1c0d549ULL,
++      0xdfd46ce08cd27224ULL, 0x6d0a024f934e4239ULL, 0x808a7a6399897b59ULL, 0x0a4556e9e13d95a2ULL,
++      0xd21a991fe9c13045ULL, 0x9b0e8548fe7751b8ULL, 0x5da643cb4bf30035ULL, 0x77db28d63940f721ULL,
++      0xfc5eeb614adc9011ULL, 0x5229419ae8c411ebULL, 0x9ec3e7787d1dcf74ULL, 0x340d053e216e4cb5ULL,
++      0xcac7af39b48df2b4ULL, 0xc0faec2871a10a94ULL, 0x140a69245ca575edULL, 0x0cf1c37134273a4cULL,
++      0xc8ee306ac224b8a5ULL, 0x57eaee7ccb4930b0ULL, 0xa1e806bdaacbe74fULL, 0x7d9a62742eeb657dULL,
++      0x9eb6b6ef546c4830ULL, 0x885cca1fddb36e2eULL, 0xe6b9f383ef0d7105ULL, 0x58654fef9d2e0412ULL,
++      0xa905c4ffbe0e8e26ULL, 0x942de5df9b31816eULL, 0x497d723f802e88e1ULL, 0x30684dea602f408dULL,
++      0x21e5a278a3e6cb34ULL, 0xaefb6e6f5b151dc4ULL, 0xb30b8e049d77ca15ULL, 0x28c3c9cf53b98981ULL,
++      0x287fb721556cdd2aULL, 0x0d317ca897022274ULL, 0x7468c7423a543258ULL, 0x4a7f11464eb5642fULL,
++      0xa237a4774d193aa6ULL, 0xd865986ea92129a1ULL, 0x24c515ecf87c1a88ULL, 0x604003575f39f5ebULL,
++      0x47b9f189570a9b27ULL, 0x2b98cede465e4b78ULL, 0x026df551dbb85c20ULL, 0x74fcd91047e21901ULL,
++      0x13e2a90a23c1bfa3ULL, 0x0cb0074e478519f6ULL, 0x5ff1cbbe3af6cf44ULL, 0x67fe5438be812dbeULL,
++      0xd13cf64fa40f05b0ULL, 0x054dfb2f32283787ULL, 0x4173915b7f0d2aeaULL, 0x482f144f1f610d4eULL,
++      0xf6210201b47f8234ULL, 0x5d0ae1929e70b990ULL, 0xdcd7f455b049567cULL, 0x7e93d0f1f0916f01ULL,
++      0xdd79cbf18a7db4faULL, 0xbe8391bf6f74c62fULL, 0x027145d14b8291bdULL, 0x585a73ea2cbf1705ULL,
++      0x485ca03e928a0db2ULL, 0x10fc01a5742857e7ULL, 0x2f482edbd6d551a7ULL, 0x0f0433b5048fdb8aULL,
++      0x60da2e8dd7dc6247ULL, 0x88b4c9d38cd4819aULL, 0x13033ac001f66697ULL, 0x273b24fe3b367d75ULL,
++      0xc6e8f66a31b3b9d4ULL, 0x281514a494df49d5ULL, 0xd1726fdfc8b23da7ULL, 0x4b3ae7d103dee548ULL,
++      0xc6256e19ce4b9d7eULL, 0xff5c5cf186e3c61cULL, 0xacc63ca34b8ec145ULL, 0x74621888fee66574ULL,
++      0x956f409645290a1eULL, 0xef0bf8e3263a962eULL, 0xed6a50eb5ec2647bULL, 0x0694283a9dca7502ULL,
++      0x769b963643a2dcd1ULL, 0x42b7c8ea09fc5353ULL, 0x4f002aee13397eabULL, 0x63005e2c19b7d63aULL,
++      0xca6736da63023beaULL, 0x966c7f6db12a99b7ULL, 0xace09390c537c5e1ULL, 0x0b696063a1aa89eeULL,
++      0xebb03e97288c56e5ULL, 0x432a9f9f938c8be8ULL, 0xa6a5a93d5b717f71ULL, 0x1a5fb4c3e18f9d97ULL,
++      0x1c94e7ad1c60cdceULL, 0xee202a43fc02c4a0ULL, 0x8dafe4d867c46a20ULL, 0x0a10263c8ac27b58ULL,
++      0xd0dea9dfe4432a4aULL, 0x856af87bbe9277c5ULL, 0xce8472acc212c71aULL, 0x6f151b6d9bbb1e91ULL,
++      0x26776c527ceed56aULL, 0x7d211cb7fbf8faecULL, 0x37ae66a6fd4609ccULL, 0x1f81b702d2770c42ULL,
++      0x2fb0b057eac58392ULL, 0xe1dd89fe29744e9dULL, 0xc964f8eb17beb4f8ULL, 0x29571073c9a2d41eULL,
++      0xa948a18981c0e254ULL, 0x2df6369b65b22830ULL, 0xa33eb2d75fcfd3c6ULL, 0x078cd6ec4199a01fULL,
++      0x4a584a41ad900d2fULL, 0x32142b78e2c74c52ULL, 0x68c4e8338431c978ULL, 0x7f69ea9008689fc2ULL,
++      0x52f2c81e46a38265ULL, 0xfd78072d04a832fdULL, 0x8cd7d5fa25359e94ULL, 0x4de71b7454cc29d2ULL,
++      0x42eb60ad1eda6ac9ULL, 0x0aad37dfdbc09c3aULL, 0x81004b71e33cc191ULL, 0x44e6be345122803cULL,
++      0x03fe8388ba1920dbULL, 0xf5d57c32150db008ULL, 0x49c8c4281af60c29ULL, 0x21edb518de701aeeULL,
++      0x7fb63e418f06dc99ULL, 0xa4460d99c166d7b8ULL, 0x24dd5248ce520a83ULL, 0x5ec3ad712b928358ULL,
++      0x15022a5fbd17930fULL, 0xa4f64a77d82570e3ULL, 0x12bc8d6915783712ULL, 0x498194c0fc620abbULL,
++      0x38a2d9d255686c82ULL, 0x785c6bd9193e21f0ULL, 0xe4d5c81ab24a5484ULL, 0x56307860b2e20989ULL,
++      0x429d55f78b4d74c4ULL, 0x22f1834643350131ULL, 0x1e60c24598c71fffULL, 0x59f2f014979983efULL,
++      0x46a47d56eb494a44ULL, 0x3e22a854d636a18eULL, 0xb346e15274491c3bULL, 0x2ceafd4e5390cde7ULL,
++      0xba8a8538be0d6675ULL, 0x4b9074bb50818e23ULL, 0xcbdab89085d304c3ULL, 0x61a24fe0e56192c4ULL,
++      0xcb7615e6db525bcbULL, 0xdd7d8c35a567e4caULL, 0xe6b4153acafcdd69ULL, 0x2d668e097f3c9766ULL,
++      0xa57e7e265ce55ef0ULL, 0x5d9f4e527cd4b967ULL, 0xfbc83606492fd1e5ULL, 0x090d52beb7c3f7aeULL,
++      0x09b9515a1e7b4d7cULL, 0x1f266a2599da44c0ULL, 0xa1c49548e2c55504ULL, 0x7ef04287126f15ccULL,
++      0xfed1659dbd30ef15ULL, 0x8b4ab9eec4e0277bULL, 0x884d6236a5df3291ULL, 0x1fd96ea6bf5cf788ULL,
++      0x42a161981f190d9aULL, 0x61d849507e6052c1ULL, 0x9fe113bf285a2cd5ULL, 0x7c22d676dbad85d8ULL,
++      0x82e770ed2bfbd27dULL, 0x4c05b2ece996f5a5ULL, 0xcd40a9c2b0900150ULL, 0x5895319213d9bf64ULL,
++      0xe7cc5d703fea2e08ULL, 0xb50c491258e2188cULL, 0xcce30baa48205bf0ULL, 0x537c659ccfa32d62ULL,
++      0x37b6623a98cfc088ULL, 0xfe9bed1fa4d6aca4ULL, 0x04d29b8e56a8d1b0ULL, 0x725f71c40b519575ULL,
++      0x28c7f89cd0339ce6ULL, 0x8367b14469ddc18bULL, 0x883ada83a6a1652cULL, 0x585f1974034d6c17ULL,
++      0x89cfb266f1b19188ULL, 0xe63b4863e7c35217ULL, 0xd88c9da6b4c0526aULL, 0x3e035c9df0954635ULL,
++      0xdd9d5412fb45de9dULL, 0xdd684532e4cff40dULL, 0x4b5c999b151d671cULL, 0x2d8c2cc811e7f690ULL,
++      0x7f54be1d90055d40ULL, 0xa464c5df464aaf40ULL, 0x33979624f0e917beULL, 0x2c018dc527356b30ULL,
++      0xa5415024e330b3d4ULL, 0x73ff3d96691652d3ULL, 0x94ec42c4ef9b59f1ULL, 0x0747201618d08e5aULL,
++      0x4d6ca48aca411c53ULL, 0x66415f2fcfa66119ULL, 0x9c4dd40051e227ffULL, 0x59810bc09a02f7ebULL,
++      0x2a7eb171b3dc101dULL, 0x441c5ab99ffef68eULL, 0x32025c9b93b359eaULL, 0x5e8ce0a71e9d112fULL,
++      0xbfcccb92429503fdULL, 0xd271ba752f095d55ULL, 0x345ead5e972d091eULL, 0x18c8df11a83103baULL,
++      0x90cd949a9aed0f4cULL, 0xc5d1f4cb6660e37eULL, 0xb8cac52d56c52e0bULL, 0x6e42e400c5808e0dULL,
++      0xa3b46966eeaefd23ULL, 0x0c4f1f0be39ecdcaULL, 0x189dc8c9d683a51dULL, 0x51f27f054c09351bULL,
++      0x4c487ccd2a320682ULL, 0x587ea95bb3df1c96ULL, 0xc8ccf79e555cb8e8ULL, 0x547dc829a206d73dULL,
++      0xb822a6cd80c39b06ULL, 0xe96d54732000d4c6ULL, 0x28535b6f91463b4dULL, 0x228f4660e2486e1dULL,
++      0x98799538de8d3abfULL, 0x8cd8330045ebca6eULL, 0x79952a008221e738ULL, 0x4322e1a7535cd2bbULL,
++      0xb114c11819d1801cULL, 0x2016e4d84f3f5ec7ULL, 0xdd0e2df409260f4cULL, 0x5ec362c0ae5f7266ULL,
++      0xc0462b18b8b2b4eeULL, 0x7cc8d950274d1afbULL, 0xf25f7105436b02d2ULL, 0x43bbf8dcbff9ccd3ULL,
++      0xb6ad1767a039e9dfULL, 0xb0714da8f69d3583ULL, 0x5e55fa18b42931f5ULL, 0x4ed5558f33c60961ULL,
++      0x1fe37901c647a5ddULL, 0x593ddf1f8081d357ULL, 0x0249a4fd813fd7a6ULL, 0x69acca274e9caf61ULL,
++      0x047ba3ea330721c9ULL, 0x83423fc20e7e1ea0ULL, 0x1df4c0af01314a60ULL, 0x09a62dab89289527ULL,
++      0xa5b325a49cc6cb00ULL, 0xe94b5dc654b56cb6ULL, 0x3be28779adc994a0ULL, 0x4296e8f8ba3a4aadULL,
++      0x328689761e451eabULL, 0x2e4d598bff59594aULL, 0x49b96853d7a7084aULL, 0x4980a319601420a8ULL,
++      0x9565b9e12f552c42ULL, 0x8a5318db7100fe96ULL, 0x05c90b4d43add0d7ULL, 0x538b4cd66a5d4edaULL,
++      0xf4e94fc3e89f039fULL, 0x592c9af26f618045ULL, 0x08a36eb5fd4b9550ULL, 0x25fffaf6c2ed1419ULL,
++      0x34434459cc79d354ULL, 0xeeecbfb4b1d5476bULL, 0xddeb34a061615d99ULL, 0x5129cecceb64b773ULL,
++      0xee43215894993520ULL, 0x772f9c7cf14c0b3bULL, 0xd2e2fce306bedad5ULL, 0x715f42b546f06a97ULL,
++      0x434ecdceda5b5f1aULL, 0x0da17115a49741a9ULL, 0x680bd77c73edad2eULL, 0x487c02354edd9041ULL,
++      0xb8efeff3a70ed9c4ULL, 0x56a32aa3e857e302ULL, 0xdf3a68bd48a2a5a0ULL, 0x07f650b73176c444ULL,
++      0xe38b9b1626e0ccb1ULL, 0x79e053c18b09fb36ULL, 0x56d90319c9f94964ULL, 0x1ca941e7ac9ff5c4ULL,
++      0x49c4df29162fa0bbULL, 0x8488cf3282b33305ULL, 0x95dfda14cabb437dULL, 0x3391f78264d5ad86ULL,
++      0x729ae06ae2b5095dULL, 0xd58a58d73259a946ULL, 0xe9834262d13921edULL, 0x27fedafaa54bb592ULL,
++      0xa99dc5b829ad48bbULL, 0x5f025742499ee260ULL, 0x802c8ecd5d7513fdULL, 0x78ceb3ef3f6dd938ULL,
++      0xc342f44f8a135d94ULL, 0x7b9edb44828cdda3ULL, 0x9436d11a0537cfe7ULL, 0x5064b164ec1ab4c8ULL,
++      0x7020eccfd37eb2fcULL, 0x1f31ea3ed90d25fcULL, 0x1b930d7bdfa1bb34ULL, 0x5344467a48113044ULL,
++      0x70073170f25e6dfbULL, 0xe385dc1a50114cc8ULL, 0x2348698ac8fc4f00ULL, 0x2a77a55284dd40d8ULL,
++      0xfe06afe0c98c6ce4ULL, 0xc235df96dddfd6e4ULL, 0x1428d01e33bf1ed3ULL, 0x785768ec9300bdafULL,
++      0x9702e57a91deb63bULL, 0x61bdb8bfe5ce8b80ULL, 0x645b426f3d1d58acULL, 0x4804a82227a557bcULL,
++      0x8e57048ab44d2601ULL, 0x68d6501a4b3a6935ULL, 0xc39c9ec3f9e1c293ULL, 0x4172f257d4de63e2ULL,
++      0xd368b450330c6401ULL, 0x040d3017418f2391ULL, 0x2c34bb6090b7d90dULL, 0x16f649228fdfd51fULL,
++      0xbea6818e2b928ef5ULL, 0xe28ccf91cdc11e72ULL, 0x594aaa68e77a36cdULL, 0x313034806c7ffd0fULL,
++      0x8a9d27ac2249bd65ULL, 0x19a3b464018e9512ULL, 0xc26ccff352b37ec7ULL, 0x056f68341d797b21ULL,
++      0x5e79d6757efd2327ULL, 0xfabdbcb6553afe15ULL, 0xd3e7222c6eaf5a60ULL, 0x7046c76d4dae743bULL,
++      0x660be872b18d4a55ULL, 0x19992518574e1496ULL, 0xc103053a302bdcbbULL, 0x3ed8e9800b218e8eULL,
++      0x7b0b9239fa75e03eULL, 0xefe9fb684633c083ULL, 0x98a35fbe391a7793ULL, 0x6065510fe2d0fe34ULL,
++      0x55cb668548abad0cULL, 0xb4584548da87e527ULL, 0x2c43ecea0107c1ddULL, 0x526028809372de35ULL,
++      0x3415c56af9213b1fULL, 0x5bee1a4d017e98dbULL, 0x13f6b105b5cf709bULL, 0x5ff20e3482b29ab6ULL,
++      0x0aa29c75cc2e6c90ULL, 0xfc7d73ca3a70e206ULL, 0x899fc38fc4b5c515ULL, 0x250386b124ffc207ULL,
++      0x54ea28d5ae3d2b56ULL, 0x9913149dd6de60ceULL, 0x16694fc58f06d6c1ULL, 0x46b23975eb018fc7ULL,
++      0x470a6a0fb4b7b4e2ULL, 0x5d92475a8f7253deULL, 0xabeee5b52fbd3adbULL, 0x7fa20801a0806968ULL,
++      0x76f3faf19f7714d2ULL, 0xb3e840c12f4660c3ULL, 0x0fb4cd8df212744eULL, 0x4b065a251d3a2dd2ULL,
++      0x5cebde383d77cd4aULL, 0x6adf39df882c9cb1ULL, 0xa2dd242eb09af759ULL, 0x3147c0e50e5f6422ULL,
++      0x164ca5101d1350dbULL, 0xf8d13479c33fc962ULL, 0xe640ce4d13e5da08ULL, 0x4bdee0c45061f8baULL,
++      0xd7c46dc1a4edb1c9ULL, 0x5514d7b6437fd98aULL, 0x58942f6bb2a1c00bULL, 0x2dffb2ab1d70710eULL,
++      0xccdfcf2fc18b6d68ULL, 0xa8ebcba8b7806167ULL, 0x980697f95e2937e3ULL, 0x02fbba1cd0126e8cULL
++};
+ 
+-static void curve25519_bmi2_base(u8 session_key[CURVE25519_KEY_SIZE],
+-                               const u8 private_key[CURVE25519_KEY_SIZE])
++static void curve25519_ever64_base(u8 *out, const u8 *priv)
+ {
+-      struct {
+-              u64 buffer[4 * NUM_WORDS_ELTFP25519];
+-              u64 coordinates[4 * NUM_WORDS_ELTFP25519];
+-              u64 workspace[4 * NUM_WORDS_ELTFP25519];
+-              u8 private[CURVE25519_KEY_SIZE];
+-      } __aligned(32) m;
+-
+-      const int ite[4] = { 64, 64, 64, 63 };
+-      const int q = 3;
+       u64 swap = 1;
+-
+-      int i = 0, j = 0, k = 0;
+-      u64 *const key = (u64 *)m.private;
+-      u64 *const Ur1 = m.coordinates + 0;
+-      u64 *const Zr1 = m.coordinates + 4;
+-      u64 *const Ur2 = m.coordinates + 8;
+-      u64 *const Zr2 = m.coordinates + 12;
+-
+-      u64 *const UZr1 = m.coordinates + 0;
+-      u64 *const ZUr2 = m.coordinates + 8;
+-
+-      u64 *const A = m.workspace + 0;
+-      u64 *const B = m.workspace + 4;
+-      u64 *const C = m.workspace + 8;
+-      u64 *const D = m.workspace + 12;
+-
+-      u64 *const AB = m.workspace + 0;
+-      u64 *const CD = m.workspace + 8;
+-
+-      const u64 *const P = table_ladder_8k;
+-
+-      memcpy(m.private, private_key, sizeof(m.private));
+-
+-      curve25519_clamp_secret(m.private);
+-
+-      setzero_eltfp25519_1w(Ur1);
+-      setzero_eltfp25519_1w(Zr1);
+-      setzero_eltfp25519_1w(Zr2);
+-      Ur1[0] = 1;
+-      Zr1[0] = 1;
+-      Zr2[0] = 1;
+-
+-      /* G-S */
+-      Ur2[3] = 0x1eaecdeee27cab34UL;
+-      Ur2[2] = 0xadc7a0b9235d48e2UL;
+-      Ur2[1] = 0xbbf095ae14b2edf8UL;
+-      Ur2[0] = 0x7e94e1fec82faabdUL;
+-
+-      /* main-loop */
+-      j = q;
+-      for (i = 0; i < NUM_WORDS_ELTFP25519; ++i) {
+-              while (j < ite[i]) {
+-                      u64 bit = (key[i] >> j) & 0x1;
+-                      k = (64 * i + j - q);
++      int i, j, k;
++      u64 tmp[16 + 32 + 4];
++      u64 *x1 = &tmp[0];
++      u64 *z1 = &tmp[4];
++      u64 *x2 = &tmp[8];
++      u64 *z2 = &tmp[12];
++      u64 *xz1 = &tmp[0];
++      u64 *xz2 = &tmp[8];
++      u64 *a = &tmp[0 + 16];
++      u64 *b = &tmp[4 + 16];
++      u64 *c = &tmp[8 + 16];
++      u64 *ab = &tmp[0 + 16];
++      u64 *abcd = &tmp[0 + 16];
++      u64 *ef = &tmp[16 + 16];
++      u64 *efgh = &tmp[16 + 16];
++      u64 *key = &tmp[0 + 16 + 32];
++
++      memcpy(key, priv, 32);
++      ((u8 *)key)[0] &= 248;
++      ((u8 *)key)[31] = (((u8 *)key)[31] & 127) | 64;
++
++      x1[0] = 1, x1[1] = x1[2] = x1[3] = 0;
++      z1[0] = 1, z1[1] = z1[2] = z1[3] = 0;
++      z2[0] = 1, z2[1] = z2[2] = z2[3] = 0;
++      memcpy(x2, p_minus_s, sizeof(p_minus_s));
++
++      j = 3;
++      for (i = 0; i < 4; ++i) {
++              while (j < (const int[]){ 64, 64, 64, 63 }[i]) {
++                      u64 bit = (key[i] >> j) & 1;
++                      k = (64 * i + j - 3);
+                       swap = swap ^ bit;
+-                      cswap(swap, Ur1, Ur2);
+-                      cswap(swap, Zr1, Zr2);
++                      cswap2(swap, xz1, xz2);
+                       swap = bit;
+-                      /* Addition */
+-                      sub_eltfp25519_1w(B, Ur1, Zr1);         /* B = Ur1-Zr1 */
+-                      add_eltfp25519_1w_bmi2(A, Ur1, Zr1);    /* A = Ur1+Zr1 */
+-                      mul_eltfp25519_1w_bmi2(C, &P[4 * k], B);/* C = M0-B */
+-                      sub_eltfp25519_1w(B, A, C);             /* B = (Ur1+Zr1) - M*(Ur1-Zr1) */
+-                      add_eltfp25519_1w_bmi2(A, A, C);        /* A = (Ur1+Zr1) + M*(Ur1-Zr1) */
+-                      sqr_eltfp25519_2w_bmi2(AB);             /* A = A^2      |  B = B^2 */
+-                      mul_eltfp25519_2w_bmi2(UZr1, ZUr2, AB); /* Ur1 = Zr2*A  |  Zr1 = Ur2*B */
++                      fsub(b, x1, z1);
++                      fadd(a, x1, z1);
++                      fmul(c, &table_ladder[4 * k], b, ef);
++                      fsub(b, a, c);
++                      fadd(a, a, c);
++                      fsqr2(ab, ab, efgh);
++                      fmul2(xz1, xz2, ab, efgh);
+                       ++j;
+               }
+               j = 0;
+       }
+ 
+-      /* Doubling */
+-      for (i = 0; i < q; ++i) {
+-              add_eltfp25519_1w_bmi2(A, Ur1, Zr1);    /*  A = Ur1+Zr1 */
+-              sub_eltfp25519_1w(B, Ur1, Zr1);         /*  B = Ur1-Zr1 */
+-              sqr_eltfp25519_2w_bmi2(AB);             /*  A = A**2     B = B**2 */
+-              copy_eltfp25519_1w(C, B);               /*  C = B */
+-              sub_eltfp25519_1w(B, A, B);             /*  B = A-B */
+-              mul_a24_eltfp25519_1w(D, B);            /*  D = my_a24*B */
+-              add_eltfp25519_1w_bmi2(D, D, C);        /*  D = D+C */
+-              mul_eltfp25519_2w_bmi2(UZr1, AB, CD);   /*  Ur1 = A*B   Zr1 = Zr1*A */
+-      }
+-
+-      /* Convert to affine coordinates */
+-      inv_eltfp25519_1w_bmi2(A, Zr1);
+-      mul_eltfp25519_1w_bmi2((u64 *)session_key, Ur1, A);
+-      fred_eltfp25519_1w((u64 *)session_key);
++      point_double(xz1, abcd, efgh);
++      point_double(xz1, abcd, efgh);
++      point_double(xz1, abcd, efgh);
++      encode_point(out, xz1);
+ 
+-      memzero_explicit(&m, sizeof(m));
++      memzero_explicit(tmp, sizeof(tmp));
+ }
+ 
++static __ro_after_init DEFINE_STATIC_KEY_FALSE(curve25519_use_bmi2_adx);
++
+ void curve25519_arch(u8 mypublic[CURVE25519_KEY_SIZE],
+                    const u8 secret[CURVE25519_KEY_SIZE],
+                    const u8 basepoint[CURVE25519_KEY_SIZE])
+ {
+-      if (static_branch_likely(&curve25519_use_adx))
+-              curve25519_adx(mypublic, secret, basepoint);
+-      else if (static_branch_likely(&curve25519_use_bmi2))
+-              curve25519_bmi2(mypublic, secret, basepoint);
++      if (static_branch_likely(&curve25519_use_bmi2_adx))
++              curve25519_ever64(mypublic, secret, basepoint);
+       else
+               curve25519_generic(mypublic, secret, basepoint);
+ }
+@@ -2355,10 +1395,8 @@ EXPORT_SYMBOL(curve25519_arch);
+ void curve25519_base_arch(u8 pub[CURVE25519_KEY_SIZE],
+                         const u8 secret[CURVE25519_KEY_SIZE])
+ {
+-      if (static_branch_likely(&curve25519_use_adx))
+-              curve25519_adx_base(pub, secret);
+-      else if (static_branch_likely(&curve25519_use_bmi2))
+-              curve25519_bmi2_base(pub, secret);
++      if (static_branch_likely(&curve25519_use_bmi2_adx))
++              curve25519_ever64_base(pub, secret);
+       else
+               curve25519_generic(pub, secret, curve25519_base_point);
+ }
+@@ -2449,12 +1487,11 @@ static struct kpp_alg curve25519_alg = {
+       .max_size               = curve25519_max_size,
+ };
+ 
++
+ static int __init curve25519_mod_init(void)
+ {
+-      if (boot_cpu_has(X86_FEATURE_BMI2))
+-              static_branch_enable(&curve25519_use_bmi2);
+-      else if (boot_cpu_has(X86_FEATURE_ADX))
+-              static_branch_enable(&curve25519_use_adx);
++      if (boot_cpu_has(X86_FEATURE_BMI2) && boot_cpu_has(X86_FEATURE_ADX))
++              static_branch_enable(&curve25519_use_bmi2_adx);
+       else
+               return 0;
+       return IS_REACHABLE(CONFIG_CRYPTO_KPP) ?
+@@ -2474,3 +1511,4 @@ module_exit(curve25519_mod_exit);
+ MODULE_ALIAS_CRYPTO("curve25519");
+ MODULE_ALIAS_CRYPTO("curve25519-x86");
+ MODULE_LICENSE("GPL v2");
++MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>");
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0055-crypto-x86-curve25519-leave-r12-as-spare-register.patch b/target/linux/generic/backport-5.4/080-wireguard-0055-crypto-x86-curve25519-leave-r12-as-spare-register.patch

new file mode 100644 (file)

index 0000000..74a6ef6
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0055-crypto-x86-curve25519-leave-r12-as-spare-register.patch
@@ -0,0 +1,376 @@
+From 481c5ed9ac2acec32d93847636707bda02208ec8 Mon Sep 17 00:00:00 2001
+From: "Jason A. Donenfeld" <Jason@zx2c4.com>
+Date: Sun, 1 Mar 2020 16:06:56 +0800
+Subject: [PATCH 055/124] crypto: x86/curve25519 - leave r12 as spare register
+
+commit dc7fc3a53ae158263196b1892b672aedf67796c5 upstream.
+
+This updates to the newer register selection proved by HACL*, which
+leads to a more compact instruction encoding, and saves around 100
+cycles.
+
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ arch/x86/crypto/curve25519-x86_64.c | 110 ++++++++++++++--------------
+ 1 file changed, 55 insertions(+), 55 deletions(-)
+
+--- a/arch/x86/crypto/curve25519-x86_64.c
++++ b/arch/x86/crypto/curve25519-x86_64.c
+@@ -167,28 +167,28 @@ static inline void fmul(u64 *out, const
+               "  movq 0(%1), %%rdx;"
+               "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"     "  movq %%r8, 0(%0);"
+               "  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  movq %%r10, 8(%0);"
+-              "  mulxq 16(%3), %%r12, %%r13;"    "  adox %%r11, %%r12;"
++              "  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"
+               "  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  mov $0, %%rax;"
+                                                  "  adox %%rdx, %%rax;"
+               /* Compute src1[1] * src2 */
+               "  movq 8(%1), %%rdx;"
+               "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"     "  adcxq 8(%0), %%r8;"    "  movq %%r8, 8(%0);"
+-              "  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%r12, %%r10;"    "  movq %%r10, 16(%0);"
+-              "  mulxq 16(%3), %%r12, %%r13;"    "  adox %%r11, %%r12;"    "  adcx %%r14, %%r12;"    "  mov $0, %%r8;"
++              "  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 16(%0);"
++              "  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  mov $0, %%r8;"
+               "  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
+                                                  "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"
+               /* Compute src1[2] * src2 */
+               "  movq 16(%1), %%rdx;"
+               "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"    "  adcxq 16(%0), %%r8;"    "  movq %%r8, 16(%0);"
+-              "  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%r12, %%r10;"    "  movq %%r10, 24(%0);"
+-              "  mulxq 16(%3), %%r12, %%r13;"    "  adox %%r11, %%r12;"    "  adcx %%r14, %%r12;"    "  mov $0, %%r8;"
++              "  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 24(%0);"
++              "  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  mov $0, %%r8;"
+               "  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
+                                                  "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"
+               /* Compute src1[3] * src2 */
+               "  movq 24(%1), %%rdx;"
+               "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"    "  adcxq 24(%0), %%r8;"    "  movq %%r8, 24(%0);"
+-              "  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%r12, %%r10;"    "  movq %%r10, 32(%0);"
+-              "  mulxq 16(%3), %%r12, %%r13;"    "  adox %%r11, %%r12;"    "  adcx %%r14, %%r12;"    "  movq %%r12, 40(%0);"    "  mov $0, %%r8;"
++              "  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 32(%0);"
++              "  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  movq %%rbx, 40(%0);"    "  mov $0, %%r8;"
+               "  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  movq %%r14, 48(%0);"    "  mov $0, %%rax;"
+                                                  "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"     "  movq %%rax, 56(%0);"
+               /* Line up pointers */
+@@ -202,11 +202,11 @@ static inline void fmul(u64 *out, const
+               "  mulxq 32(%1), %%r8, %%r13;"
+               "  xor %3, %3;"
+               "  adoxq 0(%1), %%r8;"
+-              "  mulxq 40(%1), %%r9, %%r12;"
++              "  mulxq 40(%1), %%r9, %%rbx;"
+               "  adcx %%r13, %%r9;"
+               "  adoxq 8(%1), %%r9;"
+               "  mulxq 48(%1), %%r10, %%r13;"
+-              "  adcx %%r12, %%r10;"
++              "  adcx %%rbx, %%r10;"
+               "  adoxq 16(%1), %%r10;"
+               "  mulxq 56(%1), %%r11, %%rax;"
+               "  adcx %%r13, %%r11;"
+@@ -231,7 +231,7 @@ static inline void fmul(u64 *out, const
+               "  movq %%r8, 0(%0);"
+       : "+&r" (tmp), "+&r" (f1), "+&r" (out), "+&r" (f2)
+       :
+-      : "%rax", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "memory", "cc"
++      : "%rax", "%rdx", "%r8", "%r9", "%r10", "%r11", "%rbx", "%r13", "%r14", "memory", "cc"
+       );
+ }
+ 
+@@ -248,28 +248,28 @@ static inline void fmul2(u64 *out, const
+               "  movq 0(%1), %%rdx;"
+               "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"     "  movq %%r8, 0(%0);"
+               "  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  movq %%r10, 8(%0);"
+-              "  mulxq 16(%3), %%r12, %%r13;"    "  adox %%r11, %%r12;"
++              "  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"
+               "  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  mov $0, %%rax;"
+                                                  "  adox %%rdx, %%rax;"
+               /* Compute src1[1] * src2 */
+               "  movq 8(%1), %%rdx;"
+               "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"     "  adcxq 8(%0), %%r8;"    "  movq %%r8, 8(%0);"
+-              "  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%r12, %%r10;"    "  movq %%r10, 16(%0);"
+-              "  mulxq 16(%3), %%r12, %%r13;"    "  adox %%r11, %%r12;"    "  adcx %%r14, %%r12;"    "  mov $0, %%r8;"
++              "  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 16(%0);"
++              "  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  mov $0, %%r8;"
+               "  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
+                                                  "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"
+               /* Compute src1[2] * src2 */
+               "  movq 16(%1), %%rdx;"
+               "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"    "  adcxq 16(%0), %%r8;"    "  movq %%r8, 16(%0);"
+-              "  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%r12, %%r10;"    "  movq %%r10, 24(%0);"
+-              "  mulxq 16(%3), %%r12, %%r13;"    "  adox %%r11, %%r12;"    "  adcx %%r14, %%r12;"    "  mov $0, %%r8;"
++              "  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 24(%0);"
++              "  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  mov $0, %%r8;"
+               "  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
+                                                  "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"
+               /* Compute src1[3] * src2 */
+               "  movq 24(%1), %%rdx;"
+               "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"    "  adcxq 24(%0), %%r8;"    "  movq %%r8, 24(%0);"
+-              "  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%r12, %%r10;"    "  movq %%r10, 32(%0);"
+-              "  mulxq 16(%3), %%r12, %%r13;"    "  adox %%r11, %%r12;"    "  adcx %%r14, %%r12;"    "  movq %%r12, 40(%0);"    "  mov $0, %%r8;"
++              "  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 32(%0);"
++              "  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  movq %%rbx, 40(%0);"    "  mov $0, %%r8;"
+               "  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  movq %%r14, 48(%0);"    "  mov $0, %%rax;"
+                                                  "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"     "  movq %%rax, 56(%0);"
+ 
+@@ -279,28 +279,28 @@ static inline void fmul2(u64 *out, const
+               "  movq 32(%1), %%rdx;"
+               "  mulxq 32(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"     "  movq %%r8, 64(%0);"
+               "  mulxq 40(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  movq %%r10, 72(%0);"
+-              "  mulxq 48(%3), %%r12, %%r13;"    "  adox %%r11, %%r12;"
++              "  mulxq 48(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"
+               "  mulxq 56(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  mov $0, %%rax;"
+                                                  "  adox %%rdx, %%rax;"
+               /* Compute src1[1] * src2 */
+               "  movq 40(%1), %%rdx;"
+               "  mulxq 32(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"     "  adcxq 72(%0), %%r8;"    "  movq %%r8, 72(%0);"
+-              "  mulxq 40(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%r12, %%r10;"    "  movq %%r10, 80(%0);"
+-              "  mulxq 48(%3), %%r12, %%r13;"    "  adox %%r11, %%r12;"    "  adcx %%r14, %%r12;"    "  mov $0, %%r8;"
++              "  mulxq 40(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 80(%0);"
++              "  mulxq 48(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  mov $0, %%r8;"
+               "  mulxq 56(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
+                                                  "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"
+               /* Compute src1[2] * src2 */
+               "  movq 48(%1), %%rdx;"
+               "  mulxq 32(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"    "  adcxq 80(%0), %%r8;"    "  movq %%r8, 80(%0);"
+-              "  mulxq 40(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%r12, %%r10;"    "  movq %%r10, 88(%0);"
+-              "  mulxq 48(%3), %%r12, %%r13;"    "  adox %%r11, %%r12;"    "  adcx %%r14, %%r12;"    "  mov $0, %%r8;"
++              "  mulxq 40(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 88(%0);"
++              "  mulxq 48(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  mov $0, %%r8;"
+               "  mulxq 56(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
+                                                  "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"
+               /* Compute src1[3] * src2 */
+               "  movq 56(%1), %%rdx;"
+               "  mulxq 32(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"    "  adcxq 88(%0), %%r8;"    "  movq %%r8, 88(%0);"
+-              "  mulxq 40(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%r12, %%r10;"    "  movq %%r10, 96(%0);"
+-              "  mulxq 48(%3), %%r12, %%r13;"    "  adox %%r11, %%r12;"    "  adcx %%r14, %%r12;"    "  movq %%r12, 104(%0);"    "  mov $0, %%r8;"
++              "  mulxq 40(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 96(%0);"
++              "  mulxq 48(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  movq %%rbx, 104(%0);"    "  mov $0, %%r8;"
+               "  mulxq 56(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  movq %%r14, 112(%0);"    "  mov $0, %%rax;"
+                                                  "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"     "  movq %%rax, 120(%0);"
+               /* Line up pointers */
+@@ -314,11 +314,11 @@ static inline void fmul2(u64 *out, const
+               "  mulxq 32(%1), %%r8, %%r13;"
+               "  xor %3, %3;"
+               "  adoxq 0(%1), %%r8;"
+-              "  mulxq 40(%1), %%r9, %%r12;"
++              "  mulxq 40(%1), %%r9, %%rbx;"
+               "  adcx %%r13, %%r9;"
+               "  adoxq 8(%1), %%r9;"
+               "  mulxq 48(%1), %%r10, %%r13;"
+-              "  adcx %%r12, %%r10;"
++              "  adcx %%rbx, %%r10;"
+               "  adoxq 16(%1), %%r10;"
+               "  mulxq 56(%1), %%r11, %%rax;"
+               "  adcx %%r13, %%r11;"
+@@ -347,11 +347,11 @@ static inline void fmul2(u64 *out, const
+               "  mulxq 96(%1), %%r8, %%r13;"
+               "  xor %3, %3;"
+               "  adoxq 64(%1), %%r8;"
+-              "  mulxq 104(%1), %%r9, %%r12;"
++              "  mulxq 104(%1), %%r9, %%rbx;"
+               "  adcx %%r13, %%r9;"
+               "  adoxq 72(%1), %%r9;"
+               "  mulxq 112(%1), %%r10, %%r13;"
+-              "  adcx %%r12, %%r10;"
++              "  adcx %%rbx, %%r10;"
+               "  adoxq 80(%1), %%r10;"
+               "  mulxq 120(%1), %%r11, %%rax;"
+               "  adcx %%r13, %%r11;"
+@@ -376,7 +376,7 @@ static inline void fmul2(u64 *out, const
+               "  movq %%r8, 32(%0);"
+       : "+&r" (tmp), "+&r" (f1), "+&r" (out), "+&r" (f2)
+       :
+-      : "%rax", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "memory", "cc"
++      : "%rax", "%rdx", "%r8", "%r9", "%r10", "%r11", "%rbx", "%r13", "%r14", "memory", "cc"
+       );
+ }
+ 
+@@ -388,11 +388,11 @@ static inline void fmul_scalar(u64 *out,
+       asm volatile(
+               /* Compute the raw multiplication of f1*f2 */
+               "  mulxq 0(%2), %%r8, %%rcx;"      /* f1[0]*f2 */
+-              "  mulxq 8(%2), %%r9, %%r12;"      /* f1[1]*f2 */
++              "  mulxq 8(%2), %%r9, %%rbx;"      /* f1[1]*f2 */
+               "  add %%rcx, %%r9;"
+               "  mov $0, %%rcx;"
+               "  mulxq 16(%2), %%r10, %%r13;"    /* f1[2]*f2 */
+-              "  adcx %%r12, %%r10;"
++              "  adcx %%rbx, %%r10;"
+               "  mulxq 24(%2), %%r11, %%rax;"    /* f1[3]*f2 */
+               "  adcx %%r13, %%r11;"
+               "  adcx %%rcx, %%rax;"
+@@ -419,7 +419,7 @@ static inline void fmul_scalar(u64 *out,
+               "  movq %%r8, 0(%1);"
+       : "+&r" (f2_r)
+       : "r" (out), "r" (f1)
+-      : "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "memory", "cc"
++      : "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "%rbx", "%r13", "memory", "cc"
+       );
+ }
+ 
+@@ -520,8 +520,8 @@ static inline void fsqr(u64 *out, const
+               "  mulxq 16(%1), %%r9, %%r10;"     "  adcx %%r14, %%r9;"     /* f[2]*f[0] */
+               "  mulxq 24(%1), %%rax, %%rcx;"    "  adcx %%rax, %%r10;"    /* f[3]*f[0] */
+               "  movq 24(%1), %%rdx;"                                      /* f[3] */
+-              "  mulxq 8(%1), %%r11, %%r12;"     "  adcx %%rcx, %%r11;"    /* f[1]*f[3] */
+-              "  mulxq 16(%1), %%rax, %%r13;"    "  adcx %%rax, %%r12;"    /* f[2]*f[3] */
++              "  mulxq 8(%1), %%r11, %%rbx;"     "  adcx %%rcx, %%r11;"    /* f[1]*f[3] */
++              "  mulxq 16(%1), %%rax, %%r13;"    "  adcx %%rax, %%rbx;"    /* f[2]*f[3] */
+               "  movq 8(%1), %%rdx;"             "  adcx %%r15, %%r13;"    /* f1 */
+               "  mulxq 16(%1), %%rax, %%rcx;"    "  mov $0, %%r14;"        /* f[2]*f[1] */
+ 
+@@ -531,12 +531,12 @@ static inline void fsqr(u64 *out, const
+               "  adcx %%r8, %%r8;"
+               "  adox %%rcx, %%r11;"
+               "  adcx %%r9, %%r9;"
+-              "  adox %%r15, %%r12;"
++              "  adox %%r15, %%rbx;"
+               "  adcx %%r10, %%r10;"
+               "  adox %%r15, %%r13;"
+               "  adcx %%r11, %%r11;"
+               "  adox %%r15, %%r14;"
+-              "  adcx %%r12, %%r12;"
++              "  adcx %%rbx, %%rbx;"
+               "  adcx %%r13, %%r13;"
+               "  adcx %%r14, %%r14;"
+ 
+@@ -549,7 +549,7 @@ static inline void fsqr(u64 *out, const
+               "  adcx %%rcx, %%r10;"     "  movq %%r10, 24(%0);"
+               "  movq 16(%1), %%rdx;"    "  mulx %%rdx, %%rax, %%rcx;"    /* f[2]^2 */
+               "  adcx %%rax, %%r11;"     "  movq %%r11, 32(%0);"
+-              "  adcx %%rcx, %%r12;"     "  movq %%r12, 40(%0);"
++              "  adcx %%rcx, %%rbx;"     "  movq %%rbx, 40(%0);"
+               "  movq 24(%1), %%rdx;"    "  mulx %%rdx, %%rax, %%rcx;"    /* f[3]^2 */
+               "  adcx %%rax, %%r13;"     "  movq %%r13, 48(%0);"
+               "  adcx %%rcx, %%r14;"     "  movq %%r14, 56(%0);"
+@@ -565,11 +565,11 @@ static inline void fsqr(u64 *out, const
+               "  mulxq 32(%1), %%r8, %%r13;"
+               "  xor %%rcx, %%rcx;"
+               "  adoxq 0(%1), %%r8;"
+-              "  mulxq 40(%1), %%r9, %%r12;"
++              "  mulxq 40(%1), %%r9, %%rbx;"
+               "  adcx %%r13, %%r9;"
+               "  adoxq 8(%1), %%r9;"
+               "  mulxq 48(%1), %%r10, %%r13;"
+-              "  adcx %%r12, %%r10;"
++              "  adcx %%rbx, %%r10;"
+               "  adoxq 16(%1), %%r10;"
+               "  mulxq 56(%1), %%r11, %%rax;"
+               "  adcx %%r13, %%r11;"
+@@ -594,7 +594,7 @@ static inline void fsqr(u64 *out, const
+               "  movq %%r8, 0(%0);"
+       : "+&r" (tmp), "+&r" (f), "+&r" (out)
+       :
+-      : "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "memory", "cc"
++      : "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%rbx", "%r13", "%r14", "%r15", "memory", "cc"
+       );
+ }
+ 
+@@ -611,8 +611,8 @@ static inline void fsqr2(u64 *out, const
+               "  mulxq 16(%1), %%r9, %%r10;"     "  adcx %%r14, %%r9;"     /* f[2]*f[0] */
+               "  mulxq 24(%1), %%rax, %%rcx;"    "  adcx %%rax, %%r10;"    /* f[3]*f[0] */
+               "  movq 24(%1), %%rdx;"                                      /* f[3] */
+-              "  mulxq 8(%1), %%r11, %%r12;"     "  adcx %%rcx, %%r11;"    /* f[1]*f[3] */
+-              "  mulxq 16(%1), %%rax, %%r13;"    "  adcx %%rax, %%r12;"    /* f[2]*f[3] */
++              "  mulxq 8(%1), %%r11, %%rbx;"     "  adcx %%rcx, %%r11;"    /* f[1]*f[3] */
++              "  mulxq 16(%1), %%rax, %%r13;"    "  adcx %%rax, %%rbx;"    /* f[2]*f[3] */
+               "  movq 8(%1), %%rdx;"             "  adcx %%r15, %%r13;"    /* f1 */
+               "  mulxq 16(%1), %%rax, %%rcx;"    "  mov $0, %%r14;"        /* f[2]*f[1] */
+ 
+@@ -622,12 +622,12 @@ static inline void fsqr2(u64 *out, const
+               "  adcx %%r8, %%r8;"
+               "  adox %%rcx, %%r11;"
+               "  adcx %%r9, %%r9;"
+-              "  adox %%r15, %%r12;"
++              "  adox %%r15, %%rbx;"
+               "  adcx %%r10, %%r10;"
+               "  adox %%r15, %%r13;"
+               "  adcx %%r11, %%r11;"
+               "  adox %%r15, %%r14;"
+-              "  adcx %%r12, %%r12;"
++              "  adcx %%rbx, %%rbx;"
+               "  adcx %%r13, %%r13;"
+               "  adcx %%r14, %%r14;"
+ 
+@@ -640,7 +640,7 @@ static inline void fsqr2(u64 *out, const
+               "  adcx %%rcx, %%r10;"     "  movq %%r10, 24(%0);"
+               "  movq 16(%1), %%rdx;"    "  mulx %%rdx, %%rax, %%rcx;"    /* f[2]^2 */
+               "  adcx %%rax, %%r11;"     "  movq %%r11, 32(%0);"
+-              "  adcx %%rcx, %%r12;"     "  movq %%r12, 40(%0);"
++              "  adcx %%rcx, %%rbx;"     "  movq %%rbx, 40(%0);"
+               "  movq 24(%1), %%rdx;"    "  mulx %%rdx, %%rax, %%rcx;"    /* f[3]^2 */
+               "  adcx %%rax, %%r13;"     "  movq %%r13, 48(%0);"
+               "  adcx %%rcx, %%r14;"     "  movq %%r14, 56(%0);"
+@@ -651,8 +651,8 @@ static inline void fsqr2(u64 *out, const
+               "  mulxq 48(%1), %%r9, %%r10;"     "  adcx %%r14, %%r9;"     /* f[2]*f[0] */
+               "  mulxq 56(%1), %%rax, %%rcx;"    "  adcx %%rax, %%r10;"    /* f[3]*f[0] */
+               "  movq 56(%1), %%rdx;"                                      /* f[3] */
+-              "  mulxq 40(%1), %%r11, %%r12;"     "  adcx %%rcx, %%r11;"    /* f[1]*f[3] */
+-              "  mulxq 48(%1), %%rax, %%r13;"    "  adcx %%rax, %%r12;"    /* f[2]*f[3] */
++              "  mulxq 40(%1), %%r11, %%rbx;"     "  adcx %%rcx, %%r11;"    /* f[1]*f[3] */
++              "  mulxq 48(%1), %%rax, %%r13;"    "  adcx %%rax, %%rbx;"    /* f[2]*f[3] */
+               "  movq 40(%1), %%rdx;"             "  adcx %%r15, %%r13;"    /* f1 */
+               "  mulxq 48(%1), %%rax, %%rcx;"    "  mov $0, %%r14;"        /* f[2]*f[1] */
+ 
+@@ -662,12 +662,12 @@ static inline void fsqr2(u64 *out, const
+               "  adcx %%r8, %%r8;"
+               "  adox %%rcx, %%r11;"
+               "  adcx %%r9, %%r9;"
+-              "  adox %%r15, %%r12;"
++              "  adox %%r15, %%rbx;"
+               "  adcx %%r10, %%r10;"
+               "  adox %%r15, %%r13;"
+               "  adcx %%r11, %%r11;"
+               "  adox %%r15, %%r14;"
+-              "  adcx %%r12, %%r12;"
++              "  adcx %%rbx, %%rbx;"
+               "  adcx %%r13, %%r13;"
+               "  adcx %%r14, %%r14;"
+ 
+@@ -680,7 +680,7 @@ static inline void fsqr2(u64 *out, const
+               "  adcx %%rcx, %%r10;"     "  movq %%r10, 88(%0);"
+               "  movq 48(%1), %%rdx;"    "  mulx %%rdx, %%rax, %%rcx;"    /* f[2]^2 */
+               "  adcx %%rax, %%r11;"     "  movq %%r11, 96(%0);"
+-              "  adcx %%rcx, %%r12;"     "  movq %%r12, 104(%0);"
++              "  adcx %%rcx, %%rbx;"     "  movq %%rbx, 104(%0);"
+               "  movq 56(%1), %%rdx;"    "  mulx %%rdx, %%rax, %%rcx;"    /* f[3]^2 */
+               "  adcx %%rax, %%r13;"     "  movq %%r13, 112(%0);"
+               "  adcx %%rcx, %%r14;"     "  movq %%r14, 120(%0);"
+@@ -694,11 +694,11 @@ static inline void fsqr2(u64 *out, const
+               "  mulxq 32(%1), %%r8, %%r13;"
+               "  xor %%rcx, %%rcx;"
+               "  adoxq 0(%1), %%r8;"
+-              "  mulxq 40(%1), %%r9, %%r12;"
++              "  mulxq 40(%1), %%r9, %%rbx;"
+               "  adcx %%r13, %%r9;"
+               "  adoxq 8(%1), %%r9;"
+               "  mulxq 48(%1), %%r10, %%r13;"
+-              "  adcx %%r12, %%r10;"
++              "  adcx %%rbx, %%r10;"
+               "  adoxq 16(%1), %%r10;"
+               "  mulxq 56(%1), %%r11, %%rax;"
+               "  adcx %%r13, %%r11;"
+@@ -727,11 +727,11 @@ static inline void fsqr2(u64 *out, const
+               "  mulxq 96(%1), %%r8, %%r13;"
+               "  xor %%rcx, %%rcx;"
+               "  adoxq 64(%1), %%r8;"
+-              "  mulxq 104(%1), %%r9, %%r12;"
++              "  mulxq 104(%1), %%r9, %%rbx;"
+               "  adcx %%r13, %%r9;"
+               "  adoxq 72(%1), %%r9;"
+               "  mulxq 112(%1), %%r10, %%r13;"
+-              "  adcx %%r12, %%r10;"
++              "  adcx %%rbx, %%r10;"
+               "  adoxq 80(%1), %%r10;"
+               "  mulxq 120(%1), %%r11, %%rax;"
+               "  adcx %%r13, %%r11;"
+@@ -756,7 +756,7 @@ static inline void fsqr2(u64 *out, const
+               "  movq %%r8, 32(%0);"
+       : "+&r" (tmp), "+&r" (f), "+&r" (out)
+       :
+-      : "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "memory", "cc"
++      : "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%rbx", "%r13", "%r14", "%r15", "memory", "cc"
+       );
+ }
+ 
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0056-crypto-arm-64-poly1305-add-artifact-to-.gitignore-fi.patch b/target/linux/generic/backport-5.4/080-wireguard-0056-crypto-arm-64-poly1305-add-artifact-to-.gitignore-fi.patch

new file mode 100644 (file)

index 0000000..528de43
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0056-crypto-arm-64-poly1305-add-artifact-to-.gitignore-fi.patch
@@ -0,0 +1,36 @@
+From 216f24cb4aba8385025c38da0f79c4aa8e637484 Mon Sep 17 00:00:00 2001
+From: "Jason A. Donenfeld" <Jason@zx2c4.com>
+Date: Thu, 19 Mar 2020 11:56:17 -0600
+Subject: [PATCH 056/124] crypto: arm[64]/poly1305 - add artifact to .gitignore
+ files
+
+commit 6e4e00d8b68ca7eb30d08afb740033e0d36abe55 upstream.
+
+The .S_shipped yields a .S, and the pattern in these directories is to
+add that to .gitignore so that git-status doesn't raise a fuss.
+
+Fixes: a6b803b3ddc7 ("crypto: arm/poly1305 - incorporate OpenSSL/CRYPTOGAMS NEON implementation")
+Fixes: f569ca164751 ("crypto: arm64/poly1305 - incorporate OpenSSL/CRYPTOGAMS NEON implementation")
+Reported-by: Emil Renner Berthing <kernel@esmil.dk>
+Cc: Ard Biesheuvel <ardb@kernel.org>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ arch/arm/crypto/.gitignore   | 1 +
+ arch/arm64/crypto/.gitignore | 1 +
+ 2 files changed, 2 insertions(+)
+
+--- a/arch/arm/crypto/.gitignore
++++ b/arch/arm/crypto/.gitignore
+@@ -1,3 +1,4 @@
+ aesbs-core.S
+ sha256-core.S
+ sha512-core.S
++poly1305-core.S
+--- a/arch/arm64/crypto/.gitignore
++++ b/arch/arm64/crypto/.gitignore
+@@ -1,2 +1,3 @@
+ sha256-core.S
+ sha512-core.S
++poly1305-core.S
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0057-crypto-arch-lib-limit-simd-usage-to-4k-chunks.patch b/target/linux/generic/backport-5.4/080-wireguard-0057-crypto-arch-lib-limit-simd-usage-to-4k-chunks.patch

new file mode 100644 (file)

index 0000000..bb0f580
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0057-crypto-arch-lib-limit-simd-usage-to-4k-chunks.patch
@@ -0,0 +1,243 @@
+From af386d2b1f9207290a12aa97ecec8b428f3bebb2 Mon Sep 17 00:00:00 2001
+From: "Jason A. Donenfeld" <Jason@zx2c4.com>
+Date: Thu, 23 Apr 2020 15:54:04 -0600
+Subject: [PATCH 057/124] crypto: arch/lib - limit simd usage to 4k chunks
+
+commit 706024a52c614b478b63f7728d202532ce6591a9 upstream.
+
+The initial Zinc patchset, after some mailing list discussion, contained
+code to ensure that a kernel_fpu_begin() section would not be kept open for
+more than a 4k chunk, since it disables preemption. The choice of 4k isn't
+totally scientific, but it's not a bad guess either, and it's what's used in
+the x86 poly1305, blake2s, and nhpoly1305 code already (in the form
+of PAGE_SIZE, which this commit corrects to be explicitly 4k for the
+former two).
+
+Ard did some back-of-the-envelope calculations and found that
+at 5 cycles/byte (overestimate) on a 1 GHz processor (pretty slow), 4k
+means we have a maximum preemption disabling of 20 us, which Sebastian
+confirmed was probably a good limit.
+
+Unfortunately the chunking appears to have been left out of the final
+patchset that added the glue code. So, this commit adds it back in.
+
+Fixes: 84e03fa39fbe ("crypto: x86/chacha - expose SIMD ChaCha routine as library function")
+Fixes: b3aad5bad26a ("crypto: arm64/chacha - expose arm64 ChaCha routine as library function")
+Fixes: a44a3430d71b ("crypto: arm/chacha - expose ARM ChaCha routine as library function")
+Fixes: d7d7b8535662 ("crypto: x86/poly1305 - wire up faster implementations for kernel")
+Fixes: f569ca164751 ("crypto: arm64/poly1305 - incorporate OpenSSL/CRYPTOGAMS NEON implementation")
+Fixes: a6b803b3ddc7 ("crypto: arm/poly1305 - incorporate OpenSSL/CRYPTOGAMS NEON implementation")
+Fixes: ed0356eda153 ("crypto: blake2s - x86_64 SIMD implementation")
+Cc: Eric Biggers <ebiggers@google.com>
+Cc: Ard Biesheuvel <ardb@kernel.org>
+Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+Cc: stable@vger.kernel.org
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
+Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ arch/arm/crypto/chacha-glue.c        | 14 +++++++++++---
+ arch/arm/crypto/poly1305-glue.c      | 15 +++++++++++----
+ arch/arm64/crypto/chacha-neon-glue.c | 14 +++++++++++---
+ arch/arm64/crypto/poly1305-glue.c    | 15 +++++++++++----
+ arch/x86/crypto/blake2s-glue.c       | 10 ++++------
+ arch/x86/crypto/chacha_glue.c        | 14 +++++++++++---
+ arch/x86/crypto/poly1305_glue.c      | 13 ++++++-------
+ 7 files changed, 65 insertions(+), 30 deletions(-)
+
+--- a/arch/arm/crypto/chacha-glue.c
++++ b/arch/arm/crypto/chacha-glue.c
+@@ -91,9 +91,17 @@ void chacha_crypt_arch(u32 *state, u8 *d
+               return;
+       }
+ 
+-      kernel_neon_begin();
+-      chacha_doneon(state, dst, src, bytes, nrounds);
+-      kernel_neon_end();
++      do {
++              unsigned int todo = min_t(unsigned int, bytes, SZ_4K);
++
++              kernel_neon_begin();
++              chacha_doneon(state, dst, src, todo, nrounds);
++              kernel_neon_end();
++
++              bytes -= todo;
++              src += todo;
++              dst += todo;
++      } while (bytes);
+ }
+ EXPORT_SYMBOL(chacha_crypt_arch);
+ 
+--- a/arch/arm/crypto/poly1305-glue.c
++++ b/arch/arm/crypto/poly1305-glue.c
+@@ -160,13 +160,20 @@ void poly1305_update_arch(struct poly130
+               unsigned int len = round_down(nbytes, POLY1305_BLOCK_SIZE);
+ 
+               if (static_branch_likely(&have_neon) && do_neon) {
+-                      kernel_neon_begin();
+-                      poly1305_blocks_neon(&dctx->h, src, len, 1);
+-                      kernel_neon_end();
++                      do {
++                              unsigned int todo = min_t(unsigned int, len, SZ_4K);
++
++                              kernel_neon_begin();
++                              poly1305_blocks_neon(&dctx->h, src, todo, 1);
++                              kernel_neon_end();
++
++                              len -= todo;
++                              src += todo;
++                      } while (len);
+               } else {
+                       poly1305_blocks_arm(&dctx->h, src, len, 1);
++                      src += len;
+               }
+-              src += len;
+               nbytes %= POLY1305_BLOCK_SIZE;
+       }
+ 
+--- a/arch/arm64/crypto/chacha-neon-glue.c
++++ b/arch/arm64/crypto/chacha-neon-glue.c
+@@ -87,9 +87,17 @@ void chacha_crypt_arch(u32 *state, u8 *d
+           !crypto_simd_usable())
+               return chacha_crypt_generic(state, dst, src, bytes, nrounds);
+ 
+-      kernel_neon_begin();
+-      chacha_doneon(state, dst, src, bytes, nrounds);
+-      kernel_neon_end();
++      do {
++              unsigned int todo = min_t(unsigned int, bytes, SZ_4K);
++
++              kernel_neon_begin();
++              chacha_doneon(state, dst, src, todo, nrounds);
++              kernel_neon_end();
++
++              bytes -= todo;
++              src += todo;
++              dst += todo;
++      } while (bytes);
+ }
+ EXPORT_SYMBOL(chacha_crypt_arch);
+ 
+--- a/arch/arm64/crypto/poly1305-glue.c
++++ b/arch/arm64/crypto/poly1305-glue.c
+@@ -143,13 +143,20 @@ void poly1305_update_arch(struct poly130
+               unsigned int len = round_down(nbytes, POLY1305_BLOCK_SIZE);
+ 
+               if (static_branch_likely(&have_neon) && crypto_simd_usable()) {
+-                      kernel_neon_begin();
+-                      poly1305_blocks_neon(&dctx->h, src, len, 1);
+-                      kernel_neon_end();
++                      do {
++                              unsigned int todo = min_t(unsigned int, len, SZ_4K);
++
++                              kernel_neon_begin();
++                              poly1305_blocks_neon(&dctx->h, src, todo, 1);
++                              kernel_neon_end();
++
++                              len -= todo;
++                              src += todo;
++                      } while (len);
+               } else {
+                       poly1305_blocks(&dctx->h, src, len, 1);
++                      src += len;
+               }
+-              src += len;
+               nbytes %= POLY1305_BLOCK_SIZE;
+       }
+ 
+--- a/arch/x86/crypto/blake2s-glue.c
++++ b/arch/x86/crypto/blake2s-glue.c
+@@ -32,16 +32,16 @@ void blake2s_compress_arch(struct blake2
+                          const u32 inc)
+ {
+       /* SIMD disables preemption, so relax after processing each page. */
+-      BUILD_BUG_ON(PAGE_SIZE / BLAKE2S_BLOCK_SIZE < 8);
++      BUILD_BUG_ON(SZ_4K / BLAKE2S_BLOCK_SIZE < 8);
+ 
+       if (!static_branch_likely(&blake2s_use_ssse3) || !crypto_simd_usable()) {
+               blake2s_compress_generic(state, block, nblocks, inc);
+               return;
+       }
+ 
+-      for (;;) {
++      do {
+               const size_t blocks = min_t(size_t, nblocks,
+-                                          PAGE_SIZE / BLAKE2S_BLOCK_SIZE);
++                                          SZ_4K / BLAKE2S_BLOCK_SIZE);
+ 
+               kernel_fpu_begin();
+               if (IS_ENABLED(CONFIG_AS_AVX512) &&
+@@ -52,10 +52,8 @@ void blake2s_compress_arch(struct blake2
+               kernel_fpu_end();
+ 
+               nblocks -= blocks;
+-              if (!nblocks)
+-                      break;
+               block += blocks * BLAKE2S_BLOCK_SIZE;
+-      }
++      } while (nblocks);
+ }
+ EXPORT_SYMBOL(blake2s_compress_arch);
+ 
+--- a/arch/x86/crypto/chacha_glue.c
++++ b/arch/x86/crypto/chacha_glue.c
+@@ -154,9 +154,17 @@ void chacha_crypt_arch(u32 *state, u8 *d
+           bytes <= CHACHA_BLOCK_SIZE)
+               return chacha_crypt_generic(state, dst, src, bytes, nrounds);
+ 
+-      kernel_fpu_begin();
+-      chacha_dosimd(state, dst, src, bytes, nrounds);
+-      kernel_fpu_end();
++      do {
++              unsigned int todo = min_t(unsigned int, bytes, SZ_4K);
++
++              kernel_fpu_begin();
++              chacha_dosimd(state, dst, src, todo, nrounds);
++              kernel_fpu_end();
++
++              bytes -= todo;
++              src += todo;
++              dst += todo;
++      } while (bytes);
+ }
+ EXPORT_SYMBOL(chacha_crypt_arch);
+ 
+--- a/arch/x86/crypto/poly1305_glue.c
++++ b/arch/x86/crypto/poly1305_glue.c
+@@ -91,8 +91,8 @@ static void poly1305_simd_blocks(void *c
+       struct poly1305_arch_internal *state = ctx;
+ 
+       /* SIMD disables preemption, so relax after processing each page. */
+-      BUILD_BUG_ON(PAGE_SIZE < POLY1305_BLOCK_SIZE ||
+-                   PAGE_SIZE % POLY1305_BLOCK_SIZE);
++      BUILD_BUG_ON(SZ_4K < POLY1305_BLOCK_SIZE ||
++                   SZ_4K % POLY1305_BLOCK_SIZE);
+ 
+       if (!IS_ENABLED(CONFIG_AS_AVX) || !static_branch_likely(&poly1305_use_avx) ||
+           (len < (POLY1305_BLOCK_SIZE * 18) && !state->is_base2_26) ||
+@@ -102,8 +102,8 @@ static void poly1305_simd_blocks(void *c
+               return;
+       }
+ 
+-      for (;;) {
+-              const size_t bytes = min_t(size_t, len, PAGE_SIZE);
++      do {
++              const size_t bytes = min_t(size_t, len, SZ_4K);
+ 
+               kernel_fpu_begin();
+               if (IS_ENABLED(CONFIG_AS_AVX512) && static_branch_likely(&poly1305_use_avx512))
+@@ -113,11 +113,10 @@ static void poly1305_simd_blocks(void *c
+               else
+                       poly1305_blocks_avx(ctx, inp, bytes, padbit);
+               kernel_fpu_end();
++
+               len -= bytes;
+-              if (!len)
+-                      break;
+               inp += bytes;
+-      }
++      } while (len);
+ }
+ 
+ static void poly1305_simd_emit(void *ctx, u8 mac[POLY1305_DIGEST_SIZE],
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0058-crypto-lib-chacha20poly1305-Add-missing-function-dec.patch b/target/linux/generic/backport-5.4/080-wireguard-0058-crypto-lib-chacha20poly1305-Add-missing-function-dec.patch

new file mode 100644 (file)

index 0000000..0653e3a
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0058-crypto-lib-chacha20poly1305-Add-missing-function-dec.patch
@@ -0,0 +1,38 @@
+From 58c2229461f888087fc3175650bc2e6aa70fd862 Mon Sep 17 00:00:00 2001
+From: Herbert Xu <herbert@gondor.apana.org.au>
+Date: Wed, 8 Jul 2020 12:41:13 +1000
+Subject: [PATCH 058/124] crypto: lib/chacha20poly1305 - Add missing function
+ declaration
+
+commit 06cc2afbbdf9a9e8df3e2f8db724997dd6e1b4ac upstream.
+
+This patch adds a declaration for chacha20poly1305_selftest to
+silence a sparse warning.
+
+Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ include/crypto/chacha20poly1305.h | 2 ++
+ lib/crypto/chacha20poly1305.c     | 2 --
+ 2 files changed, 2 insertions(+), 2 deletions(-)
+
+--- a/include/crypto/chacha20poly1305.h
++++ b/include/crypto/chacha20poly1305.h
+@@ -45,4 +45,6 @@ bool chacha20poly1305_decrypt_sg_inplace
+                                        const u64 nonce,
+                                        const u8 key[CHACHA20POLY1305_KEY_SIZE]);
+ 
++bool chacha20poly1305_selftest(void);
++
+ #endif /* __CHACHA20POLY1305_H */
+--- a/lib/crypto/chacha20poly1305.c
++++ b/lib/crypto/chacha20poly1305.c
+@@ -21,8 +21,6 @@
+ 
+ #define CHACHA_KEY_WORDS      (CHACHA_KEY_SIZE / sizeof(u32))
+ 
+-bool __init chacha20poly1305_selftest(void);
+-
+ static void chacha_load_key(u32 *k, const u8 *in)
+ {
+       k[0] = get_unaligned_le32(in);
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0059-crypto-x86-chacha-sse3-use-unaligned-loads-for-state.patch b/target/linux/generic/backport-5.4/080-wireguard-0059-crypto-x86-chacha-sse3-use-unaligned-loads-for-state.patch

new file mode 100644 (file)

index 0000000..6524393
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0059-crypto-x86-chacha-sse3-use-unaligned-loads-for-state.patch
@@ -0,0 +1,148 @@
+From 833ca409e17c10f4affb5879e22a03fdf1933439 Mon Sep 17 00:00:00 2001
+From: Ard Biesheuvel <ardb@kernel.org>
+Date: Wed, 8 Jul 2020 12:11:18 +0300
+Subject: [PATCH 059/124] crypto: x86/chacha-sse3 - use unaligned loads for
+ state array
+
+commit e79a31715193686e92dadb4caedfbb1f5de3659c upstream.
+
+Due to the fact that the x86 port does not support allocating objects
+on the stack with an alignment that exceeds 8 bytes, we have a rather
+ugly hack in the x86 code for ChaCha to ensure that the state array is
+aligned to 16 bytes, allowing the SSE3 implementation of the algorithm
+to use aligned loads.
+
+Given that the performance benefit of using aligned loads appears to
+be limited (~0.25% for 1k blocks using tcrypt on a Corei7-8650U), and
+the fact that this hack has leaked into generic ChaCha code, let's just
+remove it.
+
+Cc: Martin Willi <martin@strongswan.org>
+Cc: Herbert Xu <herbert@gondor.apana.org.au>
+Cc: Eric Biggers <ebiggers@kernel.org>
+Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
+Reviewed-by: Martin Willi <martin@strongswan.org>
+Reviewed-by: Eric Biggers <ebiggers@google.com>
+Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ arch/x86/crypto/chacha-ssse3-x86_64.S | 16 ++++++++--------
+ arch/x86/crypto/chacha_glue.c         | 17 ++---------------
+ include/crypto/chacha.h               |  4 ----
+ 3 files changed, 10 insertions(+), 27 deletions(-)
+
+--- a/arch/x86/crypto/chacha-ssse3-x86_64.S
++++ b/arch/x86/crypto/chacha-ssse3-x86_64.S
+@@ -120,10 +120,10 @@ ENTRY(chacha_block_xor_ssse3)
+       FRAME_BEGIN
+ 
+       # x0..3 = s0..3
+-      movdqa          0x00(%rdi),%xmm0
+-      movdqa          0x10(%rdi),%xmm1
+-      movdqa          0x20(%rdi),%xmm2
+-      movdqa          0x30(%rdi),%xmm3
++      movdqu          0x00(%rdi),%xmm0
++      movdqu          0x10(%rdi),%xmm1
++      movdqu          0x20(%rdi),%xmm2
++      movdqu          0x30(%rdi),%xmm3
+       movdqa          %xmm0,%xmm8
+       movdqa          %xmm1,%xmm9
+       movdqa          %xmm2,%xmm10
+@@ -205,10 +205,10 @@ ENTRY(hchacha_block_ssse3)
+       # %edx: nrounds
+       FRAME_BEGIN
+ 
+-      movdqa          0x00(%rdi),%xmm0
+-      movdqa          0x10(%rdi),%xmm1
+-      movdqa          0x20(%rdi),%xmm2
+-      movdqa          0x30(%rdi),%xmm3
++      movdqu          0x00(%rdi),%xmm0
++      movdqu          0x10(%rdi),%xmm1
++      movdqu          0x20(%rdi),%xmm2
++      movdqu          0x30(%rdi),%xmm3
+ 
+       mov             %edx,%r8d
+       call            chacha_permute
+--- a/arch/x86/crypto/chacha_glue.c
++++ b/arch/x86/crypto/chacha_glue.c
+@@ -14,8 +14,6 @@
+ #include <linux/module.h>
+ #include <asm/simd.h>
+ 
+-#define CHACHA_STATE_ALIGN 16
+-
+ asmlinkage void chacha_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
+                                      unsigned int len, int nrounds);
+ asmlinkage void chacha_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
+@@ -125,8 +123,6 @@ static void chacha_dosimd(u32 *state, u8
+ 
+ void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds)
+ {
+-      state = PTR_ALIGN(state, CHACHA_STATE_ALIGN);
+-
+       if (!static_branch_likely(&chacha_use_simd) || !crypto_simd_usable()) {
+               hchacha_block_generic(state, stream, nrounds);
+       } else {
+@@ -139,8 +135,6 @@ EXPORT_SYMBOL(hchacha_block_arch);
+ 
+ void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv)
+ {
+-      state = PTR_ALIGN(state, CHACHA_STATE_ALIGN);
+-
+       chacha_init_generic(state, key, iv);
+ }
+ EXPORT_SYMBOL(chacha_init_arch);
+@@ -148,8 +142,6 @@ EXPORT_SYMBOL(chacha_init_arch);
+ void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes,
+                      int nrounds)
+ {
+-      state = PTR_ALIGN(state, CHACHA_STATE_ALIGN);
+-
+       if (!static_branch_likely(&chacha_use_simd) || !crypto_simd_usable() ||
+           bytes <= CHACHA_BLOCK_SIZE)
+               return chacha_crypt_generic(state, dst, src, bytes, nrounds);
+@@ -171,15 +163,12 @@ EXPORT_SYMBOL(chacha_crypt_arch);
+ static int chacha_simd_stream_xor(struct skcipher_request *req,
+                                 const struct chacha_ctx *ctx, const u8 *iv)
+ {
+-      u32 *state, state_buf[16 + 2] __aligned(8);
++      u32 state[CHACHA_STATE_WORDS] __aligned(8);
+       struct skcipher_walk walk;
+       int err;
+ 
+       err = skcipher_walk_virt(&walk, req, false);
+ 
+-      BUILD_BUG_ON(CHACHA_STATE_ALIGN != 16);
+-      state = PTR_ALIGN(state_buf + 0, CHACHA_STATE_ALIGN);
+-
+       chacha_init_generic(state, ctx->key, iv);
+ 
+       while (walk.nbytes > 0) {
+@@ -218,12 +207,10 @@ static int xchacha_simd(struct skcipher_
+ {
+       struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+       struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
+-      u32 *state, state_buf[16 + 2] __aligned(8);
++      u32 state[CHACHA_STATE_WORDS] __aligned(8);
+       struct chacha_ctx subctx;
+       u8 real_iv[16];
+ 
+-      BUILD_BUG_ON(CHACHA_STATE_ALIGN != 16);
+-      state = PTR_ALIGN(state_buf + 0, CHACHA_STATE_ALIGN);
+       chacha_init_generic(state, ctx->key, req->iv);
+ 
+       if (req->cryptlen > CHACHA_BLOCK_SIZE && crypto_simd_usable()) {
+--- a/include/crypto/chacha.h
++++ b/include/crypto/chacha.h
+@@ -25,11 +25,7 @@
+ #define CHACHA_BLOCK_SIZE     64
+ #define CHACHAPOLY_IV_SIZE    12
+ 
+-#ifdef CONFIG_X86_64
+-#define CHACHA_STATE_WORDS    ((CHACHA_BLOCK_SIZE + 12) / sizeof(u32))
+-#else
+ #define CHACHA_STATE_WORDS    (CHACHA_BLOCK_SIZE / sizeof(u32))
+-#endif
+ 
+ /* 192-bit nonce, then 64-bit stream position */
+ #define XCHACHA_IV_SIZE               32
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0060-crypto-x86-curve25519-Remove-unused-carry-variables.patch b/target/linux/generic/backport-5.4/080-wireguard-0060-crypto-x86-curve25519-Remove-unused-carry-variables.patch

new file mode 100644 (file)

index 0000000..9c7c3c7
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0060-crypto-x86-curve25519-Remove-unused-carry-variables.patch
@@ -0,0 +1,47 @@
+From 9cfd2787b0b37940c656c6ea5fede6b3c360f0e5 Mon Sep 17 00:00:00 2001
+From: Herbert Xu <herbert@gondor.apana.org.au>
+Date: Thu, 23 Jul 2020 17:50:48 +1000
+Subject: [PATCH 060/124] crypto: x86/curve25519 - Remove unused carry
+ variables
+
+commit 054a5540fb8f7268e2c79e9deab4242db15c8cba upstream.
+
+The carry variables are assigned but never used, which upsets
+the compiler.  This patch removes them.
+
+Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
+Reviewed-by: Karthikeyan Bhargavan <karthik.bhargavan@gmail.com>
+Acked-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ arch/x86/crypto/curve25519-x86_64.c | 6 ++----
+ 1 file changed, 2 insertions(+), 4 deletions(-)
+
+--- a/arch/x86/crypto/curve25519-x86_64.c
++++ b/arch/x86/crypto/curve25519-x86_64.c
+@@ -948,10 +948,8 @@ static void store_felem(u64 *b, u64 *f)
+ {
+       u64 f30 = f[3U];
+       u64 top_bit0 = f30 >> (u32)63U;
+-      u64 carry0;
+       u64 f31;
+       u64 top_bit;
+-      u64 carry;
+       u64 f0;
+       u64 f1;
+       u64 f2;
+@@ -970,11 +968,11 @@ static void store_felem(u64 *b, u64 *f)
+       u64 o2;
+       u64 o3;
+       f[3U] = f30 & (u64)0x7fffffffffffffffU;
+-      carry0 = add_scalar(f, f, (u64)19U * top_bit0);
++      add_scalar(f, f, (u64)19U * top_bit0);
+       f31 = f[3U];
+       top_bit = f31 >> (u32)63U;
+       f[3U] = f31 & (u64)0x7fffffffffffffffU;
+-      carry = add_scalar(f, f, (u64)19U * top_bit);
++      add_scalar(f, f, (u64)19U * top_bit);
+       f0 = f[0U];
+       f1 = f[1U];
+       f2 = f[2U];
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0061-crypto-arm-curve25519-include-linux-scatterlist.h.patch b/target/linux/generic/backport-5.4/080-wireguard-0061-crypto-arm-curve25519-include-linux-scatterlist.h.patch

new file mode 100644 (file)

index 0000000..e1857f8
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0061-crypto-arm-curve25519-include-linux-scatterlist.h.patch
@@ -0,0 +1,37 @@
+From 6ae9f0d421af5145d457c51abe2b704ebb297a17 Mon Sep 17 00:00:00 2001
+From: Fabio Estevam <festevam@gmail.com>
+Date: Mon, 24 Aug 2020 11:09:53 -0300
+Subject: [PATCH 061/124] crypto: arm/curve25519 - include
+ <linux/scatterlist.h>
+
+commit 6779d0e6b0fe193ab3010ea201782ca6f75a3862 upstream.
+
+Building ARM allmodconfig leads to the following warnings:
+
+arch/arm/crypto/curve25519-glue.c:73:12: error: implicit declaration of function 'sg_copy_to_buffer' [-Werror=implicit-function-declaration]
+arch/arm/crypto/curve25519-glue.c:74:9: error: implicit declaration of function 'sg_nents_for_len' [-Werror=implicit-function-declaration]
+arch/arm/crypto/curve25519-glue.c:88:11: error: implicit declaration of function 'sg_copy_from_buffer' [-Werror=implicit-function-declaration]
+
+Include <linux/scatterlist.h> to fix such warnings
+
+Reported-by: Olof's autobuilder <build@lixom.net>
+Fixes: 0c3dc787a62a ("crypto: algapi - Remove skbuff.h inclusion")
+Signed-off-by: Fabio Estevam <festevam@gmail.com>
+Acked-by: Ard Biesheuvel <ardb@kernel.org>
+Acked-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ arch/arm/crypto/curve25519-glue.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/arch/arm/crypto/curve25519-glue.c
++++ b/arch/arm/crypto/curve25519-glue.c
+@@ -16,6 +16,7 @@
+ #include <linux/module.h>
+ #include <linux/init.h>
+ #include <linux/jump_label.h>
++#include <linux/scatterlist.h>
+ #include <crypto/curve25519.h>
+ 
+ asmlinkage void curve25519_neon(u8 mypublic[CURVE25519_KEY_SIZE],
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0062-crypto-arm-poly1305-Add-prototype-for-poly1305_block.patch b/target/linux/generic/backport-5.4/080-wireguard-0062-crypto-arm-poly1305-Add-prototype-for-poly1305_block.patch

new file mode 100644 (file)

index 0000000..7cdf0db
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0062-crypto-arm-poly1305-Add-prototype-for-poly1305_block.patch
@@ -0,0 +1,34 @@
+From 55a3d2044f411ecf291777f31053b8d8ee81c051 Mon Sep 17 00:00:00 2001
+From: Herbert Xu <herbert@gondor.apana.org.au>
+Date: Tue, 25 Aug 2020 11:23:00 +1000
+Subject: [PATCH 062/124] crypto: arm/poly1305 - Add prototype for
+ poly1305_blocks_neon
+
+commit 51982ea02aef972132eb35c583d3e4c5b83166e5 upstream.
+
+This patch adds a prototype for poly1305_blocks_neon to silence
+a compiler warning:
+
+  CC [M]  arch/arm/crypto/poly1305-glue.o
+../arch/arm/crypto/poly1305-glue.c:25:13: warning: no previous prototype for `poly1305_blocks_neon' [-Wmissing-prototypes]
+ void __weak poly1305_blocks_neon(void *state, const u8 *src, u32 len, u32 hibit)
+             ^~~~~~~~~~~~~~~~~~~~
+
+Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
+Acked-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ arch/arm/crypto/poly1305-glue.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/arch/arm/crypto/poly1305-glue.c
++++ b/arch/arm/crypto/poly1305-glue.c
+@@ -20,6 +20,7 @@
+ 
+ void poly1305_init_arm(void *state, const u8 *key);
+ void poly1305_blocks_arm(void *state, const u8 *src, u32 len, u32 hibit);
++void poly1305_blocks_neon(void *state, const u8 *src, u32 len, u32 hibit);
+ void poly1305_emit_arm(void *state, u8 *digest, const u32 *nonce);
+ 
+ void __weak poly1305_blocks_neon(void *state, const u8 *src, u32 len, u32 hibit)
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0063-crypto-curve25519-x86_64-Use-XORL-r32-32.patch b/target/linux/generic/backport-5.4/080-wireguard-0063-crypto-curve25519-x86_64-Use-XORL-r32-32.patch

new file mode 100644 (file)

index 0000000..9c2d666
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0063-crypto-curve25519-x86_64-Use-XORL-r32-32.patch
@@ -0,0 +1,261 @@
+From 4c4ab112443b42603d57b698111b55bfec278001 Mon Sep 17 00:00:00 2001
+From: Uros Bizjak <ubizjak@gmail.com>
+Date: Thu, 27 Aug 2020 19:30:58 +0200
+Subject: [PATCH 063/124] crypto: curve25519-x86_64 - Use XORL r32,32
+
+commit db719539fd3889836900bf912755aa30a5985e9a upstream.
+
+x86_64 zero extends 32bit operations, so for 64bit operands,
+XORL r32,r32 is functionally equal to XORQ r64,r64, but avoids
+a REX prefix byte when legacy registers are used.
+
+Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
+Cc: Herbert Xu <herbert@gondor.apana.org.au>
+Cc: "David S. Miller" <davem@davemloft.net>
+Acked-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ arch/x86/crypto/curve25519-x86_64.c | 68 ++++++++++++++---------------
+ 1 file changed, 34 insertions(+), 34 deletions(-)
+
+--- a/arch/x86/crypto/curve25519-x86_64.c
++++ b/arch/x86/crypto/curve25519-x86_64.c
+@@ -45,11 +45,11 @@ static inline u64 add_scalar(u64 *out, c
+ 
+       asm volatile(
+               /* Clear registers to propagate the carry bit */
+-              "  xor %%r8, %%r8;"
+-              "  xor %%r9, %%r9;"
+-              "  xor %%r10, %%r10;"
+-              "  xor %%r11, %%r11;"
+-              "  xor %1, %1;"
++              "  xor %%r8d, %%r8d;"
++              "  xor %%r9d, %%r9d;"
++              "  xor %%r10d, %%r10d;"
++              "  xor %%r11d, %%r11d;"
++              "  xor %k1, %k1;"
+ 
+               /* Begin addition chain */
+               "  addq 0(%3), %0;"
+@@ -93,7 +93,7 @@ static inline void fadd(u64 *out, const
+               "  cmovc %0, %%rax;"
+ 
+               /* Step 2: Add carry*38 to the original sum */
+-              "  xor %%rcx, %%rcx;"
++              "  xor %%ecx, %%ecx;"
+               "  add %%rax, %%r8;"
+               "  adcx %%rcx, %%r9;"
+               "  movq %%r9, 8(%1);"
+@@ -165,28 +165,28 @@ static inline void fmul(u64 *out, const
+ 
+               /* Compute src1[0] * src2 */
+               "  movq 0(%1), %%rdx;"
+-              "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"     "  movq %%r8, 0(%0);"
++              "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10d, %%r10d;"   "  movq %%r8, 0(%0);"
+               "  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  movq %%r10, 8(%0);"
+               "  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"
+               "  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  mov $0, %%rax;"
+                                                  "  adox %%rdx, %%rax;"
+               /* Compute src1[1] * src2 */
+               "  movq 8(%1), %%rdx;"
+-              "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"     "  adcxq 8(%0), %%r8;"    "  movq %%r8, 8(%0);"
++              "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10d, %%r10d;"   "  adcxq 8(%0), %%r8;"    "  movq %%r8, 8(%0);"
+               "  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 16(%0);"
+               "  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  mov $0, %%r8;"
+               "  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
+                                                  "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"
+               /* Compute src1[2] * src2 */
+               "  movq 16(%1), %%rdx;"
+-              "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"    "  adcxq 16(%0), %%r8;"    "  movq %%r8, 16(%0);"
++              "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10d, %%r10d;"   "  adcxq 16(%0), %%r8;"   "  movq %%r8, 16(%0);"
+               "  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 24(%0);"
+               "  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  mov $0, %%r8;"
+               "  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
+                                                  "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"
+               /* Compute src1[3] * src2 */
+               "  movq 24(%1), %%rdx;"
+-              "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"    "  adcxq 24(%0), %%r8;"    "  movq %%r8, 24(%0);"
++              "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10d, %%r10d;"   "  adcxq 24(%0), %%r8;"   "  movq %%r8, 24(%0);"
+               "  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 32(%0);"
+               "  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  movq %%rbx, 40(%0);"    "  mov $0, %%r8;"
+               "  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  movq %%r14, 48(%0);"    "  mov $0, %%rax;"
+@@ -200,7 +200,7 @@ static inline void fmul(u64 *out, const
+               /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
+               "  mov $38, %%rdx;"
+               "  mulxq 32(%1), %%r8, %%r13;"
+-              "  xor %3, %3;"
++              "  xor %k3, %k3;"
+               "  adoxq 0(%1), %%r8;"
+               "  mulxq 40(%1), %%r9, %%rbx;"
+               "  adcx %%r13, %%r9;"
+@@ -246,28 +246,28 @@ static inline void fmul2(u64 *out, const
+ 
+               /* Compute src1[0] * src2 */
+               "  movq 0(%1), %%rdx;"
+-              "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"     "  movq %%r8, 0(%0);"
++              "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10d, %%r10d;"   "  movq %%r8, 0(%0);"
+               "  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  movq %%r10, 8(%0);"
+               "  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"
+               "  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  mov $0, %%rax;"
+                                                  "  adox %%rdx, %%rax;"
+               /* Compute src1[1] * src2 */
+               "  movq 8(%1), %%rdx;"
+-              "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"     "  adcxq 8(%0), %%r8;"    "  movq %%r8, 8(%0);"
++              "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10d, %%r10d;"   "  adcxq 8(%0), %%r8;"    "  movq %%r8, 8(%0);"
+               "  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 16(%0);"
+               "  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  mov $0, %%r8;"
+               "  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
+                                                  "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"
+               /* Compute src1[2] * src2 */
+               "  movq 16(%1), %%rdx;"
+-              "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"    "  adcxq 16(%0), %%r8;"    "  movq %%r8, 16(%0);"
++              "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10d, %%r10d;"   "  adcxq 16(%0), %%r8;"   "  movq %%r8, 16(%0);"
+               "  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 24(%0);"
+               "  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  mov $0, %%r8;"
+               "  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
+                                                  "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"
+               /* Compute src1[3] * src2 */
+               "  movq 24(%1), %%rdx;"
+-              "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"    "  adcxq 24(%0), %%r8;"    "  movq %%r8, 24(%0);"
++              "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10d, %%r10d;"   "  adcxq 24(%0), %%r8;"   "  movq %%r8, 24(%0);"
+               "  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 32(%0);"
+               "  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  movq %%rbx, 40(%0);"    "  mov $0, %%r8;"
+               "  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  movq %%r14, 48(%0);"    "  mov $0, %%rax;"
+@@ -277,29 +277,29 @@ static inline void fmul2(u64 *out, const
+ 
+               /* Compute src1[0] * src2 */
+               "  movq 32(%1), %%rdx;"
+-              "  mulxq 32(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"     "  movq %%r8, 64(%0);"
+-              "  mulxq 40(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  movq %%r10, 72(%0);"
++              "  mulxq 32(%3), %%r8, %%r9;"      "  xor %%r10d, %%r10d;"   "  movq %%r8, 64(%0);"
++              "  mulxq 40(%3), %%r10, %%r11;"    "  adox %%r9, %%r10;"     "  movq %%r10, 72(%0);"
+               "  mulxq 48(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"
+               "  mulxq 56(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  mov $0, %%rax;"
+                                                  "  adox %%rdx, %%rax;"
+               /* Compute src1[1] * src2 */
+               "  movq 40(%1), %%rdx;"
+-              "  mulxq 32(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"     "  adcxq 72(%0), %%r8;"    "  movq %%r8, 72(%0);"
+-              "  mulxq 40(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 80(%0);"
++              "  mulxq 32(%3), %%r8, %%r9;"      "  xor %%r10d, %%r10d;"   "  adcxq 72(%0), %%r8;"   "  movq %%r8, 72(%0);"
++              "  mulxq 40(%3), %%r10, %%r11;"    "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 80(%0);"
+               "  mulxq 48(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  mov $0, %%r8;"
+               "  mulxq 56(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
+                                                  "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"
+               /* Compute src1[2] * src2 */
+               "  movq 48(%1), %%rdx;"
+-              "  mulxq 32(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"    "  adcxq 80(%0), %%r8;"    "  movq %%r8, 80(%0);"
+-              "  mulxq 40(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 88(%0);"
++              "  mulxq 32(%3), %%r8, %%r9;"      "  xor %%r10d, %%r10d;"   "  adcxq 80(%0), %%r8;"   "  movq %%r8, 80(%0);"
++              "  mulxq 40(%3), %%r10, %%r11;"    "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 88(%0);"
+               "  mulxq 48(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  mov $0, %%r8;"
+               "  mulxq 56(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
+                                                  "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"
+               /* Compute src1[3] * src2 */
+               "  movq 56(%1), %%rdx;"
+-              "  mulxq 32(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"    "  adcxq 88(%0), %%r8;"    "  movq %%r8, 88(%0);"
+-              "  mulxq 40(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 96(%0);"
++              "  mulxq 32(%3), %%r8, %%r9;"      "  xor %%r10d, %%r10d;"   "  adcxq 88(%0), %%r8;"   "  movq %%r8, 88(%0);"
++              "  mulxq 40(%3), %%r10, %%r11;"    "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 96(%0);"
+               "  mulxq 48(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  movq %%rbx, 104(%0);"    "  mov $0, %%r8;"
+               "  mulxq 56(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  movq %%r14, 112(%0);"    "  mov $0, %%rax;"
+                                                  "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"     "  movq %%rax, 120(%0);"
+@@ -312,7 +312,7 @@ static inline void fmul2(u64 *out, const
+               /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
+               "  mov $38, %%rdx;"
+               "  mulxq 32(%1), %%r8, %%r13;"
+-              "  xor %3, %3;"
++              "  xor %k3, %k3;"
+               "  adoxq 0(%1), %%r8;"
+               "  mulxq 40(%1), %%r9, %%rbx;"
+               "  adcx %%r13, %%r9;"
+@@ -345,7 +345,7 @@ static inline void fmul2(u64 *out, const
+               /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
+               "  mov $38, %%rdx;"
+               "  mulxq 96(%1), %%r8, %%r13;"
+-              "  xor %3, %3;"
++              "  xor %k3, %k3;"
+               "  adoxq 64(%1), %%r8;"
+               "  mulxq 104(%1), %%r9, %%rbx;"
+               "  adcx %%r13, %%r9;"
+@@ -516,7 +516,7 @@ static inline void fsqr(u64 *out, const
+ 
+               /* Step 1: Compute all partial products */
+               "  movq 0(%1), %%rdx;"                                       /* f[0] */
+-              "  mulxq 8(%1), %%r8, %%r14;"      "  xor %%r15, %%r15;"     /* f[1]*f[0] */
++              "  mulxq 8(%1), %%r8, %%r14;"      "  xor %%r15d, %%r15d;"   /* f[1]*f[0] */
+               "  mulxq 16(%1), %%r9, %%r10;"     "  adcx %%r14, %%r9;"     /* f[2]*f[0] */
+               "  mulxq 24(%1), %%rax, %%rcx;"    "  adcx %%rax, %%r10;"    /* f[3]*f[0] */
+               "  movq 24(%1), %%rdx;"                                      /* f[3] */
+@@ -526,7 +526,7 @@ static inline void fsqr(u64 *out, const
+               "  mulxq 16(%1), %%rax, %%rcx;"    "  mov $0, %%r14;"        /* f[2]*f[1] */
+ 
+               /* Step 2: Compute two parallel carry chains */
+-              "  xor %%r15, %%r15;"
++              "  xor %%r15d, %%r15d;"
+               "  adox %%rax, %%r10;"
+               "  adcx %%r8, %%r8;"
+               "  adox %%rcx, %%r11;"
+@@ -563,7 +563,7 @@ static inline void fsqr(u64 *out, const
+               /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
+               "  mov $38, %%rdx;"
+               "  mulxq 32(%1), %%r8, %%r13;"
+-              "  xor %%rcx, %%rcx;"
++              "  xor %%ecx, %%ecx;"
+               "  adoxq 0(%1), %%r8;"
+               "  mulxq 40(%1), %%r9, %%rbx;"
+               "  adcx %%r13, %%r9;"
+@@ -607,7 +607,7 @@ static inline void fsqr2(u64 *out, const
+       asm volatile(
+               /* Step 1: Compute all partial products */
+               "  movq 0(%1), %%rdx;"                                       /* f[0] */
+-              "  mulxq 8(%1), %%r8, %%r14;"      "  xor %%r15, %%r15;"     /* f[1]*f[0] */
++              "  mulxq 8(%1), %%r8, %%r14;"      "  xor %%r15d, %%r15d;"   /* f[1]*f[0] */
+               "  mulxq 16(%1), %%r9, %%r10;"     "  adcx %%r14, %%r9;"     /* f[2]*f[0] */
+               "  mulxq 24(%1), %%rax, %%rcx;"    "  adcx %%rax, %%r10;"    /* f[3]*f[0] */
+               "  movq 24(%1), %%rdx;"                                      /* f[3] */
+@@ -617,7 +617,7 @@ static inline void fsqr2(u64 *out, const
+               "  mulxq 16(%1), %%rax, %%rcx;"    "  mov $0, %%r14;"        /* f[2]*f[1] */
+ 
+               /* Step 2: Compute two parallel carry chains */
+-              "  xor %%r15, %%r15;"
++              "  xor %%r15d, %%r15d;"
+               "  adox %%rax, %%r10;"
+               "  adcx %%r8, %%r8;"
+               "  adox %%rcx, %%r11;"
+@@ -647,7 +647,7 @@ static inline void fsqr2(u64 *out, const
+ 
+               /* Step 1: Compute all partial products */
+               "  movq 32(%1), %%rdx;"                                       /* f[0] */
+-              "  mulxq 40(%1), %%r8, %%r14;"      "  xor %%r15, %%r15;"     /* f[1]*f[0] */
++              "  mulxq 40(%1), %%r8, %%r14;"     "  xor %%r15d, %%r15d;"   /* f[1]*f[0] */
+               "  mulxq 48(%1), %%r9, %%r10;"     "  adcx %%r14, %%r9;"     /* f[2]*f[0] */
+               "  mulxq 56(%1), %%rax, %%rcx;"    "  adcx %%rax, %%r10;"    /* f[3]*f[0] */
+               "  movq 56(%1), %%rdx;"                                      /* f[3] */
+@@ -657,7 +657,7 @@ static inline void fsqr2(u64 *out, const
+               "  mulxq 48(%1), %%rax, %%rcx;"    "  mov $0, %%r14;"        /* f[2]*f[1] */
+ 
+               /* Step 2: Compute two parallel carry chains */
+-              "  xor %%r15, %%r15;"
++              "  xor %%r15d, %%r15d;"
+               "  adox %%rax, %%r10;"
+               "  adcx %%r8, %%r8;"
+               "  adox %%rcx, %%r11;"
+@@ -692,7 +692,7 @@ static inline void fsqr2(u64 *out, const
+               /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
+               "  mov $38, %%rdx;"
+               "  mulxq 32(%1), %%r8, %%r13;"
+-              "  xor %%rcx, %%rcx;"
++              "  xor %%ecx, %%ecx;"
+               "  adoxq 0(%1), %%r8;"
+               "  mulxq 40(%1), %%r9, %%rbx;"
+               "  adcx %%r13, %%r9;"
+@@ -725,7 +725,7 @@ static inline void fsqr2(u64 *out, const
+               /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
+               "  mov $38, %%rdx;"
+               "  mulxq 96(%1), %%r8, %%r13;"
+-              "  xor %%rcx, %%rcx;"
++              "  xor %%ecx, %%ecx;"
+               "  adoxq 64(%1), %%r8;"
+               "  mulxq 104(%1), %%r9, %%rbx;"
+               "  adcx %%r13, %%r9;"
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0064-crypto-poly1305-x86_64-Use-XORL-r32-32.patch b/target/linux/generic/backport-5.4/080-wireguard-0064-crypto-poly1305-x86_64-Use-XORL-r32-32.patch

new file mode 100644 (file)

index 0000000..fa5c188
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0064-crypto-poly1305-x86_64-Use-XORL-r32-32.patch
@@ -0,0 +1,59 @@
+From a2c7d387da3b3cdb8b7c16ef91cce45f92ebcf61 Mon Sep 17 00:00:00 2001
+From: Uros Bizjak <ubizjak@gmail.com>
+Date: Thu, 27 Aug 2020 19:38:31 +0200
+Subject: [PATCH 064/124] crypto: poly1305-x86_64 - Use XORL r32,32
+
+commit 7dfd1e01b3dfc13431b1b25720cf2692a7e111ef upstream.
+
+x86_64 zero extends 32bit operations, so for 64bit operands,
+XORL r32,r32 is functionally equal to XORQ r64,r64, but avoids
+a REX prefix byte when legacy registers are used.
+
+Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
+Cc: Herbert Xu <herbert@gondor.apana.org.au>
+Cc: "David S. Miller" <davem@davemloft.net>
+Acked-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ arch/x86/crypto/poly1305-x86_64-cryptogams.pl | 8 ++++----
+ 1 file changed, 4 insertions(+), 4 deletions(-)
+
+--- a/arch/x86/crypto/poly1305-x86_64-cryptogams.pl
++++ b/arch/x86/crypto/poly1305-x86_64-cryptogams.pl
+@@ -246,7 +246,7 @@ $code.=<<___ if (!$kernel);
+ ___
+ &declare_function("poly1305_init_x86_64", 32, 3);
+ $code.=<<___;
+-      xor     %rax,%rax
++      xor     %eax,%eax
+       mov     %rax,0($ctx)            # initialize hash value
+       mov     %rax,8($ctx)
+       mov     %rax,16($ctx)
+@@ -2869,7 +2869,7 @@ $code.=<<___;
+ .type poly1305_init_base2_44,\@function,3
+ .align        32
+ poly1305_init_base2_44:
+-      xor     %rax,%rax
++      xor     %eax,%eax
+       mov     %rax,0($ctx)            # initialize hash value
+       mov     %rax,8($ctx)
+       mov     %rax,16($ctx)
+@@ -3963,7 +3963,7 @@ xor128_decrypt_n_pad:
+       mov     \$16,$len
+       sub     %r10,$len
+       xor     %eax,%eax
+-      xor     %r11,%r11
++      xor     %r11d,%r11d
+ .Loop_dec_byte:
+       mov     ($inp,$otp),%r11b
+       mov     ($otp),%al
+@@ -4101,7 +4101,7 @@ avx_handler:
+       .long   0xa548f3fc              # cld; rep movsq
+ 
+       mov     $disp,%rsi
+-      xor     %rcx,%rcx               # arg1, UNW_FLAG_NHANDLER
++      xor     %ecx,%ecx               # arg1, UNW_FLAG_NHANDLER
+       mov     8(%rsi),%rdx            # arg2, disp->ImageBase
+       mov     0(%rsi),%r8             # arg3, disp->ControlPc
+       mov     16(%rsi),%r9            # arg4, disp->FunctionEntry
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0065-crypto-x86-poly1305-Remove-assignments-with-no-effec.patch b/target/linux/generic/backport-5.4/080-wireguard-0065-crypto-x86-poly1305-Remove-assignments-with-no-effec.patch

new file mode 100644 (file)

index 0000000..0f8c836
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0065-crypto-x86-poly1305-Remove-assignments-with-no-effec.patch
@@ -0,0 +1,30 @@
+From 5502c4d51b8c27631ed1026ef172bd9ce58303d2 Mon Sep 17 00:00:00 2001
+From: Herbert Xu <herbert@gondor.apana.org.au>
+Date: Thu, 24 Sep 2020 13:29:04 +1000
+Subject: [PATCH 065/124] crypto: x86/poly1305 - Remove assignments with no
+ effect
+
+commit 4a0c1de64bf9d9027a6f19adfba89fc27893db23 upstream.
+
+This patch removes a few ineffectual assignments from the function
+crypto_poly1305_setdctxkey.
+
+Reported-by: kernel test robot <lkp@intel.com>
+Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ arch/x86/crypto/poly1305_glue.c | 3 ---
+ 1 file changed, 3 deletions(-)
+
+--- a/arch/x86/crypto/poly1305_glue.c
++++ b/arch/x86/crypto/poly1305_glue.c
+@@ -157,9 +157,6 @@ static unsigned int crypto_poly1305_setd
+                       dctx->s[1] = get_unaligned_le32(&inp[4]);
+                       dctx->s[2] = get_unaligned_le32(&inp[8]);
+                       dctx->s[3] = get_unaligned_le32(&inp[12]);
+-                      inp += POLY1305_BLOCK_SIZE;
+-                      len -= POLY1305_BLOCK_SIZE;
+-                      acc += POLY1305_BLOCK_SIZE;
+                       dctx->sset = true;
+               }
+       }
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0066-crypto-x86-poly1305-add-back-a-needed-assignment.patch b/target/linux/generic/backport-5.4/080-wireguard-0066-crypto-x86-poly1305-add-back-a-needed-assignment.patch

new file mode 100644 (file)

index 0000000..aebedb0
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0066-crypto-x86-poly1305-add-back-a-needed-assignment.patch
@@ -0,0 +1,33 @@
+From 4849474f7e021d0d2e33a008abf93cacebf812f4 Mon Sep 17 00:00:00 2001
+From: Eric Biggers <ebiggers@google.com>
+Date: Fri, 23 Oct 2020 15:27:48 -0700
+Subject: [PATCH 066/124] crypto: x86/poly1305 - add back a needed assignment
+
+commit c3a98c3ad5c0dc60a1ac66bf91147a3f39cac96b upstream.
+
+One of the assignments that was removed by commit 4a0c1de64bf9 ("crypto:
+x86/poly1305 - Remove assignments with no effect") is actually needed,
+since it affects the return value.
+
+This fixes the following crypto self-test failure:
+
+    alg: shash: poly1305-simd test failed (wrong result) on test vector 2, cfg="init+update+final aligned buffer"
+
+Fixes: 4a0c1de64bf9 ("crypto: x86/poly1305 - Remove assignments with no effect")
+Signed-off-by: Eric Biggers <ebiggers@google.com>
+Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ arch/x86/crypto/poly1305_glue.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/arch/x86/crypto/poly1305_glue.c
++++ b/arch/x86/crypto/poly1305_glue.c
+@@ -157,6 +157,7 @@ static unsigned int crypto_poly1305_setd
+                       dctx->s[1] = get_unaligned_le32(&inp[4]);
+                       dctx->s[2] = get_unaligned_le32(&inp[8]);
+                       dctx->s[3] = get_unaligned_le32(&inp[12]);
++                      acc += POLY1305_BLOCK_SIZE;
+                       dctx->sset = true;
+               }
+       }
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0067-crypto-Kconfig-CRYPTO_MANAGER_EXTRA_TESTS-requires-t.patch b/target/linux/generic/backport-5.4/080-wireguard-0067-crypto-Kconfig-CRYPTO_MANAGER_EXTRA_TESTS-requires-t.patch

new file mode 100644 (file)

index 0000000..430737e
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0067-crypto-Kconfig-CRYPTO_MANAGER_EXTRA_TESTS-requires-t.patch
@@ -0,0 +1,33 @@
+From 4517445d7df86d35d348f884a228e6979113d485 Mon Sep 17 00:00:00 2001
+From: "Jason A. Donenfeld" <Jason@zx2c4.com>
+Date: Mon, 2 Nov 2020 14:48:15 +0100
+Subject: [PATCH 067/124] crypto: Kconfig - CRYPTO_MANAGER_EXTRA_TESTS requires
+ the manager
+
+commit 6569e3097f1c4a490bdf2b23d326855e04942dfd upstream.
+
+The extra tests in the manager actually require the manager to be
+selected too. Otherwise the linker gives errors like:
+
+ld: arch/x86/crypto/chacha_glue.o: in function `chacha_simd_stream_xor':
+chacha_glue.c:(.text+0x422): undefined reference to `crypto_simd_disabled_for_test'
+
+Fixes: 2343d1529aff ("crypto: Kconfig - allow tests to be disabled when manager is disabled")
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ crypto/Kconfig | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/crypto/Kconfig
++++ b/crypto/Kconfig
+@@ -145,7 +145,7 @@ config CRYPTO_MANAGER_DISABLE_TESTS
+ 
+ config CRYPTO_MANAGER_EXTRA_TESTS
+       bool "Enable extra run-time crypto self tests"
+-      depends on DEBUG_KERNEL && !CRYPTO_MANAGER_DISABLE_TESTS
++      depends on DEBUG_KERNEL && !CRYPTO_MANAGER_DISABLE_TESTS && CRYPTO_MANAGER
+       help
+         Enable extra run-time self tests of registered crypto algorithms,
+         including randomized fuzz tests.
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0068-crypto-arm-chacha-neon-optimize-for-non-block-size-m.patch b/target/linux/generic/backport-5.4/080-wireguard-0068-crypto-arm-chacha-neon-optimize-for-non-block-size-m.patch

new file mode 100644 (file)

index 0000000..2ecdbec
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0068-crypto-arm-chacha-neon-optimize-for-non-block-size-m.patch
@@ -0,0 +1,272 @@
+From de69c3a866f93a10d86d25d04af54a722bebc420 Mon Sep 17 00:00:00 2001
+From: Ard Biesheuvel <ardb@kernel.org>
+Date: Tue, 3 Nov 2020 17:28:09 +0100
+Subject: [PATCH 068/124] crypto: arm/chacha-neon - optimize for non-block size
+ multiples
+
+commit 86cd97ec4b943af35562a74688bc4e909b32c3d1 upstream.
+
+The current NEON based ChaCha implementation for ARM is optimized for
+multiples of 4x the ChaCha block size (64 bytes). This makes sense for
+block encryption, but given that ChaCha is also often used in the
+context of networking, it makes sense to consider arbitrary length
+inputs as well.
+
+For example, WireGuard typically uses 1420 byte packets, and performing
+ChaCha encryption involves 5 invocations of chacha_4block_xor_neon()
+and 3 invocations of chacha_block_xor_neon(), where the last one also
+involves a memcpy() using a buffer on the stack to process the final
+chunk of 1420 % 64 == 12 bytes.
+
+Let's optimize for this case as well, by letting chacha_4block_xor_neon()
+deal with any input size between 64 and 256 bytes, using NEON permutation
+instructions and overlapping loads and stores. This way, the 140 byte
+tail of a 1420 byte input buffer can simply be processed in one go.
+
+This results in the following performance improvements for 1420 byte
+blocks, without significant impact on power-of-2 input sizes. (Note
+that Raspberry Pi is widely used in combination with a 32-bit kernel,
+even though the core is 64-bit capable)
+
+   Cortex-A8  (BeagleBone)       :   7%
+   Cortex-A15 (Calxeda Midway)   :  21%
+   Cortex-A53 (Raspberry Pi 3)   :   3%
+   Cortex-A72 (Raspberry Pi 4)   :  19%
+
+Cc: Eric Biggers <ebiggers@google.com>
+Cc: "Jason A . Donenfeld" <Jason@zx2c4.com>
+Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
+Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ arch/arm/crypto/chacha-glue.c      | 34 +++++------
+ arch/arm/crypto/chacha-neon-core.S | 97 +++++++++++++++++++++++++++---
+ 2 files changed, 107 insertions(+), 24 deletions(-)
+
+--- a/arch/arm/crypto/chacha-glue.c
++++ b/arch/arm/crypto/chacha-glue.c
+@@ -23,7 +23,7 @@
+ asmlinkage void chacha_block_xor_neon(const u32 *state, u8 *dst, const u8 *src,
+                                     int nrounds);
+ asmlinkage void chacha_4block_xor_neon(const u32 *state, u8 *dst, const u8 *src,
+-                                     int nrounds);
++                                     int nrounds, unsigned int nbytes);
+ asmlinkage void hchacha_block_arm(const u32 *state, u32 *out, int nrounds);
+ asmlinkage void hchacha_block_neon(const u32 *state, u32 *out, int nrounds);
+ 
+@@ -42,24 +42,24 @@ static void chacha_doneon(u32 *state, u8
+ {
+       u8 buf[CHACHA_BLOCK_SIZE];
+ 
+-      while (bytes >= CHACHA_BLOCK_SIZE * 4) {
+-              chacha_4block_xor_neon(state, dst, src, nrounds);
+-              bytes -= CHACHA_BLOCK_SIZE * 4;
+-              src += CHACHA_BLOCK_SIZE * 4;
+-              dst += CHACHA_BLOCK_SIZE * 4;
+-              state[12] += 4;
+-      }
+-      while (bytes >= CHACHA_BLOCK_SIZE) {
+-              chacha_block_xor_neon(state, dst, src, nrounds);
+-              bytes -= CHACHA_BLOCK_SIZE;
+-              src += CHACHA_BLOCK_SIZE;
+-              dst += CHACHA_BLOCK_SIZE;
+-              state[12]++;
++      while (bytes > CHACHA_BLOCK_SIZE) {
++              unsigned int l = min(bytes, CHACHA_BLOCK_SIZE * 4U);
++
++              chacha_4block_xor_neon(state, dst, src, nrounds, l);
++              bytes -= l;
++              src += l;
++              dst += l;
++              state[12] += DIV_ROUND_UP(l, CHACHA_BLOCK_SIZE);
+       }
+       if (bytes) {
+-              memcpy(buf, src, bytes);
+-              chacha_block_xor_neon(state, buf, buf, nrounds);
+-              memcpy(dst, buf, bytes);
++              const u8 *s = src;
++              u8 *d = dst;
++
++              if (bytes != CHACHA_BLOCK_SIZE)
++                      s = d = memcpy(buf, src, bytes);
++              chacha_block_xor_neon(state, d, s, nrounds);
++              if (d != dst)
++                      memcpy(dst, buf, bytes);
+       }
+ }
+ 
+--- a/arch/arm/crypto/chacha-neon-core.S
++++ b/arch/arm/crypto/chacha-neon-core.S
+@@ -47,6 +47,7 @@
+   */
+ 
+ #include <linux/linkage.h>
++#include <asm/cache.h>
+ 
+       .text
+       .fpu            neon
+@@ -205,7 +206,7 @@ ENDPROC(hchacha_block_neon)
+ 
+       .align          5
+ ENTRY(chacha_4block_xor_neon)
+-      push            {r4-r5}
++      push            {r4, lr}
+       mov             r4, sp                  // preserve the stack pointer
+       sub             ip, sp, #0x20           // allocate a 32 byte buffer
+       bic             ip, ip, #0x1f           // aligned to 32 bytes
+@@ -229,10 +230,10 @@ ENTRY(chacha_4block_xor_neon)
+       vld1.32         {q0-q1}, [r0]
+       vld1.32         {q2-q3}, [ip]
+ 
+-      adr             r5, .Lctrinc
++      adr             lr, .Lctrinc
+       vdup.32         q15, d7[1]
+       vdup.32         q14, d7[0]
+-      vld1.32         {q4}, [r5, :128]
++      vld1.32         {q4}, [lr, :128]
+       vdup.32         q13, d6[1]
+       vdup.32         q12, d6[0]
+       vdup.32         q11, d5[1]
+@@ -455,7 +456,7 @@ ENTRY(chacha_4block_xor_neon)
+ 
+       // Re-interleave the words in the first two rows of each block (x0..7).
+       // Also add the counter values 0-3 to x12[0-3].
+-        vld1.32       {q8}, [r5, :128]        // load counter values 0-3
++        vld1.32       {q8}, [lr, :128]        // load counter values 0-3
+       vzip.32         q0, q1                  // => (0 1 0 1) (0 1 0 1)
+       vzip.32         q2, q3                  // => (2 3 2 3) (2 3 2 3)
+       vzip.32         q4, q5                  // => (4 5 4 5) (4 5 4 5)
+@@ -493,6 +494,8 @@ ENTRY(chacha_4block_xor_neon)
+ 
+       // Re-interleave the words in the last two rows of each block (x8..15).
+       vld1.32         {q8-q9}, [sp, :256]
++        mov           sp, r4          // restore original stack pointer
++        ldr           r4, [r4, #8]    // load number of bytes
+       vzip.32         q12, q13        // => (12 13 12 13) (12 13 12 13)
+       vzip.32         q14, q15        // => (14 15 14 15) (14 15 14 15)
+       vzip.32         q8, q9          // => (8 9 8 9) (8 9 8 9)
+@@ -520,41 +523,121 @@ ENTRY(chacha_4block_xor_neon)
+       // XOR the rest of the data with the keystream
+ 
+       vld1.8          {q0-q1}, [r2]!
++      subs            r4, r4, #96
+       veor            q0, q0, q8
+       veor            q1, q1, q12
++      ble             .Lle96
+       vst1.8          {q0-q1}, [r1]!
+ 
+       vld1.8          {q0-q1}, [r2]!
++      subs            r4, r4, #32
+       veor            q0, q0, q2
+       veor            q1, q1, q6
++      ble             .Lle128
+       vst1.8          {q0-q1}, [r1]!
+ 
+       vld1.8          {q0-q1}, [r2]!
++      subs            r4, r4, #32
+       veor            q0, q0, q10
+       veor            q1, q1, q14
++      ble             .Lle160
+       vst1.8          {q0-q1}, [r1]!
+ 
+       vld1.8          {q0-q1}, [r2]!
++      subs            r4, r4, #32
+       veor            q0, q0, q4
+       veor            q1, q1, q5
++      ble             .Lle192
+       vst1.8          {q0-q1}, [r1]!
+ 
+       vld1.8          {q0-q1}, [r2]!
++      subs            r4, r4, #32
+       veor            q0, q0, q9
+       veor            q1, q1, q13
++      ble             .Lle224
+       vst1.8          {q0-q1}, [r1]!
+ 
+       vld1.8          {q0-q1}, [r2]!
++      subs            r4, r4, #32
+       veor            q0, q0, q3
+       veor            q1, q1, q7
++      blt             .Llt256
++.Lout:
+       vst1.8          {q0-q1}, [r1]!
+ 
+       vld1.8          {q0-q1}, [r2]
+-        mov           sp, r4          // restore original stack pointer
+       veor            q0, q0, q11
+       veor            q1, q1, q15
+       vst1.8          {q0-q1}, [r1]
+ 
+-      pop             {r4-r5}
+-      bx              lr
++      pop             {r4, pc}
++
++.Lle192:
++      vmov            q4, q9
++      vmov            q5, q13
++
++.Lle160:
++      // nothing to do
++
++.Lfinalblock:
++      // Process the final block if processing less than 4 full blocks.
++      // Entered with 32 bytes of ChaCha cipher stream in q4-q5, and the
++      // previous 32 byte output block that still needs to be written at
++      // [r1] in q0-q1.
++      beq             .Lfullblock
++
++.Lpartialblock:
++      adr             lr, .Lpermute + 32
++      add             r2, r2, r4
++      add             lr, lr, r4
++      add             r4, r4, r1
++
++      vld1.8          {q2-q3}, [lr]
++      vld1.8          {q6-q7}, [r2]
++
++      add             r4, r4, #32
++
++      vtbl.8          d4, {q4-q5}, d4
++      vtbl.8          d5, {q4-q5}, d5
++      vtbl.8          d6, {q4-q5}, d6
++      vtbl.8          d7, {q4-q5}, d7
++
++      veor            q6, q6, q2
++      veor            q7, q7, q3
++
++      vst1.8          {q6-q7}, [r4]   // overlapping stores
++      vst1.8          {q0-q1}, [r1]
++      pop             {r4, pc}
++
++.Lfullblock:
++      vmov            q11, q4
++      vmov            q15, q5
++      b               .Lout
++.Lle96:
++      vmov            q4, q2
++      vmov            q5, q6
++      b               .Lfinalblock
++.Lle128:
++      vmov            q4, q10
++      vmov            q5, q14
++      b               .Lfinalblock
++.Lle224:
++      vmov            q4, q3
++      vmov            q5, q7
++      b               .Lfinalblock
++.Llt256:
++      vmov            q4, q11
++      vmov            q5, q15
++      b               .Lpartialblock
+ ENDPROC(chacha_4block_xor_neon)
++
++      .align          L1_CACHE_SHIFT
++.Lpermute:
++      .byte           0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
++      .byte           0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
++      .byte           0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
++      .byte           0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f
++      .byte           0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
++      .byte           0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
++      .byte           0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
++      .byte           0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0069-crypto-arm64-chacha-simplify-tail-block-handling.patch b/target/linux/generic/backport-5.4/080-wireguard-0069-crypto-arm64-chacha-simplify-tail-block-handling.patch

new file mode 100644 (file)

index 0000000..55e1624
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0069-crypto-arm64-chacha-simplify-tail-block-handling.patch
@@ -0,0 +1,324 @@
+From af8c75e27b20e01464aa6ad43ca3095534c81a8b Mon Sep 17 00:00:00 2001
+From: Ard Biesheuvel <ardb@kernel.org>
+Date: Fri, 6 Nov 2020 17:39:38 +0100
+Subject: [PATCH 069/124] crypto: arm64/chacha - simplify tail block handling
+
+commit c4fc6328d6c67690a7e6e03f43a5a976a13120ef upstream.
+
+Based on lessons learnt from optimizing the 32-bit version of this driver,
+we can simplify the arm64 version considerably, by reordering the final
+two stores when the last block is not a multiple of 64 bytes. This removes
+the need to use permutation instructions to calculate the elements that are
+clobbered by the final overlapping store, given that the store of the
+penultimate block now follows it, and that one carries the correct values
+for those elements already.
+
+While at it, simplify the overlapping loads as well, by calculating the
+address of the final overlapping load upfront, and switching to this
+address for every load that would otherwise extend past the end of the
+source buffer.
+
+There is no impact on performance, but the resulting code is substantially
+smaller and easier to follow.
+
+Cc: Eric Biggers <ebiggers@google.com>
+Cc: "Jason A . Donenfeld" <Jason@zx2c4.com>
+Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
+Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ arch/arm64/crypto/chacha-neon-core.S | 193 ++++++++++-----------------
+ 1 file changed, 69 insertions(+), 124 deletions(-)
+
+--- a/arch/arm64/crypto/chacha-neon-core.S
++++ b/arch/arm64/crypto/chacha-neon-core.S
+@@ -195,7 +195,6 @@ ENTRY(chacha_4block_xor_neon)
+       adr_l           x10, .Lpermute
+       and             x5, x4, #63
+       add             x10, x10, x5
+-      add             x11, x10, #64
+ 
+       //
+       // This function encrypts four consecutive ChaCha blocks by loading
+@@ -645,11 +644,11 @@ CPU_BE(    rev           a15, a15        )
+       zip2            v31.4s, v14.4s, v15.4s
+         eor           a15, a15, w9
+ 
+-      mov             x3, #64
++      add             x3, x2, x4
++      sub             x3, x3, #128            // start of last block
++
+       subs            x5, x4, #128
+-      add             x6, x5, x2
+-      csel            x3, x3, xzr, ge
+-      csel            x2, x2, x6, ge
++      csel            x2, x2, x3, ge
+ 
+       // interleave 64-bit words in state n, n+2
+       zip1            v0.2d, v16.2d, v18.2d
+@@ -658,13 +657,10 @@ CPU_BE(    rev           a15, a15        )
+       zip1            v8.2d, v17.2d, v19.2d
+       zip2            v12.2d, v17.2d, v19.2d
+         stp           a2, a3, [x1, #-56]
+-      ld1             {v16.16b-v19.16b}, [x2], x3
+ 
+       subs            x6, x4, #192
+-      ccmp            x3, xzr, #4, lt
+-      add             x7, x6, x2
+-      csel            x3, x3, xzr, eq
+-      csel            x2, x2, x7, eq
++      ld1             {v16.16b-v19.16b}, [x2], #64
++      csel            x2, x2, x3, ge
+ 
+       zip1            v1.2d, v20.2d, v22.2d
+       zip2            v5.2d, v20.2d, v22.2d
+@@ -672,13 +668,10 @@ CPU_BE(    rev           a15, a15        )
+       zip1            v9.2d, v21.2d, v23.2d
+       zip2            v13.2d, v21.2d, v23.2d
+         stp           a6, a7, [x1, #-40]
+-      ld1             {v20.16b-v23.16b}, [x2], x3
+ 
+       subs            x7, x4, #256
+-      ccmp            x3, xzr, #4, lt
+-      add             x8, x7, x2
+-      csel            x3, x3, xzr, eq
+-      csel            x2, x2, x8, eq
++      ld1             {v20.16b-v23.16b}, [x2], #64
++      csel            x2, x2, x3, ge
+ 
+       zip1            v2.2d, v24.2d, v26.2d
+       zip2            v6.2d, v24.2d, v26.2d
+@@ -686,12 +679,10 @@ CPU_BE(    rev           a15, a15        )
+       zip1            v10.2d, v25.2d, v27.2d
+       zip2            v14.2d, v25.2d, v27.2d
+         stp           a10, a11, [x1, #-24]
+-      ld1             {v24.16b-v27.16b}, [x2], x3
+ 
+       subs            x8, x4, #320
+-      ccmp            x3, xzr, #4, lt
+-      add             x9, x8, x2
+-      csel            x2, x2, x9, eq
++      ld1             {v24.16b-v27.16b}, [x2], #64
++      csel            x2, x2, x3, ge
+ 
+       zip1            v3.2d, v28.2d, v30.2d
+       zip2            v7.2d, v28.2d, v30.2d
+@@ -699,151 +690,105 @@ CPU_BE(          rev           a15, a15        )
+       zip1            v11.2d, v29.2d, v31.2d
+       zip2            v15.2d, v29.2d, v31.2d
+         stp           a14, a15, [x1, #-8]
++
++      tbnz            x5, #63, .Lt128
+       ld1             {v28.16b-v31.16b}, [x2]
+ 
+       // xor with corresponding input, write to output
+-      tbnz            x5, #63, 0f
+       eor             v16.16b, v16.16b, v0.16b
+       eor             v17.16b, v17.16b, v1.16b
+       eor             v18.16b, v18.16b, v2.16b
+       eor             v19.16b, v19.16b, v3.16b
+-      st1             {v16.16b-v19.16b}, [x1], #64
+-      cbz             x5, .Lout
+ 
+-      tbnz            x6, #63, 1f
++      tbnz            x6, #63, .Lt192
++
+       eor             v20.16b, v20.16b, v4.16b
+       eor             v21.16b, v21.16b, v5.16b
+       eor             v22.16b, v22.16b, v6.16b
+       eor             v23.16b, v23.16b, v7.16b
+-      st1             {v20.16b-v23.16b}, [x1], #64
+-      cbz             x6, .Lout
+ 
+-      tbnz            x7, #63, 2f
++      st1             {v16.16b-v19.16b}, [x1], #64
++      tbnz            x7, #63, .Lt256
++
+       eor             v24.16b, v24.16b, v8.16b
+       eor             v25.16b, v25.16b, v9.16b
+       eor             v26.16b, v26.16b, v10.16b
+       eor             v27.16b, v27.16b, v11.16b
+-      st1             {v24.16b-v27.16b}, [x1], #64
+-      cbz             x7, .Lout
+ 
+-      tbnz            x8, #63, 3f
++      st1             {v20.16b-v23.16b}, [x1], #64
++      tbnz            x8, #63, .Lt320
++
+       eor             v28.16b, v28.16b, v12.16b
+       eor             v29.16b, v29.16b, v13.16b
+       eor             v30.16b, v30.16b, v14.16b
+       eor             v31.16b, v31.16b, v15.16b
++
++      st1             {v24.16b-v27.16b}, [x1], #64
+       st1             {v28.16b-v31.16b}, [x1]
+ 
+ .Lout:        frame_pop
+       ret
+ 
+-      // fewer than 128 bytes of in/output
+-0:    ld1             {v8.16b}, [x10]
+-      ld1             {v9.16b}, [x11]
+-      movi            v10.16b, #16
+-      sub             x2, x1, #64
+-      add             x1, x1, x5
+-      ld1             {v16.16b-v19.16b}, [x2]
+-      tbl             v4.16b, {v0.16b-v3.16b}, v8.16b
+-      tbx             v20.16b, {v16.16b-v19.16b}, v9.16b
+-      add             v8.16b, v8.16b, v10.16b
+-      add             v9.16b, v9.16b, v10.16b
+-      tbl             v5.16b, {v0.16b-v3.16b}, v8.16b
+-      tbx             v21.16b, {v16.16b-v19.16b}, v9.16b
+-      add             v8.16b, v8.16b, v10.16b
+-      add             v9.16b, v9.16b, v10.16b
+-      tbl             v6.16b, {v0.16b-v3.16b}, v8.16b
+-      tbx             v22.16b, {v16.16b-v19.16b}, v9.16b
+-      add             v8.16b, v8.16b, v10.16b
+-      add             v9.16b, v9.16b, v10.16b
+-      tbl             v7.16b, {v0.16b-v3.16b}, v8.16b
+-      tbx             v23.16b, {v16.16b-v19.16b}, v9.16b
+-
+-      eor             v20.16b, v20.16b, v4.16b
+-      eor             v21.16b, v21.16b, v5.16b
+-      eor             v22.16b, v22.16b, v6.16b
+-      eor             v23.16b, v23.16b, v7.16b
+-      st1             {v20.16b-v23.16b}, [x1]
+-      b               .Lout
+-
+       // fewer than 192 bytes of in/output
+-1:    ld1             {v8.16b}, [x10]
+-      ld1             {v9.16b}, [x11]
+-      movi            v10.16b, #16
+-      add             x1, x1, x6
+-      tbl             v0.16b, {v4.16b-v7.16b}, v8.16b
+-      tbx             v20.16b, {v16.16b-v19.16b}, v9.16b
+-      add             v8.16b, v8.16b, v10.16b
+-      add             v9.16b, v9.16b, v10.16b
+-      tbl             v1.16b, {v4.16b-v7.16b}, v8.16b
+-      tbx             v21.16b, {v16.16b-v19.16b}, v9.16b
+-      add             v8.16b, v8.16b, v10.16b
+-      add             v9.16b, v9.16b, v10.16b
+-      tbl             v2.16b, {v4.16b-v7.16b}, v8.16b
+-      tbx             v22.16b, {v16.16b-v19.16b}, v9.16b
+-      add             v8.16b, v8.16b, v10.16b
+-      add             v9.16b, v9.16b, v10.16b
+-      tbl             v3.16b, {v4.16b-v7.16b}, v8.16b
+-      tbx             v23.16b, {v16.16b-v19.16b}, v9.16b
+-
+-      eor             v20.16b, v20.16b, v0.16b
+-      eor             v21.16b, v21.16b, v1.16b
+-      eor             v22.16b, v22.16b, v2.16b
+-      eor             v23.16b, v23.16b, v3.16b
+-      st1             {v20.16b-v23.16b}, [x1]
++.Lt192:       cbz             x5, 1f                          // exactly 128 bytes?
++      ld1             {v28.16b-v31.16b}, [x10]
++      add             x5, x5, x1
++      tbl             v28.16b, {v4.16b-v7.16b}, v28.16b
++      tbl             v29.16b, {v4.16b-v7.16b}, v29.16b
++      tbl             v30.16b, {v4.16b-v7.16b}, v30.16b
++      tbl             v31.16b, {v4.16b-v7.16b}, v31.16b
++
++0:    eor             v20.16b, v20.16b, v28.16b
++      eor             v21.16b, v21.16b, v29.16b
++      eor             v22.16b, v22.16b, v30.16b
++      eor             v23.16b, v23.16b, v31.16b
++      st1             {v20.16b-v23.16b}, [x5]         // overlapping stores
++1:    st1             {v16.16b-v19.16b}, [x1]
+       b               .Lout
+ 
++      // fewer than 128 bytes of in/output
++.Lt128:       ld1             {v28.16b-v31.16b}, [x10]
++      add             x5, x5, x1
++      sub             x1, x1, #64
++      tbl             v28.16b, {v0.16b-v3.16b}, v28.16b
++      tbl             v29.16b, {v0.16b-v3.16b}, v29.16b
++      tbl             v30.16b, {v0.16b-v3.16b}, v30.16b
++      tbl             v31.16b, {v0.16b-v3.16b}, v31.16b
++      ld1             {v16.16b-v19.16b}, [x1]         // reload first output block
++      b               0b
++
+       // fewer than 256 bytes of in/output
+-2:    ld1             {v4.16b}, [x10]
+-      ld1             {v5.16b}, [x11]
+-      movi            v6.16b, #16
+-      add             x1, x1, x7
++.Lt256:       cbz             x6, 2f                          // exactly 192 bytes?
++      ld1             {v4.16b-v7.16b}, [x10]
++      add             x6, x6, x1
+       tbl             v0.16b, {v8.16b-v11.16b}, v4.16b
+-      tbx             v24.16b, {v20.16b-v23.16b}, v5.16b
+-      add             v4.16b, v4.16b, v6.16b
+-      add             v5.16b, v5.16b, v6.16b
+-      tbl             v1.16b, {v8.16b-v11.16b}, v4.16b
+-      tbx             v25.16b, {v20.16b-v23.16b}, v5.16b
+-      add             v4.16b, v4.16b, v6.16b
+-      add             v5.16b, v5.16b, v6.16b
+-      tbl             v2.16b, {v8.16b-v11.16b}, v4.16b
+-      tbx             v26.16b, {v20.16b-v23.16b}, v5.16b
+-      add             v4.16b, v4.16b, v6.16b
+-      add             v5.16b, v5.16b, v6.16b
+-      tbl             v3.16b, {v8.16b-v11.16b}, v4.16b
+-      tbx             v27.16b, {v20.16b-v23.16b}, v5.16b
+-
+-      eor             v24.16b, v24.16b, v0.16b
+-      eor             v25.16b, v25.16b, v1.16b
+-      eor             v26.16b, v26.16b, v2.16b
+-      eor             v27.16b, v27.16b, v3.16b
+-      st1             {v24.16b-v27.16b}, [x1]
++      tbl             v1.16b, {v8.16b-v11.16b}, v5.16b
++      tbl             v2.16b, {v8.16b-v11.16b}, v6.16b
++      tbl             v3.16b, {v8.16b-v11.16b}, v7.16b
++
++      eor             v28.16b, v28.16b, v0.16b
++      eor             v29.16b, v29.16b, v1.16b
++      eor             v30.16b, v30.16b, v2.16b
++      eor             v31.16b, v31.16b, v3.16b
++      st1             {v28.16b-v31.16b}, [x6]         // overlapping stores
++2:    st1             {v20.16b-v23.16b}, [x1]
+       b               .Lout
+ 
+       // fewer than 320 bytes of in/output
+-3:    ld1             {v4.16b}, [x10]
+-      ld1             {v5.16b}, [x11]
+-      movi            v6.16b, #16
+-      add             x1, x1, x8
++.Lt320:       cbz             x7, 3f                          // exactly 256 bytes?
++      ld1             {v4.16b-v7.16b}, [x10]
++      add             x7, x7, x1
+       tbl             v0.16b, {v12.16b-v15.16b}, v4.16b
+-      tbx             v28.16b, {v24.16b-v27.16b}, v5.16b
+-      add             v4.16b, v4.16b, v6.16b
+-      add             v5.16b, v5.16b, v6.16b
+-      tbl             v1.16b, {v12.16b-v15.16b}, v4.16b
+-      tbx             v29.16b, {v24.16b-v27.16b}, v5.16b
+-      add             v4.16b, v4.16b, v6.16b
+-      add             v5.16b, v5.16b, v6.16b
+-      tbl             v2.16b, {v12.16b-v15.16b}, v4.16b
+-      tbx             v30.16b, {v24.16b-v27.16b}, v5.16b
+-      add             v4.16b, v4.16b, v6.16b
+-      add             v5.16b, v5.16b, v6.16b
+-      tbl             v3.16b, {v12.16b-v15.16b}, v4.16b
+-      tbx             v31.16b, {v24.16b-v27.16b}, v5.16b
++      tbl             v1.16b, {v12.16b-v15.16b}, v5.16b
++      tbl             v2.16b, {v12.16b-v15.16b}, v6.16b
++      tbl             v3.16b, {v12.16b-v15.16b}, v7.16b
+ 
+       eor             v28.16b, v28.16b, v0.16b
+       eor             v29.16b, v29.16b, v1.16b
+       eor             v30.16b, v30.16b, v2.16b
+       eor             v31.16b, v31.16b, v3.16b
+-      st1             {v28.16b-v31.16b}, [x1]
++      st1             {v28.16b-v31.16b}, [x7]         // overlapping stores
++3:    st1             {v24.16b-v27.16b}, [x1]
+       b               .Lout
+ ENDPROC(chacha_4block_xor_neon)
+ 
+@@ -851,7 +796,7 @@ ENDPROC(chacha_4block_xor_neon)
+       .align          L1_CACHE_SHIFT
+ .Lpermute:
+       .set            .Li, 0
+-      .rept           192
++      .rept           128
+       .byte           (.Li - 64)
+       .set            .Li, .Li + 1
+       .endr
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0070-crypto-lib-chacha20poly1305-define-empty-module-exit.patch b/target/linux/generic/backport-5.4/080-wireguard-0070-crypto-lib-chacha20poly1305-define-empty-module-exit.patch

new file mode 100644 (file)

index 0000000..e4ca889
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0070-crypto-lib-chacha20poly1305-define-empty-module-exit.patch
@@ -0,0 +1,37 @@
+From 06c613a67ec604201f424e8e763f3361264d995e Mon Sep 17 00:00:00 2001
+From: "Jason A. Donenfeld" <Jason@zx2c4.com>
+Date: Fri, 15 Jan 2021 20:30:12 +0100
+Subject: [PATCH 070/124] crypto: lib/chacha20poly1305 - define empty module
+ exit function
+
+commit ac88c322d0f2917d41d13553c69e9d7f043c8b6f upstream.
+
+With no mod_exit function, users are unable to unload the module after
+use. I'm not aware of any reason why module unloading should be
+prohibited for this one, so this commit simply adds an empty exit
+function.
+
+Reported-and-tested-by: John Donnelly <john.p.donnelly@oracle.com>
+Acked-by: Ard Biesheuvel <ardb@kernel.org>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ lib/crypto/chacha20poly1305.c | 5 +++++
+ 1 file changed, 5 insertions(+)
+
+--- a/lib/crypto/chacha20poly1305.c
++++ b/lib/crypto/chacha20poly1305.c
+@@ -364,7 +364,12 @@ static int __init mod_init(void)
+       return 0;
+ }
+ 
++static void __exit mod_exit(void)
++{
++}
++
+ module_init(mod_init);
++module_exit(mod_exit);
+ MODULE_LICENSE("GPL v2");
+ MODULE_DESCRIPTION("ChaCha20Poly1305 AEAD construction");
+ MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>");
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0071-icmp-introduce-helper-for-nat-d-source-address-in-ne.patch b/target/linux/generic/backport-5.4/080-wireguard-0071-icmp-introduce-helper-for-nat-d-source-address-in-ne.patch

new file mode 100644 (file)

index 0000000..f5ad6fe
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0071-icmp-introduce-helper-for-nat-d-source-address-in-ne.patch
@@ -0,0 +1,148 @@
+From 9793cc7357e8d70fed9cb350d2d39346328cc73b Mon Sep 17 00:00:00 2001
+From: "Jason A. Donenfeld" <Jason@zx2c4.com>
+Date: Tue, 11 Feb 2020 20:47:05 +0100
+Subject: [PATCH 071/124] icmp: introduce helper for nat'd source address in
+ network device context
+
+commit 0b41713b606694257b90d61ba7e2712d8457648b upstream.
+
+This introduces a helper function to be called only by network drivers
+that wraps calls to icmp[v6]_send in a conntrack transformation, in case
+NAT has been used. We don't want to pollute the non-driver path, though,
+so we introduce this as a helper to be called by places that actually
+make use of this, as suggested by Florian.
+
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Cc: Florian Westphal <fw@strlen.de>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ include/linux/icmpv6.h | 10 ++++++++++
+ include/net/icmp.h     |  6 ++++++
+ net/ipv4/icmp.c        | 33 +++++++++++++++++++++++++++++++++
+ net/ipv6/ip6_icmp.c    | 34 ++++++++++++++++++++++++++++++++++
+ 4 files changed, 83 insertions(+)
+
+--- a/include/linux/icmpv6.h
++++ b/include/linux/icmpv6.h
+@@ -22,12 +22,22 @@ extern int inet6_unregister_icmp_sender(
+ int ip6_err_gen_icmpv6_unreach(struct sk_buff *skb, int nhs, int type,
+                              unsigned int data_len);
+ 
++#if IS_ENABLED(CONFIG_NF_NAT)
++void icmpv6_ndo_send(struct sk_buff *skb_in, u8 type, u8 code, __u32 info);
++#else
++#define icmpv6_ndo_send icmpv6_send
++#endif
++
+ #else
+ 
+ static inline void icmpv6_send(struct sk_buff *skb,
+                              u8 type, u8 code, __u32 info)
+ {
++}
+ 
++static inline void icmpv6_ndo_send(struct sk_buff *skb,
++                                 u8 type, u8 code, __u32 info)
++{
+ }
+ #endif
+ 
+--- a/include/net/icmp.h
++++ b/include/net/icmp.h
+@@ -43,6 +43,12 @@ static inline void icmp_send(struct sk_b
+       __icmp_send(skb_in, type, code, info, &IPCB(skb_in)->opt);
+ }
+ 
++#if IS_ENABLED(CONFIG_NF_NAT)
++void icmp_ndo_send(struct sk_buff *skb_in, int type, int code, __be32 info);
++#else
++#define icmp_ndo_send icmp_send
++#endif
++
+ int icmp_rcv(struct sk_buff *skb);
+ int icmp_err(struct sk_buff *skb, u32 info);
+ int icmp_init(void);
+--- a/net/ipv4/icmp.c
++++ b/net/ipv4/icmp.c
+@@ -750,6 +750,39 @@ out:;
+ }
+ EXPORT_SYMBOL(__icmp_send);
+ 
++#if IS_ENABLED(CONFIG_NF_NAT)
++#include <net/netfilter/nf_conntrack.h>
++void icmp_ndo_send(struct sk_buff *skb_in, int type, int code, __be32 info)
++{
++      struct sk_buff *cloned_skb = NULL;
++      enum ip_conntrack_info ctinfo;
++      struct nf_conn *ct;
++      __be32 orig_ip;
++
++      ct = nf_ct_get(skb_in, &ctinfo);
++      if (!ct || !(ct->status & IPS_SRC_NAT)) {
++              icmp_send(skb_in, type, code, info);
++              return;
++      }
++
++      if (skb_shared(skb_in))
++              skb_in = cloned_skb = skb_clone(skb_in, GFP_ATOMIC);
++
++      if (unlikely(!skb_in || skb_network_header(skb_in) < skb_in->head ||
++          (skb_network_header(skb_in) + sizeof(struct iphdr)) >
++          skb_tail_pointer(skb_in) || skb_ensure_writable(skb_in,
++          skb_network_offset(skb_in) + sizeof(struct iphdr))))
++              goto out;
++
++      orig_ip = ip_hdr(skb_in)->saddr;
++      ip_hdr(skb_in)->saddr = ct->tuplehash[0].tuple.src.u3.ip;
++      icmp_send(skb_in, type, code, info);
++      ip_hdr(skb_in)->saddr = orig_ip;
++out:
++      consume_skb(cloned_skb);
++}
++EXPORT_SYMBOL(icmp_ndo_send);
++#endif
+ 
+ static void icmp_socket_deliver(struct sk_buff *skb, u32 info)
+ {
+--- a/net/ipv6/ip6_icmp.c
++++ b/net/ipv6/ip6_icmp.c
+@@ -45,4 +45,38 @@ out:
+       rcu_read_unlock();
+ }
+ EXPORT_SYMBOL(icmpv6_send);
++
++#if IS_ENABLED(CONFIG_NF_NAT)
++#include <net/netfilter/nf_conntrack.h>
++void icmpv6_ndo_send(struct sk_buff *skb_in, u8 type, u8 code, __u32 info)
++{
++      struct sk_buff *cloned_skb = NULL;
++      enum ip_conntrack_info ctinfo;
++      struct in6_addr orig_ip;
++      struct nf_conn *ct;
++
++      ct = nf_ct_get(skb_in, &ctinfo);
++      if (!ct || !(ct->status & IPS_SRC_NAT)) {
++              icmpv6_send(skb_in, type, code, info);
++              return;
++      }
++
++      if (skb_shared(skb_in))
++              skb_in = cloned_skb = skb_clone(skb_in, GFP_ATOMIC);
++
++      if (unlikely(!skb_in || skb_network_header(skb_in) < skb_in->head ||
++          (skb_network_header(skb_in) + sizeof(struct ipv6hdr)) >
++          skb_tail_pointer(skb_in) || skb_ensure_writable(skb_in,
++          skb_network_offset(skb_in) + sizeof(struct ipv6hdr))))
++              goto out;
++
++      orig_ip = ipv6_hdr(skb_in)->saddr;
++      ipv6_hdr(skb_in)->saddr = ct->tuplehash[0].tuple.src.u3.in6;
++      icmpv6_send(skb_in, type, code, info);
++      ipv6_hdr(skb_in)->saddr = orig_ip;
++out:
++      consume_skb(cloned_skb);
++}
++EXPORT_SYMBOL(icmpv6_ndo_send);
++#endif
+ #endif
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0072-net-icmp-pass-zeroed-opts-from-icmp-v6-_ndo_send-bef.patch b/target/linux/generic/backport-5.4/080-wireguard-0072-net-icmp-pass-zeroed-opts-from-icmp-v6-_ndo_send-bef.patch

new file mode 100644 (file)

index 0000000..fcca169
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0072-net-icmp-pass-zeroed-opts-from-icmp-v6-_ndo_send-bef.patch
@@ -0,0 +1,299 @@
+From 4a25324891a32d080589a6e3a4dec2be2d9e3d60 Mon Sep 17 00:00:00 2001
+From: "Jason A. Donenfeld" <Jason@zx2c4.com>
+Date: Tue, 23 Feb 2021 14:18:58 +0100
+Subject: [PATCH 072/124] net: icmp: pass zeroed opts from icmp{,v6}_ndo_send
+ before sending
+
+commit ee576c47db60432c37e54b1e2b43a8ca6d3a8dca upstream.
+
+The icmp{,v6}_send functions make all sorts of use of skb->cb, casting
+it with IPCB or IP6CB, assuming the skb to have come directly from the
+inet layer. But when the packet comes from the ndo layer, especially
+when forwarded, there's no telling what might be in skb->cb at that
+point. As a result, the icmp sending code risks reading bogus memory
+contents, which can result in nasty stack overflows such as this one
+reported by a user:
+
+    panic+0x108/0x2ea
+    __stack_chk_fail+0x14/0x20
+    __icmp_send+0x5bd/0x5c0
+    icmp_ndo_send+0x148/0x160
+
+In icmp_send, skb->cb is cast with IPCB and an ip_options struct is read
+from it. The optlen parameter there is of particular note, as it can
+induce writes beyond bounds. There are quite a few ways that can happen
+in __ip_options_echo. For example:
+
+    // sptr/skb are attacker-controlled skb bytes
+    sptr = skb_network_header(skb);
+    // dptr/dopt points to stack memory allocated by __icmp_send
+    dptr = dopt->__data;
+    // sopt is the corrupt skb->cb in question
+    if (sopt->rr) {
+        optlen  = sptr[sopt->rr+1]; // corrupt skb->cb + skb->data
+        soffset = sptr[sopt->rr+2]; // corrupt skb->cb + skb->data
+       // this now writes potentially attacker-controlled data,
+       // overflowing the stack:
+        memcpy(dptr, sptr+sopt->rr, optlen);
+    }
+
+In the icmpv6_send case, the story is similar, but not as dire, as only
+IP6CB(skb)->iif and IP6CB(skb)->dsthao are used. The dsthao case is
+worse than the iif case, but it is passed to ipv6_find_tlv, which does
+a bit of bounds checking on the value.
+
+This is easy to simulate by doing a `memset(skb->cb, 0x41,
+sizeof(skb->cb));` before calling icmp{,v6}_ndo_send, and it's only by
+good fortune and the rarity of icmp sending from that context that we've
+avoided reports like this until now. For example, in KASAN:
+
+    BUG: KASAN: stack-out-of-bounds in __ip_options_echo+0xa0e/0x12b0
+    Write of size 38 at addr ffff888006f1f80e by task ping/89
+    CPU: 2 PID: 89 Comm: ping Not tainted 5.10.0-rc7-debug+ #5
+    Call Trace:
+     dump_stack+0x9a/0xcc
+     print_address_description.constprop.0+0x1a/0x160
+     __kasan_report.cold+0x20/0x38
+     kasan_report+0x32/0x40
+     check_memory_region+0x145/0x1a0
+     memcpy+0x39/0x60
+     __ip_options_echo+0xa0e/0x12b0
+     __icmp_send+0x744/0x1700
+
+Actually, out of the 4 drivers that do this, only gtp zeroed the cb for
+the v4 case, while the rest did not. So this commit actually removes the
+gtp-specific zeroing, while putting the code where it belongs in the
+shared infrastructure of icmp{,v6}_ndo_send.
+
+This commit fixes the issue by passing an empty IPCB or IP6CB along to
+the functions that actually do the work. For the icmp_send, this was
+already trivial, thanks to __icmp_send providing the plumbing function.
+For icmpv6_send, this required a tiny bit of refactoring to make it
+behave like the v4 case, after which it was straightforward.
+
+Fixes: a2b78e9b2cac ("sunvnet: generate ICMP PTMUD messages for smaller port MTUs")
+Reported-by: SinYu <liuxyon@gmail.com>
+Reviewed-by: Willem de Bruijn <willemb@google.com>
+Link: https://lore.kernel.org/netdev/CAF=yD-LOF116aHub6RMe8vB8ZpnrrnoTdqhobEx+bvoA8AsP0w@mail.gmail.com/T/
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Link: https://lore.kernel.org/r/20210223131858.72082-1-Jason@zx2c4.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+[Jason: the gtp part didn't apply because it doesn't use icmp_ndo_send on 5.4]
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ include/linux/icmpv6.h | 17 ++++++++++++++---
+ include/linux/ipv6.h   |  1 -
+ include/net/icmp.h     |  6 +++++-
+ net/ipv4/icmp.c        |  5 +++--
+ net/ipv6/icmp.c        | 16 ++++++++--------
+ net/ipv6/ip6_icmp.c    | 12 +++++++-----
+ 6 files changed, 37 insertions(+), 20 deletions(-)
+
+--- a/include/linux/icmpv6.h
++++ b/include/linux/icmpv6.h
+@@ -3,6 +3,7 @@
+ #define _LINUX_ICMPV6_H
+ 
+ #include <linux/skbuff.h>
++#include <linux/ipv6.h>
+ #include <uapi/linux/icmpv6.h>
+ 
+ static inline struct icmp6hdr *icmp6_hdr(const struct sk_buff *skb)
+@@ -13,10 +14,16 @@ static inline struct icmp6hdr *icmp6_hdr
+ #include <linux/netdevice.h>
+ 
+ #if IS_ENABLED(CONFIG_IPV6)
+-extern void icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info);
++extern void __icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
++                        const struct inet6_skb_parm *parm);
+ 
++static inline void icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info)
++{
++      __icmpv6_send(skb, type, code, info, IP6CB(skb));
++}
+ typedef void ip6_icmp_send_t(struct sk_buff *skb, u8 type, u8 code, __u32 info,
+-                           const struct in6_addr *force_saddr);
++                           const struct in6_addr *force_saddr,
++                           const struct inet6_skb_parm *parm);
+ extern int inet6_register_icmp_sender(ip6_icmp_send_t *fn);
+ extern int inet6_unregister_icmp_sender(ip6_icmp_send_t *fn);
+ int ip6_err_gen_icmpv6_unreach(struct sk_buff *skb, int nhs, int type,
+@@ -25,7 +32,11 @@ int ip6_err_gen_icmpv6_unreach(struct sk
+ #if IS_ENABLED(CONFIG_NF_NAT)
+ void icmpv6_ndo_send(struct sk_buff *skb_in, u8 type, u8 code, __u32 info);
+ #else
+-#define icmpv6_ndo_send icmpv6_send
++static inline void icmpv6_ndo_send(struct sk_buff *skb_in, u8 type, u8 code, __u32 info)
++{
++      struct inet6_skb_parm parm = { 0 };
++      __icmpv6_send(skb_in, type, code, info, &parm);
++}
+ #endif
+ 
+ #else
+--- a/include/linux/ipv6.h
++++ b/include/linux/ipv6.h
+@@ -83,7 +83,6 @@ struct ipv6_params {
+       __s32 autoconf;
+ };
+ extern struct ipv6_params ipv6_defaults;
+-#include <linux/icmpv6.h>
+ #include <linux/tcp.h>
+ #include <linux/udp.h>
+ 
+--- a/include/net/icmp.h
++++ b/include/net/icmp.h
+@@ -46,7 +46,11 @@ static inline void icmp_send(struct sk_b
+ #if IS_ENABLED(CONFIG_NF_NAT)
+ void icmp_ndo_send(struct sk_buff *skb_in, int type, int code, __be32 info);
+ #else
+-#define icmp_ndo_send icmp_send
++static inline void icmp_ndo_send(struct sk_buff *skb_in, int type, int code, __be32 info)
++{
++      struct ip_options opts = { 0 };
++      __icmp_send(skb_in, type, code, info, &opts);
++}
+ #endif
+ 
+ int icmp_rcv(struct sk_buff *skb);
+--- a/net/ipv4/icmp.c
++++ b/net/ipv4/icmp.c
+@@ -755,13 +755,14 @@ EXPORT_SYMBOL(__icmp_send);
+ void icmp_ndo_send(struct sk_buff *skb_in, int type, int code, __be32 info)
+ {
+       struct sk_buff *cloned_skb = NULL;
++      struct ip_options opts = { 0 };
+       enum ip_conntrack_info ctinfo;
+       struct nf_conn *ct;
+       __be32 orig_ip;
+ 
+       ct = nf_ct_get(skb_in, &ctinfo);
+       if (!ct || !(ct->status & IPS_SRC_NAT)) {
+-              icmp_send(skb_in, type, code, info);
++              __icmp_send(skb_in, type, code, info, &opts);
+               return;
+       }
+ 
+@@ -776,7 +777,7 @@ void icmp_ndo_send(struct sk_buff *skb_i
+ 
+       orig_ip = ip_hdr(skb_in)->saddr;
+       ip_hdr(skb_in)->saddr = ct->tuplehash[0].tuple.src.u3.ip;
+-      icmp_send(skb_in, type, code, info);
++      __icmp_send(skb_in, type, code, info, &opts);
+       ip_hdr(skb_in)->saddr = orig_ip;
+ out:
+       consume_skb(cloned_skb);
+--- a/net/ipv6/icmp.c
++++ b/net/ipv6/icmp.c
+@@ -312,10 +312,9 @@ static int icmpv6_getfrag(void *from, ch
+ }
+ 
+ #if IS_ENABLED(CONFIG_IPV6_MIP6)
+-static void mip6_addr_swap(struct sk_buff *skb)
++static void mip6_addr_swap(struct sk_buff *skb, const struct inet6_skb_parm *opt)
+ {
+       struct ipv6hdr *iph = ipv6_hdr(skb);
+-      struct inet6_skb_parm *opt = IP6CB(skb);
+       struct ipv6_destopt_hao *hao;
+       struct in6_addr tmp;
+       int off;
+@@ -332,7 +331,7 @@ static void mip6_addr_swap(struct sk_buf
+       }
+ }
+ #else
+-static inline void mip6_addr_swap(struct sk_buff *skb) {}
++static inline void mip6_addr_swap(struct sk_buff *skb, const struct inet6_skb_parm *opt) {}
+ #endif
+ 
+ static struct dst_entry *icmpv6_route_lookup(struct net *net,
+@@ -427,7 +426,8 @@ static int icmp6_iif(const struct sk_buf
+  *    Send an ICMP message in response to a packet in error
+  */
+ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
+-                     const struct in6_addr *force_saddr)
++                     const struct in6_addr *force_saddr,
++                     const struct inet6_skb_parm *parm)
+ {
+       struct inet6_dev *idev = NULL;
+       struct ipv6hdr *hdr = ipv6_hdr(skb);
+@@ -520,7 +520,7 @@ static void icmp6_send(struct sk_buff *s
+       if (!(skb->dev->flags & IFF_LOOPBACK) && !icmpv6_global_allow(net, type))
+               goto out_bh_enable;
+ 
+-      mip6_addr_swap(skb);
++      mip6_addr_swap(skb, parm);
+ 
+       memset(&fl6, 0, sizeof(fl6));
+       fl6.flowi6_proto = IPPROTO_ICMPV6;
+@@ -605,7 +605,7 @@ out_bh_enable:
+  */
+ void icmpv6_param_prob(struct sk_buff *skb, u8 code, int pos)
+ {
+-      icmp6_send(skb, ICMPV6_PARAMPROB, code, pos, NULL);
++      icmp6_send(skb, ICMPV6_PARAMPROB, code, pos, NULL, IP6CB(skb));
+       kfree_skb(skb);
+ }
+ 
+@@ -662,10 +662,10 @@ int ip6_err_gen_icmpv6_unreach(struct sk
+       }
+       if (type == ICMP_TIME_EXCEEDED)
+               icmp6_send(skb2, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
+-                         info, &temp_saddr);
++                         info, &temp_saddr, IP6CB(skb2));
+       else
+               icmp6_send(skb2, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH,
+-                         info, &temp_saddr);
++                         info, &temp_saddr, IP6CB(skb2));
+       if (rt)
+               ip6_rt_put(rt);
+ 
+--- a/net/ipv6/ip6_icmp.c
++++ b/net/ipv6/ip6_icmp.c
+@@ -31,7 +31,8 @@ int inet6_unregister_icmp_sender(ip6_icm
+ }
+ EXPORT_SYMBOL(inet6_unregister_icmp_sender);
+ 
+-void icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info)
++void __icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
++                 const struct inet6_skb_parm *parm)
+ {
+       ip6_icmp_send_t *send;
+ 
+@@ -40,16 +41,17 @@ void icmpv6_send(struct sk_buff *skb, u8
+ 
+       if (!send)
+               goto out;
+-      send(skb, type, code, info, NULL);
++      send(skb, type, code, info, NULL, parm);
+ out:
+       rcu_read_unlock();
+ }
+-EXPORT_SYMBOL(icmpv6_send);
++EXPORT_SYMBOL(__icmpv6_send);
+ 
+ #if IS_ENABLED(CONFIG_NF_NAT)
+ #include <net/netfilter/nf_conntrack.h>
+ void icmpv6_ndo_send(struct sk_buff *skb_in, u8 type, u8 code, __u32 info)
+ {
++      struct inet6_skb_parm parm = { 0 };
+       struct sk_buff *cloned_skb = NULL;
+       enum ip_conntrack_info ctinfo;
+       struct in6_addr orig_ip;
+@@ -57,7 +59,7 @@ void icmpv6_ndo_send(struct sk_buff *skb
+ 
+       ct = nf_ct_get(skb_in, &ctinfo);
+       if (!ct || !(ct->status & IPS_SRC_NAT)) {
+-              icmpv6_send(skb_in, type, code, info);
++              __icmpv6_send(skb_in, type, code, info, &parm);
+               return;
+       }
+ 
+@@ -72,7 +74,7 @@ void icmpv6_ndo_send(struct sk_buff *skb
+ 
+       orig_ip = ipv6_hdr(skb_in)->saddr;
+       ipv6_hdr(skb_in)->saddr = ct->tuplehash[0].tuple.src.u3.in6;
+-      icmpv6_send(skb_in, type, code, info);
++      __icmpv6_send(skb_in, type, code, info, &parm);
+       ipv6_hdr(skb_in)->saddr = orig_ip;
+ out:
+       consume_skb(cloned_skb);
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0073-net-WireGuard-secure-network-tunnel.patch b/target/linux/generic/backport-5.4/080-wireguard-0073-net-WireGuard-secure-network-tunnel.patch

new file mode 100644 (file)

index 0000000..8651c73
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0073-net-WireGuard-secure-network-tunnel.patch
@@ -0,0 +1,8071 @@
+From 3e5c0a5efec6e13aa22c59b7170837972e23df49 Mon Sep 17 00:00:00 2001
+From: "Jason A. Donenfeld" <Jason@zx2c4.com>
+Date: Mon, 9 Dec 2019 00:27:34 +0100
+Subject: [PATCH 073/124] net: WireGuard secure network tunnel
+
+commit e7096c131e5161fa3b8e52a650d7719d2857adfd upstream.
+
+WireGuard is a layer 3 secure networking tunnel made specifically for
+the kernel, that aims to be much simpler and easier to audit than IPsec.
+Extensive documentation and description of the protocol and
+considerations, along with formal proofs of the cryptography, are
+available at:
+
+  * https://www.wireguard.com/
+  * https://www.wireguard.com/papers/wireguard.pdf
+
+This commit implements WireGuard as a simple network device driver,
+accessible in the usual RTNL way used by virtual network drivers. It
+makes use of the udp_tunnel APIs, GRO, GSO, NAPI, and the usual set of
+networking subsystem APIs. It has a somewhat novel multicore queueing
+system designed for maximum throughput and minimal latency of encryption
+operations, but it is implemented modestly using workqueues and NAPI.
+Configuration is done via generic Netlink, and following a review from
+the Netlink maintainer a year ago, several high profile userspace tools
+have already implemented the API.
+
+This commit also comes with several different tests, both in-kernel
+tests and out-of-kernel tests based on network namespaces, taking advantage
+of the fact that sockets used by WireGuard intentionally stay in the
+namespace the WireGuard interface was originally created, exactly like
+the semantics of userspace tun devices. See wireguard.com/netns/ for
+pictures and examples.
+
+The source code is fairly short, but rather than combining everything
+into a single file, WireGuard is developed as cleanly separable files,
+making auditing and comprehension easier. Things are laid out as
+follows:
+
+  * noise.[ch], cookie.[ch], messages.h: These implement the bulk of the
+    cryptographic aspects of the protocol, and are mostly data-only in
+    nature, taking in buffers of bytes and spitting out buffers of
+    bytes. They also handle reference counting for their various shared
+    pieces of data, like keys and key lists.
+
+  * ratelimiter.[ch]: Used as an integral part of cookie.[ch] for
+    ratelimiting certain types of cryptographic operations in accordance
+    with particular WireGuard semantics.
+
+  * allowedips.[ch], peerlookup.[ch]: The main lookup structures of
+    WireGuard, the former being trie-like with particular semantics, an
+    integral part of the design of the protocol, and the latter just
+    being nice helper functions around the various hashtables we use.
+
+  * device.[ch]: Implementation of functions for the netdevice and for
+    rtnl, responsible for maintaining the life of a given interface and
+    wiring it up to the rest of WireGuard.
+
+  * peer.[ch]: Each interface has a list of peers, with helper functions
+    available here for creation, destruction, and reference counting.
+
+  * socket.[ch]: Implementation of functions related to udp_socket and
+    the general set of kernel socket APIs, for sending and receiving
+    ciphertext UDP packets, and taking care of WireGuard-specific sticky
+    socket routing semantics for the automatic roaming.
+
+  * netlink.[ch]: Userspace API entry point for configuring WireGuard
+    peers and devices. The API has been implemented by several userspace
+    tools and network management utilities, and the WireGuard project
+    distributes the basic wg(8) tool.
+
+  * queueing.[ch]: Shared function on the rx and tx path for handling
+    the various queues used in the multicore algorithms.
+
+  * send.c: Handles encrypting outgoing packets in parallel on
+    multiple cores, before sending them in order on a single core, via
+    workqueues and ring buffers. Also handles sending handshake and cookie
+    messages as part of the protocol, in parallel.
+
+  * receive.c: Handles decrypting incoming packets in parallel on
+    multiple cores, before passing them off in order to be ingested via
+    the rest of the networking subsystem with GRO via the typical NAPI
+    poll function. Also handles receiving handshake and cookie messages
+    as part of the protocol, in parallel.
+
+  * timers.[ch]: Uses the timer wheel to implement protocol-specific
+    event timeouts, and gives a set of very simple event-driven entry
+    point functions for callers.
+
+  * main.c, version.h: Initialization and deinitialization of the module.
+
+  * selftest/*.h: Runtime unit tests for some of the most security
+    sensitive functions.
+
+  * tools/testing/selftests/wireguard/netns.sh: Aforementioned testing
+    script using network namespaces.
+
+This commit aims to be as self-contained as possible, implementing
+WireGuard as a standalone module not needing much special handling or
+coordination from the network subsystem. I expect for future
+optimizations to the network stack to positively improve WireGuard, and
+vice-versa, but for the time being, this exists as intentionally
+standalone.
+
+We introduce a menu option for CONFIG_WIREGUARD, as well as providing a
+verbose debug log and self-tests via CONFIG_WIREGUARD_DEBUG.
+
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Cc: David Miller <davem@davemloft.net>
+Cc: Greg KH <gregkh@linuxfoundation.org>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Herbert Xu <herbert@gondor.apana.org.au>
+Cc: linux-crypto@vger.kernel.org
+Cc: linux-kernel@vger.kernel.org
+Cc: netdev@vger.kernel.org
+Signed-off-by: David S. Miller <davem@davemloft.net>
+[Jason: ported to 5.4 by doing the following:
+ - wg_get_device_start uses genl_family_attrbuf
+ - trivial skb_redirect_reset change from 2c64605b590e is folded in
+ - skb_list_walk_safe was already backported prior]
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ MAINTAINERS                                  |   8 +
+ drivers/net/Kconfig                          |  41 +
+ drivers/net/Makefile                         |   1 +
+ drivers/net/wireguard/Makefile               |  18 +
+ drivers/net/wireguard/allowedips.c           | 381 +++++++++
+ drivers/net/wireguard/allowedips.h           |  59 ++
+ drivers/net/wireguard/cookie.c               | 236 ++++++
+ drivers/net/wireguard/cookie.h               |  59 ++
+ drivers/net/wireguard/device.c               | 458 ++++++++++
+ drivers/net/wireguard/device.h               |  65 ++
+ drivers/net/wireguard/main.c                 |  64 ++
+ drivers/net/wireguard/messages.h             | 128 +++
+ drivers/net/wireguard/netlink.c              | 648 +++++++++++++++
+ drivers/net/wireguard/netlink.h              |  12 +
+ drivers/net/wireguard/noise.c                | 828 +++++++++++++++++++
+ drivers/net/wireguard/noise.h                | 137 +++
+ drivers/net/wireguard/peer.c                 | 240 ++++++
+ drivers/net/wireguard/peer.h                 |  83 ++
+ drivers/net/wireguard/peerlookup.c           | 221 +++++
+ drivers/net/wireguard/peerlookup.h           |  64 ++
+ drivers/net/wireguard/queueing.c             |  53 ++
+ drivers/net/wireguard/queueing.h             | 197 +++++
+ drivers/net/wireguard/ratelimiter.c          | 223 +++++
+ drivers/net/wireguard/ratelimiter.h          |  19 +
+ drivers/net/wireguard/receive.c              | 595 +++++++++++++
+ drivers/net/wireguard/selftest/allowedips.c  | 683 +++++++++++++++
+ drivers/net/wireguard/selftest/counter.c     | 104 +++
+ drivers/net/wireguard/selftest/ratelimiter.c | 226 +++++
+ drivers/net/wireguard/send.c                 | 413 +++++++++
+ drivers/net/wireguard/socket.c               | 437 ++++++++++
+ drivers/net/wireguard/socket.h               |  44 +
+ drivers/net/wireguard/timers.c               | 243 ++++++
+ drivers/net/wireguard/timers.h               |  31 +
+ drivers/net/wireguard/version.h              |   1 +
+ include/uapi/linux/wireguard.h               | 196 +++++
+ tools/testing/selftests/wireguard/netns.sh   | 537 ++++++++++++
+ 36 files changed, 7753 insertions(+)
+ create mode 100644 drivers/net/wireguard/Makefile
+ create mode 100644 drivers/net/wireguard/allowedips.c
+ create mode 100644 drivers/net/wireguard/allowedips.h
+ create mode 100644 drivers/net/wireguard/cookie.c
+ create mode 100644 drivers/net/wireguard/cookie.h
+ create mode 100644 drivers/net/wireguard/device.c
+ create mode 100644 drivers/net/wireguard/device.h
+ create mode 100644 drivers/net/wireguard/main.c
+ create mode 100644 drivers/net/wireguard/messages.h
+ create mode 100644 drivers/net/wireguard/netlink.c
+ create mode 100644 drivers/net/wireguard/netlink.h
+ create mode 100644 drivers/net/wireguard/noise.c
+ create mode 100644 drivers/net/wireguard/noise.h
+ create mode 100644 drivers/net/wireguard/peer.c
+ create mode 100644 drivers/net/wireguard/peer.h
+ create mode 100644 drivers/net/wireguard/peerlookup.c
+ create mode 100644 drivers/net/wireguard/peerlookup.h
+ create mode 100644 drivers/net/wireguard/queueing.c
+ create mode 100644 drivers/net/wireguard/queueing.h
+ create mode 100644 drivers/net/wireguard/ratelimiter.c
+ create mode 100644 drivers/net/wireguard/ratelimiter.h
+ create mode 100644 drivers/net/wireguard/receive.c
+ create mode 100644 drivers/net/wireguard/selftest/allowedips.c
+ create mode 100644 drivers/net/wireguard/selftest/counter.c
+ create mode 100644 drivers/net/wireguard/selftest/ratelimiter.c
+ create mode 100644 drivers/net/wireguard/send.c
+ create mode 100644 drivers/net/wireguard/socket.c
+ create mode 100644 drivers/net/wireguard/socket.h
+ create mode 100644 drivers/net/wireguard/timers.c
+ create mode 100644 drivers/net/wireguard/timers.h
+ create mode 100644 drivers/net/wireguard/version.h
+ create mode 100644 include/uapi/linux/wireguard.h
+ create mode 100755 tools/testing/selftests/wireguard/netns.sh
+
+--- a/MAINTAINERS
++++ b/MAINTAINERS
+@@ -17584,6 +17584,14 @@ L:    linux-gpio@vger.kernel.org
+ S:    Maintained
+ F:    drivers/gpio/gpio-ws16c48.c
+ 
++WIREGUARD SECURE NETWORK TUNNEL
++M:    Jason A. Donenfeld <Jason@zx2c4.com>
++S:    Maintained
++F:    drivers/net/wireguard/
++F:    tools/testing/selftests/wireguard/
++L:    wireguard@lists.zx2c4.com
++L:    netdev@vger.kernel.org
++
+ WISTRON LAPTOP BUTTON DRIVER
+ M:    Miloslav Trmac <mitr@volny.cz>
+ S:    Maintained
+--- a/drivers/net/Kconfig
++++ b/drivers/net/Kconfig
+@@ -71,6 +71,47 @@ config DUMMY
+         To compile this driver as a module, choose M here: the module
+         will be called dummy.
+ 
++config WIREGUARD
++      tristate "WireGuard secure network tunnel"
++      depends on NET && INET
++      depends on IPV6 || !IPV6
++      select NET_UDP_TUNNEL
++      select DST_CACHE
++      select CRYPTO
++      select CRYPTO_LIB_CURVE25519
++      select CRYPTO_LIB_CHACHA20POLY1305
++      select CRYPTO_LIB_BLAKE2S
++      select CRYPTO_CHACHA20_X86_64 if X86 && 64BIT
++      select CRYPTO_POLY1305_X86_64 if X86 && 64BIT
++      select CRYPTO_BLAKE2S_X86 if X86 && 64BIT
++      select CRYPTO_CURVE25519_X86 if X86 && 64BIT
++      select CRYPTO_CHACHA20_NEON if (ARM || ARM64) && KERNEL_MODE_NEON
++      select CRYPTO_POLY1305_NEON if ARM64 && KERNEL_MODE_NEON
++      select CRYPTO_POLY1305_ARM if ARM
++      select CRYPTO_CURVE25519_NEON if ARM && KERNEL_MODE_NEON
++      select CRYPTO_CHACHA_MIPS if CPU_MIPS32_R2
++      select CRYPTO_POLY1305_MIPS if CPU_MIPS32 || (CPU_MIPS64 && 64BIT)
++      help
++        WireGuard is a secure, fast, and easy to use replacement for IPSec
++        that uses modern cryptography and clever networking tricks. It's
++        designed to be fairly general purpose and abstract enough to fit most
++        use cases, while at the same time remaining extremely simple to
++        configure. See www.wireguard.com for more info.
++
++        It's safe to say Y or M here, as the driver is very lightweight and
++        is only in use when an administrator chooses to add an interface.
++
++config WIREGUARD_DEBUG
++      bool "Debugging checks and verbose messages"
++      depends on WIREGUARD
++      help
++        This will write log messages for handshake and other events
++        that occur for a WireGuard interface. It will also perform some
++        extra validation checks and unit tests at various points. This is
++        only useful for debugging.
++
++        Say N here unless you know what you're doing.
++
+ config EQUALIZER
+       tristate "EQL (serial line load balancing) support"
+       ---help---
+--- a/drivers/net/Makefile
++++ b/drivers/net/Makefile
+@@ -10,6 +10,7 @@ obj-$(CONFIG_BONDING) += bonding/
+ obj-$(CONFIG_IPVLAN) += ipvlan/
+ obj-$(CONFIG_IPVTAP) += ipvlan/
+ obj-$(CONFIG_DUMMY) += dummy.o
++obj-$(CONFIG_WIREGUARD) += wireguard/
+ obj-$(CONFIG_EQUALIZER) += eql.o
+ obj-$(CONFIG_IFB) += ifb.o
+ obj-$(CONFIG_MACSEC) += macsec.o
+--- /dev/null
++++ b/drivers/net/wireguard/Makefile
+@@ -0,0 +1,18 @@
++ccflags-y := -O3
++ccflags-y += -D'pr_fmt(fmt)=KBUILD_MODNAME ": " fmt'
++ccflags-$(CONFIG_WIREGUARD_DEBUG) += -DDEBUG
++wireguard-y := main.o
++wireguard-y += noise.o
++wireguard-y += device.o
++wireguard-y += peer.o
++wireguard-y += timers.o
++wireguard-y += queueing.o
++wireguard-y += send.o
++wireguard-y += receive.o
++wireguard-y += socket.o
++wireguard-y += peerlookup.o
++wireguard-y += allowedips.o
++wireguard-y += ratelimiter.o
++wireguard-y += cookie.o
++wireguard-y += netlink.o
++obj-$(CONFIG_WIREGUARD) := wireguard.o
+--- /dev/null
++++ b/drivers/net/wireguard/allowedips.c
+@@ -0,0 +1,381 @@
++// SPDX-License-Identifier: GPL-2.0
++/*
++ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
++ */
++
++#include "allowedips.h"
++#include "peer.h"
++
++static void swap_endian(u8 *dst, const u8 *src, u8 bits)
++{
++      if (bits == 32) {
++              *(u32 *)dst = be32_to_cpu(*(const __be32 *)src);
++      } else if (bits == 128) {
++              ((u64 *)dst)[0] = be64_to_cpu(((const __be64 *)src)[0]);
++              ((u64 *)dst)[1] = be64_to_cpu(((const __be64 *)src)[1]);
++      }
++}
++
++static void copy_and_assign_cidr(struct allowedips_node *node, const u8 *src,
++                               u8 cidr, u8 bits)
++{
++      node->cidr = cidr;
++      node->bit_at_a = cidr / 8U;
++#ifdef __LITTLE_ENDIAN
++      node->bit_at_a ^= (bits / 8U - 1U) % 8U;
++#endif
++      node->bit_at_b = 7U - (cidr % 8U);
++      node->bitlen = bits;
++      memcpy(node->bits, src, bits / 8U);
++}
++#define CHOOSE_NODE(parent, key) \
++      parent->bit[(key[parent->bit_at_a] >> parent->bit_at_b) & 1]
++
++static void node_free_rcu(struct rcu_head *rcu)
++{
++      kfree(container_of(rcu, struct allowedips_node, rcu));
++}
++
++static void push_rcu(struct allowedips_node **stack,
++                   struct allowedips_node __rcu *p, unsigned int *len)
++{
++      if (rcu_access_pointer(p)) {
++              WARN_ON(IS_ENABLED(DEBUG) && *len >= 128);
++              stack[(*len)++] = rcu_dereference_raw(p);
++      }
++}
++
++static void root_free_rcu(struct rcu_head *rcu)
++{
++      struct allowedips_node *node, *stack[128] = {
++              container_of(rcu, struct allowedips_node, rcu) };
++      unsigned int len = 1;
++
++      while (len > 0 && (node = stack[--len])) {
++              push_rcu(stack, node->bit[0], &len);
++              push_rcu(stack, node->bit[1], &len);
++              kfree(node);
++      }
++}
++
++static void root_remove_peer_lists(struct allowedips_node *root)
++{
++      struct allowedips_node *node, *stack[128] = { root };
++      unsigned int len = 1;
++
++      while (len > 0 && (node = stack[--len])) {
++              push_rcu(stack, node->bit[0], &len);
++              push_rcu(stack, node->bit[1], &len);
++              if (rcu_access_pointer(node->peer))
++                      list_del(&node->peer_list);
++      }
++}
++
++static void walk_remove_by_peer(struct allowedips_node __rcu **top,
++                              struct wg_peer *peer, struct mutex *lock)
++{
++#define REF(p) rcu_access_pointer(p)
++#define DEREF(p) rcu_dereference_protected(*(p), lockdep_is_held(lock))
++#define PUSH(p) ({                                                             \
++              WARN_ON(IS_ENABLED(DEBUG) && len >= 128);                      \
++              stack[len++] = p;                                              \
++      })
++
++      struct allowedips_node __rcu **stack[128], **nptr;
++      struct allowedips_node *node, *prev;
++      unsigned int len;
++
++      if (unlikely(!peer || !REF(*top)))
++              return;
++
++      for (prev = NULL, len = 0, PUSH(top); len > 0; prev = node) {
++              nptr = stack[len - 1];
++              node = DEREF(nptr);
++              if (!node) {
++                      --len;
++                      continue;
++              }
++              if (!prev || REF(prev->bit[0]) == node ||
++                  REF(prev->bit[1]) == node) {
++                      if (REF(node->bit[0]))
++                              PUSH(&node->bit[0]);
++                      else if (REF(node->bit[1]))
++                              PUSH(&node->bit[1]);
++              } else if (REF(node->bit[0]) == prev) {
++                      if (REF(node->bit[1]))
++                              PUSH(&node->bit[1]);
++              } else {
++                      if (rcu_dereference_protected(node->peer,
++                              lockdep_is_held(lock)) == peer) {
++                              RCU_INIT_POINTER(node->peer, NULL);
++                              list_del_init(&node->peer_list);
++                              if (!node->bit[0] || !node->bit[1]) {
++                                      rcu_assign_pointer(*nptr, DEREF(
++                                             &node->bit[!REF(node->bit[0])]));
++                                      call_rcu(&node->rcu, node_free_rcu);
++                                      node = DEREF(nptr);
++                              }
++                      }
++                      --len;
++              }
++      }
++
++#undef REF
++#undef DEREF
++#undef PUSH
++}
++
++static unsigned int fls128(u64 a, u64 b)
++{
++      return a ? fls64(a) + 64U : fls64(b);
++}
++
++static u8 common_bits(const struct allowedips_node *node, const u8 *key,
++                    u8 bits)
++{
++      if (bits == 32)
++              return 32U - fls(*(const u32 *)node->bits ^ *(const u32 *)key);
++      else if (bits == 128)
++              return 128U - fls128(
++                      *(const u64 *)&node->bits[0] ^ *(const u64 *)&key[0],
++                      *(const u64 *)&node->bits[8] ^ *(const u64 *)&key[8]);
++      return 0;
++}
++
++static bool prefix_matches(const struct allowedips_node *node, const u8 *key,
++                         u8 bits)
++{
++      /* This could be much faster if it actually just compared the common
++       * bits properly, by precomputing a mask bswap(~0 << (32 - cidr)), and
++       * the rest, but it turns out that common_bits is already super fast on
++       * modern processors, even taking into account the unfortunate bswap.
++       * So, we just inline it like this instead.
++       */
++      return common_bits(node, key, bits) >= node->cidr;
++}
++
++static struct allowedips_node *find_node(struct allowedips_node *trie, u8 bits,
++                                       const u8 *key)
++{
++      struct allowedips_node *node = trie, *found = NULL;
++
++      while (node && prefix_matches(node, key, bits)) {
++              if (rcu_access_pointer(node->peer))
++                      found = node;
++              if (node->cidr == bits)
++                      break;
++              node = rcu_dereference_bh(CHOOSE_NODE(node, key));
++      }
++      return found;
++}
++
++/* Returns a strong reference to a peer */
++static struct wg_peer *lookup(struct allowedips_node __rcu *root, u8 bits,
++                            const void *be_ip)
++{
++      /* Aligned so it can be passed to fls/fls64 */
++      u8 ip[16] __aligned(__alignof(u64));
++      struct allowedips_node *node;
++      struct wg_peer *peer = NULL;
++
++      swap_endian(ip, be_ip, bits);
++
++      rcu_read_lock_bh();
++retry:
++      node = find_node(rcu_dereference_bh(root), bits, ip);
++      if (node) {
++              peer = wg_peer_get_maybe_zero(rcu_dereference_bh(node->peer));
++              if (!peer)
++                      goto retry;
++      }
++      rcu_read_unlock_bh();
++      return peer;
++}
++
++static bool node_placement(struct allowedips_node __rcu *trie, const u8 *key,
++                         u8 cidr, u8 bits, struct allowedips_node **rnode,
++                         struct mutex *lock)
++{
++      struct allowedips_node *node = rcu_dereference_protected(trie,
++                                              lockdep_is_held(lock));
++      struct allowedips_node *parent = NULL;
++      bool exact = false;
++
++      while (node && node->cidr <= cidr && prefix_matches(node, key, bits)) {
++              parent = node;
++              if (parent->cidr == cidr) {
++                      exact = true;
++                      break;
++              }
++              node = rcu_dereference_protected(CHOOSE_NODE(parent, key),
++                                               lockdep_is_held(lock));
++      }
++      *rnode = parent;
++      return exact;
++}
++
++static int add(struct allowedips_node __rcu **trie, u8 bits, const u8 *key,
++             u8 cidr, struct wg_peer *peer, struct mutex *lock)
++{
++      struct allowedips_node *node, *parent, *down, *newnode;
++
++      if (unlikely(cidr > bits || !peer))
++              return -EINVAL;
++
++      if (!rcu_access_pointer(*trie)) {
++              node = kzalloc(sizeof(*node), GFP_KERNEL);
++              if (unlikely(!node))
++                      return -ENOMEM;
++              RCU_INIT_POINTER(node->peer, peer);
++              list_add_tail(&node->peer_list, &peer->allowedips_list);
++              copy_and_assign_cidr(node, key, cidr, bits);
++              rcu_assign_pointer(*trie, node);
++              return 0;
++      }
++      if (node_placement(*trie, key, cidr, bits, &node, lock)) {
++              rcu_assign_pointer(node->peer, peer);
++              list_move_tail(&node->peer_list, &peer->allowedips_list);
++              return 0;
++      }
++
++      newnode = kzalloc(sizeof(*newnode), GFP_KERNEL);
++      if (unlikely(!newnode))
++              return -ENOMEM;
++      RCU_INIT_POINTER(newnode->peer, peer);
++      list_add_tail(&newnode->peer_list, &peer->allowedips_list);
++      copy_and_assign_cidr(newnode, key, cidr, bits);
++
++      if (!node) {
++              down = rcu_dereference_protected(*trie, lockdep_is_held(lock));
++      } else {
++              down = rcu_dereference_protected(CHOOSE_NODE(node, key),
++                                               lockdep_is_held(lock));
++              if (!down) {
++                      rcu_assign_pointer(CHOOSE_NODE(node, key), newnode);
++                      return 0;
++              }
++      }
++      cidr = min(cidr, common_bits(down, key, bits));
++      parent = node;
++
++      if (newnode->cidr == cidr) {
++              rcu_assign_pointer(CHOOSE_NODE(newnode, down->bits), down);
++              if (!parent)
++                      rcu_assign_pointer(*trie, newnode);
++              else
++                      rcu_assign_pointer(CHOOSE_NODE(parent, newnode->bits),
++                                         newnode);
++      } else {
++              node = kzalloc(sizeof(*node), GFP_KERNEL);
++              if (unlikely(!node)) {
++                      kfree(newnode);
++                      return -ENOMEM;
++              }
++              INIT_LIST_HEAD(&node->peer_list);
++              copy_and_assign_cidr(node, newnode->bits, cidr, bits);
++
++              rcu_assign_pointer(CHOOSE_NODE(node, down->bits), down);
++              rcu_assign_pointer(CHOOSE_NODE(node, newnode->bits), newnode);
++              if (!parent)
++                      rcu_assign_pointer(*trie, node);
++              else
++                      rcu_assign_pointer(CHOOSE_NODE(parent, node->bits),
++                                         node);
++      }
++      return 0;
++}
++
++void wg_allowedips_init(struct allowedips *table)
++{
++      table->root4 = table->root6 = NULL;
++      table->seq = 1;
++}
++
++void wg_allowedips_free(struct allowedips *table, struct mutex *lock)
++{
++      struct allowedips_node __rcu *old4 = table->root4, *old6 = table->root6;
++
++      ++table->seq;
++      RCU_INIT_POINTER(table->root4, NULL);
++      RCU_INIT_POINTER(table->root6, NULL);
++      if (rcu_access_pointer(old4)) {
++              struct allowedips_node *node = rcu_dereference_protected(old4,
++                                                      lockdep_is_held(lock));
++
++              root_remove_peer_lists(node);
++              call_rcu(&node->rcu, root_free_rcu);
++      }
++      if (rcu_access_pointer(old6)) {
++              struct allowedips_node *node = rcu_dereference_protected(old6,
++                                                      lockdep_is_held(lock));
++
++              root_remove_peer_lists(node);
++              call_rcu(&node->rcu, root_free_rcu);
++      }
++}
++
++int wg_allowedips_insert_v4(struct allowedips *table, const struct in_addr *ip,
++                          u8 cidr, struct wg_peer *peer, struct mutex *lock)
++{
++      /* Aligned so it can be passed to fls */
++      u8 key[4] __aligned(__alignof(u32));
++
++      ++table->seq;
++      swap_endian(key, (const u8 *)ip, 32);
++      return add(&table->root4, 32, key, cidr, peer, lock);
++}
++
++int wg_allowedips_insert_v6(struct allowedips *table, const struct in6_addr *ip,
++                          u8 cidr, struct wg_peer *peer, struct mutex *lock)
++{
++      /* Aligned so it can be passed to fls64 */
++      u8 key[16] __aligned(__alignof(u64));
++
++      ++table->seq;
++      swap_endian(key, (const u8 *)ip, 128);
++      return add(&table->root6, 128, key, cidr, peer, lock);
++}
++
++void wg_allowedips_remove_by_peer(struct allowedips *table,
++                                struct wg_peer *peer, struct mutex *lock)
++{
++      ++table->seq;
++      walk_remove_by_peer(&table->root4, peer, lock);
++      walk_remove_by_peer(&table->root6, peer, lock);
++}
++
++int wg_allowedips_read_node(struct allowedips_node *node, u8 ip[16], u8 *cidr)
++{
++      const unsigned int cidr_bytes = DIV_ROUND_UP(node->cidr, 8U);
++      swap_endian(ip, node->bits, node->bitlen);
++      memset(ip + cidr_bytes, 0, node->bitlen / 8U - cidr_bytes);
++      if (node->cidr)
++              ip[cidr_bytes - 1U] &= ~0U << (-node->cidr % 8U);
++
++      *cidr = node->cidr;
++      return node->bitlen == 32 ? AF_INET : AF_INET6;
++}
++
++/* Returns a strong reference to a peer */
++struct wg_peer *wg_allowedips_lookup_dst(struct allowedips *table,
++                                       struct sk_buff *skb)
++{
++      if (skb->protocol == htons(ETH_P_IP))
++              return lookup(table->root4, 32, &ip_hdr(skb)->daddr);
++      else if (skb->protocol == htons(ETH_P_IPV6))
++              return lookup(table->root6, 128, &ipv6_hdr(skb)->daddr);
++      return NULL;
++}
++
++/* Returns a strong reference to a peer */
++struct wg_peer *wg_allowedips_lookup_src(struct allowedips *table,
++                                       struct sk_buff *skb)
++{
++      if (skb->protocol == htons(ETH_P_IP))
++              return lookup(table->root4, 32, &ip_hdr(skb)->saddr);
++      else if (skb->protocol == htons(ETH_P_IPV6))
++              return lookup(table->root6, 128, &ipv6_hdr(skb)->saddr);
++      return NULL;
++}
++
++#include "selftest/allowedips.c"
+--- /dev/null
++++ b/drivers/net/wireguard/allowedips.h
+@@ -0,0 +1,59 @@
++/* SPDX-License-Identifier: GPL-2.0 */
++/*
++ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
++ */
++
++#ifndef _WG_ALLOWEDIPS_H
++#define _WG_ALLOWEDIPS_H
++
++#include <linux/mutex.h>
++#include <linux/ip.h>
++#include <linux/ipv6.h>
++
++struct wg_peer;
++
++struct allowedips_node {
++      struct wg_peer __rcu *peer;
++      struct allowedips_node __rcu *bit[2];
++      /* While it may seem scandalous that we waste space for v4,
++       * we're alloc'ing to the nearest power of 2 anyway, so this
++       * doesn't actually make a difference.
++       */
++      u8 bits[16] __aligned(__alignof(u64));
++      u8 cidr, bit_at_a, bit_at_b, bitlen;
++
++      /* Keep rarely used list at bottom to be beyond cache line. */
++      union {
++              struct list_head peer_list;
++              struct rcu_head rcu;
++      };
++};
++
++struct allowedips {
++      struct allowedips_node __rcu *root4;
++      struct allowedips_node __rcu *root6;
++      u64 seq;
++};
++
++void wg_allowedips_init(struct allowedips *table);
++void wg_allowedips_free(struct allowedips *table, struct mutex *mutex);
++int wg_allowedips_insert_v4(struct allowedips *table, const struct in_addr *ip,
++                          u8 cidr, struct wg_peer *peer, struct mutex *lock);
++int wg_allowedips_insert_v6(struct allowedips *table, const struct in6_addr *ip,
++                          u8 cidr, struct wg_peer *peer, struct mutex *lock);
++void wg_allowedips_remove_by_peer(struct allowedips *table,
++                                struct wg_peer *peer, struct mutex *lock);
++/* The ip input pointer should be __aligned(__alignof(u64))) */
++int wg_allowedips_read_node(struct allowedips_node *node, u8 ip[16], u8 *cidr);
++
++/* These return a strong reference to a peer: */
++struct wg_peer *wg_allowedips_lookup_dst(struct allowedips *table,
++                                       struct sk_buff *skb);
++struct wg_peer *wg_allowedips_lookup_src(struct allowedips *table,
++                                       struct sk_buff *skb);
++
++#ifdef DEBUG
++bool wg_allowedips_selftest(void);
++#endif
++
++#endif /* _WG_ALLOWEDIPS_H */
+--- /dev/null
++++ b/drivers/net/wireguard/cookie.c
+@@ -0,0 +1,236 @@
++// SPDX-License-Identifier: GPL-2.0
++/*
++ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
++ */
++
++#include "cookie.h"
++#include "peer.h"
++#include "device.h"
++#include "messages.h"
++#include "ratelimiter.h"
++#include "timers.h"
++
++#include <crypto/blake2s.h>
++#include <crypto/chacha20poly1305.h>
++
++#include <net/ipv6.h>
++#include <crypto/algapi.h>
++
++void wg_cookie_checker_init(struct cookie_checker *checker,
++                          struct wg_device *wg)
++{
++      init_rwsem(&checker->secret_lock);
++      checker->secret_birthdate = ktime_get_coarse_boottime_ns();
++      get_random_bytes(checker->secret, NOISE_HASH_LEN);
++      checker->device = wg;
++}
++
++enum { COOKIE_KEY_LABEL_LEN = 8 };
++static const u8 mac1_key_label[COOKIE_KEY_LABEL_LEN] = "mac1----";
++static const u8 cookie_key_label[COOKIE_KEY_LABEL_LEN] = "cookie--";
++
++static void precompute_key(u8 key[NOISE_SYMMETRIC_KEY_LEN],
++                         const u8 pubkey[NOISE_PUBLIC_KEY_LEN],
++                         const u8 label[COOKIE_KEY_LABEL_LEN])
++{
++      struct blake2s_state blake;
++
++      blake2s_init(&blake, NOISE_SYMMETRIC_KEY_LEN);
++      blake2s_update(&blake, label, COOKIE_KEY_LABEL_LEN);
++      blake2s_update(&blake, pubkey, NOISE_PUBLIC_KEY_LEN);
++      blake2s_final(&blake, key);
++}
++
++/* Derives both checker keys from the device public key. Must hold peer->handshake.static_identity->lock */
++void wg_cookie_checker_precompute_device_keys(struct cookie_checker *checker)
++{
++      if (likely(checker->device->static_identity.has_identity)) {
++              precompute_key(checker->cookie_encryption_key,
++                             checker->device->static_identity.static_public,
++                             cookie_key_label);
++              precompute_key(checker->message_mac1_key,
++                             checker->device->static_identity.static_public,
++                             mac1_key_label);
++      } else { /* no identity set: both derived keys become all-zero */
++              memset(checker->cookie_encryption_key, 0,
++                     NOISE_SYMMETRIC_KEY_LEN);
++              memset(checker->message_mac1_key, 0, NOISE_SYMMETRIC_KEY_LEN);
++      }
++}
++
++void wg_cookie_checker_precompute_peer_keys(struct wg_peer *peer)
++{
++      precompute_key(peer->latest_cookie.cookie_decryption_key,
++                     peer->handshake.remote_static, cookie_key_label); /* used by wg_cookie_message_consume() */
++      precompute_key(peer->latest_cookie.message_mac1_key,
++                     peer->handshake.remote_static, mac1_key_label); /* used by wg_cookie_add_mac_to_packet() */
++}
++
++void wg_cookie_init(struct cookie *cookie)
++{
++      memset(cookie, 0, sizeof(*cookie)); /* clears is_valid, have_sent_mac1 and all key material */
++      init_rwsem(&cookie->lock); /* after the memset, which also zeroes the lock member */
++}
++
++static void compute_mac1(u8 mac1[COOKIE_LEN], const void *message, size_t len,
++                       const u8 key[NOISE_SYMMETRIC_KEY_LEN])
++{
++      len = len - sizeof(struct message_macs) +
++            offsetof(struct message_macs, mac1); /* bytes before mac1; macs are the message trailer */
++      blake2s(mac1, message, key, COOKIE_LEN, len, NOISE_SYMMETRIC_KEY_LEN); /* keyed hash, COOKIE_LEN-byte output */
++}
++
++static void compute_mac2(u8 mac2[COOKIE_LEN], const void *message, size_t len,
++                       const u8 cookie[COOKIE_LEN])
++{
++      len = len - sizeof(struct message_macs) +
++            offsetof(struct message_macs, mac2); /* bytes before mac2, so mac1 is covered */
++      blake2s(mac2, message, cookie, COOKIE_LEN, len, COOKIE_LEN); /* the cookie itself is the hash key */
++}
++
++static void make_cookie(u8 cookie[COOKIE_LEN], struct sk_buff *skb,
++                      struct cookie_checker *checker)
++{
++      struct blake2s_state state;
++
++      if (wg_birthdate_has_expired(checker->secret_birthdate,
++                                   COOKIE_SECRET_MAX_AGE)) { /* checked before secret_lock is taken */
++              down_write(&checker->secret_lock);
++              checker->secret_birthdate = ktime_get_coarse_boottime_ns();
++              get_random_bytes(checker->secret, NOISE_HASH_LEN); /* rotate the secret */
++              up_write(&checker->secret_lock);
++      }
++
++      down_read(&checker->secret_lock);
++
++      blake2s_init_key(&state, COOKIE_LEN, checker->secret, NOISE_HASH_LEN); /* cookie = keyed hash of sender address + port */
++      if (skb->protocol == htons(ETH_P_IP))
++              blake2s_update(&state, (u8 *)&ip_hdr(skb)->saddr,
++                             sizeof(struct in_addr));
++      else if (skb->protocol == htons(ETH_P_IPV6))
++              blake2s_update(&state, (u8 *)&ipv6_hdr(skb)->saddr,
++                             sizeof(struct in6_addr));
++      blake2s_update(&state, (u8 *)&udp_hdr(skb)->source, sizeof(__be16)); /* hashed for any protocol; no address if neither v4 nor v6 */
++      blake2s_final(&state, cookie);
++
++      up_read(&checker->secret_lock);
++}
++
++enum cookie_mac_state wg_cookie_validate_packet(struct cookie_checker *checker,
++                                              struct sk_buff *skb,
++                                              bool check_cookie)
++{
++      struct message_macs *macs = (struct message_macs *)
++              (skb->data + skb->len - sizeof(*macs)); /* macs are the packet's last bytes */
++      enum cookie_mac_state ret;
++      u8 computed_mac[COOKIE_LEN];
++      u8 cookie[COOKIE_LEN];
++
++      ret = INVALID_MAC;
++      compute_mac1(computed_mac, skb->data, skb->len,
++                   checker->message_mac1_key);
++      if (crypto_memneq(computed_mac, macs->mac1, COOKIE_LEN)) /* crypto_memneq(): constant-time compare */
++              goto out;
++
++      ret = VALID_MAC_BUT_NO_COOKIE;
++
++      if (!check_cookie) /* caller did not ask for mac2 to be verified */
++              goto out;
++
++      make_cookie(cookie, skb, checker); /* recompute the cookie this sender would have been given */
++
++      compute_mac2(computed_mac, skb->data, skb->len, cookie);
++      if (crypto_memneq(computed_mac, macs->mac2, COOKIE_LEN))
++              goto out;
++
++      ret = VALID_MAC_WITH_COOKIE_BUT_RATELIMITED;
++      if (!wg_ratelimiter_allow(skb, dev_net(checker->device->dev))) /* mac2 is valid but the ratelimiter refused */
++              goto out;
++
++      ret = VALID_MAC_WITH_COOKIE;
++
++out:
++      return ret;
++}
++
++void wg_cookie_add_mac_to_packet(void *message, size_t len,
++                               struct wg_peer *peer)
++{
++      struct message_macs *macs = (struct message_macs *)
++              ((u8 *)message + len - sizeof(*macs)); /* macs occupy the last bytes of the message */
++
++      down_write(&peer->latest_cookie.lock);
++      compute_mac1(macs->mac1, message, len,
++                   peer->latest_cookie.message_mac1_key);
++      memcpy(peer->latest_cookie.last_mac1_sent, macs->mac1, COOKIE_LEN); /* kept as AD for a later cookie reply */
++      peer->latest_cookie.have_sent_mac1 = true;
++      up_write(&peer->latest_cookie.lock);
++
++      down_read(&peer->latest_cookie.lock);
++      if (peer->latest_cookie.is_valid &&
++          !wg_birthdate_has_expired(peer->latest_cookie.birthdate,
++                              COOKIE_SECRET_MAX_AGE - COOKIE_SECRET_LATENCY)) /* treat the cookie as stale slightly early */
++              compute_mac2(macs->mac2, message, len,
++                           peer->latest_cookie.cookie);
++      else
++              memset(macs->mac2, 0, COOKIE_LEN); /* no usable cookie: mac2 is all zeroes */
++      up_read(&peer->latest_cookie.lock);
++}
++
++void wg_cookie_message_create(struct message_handshake_cookie *dst,
++                            struct sk_buff *skb, __le32 index,
++                            struct cookie_checker *checker)
++{
++      struct message_macs *macs = (struct message_macs *)
++              ((u8 *)skb->data + skb->len - sizeof(*macs)); /* macs of the packet being answered */
++      u8 cookie[COOKIE_LEN];
++
++      dst->header.type = cpu_to_le32(MESSAGE_HANDSHAKE_COOKIE);
++      dst->receiver_index = index; /* already little-endian (__le32), stored as-is */
++      get_random_bytes_wait(dst->nonce, COOKIE_NONCE_LEN);
++
++      make_cookie(cookie, skb, checker);
++      xchacha20poly1305_encrypt(dst->encrypted_cookie, cookie, COOKIE_LEN,
++                                macs->mac1, COOKIE_LEN, dst->nonce, /* received mac1 is the associated data */
++                                checker->cookie_encryption_key);
++}
++
++void wg_cookie_message_consume(struct message_handshake_cookie *src,
++                             struct wg_device *wg)
++{
++      struct wg_peer *peer = NULL;
++      u8 cookie[COOKIE_LEN];
++      bool ret;
++
++      if (unlikely(!wg_index_hashtable_lookup(wg->index_hashtable,
++                                              INDEX_HASHTABLE_HANDSHAKE |
++                                              INDEX_HASHTABLE_KEYPAIR,
++                                              src->receiver_index, &peer)))
++              return; /* lookup failed: no peer to put */
++
++      down_read(&peer->latest_cookie.lock);
++      if (unlikely(!peer->latest_cookie.have_sent_mac1)) { /* no outstanding mac1 to authenticate against */
++              up_read(&peer->latest_cookie.lock);
++              goto out;
++      }
++      ret = xchacha20poly1305_decrypt(
++              cookie, src->encrypted_cookie, sizeof(src->encrypted_cookie),
++              peer->latest_cookie.last_mac1_sent, COOKIE_LEN, src->nonce, /* AD = the mac1 we last sent */
++              peer->latest_cookie.cookie_decryption_key);
++      up_read(&peer->latest_cookie.lock);
++
++      if (ret) { /* decrypted and authenticated: store it under the write lock */
++              down_write(&peer->latest_cookie.lock);
++              memcpy(peer->latest_cookie.cookie, cookie, COOKIE_LEN);
++              peer->latest_cookie.birthdate = ktime_get_coarse_boottime_ns();
++              peer->latest_cookie.is_valid = true;
++              peer->latest_cookie.have_sent_mac1 = false; /* this mac1 will not accept another cookie */
++              up_write(&peer->latest_cookie.lock);
++      } else {
++              net_dbg_ratelimited("%s: Could not decrypt invalid cookie response\n",
++                                  wg->dev->name);
++      }
++
++out:
++      wg_peer_put(peer); /* presumably drops the reference taken by the lookup -- confirm in peerlookup.c */
++}
+--- /dev/null
++++ b/drivers/net/wireguard/cookie.h
+@@ -0,0 +1,59 @@
++/* SPDX-License-Identifier: GPL-2.0 */
++/*
++ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
++ */
++
++#ifndef _WG_COOKIE_H
++#define _WG_COOKIE_H
++
++#include "messages.h"
++#include <linux/rwsem.h>
++
++struct wg_peer;
++
++struct cookie_checker {
++      u8 secret[NOISE_HASH_LEN]; /* random; keys the hash in make_cookie() */
++      u8 cookie_encryption_key[NOISE_SYMMETRIC_KEY_LEN]; /* encrypts outgoing cookie replies */
++      u8 message_mac1_key[NOISE_SYMMETRIC_KEY_LEN]; /* verifies mac1 on received messages */
++      u64 secret_birthdate; /* ktime_get_coarse_boottime_ns() when secret was generated */
++      struct rw_semaphore secret_lock; /* held while secret is regenerated or hashed */
++      struct wg_device *device; /* set by wg_cookie_checker_init() */
++};
++
++struct cookie {
++      u64 birthdate; /* ktime_get_coarse_boottime_ns() when cookie was stored */
++      bool is_valid; /* set once a cookie reply has been decrypted */
++      u8 cookie[COOKIE_LEN]; /* keys mac2 on outgoing messages */
++      bool have_sent_mac1; /* last_mac1_sent is awaiting a cookie reply */
++      u8 last_mac1_sent[COOKIE_LEN]; /* associated data when decrypting a cookie reply */
++      u8 cookie_decryption_key[NOISE_SYMMETRIC_KEY_LEN]; /* decrypts received cookie replies */
++      u8 message_mac1_key[NOISE_SYMMETRIC_KEY_LEN]; /* keys mac1 on outgoing messages */
++      struct rw_semaphore lock; /* taken around the cookie/mac1 bookkeeping in cookie.c */
++};
++
++enum cookie_mac_state {
++      INVALID_MAC, /* mac1 did not verify */
++      VALID_MAC_BUT_NO_COOKIE, /* mac1 ok; mac2 wrong or not checked */
++      VALID_MAC_WITH_COOKIE_BUT_RATELIMITED, /* mac1 and mac2 ok; wg_ratelimiter_allow() refused */
++      VALID_MAC_WITH_COOKIE /* mac1 and mac2 ok; ratelimiter passed */
++};
++
++void wg_cookie_checker_init(struct cookie_checker *checker,
++                          struct wg_device *wg);
++void wg_cookie_checker_precompute_device_keys(struct cookie_checker *checker);
++void wg_cookie_checker_precompute_peer_keys(struct wg_peer *peer);
++void wg_cookie_init(struct cookie *cookie);
++
++enum cookie_mac_state wg_cookie_validate_packet(struct cookie_checker *checker,
++                                              struct sk_buff *skb,
++                                              bool check_cookie);
++void wg_cookie_add_mac_to_packet(void *message, size_t len,
++                               struct wg_peer *peer);
++
++void wg_cookie_message_create(struct message_handshake_cookie *dst, /* output: cookie reply to fill in */
++                            struct sk_buff *skb, __le32 index,
++                            struct cookie_checker *checker);
++void wg_cookie_message_consume(struct message_handshake_cookie *src,
++                             struct wg_device *wg);
++
++#endif /* _WG_COOKIE_H */
+--- /dev/null
++++ b/drivers/net/wireguard/device.c
+@@ -0,0 +1,458 @@
++// SPDX-License-Identifier: GPL-2.0
++/*
++ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
++ */
++
++#include "queueing.h"
++#include "socket.h"
++#include "timers.h"
++#include "device.h"
++#include "ratelimiter.h"
++#include "peer.h"
++#include "messages.h"
++
++#include <linux/module.h>
++#include <linux/rtnetlink.h>
++#include <linux/inet.h>
++#include <linux/netdevice.h>
++#include <linux/inetdevice.h>
++#include <linux/if_arp.h>
++#include <linux/icmp.h>
++#include <linux/suspend.h>
++#include <net/icmp.h>
++#include <net/rtnetlink.h>
++#include <net/ip_tunnels.h>
++#include <net/addrconf.h>
++
++static LIST_HEAD(device_list);
++
++static int wg_open(struct net_device *dev)
++{
++      struct in_device *dev_v4 = __in_dev_get_rtnl(dev);
++      struct inet6_dev *dev_v6 = __in6_dev_get(dev);
++      struct wg_device *wg = netdev_priv(dev);
++      struct wg_peer *peer;
++      int ret;
++
++      if (dev_v4) {
++              /* At some point we might put this check near the ip_rt_send_
++               * redirect call of ip_forward in net/ipv4/ip_forward.c, similar
++               * to the current secpath check.
++               */
++              IN_DEV_CONF_SET(dev_v4, SEND_REDIRECTS, false);
++              IPV4_DEVCONF_ALL(dev_net(dev), SEND_REDIRECTS) = false; /* also clears the netns-wide "all" setting */
++      }
++      if (dev_v6)
++              dev_v6->cnf.addr_gen_mode = IN6_ADDR_GEN_MODE_NONE; /* no kernel-generated IPv6 address */
++
++      ret = wg_socket_init(wg, wg->incoming_port);
++      if (ret < 0)
++              return ret;
++      mutex_lock(&wg->device_update_lock); /* peer_list is walked under this lock */
++      list_for_each_entry(peer, &wg->peer_list, peer_list) {
++              wg_packet_send_staged_packets(peer);
++              if (peer->persistent_keepalive_interval) /* non-zero interval: send a keepalive right away */
++                      wg_packet_send_keepalive(peer);
++      }
++      mutex_unlock(&wg->device_update_lock);
++      return 0;
++}
++
++#ifdef CONFIG_PM_SLEEP
++static int wg_pm_notification(struct notifier_block *nb, unsigned long action,
++                            void *data)
++{
++      struct wg_device *wg;
++      struct wg_peer *peer;
++
++      /* If the machine is constantly suspending and resuming, as part of
++       * its normal operation rather than as a somewhat rare event, then we
++       * don't actually want to clear keys.
++       */
++      if (IS_ENABLED(CONFIG_PM_AUTOSLEEP) || IS_ENABLED(CONFIG_ANDROID))
++              return 0;
++
++      if (action != PM_HIBERNATION_PREPARE && action != PM_SUSPEND_PREPARE)
++              return 0; /* only act when about to suspend or hibernate */
++
++      rtnl_lock(); /* device_list is walked under RTNL */
++      list_for_each_entry(wg, &device_list, device_list) {
++              mutex_lock(&wg->device_update_lock);
++              list_for_each_entry(peer, &wg->peer_list, peer_list) {
++                      del_timer(&peer->timer_zero_key_material); /* keys are cleared right here instead */
++                      wg_noise_handshake_clear(&peer->handshake);
++                      wg_noise_keypairs_clear(&peer->keypairs);
++              }
++              mutex_unlock(&wg->device_update_lock);
++      }
++      rtnl_unlock();
++      rcu_barrier(); /* wait for all in-flight RCU callbacks to finish */
++      return 0;
++}
++
++static struct notifier_block pm_notifier = { .notifier_call = wg_pm_notification }; /* registered in wg_device_init() */
++#endif
++
++static int wg_stop(struct net_device *dev)
++{
++      struct wg_device *wg = netdev_priv(dev);
++      struct wg_peer *peer;
++
++      mutex_lock(&wg->device_update_lock); /* peer_list is walked under this lock */
++      list_for_each_entry(peer, &wg->peer_list, peer_list) {
++              wg_packet_purge_staged_packets(peer);
++              wg_timers_stop(peer);
++              wg_noise_handshake_clear(&peer->handshake);
++              wg_noise_keypairs_clear(&peer->keypairs);
++              wg_noise_reset_last_sent_handshake(&peer->last_sent_handshake);
++      }
++      mutex_unlock(&wg->device_update_lock);
++      skb_queue_purge(&wg->incoming_handshakes); /* drop handshakes that were queued but not processed */
++      wg_socket_reinit(wg, NULL, NULL); /* same call as in wg_destruct(); incoming_port is kept here */
++      return 0;
++}
++
++static netdev_tx_t wg_xmit(struct sk_buff *skb, struct net_device *dev)
++{
++      struct wg_device *wg = netdev_priv(dev);
++      struct sk_buff_head packets;
++      struct wg_peer *peer;
++      struct sk_buff *next;
++      sa_family_t family;
++      u32 mtu;
++      int ret;
++
++      if (unlikely(wg_skb_examine_untrusted_ip_hdr(skb) != skb->protocol)) {
++              ret = -EPROTONOSUPPORT; /* header check result disagrees with skb->protocol */
++              net_dbg_ratelimited("%s: Invalid IP packet\n", dev->name);
++              goto err;
++      }
++
++      peer = wg_allowedips_lookup_dst(&wg->peer_allowedips, skb);
++      if (unlikely(!peer)) {
++              ret = -ENOKEY;
++              if (skb->protocol == htons(ETH_P_IP))
++                      net_dbg_ratelimited("%s: No peer has allowed IPs matching %pI4\n",
++                                          dev->name, &ip_hdr(skb)->daddr);
++              else if (skb->protocol == htons(ETH_P_IPV6))
++                      net_dbg_ratelimited("%s: No peer has allowed IPs matching %pI6\n",
++                                          dev->name, &ipv6_hdr(skb)->daddr);
++              goto err;
++      }
++
++      family = READ_ONCE(peer->endpoint.addr.sa_family);
++      if (unlikely(family != AF_INET && family != AF_INET6)) {
++              ret = -EDESTADDRREQ;
++              net_dbg_ratelimited("%s: No valid endpoint has been configured or discovered for peer %llu\n",
++                                  dev->name, peer->internal_id);
++              goto err_peer; /* a peer was found, so it must be put on the way out */
++      }
++
++      mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu; /* route MTU if a dst is attached, else device MTU */
++
++      __skb_queue_head_init(&packets);
++      if (!skb_is_gso(skb)) {
++              skb_mark_not_on_list(skb);
++      } else {
++              struct sk_buff *segs = skb_gso_segment(skb, 0);
++
++              if (unlikely(IS_ERR(segs))) {
++                      ret = PTR_ERR(segs);
++                      goto err_peer;
++              }
++              dev_kfree_skb(skb);
++              skb = segs; /* continue with the segment list in place of the original */
++      }
++
++      skb_list_walk_safe(skb, skb, next) {
++              skb_mark_not_on_list(skb);
++
++              skb = skb_share_check(skb, GFP_ATOMIC);
++              if (unlikely(!skb))
++                      continue; /* skb_share_check() returned NULL: skip this packet */
++
++              /* We only need to keep the original dst around for icmp,
++               * so at this point we're in a position to drop it.
++               */
++              skb_dst_drop(skb);
++
++              PACKET_CB(skb)->mtu = mtu;
++
++              __skb_queue_tail(&packets, skb);
++      }
++
++      spin_lock_bh(&peer->staged_packet_queue.lock);
++      /* If the queue is getting too big, we start removing the oldest packets
++       * until it's small again. We do this before adding the new packet, so
++       * we don't remove GSO segments that are in excess.
++       */
++      while (skb_queue_len(&peer->staged_packet_queue) > MAX_STAGED_PACKETS) {
++              dev_kfree_skb(__skb_dequeue(&peer->staged_packet_queue));
++              ++dev->stats.tx_dropped;
++      }
++      skb_queue_splice_tail(&packets, &peer->staged_packet_queue);
++      spin_unlock_bh(&peer->staged_packet_queue.lock);
++
++      wg_packet_send_staged_packets(peer);
++
++      wg_peer_put(peer);
++      return NETDEV_TX_OK;
++
++err_peer:
++      wg_peer_put(peer);
++err:
++      ++dev->stats.tx_errors;
++      if (skb->protocol == htons(ETH_P_IP))
++              icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
++      else if (skb->protocol == htons(ETH_P_IPV6))
++              icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
++      kfree_skb(skb);
++      return ret; /* NOTE(review): a negative errno, not a NETDEV_TX_* value */
++}
++
++static const struct net_device_ops netdev_ops = { /* its address also identifies our devices in wg_netdevice_notification() */
++      .ndo_open               = wg_open,
++      .ndo_stop               = wg_stop,
++      .ndo_start_xmit         = wg_xmit,
++      .ndo_get_stats64        = ip_tunnel_get_stats64
++};
++
++static void wg_destruct(struct net_device *dev)
++{
++      struct wg_device *wg = netdev_priv(dev);
++
++      rtnl_lock(); /* device_list is modified under RTNL, matching wg_pm_notification() */
++      list_del(&wg->device_list);
++      rtnl_unlock();
++      mutex_lock(&wg->device_update_lock);
++      wg->incoming_port = 0;
++      wg_socket_reinit(wg, NULL, NULL);
++      /* The final references are cleared in the below calls to destroy_workqueue. */
++      wg_peer_remove_all(wg);
++      destroy_workqueue(wg->handshake_receive_wq);
++      destroy_workqueue(wg->handshake_send_wq);
++      destroy_workqueue(wg->packet_crypt_wq);
++      wg_packet_queue_free(&wg->decrypt_queue, true);
++      wg_packet_queue_free(&wg->encrypt_queue, true);
++      rcu_barrier(); /* Wait for all the peers to be actually freed. */
++      wg_ratelimiter_uninit(); /* pairs with wg_ratelimiter_init() in wg_newlink() */
++      memzero_explicit(&wg->static_identity, sizeof(wg->static_identity)); /* wipe key material before the memory is freed */
++      skb_queue_purge(&wg->incoming_handshakes);
++      free_percpu(dev->tstats);
++      free_percpu(wg->incoming_handshakes_worker);
++      if (wg->have_creating_net_ref) /* reference taken in wg_netdevice_notification() */
++              put_net(wg->creating_net);
++      kvfree(wg->index_hashtable);
++      kvfree(wg->peer_hashtable);
++      mutex_unlock(&wg->device_update_lock);
++
++      pr_debug("%s: Interface deleted\n", dev->name);
++      free_netdev(dev); /* wg is netdev_priv(dev), so it must not be used after this */
++}
++
++static const struct device_type device_type = { .name = KBUILD_MODNAME };
++
++static void wg_setup(struct net_device *dev)
++{
++      struct wg_device *wg = netdev_priv(dev);
++      enum { WG_NETDEV_FEATURES = NETIF_F_HW_CSUM | NETIF_F_RXCSUM |
++                                  NETIF_F_SG | NETIF_F_GSO |
++                                  NETIF_F_GSO_SOFTWARE | NETIF_F_HIGHDMA };
++
++      dev->netdev_ops = &netdev_ops;
++      dev->hard_header_len = 0; /* no link-layer header or address (ARPHRD_NONE below) */
++      dev->addr_len = 0;
++      dev->needed_headroom = DATA_PACKET_HEAD_ROOM;
++      dev->needed_tailroom = noise_encrypted_len(MESSAGE_PADDING_MULTIPLE); /* one padding unit plus the auth tag */
++      dev->type = ARPHRD_NONE;
++      dev->flags = IFF_POINTOPOINT | IFF_NOARP;
++      dev->priv_flags |= IFF_NO_QUEUE;
++      dev->features |= NETIF_F_LLTX;
++      dev->features |= WG_NETDEV_FEATURES;
++      dev->hw_features |= WG_NETDEV_FEATURES;
++      dev->hw_enc_features |= WG_NETDEV_FEATURES;
++      dev->mtu = ETH_DATA_LEN - MESSAGE_MINIMUM_LENGTH - /* default MTU: ETH_DATA_LEN minus worst-case overhead */
++                 sizeof(struct udphdr) -
++                 max(sizeof(struct ipv6hdr), sizeof(struct iphdr));
++
++      SET_NETDEV_DEVTYPE(dev, &device_type);
++
++      /* We need to keep the dst around in case of icmp replies. */
++      netif_keep_dst(dev);
++
++      memset(wg, 0, sizeof(*wg)); /* wg_newlink() initialises the remaining members */
++      wg->dev = dev;
++}
++
++static int wg_newlink(struct net *src_net, struct net_device *dev,
++                    struct nlattr *tb[], struct nlattr *data[],
++                    struct netlink_ext_ack *extack)
++{
++      struct wg_device *wg = netdev_priv(dev);
++      int ret = -ENOMEM; /* default for the allocation failures below */
++
++      wg->creating_net = src_net; /* stored without get_net(); see wg_netdevice_notification() */
++      init_rwsem(&wg->static_identity.lock);
++      mutex_init(&wg->socket_update_lock);
++      mutex_init(&wg->device_update_lock);
++      skb_queue_head_init(&wg->incoming_handshakes);
++      wg_allowedips_init(&wg->peer_allowedips);
++      wg_cookie_checker_init(&wg->cookie_checker, wg);
++      INIT_LIST_HEAD(&wg->peer_list);
++      wg->device_update_gen = 1;
++
++      wg->peer_hashtable = wg_pubkey_hashtable_alloc();
++      if (!wg->peer_hashtable)
++              return ret; /* nothing allocated yet, so nothing to unwind */
++
++      wg->index_hashtable = wg_index_hashtable_alloc();
++      if (!wg->index_hashtable)
++              goto err_free_peer_hashtable;
++
++      dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
++      if (!dev->tstats)
++              goto err_free_index_hashtable;
++
++      wg->incoming_handshakes_worker =
++              wg_packet_percpu_multicore_worker_alloc(
++                              wg_packet_handshake_receive_worker, wg);
++      if (!wg->incoming_handshakes_worker)
++              goto err_free_tstats;
++
++      wg->handshake_receive_wq = alloc_workqueue("wg-kex-%s",
++                      WQ_CPU_INTENSIVE | WQ_FREEZABLE, 0, dev->name);
++      if (!wg->handshake_receive_wq)
++              goto err_free_incoming_handshakes;
++
++      wg->handshake_send_wq = alloc_workqueue("wg-kex-%s", /* NOTE(review): same name as handshake_receive_wq */
++                      WQ_UNBOUND | WQ_FREEZABLE, 0, dev->name);
++      if (!wg->handshake_send_wq)
++              goto err_destroy_handshake_receive;
++
++      wg->packet_crypt_wq = alloc_workqueue("wg-crypt-%s",
++                      WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM, 0, dev->name);
++      if (!wg->packet_crypt_wq)
++              goto err_destroy_handshake_send;
++
++      ret = wg_packet_queue_init(&wg->encrypt_queue, wg_packet_encrypt_worker,
++                                 true, MAX_QUEUED_PACKETS);
++      if (ret < 0)
++              goto err_destroy_packet_crypt;
++
++      ret = wg_packet_queue_init(&wg->decrypt_queue, wg_packet_decrypt_worker,
++                                 true, MAX_QUEUED_PACKETS);
++      if (ret < 0)
++              goto err_free_encrypt_queue;
++
++      ret = wg_ratelimiter_init();
++      if (ret < 0)
++              goto err_free_decrypt_queue;
++
++      ret = register_netdevice(dev);
++      if (ret < 0)
++              goto err_uninit_ratelimiter;
++
++      list_add(&wg->device_list, &device_list); /* makes the device visible to wg_pm_notification() */
++
++      /* We wait until the end to assign priv_destructor, so that
++       * register_netdevice doesn't call it for us if it fails.
++       */
++      dev->priv_destructor = wg_destruct;
++
++      pr_debug("%s: Interface created\n", dev->name);
++      return ret; /* ret >= 0 here: register_netdevice() did not fail */
++
++err_uninit_ratelimiter: /* labels unwind in reverse order of the setup above */
++      wg_ratelimiter_uninit();
++err_free_decrypt_queue:
++      wg_packet_queue_free(&wg->decrypt_queue, true);
++err_free_encrypt_queue:
++      wg_packet_queue_free(&wg->encrypt_queue, true);
++err_destroy_packet_crypt:
++      destroy_workqueue(wg->packet_crypt_wq);
++err_destroy_handshake_send:
++      destroy_workqueue(wg->handshake_send_wq);
++err_destroy_handshake_receive:
++      destroy_workqueue(wg->handshake_receive_wq);
++err_free_incoming_handshakes:
++      free_percpu(wg->incoming_handshakes_worker);
++err_free_tstats:
++      free_percpu(dev->tstats);
++err_free_index_hashtable:
++      kvfree(wg->index_hashtable);
++err_free_peer_hashtable:
++      kvfree(wg->peer_hashtable);
++      return ret;
++}
++
++static struct rtnl_link_ops link_ops __read_mostly = { /* registered in wg_device_init() */
++      .kind                   = KBUILD_MODNAME,
++      .priv_size              = sizeof(struct wg_device), /* netdev_priv() area holds struct wg_device */
++      .setup                  = wg_setup,
++      .newlink                = wg_newlink,
++};
++
++static int wg_netdevice_notification(struct notifier_block *nb,
++                                   unsigned long action, void *data)
++{
++      struct net_device *dev = ((struct netdev_notifier_info *)data)->dev;
++      struct wg_device *wg = netdev_priv(dev); /* only meaningful once the netdev_ops check below passes */
++
++      ASSERT_RTNL();
++
++      if (action != NETDEV_REGISTER || dev->netdev_ops != &netdev_ops)
++              return 0; /* not a registration, or not one of our devices */
++
++      if (dev_net(dev) == wg->creating_net && wg->have_creating_net_ref) { /* in its creating netns: drop the extra ref */
++              put_net(wg->creating_net);
++              wg->have_creating_net_ref = false;
++      } else if (dev_net(dev) != wg->creating_net &&
++                 !wg->have_creating_net_ref) { /* in another netns: hold a ref on creating_net */
++              wg->have_creating_net_ref = true;
++              get_net(wg->creating_net);
++      }
++      return 0;
++}
++
++static struct notifier_block netdevice_notifier = {
++      .notifier_call = wg_netdevice_notification
++};
++
++int __init wg_device_init(void)
++{
++      int ret;
++
++#ifdef CONFIG_PM_SLEEP
++      ret = register_pm_notifier(&pm_notifier);
++      if (ret)
++              return ret; /* first step: nothing to undo */
++#endif
++
++      ret = register_netdevice_notifier(&netdevice_notifier);
++      if (ret)
++              goto error_pm;
++
++      ret = rtnl_link_register(&link_ops);
++      if (ret)
++              goto error_netdevice;
++
++      return 0;
++
++error_netdevice: /* unwind in reverse order of registration */
++      unregister_netdevice_notifier(&netdevice_notifier);
++error_pm:
++#ifdef CONFIG_PM_SLEEP
++      unregister_pm_notifier(&pm_notifier);
++#endif
++      return ret;
++}
++
++void wg_device_uninit(void)
++{
++      rtnl_link_unregister(&link_ops); /* reverse order of wg_device_init() */
++      unregister_netdevice_notifier(&netdevice_notifier);
++#ifdef CONFIG_PM_SLEEP
++      unregister_pm_notifier(&pm_notifier);
++#endif
++      rcu_barrier(); /* wait for all in-flight RCU callbacks to finish */
++}
+--- /dev/null
++++ b/drivers/net/wireguard/device.h
+@@ -0,0 +1,65 @@
++/* SPDX-License-Identifier: GPL-2.0 */
++/*
++ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
++ */
++
++#ifndef _WG_DEVICE_H
++#define _WG_DEVICE_H
++
++#include "noise.h"
++#include "allowedips.h"
++#include "peerlookup.h"
++#include "cookie.h"
++
++#include <linux/types.h>
++#include <linux/netdevice.h>
++#include <linux/workqueue.h>
++#include <linux/mutex.h>
++#include <linux/net.h>
++#include <linux/ptr_ring.h>
++
++struct wg_device;
++
++struct multicore_worker {
++      void *ptr; /* opaque context for the work function -- TODO confirm in queueing.c */
++      struct work_struct work;
++};
++
++struct crypt_queue {
++      struct ptr_ring ring;
++      union { /* either the per-CPU worker pair or a single work item, never both */
++              struct {
++                      struct multicore_worker __percpu *worker;
++                      int last_cpu;
++              };
++              struct work_struct work;
++      };
++};
++
++struct wg_device {
++      struct net_device *dev; /* set in wg_setup() */
++      struct crypt_queue encrypt_queue, decrypt_queue; /* set up in wg_newlink() */
++      struct sock __rcu *sock4, *sock6;
++      struct net *creating_net; /* src_net passed to wg_newlink() */
++      struct noise_static_identity static_identity; /* wiped with memzero_explicit() in wg_destruct() */
++      struct workqueue_struct *handshake_receive_wq, *handshake_send_wq;
++      struct workqueue_struct *packet_crypt_wq;
++      struct sk_buff_head incoming_handshakes; /* purged in wg_stop() and wg_destruct() */
++      int incoming_handshake_cpu;
++      struct multicore_worker __percpu *incoming_handshakes_worker;
++      struct cookie_checker cookie_checker;
++      struct pubkey_hashtable *peer_hashtable;
++      struct index_hashtable *index_hashtable;
++      struct allowedips peer_allowedips;
++      struct mutex device_update_lock, socket_update_lock; /* device_update_lock is held while walking peer_list */
++      struct list_head device_list, peer_list; /* device_list links into the global list in device.c */
++      unsigned int num_peers, device_update_gen; /* device_update_gen starts at 1 */
++      u32 fwmark;
++      u16 incoming_port; /* passed to wg_socket_init() by wg_open() */
++      bool have_creating_net_ref; /* true while a get_net(creating_net) reference is held */
++};
++
++int wg_device_init(void);
++void wg_device_uninit(void);
++
++#endif /* _WG_DEVICE_H */
+--- /dev/null
++++ b/drivers/net/wireguard/main.c
+@@ -0,0 +1,64 @@
++// SPDX-License-Identifier: GPL-2.0
++/*
++ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
++ */
++
++#include "version.h"
++#include "device.h"
++#include "noise.h"
++#include "queueing.h"
++#include "ratelimiter.h"
++#include "netlink.h"
++
++#include <uapi/linux/wireguard.h>
++
++#include <linux/version.h>
++#include <linux/init.h>
++#include <linux/module.h>
++#include <linux/genetlink.h>
++#include <net/rtnetlink.h>
++
++static int __init mod_init(void)
++{
++      int ret;
++
++#ifdef DEBUG
++      if (!wg_allowedips_selftest() || !wg_packet_counter_selftest() ||
++          !wg_ratelimiter_selftest())
++              return -ENOTRECOVERABLE; /* debug builds refuse to load if a selftest fails */
++#endif
++      wg_noise_init();
++
++      ret = wg_device_init();
++      if (ret < 0)
++              goto err_device;
++
++      ret = wg_genetlink_init();
++      if (ret < 0)
++              goto err_netlink;
++
++      pr_info("WireGuard " WIREGUARD_VERSION " loaded. See www.wireguard.com for information.\n");
++      pr_info("Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.\n");
++
++      return 0;
++
++err_netlink:
++      wg_device_uninit(); /* undo wg_device_init() */
++err_device:
++      return ret;
++}
++
++static void __exit mod_exit(void)
++{
++      wg_genetlink_uninit(); /* reverse order of mod_init() */
++      wg_device_uninit();
++}
++
++module_init(mod_init);
++module_exit(mod_exit);
++MODULE_LICENSE("GPL v2");
++MODULE_DESCRIPTION("WireGuard secure network tunnel");
++MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>");
++MODULE_VERSION(WIREGUARD_VERSION);
++MODULE_ALIAS_RTNL_LINK(KBUILD_MODNAME);
++MODULE_ALIAS_GENL_FAMILY(WG_GENL_NAME);
+--- /dev/null
++++ b/drivers/net/wireguard/messages.h
+@@ -0,0 +1,128 @@
++/* SPDX-License-Identifier: GPL-2.0 */
++/*
++ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
++ */
++
++#ifndef _WG_MESSAGES_H
++#define _WG_MESSAGES_H
++
++#include <crypto/curve25519.h>
++#include <crypto/chacha20poly1305.h>
++#include <crypto/blake2s.h>
++
++#include <linux/kernel.h>
++#include <linux/param.h>
++#include <linux/skbuff.h>
++
++enum noise_lengths { /* all values are byte counts */
++      NOISE_PUBLIC_KEY_LEN = CURVE25519_KEY_SIZE,
++      NOISE_SYMMETRIC_KEY_LEN = CHACHA20POLY1305_KEY_SIZE,
++      NOISE_TIMESTAMP_LEN = sizeof(u64) + sizeof(u32), /* 12 bytes */
++      NOISE_AUTHTAG_LEN = CHACHA20POLY1305_AUTHTAG_SIZE,
++      NOISE_HASH_LEN = BLAKE2S_HASH_SIZE
++};
++
++#define noise_encrypted_len(plain_len) ((plain_len) + NOISE_AUTHTAG_LEN)
++
++enum cookie_values {
++      COOKIE_SECRET_MAX_AGE = 2 * 60, /* presumably seconds -- confirm against wg_birthdate_has_expired() */
++      COOKIE_SECRET_LATENCY = 5, /* same unit; subtracted from the max age on the sending side */
++      COOKIE_NONCE_LEN = XCHACHA20POLY1305_NONCE_SIZE,
++      COOKIE_LEN = 16 /* bytes; also the size of mac1 and mac2 */
++};
++
++enum counter_values {
++      COUNTER_BITS_TOTAL = 2048,
++      COUNTER_REDUNDANT_BITS = BITS_PER_LONG, /* one machine word of bits */
++      COUNTER_WINDOW_SIZE = COUNTER_BITS_TOTAL - COUNTER_REDUNDANT_BITS /* 2048 minus one word */
++};
++
++enum limits { /* NOTE(review): time values look like seconds unless named *_JIFFIES -- confirm at use sites */
++      REKEY_AFTER_MESSAGES = 1ULL << 60,
++      REJECT_AFTER_MESSAGES = U64_MAX - COUNTER_WINDOW_SIZE - 1,
++      REKEY_TIMEOUT = 5,
++      REKEY_TIMEOUT_JITTER_MAX_JIFFIES = HZ / 3, /* a third of a second, in jiffies */
++      REKEY_AFTER_TIME = 120,
++      REJECT_AFTER_TIME = 180,
++      INITIATIONS_PER_SECOND = 50,
++      MAX_PEERS_PER_DEVICE = 1U << 20,
++      KEEPALIVE_TIMEOUT = 10,
++      MAX_TIMER_HANDSHAKES = 90 / REKEY_TIMEOUT, /* 18 with REKEY_TIMEOUT = 5 */
++      MAX_QUEUED_INCOMING_HANDSHAKES = 4096, /* TODO: replace this with DQL */
++      MAX_STAGED_PACKETS = 128, /* cap on a peer's staged_packet_queue, enforced in wg_xmit() */
++      MAX_QUEUED_PACKETS = 1024 /* TODO: replace this with DQL */
++};
++
++enum message_type { /* values stored in message_header.type */
++      MESSAGE_INVALID = 0,
++      MESSAGE_HANDSHAKE_INITIATION = 1,
++      MESSAGE_HANDSHAKE_RESPONSE = 2,
++      MESSAGE_HANDSHAKE_COOKIE = 3, /* written by wg_cookie_message_create() */
++      MESSAGE_DATA = 4
++};
++
++struct message_header {
++      /* The actual layout of this that we want is:
++       * u8 type
++       * u8 reserved_zero[3]
++       *
++       * But it turns out that by encoding this as little endian,
++       * we achieve the same thing, and it makes checking faster.
++       */
++      __le32 type; /* an enum message_type value, stored with cpu_to_le32() */
++};
++
++struct message_macs {
++      u8 mac1[COOKIE_LEN]; /* covers the message bytes before this field */
++      u8 mac2[COOKIE_LEN]; /* covers everything before it, mac1 included */
++};
++
++struct message_handshake_initiation {
++      struct message_header header;
++      __le32 sender_index;
++      u8 unencrypted_ephemeral[NOISE_PUBLIC_KEY_LEN];
++      u8 encrypted_static[noise_encrypted_len(NOISE_PUBLIC_KEY_LEN)]; /* public key plus auth tag */
++      u8 encrypted_timestamp[noise_encrypted_len(NOISE_TIMESTAMP_LEN)]; /* timestamp plus auth tag */
++      struct message_macs macs; /* must stay last: cookie.c locates macs at the end of the message */
++};
++
++struct message_handshake_response {
++      struct message_header header;
++      __le32 sender_index;
++      __le32 receiver_index;
++      u8 unencrypted_ephemeral[NOISE_PUBLIC_KEY_LEN];
++      u8 encrypted_nothing[noise_encrypted_len(0)];
++      struct message_macs macs;
++};
++
++struct message_handshake_cookie {
++      struct message_header header;
++      __le32 receiver_index;
++      u8 nonce[COOKIE_NONCE_LEN];
++      u8 encrypted_cookie[noise_encrypted_len(COOKIE_LEN)];
++};
++
++struct message_data {
++      struct message_header header;
++      __le32 key_idx;
++      __le64 counter;
++      u8 encrypted_data[];
++};
++
++#define message_data_len(plain_len) \
++      (noise_encrypted_len(plain_len) + sizeof(struct message_data))
++
++enum message_alignments {
++      MESSAGE_PADDING_MULTIPLE = 16,
++      MESSAGE_MINIMUM_LENGTH = message_data_len(0)
++};
++
++#define SKB_HEADER_LEN                                       \
++      (max(sizeof(struct iphdr), sizeof(struct ipv6hdr)) + \
++       sizeof(struct udphdr) + NET_SKB_PAD)
++#define DATA_PACKET_HEAD_ROOM \
++      ALIGN(sizeof(struct message_data) + SKB_HEADER_LEN, 4)
++
++enum { HANDSHAKE_DSCP = 0x88 /* AF41, plus 00 ECN */ };
++
++#endif /* _WG_MESSAGES_H */
+--- /dev/null
++++ b/drivers/net/wireguard/netlink.c
+@@ -0,0 +1,648 @@
++// SPDX-License-Identifier: GPL-2.0
++/*
++ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
++ */
++
++#include "netlink.h"
++#include "device.h"
++#include "peer.h"
++#include "socket.h"
++#include "queueing.h"
++#include "messages.h"
++
++#include <uapi/linux/wireguard.h>
++
++#include <linux/if.h>
++#include <net/genetlink.h>
++#include <net/sock.h>
++#include <crypto/algapi.h>
++
++static struct genl_family genl_family;
++
++static const struct nla_policy device_policy[WGDEVICE_A_MAX + 1] = {
++      [WGDEVICE_A_IFINDEX]            = { .type = NLA_U32 },
++      [WGDEVICE_A_IFNAME]             = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 },
++      [WGDEVICE_A_PRIVATE_KEY]        = { .type = NLA_EXACT_LEN, .len = NOISE_PUBLIC_KEY_LEN },
++      [WGDEVICE_A_PUBLIC_KEY]         = { .type = NLA_EXACT_LEN, .len = NOISE_PUBLIC_KEY_LEN },
++      [WGDEVICE_A_FLAGS]              = { .type = NLA_U32 },
++      [WGDEVICE_A_LISTEN_PORT]        = { .type = NLA_U16 },
++      [WGDEVICE_A_FWMARK]             = { .type = NLA_U32 },
++      [WGDEVICE_A_PEERS]              = { .type = NLA_NESTED }
++};
++
++static const struct nla_policy peer_policy[WGPEER_A_MAX + 1] = {
++      [WGPEER_A_PUBLIC_KEY]                           = { .type = NLA_EXACT_LEN, .len = NOISE_PUBLIC_KEY_LEN },
++      [WGPEER_A_PRESHARED_KEY]                        = { .type = NLA_EXACT_LEN, .len = NOISE_SYMMETRIC_KEY_LEN },
++      [WGPEER_A_FLAGS]                                = { .type = NLA_U32 },
++      [WGPEER_A_ENDPOINT]                             = { .type = NLA_MIN_LEN, .len = sizeof(struct sockaddr) },
++      [WGPEER_A_PERSISTENT_KEEPALIVE_INTERVAL]        = { .type = NLA_U16 },
++      [WGPEER_A_LAST_HANDSHAKE_TIME]                  = { .type = NLA_EXACT_LEN, .len = sizeof(struct __kernel_timespec) },
++      [WGPEER_A_RX_BYTES]                             = { .type = NLA_U64 },
++      [WGPEER_A_TX_BYTES]                             = { .type = NLA_U64 },
++      [WGPEER_A_ALLOWEDIPS]                           = { .type = NLA_NESTED },
++      [WGPEER_A_PROTOCOL_VERSION]                     = { .type = NLA_U32 }
++};
++
++static const struct nla_policy allowedip_policy[WGALLOWEDIP_A_MAX + 1] = {
++      [WGALLOWEDIP_A_FAMILY]          = { .type = NLA_U16 },
++      [WGALLOWEDIP_A_IPADDR]          = { .type = NLA_MIN_LEN, .len = sizeof(struct in_addr) },
++      [WGALLOWEDIP_A_CIDR_MASK]       = { .type = NLA_U8 }
++};
++
++static struct wg_device *lookup_interface(struct nlattr **attrs,
++                                        struct sk_buff *skb)
++{
++      struct net_device *dev = NULL;
++
++      if (!attrs[WGDEVICE_A_IFINDEX] == !attrs[WGDEVICE_A_IFNAME])
++              return ERR_PTR(-EBADR); /* need exactly one of the two */
++      if (attrs[WGDEVICE_A_IFINDEX])
++              dev = dev_get_by_index(sock_net(skb->sk),
++                                     nla_get_u32(attrs[WGDEVICE_A_IFINDEX]));
++      else if (attrs[WGDEVICE_A_IFNAME])
++              dev = dev_get_by_name(sock_net(skb->sk),
++                                    nla_data(attrs[WGDEVICE_A_IFNAME]));
++      if (!dev)
++              return ERR_PTR(-ENODEV);
++      if (!dev->rtnl_link_ops || !dev->rtnl_link_ops->kind ||
++          strcmp(dev->rtnl_link_ops->kind, KBUILD_MODNAME)) {
++              dev_put(dev); /* link kind is not ours; drop the reference */
++              return ERR_PTR(-EOPNOTSUPP);
++      }
++      return netdev_priv(dev); /* callers dev_put() wg->dev when done */
++}
++
++static int get_allowedips(struct sk_buff *skb, const u8 *ip, u8 cidr,
++                        int family)
++{
++      struct nlattr *allowedip_nest;
++
++      allowedip_nest = nla_nest_start(skb, 0);
++      if (!allowedip_nest)
++              return -EMSGSIZE;
++
++      if (nla_put_u8(skb, WGALLOWEDIP_A_CIDR_MASK, cidr) ||
++          nla_put_u16(skb, WGALLOWEDIP_A_FAMILY, family) ||
++          nla_put(skb, WGALLOWEDIP_A_IPADDR, family == AF_INET6 ?
++                  sizeof(struct in6_addr) : sizeof(struct in_addr), ip)) {
++              nla_nest_cancel(skb, allowedip_nest);
++              return -EMSGSIZE;
++      }
++
++      nla_nest_end(skb, allowedip_nest);
++      return 0;
++}
++
++struct dump_ctx {
++      struct wg_device *wg;
++      struct wg_peer *next_peer;
++      u64 allowedips_seq;
++      struct allowedips_node *next_allowedip;
++};
++
++#define DUMP_CTX(cb) ((struct dump_ctx *)(cb)->args)
++
++static int
++get_peer(struct wg_peer *peer, struct sk_buff *skb, struct dump_ctx *ctx)
++{
++      /* Dump one peer into skb; ctx->next_allowedip resumes a partial dump. */
++      struct nlattr *allowedips_nest, *peer_nest = nla_nest_start(skb, 0);
++      struct allowedips_node *allowedips_node = ctx->next_allowedip;
++      bool fail;
++
++      if (!peer_nest)
++              return -EMSGSIZE;
++
++      down_read(&peer->handshake.lock);
++      fail = nla_put(skb, WGPEER_A_PUBLIC_KEY, NOISE_PUBLIC_KEY_LEN,
++                     peer->handshake.remote_static);
++      up_read(&peer->handshake.lock);
++      if (fail)
++              goto err;
++
++      if (!allowedips_node) { /* not resuming: emit the peer attributes */
++              const struct __kernel_timespec last_handshake = {
++                      .tv_sec = peer->walltime_last_handshake.tv_sec,
++                      .tv_nsec = peer->walltime_last_handshake.tv_nsec
++              };
++
++              down_read(&peer->handshake.lock);
++              fail = nla_put(skb, WGPEER_A_PRESHARED_KEY,
++                             NOISE_SYMMETRIC_KEY_LEN,
++                             peer->handshake.preshared_key);
++              up_read(&peer->handshake.lock);
++              if (fail)
++                      goto err;
++
++              if (nla_put(skb, WGPEER_A_LAST_HANDSHAKE_TIME,
++                          sizeof(last_handshake), &last_handshake) ||
++                  nla_put_u16(skb, WGPEER_A_PERSISTENT_KEEPALIVE_INTERVAL,
++                              peer->persistent_keepalive_interval) ||
++                  nla_put_u64_64bit(skb, WGPEER_A_TX_BYTES, peer->tx_bytes,
++                                    WGPEER_A_UNSPEC) ||
++                  nla_put_u64_64bit(skb, WGPEER_A_RX_BYTES, peer->rx_bytes,
++                                    WGPEER_A_UNSPEC) ||
++                  nla_put_u32(skb, WGPEER_A_PROTOCOL_VERSION, 1))
++                      goto err;
++
++              read_lock_bh(&peer->endpoint_lock);
++              if (peer->endpoint.addr.sa_family == AF_INET)
++                      fail = nla_put(skb, WGPEER_A_ENDPOINT,
++                                     sizeof(peer->endpoint.addr4),
++                                     &peer->endpoint.addr4);
++              else if (peer->endpoint.addr.sa_family == AF_INET6)
++                      fail = nla_put(skb, WGPEER_A_ENDPOINT,
++                                     sizeof(peer->endpoint.addr6),
++                                     &peer->endpoint.addr6);
++              read_unlock_bh(&peer->endpoint_lock);
++              if (fail)
++                      goto err;
++              allowedips_node =
++                      list_first_entry_or_null(&peer->allowedips_list,
++                                      struct allowedips_node, peer_list);
++      }
++      if (!allowedips_node)
++              goto no_allowedips;
++      if (!ctx->allowedips_seq)
++              ctx->allowedips_seq = peer->device->peer_allowedips.seq;
++      else if (ctx->allowedips_seq != peer->device->peer_allowedips.seq)
++              goto no_allowedips; /* seq changed since it was recorded */
++
++      allowedips_nest = nla_nest_start(skb, WGPEER_A_ALLOWEDIPS);
++      if (!allowedips_nest)
++              goto err;
++
++      list_for_each_entry_from(allowedips_node, &peer->allowedips_list,
++                               peer_list) {
++              u8 cidr, ip[16] __aligned(__alignof(u64));
++              int family;
++
++              family = wg_allowedips_read_node(allowedips_node, ip, &cidr);
++              if (get_allowedips(skb, ip, cidr, family)) {
++                      nla_nest_end(skb, allowedips_nest); /* keep what fit */
++                      nla_nest_end(skb, peer_nest);
++                      ctx->next_allowedip = allowedips_node; /* resume here */
++                      return -EMSGSIZE;
++              }
++      }
++      nla_nest_end(skb, allowedips_nest);
++no_allowedips:
++      nla_nest_end(skb, peer_nest);
++      ctx->next_allowedip = NULL;
++      ctx->allowedips_seq = 0;
++      return 0;
++err:
++      nla_nest_cancel(skb, peer_nest); /* discard the partial peer nest */
++      return -EMSGSIZE;
++}
++
++static int wg_get_device_start(struct netlink_callback *cb)
++{
++      struct nlattr **attrs = genl_family_attrbuf(&genl_family);
++      struct wg_device *wg;
++      int ret;
++
++      ret = nlmsg_parse(cb->nlh, GENL_HDRLEN + genl_family.hdrsize, attrs,
++                        genl_family.maxattr, device_policy, NULL);
++      if (ret < 0)
++              return ret;
++      wg = lookup_interface(attrs, cb->skb);
++      if (IS_ERR(wg))
++              return PTR_ERR(wg);
++      DUMP_CTX(cb)->wg = wg; /* dev ref dropped in wg_get_device_done() */
++      return 0;
++}
++
++static int wg_get_device_dump(struct sk_buff *skb, struct netlink_callback *cb)
++{
++      struct wg_peer *peer, *next_peer_cursor;
++      struct dump_ctx *ctx = DUMP_CTX(cb);
++      struct wg_device *wg = ctx->wg;
++      struct nlattr *peers_nest;
++      int ret = -EMSGSIZE;
++      bool done = true;
++      void *hdr;
++
++      rtnl_lock();
++      mutex_lock(&wg->device_update_lock);
++      cb->seq = wg->device_update_gen;
++      next_peer_cursor = ctx->next_peer;
++
++      hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
++                        &genl_family, NLM_F_MULTI, WG_CMD_GET_DEVICE);
++      if (!hdr)
++              goto out;
++      genl_dump_check_consistent(cb, hdr);
++
++      if (!ctx->next_peer) { /* no peer cursor yet: emit device attributes */
++              if (nla_put_u16(skb, WGDEVICE_A_LISTEN_PORT,
++                              wg->incoming_port) ||
++                  nla_put_u32(skb, WGDEVICE_A_FWMARK, wg->fwmark) ||
++                  nla_put_u32(skb, WGDEVICE_A_IFINDEX, wg->dev->ifindex) ||
++                  nla_put_string(skb, WGDEVICE_A_IFNAME, wg->dev->name))
++                      goto out;
++
++              down_read(&wg->static_identity.lock);
++              if (wg->static_identity.has_identity) {
++                      if (nla_put(skb, WGDEVICE_A_PRIVATE_KEY,
++                                  NOISE_PUBLIC_KEY_LEN,
++                                  wg->static_identity.static_private) ||
++                          nla_put(skb, WGDEVICE_A_PUBLIC_KEY,
++                                  NOISE_PUBLIC_KEY_LEN,
++                                  wg->static_identity.static_public)) {
++                              up_read(&wg->static_identity.lock);
++                              goto out;
++                      }
++              }
++              up_read(&wg->static_identity.lock);
++      }
++
++      peers_nest = nla_nest_start(skb, WGDEVICE_A_PEERS);
++      if (!peers_nest)
++              goto out;
++      ret = 0;
++      /* If the last cursor was removed via list_del_init in peer_remove, then
++       * we just treat this the same as there being no more peers left. The
++       * reason is that seq_nr should indicate to userspace that this isn't a
++       * coherent dump anyway, so they'll try again.
++       */
++      if (list_empty(&wg->peer_list) ||
++          (ctx->next_peer && list_empty(&ctx->next_peer->peer_list))) {
++              nla_nest_cancel(skb, peers_nest);
++              goto out;
++      }
++      lockdep_assert_held(&wg->device_update_lock);
++      peer = list_prepare_entry(ctx->next_peer, &wg->peer_list, peer_list);
++      list_for_each_entry_continue(peer, &wg->peer_list, peer_list) {
++              if (get_peer(peer, skb, ctx)) {
++                      done = false; /* this peer did not (fully) fit */
++                      break;
++              }
++              next_peer_cursor = peer;
++      }
++      nla_nest_end(skb, peers_nest);
++
++out:
++      if (!ret && !done && next_peer_cursor)
++              wg_peer_get(next_peer_cursor); /* pin the new cursor */
++      wg_peer_put(ctx->next_peer); /* release the previous cursor */
++      mutex_unlock(&wg->device_update_lock);
++      rtnl_unlock();
++
++      if (ret) {
++              genlmsg_cancel(skb, hdr);
++              return ret;
++      }
++      genlmsg_end(skb, hdr);
++      if (done) {
++              ctx->next_peer = NULL;
++              return 0;
++      }
++      ctx->next_peer = next_peer_cursor;
++      return skb->len;
++
++      /* At this point, we cannot safely zero out the private key material
++       * ourselves after it has been used. Doing so will need an additional
++       * kernel API for marking skbs as zero_on_free.
++       */
++}
++
++static int wg_get_device_done(struct netlink_callback *cb)
++{
++      struct dump_ctx *ctx = DUMP_CTX(cb);
++
++      if (ctx->wg)
++              dev_put(ctx->wg->dev); /* ref obtained via lookup_interface() */
++      wg_peer_put(ctx->next_peer); /* drop the saved cursor, if any */
++      return 0;
++}
++
++static int set_port(struct wg_device *wg, u16 port)
++{
++      struct wg_peer *peer;
++
++      if (wg->incoming_port == port)
++              return 0; /* unchanged: nothing to do */
++      list_for_each_entry(peer, &wg->peer_list, peer_list)
++              wg_socket_clear_peer_endpoint_src(peer);
++      if (!netif_running(wg->dev)) {
++              wg->incoming_port = port; /* not running: only record it */
++              return 0;
++      }
++      return wg_socket_init(wg, port);
++}
++
++static int set_allowedip(struct wg_peer *peer, struct nlattr **attrs)
++{
++      int ret = -EINVAL;
++      u16 family;
++      u8 cidr;
++
++      if (!attrs[WGALLOWEDIP_A_FAMILY] || !attrs[WGALLOWEDIP_A_IPADDR] ||
++          !attrs[WGALLOWEDIP_A_CIDR_MASK])
++              return ret;
++      family = nla_get_u16(attrs[WGALLOWEDIP_A_FAMILY]);
++      cidr = nla_get_u8(attrs[WGALLOWEDIP_A_CIDR_MASK]);
++
++      if (family == AF_INET && cidr <= 32 &&
++          nla_len(attrs[WGALLOWEDIP_A_IPADDR]) == sizeof(struct in_addr))
++              ret = wg_allowedips_insert_v4(
++                      &peer->device->peer_allowedips,
++                      nla_data(attrs[WGALLOWEDIP_A_IPADDR]), cidr, peer,
++                      &peer->device->device_update_lock);
++      else if (family == AF_INET6 && cidr <= 128 &&
++               nla_len(attrs[WGALLOWEDIP_A_IPADDR]) == sizeof(struct in6_addr))
++              ret = wg_allowedips_insert_v6(
++                      &peer->device->peer_allowedips,
++                      nla_data(attrs[WGALLOWEDIP_A_IPADDR]), cidr, peer,
++                      &peer->device->device_update_lock);
++
++      return ret;
++}
++
++static int set_peer(struct wg_device *wg, struct nlattr **attrs)
++{
++      u8 *public_key = NULL, *preshared_key = NULL;
++      struct wg_peer *peer = NULL;
++      u32 flags = 0;
++      int ret;
++
++      ret = -EINVAL;
++      if (attrs[WGPEER_A_PUBLIC_KEY] &&
++          nla_len(attrs[WGPEER_A_PUBLIC_KEY]) == NOISE_PUBLIC_KEY_LEN)
++              public_key = nla_data(attrs[WGPEER_A_PUBLIC_KEY]);
++      else
++              goto out; /* the public key is mandatory */
++      if (attrs[WGPEER_A_PRESHARED_KEY] &&
++          nla_len(attrs[WGPEER_A_PRESHARED_KEY]) == NOISE_SYMMETRIC_KEY_LEN)
++              preshared_key = nla_data(attrs[WGPEER_A_PRESHARED_KEY]);
++
++      if (attrs[WGPEER_A_FLAGS])
++              flags = nla_get_u32(attrs[WGPEER_A_FLAGS]);
++      ret = -EOPNOTSUPP;
++      if (flags & ~__WGPEER_F_ALL)
++              goto out;
++
++      ret = -EPFNOSUPPORT;
++      if (attrs[WGPEER_A_PROTOCOL_VERSION]) {
++              if (nla_get_u32(attrs[WGPEER_A_PROTOCOL_VERSION]) != 1)
++                      goto out;
++      }
++
++      peer = wg_pubkey_hashtable_lookup(wg->peer_hashtable,
++                                        nla_data(attrs[WGPEER_A_PUBLIC_KEY]));
++      ret = 0;
++      if (!peer) { /* Peer doesn't exist yet. Add a new one. */
++              if (flags & (WGPEER_F_REMOVE_ME | WGPEER_F_UPDATE_ONLY))
++                      goto out; /* nothing to do; ret is 0 */
++
++              /* The peer is new, so there aren't allowed IPs to remove. */
++              flags &= ~WGPEER_F_REPLACE_ALLOWEDIPS;
++
++              down_read(&wg->static_identity.lock);
++              if (wg->static_identity.has_identity &&
++                  !memcmp(nla_data(attrs[WGPEER_A_PUBLIC_KEY]),
++                          wg->static_identity.static_public,
++                          NOISE_PUBLIC_KEY_LEN)) {
++                      /* We silently ignore peers that have the same public
++                       * key as the device. The reason we do it silently is
++                       * that we'd like for people to be able to reuse the
++                       * same set of API calls across peers.
++                       */
++                      up_read(&wg->static_identity.lock);
++                      ret = 0;
++                      goto out;
++              }
++              up_read(&wg->static_identity.lock);
++
++              peer = wg_peer_create(wg, public_key, preshared_key);
++              if (IS_ERR(peer)) {
++                      /* Similar to the above, if the key is invalid, we skip
++                       * it without fanfare, so that services don't need to
++                       * worry about doing key validation themselves.
++                       */
++                      ret = PTR_ERR(peer) == -EKEYREJECTED ? 0 : PTR_ERR(peer);
++                      peer = NULL;
++                      goto out;
++              }
++              /* Take additional reference, as though we've just been
++               * looked up.
++               */
++              wg_peer_get(peer);
++      }
++
++      if (flags & WGPEER_F_REMOVE_ME) {
++              wg_peer_remove(peer);
++              goto out;
++      }
++
++      if (preshared_key) {
++              down_write(&peer->handshake.lock);
++              memcpy(&peer->handshake.preshared_key, preshared_key,
++                     NOISE_SYMMETRIC_KEY_LEN);
++              up_write(&peer->handshake.lock);
++      }
++
++      if (attrs[WGPEER_A_ENDPOINT]) {
++              struct sockaddr *addr = nla_data(attrs[WGPEER_A_ENDPOINT]);
++              size_t len = nla_len(attrs[WGPEER_A_ENDPOINT]);
++
++              if ((len == sizeof(struct sockaddr_in) &&
++                   addr->sa_family == AF_INET) ||
++                  (len == sizeof(struct sockaddr_in6) &&
++                   addr->sa_family == AF_INET6)) {
++                      struct endpoint endpoint = { { { 0 } } };
++
++                      memcpy(&endpoint.addr, addr, len);
++                      wg_socket_set_peer_endpoint(peer, &endpoint);
++              } /* any other family/length pair is silently ignored */
++      }
++
++      if (flags & WGPEER_F_REPLACE_ALLOWEDIPS)
++              wg_allowedips_remove_by_peer(&wg->peer_allowedips, peer,
++                                           &wg->device_update_lock);
++
++      if (attrs[WGPEER_A_ALLOWEDIPS]) {
++              struct nlattr *attr, *allowedip[WGALLOWEDIP_A_MAX + 1];
++              int rem;
++
++              nla_for_each_nested(attr, attrs[WGPEER_A_ALLOWEDIPS], rem) {
++                      ret = nla_parse_nested(allowedip, WGALLOWEDIP_A_MAX,
++                                             attr, allowedip_policy, NULL);
++                      if (ret < 0)
++                              goto out;
++                      ret = set_allowedip(peer, allowedip);
++                      if (ret < 0)
++                              goto out;
++              }
++      }
++
++      if (attrs[WGPEER_A_PERSISTENT_KEEPALIVE_INTERVAL]) {
++              const u16 persistent_keepalive_interval = nla_get_u16(
++                              attrs[WGPEER_A_PERSISTENT_KEEPALIVE_INTERVAL]);
++              const bool send_keepalive =
++                      !peer->persistent_keepalive_interval &&
++                      persistent_keepalive_interval &&
++                      netif_running(wg->dev);
++
++              peer->persistent_keepalive_interval = persistent_keepalive_interval;
++              if (send_keepalive)
++                      wg_packet_send_keepalive(peer);
++      }
++
++      if (netif_running(wg->dev))
++              wg_packet_send_staged_packets(peer);
++
++out:
++      wg_peer_put(peer); /* peer may still be NULL on the early exits */
++      if (attrs[WGPEER_A_PRESHARED_KEY]) /* scrub the key from the request */
++              memzero_explicit(nla_data(attrs[WGPEER_A_PRESHARED_KEY]),
++                               nla_len(attrs[WGPEER_A_PRESHARED_KEY]));
++      return ret;
++}
++
++static int wg_set_device(struct sk_buff *skb, struct genl_info *info)
++{
++      struct wg_device *wg = lookup_interface(info->attrs, skb);
++      u32 flags = 0;
++      int ret;
++
++      if (IS_ERR(wg)) {
++              ret = PTR_ERR(wg);
++              goto out_nodev; /* no device ref or locks held yet */
++      }
++
++      rtnl_lock();
++      mutex_lock(&wg->device_update_lock);
++
++      if (info->attrs[WGDEVICE_A_FLAGS])
++              flags = nla_get_u32(info->attrs[WGDEVICE_A_FLAGS]);
++      ret = -EOPNOTSUPP;
++      if (flags & ~__WGDEVICE_F_ALL)
++              goto out;
++
++      ret = -EPERM;
++      if ((info->attrs[WGDEVICE_A_LISTEN_PORT] ||
++           info->attrs[WGDEVICE_A_FWMARK]) &&
++          !ns_capable(wg->creating_net->user_ns, CAP_NET_ADMIN))
++              goto out;
++
++      ++wg->device_update_gen; /* read back as cb->seq by the dump path */
++
++      if (info->attrs[WGDEVICE_A_FWMARK]) {
++              struct wg_peer *peer;
++
++              wg->fwmark = nla_get_u32(info->attrs[WGDEVICE_A_FWMARK]);
++              list_for_each_entry(peer, &wg->peer_list, peer_list)
++                      wg_socket_clear_peer_endpoint_src(peer);
++      }
++
++      if (info->attrs[WGDEVICE_A_LISTEN_PORT]) {
++              ret = set_port(wg,
++                      nla_get_u16(info->attrs[WGDEVICE_A_LISTEN_PORT]));
++              if (ret)
++                      goto out;
++      }
++
++      if (flags & WGDEVICE_F_REPLACE_PEERS)
++              wg_peer_remove_all(wg);
++
++      if (info->attrs[WGDEVICE_A_PRIVATE_KEY] &&
++          nla_len(info->attrs[WGDEVICE_A_PRIVATE_KEY]) ==
++                  NOISE_PUBLIC_KEY_LEN) {
++              u8 *private_key = nla_data(info->attrs[WGDEVICE_A_PRIVATE_KEY]);
++              u8 public_key[NOISE_PUBLIC_KEY_LEN];
++              struct wg_peer *peer, *temp;
++
++              if (!crypto_memneq(wg->static_identity.static_private,
++                                 private_key, NOISE_PUBLIC_KEY_LEN))
++                      goto skip_set_private_key; /* same key as before */
++
++              /* We remove before setting, to prevent race, which means doing
++               * two 25519-genpub ops.
++               */
++              if (curve25519_generate_public(public_key, private_key)) {
++                      peer = wg_pubkey_hashtable_lookup(wg->peer_hashtable,
++                                                        public_key);
++                      if (peer) {
++                              wg_peer_put(peer); /* drop the lookup's ref */
++                              wg_peer_remove(peer);
++                      }
++              }
++
++              down_write(&wg->static_identity.lock);
++              wg_noise_set_static_identity_private_key(&wg->static_identity,
++                                                       private_key);
++              list_for_each_entry_safe(peer, temp, &wg->peer_list,
++                                       peer_list) {
++                      if (wg_noise_precompute_static_static(peer))
++                              wg_noise_expire_current_peer_keypairs(peer);
++                      else
++                              wg_peer_remove(peer);
++              }
++              wg_cookie_checker_precompute_device_keys(&wg->cookie_checker);
++              up_write(&wg->static_identity.lock);
++      }
++skip_set_private_key:
++
++      if (info->attrs[WGDEVICE_A_PEERS]) {
++              struct nlattr *attr, *peer[WGPEER_A_MAX + 1];
++              int rem;
++
++              nla_for_each_nested(attr, info->attrs[WGDEVICE_A_PEERS], rem) {
++                      ret = nla_parse_nested(peer, WGPEER_A_MAX, attr,
++                                             peer_policy, NULL);
++                      if (ret < 0)
++                              goto out;
++                      ret = set_peer(wg, peer);
++                      if (ret < 0)
++                              goto out;
++              }
++      }
++      ret = 0;
++
++out:
++      mutex_unlock(&wg->device_update_lock);
++      rtnl_unlock();
++      dev_put(wg->dev); /* ref obtained via lookup_interface() */
++out_nodev:
++      if (info->attrs[WGDEVICE_A_PRIVATE_KEY]) /* scrub key from request */
++              memzero_explicit(nla_data(info->attrs[WGDEVICE_A_PRIVATE_KEY]),
++                               nla_len(info->attrs[WGDEVICE_A_PRIVATE_KEY]));
++      return ret;
++}
++
++static const struct genl_ops genl_ops[] = {
++      {
++              .cmd = WG_CMD_GET_DEVICE,
++              .start = wg_get_device_start,
++              .dumpit = wg_get_device_dump,
++              .done = wg_get_device_done,
++              .flags = GENL_UNS_ADMIN_PERM
++      }, {
++              .cmd = WG_CMD_SET_DEVICE,
++              .doit = wg_set_device,
++              .flags = GENL_UNS_ADMIN_PERM
++      }
++};
++
++static struct genl_family genl_family __ro_after_init = {
++      .ops = genl_ops,
++      .n_ops = ARRAY_SIZE(genl_ops),
++      .name = WG_GENL_NAME,
++      .version = WG_GENL_VERSION,
++      .maxattr = WGDEVICE_A_MAX,
++      .module = THIS_MODULE,
++      .policy = device_policy,
++      .netnsok = true
++};
++
++int __init wg_genetlink_init(void)
++{
++      return genl_register_family(&genl_family);
++}
++
++void __exit wg_genetlink_uninit(void)
++{
++      genl_unregister_family(&genl_family);
++}
+--- /dev/null
++++ b/drivers/net/wireguard/netlink.h
+@@ -0,0 +1,12 @@
++/* SPDX-License-Identifier: GPL-2.0 */
++/*
++ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
++ */
++
++#ifndef _WG_NETLINK_H
++#define _WG_NETLINK_H
++
++int wg_genetlink_init(void);
++void wg_genetlink_uninit(void);
++
++#endif /* _WG_NETLINK_H */
+--- /dev/null
++++ b/drivers/net/wireguard/noise.c
+@@ -0,0 +1,828 @@
++// SPDX-License-Identifier: GPL-2.0
++/*
++ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
++ */
++
++#include "noise.h"
++#include "device.h"
++#include "peer.h"
++#include "messages.h"
++#include "queueing.h"
++#include "peerlookup.h"
++
++#include <linux/rcupdate.h>
++#include <linux/slab.h>
++#include <linux/bitmap.h>
++#include <linux/scatterlist.h>
++#include <linux/highmem.h>
++#include <crypto/algapi.h>
++
++/* This implements Noise_IKpsk2:
++ *
++ * <- s
++ * ******
++ * -> e, es, s, ss, {t}
++ * <- e, ee, se, psk, {}
++ */
++
++static const u8 handshake_name[37] = "Noise_IKpsk2_25519_ChaChaPoly_BLAKE2s";
++static const u8 identifier_name[34] = "WireGuard v1 zx2c4 Jason@zx2c4.com";
++static u8 handshake_init_hash[NOISE_HASH_LEN] __ro_after_init;
++static u8 handshake_init_chaining_key[NOISE_HASH_LEN] __ro_after_init;
++static atomic64_t keypair_counter = ATOMIC64_INIT(0);
++
++void __init wg_noise_init(void)
++{
++      struct blake2s_state blake;
++
++      blake2s(handshake_init_chaining_key, handshake_name, NULL,
++              NOISE_HASH_LEN, sizeof(handshake_name), 0); /* hash of name */
++      blake2s_init(&blake, NOISE_HASH_LEN);
++      blake2s_update(&blake, handshake_init_chaining_key, NOISE_HASH_LEN);
++      blake2s_update(&blake, identifier_name, sizeof(identifier_name));
++      blake2s_final(&blake, handshake_init_hash); /* H(ck || identifier) */
++}
++
++/* Must hold peer->handshake.static_identity->lock */
++bool wg_noise_precompute_static_static(struct wg_peer *peer)
++{
++      bool ret = true;
++
++      down_write(&peer->handshake.lock);
++      if (peer->handshake.static_identity->has_identity)
++              ret = curve25519(
++                      peer->handshake.precomputed_static_static,
++                      peer->handshake.static_identity->static_private,
++                      peer->handshake.remote_static);
++      else
++              memset(peer->handshake.precomputed_static_static, 0,
++                     NOISE_PUBLIC_KEY_LEN);
++      up_write(&peer->handshake.lock);
++      return ret;
++}
++
++bool wg_noise_handshake_init(struct noise_handshake *handshake,
++                         struct noise_static_identity *static_identity,
++                         const u8 peer_public_key[NOISE_PUBLIC_KEY_LEN],
++                         const u8 peer_preshared_key[NOISE_SYMMETRIC_KEY_LEN],
++                         struct wg_peer *peer)
++{
++      memset(handshake, 0, sizeof(*handshake));
++      init_rwsem(&handshake->lock);
++      handshake->entry.type = INDEX_HASHTABLE_HANDSHAKE;
++      handshake->entry.peer = peer;
++      memcpy(handshake->remote_static, peer_public_key, NOISE_PUBLIC_KEY_LEN);
++      if (peer_preshared_key)
++              memcpy(handshake->preshared_key, peer_preshared_key,
++                     NOISE_SYMMETRIC_KEY_LEN);
++      handshake->static_identity = static_identity;
++      handshake->state = HANDSHAKE_ZEROED;
++      return wg_noise_precompute_static_static(peer);
++}
++
++static void handshake_zero(struct noise_handshake *handshake)
++{
++      memset(&handshake->ephemeral_private, 0, NOISE_PUBLIC_KEY_LEN);
++      memset(&handshake->remote_ephemeral, 0, NOISE_PUBLIC_KEY_LEN);
++      memset(&handshake->hash, 0, NOISE_HASH_LEN);
++      memset(&handshake->chaining_key, 0, NOISE_HASH_LEN);
++      handshake->remote_index = 0;
++      handshake->state = HANDSHAKE_ZEROED;
++}
++
++void wg_noise_handshake_clear(struct noise_handshake *handshake)
++{
++      /* Unhash the index entry and wipe the handshake in one critical
++       * section, so neither step can interleave with other lock holders.
++       */
++      down_write(&handshake->lock);
++      wg_index_hashtable_remove(
++                      handshake->entry.peer->device->index_hashtable,
++                      &handshake->entry);
++      handshake_zero(handshake);
++      up_write(&handshake->lock);
++}
++
++static struct noise_keypair *keypair_create(struct wg_peer *peer)
++{
++      struct noise_keypair *keypair = kzalloc(sizeof(*keypair), GFP_KERNEL);
++
++      if (unlikely(!keypair))
++              return NULL;
++      keypair->internal_id = atomic64_inc_return(&keypair_counter);
++      keypair->entry.type = INDEX_HASHTABLE_KEYPAIR;
++      keypair->entry.peer = peer;
++      kref_init(&keypair->refcount);
++      return keypair;
++}
++
++static void keypair_free_rcu(struct rcu_head *rcu)
++{
++      kzfree(container_of(rcu, struct noise_keypair, rcu));
++}
++
++static void keypair_free_kref(struct kref *kref)
++{ /* kref release callback: log the teardown, unindex the keypair, then queue the free. */
++      struct noise_keypair *keypair =
++              container_of(kref, struct noise_keypair, refcount);
++
++      net_dbg_ratelimited("%s: Keypair %llu destroyed for peer %llu\n",
++                          keypair->entry.peer->device->dev->name,
++                          keypair->internal_id,
++                          keypair->entry.peer->internal_id);
++      wg_index_hashtable_remove(keypair->entry.peer->device->index_hashtable,
++                                &keypair->entry);
++      call_rcu(&keypair->rcu, keypair_free_rcu); /* the memory is released later, in keypair_free_rcu() */
++}
++
++void wg_noise_keypair_put(struct noise_keypair *keypair, bool unreference_now)
++{ /* Drop one reference to @keypair; a NULL @keypair is silently ignored. */
++      if (unlikely(!keypair))
++              return;
++      if (unlikely(unreference_now)) /* unindex immediately instead of waiting for the last put */
++              wg_index_hashtable_remove(
++                      keypair->entry.peer->device->index_hashtable,
++                      &keypair->entry);
++      kref_put(&keypair->refcount, keypair_free_kref);
++}
++
++struct noise_keypair *wg_noise_keypair_get(struct noise_keypair *keypair)
++{ /* Take a reference on @keypair; returns it, or NULL if it is NULL or its count is already zero. */
++      RCU_LOCKDEP_WARN(!rcu_read_lock_bh_held(),
++              "Taking noise keypair reference without holding the RCU BH read lock");
++      if (unlikely(!keypair || !kref_get_unless_zero(&keypair->refcount)))
++              return NULL;
++      return keypair;
++}
++
++void wg_noise_keypairs_clear(struct noise_keypairs *keypairs)
++{ /* Empty the next, previous and current slots and drop their references, all under keypair_update_lock. */
++      struct noise_keypair *old;
++
++      spin_lock_bh(&keypairs->keypair_update_lock);
++
++      /* We zero the next_keypair before zeroing the others, so that
++       * wg_noise_received_with_keypair returns early before subsequent ones
++       * are zeroed.
++       */
++      old = rcu_dereference_protected(keypairs->next_keypair,
++              lockdep_is_held(&keypairs->keypair_update_lock));
++      RCU_INIT_POINTER(keypairs->next_keypair, NULL);
++      wg_noise_keypair_put(old, true); /* NULL-safe: an empty slot is a no-op */
++
++      old = rcu_dereference_protected(keypairs->previous_keypair,
++              lockdep_is_held(&keypairs->keypair_update_lock));
++      RCU_INIT_POINTER(keypairs->previous_keypair, NULL);
++      wg_noise_keypair_put(old, true);
++
++      old = rcu_dereference_protected(keypairs->current_keypair,
++              lockdep_is_held(&keypairs->keypair_update_lock));
++      RCU_INIT_POINTER(keypairs->current_keypair, NULL);
++      wg_noise_keypair_put(old, true);
++
++      spin_unlock_bh(&keypairs->keypair_update_lock);
++}
++
++void wg_noise_expire_current_peer_keypairs(struct wg_peer *peer)
++{ /* Clear @peer's handshake and mark the sending keys of its next and current keypairs invalid. */
++      struct noise_keypair *keypair;
++
++      wg_noise_handshake_clear(&peer->handshake);
++      wg_noise_reset_last_sent_handshake(&peer->last_sent_handshake);
++
++      spin_lock_bh(&peer->keypairs.keypair_update_lock);
++      keypair = rcu_dereference_protected(peer->keypairs.next_keypair,
++                      lockdep_is_held(&peer->keypairs.keypair_update_lock));
++      if (keypair)
++              keypair->sending.is_valid = false;
++      keypair = rcu_dereference_protected(peer->keypairs.current_keypair,
++                      lockdep_is_held(&peer->keypairs.keypair_update_lock));
++      if (keypair)
++              keypair->sending.is_valid = false; /* receiving keys and previous_keypair are not touched here */
++      spin_unlock_bh(&peer->keypairs.keypair_update_lock);
++}
++
++static void add_new_keypair(struct noise_keypairs *keypairs,
++                          struct noise_keypair *new_keypair)
++{ /* Slot @new_keypair in as current (initiator) or next (responder) and release whatever it displaces. */
++      struct noise_keypair *previous_keypair, *next_keypair, *current_keypair;
++
++      spin_lock_bh(&keypairs->keypair_update_lock);
++      previous_keypair = rcu_dereference_protected(keypairs->previous_keypair,
++              lockdep_is_held(&keypairs->keypair_update_lock));
++      next_keypair = rcu_dereference_protected(keypairs->next_keypair,
++              lockdep_is_held(&keypairs->keypair_update_lock));
++      current_keypair = rcu_dereference_protected(keypairs->current_keypair,
++              lockdep_is_held(&keypairs->keypair_update_lock));
++      if (new_keypair->i_am_the_initiator) {
++              /* If we're the initiator, it means we've sent a handshake, and
++               * received a confirmation response, which means this new
++               * keypair can now be used.
++               */
++              if (next_keypair) {
++                      /* If there already was a next keypair pending, we
++                       * demote it to be the previous keypair, and free the
++                       * existing current. Note that this means KCI can result
++                       * in this transition. It would perhaps be more sound to
++                       * always just get rid of the unused next keypair
++                       * instead of putting it in the previous slot, but this
++                       * might be a bit less robust. Something to think about
++                       * for the future.
++                       */
++                      RCU_INIT_POINTER(keypairs->next_keypair, NULL);
++                      rcu_assign_pointer(keypairs->previous_keypair,
++                                         next_keypair);
++                      wg_noise_keypair_put(current_keypair, true);
++              } else /* If there wasn't an existing next keypair, we replace
++                      * the previous with the current one.
++                      */
++                      rcu_assign_pointer(keypairs->previous_keypair,
++                                         current_keypair);
++              /* At this point we can get rid of the old previous keypair, and
++               * set up the new keypair.
++               */
++              wg_noise_keypair_put(previous_keypair, true); /* the pointer read on entry, not the slot's new value */
++              rcu_assign_pointer(keypairs->current_keypair, new_keypair);
++      } else {
++              /* If we're the responder, it means we can't use the new keypair
++               * until we receive confirmation via the first data packet, so
++               * we get rid of the existing previous one, the possibly
++               * existing next one, and slide in the new next one.
++               */
++              rcu_assign_pointer(keypairs->next_keypair, new_keypair);
++              wg_noise_keypair_put(next_keypair, true);
++              RCU_INIT_POINTER(keypairs->previous_keypair, NULL);
++              wg_noise_keypair_put(previous_keypair, true); /* current_keypair stays in place on this path */
++      }
++      spin_unlock_bh(&keypairs->keypair_update_lock);
++}
++
++bool wg_noise_received_with_keypair(struct noise_keypairs *keypairs,
++                                  struct noise_keypair *received_keypair)
++{ /* If @received_keypair is the pending next keypair, promote it to current; returns true only then. */
++      struct noise_keypair *old_keypair;
++      bool key_is_new;
++
++      /* We first check without taking the spinlock. */
++      key_is_new = received_keypair ==
++                   rcu_access_pointer(keypairs->next_keypair);
++      if (likely(!key_is_new))
++              return false;
++
++      spin_lock_bh(&keypairs->keypair_update_lock);
++      /* After locking, we double check that things didn't change from
++       * beneath us.
++       */
++      if (unlikely(received_keypair !=
++                  rcu_dereference_protected(keypairs->next_keypair,
++                          lockdep_is_held(&keypairs->keypair_update_lock)))) {
++              spin_unlock_bh(&keypairs->keypair_update_lock);
++              return false;
++      }
++
++      /* When we've finally received the confirmation, we slide the next
++       * into the current, the current into the previous, and get rid of
++       * the old previous.
++       */
++      old_keypair = rcu_dereference_protected(keypairs->previous_keypair,
++              lockdep_is_held(&keypairs->keypair_update_lock));
++      rcu_assign_pointer(keypairs->previous_keypair,
++              rcu_dereference_protected(keypairs->current_keypair,
++                      lockdep_is_held(&keypairs->keypair_update_lock)));
++      wg_noise_keypair_put(old_keypair, true);
++      rcu_assign_pointer(keypairs->current_keypair, received_keypair);
++      RCU_INIT_POINTER(keypairs->next_keypair, NULL);
++
++      spin_unlock_bh(&keypairs->keypair_update_lock);
++      return true;
++}
++
++/* Must hold static_identity->lock */
++void wg_noise_set_static_identity_private_key(
++      struct noise_static_identity *static_identity,
++      const u8 private_key[NOISE_PUBLIC_KEY_LEN])
++{ /* Store a clamped copy of @private_key and derive the matching public key. */
++      memcpy(static_identity->static_private, private_key,
++             NOISE_PUBLIC_KEY_LEN);
++      curve25519_clamp_secret(static_identity->static_private);
++      static_identity->has_identity = curve25519_generate_public( /* has_identity is this call's return value */
++              static_identity->static_public, private_key); /* derived from the caller's buffer, not the clamped copy */
++}
++
++/* This is Hugo Krawczyk's HKDF:
++ *  - https://eprint.iacr.org/2010/264.pdf
++ *  - https://tools.ietf.org/html/rfc5869
++ */
++static void kdf(u8 *first_dst, u8 *second_dst, u8 *third_dst, const u8 *data,
++              size_t first_len, size_t second_len, size_t third_len,
++              size_t data_len, const u8 chaining_key[NOISE_HASH_LEN])
++{ /* Derive up to three outputs; the first NULL dst or zero len ends the chain at that point. */
++      u8 output[BLAKE2S_HASH_SIZE + 1]; /* one extra byte holds the expand-step counter */
++      u8 secret[BLAKE2S_HASH_SIZE];
++
++      WARN_ON(IS_ENABLED(DEBUG) &&
++              (first_len > BLAKE2S_HASH_SIZE ||
++               second_len > BLAKE2S_HASH_SIZE ||
++               third_len > BLAKE2S_HASH_SIZE ||
++               ((second_len || second_dst || third_len || third_dst) &&
++                (!first_len || !first_dst)) ||
++               ((third_len || third_dst) && (!second_len || !second_dst))));
++
++      /* Extract entropy from data into secret */
++      blake2s256_hmac(secret, data, chaining_key, data_len, NOISE_HASH_LEN);
++
++      if (!first_dst || !first_len)
++              goto out;
++
++      /* Expand first key: key = secret, data = 0x1 */
++      output[0] = 1;
++      blake2s256_hmac(output, output, secret, 1, BLAKE2S_HASH_SIZE);
++      memcpy(first_dst, output, first_len); /* first_dst may alias chaining_key: secret was already extracted */
++
++      if (!second_dst || !second_len)
++              goto out;
++
++      /* Expand second key: key = secret, data = first-key || 0x2 */
++      output[BLAKE2S_HASH_SIZE] = 2;
++      blake2s256_hmac(output, output, secret, BLAKE2S_HASH_SIZE + 1,
++                      BLAKE2S_HASH_SIZE);
++      memcpy(second_dst, output, second_len);
++
++      if (!third_dst || !third_len)
++              goto out;
++
++      /* Expand third key: key = secret, data = second-key || 0x3 */
++      output[BLAKE2S_HASH_SIZE] = 3;
++      blake2s256_hmac(output, output, secret, BLAKE2S_HASH_SIZE + 1,
++                      BLAKE2S_HASH_SIZE);
++      memcpy(third_dst, output, third_len);
++
++out:
++      /* Clear sensitive data from stack */
++      memzero_explicit(secret, BLAKE2S_HASH_SIZE);
++      memzero_explicit(output, BLAKE2S_HASH_SIZE + 1);
++}
++
++static void symmetric_key_init(struct noise_symmetric_key *key)
++{ /* Reset the counter state of @key, stamp its creation time and mark it valid; key->key is not touched. */
++      spin_lock_init(&key->counter.receive.lock);
++      atomic64_set(&key->counter.counter, 0);
++      memset(key->counter.receive.backtrack, 0,
++             sizeof(key->counter.receive.backtrack));
++      key->birthdate = ktime_get_coarse_boottime_ns();
++      key->is_valid = true;
++}
++
++static void derive_keys(struct noise_symmetric_key *first_dst,
++                      struct noise_symmetric_key *second_dst,
++                      const u8 chaining_key[NOISE_HASH_LEN])
++{ /* Fill both transport keys from @chaining_key, then initialise their counters and validity. */
++      kdf(first_dst->key, second_dst->key, NULL, NULL, /* data is NULL with data_len 0 */
++          NOISE_SYMMETRIC_KEY_LEN, NOISE_SYMMETRIC_KEY_LEN, 0, 0,
++          chaining_key);
++      symmetric_key_init(first_dst);
++      symmetric_key_init(second_dst);
++}
++
++static bool __must_check mix_dh(u8 chaining_key[NOISE_HASH_LEN],
++                              u8 key[NOISE_SYMMETRIC_KEY_LEN],
++                              const u8 private[NOISE_PUBLIC_KEY_LEN],
++                              const u8 public[NOISE_PUBLIC_KEY_LEN])
++{ /* Mix curve25519(@private, @public) into @chaining_key (and @key if non-NULL); false if curve25519() fails. */
++      u8 dh_calculation[NOISE_PUBLIC_KEY_LEN];
++
++      if (unlikely(!curve25519(dh_calculation, private, public)))
++              return false; /* chaining_key and key are left unmodified on this path */
++      kdf(chaining_key, key, NULL, dh_calculation, NOISE_HASH_LEN,
++          NOISE_SYMMETRIC_KEY_LEN, 0, NOISE_PUBLIC_KEY_LEN, chaining_key);
++      memzero_explicit(dh_calculation, NOISE_PUBLIC_KEY_LEN);
++      return true;
++}
++
++static void mix_hash(u8 hash[NOISE_HASH_LEN], const u8 *src, size_t src_len)
++{ /* In place: hash = BLAKE2s(hash || src), with a NOISE_HASH_LEN-byte digest. */
++      struct blake2s_state blake;
++
++      blake2s_init(&blake, NOISE_HASH_LEN);
++      blake2s_update(&blake, hash, NOISE_HASH_LEN);
++      blake2s_update(&blake, src, src_len);
++      blake2s_final(&blake, hash);
++}
++
++static void mix_psk(u8 chaining_key[NOISE_HASH_LEN], u8 hash[NOISE_HASH_LEN],
++                  u8 key[NOISE_SYMMETRIC_KEY_LEN],
++                  const u8 psk[NOISE_SYMMETRIC_KEY_LEN])
++{ /* Mix @psk in: updates @chaining_key, folds a derived value into @hash, and writes @key. */
++      u8 temp_hash[NOISE_HASH_LEN];
++
++      kdf(chaining_key, temp_hash, key, psk, NOISE_HASH_LEN, NOISE_HASH_LEN,
++          NOISE_SYMMETRIC_KEY_LEN, NOISE_SYMMETRIC_KEY_LEN, chaining_key);
++      mix_hash(hash, temp_hash, NOISE_HASH_LEN);
++      memzero_explicit(temp_hash, NOISE_HASH_LEN);
++}
++
++static void handshake_init(u8 chaining_key[NOISE_HASH_LEN],
++                         u8 hash[NOISE_HASH_LEN],
++                         const u8 remote_static[NOISE_PUBLIC_KEY_LEN])
++{ /* Start a transcript from the file-level initial hash/chaining key, then mix in @remote_static. */
++      memcpy(hash, handshake_init_hash, NOISE_HASH_LEN);
++      memcpy(chaining_key, handshake_init_chaining_key, NOISE_HASH_LEN);
++      mix_hash(hash, remote_static, NOISE_PUBLIC_KEY_LEN);
++}
++
++static void message_encrypt(u8 *dst_ciphertext, const u8 *src_plaintext,
++                          size_t src_len, u8 key[NOISE_SYMMETRIC_KEY_LEN],
++                          u8 hash[NOISE_HASH_LEN])
++{ /* AEAD-encrypt with @hash as associated data, then mix the ciphertext into @hash. */
++      chacha20poly1305_encrypt(dst_ciphertext, src_plaintext, src_len, hash,
++                               NOISE_HASH_LEN,
++                               0 /* Always zero for Noise_IK */, key);
++      mix_hash(hash, dst_ciphertext, noise_encrypted_len(src_len)); /* @src_len is the plaintext length */
++}
++
++static bool message_decrypt(u8 *dst_plaintext, const u8 *src_ciphertext,
++                          size_t src_len, u8 key[NOISE_SYMMETRIC_KEY_LEN],
++                          u8 hash[NOISE_HASH_LEN])
++{ /* AEAD-decrypt with @hash as associated data; on failure returns false and leaves @hash unchanged. */
++      if (!chacha20poly1305_decrypt(dst_plaintext, src_ciphertext, src_len,
++                                    hash, NOISE_HASH_LEN,
++                                    0 /* Always zero for Noise_IK */, key))
++              return false;
++      mix_hash(hash, src_ciphertext, src_len); /* here @src_len is the ciphertext length */
++      return true;
++}
++
++static void message_ephemeral(u8 ephemeral_dst[NOISE_PUBLIC_KEY_LEN],
++                            const u8 ephemeral_src[NOISE_PUBLIC_KEY_LEN],
++                            u8 chaining_key[NOISE_HASH_LEN],
++                            u8 hash[NOISE_HASH_LEN])
++{ /* Copy the ephemeral public key to @ephemeral_dst and mix it into @hash and @chaining_key. */
++      if (ephemeral_dst != ephemeral_src) /* the create paths pass the same buffer for both */
++              memcpy(ephemeral_dst, ephemeral_src, NOISE_PUBLIC_KEY_LEN);
++      mix_hash(hash, ephemeral_src, NOISE_PUBLIC_KEY_LEN);
++      kdf(chaining_key, NULL, NULL, ephemeral_src, NOISE_HASH_LEN, 0, 0,
++          NOISE_PUBLIC_KEY_LEN, chaining_key);
++}
++
++static void tai64n_now(u8 output[NOISE_TIMESTAMP_LEN])
++{ /* Write the current wall-clock time to @output as a big-endian seconds label plus nanoseconds. */
++      struct timespec64 now;
++
++      ktime_get_real_ts64(&now);
++
++      /* In order to prevent some sort of infoleak from precise timers, we
++       * round the nanoseconds part down to a multiple of the largest power
++       * of two not exceeding the minimum interval between initiations that
++       * the implementation allows anyway.
++       */
++      now.tv_nsec = ALIGN_DOWN(now.tv_nsec,
++              rounddown_pow_of_two(NSEC_PER_SEC / INITIATIONS_PER_SECOND));
++
++      /* https://cr.yp.to/libtai/tai64.html */
++      *(__be64 *)output = cpu_to_be64(0x400000000000000aULL + now.tv_sec); /* NOTE(review): assumes @output may be stored through a __be64 pointer */
++      *(__be32 *)(output + sizeof(__be64)) = cpu_to_be32(now.tv_nsec);
++}
++
++bool
++wg_noise_handshake_create_initiation(struct message_handshake_initiation *dst,
++                                   struct noise_handshake *handshake)
++{ /* Build a handshake initiation in @dst; returns false if there is no local identity or a curve25519 step fails. */
++      u8 timestamp[NOISE_TIMESTAMP_LEN];
++      u8 key[NOISE_SYMMETRIC_KEY_LEN];
++      bool ret = false;
++
++      /* We need to wait for crng _before_ taking any locks, since
++       * curve25519_generate_secret uses get_random_bytes_wait.
++       */
++      wait_for_random_bytes();
++
++      down_read(&handshake->static_identity->lock);
++      down_write(&handshake->lock);
++
++      if (unlikely(!handshake->static_identity->has_identity))
++              goto out;
++
++      dst->header.type = cpu_to_le32(MESSAGE_HANDSHAKE_INITIATION);
++
++      handshake_init(handshake->chaining_key, handshake->hash,
++                     handshake->remote_static);
++
++      /* e */
++      curve25519_generate_secret(handshake->ephemeral_private);
++      if (!curve25519_generate_public(dst->unencrypted_ephemeral,
++                                      handshake->ephemeral_private))
++              goto out;
++      message_ephemeral(dst->unencrypted_ephemeral,
++                        dst->unencrypted_ephemeral, handshake->chaining_key,
++                        handshake->hash);
++
++      /* es */
++      if (!mix_dh(handshake->chaining_key, key, handshake->ephemeral_private,
++                  handshake->remote_static))
++              goto out;
++
++      /* s */
++      message_encrypt(dst->encrypted_static,
++                      handshake->static_identity->static_public,
++                      NOISE_PUBLIC_KEY_LEN, key, handshake->hash);
++
++      /* ss */
++      kdf(handshake->chaining_key, key, NULL, /* uses the stored static-static value instead of a fresh DH */
++          handshake->precomputed_static_static, NOISE_HASH_LEN,
++          NOISE_SYMMETRIC_KEY_LEN, 0, NOISE_PUBLIC_KEY_LEN,
++          handshake->chaining_key);
++
++      /* {t} */
++      tai64n_now(timestamp);
++      message_encrypt(dst->encrypted_timestamp, timestamp,
++                      NOISE_TIMESTAMP_LEN, key, handshake->hash);
++
++      dst->sender_index = wg_index_hashtable_insert(
++              handshake->entry.peer->device->index_hashtable,
++              &handshake->entry);
++
++      handshake->state = HANDSHAKE_CREATED_INITIATION;
++      ret = true;
++
++out:
++      up_write(&handshake->lock);
++      up_read(&handshake->static_identity->lock);
++      memzero_explicit(key, NOISE_SYMMETRIC_KEY_LEN);
++      return ret;
++}
++
++struct wg_peer *
++wg_noise_handshake_consume_initiation(struct message_handshake_initiation *src,
++                                    struct wg_device *wg)
++{ /* Validate a received initiation; returns the matching peer on success, NULL otherwise. */
++      struct wg_peer *peer = NULL, *ret_peer = NULL;
++      struct noise_handshake *handshake;
++      bool replay_attack, flood_attack;
++      u8 key[NOISE_SYMMETRIC_KEY_LEN];
++      u8 chaining_key[NOISE_HASH_LEN];
++      u8 hash[NOISE_HASH_LEN];
++      u8 s[NOISE_PUBLIC_KEY_LEN];
++      u8 e[NOISE_PUBLIC_KEY_LEN];
++      u8 t[NOISE_TIMESTAMP_LEN];
++      u64 initiation_consumption;
++
++      down_read(&wg->static_identity.lock);
++      if (unlikely(!wg->static_identity.has_identity))
++              goto out;
++
++      handshake_init(chaining_key, hash, wg->static_identity.static_public);
++
++      /* e */
++      message_ephemeral(e, src->unencrypted_ephemeral, chaining_key, hash);
++
++      /* es */
++      if (!mix_dh(chaining_key, key, wg->static_identity.static_private, e))
++              goto out;
++
++      /* s */
++      if (!message_decrypt(s, src->encrypted_static,
++                           sizeof(src->encrypted_static), key, hash))
++              goto out;
++
++      /* Lookup which peer we're actually talking to */
++      peer = wg_pubkey_hashtable_lookup(wg->peer_hashtable, s);
++      if (!peer)
++              goto out;
++      handshake = &peer->handshake;
++
++      /* ss */
++      kdf(chaining_key, key, NULL, handshake->precomputed_static_static,
++          NOISE_HASH_LEN, NOISE_SYMMETRIC_KEY_LEN, 0, NOISE_PUBLIC_KEY_LEN,
++          chaining_key);
++
++      /* {t} */
++      if (!message_decrypt(t, src->encrypted_timestamp,
++                           sizeof(src->encrypted_timestamp), key, hash))
++              goto out;
++
++      down_read(&handshake->lock);
++      replay_attack = memcmp(t, handshake->latest_timestamp, /* not strictly newer than the last accepted one */
++                             NOISE_TIMESTAMP_LEN) <= 0;
++      flood_attack = (s64)handshake->last_initiation_consumption + /* too soon after the last accepted initiation */
++                             NSEC_PER_SEC / INITIATIONS_PER_SECOND >
++                     (s64)ktime_get_coarse_boottime_ns();
++      up_read(&handshake->lock);
++      if (replay_attack || flood_attack)
++              goto out;
++
++      /* Success! Copy everything to peer */
++      down_write(&handshake->lock);
++      memcpy(handshake->remote_ephemeral, e, NOISE_PUBLIC_KEY_LEN);
++      if (memcmp(t, handshake->latest_timestamp, NOISE_TIMESTAMP_LEN) > 0) /* re-checked: the read lock was dropped above */
++              memcpy(handshake->latest_timestamp, t, NOISE_TIMESTAMP_LEN);
++      memcpy(handshake->hash, hash, NOISE_HASH_LEN);
++      memcpy(handshake->chaining_key, chaining_key, NOISE_HASH_LEN);
++      handshake->remote_index = src->sender_index;
++      if ((s64)(handshake->last_initiation_consumption - /* only ever move the stored time forward */
++          (initiation_consumption = ktime_get_coarse_boottime_ns())) < 0)
++              handshake->last_initiation_consumption = initiation_consumption;
++      handshake->state = HANDSHAKE_CONSUMED_INITIATION;
++      up_write(&handshake->lock);
++      ret_peer = peer;
++
++out:
++      memzero_explicit(key, NOISE_SYMMETRIC_KEY_LEN);
++      memzero_explicit(hash, NOISE_HASH_LEN);
++      memzero_explicit(chaining_key, NOISE_HASH_LEN);
++      up_read(&wg->static_identity.lock);
++      if (!ret_peer)
++              wg_peer_put(peer); /* peer may still be NULL here - presumably wg_peer_put() accepts NULL; confirm */
++      return ret_peer;
++}
++
++bool wg_noise_handshake_create_response(struct message_handshake_response *dst,
++                                      struct noise_handshake *handshake)
++{ /* Build a handshake response in @dst; only valid when the state is HANDSHAKE_CONSUMED_INITIATION. */
++      u8 key[NOISE_SYMMETRIC_KEY_LEN];
++      bool ret = false;
++
++      /* We need to wait for crng _before_ taking any locks, since
++       * curve25519_generate_secret uses get_random_bytes_wait.
++       */
++      wait_for_random_bytes();
++
++      down_read(&handshake->static_identity->lock);
++      down_write(&handshake->lock);
++
++      if (handshake->state != HANDSHAKE_CONSUMED_INITIATION)
++              goto out;
++
++      dst->header.type = cpu_to_le32(MESSAGE_HANDSHAKE_RESPONSE);
++      dst->receiver_index = handshake->remote_index;
++
++      /* e */
++      curve25519_generate_secret(handshake->ephemeral_private);
++      if (!curve25519_generate_public(dst->unencrypted_ephemeral,
++                                      handshake->ephemeral_private))
++              goto out;
++      message_ephemeral(dst->unencrypted_ephemeral,
++                        dst->unencrypted_ephemeral, handshake->chaining_key,
++                        handshake->hash);
++
++      /* ee */
++      if (!mix_dh(handshake->chaining_key, NULL, handshake->ephemeral_private, /* NULL key: only chaining_key is updated */
++                  handshake->remote_ephemeral))
++              goto out;
++
++      /* se */
++      if (!mix_dh(handshake->chaining_key, NULL, handshake->ephemeral_private,
++                  handshake->remote_static))
++              goto out;
++
++      /* psk */
++      mix_psk(handshake->chaining_key, handshake->hash, key,
++              handshake->preshared_key);
++
++      /* {} */
++      message_encrypt(dst->encrypted_nothing, NULL, 0, key, handshake->hash); /* zero-length plaintext */
++
++      dst->sender_index = wg_index_hashtable_insert(
++              handshake->entry.peer->device->index_hashtable,
++              &handshake->entry);
++
++      handshake->state = HANDSHAKE_CREATED_RESPONSE;
++      ret = true;
++
++out:
++      up_write(&handshake->lock);
++      up_read(&handshake->static_identity->lock);
++      memzero_explicit(key, NOISE_SYMMETRIC_KEY_LEN); /* key may be uninitialised on early exits; zeroing is still harmless */
++      return ret;
++}
++
++struct wg_peer *
++wg_noise_handshake_consume_response(struct message_handshake_response *src,
++                                  struct wg_device *wg)
++{ /* Validate a received response against a pending initiation; returns the peer on success, NULL otherwise. */
++      enum noise_handshake_state state = HANDSHAKE_ZEROED;
++      struct wg_peer *peer = NULL, *ret_peer = NULL;
++      struct noise_handshake *handshake;
++      u8 key[NOISE_SYMMETRIC_KEY_LEN];
++      u8 hash[NOISE_HASH_LEN];
++      u8 chaining_key[NOISE_HASH_LEN];
++      u8 e[NOISE_PUBLIC_KEY_LEN];
++      u8 ephemeral_private[NOISE_PUBLIC_KEY_LEN];
++      u8 static_private[NOISE_PUBLIC_KEY_LEN]; /* NOTE(review): never written or read in this function, only zeroed at out: */
++
++      down_read(&wg->static_identity.lock);
++
++      if (unlikely(!wg->static_identity.has_identity))
++              goto out;
++
++      handshake = (struct noise_handshake *)wg_index_hashtable_lookup(
++              wg->index_hashtable, INDEX_HASHTABLE_HANDSHAKE,
++              src->receiver_index, &peer);
++      if (unlikely(!handshake))
++              goto out;
++
++      down_read(&handshake->lock); /* snapshot the state so the crypto below runs without the lock held */
++      state = handshake->state;
++      memcpy(hash, handshake->hash, NOISE_HASH_LEN);
++      memcpy(chaining_key, handshake->chaining_key, NOISE_HASH_LEN);
++      memcpy(ephemeral_private, handshake->ephemeral_private,
++             NOISE_PUBLIC_KEY_LEN);
++      up_read(&handshake->lock);
++
++      if (state != HANDSHAKE_CREATED_INITIATION)
++              goto fail;
++
++      /* e */
++      message_ephemeral(e, src->unencrypted_ephemeral, chaining_key, hash);
++
++      /* ee */
++      if (!mix_dh(chaining_key, NULL, ephemeral_private, e))
++              goto fail;
++
++      /* se */
++      if (!mix_dh(chaining_key, NULL, wg->static_identity.static_private, e))
++              goto fail;
++
++      /* psk */
++      mix_psk(chaining_key, hash, key, handshake->preshared_key); /* NOTE(review): preshared_key is read after handshake->lock was dropped */
++
++      /* {} */
++      if (!message_decrypt(NULL, src->encrypted_nothing,
++                           sizeof(src->encrypted_nothing), key, hash))
++              goto fail;
++
++      /* Success! Copy everything to peer */
++      down_write(&handshake->lock);
++      /* It's important to check that the state is still the same, while we
++       * have an exclusive lock.
++       */
++      if (handshake->state != state) {
++              up_write(&handshake->lock);
++              goto fail;
++      }
++      memcpy(handshake->remote_ephemeral, e, NOISE_PUBLIC_KEY_LEN);
++      memcpy(handshake->hash, hash, NOISE_HASH_LEN);
++      memcpy(handshake->chaining_key, chaining_key, NOISE_HASH_LEN);
++      handshake->remote_index = src->sender_index;
++      handshake->state = HANDSHAKE_CONSUMED_RESPONSE;
++      up_write(&handshake->lock);
++      ret_peer = peer;
++      goto out;
++
++fail:
++      wg_peer_put(peer); /* only reached after a successful lookup */
++out:
++      memzero_explicit(key, NOISE_SYMMETRIC_KEY_LEN);
++      memzero_explicit(hash, NOISE_HASH_LEN);
++      memzero_explicit(chaining_key, NOISE_HASH_LEN);
++      memzero_explicit(ephemeral_private, NOISE_PUBLIC_KEY_LEN);
++      memzero_explicit(static_private, NOISE_PUBLIC_KEY_LEN);
++      up_read(&wg->static_identity.lock);
++      return ret_peer;
++}
++
++bool wg_noise_handshake_begin_session(struct noise_handshake *handshake,
++                                    struct noise_keypairs *keypairs)
++{ /* Turn a completed handshake into a new keypair installed in @keypairs; false if the state, allocation, peer or index replace fails. */
++      struct noise_keypair *new_keypair;
++      bool ret = false;
++
++      down_write(&handshake->lock);
++      if (handshake->state != HANDSHAKE_CREATED_RESPONSE &&
++          handshake->state != HANDSHAKE_CONSUMED_RESPONSE)
++              goto out;
++
++      new_keypair = keypair_create(handshake->entry.peer);
++      if (!new_keypair)
++              goto out;
++      new_keypair->i_am_the_initiator = handshake->state ==
++                                        HANDSHAKE_CONSUMED_RESPONSE;
++      new_keypair->remote_index = handshake->remote_index;
++
++      if (new_keypair->i_am_the_initiator) /* the first derived key is the initiator's sending key */
++              derive_keys(&new_keypair->sending, &new_keypair->receiving,
++                          handshake->chaining_key);
++      else
++              derive_keys(&new_keypair->receiving, &new_keypair->sending,
++                          handshake->chaining_key);
++
++      handshake_zero(handshake); /* runs on both branches below, including the dead-peer one */
++      rcu_read_lock_bh();
++      if (likely(!READ_ONCE(container_of(handshake, struct wg_peer,
++                                         handshake)->is_dead))) {
++              add_new_keypair(keypairs, new_keypair);
++              net_dbg_ratelimited("%s: Keypair %llu created for peer %llu\n",
++                                  handshake->entry.peer->device->dev->name,
++                                  new_keypair->internal_id,
++                                  handshake->entry.peer->internal_id);
++              ret = wg_index_hashtable_replace(
++                      handshake->entry.peer->device->index_hashtable,
++                      &handshake->entry, &new_keypair->entry);
++      } else {
++              kzfree(new_keypair); /* never published, so it is freed directly rather than via kref/RCU */
++      }
++      rcu_read_unlock_bh();
++
++out:
++      up_write(&handshake->lock);
++      return ret;
++}
+--- /dev/null
++++ b/drivers/net/wireguard/noise.h
+@@ -0,0 +1,137 @@
++/* SPDX-License-Identifier: GPL-2.0 */
++/*
++ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
++ */
++#ifndef _WG_NOISE_H
++#define _WG_NOISE_H
++
++#include "messages.h"
++#include "peerlookup.h"
++
++#include <linux/types.h>
++#include <linux/spinlock.h>
++#include <linux/atomic.h>
++#include <linux/rwsem.h>
++#include <linux/mutex.h>
++#include <linux/kref.h>
++
++union noise_counter { /* two overlapping views of one counter; presumably receive side vs. send side - confirm in the packet paths */
++      struct {
++              u64 counter;
++              unsigned long backtrack[COUNTER_BITS_TOTAL / BITS_PER_LONG]; /* bitmap of COUNTER_BITS_TOTAL bits */
++              spinlock_t lock;
++      } receive;
++      atomic64_t counter; /* shares storage with receive.counter */
++};
++
++struct noise_symmetric_key { /* one direction of a session: key material plus its counter and lifetime state */
++      u8 key[NOISE_SYMMETRIC_KEY_LEN];
++      union noise_counter counter;
++      u64 birthdate; /* ktime_get_coarse_boottime_ns() value taken when the key is initialised */
++      bool is_valid;
++};
++
++struct noise_keypair { /* a sending/receiving key pair produced by one completed handshake */
++      struct index_hashtable_entry entry; /* membership in the device's index hashtable */
++      struct noise_symmetric_key sending;
++      struct noise_symmetric_key receiving;
++      __le32 remote_index; /* copied from the handshake's remote_index */
++      bool i_am_the_initiator;
++      struct kref refcount;
++      struct rcu_head rcu; /* used to defer the final free via call_rcu() */
++      u64 internal_id; /* sequence number used in debug messages */
++};
++
++struct noise_keypairs { /* a peer's three RCU-managed keypair slots */
++      struct noise_keypair __rcu *current_keypair;
++      struct noise_keypair __rcu *previous_keypair;
++      struct noise_keypair __rcu *next_keypair; /* responder-side keypair awaiting confirmation */
++      spinlock_t keypair_update_lock; /* held (BH-disabled) by every writer of the three slots */
++};
++
++struct noise_static_identity { /* the local long-term key pair */
++      u8 static_public[NOISE_PUBLIC_KEY_LEN];
++      u8 static_private[NOISE_PUBLIC_KEY_LEN]; /* stored clamped */
++      struct rw_semaphore lock; /* read-held around handshake processing */
++      bool has_identity; /* result of deriving static_public; handshakes bail out when false */
++};
++
++enum noise_handshake_state { /* progress of the handshake state machine */
++      HANDSHAKE_ZEROED, /* no exchange in progress */
++      HANDSHAKE_CREATED_INITIATION, /* we built an initiation */
++      HANDSHAKE_CONSUMED_INITIATION, /* we accepted a peer's initiation */
++      HANDSHAKE_CREATED_RESPONSE, /* we built a response; a session may begin (we are responder) */
++      HANDSHAKE_CONSUMED_RESPONSE /* we accepted a response; a session may begin (we are initiator) */
++};
++
++struct noise_handshake { /* per-peer handshake state */
++      struct index_hashtable_entry entry;
++
++      enum noise_handshake_state state;
++      u64 last_initiation_consumption; /* boottime ns of the last accepted initiation; used for flood limiting */
++
++      struct noise_static_identity *static_identity;
++
++      u8 ephemeral_private[NOISE_PUBLIC_KEY_LEN];
++      u8 remote_static[NOISE_PUBLIC_KEY_LEN];
++      u8 remote_ephemeral[NOISE_PUBLIC_KEY_LEN];
++      u8 precomputed_static_static[NOISE_PUBLIC_KEY_LEN]; /* mixed in at the "ss" step */
++
++      u8 preshared_key[NOISE_SYMMETRIC_KEY_LEN];
++
++      u8 hash[NOISE_HASH_LEN];
++      u8 chaining_key[NOISE_HASH_LEN];
++
++      u8 latest_timestamp[NOISE_TIMESTAMP_LEN]; /* newest accepted initiation timestamp; used for replay detection */
++      __le32 remote_index;
++
++      /* Protects all members except those that are immutable after
++       * wg_noise_handshake_init(): remote_static, precomputed_static_static, static_identity.
++       */
++      struct rw_semaphore lock;
++};
++
++struct wg_device;
++
++void wg_noise_init(void);
++bool wg_noise_handshake_init(struct noise_handshake *handshake,
++                         struct noise_static_identity *static_identity,
++                         const u8 peer_public_key[NOISE_PUBLIC_KEY_LEN],
++                         const u8 peer_preshared_key[NOISE_SYMMETRIC_KEY_LEN],
++                         struct wg_peer *peer);
++void wg_noise_handshake_clear(struct noise_handshake *handshake);
++static inline void wg_noise_reset_last_sent_handshake(atomic64_t *handshake_ns)
++{ /* Back-date *handshake_ns to (REKEY_TIMEOUT + 1) seconds before the current boottime. */
++      atomic64_set(handshake_ns, ktime_get_coarse_boottime_ns() -
++                                     (u64)(REKEY_TIMEOUT + 1) * NSEC_PER_SEC); /* presumably so the next initiation is not held back - confirm at the caller */
++}
++
++void wg_noise_keypair_put(struct noise_keypair *keypair, bool unreference_now);
++struct noise_keypair *wg_noise_keypair_get(struct noise_keypair *keypair);
++void wg_noise_keypairs_clear(struct noise_keypairs *keypairs);
++bool wg_noise_received_with_keypair(struct noise_keypairs *keypairs,
++                                  struct noise_keypair *received_keypair);
++void wg_noise_expire_current_peer_keypairs(struct wg_peer *peer);
++
++void wg_noise_set_static_identity_private_key(
++      struct noise_static_identity *static_identity,
++      const u8 private_key[NOISE_PUBLIC_KEY_LEN]);
++bool wg_noise_precompute_static_static(struct wg_peer *peer);
++
++bool
++wg_noise_handshake_create_initiation(struct message_handshake_initiation *dst,
++                                   struct noise_handshake *handshake);
++struct wg_peer *
++wg_noise_handshake_consume_initiation(struct message_handshake_initiation *src,
++                                    struct wg_device *wg);
++
++bool wg_noise_handshake_create_response(struct message_handshake_response *dst,
++                                      struct noise_handshake *handshake);
++struct wg_peer *
++wg_noise_handshake_consume_response(struct message_handshake_response *src,
++                                  struct wg_device *wg);
++
++bool wg_noise_handshake_begin_session(struct noise_handshake *handshake,
++                                    struct noise_keypairs *keypairs);
++
++#endif /* _WG_NOISE_H */
+--- /dev/null
++++ b/drivers/net/wireguard/peer.c
+@@ -0,0 +1,240 @@
++// SPDX-License-Identifier: GPL-2.0
++/*
++ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
++ */
++
++#include "peer.h"
++#include "device.h"
++#include "queueing.h"
++#include "timers.h"
++#include "peerlookup.h"
++#include "noise.h"
++
++#include <linux/kref.h>
++#include <linux/lockdep.h>
++#include <linux/rcupdate.h>
++#include <linux/list.h>
++
++static atomic64_t peer_counter = ATOMIC64_INIT(0);
++
++struct wg_peer *wg_peer_create(struct wg_device *wg,
++                             const u8 public_key[NOISE_PUBLIC_KEY_LEN],
++                             const u8 preshared_key[NOISE_SYMMETRIC_KEY_LEN])
++{
++      struct wg_peer *peer;
++      int ret = -ENOMEM;
++
++      lockdep_assert_held(&wg->device_update_lock);
++
++      if (wg->num_peers >= MAX_PEERS_PER_DEVICE)
++              return ERR_PTR(ret);
++
++      peer = kzalloc(sizeof(*peer), GFP_KERNEL);
++      if (unlikely(!peer))
++              return ERR_PTR(ret);
++      peer->device = wg;
++
++      if (!wg_noise_handshake_init(&peer->handshake, &wg->static_identity,
++                                   public_key, preshared_key, peer)) {
++              ret = -EKEYREJECTED;
++              goto err_1;
++      }
++      if (dst_cache_init(&peer->endpoint_cache, GFP_KERNEL))
++              goto err_1;
++      if (wg_packet_queue_init(&peer->tx_queue, wg_packet_tx_worker, false,
++                               MAX_QUEUED_PACKETS))
++              goto err_2;
++      if (wg_packet_queue_init(&peer->rx_queue, NULL, false,
++                               MAX_QUEUED_PACKETS))
++              goto err_3;
++
++      peer->internal_id = atomic64_inc_return(&peer_counter);
++      peer->serial_work_cpu = nr_cpumask_bits;
++      wg_cookie_init(&peer->latest_cookie);
++      wg_timers_init(peer);
++      wg_cookie_checker_precompute_peer_keys(peer);
++      spin_lock_init(&peer->keypairs.keypair_update_lock);
++      INIT_WORK(&peer->transmit_handshake_work,
++                wg_packet_handshake_send_worker);
++      rwlock_init(&peer->endpoint_lock);
++      kref_init(&peer->refcount);
++      skb_queue_head_init(&peer->staged_packet_queue);
++      wg_noise_reset_last_sent_handshake(&peer->last_sent_handshake);
++      set_bit(NAPI_STATE_NO_BUSY_POLL, &peer->napi.state);
++      netif_napi_add(wg->dev, &peer->napi, wg_packet_rx_poll,
++                     NAPI_POLL_WEIGHT);
++      napi_enable(&peer->napi);
++      list_add_tail(&peer->peer_list, &wg->peer_list);
++      INIT_LIST_HEAD(&peer->allowedips_list);
++      wg_pubkey_hashtable_add(wg->peer_hashtable, peer);
++      ++wg->num_peers;
++      pr_debug("%s: Peer %llu created\n", wg->dev->name, peer->internal_id);
++      return peer;
++
++err_3:
++      wg_packet_queue_free(&peer->tx_queue, false);
++err_2:
++      dst_cache_destroy(&peer->endpoint_cache);
++err_1:
++      kfree(peer);
++      return ERR_PTR(ret);
++}
++
++struct wg_peer *wg_peer_get_maybe_zero(struct wg_peer *peer)
++{
++      RCU_LOCKDEP_WARN(!rcu_read_lock_bh_held(),
++                       "Taking peer reference without holding the RCU read lock");
++      if (unlikely(!peer || !kref_get_unless_zero(&peer->refcount)))
++              return NULL;
++      return peer;
++}
++
++static void peer_make_dead(struct wg_peer *peer)
++{
++      /* Remove from configuration-time lookup structures. */
++      list_del_init(&peer->peer_list);
++      wg_allowedips_remove_by_peer(&peer->device->peer_allowedips, peer,
++                                   &peer->device->device_update_lock);
++      wg_pubkey_hashtable_remove(peer->device->peer_hashtable, peer);
++
++      /* Mark as dead, so that we don't allow jumping contexts after. */
++      WRITE_ONCE(peer->is_dead, true);
++
++      /* The caller must now synchronize_rcu() for this to take effect. */
++}
++
++static void peer_remove_after_dead(struct wg_peer *peer)
++{
++      WARN_ON(!peer->is_dead);
++
++      /* No more keypairs can be created for this peer, since is_dead protects
++       * add_new_keypair, so we can now destroy existing ones.
++       */
++      wg_noise_keypairs_clear(&peer->keypairs);
++
++      /* Destroy all ongoing timers that were in-flight at the beginning of
++       * this function.
++       */
++      wg_timers_stop(peer);
++
++      /* The transition between packet encryption/decryption queues isn't
++       * guarded by is_dead, but each reference's life is strictly bounded by
++       * two generations: once for parallel crypto and once for serial
++       * ingestion, so we can simply flush twice, and be sure that we no
++       * longer have references inside these queues.
++       */
++
++      /* a) For encrypt/decrypt. */
++      flush_workqueue(peer->device->packet_crypt_wq);
++      /* b.1) For send (but not receive, since that's napi). */
++      flush_workqueue(peer->device->packet_crypt_wq);
++      /* b.2.1) For receive (but not send, since that's wq). */
++      napi_disable(&peer->napi);
++      /* b.2.2) It's now safe to remove the napi struct, which must be done
++       * here from process context.
++       */
++      netif_napi_del(&peer->napi);
++
++      /* Ensure any workstructs we own (like transmit_handshake_work or
++       * clear_peer_work) no longer are in use.
++       */
++      flush_workqueue(peer->device->handshake_send_wq);
++
++      /* After the above flushes, a peer might still be active in a few
++       * different contexts: 1) from xmit(), before hitting is_dead and
++       * returning, 2) from wg_packet_consume_data(), before hitting is_dead
++       * and returning, 3) from wg_receive_handshake_packet() after a point
++       * where it has processed an incoming handshake packet, but where
++       * all calls to pass it off to timers fail because of is_dead. We won't
++       * have new references in (1) eventually, because we're removed from
++       * allowedips; we won't have new references in (2) eventually, because
++       * wg_index_hashtable_lookup will always return NULL, since we removed
++       * all existing keypairs and no more can be created; we won't have new
++       * references in (3) eventually, because we're removed from the pubkey
++       * hash table, which allows for a maximum of one handshake response,
++       * via the still-uncleared index hashtable entry, but not more than one,
++       * and in wg_cookie_message_consume, the lookup eventually gets a peer
++       * with a refcount of zero, so no new reference is taken.
++       */
++
++      --peer->device->num_peers;
++      wg_peer_put(peer);
++}
++
++/* We have a separate "remove" function to make sure that all active places
++ * where a peer is currently operating will eventually come to an end and not
++ * pass their reference onto another context.
++ */
++void wg_peer_remove(struct wg_peer *peer)
++{
++      if (unlikely(!peer))
++              return;
++      lockdep_assert_held(&peer->device->device_update_lock);
++
++      peer_make_dead(peer);
++      synchronize_rcu();
++      peer_remove_after_dead(peer);
++}
++
++void wg_peer_remove_all(struct wg_device *wg)
++{
++      struct wg_peer *peer, *temp;
++      LIST_HEAD(dead_peers);
++
++      lockdep_assert_held(&wg->device_update_lock);
++
++      /* Avoid having to traverse individually for each one. */
++      wg_allowedips_free(&wg->peer_allowedips, &wg->device_update_lock);
++
++      list_for_each_entry_safe(peer, temp, &wg->peer_list, peer_list) {
++              peer_make_dead(peer);
++              list_add_tail(&peer->peer_list, &dead_peers);
++      }
++      synchronize_rcu();
++      list_for_each_entry_safe(peer, temp, &dead_peers, peer_list)
++              peer_remove_after_dead(peer);
++}
++
++static void rcu_release(struct rcu_head *rcu)
++{
++      struct wg_peer *peer = container_of(rcu, struct wg_peer, rcu);
++
++      dst_cache_destroy(&peer->endpoint_cache);
++      wg_packet_queue_free(&peer->rx_queue, false);
++      wg_packet_queue_free(&peer->tx_queue, false);
++
++      /* The final zeroing takes care of clearing any remaining handshake key
++       * material and other potentially sensitive information.
++       */
++      kzfree(peer);
++}
++
++static void kref_release(struct kref *refcount)
++{
++      struct wg_peer *peer = container_of(refcount, struct wg_peer, refcount);
++
++      pr_debug("%s: Peer %llu (%pISpfsc) destroyed\n",
++               peer->device->dev->name, peer->internal_id,
++               &peer->endpoint.addr);
++
++      /* Remove ourself from dynamic runtime lookup structures, now that the
++       * last reference is gone.
++       */
++      wg_index_hashtable_remove(peer->device->index_hashtable,
++                                &peer->handshake.entry);
++
++      /* Remove any lingering packets that didn't have a chance to be
++       * transmitted.
++       */
++      wg_packet_purge_staged_packets(peer);
++
++      /* Free the memory used. */
++      call_rcu(&peer->rcu, rcu_release);
++}
++
++void wg_peer_put(struct wg_peer *peer)
++{
++      if (unlikely(!peer))
++              return;
++      kref_put(&peer->refcount, kref_release);
++}
+--- /dev/null
++++ b/drivers/net/wireguard/peer.h
+@@ -0,0 +1,83 @@
++/* SPDX-License-Identifier: GPL-2.0 */
++/*
++ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
++ */
++
++#ifndef _WG_PEER_H
++#define _WG_PEER_H
++
++#include "device.h"
++#include "noise.h"
++#include "cookie.h"
++
++#include <linux/types.h>
++#include <linux/netfilter.h>
++#include <linux/spinlock.h>
++#include <linux/kref.h>
++#include <net/dst_cache.h>
++
++struct wg_device;
++
++struct endpoint {
++      union {
++              struct sockaddr addr;
++              struct sockaddr_in addr4;
++              struct sockaddr_in6 addr6;
++      };
++      union {
++              struct {
++                      struct in_addr src4;
++                      /* Essentially the same as addr6->scope_id */
++                      int src_if4;
++              };
++              struct in6_addr src6;
++      };
++};
++
++struct wg_peer {
++      struct wg_device *device;
++      struct crypt_queue tx_queue, rx_queue;
++      struct sk_buff_head staged_packet_queue;
++      int serial_work_cpu;
++      struct noise_keypairs keypairs;
++      struct endpoint endpoint;
++      struct dst_cache endpoint_cache;
++      rwlock_t endpoint_lock;
++      struct noise_handshake handshake;
++      atomic64_t last_sent_handshake;
++      struct work_struct transmit_handshake_work, clear_peer_work;
++      struct cookie latest_cookie;
++      struct hlist_node pubkey_hash;
++      u64 rx_bytes, tx_bytes;
++      struct timer_list timer_retransmit_handshake, timer_send_keepalive;
++      struct timer_list timer_new_handshake, timer_zero_key_material;
++      struct timer_list timer_persistent_keepalive;
++      unsigned int timer_handshake_attempts;
++      u16 persistent_keepalive_interval;
++      bool timer_need_another_keepalive;
++      bool sent_lastminute_handshake;
++      struct timespec64 walltime_last_handshake;
++      struct kref refcount;
++      struct rcu_head rcu;
++      struct list_head peer_list;
++      struct list_head allowedips_list;
++      u64 internal_id;
++      struct napi_struct napi;
++      bool is_dead;
++};
++
++struct wg_peer *wg_peer_create(struct wg_device *wg,
++                             const u8 public_key[NOISE_PUBLIC_KEY_LEN],
++                             const u8 preshared_key[NOISE_SYMMETRIC_KEY_LEN]);
++
++struct wg_peer *__must_check wg_peer_get_maybe_zero(struct wg_peer *peer);
++static inline struct wg_peer *wg_peer_get(struct wg_peer *peer)
++{
++      kref_get(&peer->refcount);
++      return peer;
++}
++void wg_peer_put(struct wg_peer *peer);
++void wg_peer_remove(struct wg_peer *peer);
++void wg_peer_remove_all(struct wg_device *wg);
++
++#endif /* _WG_PEER_H */
+--- /dev/null
++++ b/drivers/net/wireguard/peerlookup.c
+@@ -0,0 +1,221 @@
++// SPDX-License-Identifier: GPL-2.0
++/*
++ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
++ */
++
++#include "peerlookup.h"
++#include "peer.h"
++#include "noise.h"
++
++static struct hlist_head *pubkey_bucket(struct pubkey_hashtable *table,
++                                      const u8 pubkey[NOISE_PUBLIC_KEY_LEN])
++{
++      /* siphash gives us a secure 64bit number based on a random key. Since
++       * the bits are uniformly distributed, we can then mask off to get the
++       * bits we need.
++       */
++      const u64 hash = siphash(pubkey, NOISE_PUBLIC_KEY_LEN, &table->key);
++
++      return &table->hashtable[hash & (HASH_SIZE(table->hashtable) - 1)];
++}
++
++struct pubkey_hashtable *wg_pubkey_hashtable_alloc(void)
++{
++      struct pubkey_hashtable *table = kvmalloc(sizeof(*table), GFP_KERNEL);
++
++      if (!table)
++              return NULL;
++
++      get_random_bytes(&table->key, sizeof(table->key));
++      hash_init(table->hashtable);
++      mutex_init(&table->lock);
++      return table;
++}
++
++void wg_pubkey_hashtable_add(struct pubkey_hashtable *table,
++                           struct wg_peer *peer)
++{
++      mutex_lock(&table->lock);
++      hlist_add_head_rcu(&peer->pubkey_hash,
++                         pubkey_bucket(table, peer->handshake.remote_static));
++      mutex_unlock(&table->lock);
++}
++
++void wg_pubkey_hashtable_remove(struct pubkey_hashtable *table,
++                              struct wg_peer *peer)
++{
++      mutex_lock(&table->lock);
++      hlist_del_init_rcu(&peer->pubkey_hash);
++      mutex_unlock(&table->lock);
++}
++
++/* Returns a strong reference to a peer */
++struct wg_peer *
++wg_pubkey_hashtable_lookup(struct pubkey_hashtable *table,
++                         const u8 pubkey[NOISE_PUBLIC_KEY_LEN])
++{
++      struct wg_peer *iter_peer, *peer = NULL;
++
++      rcu_read_lock_bh();
++      hlist_for_each_entry_rcu_bh(iter_peer, pubkey_bucket(table, pubkey),
++                                  pubkey_hash) {
++              if (!memcmp(pubkey, iter_peer->handshake.remote_static,
++                          NOISE_PUBLIC_KEY_LEN)) {
++                      peer = iter_peer;
++                      break;
++              }
++      }
++      peer = wg_peer_get_maybe_zero(peer);
++      rcu_read_unlock_bh();
++      return peer;
++}
++
++static struct hlist_head *index_bucket(struct index_hashtable *table,
++                                     const __le32 index)
++{
++      /* Since the indices are random and thus all bits are uniformly
++       * distributed, we can find its bucket simply by masking.
++       */
++      return &table->hashtable[(__force u32)index &
++                               (HASH_SIZE(table->hashtable) - 1)];
++}
++
++struct index_hashtable *wg_index_hashtable_alloc(void)
++{
++      struct index_hashtable *table = kvmalloc(sizeof(*table), GFP_KERNEL);
++
++      if (!table)
++              return NULL;
++
++      hash_init(table->hashtable);
++      spin_lock_init(&table->lock);
++      return table;
++}
++
++/* At the moment, we limit ourselves to 2^20 total peers, which generally might
++ * amount to 2^20*3 items in this hashtable. The algorithm below works by
++ * picking a random number and testing it. We can see that these limits mean we
++ * usually succeed pretty quickly:
++ *
++ * >>> def calculation(tries, size):
++ * ...     return (size / 2**32)**(tries - 1) *  (1 - (size / 2**32))
++ * ...
++ * >>> calculation(1, 2**20 * 3)
++ * 0.999267578125
++ * >>> calculation(2, 2**20 * 3)
++ * 0.0007318854331970215
++ * >>> calculation(3, 2**20 * 3)
++ * 5.360489012673497e-07
++ * >>> calculation(4, 2**20 * 3)
++ * 3.9261394135792216e-10
++ *
++ * At the moment, we don't do any masking, so this algorithm isn't exactly
++ * constant time in either the random guessing or in the hash list lookup. We
++ * could require a minimum of 3 tries, which would successfully mask the
++ * guessing. This would not, however, help with the growing hash lengths, which
++ * is another thing to consider moving forward.
++ */
++
++__le32 wg_index_hashtable_insert(struct index_hashtable *table,
++                               struct index_hashtable_entry *entry)
++{
++      struct index_hashtable_entry *existing_entry;
++
++      spin_lock_bh(&table->lock);
++      hlist_del_init_rcu(&entry->index_hash);
++      spin_unlock_bh(&table->lock);
++
++      rcu_read_lock_bh();
++
++search_unused_slot:
++      /* First we try to find an unused slot, randomly, while unlocked. */
++      entry->index = (__force __le32)get_random_u32();
++      hlist_for_each_entry_rcu_bh(existing_entry,
++                                  index_bucket(table, entry->index),
++                                  index_hash) {
++              if (existing_entry->index == entry->index)
++                      /* If it's already in use, we continue searching. */
++                      goto search_unused_slot;
++      }
++
++      /* Once we've found an unused slot, we lock it, and then double-check
++       * that nobody else stole it from us.
++       */
++      spin_lock_bh(&table->lock);
++      hlist_for_each_entry_rcu_bh(existing_entry,
++                                  index_bucket(table, entry->index),
++                                  index_hash) {
++              if (existing_entry->index == entry->index) {
++                      spin_unlock_bh(&table->lock);
++                      /* If it was stolen, we start over. */
++                      goto search_unused_slot;
++              }
++      }
++      /* Otherwise, we know we have it exclusively (since we're locked),
++       * so we insert.
++       */
++      hlist_add_head_rcu(&entry->index_hash,
++                         index_bucket(table, entry->index));
++      spin_unlock_bh(&table->lock);
++
++      rcu_read_unlock_bh();
++
++      return entry->index;
++}
++
++bool wg_index_hashtable_replace(struct index_hashtable *table,
++                              struct index_hashtable_entry *old,
++                              struct index_hashtable_entry *new)
++{
++      /* Test under the lock: a racing removal of old must not be missed. */
++      spin_lock_bh(&table->lock);
++      if (unlikely(hlist_unhashed(&old->index_hash))) {
++              spin_unlock_bh(&table->lock);
++              return false;
++      }
++      new->index = old->index;
++      hlist_replace_rcu(&old->index_hash, &new->index_hash);
++      /* Re-init old's node, since it may get reinserted elsewhere later. An
++       * RCU reader still on it might then terminate early or jump between
++       * buckets, in which case the packet simply gets dropped.
++       */
++      INIT_HLIST_NODE(&old->index_hash);
++      spin_unlock_bh(&table->lock);
++      return true;
++}
++
++void wg_index_hashtable_remove(struct index_hashtable *table,
++                             struct index_hashtable_entry *entry)
++{
++      spin_lock_bh(&table->lock);
++      hlist_del_init_rcu(&entry->index_hash);
++      spin_unlock_bh(&table->lock);
++}
++
++/* Returns a strong reference to an entry->peer */
++struct index_hashtable_entry *
++wg_index_hashtable_lookup(struct index_hashtable *table,
++                        const enum index_hashtable_type type_mask,
++                        const __le32 index, struct wg_peer **peer)
++{
++      struct index_hashtable_entry *iter_entry, *entry = NULL;
++
++      rcu_read_lock_bh();
++      hlist_for_each_entry_rcu_bh(iter_entry, index_bucket(table, index),
++                                  index_hash) {
++              if (iter_entry->index == index) {
++                      if (likely(iter_entry->type & type_mask))
++                              entry = iter_entry;
++                      break;
++              }
++      }
++      if (likely(entry)) {
++              entry->peer = wg_peer_get_maybe_zero(entry->peer);
++              if (likely(entry->peer))
++                      *peer = entry->peer;
++              else
++                      entry = NULL;
++      }
++      rcu_read_unlock_bh();
++      return entry;
++}
+--- /dev/null
++++ b/drivers/net/wireguard/peerlookup.h
+@@ -0,0 +1,64 @@
++/* SPDX-License-Identifier: GPL-2.0 */
++/*
++ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
++ */
++
++#ifndef _WG_PEERLOOKUP_H
++#define _WG_PEERLOOKUP_H
++
++#include "messages.h"
++
++#include <linux/hashtable.h>
++#include <linux/mutex.h>
++#include <linux/siphash.h>
++
++struct wg_peer;
++
++struct pubkey_hashtable {
++      /* TODO: move to rhashtable */
++      DECLARE_HASHTABLE(hashtable, 11);
++      siphash_key_t key;
++      struct mutex lock;
++};
++
++struct pubkey_hashtable *wg_pubkey_hashtable_alloc(void);
++void wg_pubkey_hashtable_add(struct pubkey_hashtable *table,
++                           struct wg_peer *peer);
++void wg_pubkey_hashtable_remove(struct pubkey_hashtable *table,
++                              struct wg_peer *peer);
++struct wg_peer *
++wg_pubkey_hashtable_lookup(struct pubkey_hashtable *table,
++                         const u8 pubkey[NOISE_PUBLIC_KEY_LEN]);
++
++struct index_hashtable {
++      /* TODO: move to rhashtable */
++      DECLARE_HASHTABLE(hashtable, 13);
++      spinlock_t lock;
++};
++
++enum index_hashtable_type {
++      INDEX_HASHTABLE_HANDSHAKE = 1U << 0,
++      INDEX_HASHTABLE_KEYPAIR = 1U << 1
++};
++
++struct index_hashtable_entry {
++      struct wg_peer *peer;
++      struct hlist_node index_hash;
++      enum index_hashtable_type type;
++      __le32 index;
++};
++
++struct index_hashtable *wg_index_hashtable_alloc(void);
++__le32 wg_index_hashtable_insert(struct index_hashtable *table,
++                               struct index_hashtable_entry *entry);
++bool wg_index_hashtable_replace(struct index_hashtable *table,
++                              struct index_hashtable_entry *old,
++                              struct index_hashtable_entry *new);
++void wg_index_hashtable_remove(struct index_hashtable *table,
++                             struct index_hashtable_entry *entry);
++struct index_hashtable_entry *
++wg_index_hashtable_lookup(struct index_hashtable *table,
++                        const enum index_hashtable_type type_mask,
++                        const __le32 index, struct wg_peer **peer);
++
++#endif /* _WG_PEERLOOKUP_H */
+--- /dev/null
++++ b/drivers/net/wireguard/queueing.c
+@@ -0,0 +1,53 @@
++// SPDX-License-Identifier: GPL-2.0
++/*
++ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
++ */
++
++#include "queueing.h"
++
++struct multicore_worker __percpu *
++wg_packet_percpu_multicore_worker_alloc(work_func_t function, void *ptr)
++{
++      int cpu;
++      struct multicore_worker __percpu *worker =
++              alloc_percpu(struct multicore_worker);
++
++      if (!worker)
++              return NULL;
++
++      for_each_possible_cpu(cpu) {
++              per_cpu_ptr(worker, cpu)->ptr = ptr;
++              INIT_WORK(&per_cpu_ptr(worker, cpu)->work, function);
++      }
++      return worker;
++}
++
++int wg_packet_queue_init(struct crypt_queue *queue, work_func_t function,
++                       bool multicore, unsigned int len)
++{
++      int ret;
++
++      memset(queue, 0, sizeof(*queue));
++      ret = ptr_ring_init(&queue->ring, len, GFP_KERNEL);
++      if (ret)
++              return ret;
++      if (function && multicore) {
++              queue->worker = wg_packet_percpu_multicore_worker_alloc(
++                      function, queue);
++              if (!queue->worker) {
++                      ptr_ring_cleanup(&queue->ring, NULL); /* undo init */
++                      return -ENOMEM;
++              }
++      } else if (function) {
++              INIT_WORK(&queue->work, function);
++      }
++      return 0;
++}
++
++void wg_packet_queue_free(struct crypt_queue *queue, bool multicore)
++{
++      if (multicore)
++              free_percpu(queue->worker);
++      WARN_ON(!__ptr_ring_empty(&queue->ring));
++      ptr_ring_cleanup(&queue->ring, NULL);
++}
+--- /dev/null
++++ b/drivers/net/wireguard/queueing.h
+@@ -0,0 +1,197 @@
++/* SPDX-License-Identifier: GPL-2.0 */
++/*
++ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
++ */
++
++#ifndef _WG_QUEUEING_H
++#define _WG_QUEUEING_H
++
++#include "peer.h"
++#include <linux/types.h>
++#include <linux/skbuff.h>
++#include <linux/ip.h>
++#include <linux/ipv6.h>
++
++struct wg_device;
++struct wg_peer;
++struct multicore_worker;
++struct crypt_queue;
++struct sk_buff;
++
++/* queueing.c APIs: */
++int wg_packet_queue_init(struct crypt_queue *queue, work_func_t function,
++                       bool multicore, unsigned int len);
++void wg_packet_queue_free(struct crypt_queue *queue, bool multicore);
++struct multicore_worker __percpu *
++wg_packet_percpu_multicore_worker_alloc(work_func_t function, void *ptr);
++
++/* receive.c APIs: */
++void wg_packet_receive(struct wg_device *wg, struct sk_buff *skb);
++void wg_packet_handshake_receive_worker(struct work_struct *work);
++/* NAPI poll function: */
++int wg_packet_rx_poll(struct napi_struct *napi, int budget);
++/* Workqueue worker: */
++void wg_packet_decrypt_worker(struct work_struct *work);
++
++/* send.c APIs: */
++void wg_packet_send_queued_handshake_initiation(struct wg_peer *peer,
++                                              bool is_retry);
++void wg_packet_send_handshake_response(struct wg_peer *peer);
++void wg_packet_send_handshake_cookie(struct wg_device *wg,
++                                   struct sk_buff *initiating_skb,
++                                   __le32 sender_index);
++void wg_packet_send_keepalive(struct wg_peer *peer);
++void wg_packet_purge_staged_packets(struct wg_peer *peer);
++void wg_packet_send_staged_packets(struct wg_peer *peer);
++/* Workqueue workers: */
++void wg_packet_handshake_send_worker(struct work_struct *work);
++void wg_packet_tx_worker(struct work_struct *work);
++void wg_packet_encrypt_worker(struct work_struct *work);
++
++enum packet_state {
++      PACKET_STATE_UNCRYPTED,
++      PACKET_STATE_CRYPTED,
++      PACKET_STATE_DEAD
++};
++
++struct packet_cb {
++      u64 nonce;
++      struct noise_keypair *keypair;
++      atomic_t state;
++      u32 mtu;
++      u8 ds;
++};
++
++#define PACKET_CB(skb) ((struct packet_cb *)((skb)->cb))
++#define PACKET_PEER(skb) (PACKET_CB(skb)->keypair->entry.peer)
++
++/* Returns either the correct skb->protocol value, or 0 if invalid. */
++static inline __be16 wg_skb_examine_untrusted_ip_hdr(struct sk_buff *skb)
++{
++      if (skb_network_header(skb) >= skb->head &&
++          (skb_network_header(skb) + sizeof(struct iphdr)) <=
++                  skb_tail_pointer(skb) &&
++          ip_hdr(skb)->version == 4)
++              return htons(ETH_P_IP);
++      if (skb_network_header(skb) >= skb->head &&
++          (skb_network_header(skb) + sizeof(struct ipv6hdr)) <=
++                  skb_tail_pointer(skb) &&
++          ipv6_hdr(skb)->version == 6)
++              return htons(ETH_P_IPV6);
++      return 0;
++}
++
++static inline void wg_reset_packet(struct sk_buff *skb)
++{
++      const int pfmemalloc = skb->pfmemalloc;
++
++      skb_scrub_packet(skb, true);
++      memset(&skb->headers_start, 0,
++             offsetof(struct sk_buff, headers_end) -
++                     offsetof(struct sk_buff, headers_start));
++      skb->pfmemalloc = pfmemalloc;
++      skb->queue_mapping = 0;
++      skb->nohdr = 0;
++      skb->peeked = 0;
++      skb->mac_len = 0;
++      skb->dev = NULL;
++#ifdef CONFIG_NET_SCHED
++      skb->tc_index = 0;
++#endif
++      skb_reset_redirect(skb);
++      skb->hdr_len = skb_headroom(skb);
++      skb_reset_mac_header(skb);
++      skb_reset_network_header(skb);
++      skb_reset_transport_header(skb);
++      skb_probe_transport_header(skb);
++      skb_reset_inner_headers(skb);
++}
++
++static inline int wg_cpumask_choose_online(int *stored_cpu, unsigned int id)
++{
++      unsigned int cpu = *stored_cpu, cpu_index, i;
++
++      if (unlikely(cpu == nr_cpumask_bits ||
++                   !cpumask_test_cpu(cpu, cpu_online_mask))) {
++              cpu_index = id % cpumask_weight(cpu_online_mask);
++              cpu = cpumask_first(cpu_online_mask);
++              for (i = 0; i < cpu_index; ++i)
++                      cpu = cpumask_next(cpu, cpu_online_mask);
++              *stored_cpu = cpu;
++      }
++      return cpu;
++}
++
++/* This function is racy, in the sense that next is unlocked, so it could return
++ * the same CPU twice. A race-free version of this would be to instead store an
++ * atomic sequence number, do an increment-and-return, and then iterate through
++ * every possible CPU until we get to that index -- choose_cpu. However that's
++ * a bit slower, and it doesn't seem like this potential race actually
++ * introduces any performance loss, so we live with it.
++ */
++static inline int wg_cpumask_next_online(int *next)
++{
++      int cpu = *next;
++
++      while (unlikely(!cpumask_test_cpu(cpu, cpu_online_mask)))
++              cpu = cpumask_next(cpu, cpu_online_mask) % nr_cpumask_bits;
++      *next = cpumask_next(cpu, cpu_online_mask) % nr_cpumask_bits;
++      return cpu;
++}
++
++static inline int wg_queue_enqueue_per_device_and_peer(
++      struct crypt_queue *device_queue, struct crypt_queue *peer_queue,
++      struct sk_buff *skb, struct workqueue_struct *wq, int *next_cpu)
++{
++      int cpu;
++
++      atomic_set_release(&PACKET_CB(skb)->state, PACKET_STATE_UNCRYPTED);
++      /* We first queue this up for the peer ingestion, but the consumer
++       * will wait for the state to change to CRYPTED or DEAD before.
++       */
++      if (unlikely(ptr_ring_produce_bh(&peer_queue->ring, skb)))
++              return -ENOSPC;
++      /* Then we queue it up in the device queue, which consumes the
++       * packet as soon as it can.
++       */
++      cpu = wg_cpumask_next_online(next_cpu);
++      if (unlikely(ptr_ring_produce_bh(&device_queue->ring, skb)))
++              return -EPIPE;
++      queue_work_on(cpu, wq, &per_cpu_ptr(device_queue->worker, cpu)->work);
++      return 0;
++}
++
++static inline void wg_queue_enqueue_per_peer(struct crypt_queue *queue,
++                                           struct sk_buff *skb,
++                                           enum packet_state state)
++{ /* Publish skb's final state, then schedule queue->work for its peer. */
++      /* We take a reference, because as soon as we call atomic_set, the
++       * peer can be freed from below us.
++       */
++      struct wg_peer *peer = wg_peer_get(PACKET_PEER(skb));
++
++      atomic_set_release(&PACKET_CB(skb)->state, state);
++      queue_work_on(wg_cpumask_choose_online(&peer->serial_work_cpu,
++                                             peer->internal_id),
++                    peer->device->packet_crypt_wq, &queue->work);
++      wg_peer_put(peer); /* drop the temporary reference */
++}
++
++static inline void wg_queue_enqueue_per_peer_napi(struct sk_buff *skb,
++                                                enum packet_state state)
++{ /* Publish skb's final state, then schedule the peer's NAPI poll. */
++      /* We take a reference, because as soon as we call atomic_set, the
++       * peer can be freed from below us.
++       */
++      struct wg_peer *peer = wg_peer_get(PACKET_PEER(skb));
++
++      atomic_set_release(&PACKET_CB(skb)->state, state);
++      napi_schedule(&peer->napi);
++      wg_peer_put(peer); /* drop the temporary reference */
++}
++
++#ifdef DEBUG
++bool wg_packet_counter_selftest(void);
++#endif
++
++#endif /* _WG_QUEUEING_H */
+--- /dev/null
++++ b/drivers/net/wireguard/ratelimiter.c
+@@ -0,0 +1,223 @@
++// SPDX-License-Identifier: GPL-2.0
++/*
++ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
++ */
++
++#include "ratelimiter.h"
++#include <linux/siphash.h>
++#include <linux/mm.h>
++#include <linux/slab.h>
++#include <net/ip.h>
++
++static struct kmem_cache *entry_cache; /* backs struct ratelimiter_entry */
++static hsiphash_key_t key; /* bucket hash key, randomized at init */
++static spinlock_t table_lock = __SPIN_LOCK_UNLOCKED("ratelimiter_table_lock");
++static DEFINE_MUTEX(init_lock);
++static u64 init_refcnt; /* Protected by init_lock, hence not atomic. */
++static atomic_t total_entries = ATOMIC_INIT(0); /* capped by max_entries */
++static unsigned int max_entries, table_size; /* set in wg_ratelimiter_init() */
++static void wg_ratelimiter_gc_entries(struct work_struct *);
++static DECLARE_DEFERRABLE_WORK(gc_work, wg_ratelimiter_gc_entries);
++static struct hlist_head *table_v4; /* table_size buckets, IPv4 sources */
++#if IS_ENABLED(CONFIG_IPV6)
++static struct hlist_head *table_v6; /* table_size buckets, IPv6 sources */
++#endif
++
++struct ratelimiter_entry {
++      u64 last_time_ns, tokens, ip; /* last update, ns of credit, source key */
++      void *net; /* compared against the caller's struct net pointer */
++      spinlock_t lock; /* guards last_time_ns and tokens */
++      struct hlist_node hash; /* bucket linkage */
++      struct rcu_head rcu; /* used by call_rcu() to defer the free */
++};
++
++enum {
++      PACKETS_PER_SECOND = 20,
++      PACKETS_BURSTABLE = 5,
++      PACKET_COST = NSEC_PER_SEC / PACKETS_PER_SECOND, /* ns of credit per packet */
++      TOKEN_MAX = PACKET_COST * PACKETS_BURSTABLE /* cap on accumulated credit */
++};
++
++static void entry_free(struct rcu_head *rcu)
++{ /* RCU callback: free the entry and release its slot in total_entries. */
++      kmem_cache_free(entry_cache,
++                      container_of(rcu, struct ratelimiter_entry, rcu));
++      atomic_dec(&total_entries);
++}
++
++static void entry_uninit(struct ratelimiter_entry *entry)
++{ /* Unlink the entry from its bucket; entry_free() runs via call_rcu(). */
++      hlist_del_rcu(&entry->hash);
++      call_rcu(&entry->rcu, entry_free);
++}
++
++/* Expires entries idle for over a second; a NULL work uninits all entries. */
++static void wg_ratelimiter_gc_entries(struct work_struct *work)
++{
++      const u64 now = ktime_get_coarse_boottime_ns();
++      struct ratelimiter_entry *entry;
++      struct hlist_node *temp;
++      unsigned int i;
++
++      for (i = 0; i < table_size; ++i) {
++              spin_lock(&table_lock); /* taken per bucket index, not per pass */
++              hlist_for_each_entry_safe(entry, temp, &table_v4[i], hash) {
++                      if (unlikely(!work) ||
++                          now - entry->last_time_ns > NSEC_PER_SEC)
++                              entry_uninit(entry);
++              }
++#if IS_ENABLED(CONFIG_IPV6)
++              hlist_for_each_entry_safe(entry, temp, &table_v6[i], hash) {
++                      if (unlikely(!work) ||
++                          now - entry->last_time_ns > NSEC_PER_SEC)
++                              entry_uninit(entry);
++              }
++#endif
++              spin_unlock(&table_lock);
++              if (likely(work))
++                      cond_resched();
++      }
++      if (likely(work)) /* periodic run: re-arm; the NULL teardown call does not */
++              queue_delayed_work(system_power_efficient_wq, &gc_work, HZ);
++}
++
++bool wg_ratelimiter_allow(struct sk_buff *skb, struct net *net)
++{ /* Token-bucket check keyed on (net, source address); true if skb may pass. */
++      /* We only take the bottom half of the net pointer, so that we can hash
++       * 3 words in the end. This way, siphash's len param fits into the final
++       * u32, and we don't incur an extra round.
++       */
++      const u32 net_word = (unsigned long)net;
++      struct ratelimiter_entry *entry;
++      struct hlist_head *bucket;
++      u64 ip;
++
++      if (skb->protocol == htons(ETH_P_IP)) {
++              ip = (u64 __force)ip_hdr(skb)->saddr;
++              bucket = &table_v4[hsiphash_2u32(net_word, ip, &key) &
++                                 (table_size - 1)];
++      }
++#if IS_ENABLED(CONFIG_IPV6)
++      else if (skb->protocol == htons(ETH_P_IPV6)) {
++              /* Only use 64 bits, so as to ratelimit the whole /64. */
++              memcpy(&ip, &ipv6_hdr(skb)->saddr, sizeof(ip));
++              bucket = &table_v6[hsiphash_3u32(net_word, ip >> 32, ip, &key) &
++                                 (table_size - 1)];
++      }
++#endif
++      else
++              return false; /* protocol not handled above */
++      rcu_read_lock();
++      hlist_for_each_entry_rcu(entry, bucket, hash) {
++              if (entry->net == net && entry->ip == ip) {
++                      u64 now, tokens;
++                      bool ret;
++                      /* Quasi-inspired by nft_limit.c, but this is actually a
++                       * slightly different algorithm. Namely, we incorporate
++                       * the burst as part of the maximum tokens, rather than
++                       * as part of the rate.
++                       */
++                      spin_lock(&entry->lock);
++                      now = ktime_get_coarse_boottime_ns();
++                      tokens = min_t(u64, TOKEN_MAX,
++                                     entry->tokens + now -
++                                             entry->last_time_ns);
++                      entry->last_time_ns = now;
++                      ret = tokens >= PACKET_COST;
++                      entry->tokens = ret ? tokens - PACKET_COST : tokens;
++                      spin_unlock(&entry->lock);
++                      rcu_read_unlock();
++                      return ret;
++              }
++      }
++      rcu_read_unlock();
++
++      if (atomic_inc_return(&total_entries) > max_entries) /* reserve a slot */
++              goto err_oom;
++
++      entry = kmem_cache_alloc(entry_cache, GFP_KERNEL);
++      if (unlikely(!entry))
++              goto err_oom;
++
++      entry->net = net;
++      entry->ip = ip;
++      INIT_HLIST_NODE(&entry->hash);
++      spin_lock_init(&entry->lock);
++      entry->last_time_ns = ktime_get_coarse_boottime_ns();
++      entry->tokens = TOKEN_MAX - PACKET_COST; /* this packet is charged now */
++      spin_lock(&table_lock);
++      hlist_add_head_rcu(&entry->hash, bucket);
++      spin_unlock(&table_lock);
++      return true;
++
++err_oom:
++      atomic_dec(&total_entries); /* give the reserved slot back */
++      return false;
++}
++
++int wg_ratelimiter_init(void)
++{ /* Refcounted setup of the cache, tables, key and GC work; 0 or -ENOMEM. */
++      mutex_lock(&init_lock);
++      if (++init_refcnt != 1) /* only the first user initializes */
++              goto out;
++
++      entry_cache = KMEM_CACHE(ratelimiter_entry, 0);
++      if (!entry_cache)
++              goto err;
++
++      /* xt_hashlimit.c uses a slightly different algorithm for ratelimiting,
++       * but what it shares in common is that it uses a massive hashtable. So,
++       * we borrow their wisdom about good table sizes on different systems
++       * dependent on RAM. This calculation here comes from there.
++       */
++      table_size = (totalram_pages() > (1U << 30) / PAGE_SIZE) ? 8192 :
++              max_t(unsigned long, 16, roundup_pow_of_two(
++                      (totalram_pages() << PAGE_SHIFT) /
++                      (1U << 14) / sizeof(struct hlist_head)));
++      max_entries = table_size * 8;
++
++      table_v4 = kvzalloc(table_size * sizeof(*table_v4), GFP_KERNEL);
++      if (unlikely(!table_v4))
++              goto err_kmemcache;
++
++#if IS_ENABLED(CONFIG_IPV6)
++      table_v6 = kvzalloc(table_size * sizeof(*table_v6), GFP_KERNEL);
++      if (unlikely(!table_v6)) {
++              kvfree(table_v4);
++              goto err_kmemcache;
++      }
++#endif
++
++      queue_delayed_work(system_power_efficient_wq, &gc_work, HZ);
++      get_random_bytes(&key, sizeof(key));
++out:
++      mutex_unlock(&init_lock);
++      return 0;
++
++err_kmemcache:
++      kmem_cache_destroy(entry_cache);
++err:
++      --init_refcnt; /* undo the increment above */
++      mutex_unlock(&init_lock);
++      return -ENOMEM;
++}
++
++void wg_ratelimiter_uninit(void)
++{ /* Drops one init reference; the last one tears everything down. */
++      mutex_lock(&init_lock);
++      if (!init_refcnt || --init_refcnt) /* not initialized, or users remain */
++              goto out;
++
++      cancel_delayed_work_sync(&gc_work);
++      wg_ratelimiter_gc_entries(NULL); /* NULL work: uninit every entry */
++      rcu_barrier(); /* entry_free() callbacks must finish before the cache goes */
++      kvfree(table_v4);
++#if IS_ENABLED(CONFIG_IPV6)
++      kvfree(table_v6);
++#endif
++      kmem_cache_destroy(entry_cache);
++out:
++      mutex_unlock(&init_lock);
++}
++
++#include "selftest/ratelimiter.c"
+--- /dev/null
++++ b/drivers/net/wireguard/ratelimiter.h
+@@ -0,0 +1,19 @@
++/* SPDX-License-Identifier: GPL-2.0 */
++/*
++ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
++ */
++
++#ifndef _WG_RATELIMITER_H
++#define _WG_RATELIMITER_H
++
++#include <linux/skbuff.h>
++
++int wg_ratelimiter_init(void);
++void wg_ratelimiter_uninit(void);
++bool wg_ratelimiter_allow(struct sk_buff *skb, struct net *net);
++
++#ifdef DEBUG
++bool wg_ratelimiter_selftest(void);
++#endif
++
++#endif /* _WG_RATELIMITER_H */
+--- /dev/null
++++ b/drivers/net/wireguard/receive.c
+@@ -0,0 +1,595 @@
++// SPDX-License-Identifier: GPL-2.0
++/*
++ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
++ */
++
++#include "queueing.h"
++#include "device.h"
++#include "peer.h"
++#include "timers.h"
++#include "messages.h"
++#include "cookie.h"
++#include "socket.h"
++
++#include <linux/ip.h>
++#include <linux/ipv6.h>
++#include <linux/udp.h>
++#include <net/ip_tunnels.h>
++
++/* Account one received packet of len bytes. Must be called with bh disabled. */
++static void update_rx_stats(struct wg_peer *peer, size_t len)
++{
++      struct pcpu_sw_netstats *tstats =
++              get_cpu_ptr(peer->device->dev->tstats);
++
++      u64_stats_update_begin(&tstats->syncp);
++      ++tstats->rx_packets;
++      tstats->rx_bytes += len;
++      peer->rx_bytes += len; /* per-peer total, updated in the same section */
++      u64_stats_update_end(&tstats->syncp);
++      put_cpu_ptr(tstats);
++}
++
++#define SKB_TYPE_LE32(skb) (((struct message_header *)(skb)->data)->type)
++
++static size_t validate_header_len(struct sk_buff *skb)
++{ /* Header size for skb's message type, or 0 if the type/length is invalid. */
++      if (unlikely(skb->len < sizeof(struct message_header)))
++              return 0;
++      if (SKB_TYPE_LE32(skb) == cpu_to_le32(MESSAGE_DATA) &&
++          skb->len >= MESSAGE_MINIMUM_LENGTH) /* data: minimum length only */
++              return sizeof(struct message_data);
++      if (SKB_TYPE_LE32(skb) == cpu_to_le32(MESSAGE_HANDSHAKE_INITIATION) &&
++          skb->len == sizeof(struct message_handshake_initiation))
++              return sizeof(struct message_handshake_initiation);
++      if (SKB_TYPE_LE32(skb) == cpu_to_le32(MESSAGE_HANDSHAKE_RESPONSE) &&
++          skb->len == sizeof(struct message_handshake_response))
++              return sizeof(struct message_handshake_response);
++      if (SKB_TYPE_LE32(skb) == cpu_to_le32(MESSAGE_HANDSHAKE_COOKIE) &&
++          skb->len == sizeof(struct message_handshake_cookie))
++              return sizeof(struct message_handshake_cookie);
++      return 0; /* unknown type, or a length that does not fit the type */
++}
++
++static int prepare_skb_header(struct sk_buff *skb, struct wg_device *wg)
++{ /* Validate IP/UDP framing; leave skb->data at the message. 0 or -EINVAL. */
++      size_t data_offset, data_len, header_len;
++      struct udphdr *udp;
++
++      if (unlikely(wg_skb_examine_untrusted_ip_hdr(skb) != skb->protocol ||
++                   skb_transport_header(skb) < skb->head ||
++                   (skb_transport_header(skb) + sizeof(struct udphdr)) >
++                           skb_tail_pointer(skb)))
++              return -EINVAL; /* Bogus IP header */
++      udp = udp_hdr(skb);
++      data_offset = (u8 *)udp - skb->data;
++      if (unlikely(data_offset > U16_MAX ||
++                   data_offset + sizeof(struct udphdr) > skb->len))
++              /* Packet has offset at impossible location or isn't big enough
++               * to have UDP fields.
++               */
++              return -EINVAL;
++      data_len = ntohs(udp->len);
++      if (unlikely(data_len < sizeof(struct udphdr) ||
++                   data_len > skb->len - data_offset))
++              /* UDP packet is reporting too small of a size or lying about
++               * its size.
++               */
++              return -EINVAL;
++      data_len -= sizeof(struct udphdr); /* now the UDP payload length */
++      data_offset = (u8 *)udp + sizeof(struct udphdr) - skb->data;
++      if (unlikely(!pskb_may_pull(skb,
++                              data_offset + sizeof(struct message_header)) ||
++                   pskb_trim(skb, data_len + data_offset) < 0))
++              return -EINVAL;
++      skb_pull(skb, data_offset);
++      if (unlikely(skb->len != data_len))
++              /* Final len does not agree with calculated len */
++              return -EINVAL;
++      header_len = validate_header_len(skb);
++      if (unlikely(!header_len))
++              return -EINVAL;
++      __skb_push(skb, data_offset); /* step back so the pull below spans the headers */
++      if (unlikely(!pskb_may_pull(skb, data_offset + header_len)))
++              return -EINVAL;
++      __skb_pull(skb, data_offset); /* skb->data is at the message again */
++      return 0;
++}
++
++static void wg_receive_handshake_packet(struct wg_device *wg,
++                                      struct sk_buff *skb)
++{ /* Process one queued handshake or cookie message; the caller frees skb. */
++      enum cookie_mac_state mac_state;
++      struct wg_peer *peer = NULL;
++      /* This is global, so that our load calculation applies to the whole
++       * system. We don't care about races with it at all.
++       */
++      static u64 last_under_load;
++      bool packet_needs_cookie;
++      bool under_load;
++
++      if (SKB_TYPE_LE32(skb) == cpu_to_le32(MESSAGE_HANDSHAKE_COOKIE)) {
++              net_dbg_skb_ratelimited("%s: Receiving cookie response from %pISpfsc\n",
++                                      wg->dev->name, skb);
++              wg_cookie_message_consume(
++                      (struct message_handshake_cookie *)skb->data, wg);
++              return;
++      }
++
++      under_load = skb_queue_len(&wg->incoming_handshakes) >=
++                   MAX_QUEUED_INCOMING_HANDSHAKES / 8; /* queue >= 1/8 of its cap */
++      if (under_load)
++              last_under_load = ktime_get_coarse_boottime_ns();
++      else if (last_under_load) /* keep the load state until the stamp expires */
++              under_load = !wg_birthdate_has_expired(last_under_load, 1);
++      mac_state = wg_cookie_validate_packet(&wg->cookie_checker, skb,
++                                            under_load);
++      if ((under_load && mac_state == VALID_MAC_WITH_COOKIE) ||
++          (!under_load && mac_state == VALID_MAC_BUT_NO_COOKIE)) {
++              packet_needs_cookie = false;
++      } else if (under_load && mac_state == VALID_MAC_BUT_NO_COOKIE) {
++              packet_needs_cookie = true; /* reply with a cookie, not a handshake */
++      } else {
++              net_dbg_skb_ratelimited("%s: Invalid MAC of handshake, dropping packet from %pISpfsc\n",
++                                      wg->dev->name, skb);
++              return;
++      }
++
++      switch (SKB_TYPE_LE32(skb)) {
++      case cpu_to_le32(MESSAGE_HANDSHAKE_INITIATION): {
++              struct message_handshake_initiation *message =
++                      (struct message_handshake_initiation *)skb->data;
++
++              if (packet_needs_cookie) {
++                      wg_packet_send_handshake_cookie(wg, skb,
++                                                      message->sender_index);
++                      return;
++              }
++              peer = wg_noise_handshake_consume_initiation(message, wg);
++              if (unlikely(!peer)) {
++                      net_dbg_skb_ratelimited("%s: Invalid handshake initiation from %pISpfsc\n",
++                                              wg->dev->name, skb);
++                      return;
++              }
++              wg_socket_set_peer_endpoint_from_skb(peer, skb);
++              net_dbg_ratelimited("%s: Receiving handshake initiation from peer %llu (%pISpfsc)\n",
++                                  wg->dev->name, peer->internal_id,
++                                  &peer->endpoint.addr);
++              wg_packet_send_handshake_response(peer);
++              break;
++      }
++      case cpu_to_le32(MESSAGE_HANDSHAKE_RESPONSE): {
++              struct message_handshake_response *message =
++                      (struct message_handshake_response *)skb->data;
++
++              if (packet_needs_cookie) {
++                      wg_packet_send_handshake_cookie(wg, skb,
++                                                      message->sender_index);
++                      return;
++              }
++              peer = wg_noise_handshake_consume_response(message, wg);
++              if (unlikely(!peer)) {
++                      net_dbg_skb_ratelimited("%s: Invalid handshake response from %pISpfsc\n",
++                                              wg->dev->name, skb);
++                      return;
++              }
++              wg_socket_set_peer_endpoint_from_skb(peer, skb);
++              net_dbg_ratelimited("%s: Receiving handshake response from peer %llu (%pISpfsc)\n",
++                                  wg->dev->name, peer->internal_id,
++                                  &peer->endpoint.addr);
++              if (wg_noise_handshake_begin_session(&peer->handshake,
++                                                   &peer->keypairs)) {
++                      wg_timers_session_derived(peer);
++                      wg_timers_handshake_complete(peer);
++                      /* Calling this function will either send any existing
++                       * packets in the queue and not send a keepalive, which
++                       * is the best case, or, if there's nothing in the
++                       * queue, it will send a keepalive, in order to give
++                       * immediate confirmation of the session.
++                       */
++                      wg_packet_send_keepalive(peer);
++              }
++              break;
++      }
++      }
++
++      if (unlikely(!peer)) { /* no case above matched the message type */
++              WARN(1, "Somehow a wrong type of packet wound up in the handshake queue!\n");
++              return;
++      }
++
++      local_bh_disable(); /* update_rx_stats() requires bh disabled */
++      update_rx_stats(peer, skb->len);
++      local_bh_enable();
++
++      wg_timers_any_authenticated_packet_received(peer);
++      wg_timers_any_authenticated_packet_traversal(peer);
++      wg_peer_put(peer); /* presumably pairs with a ref from consume_*() -- verify */
++}
++
++void wg_packet_handshake_receive_worker(struct work_struct *work)
++{ /* Drain wg->incoming_handshakes, processing and then freeing each skb. */
++      struct wg_device *wg = container_of(work, struct multicore_worker,
++                                          work)->ptr;
++      struct sk_buff *skb;
++
++      while ((skb = skb_dequeue(&wg->incoming_handshakes)) != NULL) {
++              wg_receive_handshake_packet(wg, skb);
++              dev_kfree_skb(skb);
++              cond_resched();
++      }
++}
++
++static void keep_key_fresh(struct wg_peer *peer)
++{ /* As initiator, queue one new handshake when the sending key is nearly expired. */
++      struct noise_keypair *keypair;
++      bool send = false;
++
++      if (peer->sent_lastminute_handshake) /* already sent one; do not repeat */
++              return;
++
++      rcu_read_lock_bh();
++      keypair = rcu_dereference_bh(peer->keypairs.current_keypair);
++      if (likely(keypair && READ_ONCE(keypair->sending.is_valid)) &&
++          keypair->i_am_the_initiator &&
++          unlikely(wg_birthdate_has_expired(keypair->sending.birthdate,
++                      REJECT_AFTER_TIME - KEEPALIVE_TIMEOUT - REKEY_TIMEOUT)))
++              send = true;
++      rcu_read_unlock_bh();
++
++      if (send) { /* acted on after leaving the RCU read section */
++              peer->sent_lastminute_handshake = true;
++              wg_packet_send_queued_handshake_initiation(peer, false);
++      }
++}
++
++static bool decrypt_packet(struct sk_buff *skb, struct noise_symmetric_key *key)
++{ /* Decrypt skb in place with key; returns false on any failure. */
++      struct scatterlist sg[MAX_SKB_FRAGS + 8];
++      struct sk_buff *trailer;
++      unsigned int offset;
++      int num_frags;
++
++      if (unlikely(!key))
++              return false;
++
++      if (unlikely(!READ_ONCE(key->is_valid) ||
++                wg_birthdate_has_expired(key->birthdate, REJECT_AFTER_TIME) ||
++                key->counter.receive.counter >= REJECT_AFTER_MESSAGES)) {
++              WRITE_ONCE(key->is_valid, false); /* too old or too many messages */
++              return false;
++      }
++
++      PACKET_CB(skb)->nonce =
++              le64_to_cpu(((struct message_data *)skb->data)->counter);
++
++      /* We ensure that the network header is part of the packet before we
++       * call skb_cow_data, so that there's no chance that data is removed
++       * from the skb, so that later we can extract the original endpoint.
++       */
++      offset = skb->data - skb_network_header(skb);
++      skb_push(skb, offset);
++      num_frags = skb_cow_data(skb, 0, &trailer);
++      offset += sizeof(struct message_data); /* also skip the message header */
++      skb_pull(skb, offset);
++      if (unlikely(num_frags < 0 || num_frags > ARRAY_SIZE(sg)))
++              return false;
++
++      sg_init_table(sg, num_frags);
++      if (skb_to_sgvec(skb, sg, 0, skb->len) <= 0)
++              return false;
++
++      if (!chacha20poly1305_decrypt_sg_inplace(sg, skb->len, NULL, 0,
++                                               PACKET_CB(skb)->nonce,
++                                               key->key))
++              return false;
++
++      /* Another ugly situation of pushing and pulling the header so as to
++       * keep endpoint information intact.
++       */
++      skb_push(skb, offset);
++      if (pskb_trim(skb, skb->len - noise_encrypted_len(0))) /* presumably the tag */
++              return false;
++      skb_pull(skb, offset);
++
++      return true;
++}
++
++/* This is RFC6479, a replay detection bitmap algorithm that avoids bitshifts */
++static bool counter_validate(union noise_counter *counter, u64 their_counter)
++{ /* Returns true only if their_counter is in range and was not seen before. */
++      unsigned long index, index_current, top, i;
++      bool ret = false;
++
++      spin_lock_bh(&counter->receive.lock);
++
++      if (unlikely(counter->receive.counter >= REJECT_AFTER_MESSAGES + 1 ||
++                   their_counter >= REJECT_AFTER_MESSAGES))
++              goto out;
++
++      ++their_counter;
++
++      if (unlikely((COUNTER_WINDOW_SIZE + their_counter) <
++                   counter->receive.counter))
++              goto out; /* too far behind the newest counter */
++
++      index = their_counter >> ilog2(BITS_PER_LONG);
++
++      if (likely(their_counter > counter->receive.counter)) {
++              index_current = counter->receive.counter >> ilog2(BITS_PER_LONG);
++              top = min_t(unsigned long, index - index_current,
++                          COUNTER_BITS_TOTAL / BITS_PER_LONG);
++              for (i = 1; i <= top; ++i) /* zero the words the window moves into */
++                      counter->receive.backtrack[(i + index_current) &
++                              ((COUNTER_BITS_TOTAL / BITS_PER_LONG) - 1)] = 0;
++              counter->receive.counter = their_counter;
++      }
++
++      index &= (COUNTER_BITS_TOTAL / BITS_PER_LONG) - 1;
++      ret = !test_and_set_bit(their_counter & (BITS_PER_LONG - 1),
++                              &counter->receive.backtrack[index]); /* set bit = replay */
++
++out:
++      spin_unlock_bh(&counter->receive.lock);
++      return ret;
++}
++
++#include "selftest/counter.c"
++
++static void wg_packet_consume_data_done(struct wg_peer *peer,
++                                      struct sk_buff *skb,
++                                      struct endpoint *endpoint)
++{ /* Final rx step for a decrypted packet: check the inner header, pass to GRO. */
++      struct net_device *dev = peer->device->dev;
++      unsigned int len, len_before_trim;
++      struct wg_peer *routed_peer;
++
++      wg_socket_set_peer_endpoint(peer, endpoint);
++
++      if (unlikely(wg_noise_received_with_keypair(&peer->keypairs,
++                                                  PACKET_CB(skb)->keypair))) {
++              wg_timers_handshake_complete(peer);
++              wg_packet_send_staged_packets(peer);
++      }
++
++      keep_key_fresh(peer);
++
++      wg_timers_any_authenticated_packet_received(peer);
++      wg_timers_any_authenticated_packet_traversal(peer);
++
++      /* A packet with length 0 is a keepalive packet */
++      if (unlikely(!skb->len)) {
++              update_rx_stats(peer, message_data_len(0));
++              net_dbg_ratelimited("%s: Receiving keepalive packet from peer %llu (%pISpfsc)\n",
++                                  dev->name, peer->internal_id,
++                                  &peer->endpoint.addr);
++              goto packet_processed;
++      }
++
++      wg_timers_data_received(peer);
++
++      if (unlikely(skb_network_header(skb) < skb->head))
++              goto dishonest_packet_size;
++      if (unlikely(!(pskb_network_may_pull(skb, sizeof(struct iphdr)) &&
++                     (ip_hdr(skb)->version == 4 ||
++                      (ip_hdr(skb)->version == 6 &&
++                       pskb_network_may_pull(skb, sizeof(struct ipv6hdr)))))))
++              goto dishonest_packet_type;
++
++      skb->dev = dev;
++      /* We've already verified the Poly1305 auth tag, which means this packet
++       * was not modified in transit. We can therefore tell the networking
++       * stack that all checksums of every layer of encapsulation have already
++       * been checked "by the hardware", so it is unnecessary to check them
++       * again in software.
++       */
++      skb->ip_summed = CHECKSUM_UNNECESSARY;
++      skb->csum_level = ~0; /* All levels */
++      skb->protocol = wg_skb_examine_untrusted_ip_hdr(skb);
++      if (skb->protocol == htons(ETH_P_IP)) {
++              len = ntohs(ip_hdr(skb)->tot_len);
++              if (unlikely(len < sizeof(struct iphdr)))
++                      goto dishonest_packet_size;
++              if (INET_ECN_is_ce(PACKET_CB(skb)->ds))
++                      IP_ECN_set_ce(ip_hdr(skb));
++      } else if (skb->protocol == htons(ETH_P_IPV6)) {
++              len = ntohs(ipv6_hdr(skb)->payload_len) +
++                    sizeof(struct ipv6hdr);
++              if (INET_ECN_is_ce(PACKET_CB(skb)->ds))
++                      IP6_ECN_set_ce(skb, ipv6_hdr(skb));
++      } else {
++              goto dishonest_packet_type;
++      }
++
++      if (unlikely(len > skb->len)) /* inner header claims more than we received */
++              goto dishonest_packet_size;
++      len_before_trim = skb->len;
++      if (unlikely(pskb_trim(skb, len)))
++              goto packet_processed;
++
++      routed_peer = wg_allowedips_lookup_src(&peer->device->peer_allowedips,
++                                             skb);
++      wg_peer_put(routed_peer); /* We don't need the extra reference. */
++
++      if (unlikely(routed_peer != peer)) /* pointer comparison only from here on */
++              goto dishonest_packet_peer;
++
++      if (unlikely(napi_gro_receive(&peer->napi, skb) == GRO_DROP)) {
++              ++dev->stats.rx_dropped;
++              net_dbg_ratelimited("%s: Failed to give packet to userspace from peer %llu (%pISpfsc)\n",
++                                  dev->name, peer->internal_id,
++                                  &peer->endpoint.addr);
++      } else {
++              update_rx_stats(peer, message_data_len(len_before_trim));
++      }
++      return; /* skb was handed to napi_gro_receive(); do not free it here */
++
++dishonest_packet_peer:
++      net_dbg_skb_ratelimited("%s: Packet has unallowed src IP (%pISc) from peer %llu (%pISpfsc)\n",
++                              dev->name, skb, peer->internal_id,
++                              &peer->endpoint.addr);
++      ++dev->stats.rx_errors;
++      ++dev->stats.rx_frame_errors;
++      goto packet_processed;
++dishonest_packet_type:
++      net_dbg_ratelimited("%s: Packet is neither ipv4 nor ipv6 from peer %llu (%pISpfsc)\n",
++                          dev->name, peer->internal_id, &peer->endpoint.addr);
++      ++dev->stats.rx_errors;
++      ++dev->stats.rx_frame_errors;
++      goto packet_processed;
++dishonest_packet_size:
++      net_dbg_ratelimited("%s: Packet has incorrect size from peer %llu (%pISpfsc)\n",
++                          dev->name, peer->internal_id, &peer->endpoint.addr);
++      ++dev->stats.rx_errors;
++      ++dev->stats.rx_length_errors;
++      goto packet_processed;
++packet_processed:
++      dev_kfree_skb(skb);
++}
++
++int wg_packet_rx_poll(struct napi_struct *napi, int budget)
++{ /* NAPI poll: consume finished packets from the head of the peer's rx ring. */
++      struct wg_peer *peer = container_of(napi, struct wg_peer, napi);
++      struct crypt_queue *queue = &peer->rx_queue;
++      struct noise_keypair *keypair;
++      struct endpoint endpoint;
++      enum packet_state state;
++      struct sk_buff *skb;
++      int work_done = 0;
++      bool free;
++
++      if (unlikely(budget <= 0))
++              return 0;
++
++      while ((skb = __ptr_ring_peek(&queue->ring)) != NULL &&
++             (state = atomic_read_acquire(&PACKET_CB(skb)->state)) !=
++                     PACKET_STATE_UNCRYPTED) { /* stop at the first unfinished skb */
++              __ptr_ring_discard_one(&queue->ring);
++              peer = PACKET_PEER(skb);
++              keypair = PACKET_CB(skb)->keypair;
++              free = true;
++
++              if (unlikely(state != PACKET_STATE_CRYPTED))
++                      goto next;
++
++              if (unlikely(!counter_validate(&keypair->receiving.counter,
++                                             PACKET_CB(skb)->nonce))) {
++                      net_dbg_ratelimited("%s: Packet has invalid nonce %llu (max %llu)\n",
++                                          peer->device->dev->name,
++                                          PACKET_CB(skb)->nonce,
++                                          keypair->receiving.counter.receive.counter);
++                      goto next;
++              }
++
++              if (unlikely(wg_socket_endpoint_from_skb(&endpoint, skb)))
++                      goto next;
++
++              wg_reset_packet(skb);
++              wg_packet_consume_data_done(peer, skb, &endpoint);
++              free = false; /* consume_data_done() passed on or freed the skb */
++
++next:
++              wg_noise_keypair_put(keypair, false);
++              wg_peer_put(peer);
++              if (unlikely(free))
++                      dev_kfree_skb(skb);
++
++              if (++work_done >= budget)
++                      break;
++      }
++
++      if (work_done < budget)
++              napi_complete_done(napi, work_done);
++
++      return work_done;
++}
++
++void wg_packet_decrypt_worker(struct work_struct *work)
++{ /* Decrypt every skb on this queue, then notify its peer's NAPI. */
++      struct crypt_queue *queue = container_of(work, struct multicore_worker,
++                                               work)->ptr;
++      struct sk_buff *skb;
++
++      while ((skb = ptr_ring_consume_bh(&queue->ring)) != NULL) {
++              enum packet_state state = likely(decrypt_packet(skb,
++                              &PACKET_CB(skb)->keypair->receiving)) ?
++                              PACKET_STATE_CRYPTED : PACKET_STATE_DEAD;
++              wg_queue_enqueue_per_peer_napi(skb, state);
++      }
++}
++
++static void wg_packet_consume_data(struct wg_device *wg, struct sk_buff *skb)
++{ /* Look up the keypair by key_idx and queue skb for decryption, or drop it. */
++      __le32 idx = ((struct message_data *)skb->data)->key_idx;
++      struct wg_peer *peer = NULL;
++      int ret;
++
++      rcu_read_lock_bh();
++      PACKET_CB(skb)->keypair =
++              (struct noise_keypair *)wg_index_hashtable_lookup(
++                      wg->index_hashtable, INDEX_HASHTABLE_KEYPAIR, idx,
++                      &peer);
++      if (unlikely(!wg_noise_keypair_get(PACKET_CB(skb)->keypair)))
++              goto err_keypair;
++
++      if (unlikely(READ_ONCE(peer->is_dead)))
++              goto err;
++
++      ret = wg_queue_enqueue_per_device_and_peer(&wg->decrypt_queue,
++                                                 &peer->rx_queue, skb,
++                                                 wg->packet_crypt_wq,
++                                                 &wg->decrypt_queue.last_cpu);
++      if (unlikely(ret == -EPIPE)) /* on the peer ring only: let the poll free it */
++              wg_queue_enqueue_per_peer_napi(skb, PACKET_STATE_DEAD);
++      if (likely(!ret || ret == -EPIPE)) {
++              rcu_read_unlock_bh();
++              return; /* skb and its references now belong to the queues */
++      }
++err:
++      wg_noise_keypair_put(PACKET_CB(skb)->keypair, false);
++err_keypair:
++      rcu_read_unlock_bh();
++      wg_peer_put(peer);
++      dev_kfree_skb(skb);
++}
++
++void wg_packet_receive(struct wg_device *wg, struct sk_buff *skb)
++{
++      if (unlikely(prepare_skb_header(skb, wg) < 0))
++              goto err;
++      switch (SKB_TYPE_LE32(skb)) {
++      case cpu_to_le32(MESSAGE_HANDSHAKE_INITIATION):
++      case cpu_to_le32(MESSAGE_HANDSHAKE_RESPONSE):
++      case cpu_to_le32(MESSAGE_HANDSHAKE_COOKIE): {
++              int cpu;
++
++              if (skb_queue_len(&wg->incoming_handshakes) >
++                          MAX_QUEUED_INCOMING_HANDSHAKES ||
++                  unlikely(!rng_is_initialized())) {
++                      net_dbg_skb_ratelimited("%s: Dropping handshake packet from %pISpfsc\n",
++                                              wg->dev->name, skb);
++                      goto err;
++              }
++              skb_queue_tail(&wg->incoming_handshakes, skb);
++              /* Queues up a call to packet_process_queued_handshake_
++               * packets(skb):
++               */
++              cpu = wg_cpumask_next_online(&wg->incoming_handshake_cpu);
++              queue_work_on(cpu, wg->handshake_receive_wq,
++                      &per_cpu_ptr(wg->incoming_handshakes_worker, cpu)->work);
++              break;
++      }
++      case cpu_to_le32(MESSAGE_DATA):
++              PACKET_CB(skb)->ds = ip_tunnel_get_dsfield(ip_hdr(skb), skb);
++              wg_packet_consume_data(wg, skb);
++              break;
++      default:
++              net_dbg_skb_ratelimited("%s: Invalid packet from %pISpfsc\n",
++                                      wg->dev->name, skb);
++              goto err;
++      }
++      return;
++
++err:
++      dev_kfree_skb(skb);
++}
+--- /dev/null
++++ b/drivers/net/wireguard/selftest/allowedips.c
+@@ -0,0 +1,683 @@
++// SPDX-License-Identifier: GPL-2.0
++/*
++ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
++ *
++ * This contains some basic static unit tests for the allowedips data structure.
++ * It also has two additional modes that are disabled and meant to be used by
++ * folks directly playing with this file. If you define the macro
++ * DEBUG_PRINT_TRIE_GRAPHVIZ to be 1, then every time there's a full tree in
++ * memory, it will be printed out as KERN_DEBUG in a format that can be passed
++ * to graphviz (the dot command) to visualize it. If you define the macro
++ * DEBUG_RANDOM_TRIE to be 1, then there will be an extremely costly set of
++ * randomized tests done against a trivial implementation, which may take
++ * upwards of a half-hour to complete. There's no set of users who should be
++ * enabling these, and the only developers that should go anywhere near these
++ * knobs are the ones who are reading this comment.
++ */
++
++#ifdef DEBUG
++
++#include <linux/siphash.h>
++
++static __init void swap_endian_and_apply_cidr(u8 *dst, const u8 *src, u8 bits,
++                                            u8 cidr)
++{
++      swap_endian(dst, src, bits);
++      memset(dst + (cidr + 7) / 8, 0, bits / 8 - (cidr + 7) / 8);
++      if (cidr)
++              dst[(cidr + 7) / 8 - 1] &= ~0U << ((8 - (cidr % 8)) % 8);
++}
++
++/* Dump node, then each child edge and subtree, as graphviz at KERN_DEBUG. */
++static __init void print_node(struct allowedips_node *node, u8 bits)
++{
++      char *fmt_connection = KERN_DEBUG "\t\"%p/%d\" -> \"%p/%d\";\n";
++      char *fmt_declaration = KERN_DEBUG
++              "\t\"%p/%d\"[style=%s, color=\"#%06x\"];\n";
++      struct allowedips_node *child;
++      char *style = "dotted";
++      u8 ip1[16], ip2[16];
++      u32 color = 0;
++      int i;
++
++      if (bits == 32) {
++              fmt_connection = KERN_DEBUG "\t\"%pI4/%d\" -> \"%pI4/%d\";\n";
++              fmt_declaration = KERN_DEBUG
++                      "\t\"%pI4/%d\"[style=%s, color=\"#%06x\"];\n";
++      } else if (bits == 128) {
++              fmt_connection = KERN_DEBUG "\t\"%pI6/%d\" -> \"%pI6/%d\";\n";
++              fmt_declaration = KERN_DEBUG
++                      "\t\"%pI6/%d\"[style=%s, color=\"#%06x\"];\n";
++      }
++      if (node->peer) {
++              hsiphash_key_t key = { { 0 } };
++
++              memcpy(&key, &node->peer, sizeof(node->peer));
++              color = hsiphash_1u32(0xdeadbeef, &key) % 200 << 16 |
++                      hsiphash_1u32(0xbabecafe, &key) % 200 << 8 |
++                      hsiphash_1u32(0xabad1dea, &key) % 200;
++              style = "bold";
++      }
++      swap_endian_and_apply_cidr(ip1, node->bits, bits, node->cidr);
++      printk(fmt_declaration, ip1, node->cidr, style, color);
++      for (i = 0; i < 2; ++i) {
++              child = rcu_dereference_raw(node->bit[i]);
++              if (!child)
++                      continue;
++              /* Mask with the child's own cidr, as its declaration
++               * does; the parent's cidr would truncate the address
++               * and this edge would point at a nonexistent vertex.
++               */
++              swap_endian_and_apply_cidr(ip2, child->bits, bits,
++                                         child->cidr);
++              printk(fmt_connection, ip1, node->cidr, ip2, child->cidr);
++              print_node(child, bits);
++      }
++}
++
++static __init void print_tree(struct allowedips_node __rcu *top, u8 bits)
++{
++      printk(KERN_DEBUG "digraph trie {\n");
++      print_node(rcu_dereference_raw(top), bits);
++      printk(KERN_DEBUG "}\n");
++}
++
++enum {
++      NUM_PEERS = 2000,
++      NUM_RAND_ROUTES = 400,
++      NUM_MUTATED_ROUTES = 100,
++      NUM_QUERIES = NUM_RAND_ROUTES * NUM_MUTATED_ROUTES * 30
++};
++
++struct horrible_allowedips {
++      struct hlist_head head;
++};
++
++struct horrible_allowedips_node {
++      struct hlist_node table;
++      union nf_inet_addr ip;
++      union nf_inet_addr mask;
++      u8 ip_version;
++      void *value;
++};
++
++static __init void horrible_allowedips_init(struct horrible_allowedips *table)
++{
++      INIT_HLIST_HEAD(&table->head);
++}
++
++static __init void horrible_allowedips_free(struct horrible_allowedips *table)
++{
++      struct horrible_allowedips_node *node;
++      struct hlist_node *h;
++
++      hlist_for_each_entry_safe(node, h, &table->head, table) {
++              hlist_del(&node->table);
++              kfree(node);
++      }
++}
++
++static __init inline union nf_inet_addr horrible_cidr_to_mask(u8 cidr)
++{
++      union nf_inet_addr mask;
++
++      memset(&mask, 0x00, 128 / 8);
++      memset(&mask, 0xff, cidr / 8);
++      if (cidr % 32)
++              mask.all[cidr / 32] = (__force u32)htonl(
++                      (0xFFFFFFFFUL << (32 - (cidr % 32))) & 0xFFFFFFFFUL);
++      return mask;
++}
++
++static __init inline u8 horrible_mask_to_cidr(union nf_inet_addr subnet)
++{
++      return hweight32(subnet.all[0]) + hweight32(subnet.all[1]) +
++             hweight32(subnet.all[2]) + hweight32(subnet.all[3]);
++}
++
++static __init inline void
++horrible_mask_self(struct horrible_allowedips_node *node)
++{
++      if (node->ip_version == 4) {
++              node->ip.ip &= node->mask.ip;
++      } else if (node->ip_version == 6) {
++              node->ip.ip6[0] &= node->mask.ip6[0];
++              node->ip.ip6[1] &= node->mask.ip6[1];
++              node->ip.ip6[2] &= node->mask.ip6[2];
++              node->ip.ip6[3] &= node->mask.ip6[3];
++      }
++}
++
++static __init inline bool
++horrible_match_v4(const struct horrible_allowedips_node *node,
++                struct in_addr *ip)
++{
++      return (ip->s_addr & node->mask.ip) == node->ip.ip;
++}
++
++static __init inline bool
++horrible_match_v6(const struct horrible_allowedips_node *node,
++                struct in6_addr *ip)
++{
++      return (ip->in6_u.u6_addr32[0] & node->mask.ip6[0]) ==
++                     node->ip.ip6[0] &&
++             (ip->in6_u.u6_addr32[1] & node->mask.ip6[1]) ==
++                     node->ip.ip6[1] &&
++             (ip->in6_u.u6_addr32[2] & node->mask.ip6[2]) ==
++                     node->ip.ip6[2] &&
++             (ip->in6_u.u6_addr32[3] & node->mask.ip6[3]) == node->ip.ip6[3];
++}
++
++static __init void
++horrible_insert_ordered(struct horrible_allowedips *table,
++                      struct horrible_allowedips_node *node)
++{
++      struct horrible_allowedips_node *other = NULL, *where = NULL;
++      u8 my_cidr = horrible_mask_to_cidr(node->mask);
++
++      hlist_for_each_entry(other, &table->head, table) {
++              if (!memcmp(&other->mask, &node->mask,
++                          sizeof(union nf_inet_addr)) &&
++                  !memcmp(&other->ip, &node->ip,
++                          sizeof(union nf_inet_addr)) &&
++                  other->ip_version == node->ip_version) {
++                      other->value = node->value;
++                      kfree(node);
++                      return;
++              }
++              where = other;
++              if (horrible_mask_to_cidr(other->mask) <= my_cidr)
++                      break;
++      }
++      if (!other && !where)
++              hlist_add_head(&node->table, &table->head);
++      else if (!other)
++              hlist_add_behind(&node->table, &where->table);
++      else
++              hlist_add_before(&node->table, &where->table);
++}
++
++static __init int
++horrible_allowedips_insert_v4(struct horrible_allowedips *table,
++                            struct in_addr *ip, u8 cidr, void *value)
++{
++      struct horrible_allowedips_node *node = kzalloc(sizeof(*node),
++                                                      GFP_KERNEL);
++
++      if (unlikely(!node))
++              return -ENOMEM;
++      node->ip.in = *ip;
++      node->mask = horrible_cidr_to_mask(cidr);
++      node->ip_version = 4;
++      node->value = value;
++      horrible_mask_self(node);
++      horrible_insert_ordered(table, node);
++      return 0;
++}
++
++static __init int
++horrible_allowedips_insert_v6(struct horrible_allowedips *table,
++                            struct in6_addr *ip, u8 cidr, void *value)
++{
++      struct horrible_allowedips_node *node = kzalloc(sizeof(*node),
++                                                      GFP_KERNEL);
++
++      if (unlikely(!node))
++              return -ENOMEM;
++      node->ip.in6 = *ip;
++      node->mask = horrible_cidr_to_mask(cidr);
++      node->ip_version = 6;
++      node->value = value;
++      horrible_mask_self(node);
++      horrible_insert_ordered(table, node);
++      return 0;
++}
++
++static __init void *
++horrible_allowedips_lookup_v4(struct horrible_allowedips *table,
++                            struct in_addr *ip)
++{
++      struct horrible_allowedips_node *entry;
++
++      /* Walk the list in order; the first IPv4 entry that matches ip
++       * decides the result, entries of any other version are skipped.
++       */
++      hlist_for_each_entry(entry, &table->head, table) {
++              if (entry->ip_version == 4 && horrible_match_v4(entry, ip))
++                      return entry->value;
++      }
++
++      /* No entry matched. */
++      return NULL;
++}
++
++static __init void *
++horrible_allowedips_lookup_v6(struct horrible_allowedips *table,
++                            struct in6_addr *ip)
++{
++      struct horrible_allowedips_node *entry;
++
++      /* Walk the list in order; the first IPv6 entry that matches ip
++       * decides the result, entries of any other version are skipped.
++       */
++      hlist_for_each_entry(entry, &table->head, table) {
++              if (entry->ip_version == 6 && horrible_match_v6(entry, ip))
++                      return entry->value;
++      }
++
++      /* No entry matched. */
++      return NULL;
++}
++
++/* Fuzz the trie against the brute-force horrible_allowedips reference. */
++static __init bool randomized_test(void)
++{
++      unsigned int i, j, k, mutate_amount, cidr;
++      u8 ip[16], mutate_mask[16], mutated[16];
++      struct wg_peer **peers, *peer;
++      struct horrible_allowedips h;
++      DEFINE_MUTEX(mutex);
++      struct allowedips t;
++      bool ret = false;
++
++      mutex_init(&mutex);
++
++      wg_allowedips_init(&t);
++      horrible_allowedips_init(&h);
++
++      peers = kcalloc(NUM_PEERS, sizeof(*peers), GFP_KERNEL);
++      if (unlikely(!peers)) {
++              pr_err("allowedips random self-test malloc: FAIL\n");
++              goto free;
++      }
++      for (i = 0; i < NUM_PEERS; ++i) {
++              peers[i] = kzalloc(sizeof(*peers[i]), GFP_KERNEL);
++              if (unlikely(!peers[i])) {
++                      pr_err("allowedips random self-test malloc: FAIL\n");
++                      goto free;
++              }
++              kref_init(&peers[i]->refcount);
++              /* As in init_peer(): trie inserts link nodes onto this list. */
++              INIT_LIST_HEAD(&peers[i]->allowedips_list);
++      }
++
++      mutex_lock(&mutex);
++
++      for (i = 0; i < NUM_RAND_ROUTES; ++i) {
++              prandom_bytes(ip, 4);
++              cidr = prandom_u32_max(32) + 1;
++              peer = peers[prandom_u32_max(NUM_PEERS)];
++              if (wg_allowedips_insert_v4(&t, (struct in_addr *)ip, cidr,
++                                          peer, &mutex) < 0) {
++                      pr_err("allowedips random self-test malloc: FAIL\n");
++                      goto free_locked;
++              }
++              if (horrible_allowedips_insert_v4(&h, (struct in_addr *)ip,
++                                                cidr, peer) < 0) {
++                      pr_err("allowedips random self-test malloc: FAIL\n");
++                      goto free_locked;
++              }
++              for (j = 0; j < NUM_MUTATED_ROUTES; ++j) {
++                      memcpy(mutated, ip, 4);
++                      prandom_bytes(mutate_mask, 4);
++                      mutate_amount = prandom_u32_max(32);
++                      for (k = 0; k < mutate_amount / 8; ++k)
++                              mutate_mask[k] = 0xff;
++                      mutate_mask[k] = 0xff
++                                       << ((8 - (mutate_amount % 8)) % 8);
++                      for (; k < 4; ++k)
++                              mutate_mask[k] = 0;
++                      for (k = 0; k < 4; ++k)
++                              mutated[k] = (mutated[k] & mutate_mask[k]) |
++                                           (~mutate_mask[k] &
++                                            prandom_u32_max(256));
++                      cidr = prandom_u32_max(32) + 1;
++                      peer = peers[prandom_u32_max(NUM_PEERS)];
++                      if (wg_allowedips_insert_v4(&t,
++                                                  (struct in_addr *)mutated,
++                                                  cidr, peer, &mutex) < 0) {
++                              pr_err("allowedips random malloc: FAIL\n");
++                              goto free_locked;
++                      }
++                      if (horrible_allowedips_insert_v4(&h,
++                              (struct in_addr *)mutated, cidr, peer)) {
++                              pr_err("allowedips random self-test malloc: FAIL\n");
++                              goto free_locked;
++                      }
++              }
++      }
++
++      for (i = 0; i < NUM_RAND_ROUTES; ++i) {
++              prandom_bytes(ip, 16);
++              cidr = prandom_u32_max(128) + 1;
++              peer = peers[prandom_u32_max(NUM_PEERS)];
++              if (wg_allowedips_insert_v6(&t, (struct in6_addr *)ip, cidr,
++                                          peer, &mutex) < 0) {
++                      pr_err("allowedips random self-test malloc: FAIL\n");
++                      goto free_locked;
++              }
++              if (horrible_allowedips_insert_v6(&h, (struct in6_addr *)ip,
++                                                cidr, peer) < 0) {
++                      pr_err("allowedips random self-test malloc: FAIL\n");
++                      goto free_locked;
++              }
++              for (j = 0; j < NUM_MUTATED_ROUTES; ++j) {
++                      memcpy(mutated, ip, 16);
++                      prandom_bytes(mutate_mask, 16);
++                      mutate_amount = prandom_u32_max(128);
++                      for (k = 0; k < mutate_amount / 8; ++k)
++                              mutate_mask[k] = 0xff;
++                      mutate_mask[k] = 0xff
++                                       << ((8 - (mutate_amount % 8)) % 8);
++                      for (; k < 4; ++k)
++                              mutate_mask[k] = 0;
++                      for (k = 0; k < 4; ++k)
++                              mutated[k] = (mutated[k] & mutate_mask[k]) |
++                                           (~mutate_mask[k] &
++                                            prandom_u32_max(256));
++                      cidr = prandom_u32_max(128) + 1;
++                      peer = peers[prandom_u32_max(NUM_PEERS)];
++                      if (wg_allowedips_insert_v6(&t,
++                                                  (struct in6_addr *)mutated,
++                                                  cidr, peer, &mutex) < 0) {
++                              pr_err("allowedips random self-test malloc: FAIL\n");
++                              goto free_locked;
++                      }
++                      if (horrible_allowedips_insert_v6(&h,
++                              (struct in6_addr *)mutated, cidr, peer)) {
++                              pr_err("allowedips random self-test malloc: FAIL\n");
++                              goto free_locked;
++                      }
++              }
++      }
++
++      mutex_unlock(&mutex);
++
++      if (IS_ENABLED(DEBUG_PRINT_TRIE_GRAPHVIZ)) {
++              print_tree(t.root4, 32);
++              print_tree(t.root6, 128);
++      }
++
++      for (i = 0; i < NUM_QUERIES; ++i) {
++              prandom_bytes(ip, 4);
++              if (lookup(t.root4, 32, ip) !=
++                  horrible_allowedips_lookup_v4(&h, (struct in_addr *)ip)) {
++                      pr_err("allowedips random self-test: FAIL\n");
++                      goto free;
++              }
++      }
++
++      for (i = 0; i < NUM_QUERIES; ++i) {
++              prandom_bytes(ip, 16);
++              if (lookup(t.root6, 128, ip) !=
++                  horrible_allowedips_lookup_v6(&h, (struct in6_addr *)ip)) {
++                      pr_err("allowedips random self-test: FAIL\n");
++                      goto free;
++              }
++      }
++      ret = true;
++
++free:
++      mutex_lock(&mutex);
++free_locked:
++      wg_allowedips_free(&t, &mutex);
++      mutex_unlock(&mutex);
++      horrible_allowedips_free(&h);
++      for (i = 0; peers && i < NUM_PEERS; ++i)
++              kfree(peers[i]);
++      kfree(peers);
++      return ret;
++}
++
++static __init inline struct in_addr *ip4(u8 a, u8 b, u8 c, u8 d)
++{
++      static struct in_addr ip;
++      u8 *split = (u8 *)&ip;
++
++      split[0] = a;
++      split[1] = b;
++      split[2] = c;
++      split[3] = d;
++      return &ip;
++}
++
++static __init inline struct in6_addr *ip6(u32 a, u32 b, u32 c, u32 d)
++{
++      static struct in6_addr ip;
++      __be32 *split = (__be32 *)&ip;
++
++      split[0] = cpu_to_be32(a);
++      split[1] = cpu_to_be32(b);
++      split[2] = cpu_to_be32(c);
++      split[3] = cpu_to_be32(d);
++      return &ip;
++}
++
++static __init struct wg_peer *init_peer(void)
++{
++      struct wg_peer *peer = kzalloc(sizeof(*peer), GFP_KERNEL);
++
++      if (!peer)
++              return NULL;
++      kref_init(&peer->refcount);
++      INIT_LIST_HEAD(&peer->allowedips_list);
++      return peer;
++}
++
++#define insert(version, mem, ipa, ipb, ipc, ipd, cidr)                       \
++      wg_allowedips_insert_v##version(&t, ip##version(ipa, ipb, ipc, ipd), \
++                                      cidr, mem, &mutex)
++
++#define maybe_fail() do {                                               \
++              ++i;                                                    \
++              if (!_s) {                                              \
++                      pr_info("allowedips self-test %zu: FAIL\n", i); \
++                      success = false;                                \
++              }                                                       \
++      } while (0)
++
++#define test(version, mem, ipa, ipb, ipc, ipd) do {                          \
++              bool _s = lookup(t.root##version, (version) == 4 ? 32 : 128, \
++                               ip##version(ipa, ipb, ipc, ipd)) == (mem);  \
++              maybe_fail();                                                \
++      } while (0)
++
++#define test_negative(version, mem, ipa, ipb, ipc, ipd) do {                 \
++              bool _s = lookup(t.root##version, (version) == 4 ? 32 : 128, \
++                               ip##version(ipa, ipb, ipc, ipd)) != (mem);  \
++              maybe_fail();                                                \
++      } while (0)
++
++#define test_boolean(cond) do {   \
++              bool _s = (cond); \
++              maybe_fail();     \
++      } while (0)
++
++bool __init wg_allowedips_selftest(void)
++{
++      bool found_a = false, found_b = false, found_c = false, found_d = false,
++           found_e = false, found_other = false;
++      struct wg_peer *a = init_peer(), *b = init_peer(), *c = init_peer(),
++                     *d = init_peer(), *e = init_peer(), *f = init_peer(),
++                     *g = init_peer(), *h = init_peer();
++      struct allowedips_node *iter_node;
++      bool success = false;
++      struct allowedips t;
++      DEFINE_MUTEX(mutex);
++      struct in6_addr ip;
++      size_t i = 0, count = 0;
++      __be64 part;
++
++      mutex_init(&mutex);
++      mutex_lock(&mutex);
++      wg_allowedips_init(&t);
++
++      if (!a || !b || !c || !d || !e || !f || !g || !h) {
++              pr_err("allowedips self-test malloc: FAIL\n");
++              goto free;
++      }
++
++      insert(4, a, 192, 168, 4, 0, 24);
++      insert(4, b, 192, 168, 4, 4, 32);
++      insert(4, c, 192, 168, 0, 0, 16);
++      insert(4, d, 192, 95, 5, 64, 27);
++      /* replaces previous entry, and maskself is required */
++      insert(4, c, 192, 95, 5, 65, 27);
++      insert(6, d, 0x26075300, 0x60006b00, 0, 0xc05f0543, 128);
++      insert(6, c, 0x26075300, 0x60006b00, 0, 0, 64);
++      insert(4, e, 0, 0, 0, 0, 0);
++      insert(6, e, 0, 0, 0, 0, 0);
++      /* replaces previous entry */
++      insert(6, f, 0, 0, 0, 0, 0);
++      insert(6, g, 0x24046800, 0, 0, 0, 32);
++      /* maskself is required */
++      insert(6, h, 0x24046800, 0x40040800, 0xdeadbeef, 0xdeadbeef, 64);
++      insert(6, a, 0x24046800, 0x40040800, 0xdeadbeef, 0xdeadbeef, 128);
++      insert(6, c, 0x24446800, 0x40e40800, 0xdeaebeef, 0xdefbeef, 128);
++      insert(6, b, 0x24446800, 0xf0e40800, 0xeeaebeef, 0, 98);
++      insert(4, g, 64, 15, 112, 0, 20);
++      /* maskself is required */
++      insert(4, h, 64, 15, 123, 211, 25);
++      insert(4, a, 10, 0, 0, 0, 25);
++      insert(4, b, 10, 0, 0, 128, 25);
++      insert(4, a, 10, 1, 0, 0, 30);
++      insert(4, b, 10, 1, 0, 4, 30);
++      insert(4, c, 10, 1, 0, 8, 29);
++      insert(4, d, 10, 1, 0, 16, 29);
++
++      if (IS_ENABLED(DEBUG_PRINT_TRIE_GRAPHVIZ)) {
++              print_tree(t.root4, 32);
++              print_tree(t.root6, 128);
++      }
++
++      success = true;
++
++      test(4, a, 192, 168, 4, 20);
++      test(4, a, 192, 168, 4, 0);
++      test(4, b, 192, 168, 4, 4);
++      test(4, c, 192, 168, 200, 182);
++      test(4, c, 192, 95, 5, 68);
++      test(4, e, 192, 95, 5, 96);
++      test(6, d, 0x26075300, 0x60006b00, 0, 0xc05f0543);
++      test(6, c, 0x26075300, 0x60006b00, 0, 0xc02e01ee);
++      test(6, f, 0x26075300, 0x60006b01, 0, 0);
++      test(6, g, 0x24046800, 0x40040806, 0, 0x1006);
++      test(6, g, 0x24046800, 0x40040806, 0x1234, 0x5678);
++      test(6, f, 0x240467ff, 0x40040806, 0x1234, 0x5678);
++      test(6, f, 0x24046801, 0x40040806, 0x1234, 0x5678);
++      test(6, h, 0x24046800, 0x40040800, 0x1234, 0x5678);
++      test(6, h, 0x24046800, 0x40040800, 0, 0);
++      test(6, h, 0x24046800, 0x40040800, 0x10101010, 0x10101010);
++      test(6, a, 0x24046800, 0x40040800, 0xdeadbeef, 0xdeadbeef);
++      test(4, g, 64, 15, 116, 26);
++      test(4, g, 64, 15, 127, 3);
++      test(4, g, 64, 15, 123, 1);
++      test(4, h, 64, 15, 123, 128);
++      test(4, h, 64, 15, 123, 129);
++      test(4, a, 10, 0, 0, 52);
++      test(4, b, 10, 0, 0, 220);
++      test(4, a, 10, 1, 0, 2);
++      test(4, b, 10, 1, 0, 6);
++      test(4, c, 10, 1, 0, 10);
++      test(4, d, 10, 1, 0, 20);
++
++      insert(4, a, 1, 0, 0, 0, 32);
++      insert(4, a, 64, 0, 0, 0, 32);
++      insert(4, a, 128, 0, 0, 0, 32);
++      insert(4, a, 192, 0, 0, 0, 32);
++      insert(4, a, 255, 0, 0, 0, 32);
++      wg_allowedips_remove_by_peer(&t, a, &mutex);
++      test_negative(4, a, 1, 0, 0, 0);
++      test_negative(4, a, 64, 0, 0, 0);
++      test_negative(4, a, 128, 0, 0, 0);
++      test_negative(4, a, 192, 0, 0, 0);
++      test_negative(4, a, 255, 0, 0, 0);
++
++      wg_allowedips_free(&t, &mutex);
++      wg_allowedips_init(&t);
++      insert(4, a, 192, 168, 0, 0, 16);
++      insert(4, a, 192, 168, 0, 0, 24);
++      wg_allowedips_remove_by_peer(&t, a, &mutex);
++      test_negative(4, a, 192, 168, 0, 1);
++
++      /* These will hit the WARN_ON(len >= 128) in free_node if something
++       * goes wrong.
++       */
++      for (i = 0; i < 128; ++i) {
++              part = cpu_to_be64(~(1LLU << (i % 64)));
++              memset(&ip, 0xff, 16);
++              memcpy((u8 *)&ip + (i < 64) * 8, &part, 8);
++              wg_allowedips_insert_v6(&t, &ip, 128, a, &mutex);
++      }
++
++      wg_allowedips_free(&t, &mutex);
++
++      wg_allowedips_init(&t);
++      insert(4, a, 192, 95, 5, 93, 27);
++      insert(6, a, 0x26075300, 0x60006b00, 0, 0xc05f0543, 128);
++      insert(4, a, 10, 1, 0, 20, 29);
++      insert(6, a, 0x26075300, 0x6d8a6bf8, 0xdab1f1df, 0xc05f1523, 83);
++      insert(6, a, 0x26075300, 0x6d8a6bf8, 0xdab1f1df, 0xc05f1523, 21);
++      list_for_each_entry(iter_node, &a->allowedips_list, peer_list) {
++              u8 cidr, ip[16] __aligned(__alignof(u64));
++              int family = wg_allowedips_read_node(iter_node, ip, &cidr);
++
++              count++;
++
++              if (cidr == 27 && family == AF_INET &&
++                  !memcmp(ip, ip4(192, 95, 5, 64), sizeof(struct in_addr)))
++                      found_a = true;
++              else if (cidr == 128 && family == AF_INET6 &&
++                       !memcmp(ip, ip6(0x26075300, 0x60006b00, 0, 0xc05f0543),
++                               sizeof(struct in6_addr)))
++                      found_b = true;
++              else if (cidr == 29 && family == AF_INET &&
++                       !memcmp(ip, ip4(10, 1, 0, 16), sizeof(struct in_addr)))
++                      found_c = true;
++              else if (cidr == 83 && family == AF_INET6 &&
++                       !memcmp(ip, ip6(0x26075300, 0x6d8a6bf8, 0xdab1e000, 0),
++                               sizeof(struct in6_addr)))
++                      found_d = true;
++              else if (cidr == 21 && family == AF_INET6 &&
++                       !memcmp(ip, ip6(0x26075000, 0, 0, 0),
++                               sizeof(struct in6_addr)))
++                      found_e = true;
++              else
++                      found_other = true;
++      }
++      test_boolean(count == 5);
++      test_boolean(found_a);
++      test_boolean(found_b);
++      test_boolean(found_c);
++      test_boolean(found_d);
++      test_boolean(found_e);
++      test_boolean(!found_other);
++
++      if (IS_ENABLED(DEBUG_RANDOM_TRIE) && success)
++              success = randomized_test();
++
++      if (success)
++              pr_info("allowedips self-tests: pass\n");
++
++free:
++      wg_allowedips_free(&t, &mutex);
++      kfree(a);
++      kfree(b);
++      kfree(c);
++      kfree(d);
++      kfree(e);
++      kfree(f);
++      kfree(g);
++      kfree(h);
++      mutex_unlock(&mutex);
++
++      return success;
++}
++
++#undef test_boolean
++#undef test_negative
++#undef test
++#undef maybe_fail
++#undef insert
++
++#endif
+--- /dev/null
++++ b/drivers/net/wireguard/selftest/counter.c
+@@ -0,0 +1,104 @@
++// SPDX-License-Identifier: GPL-2.0
++/*
++ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
++ */
++
++#ifdef DEBUG
++bool __init wg_packet_counter_selftest(void)
++{
++      unsigned int test_num = 0, i;
++      union noise_counter counter;
++      bool success = true;
++
++#define T_INIT do {                                               \
++              memset(&counter, 0, sizeof(union noise_counter)); \
++              spin_lock_init(&counter.receive.lock);            \
++      } while (0)
++#define T_LIM (COUNTER_WINDOW_SIZE + 1)
++#define T(n, v) do {                                                  \
++              ++test_num;                                           \
++              if (counter_validate(&counter, n) != (v)) {           \
++                      pr_err("nonce counter self-test %u: FAIL\n",  \
++                             test_num);                             \
++                      success = false;                              \
++              }                                                     \
++      } while (0)
++
++      T_INIT;
++      /*  1 */ T(0, true);
++      /*  2 */ T(1, true);
++      /*  3 */ T(1, false);
++      /*  4 */ T(9, true);
++      /*  5 */ T(8, true);
++      /*  6 */ T(7, true);
++      /*  7 */ T(7, false);
++      /*  8 */ T(T_LIM, true);
++      /*  9 */ T(T_LIM - 1, true);
++      /* 10 */ T(T_LIM - 1, false);
++      /* 11 */ T(T_LIM - 2, true);
++      /* 12 */ T(2, true);
++      /* 13 */ T(2, false);
++      /* 14 */ T(T_LIM + 16, true);
++      /* 15 */ T(3, false);
++      /* 16 */ T(T_LIM + 16, false);
++      /* 17 */ T(T_LIM * 4, true);
++      /* 18 */ T(T_LIM * 4 - (T_LIM - 1), true);
++      /* 19 */ T(10, false);
++      /* 20 */ T(T_LIM * 4 - T_LIM, false);
++      /* 21 */ T(T_LIM * 4 - (T_LIM + 1), false);
++      /* 22 */ T(T_LIM * 4 - (T_LIM - 2), true);
++      /* 23 */ T(T_LIM * 4 + 1 - T_LIM, false);
++      /* 24 */ T(0, false);
++      /* 25 */ T(REJECT_AFTER_MESSAGES, false);
++      /* 26 */ T(REJECT_AFTER_MESSAGES - 1, true);
++      /* 27 */ T(REJECT_AFTER_MESSAGES, false);
++      /* 28 */ T(REJECT_AFTER_MESSAGES - 1, false);
++      /* 29 */ T(REJECT_AFTER_MESSAGES - 2, true);
++      /* 30 */ T(REJECT_AFTER_MESSAGES + 1, false);
++      /* 31 */ T(REJECT_AFTER_MESSAGES + 2, false);
++      /* 32 */ T(REJECT_AFTER_MESSAGES - 2, false);
++      /* 33 */ T(REJECT_AFTER_MESSAGES - 3, true);
++      /* 34 */ T(0, false);
++
++      T_INIT;
++      for (i = 1; i <= COUNTER_WINDOW_SIZE; ++i)
++              T(i, true);
++      T(0, true);
++      T(0, false);
++
++      T_INIT;
++      for (i = 2; i <= COUNTER_WINDOW_SIZE + 1; ++i)
++              T(i, true);
++      T(1, true);
++      T(0, false);
++
++      T_INIT;
++      for (i = COUNTER_WINDOW_SIZE + 1; i-- > 0;)
++              T(i, true);
++
++      T_INIT;
++      for (i = COUNTER_WINDOW_SIZE + 2; i-- > 1;)
++              T(i, true);
++      T(0, false);
++
++      T_INIT;
++      for (i = COUNTER_WINDOW_SIZE + 1; i-- > 1;)
++              T(i, true);
++      T(COUNTER_WINDOW_SIZE + 1, true);
++      T(0, false);
++
++      T_INIT;
++      for (i = COUNTER_WINDOW_SIZE + 1; i-- > 1;)
++              T(i, true);
++      T(0, true);
++      T(COUNTER_WINDOW_SIZE + 1, true);
++
++#undef T
++#undef T_LIM
++#undef T_INIT
++
++      if (success)
++              pr_info("nonce counter self-tests: pass\n");
++      return success;
++}
++#endif
+--- /dev/null
++++ b/drivers/net/wireguard/selftest/ratelimiter.c
+@@ -0,0 +1,226 @@
++// SPDX-License-Identifier: GPL-2.0
++/*
++ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
++ */
++
++#ifdef DEBUG
++
++#include <linux/jiffies.h>
++
++/*
++ * Script driving timings_test(): for step i, .msec_to_sleep_before is how long
++ * the test sleeps before querying the ratelimiter and .result is the verdict
++ * it then requires from wg_ratelimiter_allow().
++ */
++static const struct {
++      bool result;
++      unsigned int msec_to_sleep_before;
++} expected_results[] __initconst = {
++      [0 ... PACKETS_BURSTABLE - 1] = { true, 0 },
++      [PACKETS_BURSTABLE] = { false, 0 },
++      [PACKETS_BURSTABLE + 1] = { true, MSEC_PER_SEC / PACKETS_PER_SECOND },
++      [PACKETS_BURSTABLE + 2] = { false, 0 },
++      [PACKETS_BURSTABLE + 3] = { true, (MSEC_PER_SEC / PACKETS_PER_SECOND) * 2 },
++      [PACKETS_BURSTABLE + 4] = { true, 0 },
++      [PACKETS_BURSTABLE + 5] = { false, 0 }
++};
++
++/*
++ * Latest acceptable elapsed time, in jiffies, for script step @index: the sum
++ * of every scripted sleep up to and including that step, plus a slack of
++ * roughly two thirds of one packet interval.
++ */
++static __init unsigned int maximum_jiffies_at_index(int index)
++{
++      unsigned int budget_msecs = 2 * MSEC_PER_SEC / PACKETS_PER_SECOND / 3;
++      int step = 0;
++
++      while (step <= index) {
++              budget_msecs += expected_results[step].msec_to_sleep_before;
++              ++step;
++      }
++      return msecs_to_jiffies(budget_msecs);
++}
++
++/*
++ * Replays the expected_results[] script against the ratelimiter.
++ *
++ * For each step the scripted source address (IPv4, and IPv6 when enabled) must
++ * get the scripted verdict, while a per-step perturbed source address must
++ * always be allowed. *test is incremented after every passed check so the
++ * caller can report which check failed.
++ *
++ * Returns 0 on success, -EXFULL on a wrong verdict, or -ETIMEDOUT when the
++ * elapsed time exceeds maximum_jiffies_at_index() for the current step.
++ */
++static __init int timings_test(struct sk_buff *skb4, struct iphdr *hdr4,
++                             struct sk_buff *skb6, struct ipv6hdr *hdr6,
++                             int *test)
++{
++      unsigned long loop_start_time;
++      int i;
++
++      wg_ratelimiter_gc_entries(NULL);
++      rcu_barrier();
++      loop_start_time = jiffies;
++
++      for (i = 0; i < ARRAY_SIZE(expected_results); ++i) {
++              if (expected_results[i].msec_to_sleep_before)
++                      msleep(expected_results[i].msec_to_sleep_before);
++
++              /* The deadline is re-checked before every query below. */
++              if (time_is_before_jiffies(loop_start_time +
++                                         maximum_jiffies_at_index(i)))
++                      return -ETIMEDOUT;
++              if (wg_ratelimiter_allow(skb4, &init_net) !=
++                                      expected_results[i].result)
++                      return -EXFULL;
++              ++(*test);
++
++              /* Temporarily move to a different IPv4 source address. */
++              hdr4->saddr = htonl(ntohl(hdr4->saddr) + i + 1);
++              if (time_is_before_jiffies(loop_start_time +
++                                         maximum_jiffies_at_index(i)))
++                      return -ETIMEDOUT;
++              if (!wg_ratelimiter_allow(skb4, &init_net))
++                      return -EXFULL;
++              ++(*test);
++
++              /* Restore the scripted IPv4 source address. */
++              hdr4->saddr = htonl(ntohl(hdr4->saddr) - i - 1);
++
++#if IS_ENABLED(CONFIG_IPV6)
++              hdr6->saddr.in6_u.u6_addr32[2] = htonl(i);
++              hdr6->saddr.in6_u.u6_addr32[3] = htonl(i);
++              if (time_is_before_jiffies(loop_start_time +
++                                         maximum_jiffies_at_index(i)))
++                      return -ETIMEDOUT;
++              if (wg_ratelimiter_allow(skb6, &init_net) !=
++                                      expected_results[i].result)
++                      return -EXFULL;
++              ++(*test);
++
++              /* Temporarily change the first 32 bits of the IPv6 source. */
++              hdr6->saddr.in6_u.u6_addr32[0] =
++                      htonl(ntohl(hdr6->saddr.in6_u.u6_addr32[0]) + i + 1);
++              if (time_is_before_jiffies(loop_start_time +
++                                         maximum_jiffies_at_index(i)))
++                      return -ETIMEDOUT;
++              if (!wg_ratelimiter_allow(skb6, &init_net))
++                      return -EXFULL;
++              ++(*test);
++
++              /* Undo the change to the first 32 bits. */
++              hdr6->saddr.in6_u.u6_addr32[0] =
++                      htonl(ntohl(hdr6->saddr.in6_u.u6_addr32[0]) - i - 1);
++
++              if (time_is_before_jiffies(loop_start_time +
++                                         maximum_jiffies_at_index(i)))
++                      return -ETIMEDOUT;
++#endif
++      }
++      return 0;
++}
++
++/*
++ * Checks the ratelimiter's entry limit: after a garbage-collection pass the
++ * entry count must read zero, then source addresses 0 .. max_entries - 1 must
++ * each be allowed while the one numbered max_entries must be refused.
++ * *test is incremented after every passed check.
++ *
++ * Returns 0 on success or -EXFULL on the first failed check.
++ */
++static __init int capacity_test(struct sk_buff *skb4, struct iphdr *hdr4,
++                              int *test)
++{
++      int i;
++
++      wg_ratelimiter_gc_entries(NULL);
++      rcu_barrier();
++
++      if (atomic_read(&total_entries))
++              return -EXFULL;
++      ++(*test);
++
++      for (i = 0; i <= max_entries; ++i) {
++              hdr4->saddr = htonl(i);
++              /* Only the final iteration (i == max_entries) must be denied. */
++              if (wg_ratelimiter_allow(skb4, &init_net) != (i != max_entries))
++                      return -EXFULL;
++              ++(*test);
++      }
++      return 0;
++}
++
++/*
++ * Boot-time self-test of the ratelimiter.
++ *
++ * Initialises the ratelimiter three times, builds one IPv4 skb (and one IPv6
++ * skb when CONFIG_IPV6 is enabled), then runs timings_test() and
++ * capacity_test(), retrying each up to TRIALS_BEFORE_GIVING_UP times.
++ * "test" counts the checks passed so far and is printed on failure.
++ *
++ * Returns true on success. Also returns true without testing anything when
++ * KASAN or UBSAN is enabled.
++ */
++bool __init wg_ratelimiter_selftest(void)
++{
++      enum { TRIALS_BEFORE_GIVING_UP = 5000 };
++      bool success = false;
++      int test = 0, trials;
++      /* skb6/hdr6 are only assigned under CONFIG_IPV6 but are always passed
++       * to timings_test(); start them at NULL so that no indeterminate
++       * pointer value is ever read.
++       */
++      struct sk_buff *skb4, *skb6 = NULL;
++      struct iphdr *hdr4;
++      struct ipv6hdr *hdr6 = NULL;
++
++      if (IS_ENABLED(CONFIG_KASAN) || IS_ENABLED(CONFIG_UBSAN))
++              return true;
++
++      BUILD_BUG_ON(MSEC_PER_SEC % PACKETS_PER_SECOND != 0);
++
++      /* Each failed init undoes only the inits that already succeeded. */
++      if (wg_ratelimiter_init())
++              goto out;
++      ++test;
++      if (wg_ratelimiter_init()) {
++              wg_ratelimiter_uninit();
++              goto out;
++      }
++      ++test;
++      if (wg_ratelimiter_init()) {
++              wg_ratelimiter_uninit();
++              wg_ratelimiter_uninit();
++              goto out;
++      }
++      ++test;
++
++      skb4 = alloc_skb(sizeof(struct iphdr), GFP_KERNEL);
++      if (unlikely(!skb4))
++              goto err_nofree;
++      skb4->protocol = htons(ETH_P_IP);
++      hdr4 = (struct iphdr *)skb_put(skb4, sizeof(*hdr4));
++      hdr4->saddr = htonl(8182);
++      skb_reset_network_header(skb4);
++      ++test;
++
++#if IS_ENABLED(CONFIG_IPV6)
++      skb6 = alloc_skb(sizeof(struct ipv6hdr), GFP_KERNEL);
++      if (unlikely(!skb6)) {
++              kfree_skb(skb4);
++              goto err_nofree;
++      }
++      skb6->protocol = htons(ETH_P_IPV6);
++      hdr6 = (struct ipv6hdr *)skb_put(skb6, sizeof(*hdr6));
++      hdr6->saddr.in6_u.u6_addr32[0] = htonl(1212);
++      hdr6->saddr.in6_u.u6_addr32[1] = htonl(289188);
++      skb_reset_network_header(skb6);
++      ++test;
++#endif
++
++      /* A timeout is retried after a pause; any other failure is final. */
++      for (trials = TRIALS_BEFORE_GIVING_UP;;) {
++              int test_count = 0, ret;
++
++              ret = timings_test(skb4, hdr4, skb6, hdr6, &test_count);
++              if (ret == -ETIMEDOUT) {
++                      if (!trials--) {
++                              test += test_count;
++                              goto err;
++                      }
++                      msleep(500);
++                      continue;
++              } else if (ret < 0) {
++                      test += test_count;
++                      goto err;
++              } else {
++                      test += test_count;
++                      break;
++              }
++      }
++
++      /* Every capacity_test() failure is retried until trials run out. */
++      for (trials = TRIALS_BEFORE_GIVING_UP;;) {
++              int test_count = 0;
++
++              if (capacity_test(skb4, hdr4, &test_count) < 0) {
++                      if (!trials--) {
++                              test += test_count;
++                              goto err;
++                      }
++                      msleep(50);
++                      continue;
++              }
++              test += test_count;
++              break;
++      }
++
++      success = true;
++
++err:
++      kfree_skb(skb4);
++#if IS_ENABLED(CONFIG_IPV6)
++      kfree_skb(skb6);
++#endif
++err_nofree:
++      wg_ratelimiter_uninit();
++      wg_ratelimiter_uninit();
++      wg_ratelimiter_uninit();
++      /* Uninit one extra time to check underflow detection. */
++      wg_ratelimiter_uninit();
++out:
++      if (success)
++              pr_info("ratelimiter self-tests: pass\n");
++      else
++              pr_err("ratelimiter self-test %d: FAIL\n", test);
++
++      return success;
++}
++#endif
+--- /dev/null
++++ b/drivers/net/wireguard/send.c
+@@ -0,0 +1,413 @@
++// SPDX-License-Identifier: GPL-2.0
++/*
++ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
++ */
++
++#include "queueing.h"
++#include "timers.h"
++#include "device.h"
++#include "peer.h"
++#include "socket.h"
++#include "messages.h"
++#include "cookie.h"
++
++#include <linux/uio.h>
++#include <linux/inetdevice.h>
++#include <linux/socket.h>
++#include <net/ip_tunnels.h>
++#include <net/udp.h>
++#include <net/sock.h>
++
++/*
++ * Creates a handshake initiation message for @peer and sends it to the peer's
++ * endpoint. Does nothing when wg_birthdate_has_expired() reports that
++ * peer->last_sent_handshake is not yet REKEY_TIMEOUT old.
++ */
++static void wg_packet_send_handshake_initiation(struct wg_peer *peer)
++{
++      struct message_handshake_initiation packet;
++
++      if (!wg_birthdate_has_expired(atomic64_read(&peer->last_sent_handshake),
++                                    REKEY_TIMEOUT))
++              return; /* This function is rate limited. */
++
++      /* Stamp now, before the message is built, and again just before the
++       * send below.
++       */
++      atomic64_set(&peer->last_sent_handshake, ktime_get_coarse_boottime_ns());
++      net_dbg_ratelimited("%s: Sending handshake initiation to peer %llu (%pISpfsc)\n",
++                          peer->device->dev->name, peer->internal_id,
++                          &peer->endpoint.addr);
++
++      if (wg_noise_handshake_create_initiation(&packet, &peer->handshake)) {
++              wg_cookie_add_mac_to_packet(&packet, sizeof(packet), peer);
++              wg_timers_any_authenticated_packet_traversal(peer);
++              wg_timers_any_authenticated_packet_sent(peer);
++              atomic64_set(&peer->last_sent_handshake,
++                           ktime_get_coarse_boottime_ns());
++              wg_socket_send_buffer_to_peer(peer, &packet, sizeof(packet),
++                                            HANDSHAKE_DSCP);
++              wg_timers_handshake_initiated(peer);
++      }
++}
++
++/*
++ * Work handler: sends a handshake initiation for the peer that embeds @work as
++ * its transmit_handshake_work member, then drops one reference on that peer.
++ */
++void wg_packet_handshake_send_worker(struct work_struct *work)
++{
++      struct wg_peer *owner;
++
++      owner = container_of(work, struct wg_peer, transmit_handshake_work);
++      wg_packet_send_handshake_initiation(owner);
++      wg_peer_put(owner);
++}
++
++/*
++ * Schedules peer->transmit_handshake_work on the device's handshake_send_wq.
++ *
++ * @is_retry: when false, peer->timer_handshake_attempts is reset to 0 first.
++ *
++ * Nothing is queued when the last handshake was sent less than REKEY_TIMEOUT
++ * ago or when the peer is marked dead.
++ */
++void wg_packet_send_queued_handshake_initiation(struct wg_peer *peer,
++                                              bool is_retry)
++{
++      if (!is_retry)
++              peer->timer_handshake_attempts = 0;
++
++      rcu_read_lock_bh();
++      /* We check last_sent_handshake here in addition to the actual function
++       * we're queueing up, so that we don't queue things if not strictly
++       * necessary:
++       */
++      if (!wg_birthdate_has_expired(atomic64_read(&peer->last_sent_handshake),
++                                    REKEY_TIMEOUT) ||
++                      unlikely(READ_ONCE(peer->is_dead)))
++              goto out;
++
++      wg_peer_get(peer);
++      /* Queues up the peer's transmit_handshake_work (presumably handled by
++       * wg_packet_handshake_send_worker()), where we do a wg_peer_put(peer)
++       * after:
++       */
++      if (!queue_work(peer->device->handshake_send_wq,
++                      &peer->transmit_handshake_work))
++              /* If the work was already queued, we want to drop the
++               * extra reference:
++               */
++              wg_peer_put(peer);
++out:
++      rcu_read_unlock_bh();
++}
++
++/*
++ * Creates a handshake response for @peer, begins the session derived from it
++ * and, only when both steps succeed, sends the response to the peer's
++ * endpoint. peer->last_sent_handshake is stamped on entry and again right
++ * before the send.
++ */
++void wg_packet_send_handshake_response(struct wg_peer *peer)
++{
++      struct message_handshake_response response;
++
++      atomic64_set(&peer->last_sent_handshake, ktime_get_coarse_boottime_ns());
++      net_dbg_ratelimited("%s: Sending handshake response to peer %llu (%pISpfsc)\n",
++                          peer->device->dev->name, peer->internal_id,
++                          &peer->endpoint.addr);
++
++      /* Nothing to send if the response could not be created. */
++      if (!wg_noise_handshake_create_response(&response, &peer->handshake))
++              return;
++      wg_cookie_add_mac_to_packet(&response, sizeof(response), peer);
++
++      /* Do not send the response unless the session could be started. */
++      if (!wg_noise_handshake_begin_session(&peer->handshake,
++                                            &peer->keypairs))
++              return;
++
++      wg_timers_session_derived(peer);
++      wg_timers_any_authenticated_packet_traversal(peer);
++      wg_timers_any_authenticated_packet_sent(peer);
++      atomic64_set(&peer->last_sent_handshake,
++                   ktime_get_coarse_boottime_ns());
++      wg_socket_send_buffer_to_peer(peer, &response, sizeof(response),
++                                    HANDSHAKE_DSCP);
++}
++
++/*
++ * Builds a cookie message with the device's cookie_checker for the handshake
++ * message in @initiating_skb and sends it back as a reply to that skb.
++ * @sender_index is passed through to wg_cookie_message_create().
++ */
++void wg_packet_send_handshake_cookie(struct wg_device *wg,
++                                   struct sk_buff *initiating_skb,
++                                   __le32 sender_index)
++{
++      struct message_handshake_cookie packet;
++
++      net_dbg_skb_ratelimited("%s: Sending cookie response for denied handshake message for %pISpfsc\n",
++                              wg->dev->name, initiating_skb);
++      wg_cookie_message_create(&packet, initiating_skb, sender_index,
++                               &wg->cookie_checker);
++      wg_socket_send_buffer_as_reply_to_skb(wg, initiating_skb, &packet,
++                                            sizeof(packet));
++}
++
++/*
++ * Queues a new handshake initiation when the current keypair's sending key is
++ * valid and either its counter exceeds REKEY_AFTER_MESSAGES or, if we were the
++ * initiator, its birthdate is older than REKEY_AFTER_TIME.
++ */
++static void keep_key_fresh(struct wg_peer *peer)
++{
++      struct noise_keypair *keypair;
++      bool send = false;
++
++      rcu_read_lock_bh();
++      keypair = rcu_dereference_bh(peer->keypairs.current_keypair);
++      if (likely(keypair && READ_ONCE(keypair->sending.is_valid)) &&
++          (unlikely(atomic64_read(&keypair->sending.counter.counter) >
++                    REKEY_AFTER_MESSAGES) ||
++           (keypair->i_am_the_initiator &&
++            unlikely(wg_birthdate_has_expired(keypair->sending.birthdate,
++                                              REKEY_AFTER_TIME)))))
++              send = true;
++      rcu_read_unlock_bh();
++
++      /* The handshake is queued only after the RCU read section has ended. */
++      if (send)
++              wg_packet_send_queued_handshake_initiation(peer, false);
++}
++
++/*
++ * Returns how many zero bytes must be appended to @skb so that its final
++ * MTU-sized unit is a multiple of MESSAGE_PADDING_MULTIPLE, without the padded
++ * unit exceeding the MTU recorded in the skb's control block.
++ */
++static unsigned int calculate_skb_padding(struct sk_buff *skb)
++{
++      unsigned int last_unit, padded_size;
++
++      /* A recorded MTU of zero would make the modulo below divide by zero.
++       * With nothing to clamp against, simply round the length up to the
++       * padding multiple.
++       */
++      if (unlikely(!PACKET_CB(skb)->mtu))
++              return ALIGN(skb->len, MESSAGE_PADDING_MULTIPLE) - skb->len;
++
++      /* We do this modulo business with the MTU, just in case the networking
++       * layer gives us a packet that's bigger than the MTU. In that case, we
++       * wouldn't want the final subtraction to overflow in the case of the
++       * padded_size being clamped.
++       */
++      last_unit = skb->len % PACKET_CB(skb)->mtu;
++      padded_size = ALIGN(last_unit, MESSAGE_PADDING_MULTIPLE);
++
++      if (padded_size > PACKET_CB(skb)->mtu)
++              padded_size = PACKET_CB(skb)->mtu;
++      return padded_size - last_unit;
++}
++
++/*
++ * Turns @skb into an encrypted data message in place: appends zero padding
++ * and room for the authentication tag, prepends a struct message_data header
++ * carrying @keypair's remote index and the nonce from the skb's control block,
++ * then encrypts everything after the header with @keypair's sending key.
++ *
++ * Returns false if any preparation step fails, otherwise the result of
++ * chacha20poly1305_encrypt_sg_inplace().
++ */
++static bool encrypt_packet(struct sk_buff *skb, struct noise_keypair *keypair)
++{
++      unsigned int padding_len, plaintext_len, trailer_len;
++      struct scatterlist sg[MAX_SKB_FRAGS + 8];
++      struct message_data *header;
++      struct sk_buff *trailer;
++      int num_frags;
++
++      /* Calculate lengths. */
++      padding_len = calculate_skb_padding(skb);
++      trailer_len = padding_len + noise_encrypted_len(0);
++      plaintext_len = skb->len + padding_len;
++
++      /* Expand data section to have room for padding and auth tag. */
++      num_frags = skb_cow_data(skb, trailer_len, &trailer);
++      if (unlikely(num_frags < 0 || num_frags > ARRAY_SIZE(sg)))
++              return false;
++
++      /* Set the padding to zeros, and make sure it and the auth tag are part
++       * of the skb.
++       */
++      memset(skb_tail_pointer(trailer), 0, padding_len);
++
++      /* Expand head section to have room for our header and the network
++       * stack's headers.
++       */
++      if (unlikely(skb_cow_head(skb, DATA_PACKET_HEAD_ROOM) < 0))
++              return false;
++
++      /* Finalize checksum calculation for the inner packet, if required. */
++      if (unlikely(skb->ip_summed == CHECKSUM_PARTIAL &&
++                   skb_checksum_help(skb)))
++              return false;
++
++      /* Only after checksumming can we safely add on the padding at the end
++       * and the header.
++       */
++      skb_set_inner_network_header(skb, 0);
++      header = (struct message_data *)skb_push(skb, sizeof(*header));
++      header->header.type = cpu_to_le32(MESSAGE_DATA);
++      header->key_idx = keypair->remote_index;
++      header->counter = cpu_to_le64(PACKET_CB(skb)->nonce);
++      pskb_put(skb, trailer, trailer_len);
++
++      /* Now we can encrypt the scattergather segments */
++      sg_init_table(sg, num_frags);
++      /* The mapping starts after the header, so the header stays in clear. */
++      if (skb_to_sgvec(skb, sg, sizeof(struct message_data),
++                       noise_encrypted_len(plaintext_len)) <= 0)
++              return false;
++      return chacha20poly1305_encrypt_sg_inplace(sg, plaintext_len, NULL, 0,
++                                                 PACKET_CB(skb)->nonce,
++                                                 keypair->sending.key);
++}
++
++/*
++ * Sends whatever is staged for @peer. If the staged queue is empty, an empty
++ * skb is staged first to act as the keepalive; if that allocation fails the
++ * function returns without sending anything.
++ */
++void wg_packet_send_keepalive(struct wg_peer *peer)
++{
++      struct sk_buff *skb;
++
++      if (skb_queue_empty(&peer->staged_packet_queue)) {
++              skb = alloc_skb(DATA_PACKET_HEAD_ROOM + MESSAGE_MINIMUM_LENGTH,
++                              GFP_ATOMIC);
++              if (unlikely(!skb))
++                      return;
++              skb_reserve(skb, DATA_PACKET_HEAD_ROOM);
++              skb->dev = peer->device->dev;
++              PACKET_CB(skb)->mtu = skb->dev->mtu;
++              skb_queue_tail(&peer->staged_packet_queue, skb);
++              net_dbg_ratelimited("%s: Sending keepalive packet to peer %llu (%pISpfsc)\n",
++                                  peer->device->dev->name, peer->internal_id,
++                                  &peer->endpoint.addr);
++      }
++
++      wg_packet_send_staged_packets(peer);
++}
++
++/*
++ * Transmits every skb of the encrypted list starting at @first to @peer,
++ * updates the peer's timers, and finally checks whether the key needs
++ * refreshing. wg_timers_data_sent() is called only if at least one
++ * non-keepalive skb was sent without error.
++ */
++static void wg_packet_create_data_done(struct sk_buff *first,
++                                     struct wg_peer *peer)
++{
++      struct sk_buff *skb, *next;
++      bool is_keepalive, data_sent = false;
++
++      wg_timers_any_authenticated_packet_traversal(peer);
++      wg_timers_any_authenticated_packet_sent(peer);
++      skb_list_walk_safe(first, skb, next) {
++              /* A message with zero-length payload counts as a keepalive;
++               * the length is sampled before the skb is handed off.
++               */
++              is_keepalive = skb->len == message_data_len(0);
++              if (likely(!wg_socket_send_skb_to_peer(peer, skb,
++                              PACKET_CB(skb)->ds) && !is_keepalive))
++                      data_sent = true;
++      }
++
++      if (likely(data_sent))
++              wg_timers_data_sent(peer);
++
++      keep_key_fresh(peer);
++}
++
++/*
++ * Work handler for a crypt_queue: consumes skb lists from the head of the
++ * ring in order, stopping at the first one whose state is still
++ * PACKET_STATE_UNCRYPTED. Lists in PACKET_STATE_CRYPTED are transmitted; any
++ * other state causes the list to be freed. The keypair and peer references
++ * associated with each list are dropped afterwards.
++ */
++void wg_packet_tx_worker(struct work_struct *work)
++{
++      struct crypt_queue *queue = container_of(work, struct crypt_queue,
++                                               work);
++      struct noise_keypair *keypair;
++      enum packet_state state;
++      struct sk_buff *first;
++      struct wg_peer *peer;
++
++      while ((first = __ptr_ring_peek(&queue->ring)) != NULL &&
++             (state = atomic_read_acquire(&PACKET_CB(first)->state)) !=
++                     PACKET_STATE_UNCRYPTED) {
++              __ptr_ring_discard_one(&queue->ring);
++              /* Read peer and keypair before the skb list is sent or freed. */
++              peer = PACKET_PEER(first);
++              keypair = PACKET_CB(first)->keypair;
++
++              if (likely(state == PACKET_STATE_CRYPTED))
++                      wg_packet_create_data_done(first, peer);
++              else
++                      kfree_skb_list(first);
++
++              wg_noise_keypair_put(keypair, false);
++              wg_peer_put(peer);
++      }
++}
++
++/*
++ * Work handler for the encryption queue: for every skb list consumed from the
++ * ring, encrypts each skb with the keypair stored on the list's first skb.
++ * The first encryption failure marks the whole list PACKET_STATE_DEAD and
++ * stops processing it. The list is then published to its peer's tx_queue
++ * with the resulting state.
++ */
++void wg_packet_encrypt_worker(struct work_struct *work)
++{
++      struct crypt_queue *queue = container_of(work, struct multicore_worker,
++                                               work)->ptr;
++      struct sk_buff *first, *skb, *next;
++
++      while ((first = ptr_ring_consume_bh(&queue->ring)) != NULL) {
++              enum packet_state state = PACKET_STATE_CRYPTED;
++
++              skb_list_walk_safe(first, skb, next) {
++                      if (likely(encrypt_packet(skb,
++                                      PACKET_CB(first)->keypair))) {
++                              wg_reset_packet(skb);
++                      } else {
++                              state = PACKET_STATE_DEAD;
++                              break;
++                      }
++              }
++              wg_queue_enqueue_per_peer(&PACKET_PEER(first)->tx_queue, first,
++                                        state);
++
++      }
++}
++
++/*
++ * Submits the skb list starting at @first for encryption on the device's
++ * encrypt_queue and the peer's tx_queue.
++ *
++ * If the peer is dead, or the enqueue fails with anything other than -EPIPE,
++ * the keypair and peer references are dropped and the list is freed here. On
++ * -EPIPE the list is instead published to the peer's tx_queue as
++ * PACKET_STATE_DEAD.
++ */
++static void wg_packet_create_data(struct sk_buff *first)
++{
++      struct wg_peer *peer = PACKET_PEER(first);
++      struct wg_device *wg = peer->device;
++      int ret = -EINVAL;
++
++      rcu_read_lock_bh();
++      if (unlikely(READ_ONCE(peer->is_dead)))
++              goto err;
++
++      ret = wg_queue_enqueue_per_device_and_peer(&wg->encrypt_queue,
++                                                 &peer->tx_queue, first,
++                                                 wg->packet_crypt_wq,
++                                                 &wg->encrypt_queue.last_cpu);
++      if (unlikely(ret == -EPIPE))
++              wg_queue_enqueue_per_peer(&peer->tx_queue, first,
++                                        PACKET_STATE_DEAD);
++err:
++      rcu_read_unlock_bh();
++      /* On success or -EPIPE the list was handed to a queue above, so there
++       * is nothing to release here.
++       */
++      if (likely(!ret || ret == -EPIPE))
++              return;
++      wg_noise_keypair_put(PACKET_CB(first)->keypair, false);
++      wg_peer_put(peer);
++      kfree_skb_list(first);
++}
++
++/*
++ * Discards every packet staged for @peer, adding their count to the device's
++ * tx_dropped statistic. Both steps happen under the staged queue's lock.
++ */
++void wg_packet_purge_staged_packets(struct wg_peer *peer)
++{
++      spin_lock_bh(&peer->staged_packet_queue.lock);
++      peer->device->dev->stats.tx_dropped += peer->staged_packet_queue.qlen;
++      __skb_queue_purge(&peer->staged_packet_queue);
++      spin_unlock_bh(&peer->staged_packet_queue.lock);
++}
++
++/*
++ * Takes every packet staged for @peer, assigns each a nonce from the current
++ * keypair's sending counter and submits the batch for encryption.
++ *
++ * If there is no usable sending key, or the key is expired or has run out of
++ * nonces, the packets are put back on the staged queue and a new handshake
++ * initiation is queued instead.
++ */
++void wg_packet_send_staged_packets(struct wg_peer *peer)
++{
++      struct noise_symmetric_key *key;
++      struct noise_keypair *keypair;
++      struct sk_buff_head packets;
++      struct sk_buff *skb;
++
++      /* Steal the current queue into our local one. */
++      __skb_queue_head_init(&packets);
++      spin_lock_bh(&peer->staged_packet_queue.lock);
++      skb_queue_splice_init(&peer->staged_packet_queue, &packets);
++      spin_unlock_bh(&peer->staged_packet_queue.lock);
++      if (unlikely(skb_queue_empty(&packets)))
++              return;
++
++      /* First we make sure we have a valid reference to a valid key. */
++      rcu_read_lock_bh();
++      keypair = wg_noise_keypair_get(
++              rcu_dereference_bh(peer->keypairs.current_keypair));
++      rcu_read_unlock_bh();
++      if (unlikely(!keypair))
++              goto out_nokey;
++      key = &keypair->sending;
++      if (unlikely(!READ_ONCE(key->is_valid)))
++              goto out_nokey;
++      if (unlikely(wg_birthdate_has_expired(key->birthdate,
++                                            REJECT_AFTER_TIME)))
++              goto out_invalid;
++
++      /* After we know we have a somewhat valid key, we now try to assign
++       * nonces to all of the packets in the queue. If we can't assign nonces
++       * for all of them, we just consider it a failure and wait for the next
++       * handshake.
++       */
++      skb_queue_walk(&packets, skb) {
++              /* 0 for no outer TOS: no leak. TODO: at some later point, we
++               * might consider using flowi->tos as outer instead.
++               */
++              PACKET_CB(skb)->ds = ip_tunnel_ecn_encap(0, ip_hdr(skb), skb);
++              /* The nonce is the counter's value before the increment. */
++              PACKET_CB(skb)->nonce =
++                              atomic64_inc_return(&key->counter.counter) - 1;
++              if (unlikely(PACKET_CB(skb)->nonce >= REJECT_AFTER_MESSAGES))
++                      goto out_invalid;
++      }
++
++      /* Terminate the list at its last skb so that it can be walked as a
++       * plain NULL-terminated list starting at packets.next.
++       */
++      packets.prev->next = NULL;
++      wg_peer_get(keypair->entry.peer);
++      PACKET_CB(packets.next)->keypair = keypair;
++      wg_packet_create_data(packets.next);
++      return;
++
++out_invalid:
++      WRITE_ONCE(key->is_valid, false);
++out_nokey:
++      wg_noise_keypair_put(keypair, false);
++
++      /* We orphan the packets if we're waiting on a handshake, so that they
++       * don't block a socket's pool.
++       */
++      skb_queue_walk(&packets, skb)
++              skb_orphan(skb);
++      /* Then we put them back on the top of the queue. We're not too
++       * concerned about accidentally getting things a little out of order if
++       * packets are being added really fast, because this queue is for before
++       * packets can even be sent and it's small anyway.
++       */
++      spin_lock_bh(&peer->staged_packet_queue.lock);
++      skb_queue_splice(&packets, &peer->staged_packet_queue);
++      spin_unlock_bh(&peer->staged_packet_queue.lock);
++
++      /* If we're exiting because there's something wrong with the key, it
++       * means we should initiate a new handshake.
++       */
++      wg_packet_send_queued_handshake_initiation(peer, false);
++}
+--- /dev/null
++++ b/drivers/net/wireguard/socket.c
+@@ -0,0 +1,437 @@
++// SPDX-License-Identifier: GPL-2.0
++/*
++ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
++ */
++
++#include "device.h"
++#include "peer.h"
++#include "socket.h"
++#include "queueing.h"
++#include "messages.h"
++
++#include <linux/ctype.h>
++#include <linux/net.h>
++#include <linux/if_vlan.h>
++#include <linux/if_ether.h>
++#include <linux/inetdevice.h>
++#include <net/udp_tunnel.h>
++#include <net/ipv6.h>
++
++/*
++ * Transmits @skb over the device's IPv4 socket to @endpoint with DS field @ds.
++ *
++ * @cache: optional route cache; when NULL a route is looked up every time.
++ *
++ * The skb is always consumed: it is either handed to udp_tunnel_xmit_skb() or
++ * freed on the error path. Returns 0 on success, -ENONET if there is no IPv4
++ * socket, -ELOOP if the route points back at the WireGuard device itself, or
++ * the error from the route lookup.
++ */
++static int send4(struct wg_device *wg, struct sk_buff *skb,
++               struct endpoint *endpoint, u8 ds, struct dst_cache *cache)
++{
++      struct flowi4 fl = {
++              .saddr = endpoint->src4.s_addr,
++              .daddr = endpoint->addr4.sin_addr.s_addr,
++              .fl4_dport = endpoint->addr4.sin_port,
++              .flowi4_mark = wg->fwmark,
++              .flowi4_proto = IPPROTO_UDP
++      };
++      struct rtable *rt = NULL;
++      struct sock *sock;
++      int ret = 0;
++
++      skb_mark_not_on_list(skb);
++      skb->dev = wg->dev;
++      skb->mark = wg->fwmark;
++
++      rcu_read_lock_bh();
++      sock = rcu_dereference_bh(wg->sock4);
++
++      if (unlikely(!sock)) {
++              ret = -ENONET;
++              goto err;
++      }
++
++      fl.fl4_sport = inet_sk(sock)->inet_sport;
++
++      if (cache)
++              rt = dst_cache_get_ip4(cache, &fl.saddr);
++
++      if (!rt) {
++              security_sk_classify_flow(sock, flowi4_to_flowi(&fl));
++              /* Forget the remembered source address and interface if
++               * inet_confirm_addr() no longer confirms the address.
++               */
++              if (unlikely(!inet_confirm_addr(sock_net(sock), NULL, 0,
++                                              fl.saddr, RT_SCOPE_HOST))) {
++                      endpoint->src4.s_addr = 0;
++                      *(__force __be32 *)&endpoint->src_if4 = 0;
++                      fl.saddr = 0;
++                      if (cache)
++                              dst_cache_reset(cache);
++              }
++              rt = ip_route_output_flow(sock_net(sock), &fl, sock);
++              /* If a source interface is remembered but the lookup failed
++               * with -EINVAL or chose a different interface, forget the
++               * remembered source and look the route up once more.
++               */
++              if (unlikely(endpoint->src_if4 && ((IS_ERR(rt) &&
++                           PTR_ERR(rt) == -EINVAL) || (!IS_ERR(rt) &&
++                           rt->dst.dev->ifindex != endpoint->src_if4)))) {
++                      endpoint->src4.s_addr = 0;
++                      *(__force __be32 *)&endpoint->src_if4 = 0;
++                      fl.saddr = 0;
++                      if (cache)
++                              dst_cache_reset(cache);
++                      if (!IS_ERR(rt))
++                              ip_rt_put(rt);
++                      rt = ip_route_output_flow(sock_net(sock), &fl, sock);
++              }
++              if (unlikely(IS_ERR(rt))) {
++                      ret = PTR_ERR(rt);
++                      net_dbg_ratelimited("%s: No route to %pISpfsc, error %d\n",
++                                          wg->dev->name, &endpoint->addr, ret);
++                      goto err;
++              } else if (unlikely(rt->dst.dev == skb->dev)) {
++                      ip_rt_put(rt);
++                      ret = -ELOOP;
++                      net_dbg_ratelimited("%s: Avoiding routing loop to %pISpfsc\n",
++                                          wg->dev->name, &endpoint->addr);
++                      goto err;
++              }
++              if (cache)
++                      dst_cache_set_ip4(cache, &rt->dst, fl.saddr);
++      }
++
++      skb->ignore_df = 1;
++      udp_tunnel_xmit_skb(rt, sock, skb, fl.saddr, fl.daddr, ds,
++                          ip4_dst_hoplimit(&rt->dst), 0, fl.fl4_sport,
++                          fl.fl4_dport, false, false);
++      goto out;
++
++err:
++      kfree_skb(skb);
++out:
++      rcu_read_unlock_bh();
++      return ret;
++}
++
++/*
++ * Transmits @skb over the device's IPv6 socket to @endpoint with DS field @ds.
++ *
++ * @cache: optional route cache; when NULL a route is looked up every time.
++ *
++ * The skb is always consumed, including when the kernel is built without
++ * IPv6: callers never free it after an error return. Returns 0 on success,
++ * -ENONET if there is no IPv6 socket, -ELOOP if the route points back at the
++ * WireGuard device itself, the error from the route lookup, or -EAFNOSUPPORT
++ * when CONFIG_IPV6 is disabled.
++ */
++static int send6(struct wg_device *wg, struct sk_buff *skb,
++               struct endpoint *endpoint, u8 ds, struct dst_cache *cache)
++{
++#if IS_ENABLED(CONFIG_IPV6)
++      struct flowi6 fl = {
++              .saddr = endpoint->src6,
++              .daddr = endpoint->addr6.sin6_addr,
++              .fl6_dport = endpoint->addr6.sin6_port,
++              .flowi6_mark = wg->fwmark,
++              .flowi6_oif = endpoint->addr6.sin6_scope_id,
++              .flowi6_proto = IPPROTO_UDP
++              /* TODO: addr->sin6_flowinfo */
++      };
++      struct dst_entry *dst = NULL;
++      struct sock *sock;
++      int ret = 0;
++
++      skb_mark_not_on_list(skb);
++      skb->dev = wg->dev;
++      skb->mark = wg->fwmark;
++
++      rcu_read_lock_bh();
++      sock = rcu_dereference_bh(wg->sock6);
++
++      if (unlikely(!sock)) {
++              ret = -ENONET;
++              goto err;
++      }
++
++      fl.fl6_sport = inet_sk(sock)->inet_sport;
++
++      if (cache)
++              dst = dst_cache_get_ip6(cache, &fl.saddr);
++
++      if (!dst) {
++              security_sk_classify_flow(sock, flowi6_to_flowi(&fl));
++              /* Forget a remembered source address that ipv6_chk_addr() no
++               * longer accepts.
++               */
++              if (unlikely(!ipv6_addr_any(&fl.saddr) &&
++                           !ipv6_chk_addr(sock_net(sock), &fl.saddr, NULL, 0))) {
++                      endpoint->src6 = fl.saddr = in6addr_any;
++                      if (cache)
++                              dst_cache_reset(cache);
++              }
++              dst = ipv6_stub->ipv6_dst_lookup_flow(sock_net(sock), sock, &fl,
++                                                    NULL);
++              if (unlikely(IS_ERR(dst))) {
++                      ret = PTR_ERR(dst);
++                      net_dbg_ratelimited("%s: No route to %pISpfsc, error %d\n",
++                                          wg->dev->name, &endpoint->addr, ret);
++                      goto err;
++              } else if (unlikely(dst->dev == skb->dev)) {
++                      dst_release(dst);
++                      ret = -ELOOP;
++                      net_dbg_ratelimited("%s: Avoiding routing loop to %pISpfsc\n",
++                                          wg->dev->name, &endpoint->addr);
++                      goto err;
++              }
++              if (cache)
++                      dst_cache_set_ip6(cache, dst, &fl.saddr);
++      }
++
++      skb->ignore_df = 1;
++      udp_tunnel6_xmit_skb(dst, sock, skb, skb->dev, &fl.saddr, &fl.daddr, ds,
++                           ip6_dst_hoplimit(dst), 0, fl.fl6_sport,
++                           fl.fl6_dport, false);
++      goto out;
++
++err:
++      kfree_skb(skb);
++out:
++      rcu_read_unlock_bh();
++      return ret;
++#else
++      /* Callers treat the skb as consumed on every return path, so it must
++       * be freed here too or it leaks.
++       */
++      kfree_skb(skb);
++      return -EAFNOSUPPORT;
++#endif
++}
++
++/*
++ * Sends @skb to @peer's current endpoint with DS field @ds, dispatching on
++ * the endpoint's address family, and adds the skb's length to peer->tx_bytes
++ * on success. The skb is consumed in every case; with an endpoint of any
++ * other family it is freed and -EAFNOSUPPORT is returned.
++ */
++int wg_socket_send_skb_to_peer(struct wg_peer *peer, struct sk_buff *skb, u8 ds)
++{
++      /* Sampled up front: the skb must not be touched after it is sent. */
++      size_t skb_len = skb->len;
++      int ret = -EAFNOSUPPORT;
++
++      read_lock_bh(&peer->endpoint_lock);
++      if (peer->endpoint.addr.sa_family == AF_INET)
++              ret = send4(peer->device, skb, &peer->endpoint, ds,
++                          &peer->endpoint_cache);
++      else if (peer->endpoint.addr.sa_family == AF_INET6)
++              ret = send6(peer->device, skb, &peer->endpoint, ds,
++                          &peer->endpoint_cache);
++      else
++              dev_kfree_skb(skb);
++      if (likely(!ret))
++              peer->tx_bytes += skb_len;
++      read_unlock_bh(&peer->endpoint_lock);
++
++      return ret;
++}
++
++/*
++ * Copies @len bytes from @buffer into a freshly allocated skb and sends it to
++ * @peer with DS field @ds. Returns -ENOMEM if the skb cannot be allocated,
++ * otherwise the result of wg_socket_send_skb_to_peer().
++ */
++int wg_socket_send_buffer_to_peer(struct wg_peer *peer, void *buffer,
++                                size_t len, u8 ds)
++{
++      struct sk_buff *skb = alloc_skb(len + SKB_HEADER_LEN, GFP_ATOMIC);
++
++      if (unlikely(!skb))
++              return -ENOMEM;
++
++      /* Leave SKB_HEADER_LEN bytes of headroom in front of the payload. */
++      skb_reserve(skb, SKB_HEADER_LEN);
++      skb_set_inner_network_header(skb, 0);
++      skb_put_data(skb, buffer, len);
++      return wg_socket_send_skb_to_peer(peer, skb, ds);
++}
++
++int wg_socket_send_buffer_as_reply_to_skb(struct wg_device *wg,
++                                        struct sk_buff *in_skb, void *buffer,
++                                        size_t len)
++{
++      int ret = 0;
++      struct sk_buff *skb;
++      struct endpoint endpoint;
++
++      if (unlikely(!in_skb))
++              return -EINVAL;
++      ret = wg_socket_endpoint_from_skb(&endpoint, in_skb);
++      if (unlikely(ret < 0))
++              return ret;
++
++      skb = alloc_skb(len + SKB_HEADER_LEN, GFP_ATOMIC);
++      if (unlikely(!skb))
++              return -ENOMEM;
++      skb_reserve(skb, SKB_HEADER_LEN);
++      skb_set_inner_network_header(skb, 0);
++      skb_put_data(skb, buffer, len);
++
++      if (endpoint.addr.sa_family == AF_INET)
++              ret = send4(wg, skb, &endpoint, 0, NULL);
++      else if (endpoint.addr.sa_family == AF_INET6)
++              ret = send6(wg, skb, &endpoint, 0, NULL);
++      /* No other possibilities if the endpoint is valid, which it is,
++       * as we checked above.
++       */
++
++      return ret;
++}
++
++int wg_socket_endpoint_from_skb(struct endpoint *endpoint,
++                              const struct sk_buff *skb)
++{
++      memset(endpoint, 0, sizeof(*endpoint));
++      if (skb->protocol == htons(ETH_P_IP)) {
++              endpoint->addr4.sin_family = AF_INET;
++              endpoint->addr4.sin_port = udp_hdr(skb)->source;
++              endpoint->addr4.sin_addr.s_addr = ip_hdr(skb)->saddr;
++              endpoint->src4.s_addr = ip_hdr(skb)->daddr;
++              endpoint->src_if4 = skb->skb_iif;
++      } else if (skb->protocol == htons(ETH_P_IPV6)) {
++              endpoint->addr6.sin6_family = AF_INET6;
++              endpoint->addr6.sin6_port = udp_hdr(skb)->source;
++              endpoint->addr6.sin6_addr = ipv6_hdr(skb)->saddr;
++              endpoint->addr6.sin6_scope_id = ipv6_iface_scope_id(
++                      &ipv6_hdr(skb)->saddr, skb->skb_iif);
++              endpoint->src6 = ipv6_hdr(skb)->daddr;
++      } else {
++              return -EINVAL;
++      }
++      return 0;
++}
++
++static bool endpoint_eq(const struct endpoint *a, const struct endpoint *b)
++{
++      return (a->addr.sa_family == AF_INET && b->addr.sa_family == AF_INET &&
++              a->addr4.sin_port == b->addr4.sin_port &&
++              a->addr4.sin_addr.s_addr == b->addr4.sin_addr.s_addr &&
++              a->src4.s_addr == b->src4.s_addr && a->src_if4 == b->src_if4) ||
++             (a->addr.sa_family == AF_INET6 &&
++              b->addr.sa_family == AF_INET6 &&
++              a->addr6.sin6_port == b->addr6.sin6_port &&
++              ipv6_addr_equal(&a->addr6.sin6_addr, &b->addr6.sin6_addr) &&
++              a->addr6.sin6_scope_id == b->addr6.sin6_scope_id &&
++              ipv6_addr_equal(&a->src6, &b->src6)) ||
++             unlikely(!a->addr.sa_family && !b->addr.sa_family);
++}
++
++void wg_socket_set_peer_endpoint(struct wg_peer *peer,
++                               const struct endpoint *endpoint)
++{
++      /* First we check unlocked, in order to optimize, since it's pretty rare
++       * that an endpoint will change; racing writers storing the same or a
++       * slightly different value do not matter much. IPv6 endpoints are
++       * ignored when IPv6 is compiled out, as send6() cannot transmit to them.
++       */
++      if (endpoint_eq(endpoint, &peer->endpoint))
++              return;
++      write_lock_bh(&peer->endpoint_lock);
++      if (endpoint->addr.sa_family == AF_INET) {
++              peer->endpoint.addr4 = endpoint->addr4;
++              peer->endpoint.src4 = endpoint->src4;
++              peer->endpoint.src_if4 = endpoint->src_if4;
++      } else if (IS_ENABLED(CONFIG_IPV6) && endpoint->addr.sa_family == AF_INET6) {
++              peer->endpoint.addr6 = endpoint->addr6;
++              peer->endpoint.src6 = endpoint->src6;
++      } else {
++              goto out;
++      }
++      dst_cache_reset(&peer->endpoint_cache);
++out:
++      write_unlock_bh(&peer->endpoint_lock);
++}
++
++void wg_socket_set_peer_endpoint_from_skb(struct wg_peer *peer,
++                                        const struct sk_buff *skb)
++{
++      struct endpoint endpoint;
++
++      if (!wg_socket_endpoint_from_skb(&endpoint, skb))
++              wg_socket_set_peer_endpoint(peer, &endpoint);
++}
++
++void wg_socket_clear_peer_endpoint_src(struct wg_peer *peer)
++{
++      write_lock_bh(&peer->endpoint_lock);
++      memset(&peer->endpoint.src6, 0, sizeof(peer->endpoint.src6));
++      dst_cache_reset(&peer->endpoint_cache);
++      write_unlock_bh(&peer->endpoint_lock);
++}
++
++static int wg_receive(struct sock *sk, struct sk_buff *skb)
++{
++      struct wg_device *wg;
++
++      if (unlikely(!sk))
++              goto err;
++      wg = sk->sk_user_data;
++      if (unlikely(!wg))
++              goto err;
++      wg_packet_receive(wg, skb);
++      return 0;
++
++err:
++      kfree_skb(skb);
++      return 0;
++}
++
++static void sock_free(struct sock *sock)
++{
++      if (unlikely(!sock))
++              return;
++      sk_clear_memalloc(sock);
++      udp_tunnel_sock_release(sock->sk_socket);
++}
++
++static void set_sock_opts(struct socket *sock)
++{
++      sock->sk->sk_allocation = GFP_ATOMIC;
++      sock->sk->sk_sndbuf = INT_MAX;
++      sk_set_memalloc(sock->sk);
++}
++
++int wg_socket_init(struct wg_device *wg, u16 port)
++{
++      int ret;
++      struct udp_tunnel_sock_cfg cfg = {
++              .sk_user_data = wg,
++              .encap_type = 1,
++              .encap_rcv = wg_receive
++      };
++      struct socket *new4 = NULL, *new6 = NULL;
++      struct udp_port_cfg port4 = {
++              .family = AF_INET,
++              .local_ip.s_addr = htonl(INADDR_ANY),
++              .local_udp_port = htons(port),
++              .use_udp_checksums = true
++      };
++#if IS_ENABLED(CONFIG_IPV6)
++      int retries = 0;
++      struct udp_port_cfg port6 = {
++              .family = AF_INET6,
++              .local_ip6 = IN6ADDR_ANY_INIT,
++              .use_udp6_tx_checksums = true,
++              .use_udp6_rx_checksums = true,
++              .ipv6_v6only = true
++      };
++#endif
++
++#if IS_ENABLED(CONFIG_IPV6)
++retry:
++#endif
++
++      ret = udp_sock_create(wg->creating_net, &port4, &new4);
++      if (ret < 0) {
++              pr_err("%s: Could not create IPv4 socket\n", wg->dev->name);
++              return ret;
++      }
++      set_sock_opts(new4);
++      setup_udp_tunnel_sock(wg->creating_net, new4, &cfg);
++
++#if IS_ENABLED(CONFIG_IPV6)
++      if (ipv6_mod_enabled()) {
++              port6.local_udp_port = inet_sk(new4->sk)->inet_sport;
++              ret = udp_sock_create(wg->creating_net, &port6, &new6);
++              if (ret < 0) {
++                      udp_tunnel_sock_release(new4);
++                      if (ret == -EADDRINUSE && !port && retries++ < 100)
++                              goto retry;
++                      pr_err("%s: Could not create IPv6 socket\n",
++                             wg->dev->name);
++                      return ret;
++              }
++              set_sock_opts(new6);
++              setup_udp_tunnel_sock(wg->creating_net, new6, &cfg);
++      }
++#endif
++
++      wg_socket_reinit(wg, new4->sk, new6 ? new6->sk : NULL);
++      return 0;
++}
++
++void wg_socket_reinit(struct wg_device *wg, struct sock *new4,
++                    struct sock *new6)
++{
++      struct sock *old4, *old6;
++
++      mutex_lock(&wg->socket_update_lock);
++      old4 = rcu_dereference_protected(wg->sock4,
++                              lockdep_is_held(&wg->socket_update_lock));
++      old6 = rcu_dereference_protected(wg->sock6,
++                              lockdep_is_held(&wg->socket_update_lock));
++      rcu_assign_pointer(wg->sock4, new4);
++      rcu_assign_pointer(wg->sock6, new6);
++      if (new4)
++              wg->incoming_port = ntohs(inet_sk(new4)->inet_sport);
++      mutex_unlock(&wg->socket_update_lock);
++      synchronize_rcu();
++      synchronize_net();
++      sock_free(old4);
++      sock_free(old6);
++}
+--- /dev/null
++++ b/drivers/net/wireguard/socket.h
+@@ -0,0 +1,44 @@
++/* SPDX-License-Identifier: GPL-2.0 */
++/*
++ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
++ */
++
++#ifndef _WG_SOCKET_H
++#define _WG_SOCKET_H
++
++#include <linux/netdevice.h>
++#include <linux/udp.h>
++#include <linux/if_vlan.h>
++#include <linux/if_ether.h>
++
++int wg_socket_init(struct wg_device *wg, u16 port);
++void wg_socket_reinit(struct wg_device *wg, struct sock *new4,
++                    struct sock *new6);
++int wg_socket_send_buffer_to_peer(struct wg_peer *peer, void *data,
++                                size_t len, u8 ds);
++int wg_socket_send_skb_to_peer(struct wg_peer *peer, struct sk_buff *skb,
++                             u8 ds);
++int wg_socket_send_buffer_as_reply_to_skb(struct wg_device *wg,
++                                        struct sk_buff *in_skb,
++                                        void *out_buffer, size_t len);
++
++int wg_socket_endpoint_from_skb(struct endpoint *endpoint,
++                              const struct sk_buff *skb);
++void wg_socket_set_peer_endpoint(struct wg_peer *peer,
++                               const struct endpoint *endpoint);
++void wg_socket_set_peer_endpoint_from_skb(struct wg_peer *peer,
++                                        const struct sk_buff *skb);
++void wg_socket_clear_peer_endpoint_src(struct wg_peer *peer);
++
++#if defined(CONFIG_DYNAMIC_DEBUG) || defined(DEBUG)
++#define net_dbg_skb_ratelimited(fmt, dev, skb, ...) do {                       \
++              struct endpoint __endpoint;                                    \
++              wg_socket_endpoint_from_skb(&__endpoint, skb);                 \
++              net_dbg_ratelimited(fmt, dev, &__endpoint.addr,                \
++                                  ##__VA_ARGS__);                            \
++      } while (0)
++#else /* no-op stub; same parameter list as the debug variant above */
++#define net_dbg_skb_ratelimited(fmt, dev, skb, ...)
++#endif
++
++#endif /* _WG_SOCKET_H */
+--- /dev/null
++++ b/drivers/net/wireguard/timers.c
+@@ -0,0 +1,243 @@
++// SPDX-License-Identifier: GPL-2.0
++/*
++ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
++ */
++
++#include "timers.h"
++#include "device.h"
++#include "peer.h"
++#include "queueing.h"
++#include "socket.h"
++
++/*
++ * - Timer for retransmitting the handshake if we don't hear back after
++ * `REKEY_TIMEOUT` seconds plus jitter.
++ *
++ * - Timer for sending an empty packet if we have received a packet but have
++ * not sent one since, for `KEEPALIVE_TIMEOUT` seconds.
++ *
++ * - Timer for initiating a new handshake if we have sent a packet but have
++ * not received one (even empty) since, for `KEEPALIVE_TIMEOUT + REKEY_TIMEOUT`
++ * seconds plus jitter.
++ *
++ * - Timer for zeroing out all ephemeral keys after `REJECT_AFTER_TIME * 3`
++ * seconds if no new keys have been received.
++ *
++ * - Timer for, if enabled, sending an empty authenticated packet at a user-
++ * specified interval in seconds.
++ */
++
++static inline void mod_peer_timer(struct wg_peer *peer,
++                                struct timer_list *timer,
++                                unsigned long expires)
++{
++      rcu_read_lock_bh();
++      if (likely(netif_running(peer->device->dev) &&
++                 !READ_ONCE(peer->is_dead)))
++              mod_timer(timer, expires);
++      rcu_read_unlock_bh();
++}
++
++/* Fires when a handshake initiation went unanswered: retry, or give up. */
++static void wg_expired_retransmit_handshake(struct timer_list *timer)
++{
++      struct wg_peer *peer = from_timer(peer, timer,
++                                        timer_retransmit_handshake);
++
++      if (peer->timer_handshake_attempts > MAX_TIMER_HANDSHAKES) {
++              /* Enum limits may be wider than int; cast for %d. */
++              pr_debug("%s: Handshake for peer %llu (%pISpfsc) did not complete after %d attempts, giving up\n",
++                       peer->device->dev->name, peer->internal_id,
++                       &peer->endpoint.addr, (int)MAX_TIMER_HANDSHAKES + 2);
++
++              del_timer(&peer->timer_send_keepalive);
++              /* We drop all packets without a keypair and don't try again,
++               * if we try unsuccessfully for too long to make a handshake.
++               */
++              wg_packet_purge_staged_packets(peer);
++
++              /* Schedule destruction of any residue of a partial exchange. */
++              if (!timer_pending(&peer->timer_zero_key_material))
++                      mod_peer_timer(peer, &peer->timer_zero_key_material,
++                                     jiffies + REJECT_AFTER_TIME * 3 * HZ);
++      } else {
++              ++peer->timer_handshake_attempts;
++              pr_debug("%s: Handshake for peer %llu (%pISpfsc) did not complete after %d seconds, retrying (try %d)\n",
++                       peer->device->dev->name, peer->internal_id,
++                       &peer->endpoint.addr, (int)REKEY_TIMEOUT,
++                       peer->timer_handshake_attempts + 1);
++
++              /* We clear the endpoint's cached source address, in case this
++               * is the cause of trouble.
++               */
++              wg_socket_clear_peer_endpoint_src(peer);
++
++              wg_packet_send_queued_handshake_initiation(peer, true);
++      }
++}
++
++static void wg_expired_send_keepalive(struct timer_list *timer)
++{
++      struct wg_peer *peer = from_timer(peer, timer, timer_send_keepalive);
++
++      wg_packet_send_keepalive(peer);
++      if (peer->timer_need_another_keepalive) {
++              peer->timer_need_another_keepalive = false;
++              mod_peer_timer(peer, &peer->timer_send_keepalive,
++                             jiffies + KEEPALIVE_TIMEOUT * HZ);
++      }
++}
++
++/* Fires when sent data got no reply in time: start a fresh handshake. */
++static void wg_expired_new_handshake(struct timer_list *timer)
++{
++      struct wg_peer *peer = from_timer(peer, timer, timer_new_handshake);
++
++      /* Enum limits may be wider than int; cast for %d. */
++      pr_debug("%s: Retrying handshake with peer %llu (%pISpfsc) because we stopped hearing back after %d seconds\n",
++               peer->device->dev->name, peer->internal_id,
++               &peer->endpoint.addr, (int)(KEEPALIVE_TIMEOUT + REKEY_TIMEOUT));
++      /* Clear the cached source address in case it is causing trouble. */
++      wg_socket_clear_peer_endpoint_src(peer);
++      wg_packet_send_queued_handshake_initiation(peer, false);
++}
++
++static void wg_expired_zero_key_material(struct timer_list *timer)
++{
++      struct wg_peer *peer = from_timer(peer, timer, timer_zero_key_material);
++
++      rcu_read_lock_bh();
++      if (!READ_ONCE(peer->is_dead)) {
++              wg_peer_get(peer);
++              if (!queue_work(peer->device->handshake_send_wq,
++                              &peer->clear_peer_work))
++                      /* If the work was already on the queue, we want to drop
++                       * the extra reference.
++                       */
++                      wg_peer_put(peer);
++      }
++      rcu_read_unlock_bh();
++}
++
++static void wg_queued_expired_zero_key_material(struct work_struct *work)
++{
++      struct wg_peer *peer = container_of(work, struct wg_peer,
++                                          clear_peer_work);
++
++      pr_debug("%s: Zeroing out all keys for peer %llu (%pISpfsc), since we haven't received a new one in %d seconds\n",
++               peer->device->dev->name, peer->internal_id,
++               &peer->endpoint.addr, (int)REJECT_AFTER_TIME * 3);
++      wg_noise_handshake_clear(&peer->handshake);
++      wg_noise_keypairs_clear(&peer->keypairs);
++      wg_peer_put(peer); /* drop the ref taken when the work was queued */
++}
++
++static void wg_expired_send_persistent_keepalive(struct timer_list *timer)
++{
++      struct wg_peer *peer = from_timer(peer, timer,
++                                        timer_persistent_keepalive);
++
++      if (likely(peer->persistent_keepalive_interval))
++              wg_packet_send_keepalive(peer);
++}
++
++/* Should be called after an authenticated data packet is sent. */
++void wg_timers_data_sent(struct wg_peer *peer)
++{
++      if (!timer_pending(&peer->timer_new_handshake))
++              mod_peer_timer(peer, &peer->timer_new_handshake,
++                      jiffies + (KEEPALIVE_TIMEOUT + REKEY_TIMEOUT) * HZ +
++                      prandom_u32_max(REKEY_TIMEOUT_JITTER_MAX_JIFFIES));
++}
++
++/* Should be called after an authenticated data packet is received. */
++void wg_timers_data_received(struct wg_peer *peer)
++{
++      if (likely(netif_running(peer->device->dev))) {
++              if (!timer_pending(&peer->timer_send_keepalive))
++                      mod_peer_timer(peer, &peer->timer_send_keepalive,
++                                     jiffies + KEEPALIVE_TIMEOUT * HZ);
++              else
++                      peer->timer_need_another_keepalive = true;
++      }
++}
++
++/* Should be called after any type of authenticated packet is sent, whether
++ * keepalive, data, or handshake.
++ */
++void wg_timers_any_authenticated_packet_sent(struct wg_peer *peer)
++{
++      del_timer(&peer->timer_send_keepalive);
++}
++
++/* Should be called after any type of authenticated packet is received, whether
++ * keepalive, data, or handshake.
++ */
++void wg_timers_any_authenticated_packet_received(struct wg_peer *peer)
++{
++      del_timer(&peer->timer_new_handshake);
++}
++
++/* Should be called after a handshake initiation message is sent. */
++void wg_timers_handshake_initiated(struct wg_peer *peer)
++{
++      mod_peer_timer(peer, &peer->timer_retransmit_handshake,
++                     jiffies + REKEY_TIMEOUT * HZ +
++                     prandom_u32_max(REKEY_TIMEOUT_JITTER_MAX_JIFFIES));
++}
++
++/* Should be called after a handshake response message is received and processed
++ * or when getting key confirmation via the first data message.
++ */
++void wg_timers_handshake_complete(struct wg_peer *peer)
++{
++      del_timer(&peer->timer_retransmit_handshake);
++      peer->timer_handshake_attempts = 0;
++      peer->sent_lastminute_handshake = false;
++      ktime_get_real_ts64(&peer->walltime_last_handshake);
++}
++
++/* Should be called after an ephemeral key is created, which is before sending a
++ * handshake response or after receiving a handshake response.
++ */
++void wg_timers_session_derived(struct wg_peer *peer)
++{
++      mod_peer_timer(peer, &peer->timer_zero_key_material,
++                     jiffies + REJECT_AFTER_TIME * 3 * HZ);
++}
++
++/* Should be called before a packet with authentication, whether
++ * keepalive, data, or handshake, is sent, or after one is received.
++ */
++void wg_timers_any_authenticated_packet_traversal(struct wg_peer *peer)
++{
++      if (peer->persistent_keepalive_interval)
++              mod_peer_timer(peer, &peer->timer_persistent_keepalive,
++                      jiffies + peer->persistent_keepalive_interval * HZ);
++}
++
++void wg_timers_init(struct wg_peer *peer)
++{
++      timer_setup(&peer->timer_retransmit_handshake,
++                  wg_expired_retransmit_handshake, 0);
++      timer_setup(&peer->timer_send_keepalive, wg_expired_send_keepalive, 0);
++      timer_setup(&peer->timer_new_handshake, wg_expired_new_handshake, 0);
++      timer_setup(&peer->timer_zero_key_material,
++                  wg_expired_zero_key_material, 0);
++      timer_setup(&peer->timer_persistent_keepalive,
++                  wg_expired_send_persistent_keepalive, 0);
++      INIT_WORK(&peer->clear_peer_work, wg_queued_expired_zero_key_material);
++      peer->timer_handshake_attempts = 0;
++      peer->sent_lastminute_handshake = false;
++      peer->timer_need_another_keepalive = false;
++}
++
++void wg_timers_stop(struct wg_peer *peer)
++{
++      del_timer_sync(&peer->timer_retransmit_handshake);
++      del_timer_sync(&peer->timer_send_keepalive);
++      del_timer_sync(&peer->timer_new_handshake);
++      del_timer_sync(&peer->timer_zero_key_material);
++      del_timer_sync(&peer->timer_persistent_keepalive);
++      flush_work(&peer->clear_peer_work);
++}
+--- /dev/null
++++ b/drivers/net/wireguard/timers.h
+@@ -0,0 +1,31 @@
++/* SPDX-License-Identifier: GPL-2.0 */
++/*
++ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
++ */
++
++#ifndef _WG_TIMERS_H
++#define _WG_TIMERS_H
++
++#include <linux/ktime.h>
++
++struct wg_peer;
++
++void wg_timers_init(struct wg_peer *peer);
++void wg_timers_stop(struct wg_peer *peer);
++void wg_timers_data_sent(struct wg_peer *peer);
++void wg_timers_data_received(struct wg_peer *peer);
++void wg_timers_any_authenticated_packet_sent(struct wg_peer *peer);
++void wg_timers_any_authenticated_packet_received(struct wg_peer *peer);
++void wg_timers_handshake_initiated(struct wg_peer *peer);
++void wg_timers_handshake_complete(struct wg_peer *peer);
++void wg_timers_session_derived(struct wg_peer *peer);
++void wg_timers_any_authenticated_packet_traversal(struct wg_peer *peer);
++
++static inline bool wg_birthdate_has_expired(u64 birthday_nanoseconds,
++                                          u64 expiration_seconds)
++{
++      return (s64)(birthday_nanoseconds + expiration_seconds * NSEC_PER_SEC)
++              <= (s64)ktime_get_coarse_boottime_ns();
++}
++
++#endif /* _WG_TIMERS_H */
+--- /dev/null
++++ b/drivers/net/wireguard/version.h
+@@ -0,0 +1 @@
++#define WIREGUARD_VERSION "1.0.0"
+--- /dev/null
++++ b/include/uapi/linux/wireguard.h
+@@ -0,0 +1,196 @@
++/* SPDX-License-Identifier: (GPL-2.0 WITH Linux-syscall-note) OR MIT */
++/*
++ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
++ *
++ * Documentation
++ * =============
++ *
++ * The below enums and macros are for interfacing with WireGuard, using generic
++ * netlink, with family WG_GENL_NAME and version WG_GENL_VERSION. It defines two
++ * methods: get and set. Note that while they share many common attributes,
++ * these two functions actually accept a slightly different set of inputs and
++ * outputs.
++ *
++ * WG_CMD_GET_DEVICE
++ * -----------------
++ *
++ * May only be called via NLM_F_REQUEST | NLM_F_DUMP. The command should contain
++ * one but not both of:
++ *
++ *    WGDEVICE_A_IFINDEX: NLA_U32
++ *    WGDEVICE_A_IFNAME: NLA_NUL_STRING, maxlen IFNAMESIZ - 1
++ *
++ * The kernel will then return several messages (NLM_F_MULTI) containing the
++ * following tree of nested items:
++ *
++ *    WGDEVICE_A_IFINDEX: NLA_U32
++ *    WGDEVICE_A_IFNAME: NLA_NUL_STRING, maxlen IFNAMESIZ - 1
++ *    WGDEVICE_A_PRIVATE_KEY: NLA_EXACT_LEN, len WG_KEY_LEN
++ *    WGDEVICE_A_PUBLIC_KEY: NLA_EXACT_LEN, len WG_KEY_LEN
++ *    WGDEVICE_A_LISTEN_PORT: NLA_U16
++ *    WGDEVICE_A_FWMARK: NLA_U32
++ *    WGDEVICE_A_PEERS: NLA_NESTED
++ *        0: NLA_NESTED
++ *            WGPEER_A_PUBLIC_KEY: NLA_EXACT_LEN, len WG_KEY_LEN
++ *            WGPEER_A_PRESHARED_KEY: NLA_EXACT_LEN, len WG_KEY_LEN
++ *            WGPEER_A_ENDPOINT: NLA_MIN_LEN(struct sockaddr), struct sockaddr_in or struct sockaddr_in6
++ *            WGPEER_A_PERSISTENT_KEEPALIVE_INTERVAL: NLA_U16
++ *            WGPEER_A_LAST_HANDSHAKE_TIME: NLA_EXACT_LEN, struct __kernel_timespec
++ *            WGPEER_A_RX_BYTES: NLA_U64
++ *            WGPEER_A_TX_BYTES: NLA_U64
++ *            WGPEER_A_ALLOWEDIPS: NLA_NESTED
++ *                0: NLA_NESTED
++ *                    WGALLOWEDIP_A_FAMILY: NLA_U16
++ *                    WGALLOWEDIP_A_IPADDR: NLA_MIN_LEN(struct in_addr), struct in_addr or struct in6_addr
++ *                    WGALLOWEDIP_A_CIDR_MASK: NLA_U8
++ *                0: NLA_NESTED
++ *                    ...
++ *                0: NLA_NESTED
++ *                    ...
++ *                ...
++ *            WGPEER_A_PROTOCOL_VERSION: NLA_U32
++ *        0: NLA_NESTED
++ *            ...
++ *        ...
++ *
++ * It is possible that all of the allowed IPs of a single peer will not
++ * fit within a single netlink message. In that case, the same peer will
++ * be written in the following message, except it will only contain
++ * WGPEER_A_PUBLIC_KEY and WGPEER_A_ALLOWEDIPS. This may occur several
++ * times in a row for the same peer. It is then up to the receiver to
++ * coalesce adjacent peers. Likewise, it is possible that all peers will
++ * not fit within a single message. So, subsequent peers will be sent
++ * in following messages, except those will only contain WGDEVICE_A_IFNAME
++ * and WGDEVICE_A_PEERS. It is then up to the receiver to coalesce these
++ * messages to form the complete list of peers.
++ *
++ * Since this is an NLM_F_DUMP command, the final message will always be
++ * NLMSG_DONE, even if an error occurs. However, this NLMSG_DONE message
++ * contains an integer error code. It is either zero or a negative error
++ * code corresponding to the errno.
++ *
++ * WG_CMD_SET_DEVICE
++ * -----------------
++ *
++ * May only be called via NLM_F_REQUEST. The command should contain the
++ * following tree of nested items, containing one but not both of
++ * WGDEVICE_A_IFINDEX and WGDEVICE_A_IFNAME:
++ *
++ *    WGDEVICE_A_IFINDEX: NLA_U32
++ *    WGDEVICE_A_IFNAME: NLA_NUL_STRING, maxlen IFNAMESIZ - 1
++ *    WGDEVICE_A_FLAGS: NLA_U32, 0 or WGDEVICE_F_REPLACE_PEERS if all current
++ *                      peers should be removed prior to adding the list below.
++ *    WGDEVICE_A_PRIVATE_KEY: len WG_KEY_LEN, all zeros to remove
++ *    WGDEVICE_A_LISTEN_PORT: NLA_U16, 0 to choose randomly
++ *    WGDEVICE_A_FWMARK: NLA_U32, 0 to disable
++ *    WGDEVICE_A_PEERS: NLA_NESTED
++ *        0: NLA_NESTED
++ *            WGPEER_A_PUBLIC_KEY: len WG_KEY_LEN
++ *            WGPEER_A_FLAGS: NLA_U32, 0 and/or WGPEER_F_REMOVE_ME if the
++ *                            specified peer should not exist at the end of the
++ *                            operation, rather than added/updated and/or
++ *                            WGPEER_F_REPLACE_ALLOWEDIPS if all current allowed
++ *                            IPs of this peer should be removed prior to adding
++ *                            the list below and/or WGPEER_F_UPDATE_ONLY if the
++ *                            peer should only be set if it already exists.
++ *            WGPEER_A_PRESHARED_KEY: len WG_KEY_LEN, all zeros to remove
++ *            WGPEER_A_ENDPOINT: struct sockaddr_in or struct sockaddr_in6
++ *            WGPEER_A_PERSISTENT_KEEPALIVE_INTERVAL: NLA_U16, 0 to disable
++ *            WGPEER_A_ALLOWEDIPS: NLA_NESTED
++ *                0: NLA_NESTED
++ *                    WGALLOWEDIP_A_FAMILY: NLA_U16
++ *                    WGALLOWEDIP_A_IPADDR: struct in_addr or struct in6_addr
++ *                    WGALLOWEDIP_A_CIDR_MASK: NLA_U8
++ *                0: NLA_NESTED
++ *                    ...
++ *                0: NLA_NESTED
++ *                    ...
++ *                ...
++ *            WGPEER_A_PROTOCOL_VERSION: NLA_U32, should not be set or used at
++ *                                       all by most users of this API, as the
++ *                                       most recent protocol will be used when
++ *                                       this is unset. Otherwise, must be set
++ *                                       to 1.
++ *        0: NLA_NESTED
++ *            ...
++ *        ...
++ *
++ * It is possible that the amount of configuration data exceeds that of
++ * the maximum message length accepted by the kernel. In that case, several
++ * messages should be sent one after another, with each successive one
++ * filling in information not contained in the prior. Note that if
++ * WGDEVICE_F_REPLACE_PEERS is specified in the first message, it probably
++ * should not be specified in fragments that come after, so that the list
++ * of peers is only cleared the first time but appended after. Likewise for
++ * peers, if WGPEER_F_REPLACE_ALLOWEDIPS is specified in the first message
++ * of a peer, it likely should not be specified in subsequent fragments.
++ *
++ * If an error occurs, NLMSG_ERROR will reply containing an errno.
++ */
++
++#ifndef _WG_UAPI_WIREGUARD_H
++#define _WG_UAPI_WIREGUARD_H
++
++#define WG_GENL_NAME "wireguard"
++#define WG_GENL_VERSION 1
++
++#define WG_KEY_LEN 32
++
++enum wg_cmd {
++      WG_CMD_GET_DEVICE,
++      WG_CMD_SET_DEVICE,
++      __WG_CMD_MAX
++};
++#define WG_CMD_MAX (__WG_CMD_MAX - 1)
++
++enum wgdevice_flag {
++      WGDEVICE_F_REPLACE_PEERS = 1U << 0,
++      __WGDEVICE_F_ALL = WGDEVICE_F_REPLACE_PEERS
++};
++enum wgdevice_attribute {
++      WGDEVICE_A_UNSPEC,
++      WGDEVICE_A_IFINDEX,
++      WGDEVICE_A_IFNAME,
++      WGDEVICE_A_PRIVATE_KEY,
++      WGDEVICE_A_PUBLIC_KEY,
++      WGDEVICE_A_FLAGS,
++      WGDEVICE_A_LISTEN_PORT,
++      WGDEVICE_A_FWMARK,
++      WGDEVICE_A_PEERS,
++      __WGDEVICE_A_LAST
++};
++#define WGDEVICE_A_MAX (__WGDEVICE_A_LAST - 1)
++
++enum wgpeer_flag {
++      WGPEER_F_REMOVE_ME = 1U << 0,
++      WGPEER_F_REPLACE_ALLOWEDIPS = 1U << 1,
++      WGPEER_F_UPDATE_ONLY = 1U << 2,
++      __WGPEER_F_ALL = WGPEER_F_REMOVE_ME | WGPEER_F_REPLACE_ALLOWEDIPS |
++                       WGPEER_F_UPDATE_ONLY
++};
++enum wgpeer_attribute {
++      WGPEER_A_UNSPEC,
++      WGPEER_A_PUBLIC_KEY,
++      WGPEER_A_PRESHARED_KEY,
++      WGPEER_A_FLAGS,
++      WGPEER_A_ENDPOINT,
++      WGPEER_A_PERSISTENT_KEEPALIVE_INTERVAL,
++      WGPEER_A_LAST_HANDSHAKE_TIME,
++      WGPEER_A_RX_BYTES,
++      WGPEER_A_TX_BYTES,
++      WGPEER_A_ALLOWEDIPS,
++      WGPEER_A_PROTOCOL_VERSION,
++      __WGPEER_A_LAST
++};
++#define WGPEER_A_MAX (__WGPEER_A_LAST - 1)
++
++enum wgallowedip_attribute {
++      WGALLOWEDIP_A_UNSPEC,
++      WGALLOWEDIP_A_FAMILY,
++      WGALLOWEDIP_A_IPADDR,
++      WGALLOWEDIP_A_CIDR_MASK,
++      __WGALLOWEDIP_A_LAST
++};
++#define WGALLOWEDIP_A_MAX (__WGALLOWEDIP_A_LAST - 1)
++
++#endif /* _WG_UAPI_WIREGUARD_H */
+--- /dev/null
++++ b/tools/testing/selftests/wireguard/netns.sh
+@@ -0,0 +1,537 @@
++#!/bin/bash
++# SPDX-License-Identifier: GPL-2.0
++#
++# Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
++#
++# This script tests the below topology:
++#
++# ┌─────────────────────┐   ┌──────────────────────────────────┐   ┌─────────────────────┐
++# │   $ns1 namespace    │   │          $ns0 namespace          │   │   $ns2 namespace    │
++# │                     │   │                                  │   │                     │
++# │┌────────┐           │   │            ┌────────┐            │   │           ┌────────┐│
++# ││  wg0   │───────────┼───┼────────────│   lo   │────────────┼───┼───────────│  wg0   ││
++# │├────────┴──────────┐│   │    ┌───────┴────────┴────────┐   │   │┌──────────┴────────┤│
++# ││192.168.241.1/24   ││   │    │(ns1)         (ns2)      │   │   ││192.168.241.2/24   ││
++# ││fd00::1/24         ││   │    │127.0.0.1:1   127.0.0.1:2│   │   ││fd00::2/24         ││
++# │└───────────────────┘│   │    │[::]:1        [::]:2     │   │   │└───────────────────┘│
++# └─────────────────────┘   │    └─────────────────────────┘   │   └─────────────────────┘
++#                           └──────────────────────────────────┘
++#
++# After the topology is prepared we run a series of TCP/UDP iperf3 tests between the
++# wireguard peers in $ns1 and $ns2. Note that $ns0 is the endpoint for the wg0
++# interfaces in $ns1 and $ns2. See https://www.wireguard.com/netns/ for further
++# details on how this is accomplished.
++set -e
++
++exec 3>&1
++export WG_HIDE_KEYS=never
++netns0="wg-test-$$-0"
++netns1="wg-test-$$-1"
++netns2="wg-test-$$-2"
++pretty() { echo -e "\x1b[32m\x1b[1m[+] ${1:+NS$1: }${2}\x1b[0m" >&3; } # Log "[+] [NS<n>: ]<msg>" with green+bold escapes to fd 3 (a dup of the original stdout).
++pp() { pretty "" "$*"; "$@"; } # Log a command line without a namespace prefix, then run it.
++maybe_exec() { if [[ $BASHPID -eq $$ ]]; then "$@"; else exec "$@"; fi; } # Main shell: run as a child. Subshell: exec, so the subshell's PID becomes the command's.
++n0() { pretty 0 "$*"; maybe_exec ip netns exec $netns0 "$@"; } # Log, then run a command inside $netns0.
++n1() { pretty 1 "$*"; maybe_exec ip netns exec $netns1 "$@"; } # Log, then run a command inside $netns1.
++n2() { pretty 2 "$*"; maybe_exec ip netns exec $netns2 "$@"; } # Log, then run a command inside $netns2.
++ip0() { pretty 0 "ip $*"; ip -n $netns0 "$@"; } # Log, then run ip(8) against $netns0 via "ip -n".
++ip1() { pretty 1 "ip $*"; ip -n $netns1 "$@"; } # Log, then run ip(8) against $netns1 via "ip -n".
++ip2() { pretty 2 "ip $*"; ip -n $netns2 "$@"; } # Log, then run ip(8) against $netns2 via "ip -n".
++sleep() { read -t "$1" -N 0 || true; } # Shadows sleep(1) with the read builtin's -t timeout; "|| true" keeps a timeout from tripping set -e.
++waitiperf() { pretty "${1//*-}" "wait for iperf:5201"; while [[ $(ss -N "$1" -tlp 'sport = 5201') != *iperf3* ]]; do sleep 0.1; done; } # Poll ss in namespace $1 until iperf3 listens on TCP port 5201.
++waitncatudp() { pretty "${1//*-}" "wait for udp:1111"; while [[ $(ss -N "$1" -ulp 'sport = 1111') != *ncat* ]]; do sleep 0.1; done; } # Poll ss in namespace $1 until ncat is bound to UDP port 1111.
++waitncattcp() { pretty "${1//*-}" "wait for tcp:1111"; while [[ $(ss -N "$1" -tlp 'sport = 1111') != *ncat* ]]; do sleep 0.1; done; } # Poll ss in namespace $1 until ncat listens on TCP port 1111.
++waitiface() { pretty "${1//*-}" "wait for $2 to come up"; ip netns exec "$1" bash -c "while [[ \$(< \"/sys/class/net/$2/operstate\") != up ]]; do read -t .1 -N 0 || true; done;"; } # Block until interface $2 in namespace $1 reports operstate "up".
++
++cleanup() {
++      set +e
++      exec 2>/dev/null
++      printf "$orig_message_cost" > /proc/sys/net/core/message_cost
++      ip0 link del dev wg0
++      ip1 link del dev wg0
++      ip2 link del dev wg0
++      local to_kill="$(ip netns pids $netns0) $(ip netns pids $netns1) $(ip netns pids $netns2)"
++      [[ -n $to_kill ]] && kill $to_kill
++      pp ip netns del $netns1
++      pp ip netns del $netns2
++      pp ip netns del $netns0
++      exit
++}
++
++orig_message_cost="$(< /proc/sys/net/core/message_cost)"
++trap cleanup EXIT
++printf 0 > /proc/sys/net/core/message_cost
++
++ip netns del $netns0 2>/dev/null || true
++ip netns del $netns1 2>/dev/null || true
++ip netns del $netns2 2>/dev/null || true
++pp ip netns add $netns0
++pp ip netns add $netns1
++pp ip netns add $netns2
++ip0 link set up dev lo
++
++ip0 link add dev wg0 type wireguard
++ip0 link set wg0 netns $netns1
++ip0 link add dev wg0 type wireguard
++ip0 link set wg0 netns $netns2
++key1="$(pp wg genkey)"
++key2="$(pp wg genkey)"
++key3="$(pp wg genkey)"
++pub1="$(pp wg pubkey <<<"$key1")"
++pub2="$(pp wg pubkey <<<"$key2")"
++pub3="$(pp wg pubkey <<<"$key3")"
++psk="$(pp wg genpsk)"
++[[ -n $key1 && -n $key2 && -n $psk ]]
++
++configure_peers() { # Address both wg0 devices, configure each as the other's peer, and bring them up.
++      ip1 addr add 192.168.241.1/24 dev wg0
++      ip1 addr add fd00::1/24 dev wg0
++
++      ip2 addr add 192.168.241.2/24 dev wg0
++      ip2 addr add fd00::2/24 dev wg0
++      # Keys are handed to wg(8) as file paths, hence the process substitutions.
++      n1 wg set wg0 \
++              private-key <(echo "$key1") \
++              listen-port 1 \
++              peer "$pub2" \
++                      preshared-key <(echo "$psk") \
++                      allowed-ips 192.168.241.2/32,fd00::2/128
++      n2 wg set wg0 \
++              private-key <(echo "$key2") \
++              listen-port 2 \
++              peer "$pub1" \
++                      preshared-key <(echo "$psk") \
++                      allowed-ips 192.168.241.1/32,fd00::1/128
++      # No endpoints are set here; each caller picks the outer transport afterwards.
++      ip1 link set up dev wg0
++      ip2 link set up dev wg0
++}
++configure_peers
++
++tests() { # Ping and iperf3 matrix in both directions, run against whatever endpoints and MTU are currently set.
++      # Ping over IPv4
++      n2 ping -c 10 -f -W 1 192.168.241.1
++      n1 ping -c 10 -f -W 1 192.168.241.2
++
++      # Ping over IPv6
++      n2 ping6 -c 10 -f -W 1 fd00::1
++      n1 ping6 -c 10 -f -W 1 fd00::2
++
++      # TCP over IPv4
++      n2 iperf3 -s -1 -B 192.168.241.2 & # Server runs in the background; the next line waits until it is listening.
++      waitiperf $netns2
++      n1 iperf3 -Z -t 3 -c 192.168.241.2
++
++      # TCP over IPv6
++      n1 iperf3 -s -1 -B fd00::1 &
++      waitiperf $netns1
++      n2 iperf3 -Z -t 3 -c fd00::1
++
++      # UDP over IPv4
++      n1 iperf3 -s -1 -B 192.168.241.1 &
++      waitiperf $netns1
++      n2 iperf3 -Z -t 3 -b 0 -u -c 192.168.241.1
++
++      # UDP over IPv6
++      n2 iperf3 -s -1 -B fd00::2 &
++      waitiperf $netns2
++      n1 iperf3 -Z -t 3 -b 0 -u -c fd00::2
++}
++
++[[ $(ip1 link show dev wg0) =~ mtu\ ([0-9]+) ]] && orig_mtu="${BASH_REMATCH[1]}"
++big_mtu=$(( 34816 - 1500 + $orig_mtu ))
++
++# Test using IPv4 as outer transport
++n1 wg set wg0 peer "$pub2" endpoint 127.0.0.1:2
++n2 wg set wg0 peer "$pub1" endpoint 127.0.0.1:1
++# Before calling tests, we first make sure that the stats counters and timestamper are working
++n2 ping -c 10 -f -W 1 192.168.241.1
++{ read _; read _; read _; read rx_bytes _; read _; read tx_bytes _; } < <(ip2 -stats link show dev wg0)
++(( rx_bytes == 1372 && (tx_bytes == 1428 || tx_bytes == 1460) ))
++{ read _; read _; read _; read rx_bytes _; read _; read tx_bytes _; } < <(ip1 -stats link show dev wg0)
++(( tx_bytes == 1372 && (rx_bytes == 1428 || rx_bytes == 1460) ))
++read _ rx_bytes tx_bytes < <(n2 wg show wg0 transfer)
++(( rx_bytes == 1372 && (tx_bytes == 1428 || tx_bytes == 1460) ))
++read _ rx_bytes tx_bytes < <(n1 wg show wg0 transfer)
++(( tx_bytes == 1372 && (rx_bytes == 1428 || rx_bytes == 1460) ))
++read _ timestamp < <(n1 wg show wg0 latest-handshakes)
++(( timestamp != 0 ))
++
++tests
++ip1 link set wg0 mtu $big_mtu
++ip2 link set wg0 mtu $big_mtu
++tests
++
++ip1 link set wg0 mtu $orig_mtu
++ip2 link set wg0 mtu $orig_mtu
++
++# Test using IPv6 as outer transport
++n1 wg set wg0 peer "$pub2" endpoint [::1]:2
++n2 wg set wg0 peer "$pub1" endpoint [::1]:1
++tests
++ip1 link set wg0 mtu $big_mtu
++ip2 link set wg0 mtu $big_mtu
++tests
++
++# Test that route MTUs work with the padding
++ip1 link set wg0 mtu 1300
++ip2 link set wg0 mtu 1300
++n1 wg set wg0 peer "$pub2" endpoint 127.0.0.1:2
++n2 wg set wg0 peer "$pub1" endpoint 127.0.0.1:1
++n0 iptables -A INPUT -m length --length 1360 -j DROP
++n1 ip route add 192.168.241.2/32 dev wg0 mtu 1299
++n2 ip route add 192.168.241.1/32 dev wg0 mtu 1299
++n2 ping -c 1 -W 1 -s 1269 192.168.241.1
++n2 ip route delete 192.168.241.1/32 dev wg0 mtu 1299
++n1 ip route delete 192.168.241.2/32 dev wg0 mtu 1299
++n0 iptables -F INPUT
++
++ip1 link set wg0 mtu $orig_mtu
++ip2 link set wg0 mtu $orig_mtu
++
++# Test using IPv4 that roaming works
++ip0 -4 addr del 127.0.0.1/8 dev lo
++ip0 -4 addr add 127.212.121.99/8 dev lo
++n1 wg set wg0 listen-port 9999
++n1 wg set wg0 peer "$pub2" endpoint 127.0.0.1:2
++n1 ping6 -W 1 -c 1 fd00::2
++[[ $(n2 wg show wg0 endpoints) == "$pub1      127.212.121.99:9999" ]]
++
++# Test using IPv6 that roaming works
++n1 wg set wg0 listen-port 9998
++n1 wg set wg0 peer "$pub2" endpoint [::1]:2
++n1 ping -W 1 -c 1 192.168.241.2
++[[ $(n2 wg show wg0 endpoints) == "$pub1      [::1]:9998" ]]
++
++# Test that crypto-RP filter works
++n1 wg set wg0 peer "$pub2" allowed-ips 192.168.241.0/24
++exec 4< <(n1 ncat -l -u -p 1111)
++ncat_pid=$!
++waitncatudp $netns1
++n2 ncat -u 192.168.241.1 1111 <<<"X"
++read -r -N 1 -t 1 out <&4 && [[ $out == "X" ]]
++kill $ncat_pid
++more_specific_key="$(pp wg genkey | pp wg pubkey)"
++n1 wg set wg0 peer "$more_specific_key" allowed-ips 192.168.241.2/32
++n2 wg set wg0 listen-port 9997
++exec 4< <(n1 ncat -l -u -p 1111)
++ncat_pid=$!
++waitncatudp $netns1
++n2 ncat -u 192.168.241.1 1111 <<<"X"
++! read -r -N 1 -t 1 out <&4 || false
++kill $ncat_pid
++n1 wg set wg0 peer "$more_specific_key" remove
++[[ $(n1 wg show wg0 endpoints) == "$pub2      [::1]:9997" ]]
++
++# Test that we can change private keys and immediately handshake
++n1 wg set wg0 private-key <(echo "$key1") peer "$pub2" preshared-key <(echo "$psk") allowed-ips 192.168.241.2/32 endpoint 127.0.0.1:2
++n2 wg set wg0 private-key <(echo "$key2") listen-port 2 peer "$pub1" preshared-key <(echo "$psk") allowed-ips 192.168.241.1/32
++n1 ping -W 1 -c 1 192.168.241.2
++n1 wg set wg0 private-key <(echo "$key3")
++n2 wg set wg0 peer "$pub3" preshared-key <(echo "$psk") allowed-ips 192.168.241.1/32 peer "$pub1" remove
++n1 ping -W 1 -c 1 192.168.241.2
++
++ip1 link del wg0
++ip2 link del wg0
++
++# Test using NAT. We now change the topology to this:
++# ┌────────────────────────────────────────┐    ┌────────────────────────────────────────────────┐     ┌────────────────────────────────────────┐
++# │             $ns1 namespace             │    │                 $ns0 namespace                 │     │             $ns2 namespace             │
++# │                                        │    │                                                │     │                                        │
++# │  ┌─────┐             ┌─────┐           │    │    ┌──────┐              ┌──────┐              │     │  ┌─────┐            ┌─────┐            │
++# │  │ wg0 │─────────────│vethc│───────────┼────┼────│vethrc│              │vethrs│──────────────┼─────┼──│veths│────────────│ wg0 │            │
++# │  ├─────┴──────────┐  ├─────┴──────────┐│    │    ├──────┴─────────┐    ├──────┴────────────┐ │     │  ├─────┴──────────┐ ├─────┴──────────┐ │
++# │  │192.168.241.1/24│  │192.168.1.100/24││    │    │192.168.1.1/24  │    │10.0.0.1/24        │ │     │  │10.0.0.100/24   │ │192.168.241.2/24│ │
++# │  │fd00::1/24      │  │                ││    │    │                │    │SNAT:192.168.1.0/24│ │     │  │                │ │fd00::2/24      │ │
++# │  └────────────────┘  └────────────────┘│    │    └────────────────┘    └───────────────────┘ │     │  └────────────────┘ └────────────────┘ │
++# └────────────────────────────────────────┘    └────────────────────────────────────────────────┘     └────────────────────────────────────────┘
++
++ip1 link add dev wg0 type wireguard
++ip2 link add dev wg0 type wireguard
++configure_peers
++
++ip0 link add vethrc type veth peer name vethc
++ip0 link add vethrs type veth peer name veths
++ip0 link set vethc netns $netns1
++ip0 link set veths netns $netns2
++ip0 link set vethrc up
++ip0 link set vethrs up
++ip0 addr add 192.168.1.1/24 dev vethrc
++ip0 addr add 10.0.0.1/24 dev vethrs
++ip1 addr add 192.168.1.100/24 dev vethc
++ip1 link set vethc up
++ip1 route add default via 192.168.1.1
++ip2 addr add 10.0.0.100/24 dev veths
++ip2 link set veths up
++waitiface $netns0 vethrc
++waitiface $netns0 vethrs
++waitiface $netns1 vethc
++waitiface $netns2 veths
++
++n0 bash -c 'printf 1 > /proc/sys/net/ipv4/ip_forward'
++n0 bash -c 'printf 2 > /proc/sys/net/netfilter/nf_conntrack_udp_timeout'
++n0 bash -c 'printf 2 > /proc/sys/net/netfilter/nf_conntrack_udp_timeout_stream'
++n0 iptables -t nat -A POSTROUTING -s 192.168.1.0/24 -d 10.0.0.0/24 -j SNAT --to 10.0.0.1
++
++n1 wg set wg0 peer "$pub2" endpoint 10.0.0.100:2 persistent-keepalive 1
++n1 ping -W 1 -c 1 192.168.241.2
++n2 ping -W 1 -c 1 192.168.241.1
++[[ $(n2 wg show wg0 endpoints) == "$pub1      10.0.0.1:1" ]]
++# Demonstrate n2 can still send packets to n1, since persistent-keepalive will prevent connection tracking entry from expiring (to see entries: `n0 conntrack -L`).
++pp sleep 3
++n2 ping -W 1 -c 1 192.168.241.1
++n1 wg set wg0 peer "$pub2" persistent-keepalive 0
++
++# Do a wg-quick(8)-style policy routing for the default route, making sure vethc has a v6 address to tease out bugs.
++ip1 -6 addr add fc00::9/96 dev vethc
++ip1 -6 route add default via fc00::1
++ip2 -4 addr add 192.168.99.7/32 dev wg0
++ip2 -6 addr add abab::1111/128 dev wg0
++n1 wg set wg0 fwmark 51820 peer "$pub2" allowed-ips 192.168.99.7,abab::1111
++ip1 -6 route add default dev wg0 table 51820
++ip1 -6 rule add not fwmark 51820 table 51820
++ip1 -6 rule add table main suppress_prefixlength 0
++ip1 -4 route add default dev wg0 table 51820
++ip1 -4 rule add not fwmark 51820 table 51820
++ip1 -4 rule add table main suppress_prefixlength 0
++# suppress_prefixlength only got added in 3.12, and we want to support 3.10+.
++if [[ $(ip1 -4 rule show all) == *suppress_prefixlength* ]]; then
++      # Flood the pings instead of sending just one, to trigger routing table reference counting bugs.
++      n1 ping -W 1 -c 100 -f 192.168.99.7
++      n1 ping -W 1 -c 100 -f abab::1111
++fi
++
++n0 iptables -t nat -F
++ip0 link del vethrc
++ip0 link del vethrs
++ip1 link del wg0
++ip2 link del wg0
++
++# Test that saddr routing is sticky but not too sticky, changing to this topology:
++# ┌────────────────────────────────────────┐    ┌────────────────────────────────────────┐
++# │             $ns1 namespace             │    │             $ns2 namespace             │
++# │                                        │    │                                        │
++# │  ┌─────┐             ┌─────┐           │    │  ┌─────┐            ┌─────┐            │
++# │  │ wg0 │─────────────│veth1│───────────┼────┼──│veth2│────────────│ wg0 │            │
++# │  ├─────┴──────────┐  ├─────┴──────────┐│    │  ├─────┴──────────┐ ├─────┴──────────┐ │
++# │  │192.168.241.1/24│  │10.0.0.1/24     ││    │  │10.0.0.2/24     │ │192.168.241.2/24│ │
++# │  │fd00::1/24      │  │fd00:aa::1/96   ││    │  │fd00:aa::2/96   │ │fd00::2/24      │ │
++# │  └────────────────┘  └────────────────┘│    │  └────────────────┘ └────────────────┘ │
++# └────────────────────────────────────────┘    └────────────────────────────────────────┘
++
++ip1 link add dev wg0 type wireguard
++ip2 link add dev wg0 type wireguard
++configure_peers
++ip1 link add veth1 type veth peer name veth2
++ip1 link set veth2 netns $netns2
++n1 bash -c 'printf 0 > /proc/sys/net/ipv6/conf/all/accept_dad'
++n2 bash -c 'printf 0 > /proc/sys/net/ipv6/conf/all/accept_dad'
++n1 bash -c 'printf 0 > /proc/sys/net/ipv6/conf/veth1/accept_dad'
++n2 bash -c 'printf 0 > /proc/sys/net/ipv6/conf/veth2/accept_dad'
++n1 bash -c 'printf 1 > /proc/sys/net/ipv4/conf/veth1/promote_secondaries'
++
++# First we check that we aren't overly sticky and can fall over to new IPs when old ones are removed
++ip1 addr add 10.0.0.1/24 dev veth1
++ip1 addr add fd00:aa::1/96 dev veth1
++ip2 addr add 10.0.0.2/24 dev veth2
++ip2 addr add fd00:aa::2/96 dev veth2
++ip1 link set veth1 up
++ip2 link set veth2 up
++waitiface $netns1 veth1
++waitiface $netns2 veth2
++n1 wg set wg0 peer "$pub2" endpoint 10.0.0.2:2
++n1 ping -W 1 -c 1 192.168.241.2
++ip1 addr add 10.0.0.10/24 dev veth1
++ip1 addr del 10.0.0.1/24 dev veth1
++n1 ping -W 1 -c 1 192.168.241.2
++n1 wg set wg0 peer "$pub2" endpoint [fd00:aa::2]:2
++n1 ping -W 1 -c 1 192.168.241.2
++ip1 addr add fd00:aa::10/96 dev veth1
++ip1 addr del fd00:aa::1/96 dev veth1
++n1 ping -W 1 -c 1 192.168.241.2
++
++# Now we show that we can successfully do reply to sender routing
++ip1 link set veth1 down
++ip2 link set veth2 down
++ip1 addr flush dev veth1
++ip2 addr flush dev veth2
++ip1 addr add 10.0.0.1/24 dev veth1
++ip1 addr add 10.0.0.2/24 dev veth1
++ip1 addr add fd00:aa::1/96 dev veth1
++ip1 addr add fd00:aa::2/96 dev veth1
++ip2 addr add 10.0.0.3/24 dev veth2
++ip2 addr add fd00:aa::3/96 dev veth2
++ip1 link set veth1 up
++ip2 link set veth2 up
++waitiface $netns1 veth1
++waitiface $netns2 veth2
++n2 wg set wg0 peer "$pub1" endpoint 10.0.0.1:1
++n2 ping -W 1 -c 1 192.168.241.1
++[[ $(n2 wg show wg0 endpoints) == "$pub1      10.0.0.1:1" ]]
++n2 wg set wg0 peer "$pub1" endpoint [fd00:aa::1]:1
++n2 ping -W 1 -c 1 192.168.241.1
++[[ $(n2 wg show wg0 endpoints) == "$pub1      [fd00:aa::1]:1" ]]
++n2 wg set wg0 peer "$pub1" endpoint 10.0.0.2:1
++n2 ping -W 1 -c 1 192.168.241.1
++[[ $(n2 wg show wg0 endpoints) == "$pub1      10.0.0.2:1" ]]
++n2 wg set wg0 peer "$pub1" endpoint [fd00:aa::2]:1
++n2 ping -W 1 -c 1 192.168.241.1
++[[ $(n2 wg show wg0 endpoints) == "$pub1      [fd00:aa::2]:1" ]]
++
++# What happens if the inbound destination address belongs to a different interface as the default route?
++ip1 link add dummy0 type dummy
++ip1 addr add 10.50.0.1/24 dev dummy0
++ip1 link set dummy0 up
++ip2 route add 10.50.0.0/24 dev veth2
++n2 wg set wg0 peer "$pub1" endpoint 10.50.0.1:1
++n2 ping -W 1 -c 1 192.168.241.1
++[[ $(n2 wg show wg0 endpoints) == "$pub1      10.50.0.1:1" ]]
++
++ip1 link del dummy0
++ip1 addr flush dev veth1
++ip2 addr flush dev veth2
++ip1 route flush dev veth1
++ip2 route flush dev veth2
++
++# Now we see what happens if another interface route takes precedence over an ongoing one
++ip1 link add veth3 type veth peer name veth4
++ip1 link set veth4 netns $netns2
++ip1 addr add 10.0.0.1/24 dev veth1
++ip2 addr add 10.0.0.2/24 dev veth2
++ip1 addr add 10.0.0.3/24 dev veth3
++ip1 link set veth1 up
++ip2 link set veth2 up
++ip1 link set veth3 up
++ip2 link set veth4 up
++waitiface $netns1 veth1
++waitiface $netns2 veth2
++waitiface $netns1 veth3
++waitiface $netns2 veth4
++ip1 route flush dev veth1
++ip1 route flush dev veth3
++ip1 route add 10.0.0.0/24 dev veth1 src 10.0.0.1 metric 2
++n1 wg set wg0 peer "$pub2" endpoint 10.0.0.2:2
++n1 ping -W 1 -c 1 192.168.241.2
++[[ $(n2 wg show wg0 endpoints) == "$pub1      10.0.0.1:1" ]]
++ip1 route add 10.0.0.0/24 dev veth3 src 10.0.0.3 metric 1
++n1 bash -c 'printf 0 > /proc/sys/net/ipv4/conf/veth1/rp_filter'
++n2 bash -c 'printf 0 > /proc/sys/net/ipv4/conf/veth4/rp_filter'
++n1 bash -c 'printf 0 > /proc/sys/net/ipv4/conf/all/rp_filter'
++n2 bash -c 'printf 0 > /proc/sys/net/ipv4/conf/all/rp_filter'
++n1 ping -W 1 -c 1 192.168.241.2
++[[ $(n2 wg show wg0 endpoints) == "$pub1      10.0.0.3:1" ]]
++
++ip1 link del veth1
++ip1 link del veth3
++ip1 link del wg0
++ip2 link del wg0
++
++# We test that Netlink/IPC is working properly by doing things that usually cause split responses
++ip0 link add dev wg0 type wireguard
++config=( "[Interface]" "PrivateKey=$(wg genkey)" "[Peer]" "PublicKey=$(wg genkey)" )
++for a in {1..255}; do
++      for b in {0..255}; do
++              config+=( "AllowedIPs=$a.$b.0.0/16,$a::$b/128" )
++      done
++done
++n0 wg setconf wg0 <(printf '%s\n' "${config[@]}")
++i=0
++for ip in $(n0 wg show wg0 allowed-ips); do
++      ((++i))
++done
++((i == 255*256*2+1))
++ip0 link del wg0
++ip0 link add dev wg0 type wireguard
++config=( "[Interface]" "PrivateKey=$(wg genkey)" )
++for a in {1..40}; do
++      config+=( "[Peer]" "PublicKey=$(wg genkey)" )
++      for b in {1..52}; do
++              config+=( "AllowedIPs=$a.$b.0.0/16" )
++      done
++done
++n0 wg setconf wg0 <(printf '%s\n' "${config[@]}")
++i=0
++while read -r line; do
++      j=0
++      for ip in $line; do
++              ((++j))
++      done
++      ((j == 53))
++      ((++i))
++done < <(n0 wg show wg0 allowed-ips)
++((i == 40))
++ip0 link del wg0
++ip0 link add wg0 type wireguard
++config=( )
++for i in {1..29}; do
++      config+=( "[Peer]" "PublicKey=$(wg genkey)" )
++done
++config+=( "[Peer]" "PublicKey=$(wg genkey)" "AllowedIPs=255.2.3.4/32,abcd::255/128" )
++n0 wg setconf wg0 <(printf '%s\n' "${config[@]}")
++n0 wg showconf wg0 > /dev/null
++ip0 link del wg0
++
++allowedips=( )
++for i in {1..197}; do
++        allowedips+=( abcd::$i )
++done
++saved_ifs="$IFS"
++IFS=,
++allowedips="${allowedips[*]}"
++IFS="$saved_ifs"
++ip0 link add wg0 type wireguard
++n0 wg set wg0 peer "$pub1"
++n0 wg set wg0 peer "$pub2" allowed-ips "$allowedips"
++{
++      read -r pub allowedips
++      [[ $pub == "$pub1" && $allowedips == "(none)" ]]
++      read -r pub allowedips
++      [[ $pub == "$pub2" ]]
++      i=0
++      for _ in $allowedips; do
++              ((++i))
++      done
++      ((i == 197))
++} < <(n0 wg show wg0 allowed-ips)
++ip0 link del wg0
++
++! n0 wg show doesnotexist || false
++
++ip0 link add wg0 type wireguard
++n0 wg set wg0 private-key <(echo "$key1") peer "$pub2" preshared-key <(echo "$psk")
++[[ $(n0 wg show wg0 private-key) == "$key1" ]]
++[[ $(n0 wg show wg0 preshared-keys) == "$pub2 $psk" ]]
++n0 wg set wg0 private-key /dev/null peer "$pub2" preshared-key /dev/null
++[[ $(n0 wg show wg0 private-key) == "(none)" ]]
++[[ $(n0 wg show wg0 preshared-keys) == "$pub2 (none)" ]]
++n0 wg set wg0 peer "$pub2"
++n0 wg set wg0 private-key <(echo "$key2")
++[[ $(n0 wg show wg0 public-key) == "$pub2" ]]
++[[ -z $(n0 wg show wg0 peers) ]]
++n0 wg set wg0 peer "$pub2"
++[[ -z $(n0 wg show wg0 peers) ]]
++n0 wg set wg0 private-key <(echo "$key1")
++n0 wg set wg0 peer "$pub2"
++[[ $(n0 wg show wg0 peers) == "$pub2" ]]
++n0 wg set wg0 private-key <(echo "/${key1:1}")
++[[ $(n0 wg show wg0 private-key) == "+${key1:1}" ]]
++n0 wg set wg0 peer "$pub2" allowed-ips 0.0.0.0/0,10.0.0.0/8,100.0.0.0/10,172.16.0.0/12,192.168.0.0/16
++n0 wg set wg0 peer "$pub2" allowed-ips 0.0.0.0/0
++n0 wg set wg0 peer "$pub2" allowed-ips ::/0,1700::/111,5000::/4,e000::/37,9000::/75
++n0 wg set wg0 peer "$pub2" allowed-ips ::/0
++ip0 link del wg0
++
++declare -A objects
++while read -t 0.1 -r line 2>/dev/null || [[ $? -ne 142 ]]; do
++      [[ $line =~ .*(wg[0-9]+:\ [A-Z][a-z]+\ [0-9]+)\ .*(created|destroyed).* ]] || continue
++      objects["${BASH_REMATCH[1]}"]+="${BASH_REMATCH[2]}"
++done < /dev/kmsg
++alldeleted=1
++for object in "${!objects[@]}"; do
++      if [[ ${objects["$object"]} != *createddestroyed ]]; then
++              echo "Error: $object: merely ${objects["$object"]}" >&3
++              alldeleted=0
++      fi
++done
++[[ $alldeleted -eq 1 ]]
++pretty "" "Objects that were created were also destroyed."
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0074-wireguard-selftests-import-harness-makefile-for-test.patch b/target/linux/generic/backport-5.4/080-wireguard-0074-wireguard-selftests-import-harness-makefile-for-test.patch

new file mode 100644 (file)

index 0000000..60ecebf
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0074-wireguard-selftests-import-harness-makefile-for-test.patch
@@ -0,0 +1,1079 @@
+From e333013ee167444adefd8a292e401b70e97dd4b2 Mon Sep 17 00:00:00 2001
+From: "Jason A. Donenfeld" <Jason@zx2c4.com>
+Date: Sun, 15 Dec 2019 22:08:00 +0100
+Subject: [PATCH 074/124] wireguard: selftests: import harness makefile for
+ test suite
+
+commit 65d88d04114bca7d85faebd5fed61069cb2b632c upstream.
+
+WireGuard has been using this on build.wireguard.com for the last
+several years with considerable success. It allows for very quick and
+iterative development cycles, and supports several platforms.
+
+To run the test suite on your current platform in QEMU:
+
+  $ make -C tools/testing/selftests/wireguard/qemu -j$(nproc)
+
+To run it with KASAN and such turned on:
+
+  $ DEBUG_KERNEL=yes make -C tools/testing/selftests/wireguard/qemu -j$(nproc)
+
+To run it emulated for another platform in QEMU:
+
+  $ ARCH=arm make -C tools/testing/selftests/wireguard/qemu -j$(nproc)
+
+At the moment, we support aarch64_be, aarch64, arm, armeb, i686, m68k,
+mips64, mips64el, mips, mipsel, powerpc64le, powerpc, and x86_64.
+
+The system supports incremental rebuilding, so it should be very fast to
+change a single file and then test it out and have immediate feedback.
+
+This requires for the right toolchain and qemu to be installed prior.
+I've had success with those from musl.cc.
+
+This is tailored for WireGuard at the moment, though later projects
+might generalize it for other network testing.
+
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ .../selftests/wireguard/qemu/.gitignore       |   2 +
+ .../testing/selftests/wireguard/qemu/Makefile | 385 ++++++++++++++++++
+ .../wireguard/qemu/arch/aarch64.config        |   5 +
+ .../wireguard/qemu/arch/aarch64_be.config     |   6 +
+ .../selftests/wireguard/qemu/arch/arm.config  |   9 +
+ .../wireguard/qemu/arch/armeb.config          |  10 +
+ .../selftests/wireguard/qemu/arch/i686.config |   5 +
+ .../selftests/wireguard/qemu/arch/m68k.config |   9 +
+ .../selftests/wireguard/qemu/arch/mips.config |  11 +
+ .../wireguard/qemu/arch/mips64.config         |  14 +
+ .../wireguard/qemu/arch/mips64el.config       |  15 +
+ .../wireguard/qemu/arch/mipsel.config         |  12 +
+ .../wireguard/qemu/arch/powerpc.config        |  10 +
+ .../wireguard/qemu/arch/powerpc64le.config    |  12 +
+ .../wireguard/qemu/arch/x86_64.config         |   5 +
+ .../selftests/wireguard/qemu/debug.config     |  67 +++
+ tools/testing/selftests/wireguard/qemu/init.c | 284 +++++++++++++
+ .../selftests/wireguard/qemu/kernel.config    |  86 ++++
+ 18 files changed, 947 insertions(+)
+ create mode 100644 tools/testing/selftests/wireguard/qemu/.gitignore
+ create mode 100644 tools/testing/selftests/wireguard/qemu/Makefile
+ create mode 100644 tools/testing/selftests/wireguard/qemu/arch/aarch64.config
+ create mode 100644 tools/testing/selftests/wireguard/qemu/arch/aarch64_be.config
+ create mode 100644 tools/testing/selftests/wireguard/qemu/arch/arm.config
+ create mode 100644 tools/testing/selftests/wireguard/qemu/arch/armeb.config
+ create mode 100644 tools/testing/selftests/wireguard/qemu/arch/i686.config
+ create mode 100644 tools/testing/selftests/wireguard/qemu/arch/m68k.config
+ create mode 100644 tools/testing/selftests/wireguard/qemu/arch/mips.config
+ create mode 100644 tools/testing/selftests/wireguard/qemu/arch/mips64.config
+ create mode 100644 tools/testing/selftests/wireguard/qemu/arch/mips64el.config
+ create mode 100644 tools/testing/selftests/wireguard/qemu/arch/mipsel.config
+ create mode 100644 tools/testing/selftests/wireguard/qemu/arch/powerpc.config
+ create mode 100644 tools/testing/selftests/wireguard/qemu/arch/powerpc64le.config
+ create mode 100644 tools/testing/selftests/wireguard/qemu/arch/x86_64.config
+ create mode 100644 tools/testing/selftests/wireguard/qemu/debug.config
+ create mode 100644 tools/testing/selftests/wireguard/qemu/init.c
+ create mode 100644 tools/testing/selftests/wireguard/qemu/kernel.config
+
+--- /dev/null
++++ b/tools/testing/selftests/wireguard/qemu/.gitignore
+@@ -0,0 +1,2 @@
++build/
++distfiles/
+--- /dev/null
++++ b/tools/testing/selftests/wireguard/qemu/Makefile
+@@ -0,0 +1,385 @@
++# SPDX-License-Identifier: GPL-2.0
++#
++# Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
++
++PWD := $(shell pwd)
++
++CHOST := $(shell gcc -dumpmachine)
++ifneq (,$(ARCH))
++CBUILD := $(subst -gcc,,$(lastword $(subst /, ,$(firstword $(wildcard $(foreach bindir,$(subst :, ,$(PATH)),$(bindir)/$(ARCH)-*-gcc))))))
++ifeq (,$(CBUILD))
++$(error The toolchain for $(ARCH) is not installed)
++endif
++else
++CBUILD := $(CHOST)
++ARCH := $(firstword $(subst -, ,$(CBUILD)))
++endif
++
++# Set these from the environment to override
++KERNEL_PATH ?= $(PWD)/../../../../..
++BUILD_PATH ?= $(PWD)/build/$(ARCH)
++DISTFILES_PATH ?= $(PWD)/distfiles
++NR_CPUS ?= 4
++
++MIRROR := https://download.wireguard.com/qemu-test/distfiles/
++
++default: qemu
++
++# variable name, tarball project name, version, tarball extension, default URI base
++define tar_download =
++$(1)_VERSION := $(3)
++$(1)_NAME := $(2)-$$($(1)_VERSION)
++$(1)_TAR := $(DISTFILES_PATH)/$$($(1)_NAME)$(4)
++$(1)_PATH := $(BUILD_PATH)/$$($(1)_NAME)
++$(call file_download,$$($(1)_NAME)$(4),$(5),$(6))
++endef
++
++define file_download =
++$(DISTFILES_PATH)/$(1):
++      mkdir -p $(DISTFILES_PATH)
++      flock -x $$@.lock -c '[ -f $$@ ] && exit 0; wget -O $$@.tmp $(MIRROR)$(1) || wget -t inf --retry-on-http-error=404 -O $$@.tmp $(2)$(1) || rm -f $$@.tmp'
++      if echo "$(3)  $$@.tmp" | sha256sum -c -; then mv $$@.tmp $$@; else rm -f $$@.tmp; exit 71; fi
++endef
++
++$(eval $(call tar_download,MUSL,musl,1.1.20,.tar.gz,https://www.musl-libc.org/releases/,44be8771d0e6c6b5f82dd15662eb2957c9a3173a19a8b49966ac0542bbd40d61))
++$(eval $(call tar_download,LIBMNL,libmnl,1.0.4,.tar.bz2,https://www.netfilter.org/projects/libmnl/files/,171f89699f286a5854b72b91d06e8f8e3683064c5901fb09d954a9ab6f551f81))
++$(eval $(call tar_download,IPERF,iperf,3.1.7,.tar.gz,http://downloads.es.net/pub/iperf/,a4ef73406fe92250602b8da2ae89ec53211f805df97a1d1d629db5a14043734f))
++$(eval $(call tar_download,BASH,bash,5.0,.tar.gz,https://ftp.gnu.org/gnu/bash/,b4a80f2ac66170b2913efbfb9f2594f1f76c7b1afd11f799e22035d63077fb4d))
++$(eval $(call tar_download,IPROUTE2,iproute2,5.1.0,.tar.gz,https://www.kernel.org/pub/linux/utils/net/iproute2/,9b43707d6075ecdca14803ca8ce0c8553848c49fa1586d12fd508d66577243f2))
++$(eval $(call tar_download,IPTABLES,iptables,1.6.1,.tar.bz2,https://www.netfilter.org/projects/iptables/files/,0fc2d7bd5d7be11311726466789d4c65fb4c8e096c9182b56ce97440864f0cf5))
++$(eval $(call tar_download,NMAP,nmap,7.60,.tar.bz2,https://nmap.org/dist/,a8796ecc4fa6c38aad6139d9515dc8113023a82e9d787e5a5fb5fa1b05516f21))
++$(eval $(call tar_download,IPUTILS,iputils,s20161105,.tar.gz,https://github.com/iputils/iputils/archive/s20161105.tar.gz/#,f813092f03d17294fd23544b129b95cdb87fe19f7970a51908a6b88509acad8a))
++$(eval $(call tar_download,WIREGUARD_TOOLS,WireGuard,0.0.20191212,.tar.xz,https://git.zx2c4.com/WireGuard/snapshot/,b0d718380f7a8822b2f12d75e462fa4eafa3a77871002981f367cd4fe2a1b071))
++
++KERNEL_BUILD_PATH := $(BUILD_PATH)/kernel$(if $(findstring yes,$(DEBUG_KERNEL)),-debug)
++rwildcard=$(foreach d,$(wildcard $1*),$(call rwildcard,$d/,$2) $(filter $(subst *,%,$2),$d))
++WIREGUARD_SOURCES := $(call rwildcard,$(KERNEL_PATH)/drivers/net/wireguard/,*)
++
++export CFLAGS ?= -O3 -pipe
++export LDFLAGS ?=
++export CPPFLAGS := -I$(BUILD_PATH)/include
++
++ifeq ($(CHOST),$(CBUILD))
++CROSS_COMPILE_FLAG := --host=$(CHOST)
++NOPIE_GCC := gcc -fno-PIE
++CFLAGS += -march=native
++STRIP := strip
++else
++$(info Cross compilation: building for $(CBUILD) using $(CHOST))
++CROSS_COMPILE_FLAG := --build=$(CBUILD) --host=$(CHOST)
++export CROSS_COMPILE=$(CBUILD)-
++NOPIE_GCC := $(CBUILD)-gcc -fno-PIE
++STRIP := $(CBUILD)-strip
++endif
++ifeq ($(ARCH),aarch64)
++QEMU_ARCH := aarch64
++KERNEL_ARCH := arm64
++KERNEL_BZIMAGE := $(KERNEL_BUILD_PATH)/arch/arm64/boot/Image
++ifeq ($(CHOST),$(CBUILD))
++QEMU_MACHINE := -cpu host -machine virt,gic_version=host,accel=kvm
++else
++QEMU_MACHINE := -cpu cortex-a53 -machine virt
++CFLAGS += -march=armv8-a -mtune=cortex-a53
++endif
++else ifeq ($(ARCH),aarch64_be)
++QEMU_ARCH := aarch64
++KERNEL_ARCH := arm64
++KERNEL_BZIMAGE := $(KERNEL_BUILD_PATH)/arch/arm64/boot/Image
++ifeq ($(CHOST),$(CBUILD))
++QEMU_MACHINE := -cpu host -machine virt,gic_version=host,accel=kvm
++else
++QEMU_MACHINE := -cpu cortex-a53 -machine virt
++CFLAGS += -march=armv8-a -mtune=cortex-a53
++endif
++else ifeq ($(ARCH),arm)
++QEMU_ARCH := arm
++KERNEL_ARCH := arm
++KERNEL_BZIMAGE := $(KERNEL_BUILD_PATH)/arch/arm/boot/zImage
++ifeq ($(CHOST),$(CBUILD))
++QEMU_MACHINE := -cpu host -machine virt,gic_version=host,accel=kvm
++else
++QEMU_MACHINE := -cpu cortex-a15 -machine virt
++CFLAGS += -march=armv7-a -mtune=cortex-a15 -mabi=aapcs-linux
++endif
++else ifeq ($(ARCH),armeb)
++QEMU_ARCH := arm
++KERNEL_ARCH := arm
++KERNEL_BZIMAGE := $(KERNEL_BUILD_PATH)/arch/arm/boot/zImage
++ifeq ($(CHOST),$(CBUILD))
++QEMU_MACHINE := -cpu host -machine virt,gic_version=host,accel=kvm
++else
++QEMU_MACHINE := -cpu cortex-a15 -machine virt
++CFLAGS += -march=armv7-a -mabi=aapcs-linux # We don't pass -mtune=cortex-a15 due to a compiler bug on big endian.
++LDFLAGS += -Wl,--be8
++endif
++else ifeq ($(ARCH),x86_64)
++QEMU_ARCH := x86_64
++KERNEL_ARCH := x86_64
++KERNEL_BZIMAGE := $(KERNEL_BUILD_PATH)/arch/x86/boot/bzImage
++ifeq ($(CHOST),$(CBUILD))
++QEMU_MACHINE := -cpu host -machine q35,accel=kvm
++else
++QEMU_MACHINE := -cpu Skylake-Server -machine q35
++CFLAGS += -march=skylake-avx512
++endif
++else ifeq ($(ARCH),i686)
++QEMU_ARCH := i386
++KERNEL_ARCH := x86
++KERNEL_BZIMAGE := $(KERNEL_BUILD_PATH)/arch/x86/boot/bzImage
++ifeq ($(subst i686,x86_64,$(CBUILD)),$(CHOST))
++QEMU_MACHINE := -cpu host -machine q35,accel=kvm
++else
++QEMU_MACHINE := -cpu coreduo -machine q35
++CFLAGS += -march=prescott
++endif
++else ifeq ($(ARCH),mips64)
++QEMU_ARCH := mips64
++KERNEL_ARCH := mips
++KERNEL_BZIMAGE := $(KERNEL_BUILD_PATH)/vmlinux
++ifeq ($(CHOST),$(CBUILD))
++QEMU_MACHINE := -cpu host -machine malta,accel=kvm
++CFLAGS += -EB
++else
++QEMU_MACHINE := -cpu MIPS64R2-generic -machine malta -smp 1
++CFLAGS += -march=mips64r2 -EB
++endif
++else ifeq ($(ARCH),mips64el)
++QEMU_ARCH := mips64el
++KERNEL_ARCH := mips
++KERNEL_BZIMAGE := $(KERNEL_BUILD_PATH)/vmlinux
++ifeq ($(CHOST),$(CBUILD))
++QEMU_MACHINE := -cpu host -machine malta,accel=kvm
++CFLAGS += -EL
++else
++QEMU_MACHINE := -cpu MIPS64R2-generic -machine malta -smp 1
++CFLAGS += -march=mips64r2 -EL
++endif
++else ifeq ($(ARCH),mips)
++QEMU_ARCH := mips
++KERNEL_ARCH := mips
++KERNEL_BZIMAGE := $(KERNEL_BUILD_PATH)/vmlinux
++ifeq ($(CHOST),$(CBUILD))
++QEMU_MACHINE := -cpu host -machine malta,accel=kvm
++CFLAGS += -EB
++else
++QEMU_MACHINE := -cpu 24Kf -machine malta -smp 1
++CFLAGS += -march=mips32r2 -EB
++endif
++else ifeq ($(ARCH),mipsel)
++QEMU_ARCH := mipsel
++KERNEL_ARCH := mips
++KERNEL_BZIMAGE := $(KERNEL_BUILD_PATH)/vmlinux
++ifeq ($(CHOST),$(CBUILD))
++QEMU_MACHINE := -cpu host -machine malta,accel=kvm
++CFLAGS += -EL
++else
++QEMU_MACHINE := -cpu 24Kf -machine malta -smp 1
++CFLAGS += -march=mips32r2 -EL
++endif
++else ifeq ($(ARCH),powerpc64le)
++QEMU_ARCH := ppc64
++KERNEL_ARCH := powerpc
++KERNEL_BZIMAGE := $(KERNEL_BUILD_PATH)/vmlinux
++ifeq ($(CHOST),$(CBUILD))
++QEMU_MACHINE := -cpu host,accel=kvm -machine pseries
++else
++QEMU_MACHINE := -machine pseries
++endif
++CFLAGS += -mcpu=powerpc64le -mlong-double-64
++else ifeq ($(ARCH),powerpc)
++QEMU_ARCH := ppc
++KERNEL_ARCH := powerpc
++KERNEL_BZIMAGE := $(KERNEL_BUILD_PATH)/arch/powerpc/boot/uImage
++ifeq ($(CHOST),$(CBUILD))
++QEMU_MACHINE := -cpu host,accel=kvm -machine ppce500
++else
++QEMU_MACHINE := -machine ppce500
++endif
++CFLAGS += -mcpu=powerpc -mlong-double-64 -msecure-plt
++else ifeq ($(ARCH),m68k)
++QEMU_ARCH := m68k
++KERNEL_ARCH := m68k
++KERNEL_BZIMAGE := $(KERNEL_BUILD_PATH)/vmlinux
++ifeq ($(CHOST),$(CBUILD))
++QEMU_MACHINE := -cpu host,accel=kvm -machine q800
++else
++QEMU_MACHINE := -machine q800
++endif
++else
++$(error I only build: x86_64, i686, arm, armeb, aarch64, aarch64_be, mips, mipsel, mips64, mips64el, powerpc64le, powerpc, m68k)
++endif
++
++REAL_CC := $(CBUILD)-gcc
++MUSL_CC := $(BUILD_PATH)/musl-gcc
++export CC := $(MUSL_CC)
++USERSPACE_DEPS := $(MUSL_CC) $(BUILD_PATH)/include/.installed $(BUILD_PATH)/include/linux/.installed
++
++build: $(KERNEL_BZIMAGE)
++qemu: $(KERNEL_BZIMAGE)
++      rm -f $(BUILD_PATH)/result
++      timeout --foreground 20m qemu-system-$(QEMU_ARCH) \
++              -nodefaults \
++              -nographic \
++              -smp $(NR_CPUS) \
++              $(QEMU_MACHINE) \
++              -m $$(grep -q CONFIG_DEBUG_KMEMLEAK=y $(KERNEL_BUILD_PATH)/.config && echo 1G || echo 256M) \
++              -serial stdio \
++              -serial file:$(BUILD_PATH)/result \
++              -no-reboot \
++              -monitor none \
++              -kernel $<
++      grep -Fq success $(BUILD_PATH)/result
++
++$(BUILD_PATH)/init-cpio-spec.txt:
++      mkdir -p $(BUILD_PATH)
++      echo "file /init $(BUILD_PATH)/init 755 0 0" > $@
++      echo "file /init.sh $(PWD)/../netns.sh 755 0 0" >> $@
++      echo "dir /dev 755 0 0" >> $@
++      echo "nod /dev/console 644 0 0 c 5 1" >> $@
++      echo "dir /bin 755 0 0" >> $@
++      echo "file /bin/iperf3 $(IPERF_PATH)/src/iperf3 755 0 0" >> $@
++      echo "file /bin/wg $(WIREGUARD_TOOLS_PATH)/src/tools/wg 755 0 0" >> $@
++      echo "file /bin/bash $(BASH_PATH)/bash 755 0 0" >> $@
++      echo "file /bin/ip $(IPROUTE2_PATH)/ip/ip 755 0 0" >> $@
++      echo "file /bin/ss $(IPROUTE2_PATH)/misc/ss 755 0 0" >> $@
++      echo "file /bin/ping $(IPUTILS_PATH)/ping 755 0 0" >> $@
++      echo "file /bin/ncat $(NMAP_PATH)/ncat/ncat 755 0 0" >> $@
++      echo "file /bin/xtables-multi $(IPTABLES_PATH)/iptables/xtables-multi 755 0 0" >> $@
++      echo "slink /bin/iptables xtables-multi 777 0 0" >> $@
++      echo "slink /bin/ping6 ping 777 0 0" >> $@
++      echo "dir /lib 755 0 0" >> $@
++      echo "file /lib/libc.so $(MUSL_PATH)/lib/libc.so 755 0 0" >> $@
++      echo "slink /lib/ld-linux.so.1 libc.so 777 0 0" >> $@
++
++$(KERNEL_BUILD_PATH)/.config: kernel.config arch/$(ARCH).config
++      mkdir -p $(KERNEL_BUILD_PATH)
++      cp kernel.config $(KERNEL_BUILD_PATH)/minimal.config
++      printf 'CONFIG_NR_CPUS=$(NR_CPUS)\nCONFIG_INITRAMFS_SOURCE="$(BUILD_PATH)/init-cpio-spec.txt"\n' >> $(KERNEL_BUILD_PATH)/minimal.config
++      cat arch/$(ARCH).config >> $(KERNEL_BUILD_PATH)/minimal.config
++      $(MAKE) -C $(KERNEL_PATH) O=$(KERNEL_BUILD_PATH) ARCH=$(KERNEL_ARCH) allnoconfig
++      cd $(KERNEL_BUILD_PATH) && ARCH=$(KERNEL_ARCH) $(KERNEL_PATH)/scripts/kconfig/merge_config.sh -n $(KERNEL_BUILD_PATH)/.config $(KERNEL_BUILD_PATH)/minimal.config
++      $(if $(findstring yes,$(DEBUG_KERNEL)),cp debug.config $(KERNEL_BUILD_PATH) && cd $(KERNEL_BUILD_PATH) && ARCH=$(KERNEL_ARCH) $(KERNEL_PATH)/scripts/kconfig/merge_config.sh -n $(KERNEL_BUILD_PATH)/.config debug.config,)
++
++$(KERNEL_BZIMAGE): $(KERNEL_BUILD_PATH)/.config $(BUILD_PATH)/init-cpio-spec.txt $(MUSL_PATH)/lib/libc.so $(IPERF_PATH)/src/iperf3 $(IPUTILS_PATH)/ping $(BASH_PATH)/bash $(IPROUTE2_PATH)/misc/ss $(IPROUTE2_PATH)/ip/ip $(IPTABLES_PATH)/iptables/xtables-multi $(NMAP_PATH)/ncat/ncat $(WIREGUARD_TOOLS_PATH)/src/tools/wg $(BUILD_PATH)/init ../netns.sh $(WIREGUARD_SOURCES)
++      $(MAKE) -C $(KERNEL_PATH) O=$(KERNEL_BUILD_PATH) ARCH=$(KERNEL_ARCH) CROSS_COMPILE=$(CROSS_COMPILE) CC="$(NOPIE_GCC)"
++
++$(BUILD_PATH)/include/linux/.installed: | $(KERNEL_BUILD_PATH)/.config
++      $(MAKE) -C $(KERNEL_PATH) O=$(KERNEL_BUILD_PATH) INSTALL_HDR_PATH=$(BUILD_PATH) ARCH=$(KERNEL_ARCH) CROSS_COMPILE=$(CROSS_COMPILE) headers_install
++      touch $@
++
++$(MUSL_PATH)/lib/libc.so: $(MUSL_TAR)
++      mkdir -p $(BUILD_PATH)
++      flock -s $<.lock tar -C $(BUILD_PATH) -xf $<
++      cd $(MUSL_PATH) && CC=$(REAL_CC) ./configure --prefix=/ --disable-static --build=$(CBUILD)
++      $(MAKE) -C $(MUSL_PATH)
++      $(STRIP) -s $@
++
++$(BUILD_PATH)/include/.installed: $(MUSL_PATH)/lib/libc.so
++      $(MAKE) -C $(MUSL_PATH) DESTDIR=$(BUILD_PATH) install-headers
++      touch $@
++
++$(MUSL_CC): $(MUSL_PATH)/lib/libc.so
++      sh $(MUSL_PATH)/tools/musl-gcc.specs.sh $(BUILD_PATH)/include $(MUSL_PATH)/lib /lib/ld-linux.so.1 > $(BUILD_PATH)/musl-gcc.specs
++      printf '#!/bin/sh\nexec "$(REAL_CC)" --specs="$(BUILD_PATH)/musl-gcc.specs" -fno-stack-protector -no-pie "$$@"\n' > $(BUILD_PATH)/musl-gcc
++      chmod +x $(BUILD_PATH)/musl-gcc
++
++$(IPERF_PATH)/.installed: $(IPERF_TAR)
++      mkdir -p $(BUILD_PATH)
++      flock -s $<.lock tar -C $(BUILD_PATH) -xf $<
++      sed -i '1s/^/#include <stdint.h>/' $(IPERF_PATH)/src/cjson.h $(IPERF_PATH)/src/timer.h
++      sed -i -r 's/-p?g//g' $(IPERF_PATH)/src/Makefile*
++      touch $@
++
++$(IPERF_PATH)/src/iperf3: | $(IPERF_PATH)/.installed $(USERSPACE_DEPS)
++      cd $(IPERF_PATH) && CFLAGS="$(CFLAGS) -D_GNU_SOURCE" ./configure --prefix=/ $(CROSS_COMPILE_FLAG) --enable-static --disable-shared
++      $(MAKE) -C $(IPERF_PATH)
++      $(STRIP) -s $@
++
++$(LIBMNL_PATH)/.installed: $(LIBMNL_TAR)
++      flock -s $<.lock tar -C $(BUILD_PATH) -xf $<
++      touch $@
++
++$(LIBMNL_PATH)/src/.libs/libmnl.a: | $(LIBMNL_PATH)/.installed $(USERSPACE_DEPS)
++      cd $(LIBMNL_PATH) && ./configure --prefix=/ $(CROSS_COMPILE_FLAG) --enable-static --disable-shared
++      $(MAKE) -C $(LIBMNL_PATH)
++      sed -i 's:prefix=.*:prefix=$(LIBMNL_PATH):' $(LIBMNL_PATH)/libmnl.pc
++
++$(WIREGUARD_TOOLS_PATH)/.installed: $(WIREGUARD_TOOLS_TAR)
++      flock -s $<.lock tar -C $(BUILD_PATH) -xf $<
++      touch $@
++
++$(WIREGUARD_TOOLS_PATH)/src/tools/wg: | $(WIREGUARD_TOOLS_PATH)/.installed $(LIBMNL_PATH)/src/.libs/libmnl.a $(USERSPACE_DEPS)
++      LDFLAGS="$(LDFLAGS) -L$(LIBMNL_PATH)/src/.libs" $(MAKE) -C $(WIREGUARD_TOOLS_PATH)/src/tools LIBMNL_CFLAGS="-I$(LIBMNL_PATH)/include" LIBMNL_LDLIBS="-lmnl" wg
++      $(STRIP) -s $@
++
++$(BUILD_PATH)/init: init.c | $(USERSPACE_DEPS)
++      mkdir -p $(BUILD_PATH)
++      $(MUSL_CC) -o $@ $(CFLAGS) $(LDFLAGS) -std=gnu11 $<
++      $(STRIP) -s $@
++
++$(IPUTILS_PATH)/.installed: $(IPUTILS_TAR)
++      mkdir -p $(BUILD_PATH)
++      flock -s $<.lock tar -C $(BUILD_PATH) -xf $<
++      touch $@
++
++$(IPUTILS_PATH)/ping: | $(IPUTILS_PATH)/.installed $(USERSPACE_DEPS)
++      $(MAKE) -C $(IPUTILS_PATH) USE_CAP=no USE_IDN=no USE_NETTLE=no USE_CRYPTO=no ping
++      $(STRIP) -s $@
++
++$(BASH_PATH)/.installed: $(BASH_TAR)
++      mkdir -p $(BUILD_PATH)
++      flock -s $<.lock tar -C $(BUILD_PATH) -xf $<
++      touch $@
++
++$(BASH_PATH)/bash: | $(BASH_PATH)/.installed $(USERSPACE_DEPS)
++      cd $(BASH_PATH) && ./configure --prefix=/ $(CROSS_COMPILE_FLAG) --without-bash-malloc --disable-debugger --disable-help-builtin --disable-history --disable-multibyte --disable-progcomp --disable-readline --disable-mem-scramble
++      $(MAKE) -C $(BASH_PATH)
++      $(STRIP) -s $@
++
++$(IPROUTE2_PATH)/.installed: $(IPROUTE2_TAR)
++      mkdir -p $(BUILD_PATH)
++      flock -s $<.lock tar -C $(BUILD_PATH) -xf $<
++      printf 'CC:=$(CC)\nPKG_CONFIG:=pkg-config\nTC_CONFIG_XT:=n\nTC_CONFIG_ATM:=n\nTC_CONFIG_IPSET:=n\nIP_CONFIG_SETNS:=y\nHAVE_ELF:=n\nHAVE_MNL:=y\nHAVE_BERKELEY_DB:=n\nHAVE_LATEX:=n\nHAVE_PDFLATEX:=n\nCFLAGS+=-DHAVE_SETNS -DHAVE_LIBMNL -I$(LIBMNL_PATH)/include\nLDLIBS+=-lmnl' > $(IPROUTE2_PATH)/config.mk
++      printf 'lib: snapshot\n\t$$(MAKE) -C lib\nip/ip: lib\n\t$$(MAKE) -C ip ip\nmisc/ss: lib\n\t$$(MAKE) -C misc ss\n' >> $(IPROUTE2_PATH)/Makefile
++      touch $@
++
++$(IPROUTE2_PATH)/ip/ip: | $(IPROUTE2_PATH)/.installed $(LIBMNL_PATH)/src/.libs/libmnl.a $(USERSPACE_DEPS)
++      LDFLAGS="$(LDFLAGS) -L$(LIBMNL_PATH)/src/.libs" PKG_CONFIG_LIBDIR="$(LIBMNL_PATH)" $(MAKE) -C $(IPROUTE2_PATH) PREFIX=/ ip/ip
++      $(STRIP) -s $(IPROUTE2_PATH)/ip/ip
++
++$(IPROUTE2_PATH)/misc/ss: | $(IPROUTE2_PATH)/.installed $(LIBMNL_PATH)/src/.libs/libmnl.a $(USERSPACE_DEPS)
++      LDFLAGS="$(LDFLAGS) -L$(LIBMNL_PATH)/src/.libs" PKG_CONFIG_LIBDIR="$(LIBMNL_PATH)" $(MAKE) -C $(IPROUTE2_PATH) PREFIX=/ misc/ss
++      $(STRIP) -s $(IPROUTE2_PATH)/misc/ss
++
++$(IPTABLES_PATH)/.installed: $(IPTABLES_TAR)
++      mkdir -p $(BUILD_PATH)
++      flock -s $<.lock tar -C $(BUILD_PATH) -xf $<
++      sed -i -e "/nfnetlink=[01]/s:=[01]:=0:" -e "/nfconntrack=[01]/s:=[01]:=0:" $(IPTABLES_PATH)/configure
++      touch $@
++
++$(IPTABLES_PATH)/iptables/xtables-multi: | $(IPTABLES_PATH)/.installed $(LIBMNL_PATH)/src/.libs/libmnl.a $(USERSPACE_DEPS)
++      cd $(IPTABLES_PATH) && PKG_CONFIG_LIBDIR="$(LIBMNL_PATH)" ./configure --prefix=/ $(CROSS_COMPILE_FLAG) --enable-static --disable-shared --disable-nftables --disable-bpf-compiler --disable-nfsynproxy --disable-libipq --with-kernel=$(BUILD_PATH)/include
++      $(MAKE) -C $(IPTABLES_PATH)
++      $(STRIP) -s $@
++
++$(NMAP_PATH)/.installed: $(NMAP_TAR)
++      mkdir -p $(BUILD_PATH)
++      flock -s $<.lock tar -C $(BUILD_PATH) -xf $<
++      touch $@
++
++$(NMAP_PATH)/ncat/ncat: | $(NMAP_PATH)/.installed $(USERSPACE_DEPS)
++      cd $(NMAP_PATH) && ./configure --prefix=/ $(CROSS_COMPILE_FLAG) --enable-static --disable-shared --without-ndiff --without-zenmap --without-nping --with-libpcap=included --with-libpcre=included --with-libdnet=included --without-liblua --with-liblinear=included --without-nmap-update --without-openssl --with-pcap=linux
++      $(MAKE) -C $(NMAP_PATH) build-ncat
++      $(STRIP) -s $@
++
++clean:
++      rm -rf $(BUILD_PATH)
++
++distclean: clean
++      rm -rf $(DISTFILES_PATH)
++
++menuconfig: $(KERNEL_BUILD_PATH)/.config
++      $(MAKE) -C $(KERNEL_PATH) O=$(KERNEL_BUILD_PATH) ARCH=$(KERNEL_ARCH) CROSS_COMPILE=$(CROSS_COMPILE) CC="$(NOPIE_GCC)" menuconfig
++
++.PHONY: qemu build clean distclean menuconfig
++.DELETE_ON_ERROR:
+--- /dev/null
++++ b/tools/testing/selftests/wireguard/qemu/arch/aarch64.config
+@@ -0,0 +1,5 @@
++CONFIG_SERIAL_AMBA_PL011=y
++CONFIG_SERIAL_AMBA_PL011_CONSOLE=y
++CONFIG_CMDLINE_BOOL=y
++CONFIG_CMDLINE="console=ttyAMA0 wg.success=ttyAMA1"
++CONFIG_FRAME_WARN=1280
+--- /dev/null
++++ b/tools/testing/selftests/wireguard/qemu/arch/aarch64_be.config
+@@ -0,0 +1,6 @@
++CONFIG_CPU_BIG_ENDIAN=y
++CONFIG_SERIAL_AMBA_PL011=y
++CONFIG_SERIAL_AMBA_PL011_CONSOLE=y
++CONFIG_CMDLINE_BOOL=y
++CONFIG_CMDLINE="console=ttyAMA0 wg.success=ttyAMA1"
++CONFIG_FRAME_WARN=1280
+--- /dev/null
++++ b/tools/testing/selftests/wireguard/qemu/arch/arm.config
+@@ -0,0 +1,9 @@
++CONFIG_MMU=y
++CONFIG_ARCH_MULTI_V7=y
++CONFIG_ARCH_VIRT=y
++CONFIG_THUMB2_KERNEL=n
++CONFIG_SERIAL_AMBA_PL011=y
++CONFIG_SERIAL_AMBA_PL011_CONSOLE=y
++CONFIG_CMDLINE_BOOL=y
++CONFIG_CMDLINE="console=ttyAMA0 wg.success=ttyAMA1"
++CONFIG_FRAME_WARN=1024
+--- /dev/null
++++ b/tools/testing/selftests/wireguard/qemu/arch/armeb.config
+@@ -0,0 +1,10 @@
++CONFIG_MMU=y
++CONFIG_ARCH_MULTI_V7=y
++CONFIG_ARCH_VIRT=y
++CONFIG_THUMB2_KERNEL=n
++CONFIG_SERIAL_AMBA_PL011=y
++CONFIG_SERIAL_AMBA_PL011_CONSOLE=y
++CONFIG_CMDLINE_BOOL=y
++CONFIG_CMDLINE="console=ttyAMA0 wg.success=ttyAMA1"
++CONFIG_CPU_BIG_ENDIAN=y
++CONFIG_FRAME_WARN=1024
+--- /dev/null
++++ b/tools/testing/selftests/wireguard/qemu/arch/i686.config
+@@ -0,0 +1,5 @@
++CONFIG_SERIAL_8250=y
++CONFIG_SERIAL_8250_CONSOLE=y
++CONFIG_CMDLINE_BOOL=y
++CONFIG_CMDLINE="console=ttyS0 wg.success=ttyS1"
++CONFIG_FRAME_WARN=1024
+--- /dev/null
++++ b/tools/testing/selftests/wireguard/qemu/arch/m68k.config
+@@ -0,0 +1,9 @@
++CONFIG_MMU=y
++CONFIG_M68040=y
++CONFIG_MAC=y
++CONFIG_SERIAL_PMACZILOG=y
++CONFIG_SERIAL_PMACZILOG_TTYS=y
++CONFIG_SERIAL_PMACZILOG_CONSOLE=y
++CONFIG_CMDLINE_BOOL=y
++CONFIG_CMDLINE="console=ttyS0 wg.success=ttyS1"
++CONFIG_FRAME_WARN=1024
+--- /dev/null
++++ b/tools/testing/selftests/wireguard/qemu/arch/mips.config
+@@ -0,0 +1,11 @@
++CONFIG_CPU_MIPS32_R2=y
++CONFIG_MIPS_MALTA=y
++CONFIG_MIPS_CPS=y
++CONFIG_MIPS_FP_SUPPORT=y
++CONFIG_POWER_RESET=y
++CONFIG_POWER_RESET_SYSCON=y
++CONFIG_SERIAL_8250=y
++CONFIG_SERIAL_8250_CONSOLE=y
++CONFIG_CMDLINE_BOOL=y
++CONFIG_CMDLINE="console=ttyS0 wg.success=ttyS1"
++CONFIG_FRAME_WARN=1024
+--- /dev/null
++++ b/tools/testing/selftests/wireguard/qemu/arch/mips64.config
+@@ -0,0 +1,14 @@
++CONFIG_64BIT=y
++CONFIG_CPU_MIPS64_R2=y
++CONFIG_MIPS32_N32=y
++CONFIG_CPU_HAS_MSA=y
++CONFIG_MIPS_MALTA=y
++CONFIG_MIPS_CPS=y
++CONFIG_MIPS_FP_SUPPORT=y
++CONFIG_POWER_RESET=y
++CONFIG_POWER_RESET_SYSCON=y
++CONFIG_SERIAL_8250=y
++CONFIG_SERIAL_8250_CONSOLE=y
++CONFIG_CMDLINE_BOOL=y
++CONFIG_CMDLINE="console=ttyS0 wg.success=ttyS1"
++CONFIG_FRAME_WARN=1280
+--- /dev/null
++++ b/tools/testing/selftests/wireguard/qemu/arch/mips64el.config
+@@ -0,0 +1,15 @@
++CONFIG_64BIT=y
++CONFIG_CPU_MIPS64_R2=y
++CONFIG_MIPS32_N32=y
++CONFIG_CPU_HAS_MSA=y
++CONFIG_MIPS_MALTA=y
++CONFIG_CPU_LITTLE_ENDIAN=y
++CONFIG_MIPS_CPS=y
++CONFIG_MIPS_FP_SUPPORT=y
++CONFIG_POWER_RESET=y
++CONFIG_POWER_RESET_SYSCON=y
++CONFIG_SERIAL_8250=y
++CONFIG_SERIAL_8250_CONSOLE=y
++CONFIG_CMDLINE_BOOL=y
++CONFIG_CMDLINE="console=ttyS0 wg.success=ttyS1"
++CONFIG_FRAME_WARN=1280
+--- /dev/null
++++ b/tools/testing/selftests/wireguard/qemu/arch/mipsel.config
+@@ -0,0 +1,12 @@
++CONFIG_CPU_MIPS32_R2=y
++CONFIG_MIPS_MALTA=y
++CONFIG_CPU_LITTLE_ENDIAN=y
++CONFIG_MIPS_CPS=y
++CONFIG_MIPS_FP_SUPPORT=y
++CONFIG_POWER_RESET=y
++CONFIG_POWER_RESET_SYSCON=y
++CONFIG_SERIAL_8250=y
++CONFIG_SERIAL_8250_CONSOLE=y
++CONFIG_CMDLINE_BOOL=y
++CONFIG_CMDLINE="console=ttyS0 wg.success=ttyS1"
++CONFIG_FRAME_WARN=1024
+--- /dev/null
++++ b/tools/testing/selftests/wireguard/qemu/arch/powerpc.config
+@@ -0,0 +1,10 @@
++CONFIG_PPC_QEMU_E500=y
++CONFIG_FSL_SOC_BOOKE=y
++CONFIG_PPC_85xx=y
++CONFIG_PHYS_64BIT=y
++CONFIG_SERIAL_8250=y
++CONFIG_SERIAL_8250_CONSOLE=y
++CONFIG_MATH_EMULATION=y
++CONFIG_CMDLINE_BOOL=y
++CONFIG_CMDLINE="console=ttyS0 wg.success=ttyS1"
++CONFIG_FRAME_WARN=1024
+--- /dev/null
++++ b/tools/testing/selftests/wireguard/qemu/arch/powerpc64le.config
+@@ -0,0 +1,12 @@
++CONFIG_PPC64=y
++CONFIG_PPC_PSERIES=y
++CONFIG_ALTIVEC=y
++CONFIG_VSX=y
++CONFIG_PPC_OF_BOOT_TRAMPOLINE=y
++CONFIG_PPC_RADIX_MMU=y
++CONFIG_HVC_CONSOLE=y
++CONFIG_CPU_LITTLE_ENDIAN=y
++CONFIG_CMDLINE_BOOL=y
++CONFIG_CMDLINE="console=hvc0 wg.success=hvc1"
++CONFIG_SECTION_MISMATCH_WARN_ONLY=y
++CONFIG_FRAME_WARN=1280
+--- /dev/null
++++ b/tools/testing/selftests/wireguard/qemu/arch/x86_64.config
+@@ -0,0 +1,5 @@
++CONFIG_SERIAL_8250=y
++CONFIG_SERIAL_8250_CONSOLE=y
++CONFIG_CMDLINE_BOOL=y
++CONFIG_CMDLINE="console=ttyS0 wg.success=ttyS1"
++CONFIG_FRAME_WARN=1280
+--- /dev/null
++++ b/tools/testing/selftests/wireguard/qemu/debug.config
+@@ -0,0 +1,67 @@
++CONFIG_LOCALVERSION="-debug"
++CONFIG_ENABLE_WARN_DEPRECATED=y
++CONFIG_ENABLE_MUST_CHECK=y
++CONFIG_FRAME_POINTER=y
++CONFIG_STACK_VALIDATION=y
++CONFIG_DEBUG_KERNEL=y
++CONFIG_DEBUG_INFO=y
++CONFIG_DEBUG_INFO_DWARF4=y
++CONFIG_PAGE_EXTENSION=y
++CONFIG_PAGE_POISONING=y
++CONFIG_DEBUG_OBJECTS=y
++CONFIG_DEBUG_OBJECTS_FREE=y
++CONFIG_DEBUG_OBJECTS_TIMERS=y
++CONFIG_DEBUG_OBJECTS_WORK=y
++CONFIG_DEBUG_OBJECTS_RCU_HEAD=y
++CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER=y
++CONFIG_DEBUG_OBJECTS_ENABLE_DEFAULT=1
++CONFIG_SLUB_DEBUG_ON=y
++CONFIG_DEBUG_VM=y
++CONFIG_DEBUG_MEMORY_INIT=y
++CONFIG_HAVE_DEBUG_STACKOVERFLOW=y
++CONFIG_DEBUG_STACKOVERFLOW=y
++CONFIG_HAVE_ARCH_KMEMCHECK=y
++CONFIG_HAVE_ARCH_KASAN=y
++CONFIG_KASAN=y
++CONFIG_KASAN_INLINE=y
++CONFIG_UBSAN=y
++CONFIG_UBSAN_SANITIZE_ALL=y
++CONFIG_UBSAN_NO_ALIGNMENT=y
++CONFIG_UBSAN_NULL=y
++CONFIG_DEBUG_KMEMLEAK=y
++CONFIG_DEBUG_KMEMLEAK_EARLY_LOG_SIZE=8192
++CONFIG_DEBUG_STACK_USAGE=y
++CONFIG_DEBUG_SHIRQ=y
++CONFIG_WQ_WATCHDOG=y
++CONFIG_SCHED_DEBUG=y
++CONFIG_SCHED_INFO=y
++CONFIG_SCHEDSTATS=y
++CONFIG_SCHED_STACK_END_CHECK=y
++CONFIG_DEBUG_TIMEKEEPING=y
++CONFIG_TIMER_STATS=y
++CONFIG_DEBUG_PREEMPT=y
++CONFIG_DEBUG_RT_MUTEXES=y
++CONFIG_DEBUG_SPINLOCK=y
++CONFIG_DEBUG_MUTEXES=y
++CONFIG_DEBUG_LOCK_ALLOC=y
++CONFIG_PROVE_LOCKING=y
++CONFIG_LOCKDEP=y
++CONFIG_DEBUG_ATOMIC_SLEEP=y
++CONFIG_TRACE_IRQFLAGS=y
++CONFIG_DEBUG_BUGVERBOSE=y
++CONFIG_DEBUG_LIST=y
++CONFIG_DEBUG_PI_LIST=y
++CONFIG_PROVE_RCU=y
++CONFIG_SPARSE_RCU_POINTER=y
++CONFIG_RCU_CPU_STALL_TIMEOUT=21
++CONFIG_RCU_TRACE=y
++CONFIG_RCU_EQS_DEBUG=y
++CONFIG_USER_STACKTRACE_SUPPORT=y
++CONFIG_DEBUG_SG=y
++CONFIG_DEBUG_NOTIFIERS=y
++CONFIG_DOUBLEFAULT=y
++CONFIG_X86_DEBUG_FPU=y
++CONFIG_DEBUG_SECTION_MISMATCH=y
++CONFIG_DEBUG_PAGEALLOC=y
++CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT=y
++CONFIG_DEBUG_WW_MUTEX_SLOWPATH=y
+--- /dev/null
++++ b/tools/testing/selftests/wireguard/qemu/init.c
+@@ -0,0 +1,284 @@
++// SPDX-License-Identifier: GPL-2.0
++/*
++ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
++ */
++
++#define _GNU_SOURCE
++#include <unistd.h>
++#include <errno.h>
++#include <string.h>
++#include <stdio.h>
++#include <stdlib.h>
++#include <stdbool.h>
++#include <fcntl.h>
++#include <sys/wait.h>
++#include <sys/mount.h>
++#include <sys/types.h>
++#include <sys/stat.h>
++#include <sys/types.h>
++#include <sys/io.h>
++#include <sys/ioctl.h>
++#include <sys/reboot.h>
++#include <sys/utsname.h>
++#include <sys/sendfile.h>
++#include <linux/random.h>
++#include <linux/version.h>
++
++__attribute__((noreturn)) static void poweroff(void)
++{
++      fflush(stdout);
++      fflush(stderr);
++      reboot(RB_AUTOBOOT); /* Requests a reboot; presumably relies on QEMU's -no-reboot to end the VM instead. */
++      sleep(30); /* Give the reboot request time to take effect before reporting failure. */
++      fprintf(stderr, "\x1b[37m\x1b[41m\x1b[1mFailed to power off!!!\x1b[0m\n");
++      exit(1);
++}
++
++static void panic(const char *what)
++{
++      fprintf(stderr, "\n\n\x1b[37m\x1b[41m\x1b[1mSOMETHING WENT HORRIBLY WRONG\x1b[0m\n\n    \x1b[31m\x1b[1m%s: %s\x1b[0m\n\n\x1b[37m\x1b[44m\x1b[1mPower off...\x1b[0m\n\n", what, strerror(errno));
++      poweroff(); /* Declared noreturn, so panic() never returns to its caller. */
++}
++
++#define pretty_message(msg) puts("\x1b[32m\x1b[1m" msg "\x1b[0m")
++
++static void print_banner(void)
++{
++      struct utsname utsname;
++      int len; /* Total printed width of the banner text line; also the width of the blank padding lines. */
++
++      if (uname(&utsname) < 0)
++              panic("uname");
++
++      len = strlen("    WireGuard Test Suite on       ") + strlen(utsname.sysname) + strlen(utsname.release) + strlen(utsname.machine);
++      printf("\x1b[45m\x1b[33m\x1b[1m%*.s\x1b[0m\n\x1b[45m\x1b[33m\x1b[1m    WireGuard Test Suite on %s %s %s    \x1b[0m\n\x1b[45m\x1b[33m\x1b[1m%*.s\x1b[0m\n\n", len, "", utsname.sysname, utsname.release, utsname.machine, len, "");
++}
++
++static void seed_rng(void)
++{
++      int fd;
++      struct { /* Layout presumably mirrors struct rand_pool_info from <linux/random.h> -- TODO confirm. */
++              int entropy_count;
++              int buffer_size;
++              unsigned char buffer[256];
++      } entropy = {
++              .entropy_count = sizeof(entropy.buffer) * 8, /* Claim every bit of the buffer as entropy. */
++              .buffer_size = sizeof(entropy.buffer),
++              .buffer = "Adding real entropy is not actually important for these tests. Don't try this at home, kids!"
++      };
++
++      if (mknod("/dev/urandom", S_IFCHR | 0644, makedev(1, 9))) /* Character device 1:9, created by hand. */
++              panic("mknod(/dev/urandom)");
++      fd = open("/dev/urandom", O_WRONLY);
++      if (fd < 0)
++              panic("open(urandom)");
++      for (int i = 0; i < 256; ++i) { /* Feed the same fixed buffer 256 times. */
++              if (ioctl(fd, RNDADDENTROPY, &entropy) < 0)
++                      panic("ioctl(urandom)");
++      }
++      close(fd);
++}
++
++static void mount_filesystems(void)
++{
++      pretty_message("[+] Mounting filesystems...");
++      mkdir("/dev", 0755); /* mkdir() results are ignored; only the mount() calls below are checked. */
++      mkdir("/proc", 0755);
++      mkdir("/sys", 0755);
++      mkdir("/tmp", 0755);
++      mkdir("/run", 0755);
++      mkdir("/var", 0755);
++      if (mount("none", "/dev", "devtmpfs", 0, NULL))
++              panic("devtmpfs mount");
++      if (mount("none", "/proc", "proc", 0, NULL))
++              panic("procfs mount");
++      if (mount("none", "/sys", "sysfs", 0, NULL))
++              panic("sysfs mount");
++      if (mount("none", "/tmp", "tmpfs", 0, NULL))
++              panic("tmpfs mount");
++      if (mount("none", "/run", "tmpfs", 0, NULL))
++              panic("tmpfs mount");
++      if (mount("none", "/sys/kernel/debug", "debugfs", 0, NULL))
++              ; /* Not a problem if it fails.*/
++      if (symlink("/run", "/var/run")) /* /var/run becomes a symlink to the /run tmpfs. */
++              panic("run symlink");
++      if (symlink("/proc/self/fd", "/dev/fd"))
++              panic("fd symlink");
++}
++
++static void enable_logging(void)
++{
++      int fd;
++      pretty_message("[+] Enabling logging...");
++      fd = open("/proc/sys/kernel/printk", O_WRONLY);
++      if (fd >= 0) { /* Each knob is skipped silently if it cannot be opened; a failed write is fatal. */
++              if (write(fd, "9\n", 2) != 2)
++                      panic("write(printk)");
++              close(fd);
++      }
++      fd = open("/proc/sys/debug/exception-trace", O_WRONLY);
++      if (fd >= 0) {
++              if (write(fd, "1\n", 2) != 2)
++                      panic("write(exception-trace)");
++              close(fd);
++      }
++      fd = open("/proc/sys/kernel/panic_on_warn", O_WRONLY);
++      if (fd >= 0) {
++              if (write(fd, "1\n", 2) != 2)
++                      panic("write(panic_on_warn)");
++              close(fd);
++      }
++}
++
++static void kmod_selftests(void)
++{
++      FILE *file;
++      char line[2048], *start, *pass;
++      bool success = true;
++      pretty_message("[+] Module self-tests:");
++      file = fopen("/proc/kmsg", "r");
++      if (!file)
++              panic("fopen(kmsg)");
++      if (fcntl(fileno(file), F_SETFL, O_NONBLOCK) < 0) /* Presumably so fgets() ends the loop instead of blocking once the log is drained. */
++              panic("fcntl(kmsg, nonblock)");
++      while (fgets(line, sizeof(line), file)) {
++              start = strstr(line, "wireguard: ");
++              if (!start)
++                      continue;
++              start += 11; /* 11 == strlen("wireguard: "): skip past the prefix. */
++              *strchrnul(start, '\n') = '\0';
++              if (strstr(start, "www.wireguard.com")) /* Stop at the line containing the project URL. */
++                      break;
++              pass = strstr(start, ": pass");
++              if (!pass || pass[6] != '\0') { /* Any message not ending exactly in ": pass" is a failure. */
++                      success = false;
++                      printf(" \x1b[31m*  %s\x1b[0m\n", start);
++              } else
++                      printf(" \x1b[32m*  %s\x1b[0m\n", start);
++      }
++      fclose(file);
++      if (!success) {
++              puts("\x1b[31m\x1b[1m[-] Tests failed! \u2639\x1b[0m");
++              poweroff();
++      }
++}
++
++static void launch_tests(void)
++{
++      char cmdline[4096] = { 0 }, *success_dev; /* Zero-filled so a short read() always leaves it NUL-terminated. */
++      int status, fd;
++      pid_t pid;
++
++      pretty_message("[+] Launching tests...");
++      pid = fork();
++      if (pid == -1)
++              panic("fork");
++      else if (pid == 0) {
++              execl("/init.sh", "init", NULL);
++              panic("exec"); /* Only reached if execl() failed. */
++      }
++      if (waitpid(pid, &status, 0) < 0)
++              panic("waitpid");
++      if (WIFEXITED(status) && WEXITSTATUS(status) == 0) {
++              pretty_message("[+] Tests successful! :-)");
++              fd = open("/proc/cmdline", O_RDONLY);
++              if (fd < 0)
++                      panic("open(/proc/cmdline)");
++              if (read(fd, cmdline, sizeof(cmdline) - 1) <= 0) /* At most sizeof - 1 bytes, so the final zero byte survives. */
++                      panic("read(/proc/cmdline)");
++              close(fd); /* Release /proc/cmdline before fd is reused for the success device. */
++              for (success_dev = strtok(cmdline, " \n"); success_dev; success_dev = strtok(NULL, " \n")) {
++                      if (strncmp(success_dev, "wg.success=", 11))
++                              continue;
++                      memcpy(success_dev + 11 - 5, "/dev/", 5); /* Overwrite the last 5 bytes of "wg.success=" so the token reads "/dev/<name>". */
++                      success_dev += 11 - 5;
++                      break;
++              }
++              if (!success_dev || !strlen(success_dev))
++                      panic("Unable to find success device");
++
++              fd = open(success_dev, O_WRONLY);
++              if (fd < 0)
++                      panic("open(success_dev)");
++              if (write(fd, "success\n", 8) != 8)
++                      panic("write(success_dev)");
++              close(fd);
++      } else {
++              const char *why = "unknown cause";
++              int what = -1;
++
++              if (WIFEXITED(status)) {
++                      why = "exit code";
++                      what = WEXITSTATUS(status);
++              } else if (WIFSIGNALED(status)) {
++                      why = "signal";
++                      what = WTERMSIG(status);
++              }
++              printf("\x1b[31m\x1b[1m[-] Tests failed with %s %d! \u2639\x1b[0m\n", why, what);
++      }
++}
++
++static void ensure_console(void)
++{
++      for (unsigned int i = 0; i < 1000; ++i) { /* Up to 1000 attempts; each failed open() is followed by a 50 ms sleep. */
++              int fd = open("/dev/console", O_RDWR);
++              if (fd < 0) {
++                      usleep(50000);
++                      continue;
++              }
++              dup2(fd, 0); /* Make the console stdin, stdout and stderr. */
++              dup2(fd, 1);
++              dup2(fd, 2);
++              close(fd);
++              if (write(1, "\0\0\0\0\n", 5) == 5) /* Only return once a write to the new stdout fully succeeds. */
++                      return;
++      }
++      panic("Unable to open console device");
++}
++
++static void clear_leaks(void)
++{
++      int fd;
++
++      fd = open("/sys/kernel/debug/kmemleak", O_WRONLY);
++      if (fd < 0)
++              return; /* kmemleak file cannot be opened: nothing to reset. */
++      pretty_message("[+] Starting memory leak detection...");
++      write(fd, "clear\n", 6); /* Best effort; send the whole 6-byte command, newline included. */
++      close(fd);
++}
++
++static void check_leaks(void)
++{
++      int fd;
++
++      fd = open("/sys/kernel/debug/kmemleak", O_WRONLY);
++      if (fd < 0)
++              return; /* kmemleak file cannot be opened: skip the check entirely. */
++      pretty_message("[+] Scanning for memory leaks...");
++      sleep(2); /* Wait for any grace periods. */
++      write(fd, "scan\n", 5);
++      close(fd);
++
++      fd = open("/sys/kernel/debug/kmemleak", O_RDONLY); /* Reopen read-only to fetch the scan report. */
++      if (fd < 0)
++              return;
++      if (sendfile(1, fd, NULL, 0x7ffff000) > 0) /* Any bytes copied to stdout are treated as a leak report. */
++              panic("Memory leaks encountered");
++      close(fd);
++}
++
++int main(int argc, char *argv[])
++{
++      seed_rng();
++      ensure_console();
++      print_banner();
++      mount_filesystems();
++      kmod_selftests();
++      enable_logging();
++      clear_leaks();
++      launch_tests();
++      check_leaks();
++      poweroff();
++      return 1; /* Not reached: poweroff() is declared noreturn. */
++}
+--- /dev/null
++++ b/tools/testing/selftests/wireguard/qemu/kernel.config
+@@ -0,0 +1,86 @@
++CONFIG_LOCALVERSION=""
++CONFIG_NET=y
++CONFIG_NETDEVICES=y
++CONFIG_NET_CORE=y
++CONFIG_NET_IPIP=y
++CONFIG_DUMMY=y
++CONFIG_VETH=y
++CONFIG_MULTIUSER=y
++CONFIG_NAMESPACES=y
++CONFIG_NET_NS=y
++CONFIG_UNIX=y
++CONFIG_INET=y
++CONFIG_IPV6=y
++CONFIG_NETFILTER=y
++CONFIG_NETFILTER_ADVANCED=y
++CONFIG_NF_CONNTRACK=y
++CONFIG_NF_NAT=y
++CONFIG_NETFILTER_XTABLES=y
++CONFIG_NETFILTER_XT_NAT=y
++CONFIG_NETFILTER_XT_MATCH_LENGTH=y
++CONFIG_NF_CONNTRACK_IPV4=y
++CONFIG_NF_NAT_IPV4=y
++CONFIG_IP_NF_IPTABLES=y
++CONFIG_IP_NF_FILTER=y
++CONFIG_IP_NF_NAT=y
++CONFIG_IP_ADVANCED_ROUTER=y
++CONFIG_IP_MULTIPLE_TABLES=y
++CONFIG_IPV6_MULTIPLE_TABLES=y
++CONFIG_TTY=y
++CONFIG_BINFMT_ELF=y
++CONFIG_BINFMT_SCRIPT=y
++CONFIG_VDSO=y
++CONFIG_VIRTUALIZATION=y
++CONFIG_HYPERVISOR_GUEST=y
++CONFIG_PARAVIRT=y
++CONFIG_KVM_GUEST=y
++CONFIG_PARAVIRT_SPINLOCKS=y
++CONFIG_PRINTK=y
++CONFIG_KALLSYMS=y
++CONFIG_BUG=y
++CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE=y
++CONFIG_EMBEDDED=n
++CONFIG_BASE_FULL=y
++CONFIG_FUTEX=y
++CONFIG_SHMEM=y
++CONFIG_SLUB=y
++CONFIG_SPARSEMEM_VMEMMAP=y
++CONFIG_SMP=y
++CONFIG_SCHED_SMT=y
++CONFIG_SCHED_MC=y
++CONFIG_NUMA=y
++CONFIG_PREEMPT=y
++CONFIG_NO_HZ=y
++CONFIG_NO_HZ_IDLE=y
++CONFIG_NO_HZ_FULL=n
++CONFIG_HZ_PERIODIC=n
++CONFIG_HIGH_RES_TIMERS=y
++CONFIG_ARCH_RANDOM=y
++CONFIG_FILE_LOCKING=y
++CONFIG_POSIX_TIMERS=y
++CONFIG_DEVTMPFS=y
++CONFIG_PROC_FS=y
++CONFIG_PROC_SYSCTL=y
++CONFIG_SYSFS=y
++CONFIG_TMPFS=y
++CONFIG_CONSOLE_LOGLEVEL_DEFAULT=15
++CONFIG_PRINTK_TIME=y
++CONFIG_BLK_DEV_INITRD=y
++CONFIG_LEGACY_VSYSCALL_NONE=y
++CONFIG_KERNEL_GZIP=y
++CONFIG_PANIC_ON_OOPS=y
++CONFIG_BUG_ON_DATA_CORRUPTION=y
++CONFIG_LOCKUP_DETECTOR=y
++CONFIG_SOFTLOCKUP_DETECTOR=y
++CONFIG_HARDLOCKUP_DETECTOR=y
++CONFIG_WQ_WATCHDOG=y
++CONFIG_DETECT_HUNG_TASK=y
++CONFIG_BOOTPARAM_HARDLOCKUP_PANIC=y
++CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=y
++CONFIG_BOOTPARAM_HUNG_TASK_PANIC=y
++CONFIG_PANIC_TIMEOUT=-1
++CONFIG_STACKTRACE=y
++CONFIG_EARLY_PRINTK=y
++CONFIG_GDB_SCRIPTS=y
++CONFIG_WIREGUARD=y
++CONFIG_WIREGUARD_DEBUG=y
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0075-wireguard-Kconfig-select-parent-dependency-for-crypt.patch b/target/linux/generic/backport-5.4/080-wireguard-0075-wireguard-Kconfig-select-parent-dependency-for-crypt.patch

new file mode 100644 (file)

index 0000000..75c483a
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0075-wireguard-Kconfig-select-parent-dependency-for-crypt.patch
@@ -0,0 +1,31 @@
+From 87e4891e91a381de049a6c70690a295f44ae1f13 Mon Sep 17 00:00:00 2001
+From: "Jason A. Donenfeld" <Jason@zx2c4.com>
+Date: Sun, 15 Dec 2019 22:08:01 +0100
+Subject: [PATCH 075/124] wireguard: Kconfig: select parent dependency for
+ crypto
+
+commit d7c68a38bb4f9b7c1a2e4a772872c752ee5c44a6 upstream.
+
+This fixes the crypto selection submenu dependencies. Otherwise, we'd
+wind up issuing warnings in which certain dependencies we also select
+couldn't be satisfied. This condition was triggered by the addition of
+the test suite autobuilder in the previous commit.
+
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ drivers/net/Kconfig | 2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/drivers/net/Kconfig
++++ b/drivers/net/Kconfig
+@@ -85,6 +85,8 @@ config WIREGUARD
+       select CRYPTO_POLY1305_X86_64 if X86 && 64BIT
+       select CRYPTO_BLAKE2S_X86 if X86 && 64BIT
+       select CRYPTO_CURVE25519_X86 if X86 && 64BIT
++      select ARM_CRYPTO if ARM
++      select ARM64_CRYPTO if ARM64
+       select CRYPTO_CHACHA20_NEON if (ARM || ARM64) && KERNEL_MODE_NEON
+       select CRYPTO_POLY1305_NEON if ARM64 && KERNEL_MODE_NEON
+       select CRYPTO_POLY1305_ARM if ARM
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0076-wireguard-global-fix-spelling-mistakes-in-comments.patch b/target/linux/generic/backport-5.4/080-wireguard-0076-wireguard-global-fix-spelling-mistakes-in-comments.patch

new file mode 100644 (file)

index 0000000..da9fd72
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0076-wireguard-global-fix-spelling-mistakes-in-comments.patch
@@ -0,0 +1,66 @@
+From 163383e1867a8eb7026d436627bbcd39ecdbd509 Mon Sep 17 00:00:00 2001
+From: Josh Soref <jsoref@gmail.com>
+Date: Sun, 15 Dec 2019 22:08:02 +0100
+Subject: [PATCH 076/124] wireguard: global: fix spelling mistakes in comments
+
+commit a2ec8b5706944d228181c8b91d815f41d6dd8e7b upstream.
+
+This fixes two spelling errors in source code comments.
+
+Signed-off-by: Josh Soref <jsoref@gmail.com>
+[Jason: rewrote commit message]
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ drivers/net/wireguard/receive.c | 2 +-
+ include/uapi/linux/wireguard.h  | 8 ++++----
+ 2 files changed, 5 insertions(+), 5 deletions(-)
+
+--- a/drivers/net/wireguard/receive.c
++++ b/drivers/net/wireguard/receive.c
+@@ -380,7 +380,7 @@ static void wg_packet_consume_data_done(
+       /* We've already verified the Poly1305 auth tag, which means this packet
+        * was not modified in transit. We can therefore tell the networking
+        * stack that all checksums of every layer of encapsulation have already
+-       * been checked "by the hardware" and therefore is unneccessary to check
++       * been checked "by the hardware" and therefore is unnecessary to check
+        * again in software.
+        */
+       skb->ip_summed = CHECKSUM_UNNECESSARY;
+--- a/include/uapi/linux/wireguard.h
++++ b/include/uapi/linux/wireguard.h
+@@ -18,13 +18,13 @@
+  * one but not both of:
+  *
+  *    WGDEVICE_A_IFINDEX: NLA_U32
+- *    WGDEVICE_A_IFNAME: NLA_NUL_STRING, maxlen IFNAMESIZ - 1
++ *    WGDEVICE_A_IFNAME: NLA_NUL_STRING, maxlen IFNAMSIZ - 1
+  *
+  * The kernel will then return several messages (NLM_F_MULTI) containing the
+  * following tree of nested items:
+  *
+  *    WGDEVICE_A_IFINDEX: NLA_U32
+- *    WGDEVICE_A_IFNAME: NLA_NUL_STRING, maxlen IFNAMESIZ - 1
++ *    WGDEVICE_A_IFNAME: NLA_NUL_STRING, maxlen IFNAMSIZ - 1
+  *    WGDEVICE_A_PRIVATE_KEY: NLA_EXACT_LEN, len WG_KEY_LEN
+  *    WGDEVICE_A_PUBLIC_KEY: NLA_EXACT_LEN, len WG_KEY_LEN
+  *    WGDEVICE_A_LISTEN_PORT: NLA_U16
+@@ -77,7 +77,7 @@
+  * WGDEVICE_A_IFINDEX and WGDEVICE_A_IFNAME:
+  *
+  *    WGDEVICE_A_IFINDEX: NLA_U32
+- *    WGDEVICE_A_IFNAME: NLA_NUL_STRING, maxlen IFNAMESIZ - 1
++ *    WGDEVICE_A_IFNAME: NLA_NUL_STRING, maxlen IFNAMSIZ - 1
+  *    WGDEVICE_A_FLAGS: NLA_U32, 0 or WGDEVICE_F_REPLACE_PEERS if all current
+  *                      peers should be removed prior to adding the list below.
+  *    WGDEVICE_A_PRIVATE_KEY: len WG_KEY_LEN, all zeros to remove
+@@ -121,7 +121,7 @@
+  * filling in information not contained in the prior. Note that if
+  * WGDEVICE_F_REPLACE_PEERS is specified in the first message, it probably
+  * should not be specified in fragments that come after, so that the list
+- * of peers is only cleared the first time but appened after. Likewise for
++ * of peers is only cleared the first time but appended after. Likewise for
+  * peers, if WGPEER_F_REPLACE_ALLOWEDIPS is specified in the first message
+  * of a peer, it likely should not be specified in subsequent fragments.
+  *
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0077-wireguard-main-remove-unused-include-linux-version.h.patch b/target/linux/generic/backport-5.4/080-wireguard-0077-wireguard-main-remove-unused-include-linux-version.h.patch

new file mode 100644 (file)

index 0000000..9f1070e
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0077-wireguard-main-remove-unused-include-linux-version.h.patch
@@ -0,0 +1,29 @@
+From 27686282c4b34ad6db44cb3dbf58763e5bb8e96b Mon Sep 17 00:00:00 2001
+From: YueHaibing <yuehaibing@huawei.com>
+Date: Sun, 15 Dec 2019 22:08:03 +0100
+Subject: [PATCH 077/124] wireguard: main: remove unused include
+ <linux/version.h>
+
+commit 43967b6ff91e53bcce5ae08c16a0588a475b53a1 upstream.
+
+Remove <linux/version.h> from the includes for main.c, which is unused.
+
+Signed-off-by: YueHaibing <yuehaibing@huawei.com>
+[Jason: reworded commit message]
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ drivers/net/wireguard/main.c | 1 -
+ 1 file changed, 1 deletion(-)
+
+--- a/drivers/net/wireguard/main.c
++++ b/drivers/net/wireguard/main.c
+@@ -12,7 +12,6 @@
+ 
+ #include <uapi/linux/wireguard.h>
+ 
+-#include <linux/version.h>
+ #include <linux/init.h>
+ #include <linux/module.h>
+ #include <linux/genetlink.h>
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0078-wireguard-allowedips-use-kfree_rcu-instead-of-call_r.patch b/target/linux/generic/backport-5.4/080-wireguard-0078-wireguard-allowedips-use-kfree_rcu-instead-of-call_r.patch

new file mode 100644 (file)

index 0000000..82581e9
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0078-wireguard-allowedips-use-kfree_rcu-instead-of-call_r.patch
@@ -0,0 +1,42 @@
+From 17c33753f9b68288a2e8551b6161ca54f1809d37 Mon Sep 17 00:00:00 2001
+From: Wei Yongjun <weiyongjun1@huawei.com>
+Date: Sun, 15 Dec 2019 22:08:04 +0100
+Subject: [PATCH 078/124] wireguard: allowedips: use kfree_rcu() instead of
+ call_rcu()
+
+commit d89ee7d5c73af15c1c6f12b016cdf469742b5726 upstream.
+
+The callback function of call_rcu() just calls a kfree(), so we
+can use kfree_rcu() instead of call_rcu() + callback function.
+
+Signed-off-by: Wei Yongjun <weiyongjun1@huawei.com>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ drivers/net/wireguard/allowedips.c | 7 +------
+ 1 file changed, 1 insertion(+), 6 deletions(-)
+
+--- a/drivers/net/wireguard/allowedips.c
++++ b/drivers/net/wireguard/allowedips.c
+@@ -31,11 +31,6 @@ static void copy_and_assign_cidr(struct
+ #define CHOOSE_NODE(parent, key) \
+       parent->bit[(key[parent->bit_at_a] >> parent->bit_at_b) & 1]
+ 
+-static void node_free_rcu(struct rcu_head *rcu)
+-{
+-      kfree(container_of(rcu, struct allowedips_node, rcu));
+-}
+-
+ static void push_rcu(struct allowedips_node **stack,
+                    struct allowedips_node __rcu *p, unsigned int *len)
+ {
+@@ -112,7 +107,7 @@ static void walk_remove_by_peer(struct a
+                               if (!node->bit[0] || !node->bit[1]) {
+                                       rcu_assign_pointer(*nptr, DEREF(
+                                              &node->bit[!REF(node->bit[0])]));
+-                                      call_rcu(&node->rcu, node_free_rcu);
++                                      kfree_rcu(node, rcu);
+                                       node = DEREF(nptr);
+                               }
+                       }
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0079-wireguard-selftests-remove-ancient-kernel-compatibil.patch b/target/linux/generic/backport-5.4/080-wireguard-0079-wireguard-selftests-remove-ancient-kernel-compatibil.patch

new file mode 100644 (file)

index 0000000..efc5500
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0079-wireguard-selftests-remove-ancient-kernel-compatibil.patch
@@ -0,0 +1,373 @@
+From df3289cf81503ef299450a67f5bf11e526fdb2d0 Mon Sep 17 00:00:00 2001
+From: "Jason A. Donenfeld" <Jason@zx2c4.com>
+Date: Thu, 2 Jan 2020 17:47:49 +0100
+Subject: [PATCH 079/124] wireguard: selftests: remove ancient kernel
+ compatibility code
+
+commit 9a69a4c8802adf642bc4a13d471b5a86b44ed434 upstream.
+
+Quite a bit of the test suite was designed to work with ancient kernels.
+Thankfully we no longer have to deal with this. This commit updates
+things that we can finally update and removes things that we can finally
+remove, to avoid the build-up of the last several years as a result of
+having to support ancient kernels. We can finally rely on suppress_
+prefixlength being available. On the build side of things, the no-PIE
+hack is no longer required, and we can bump some of the tools, repair
+our m68k and i686-kvm support, and get better coverage of the static
+branches used in the crypto lib and in udp_tunnel.
+
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ tools/testing/selftests/wireguard/netns.sh    | 11 +--
+ .../testing/selftests/wireguard/qemu/Makefile | 82 ++++++++++---------
+ .../selftests/wireguard/qemu/arch/m68k.config |  2 +-
+ tools/testing/selftests/wireguard/qemu/init.c |  1 +
+ .../selftests/wireguard/qemu/kernel.config    |  2 +
+ 5 files changed, 50 insertions(+), 48 deletions(-)
+
+--- a/tools/testing/selftests/wireguard/netns.sh
++++ b/tools/testing/selftests/wireguard/netns.sh
+@@ -37,7 +37,7 @@ n2() { pretty 2 "$*"; maybe_exec ip netn
+ ip0() { pretty 0 "ip $*"; ip -n $netns0 "$@"; }
+ ip1() { pretty 1 "ip $*"; ip -n $netns1 "$@"; }
+ ip2() { pretty 2 "ip $*"; ip -n $netns2 "$@"; }
+-sleep() { read -t "$1" -N 0 || true; }
++sleep() { read -t "$1" -N 1 || true; }
+ waitiperf() { pretty "${1//*-}" "wait for iperf:5201"; while [[ $(ss -N "$1" -tlp 'sport = 5201') != *iperf3* ]]; do sleep 0.1; done; }
+ waitncatudp() { pretty "${1//*-}" "wait for udp:1111"; while [[ $(ss -N "$1" -ulp 'sport = 1111') != *ncat* ]]; do sleep 0.1; done; }
+ waitncattcp() { pretty "${1//*-}" "wait for tcp:1111"; while [[ $(ss -N "$1" -tlp 'sport = 1111') != *ncat* ]]; do sleep 0.1; done; }
+@@ -294,12 +294,9 @@ ip1 -6 rule add table main suppress_pref
+ ip1 -4 route add default dev wg0 table 51820
+ ip1 -4 rule add not fwmark 51820 table 51820
+ ip1 -4 rule add table main suppress_prefixlength 0
+-# suppress_prefixlength only got added in 3.12, and we want to support 3.10+.
+-if [[ $(ip1 -4 rule show all) == *suppress_prefixlength* ]]; then
+-      # Flood the pings instead of sending just one, to trigger routing table reference counting bugs.
+-      n1 ping -W 1 -c 100 -f 192.168.99.7
+-      n1 ping -W 1 -c 100 -f abab::1111
+-fi
++# Flood the pings instead of sending just one, to trigger routing table reference counting bugs.
++n1 ping -W 1 -c 100 -f 192.168.99.7
++n1 ping -W 1 -c 100 -f abab::1111
+ 
+ n0 iptables -t nat -F
+ ip0 link del vethrc
+--- a/tools/testing/selftests/wireguard/qemu/Makefile
++++ b/tools/testing/selftests/wireguard/qemu/Makefile
+@@ -5,6 +5,7 @@
+ PWD := $(shell pwd)
+ 
+ CHOST := $(shell gcc -dumpmachine)
++HOST_ARCH := $(firstword $(subst -, ,$(CHOST)))
+ ifneq (,$(ARCH))
+ CBUILD := $(subst -gcc,,$(lastword $(subst /, ,$(firstword $(wildcard $(foreach bindir,$(subst :, ,$(PATH)),$(bindir)/$(ARCH)-*-gcc))))))
+ ifeq (,$(CBUILD))
+@@ -37,19 +38,19 @@ endef
+ define file_download =
+ $(DISTFILES_PATH)/$(1):
+       mkdir -p $(DISTFILES_PATH)
+-      flock -x $$@.lock -c '[ -f $$@ ] && exit 0; wget -O $$@.tmp $(MIRROR)$(1) || wget -t inf --retry-on-http-error=404 -O $$@.tmp $(2)$(1) || rm -f $$@.tmp'
++      flock -x $$@.lock -c '[ -f $$@ ] && exit 0; wget -O $$@.tmp $(MIRROR)$(1) || wget -O $$@.tmp $(2)$(1) || rm -f $$@.tmp'
+       if echo "$(3)  $$@.tmp" | sha256sum -c -; then mv $$@.tmp $$@; else rm -f $$@.tmp; exit 71; fi
+ endef
+ 
+-$(eval $(call tar_download,MUSL,musl,1.1.20,.tar.gz,https://www.musl-libc.org/releases/,44be8771d0e6c6b5f82dd15662eb2957c9a3173a19a8b49966ac0542bbd40d61))
++$(eval $(call tar_download,MUSL,musl,1.1.24,.tar.gz,https://www.musl-libc.org/releases/,1370c9a812b2cf2a7d92802510cca0058cc37e66a7bedd70051f0a34015022a3))
+ $(eval $(call tar_download,LIBMNL,libmnl,1.0.4,.tar.bz2,https://www.netfilter.org/projects/libmnl/files/,171f89699f286a5854b72b91d06e8f8e3683064c5901fb09d954a9ab6f551f81))
+-$(eval $(call tar_download,IPERF,iperf,3.1.7,.tar.gz,http://downloads.es.net/pub/iperf/,a4ef73406fe92250602b8da2ae89ec53211f805df97a1d1d629db5a14043734f))
++$(eval $(call tar_download,IPERF,iperf,3.7,.tar.gz,https://downloads.es.net/pub/iperf/,d846040224317caf2f75c843d309a950a7db23f9b44b94688ccbe557d6d1710c))
+ $(eval $(call tar_download,BASH,bash,5.0,.tar.gz,https://ftp.gnu.org/gnu/bash/,b4a80f2ac66170b2913efbfb9f2594f1f76c7b1afd11f799e22035d63077fb4d))
+-$(eval $(call tar_download,IPROUTE2,iproute2,5.1.0,.tar.gz,https://www.kernel.org/pub/linux/utils/net/iproute2/,9b43707d6075ecdca14803ca8ce0c8553848c49fa1586d12fd508d66577243f2))
+-$(eval $(call tar_download,IPTABLES,iptables,1.6.1,.tar.bz2,https://www.netfilter.org/projects/iptables/files/,0fc2d7bd5d7be11311726466789d4c65fb4c8e096c9182b56ce97440864f0cf5))
+-$(eval $(call tar_download,NMAP,nmap,7.60,.tar.bz2,https://nmap.org/dist/,a8796ecc4fa6c38aad6139d9515dc8113023a82e9d787e5a5fb5fa1b05516f21))
+-$(eval $(call tar_download,IPUTILS,iputils,s20161105,.tar.gz,https://github.com/iputils/iputils/archive/s20161105.tar.gz/#,f813092f03d17294fd23544b129b95cdb87fe19f7970a51908a6b88509acad8a))
+-$(eval $(call tar_download,WIREGUARD_TOOLS,WireGuard,0.0.20191212,.tar.xz,https://git.zx2c4.com/WireGuard/snapshot/,b0d718380f7a8822b2f12d75e462fa4eafa3a77871002981f367cd4fe2a1b071))
++$(eval $(call tar_download,IPROUTE2,iproute2,5.4.0,.tar.xz,https://www.kernel.org/pub/linux/utils/net/iproute2/,fe97aa60a0d4c5ac830be18937e18dc3400ca713a33a89ad896ff1e3d46086ae))
++$(eval $(call tar_download,IPTABLES,iptables,1.8.4,.tar.bz2,https://www.netfilter.org/projects/iptables/files/,993a3a5490a544c2cbf2ef15cf7e7ed21af1845baf228318d5c36ef8827e157c))
++$(eval $(call tar_download,NMAP,nmap,7.80,.tar.bz2,https://nmap.org/dist/,fcfa5a0e42099e12e4bf7a68ebe6fde05553383a682e816a7ec9256ab4773faa))
++$(eval $(call tar_download,IPUTILS,iputils,s20190709,.tar.gz,https://github.com/iputils/iputils/archive/s20190709.tar.gz/#,a15720dd741d7538dd2645f9f516d193636ae4300ff7dbc8bfca757bf166490a))
++$(eval $(call tar_download,WIREGUARD_TOOLS,wireguard-tools,1.0.20191226,.tar.xz,https://git.zx2c4.com/wireguard-tools/snapshot/,aa8af0fdc9872d369d8c890a84dbc2a2466b55795dccd5b47721b2d97644b04f))
+ 
+ KERNEL_BUILD_PATH := $(BUILD_PATH)/kernel$(if $(findstring yes,$(DEBUG_KERNEL)),-debug)
+ rwildcard=$(foreach d,$(wildcard $1*),$(call rwildcard,$d/,$2) $(filter $(subst *,%,$2),$d))
+@@ -59,23 +60,21 @@ export CFLAGS ?= -O3 -pipe
+ export LDFLAGS ?=
+ export CPPFLAGS := -I$(BUILD_PATH)/include
+ 
+-ifeq ($(CHOST),$(CBUILD))
++ifeq ($(HOST_ARCH),$(ARCH))
+ CROSS_COMPILE_FLAG := --host=$(CHOST)
+-NOPIE_GCC := gcc -fno-PIE
+ CFLAGS += -march=native
+ STRIP := strip
+ else
+ $(info Cross compilation: building for $(CBUILD) using $(CHOST))
+ CROSS_COMPILE_FLAG := --build=$(CBUILD) --host=$(CHOST)
+ export CROSS_COMPILE=$(CBUILD)-
+-NOPIE_GCC := $(CBUILD)-gcc -fno-PIE
+ STRIP := $(CBUILD)-strip
+ endif
+ ifeq ($(ARCH),aarch64)
+ QEMU_ARCH := aarch64
+ KERNEL_ARCH := arm64
+ KERNEL_BZIMAGE := $(KERNEL_BUILD_PATH)/arch/arm64/boot/Image
+-ifeq ($(CHOST),$(CBUILD))
++ifeq ($(HOST_ARCH),$(ARCH))
+ QEMU_MACHINE := -cpu host -machine virt,gic_version=host,accel=kvm
+ else
+ QEMU_MACHINE := -cpu cortex-a53 -machine virt
+@@ -85,7 +84,7 @@ else ifeq ($(ARCH),aarch64_be)
+ QEMU_ARCH := aarch64
+ KERNEL_ARCH := arm64
+ KERNEL_BZIMAGE := $(KERNEL_BUILD_PATH)/arch/arm64/boot/Image
+-ifeq ($(CHOST),$(CBUILD))
++ifeq ($(HOST_ARCH),$(ARCH))
+ QEMU_MACHINE := -cpu host -machine virt,gic_version=host,accel=kvm
+ else
+ QEMU_MACHINE := -cpu cortex-a53 -machine virt
+@@ -95,7 +94,7 @@ else ifeq ($(ARCH),arm)
+ QEMU_ARCH := arm
+ KERNEL_ARCH := arm
+ KERNEL_BZIMAGE := $(KERNEL_BUILD_PATH)/arch/arm/boot/zImage
+-ifeq ($(CHOST),$(CBUILD))
++ifeq ($(HOST_ARCH),$(ARCH))
+ QEMU_MACHINE := -cpu host -machine virt,gic_version=host,accel=kvm
+ else
+ QEMU_MACHINE := -cpu cortex-a15 -machine virt
+@@ -105,7 +104,7 @@ else ifeq ($(ARCH),armeb)
+ QEMU_ARCH := arm
+ KERNEL_ARCH := arm
+ KERNEL_BZIMAGE := $(KERNEL_BUILD_PATH)/arch/arm/boot/zImage
+-ifeq ($(CHOST),$(CBUILD))
++ifeq ($(HOST_ARCH),$(ARCH))
+ QEMU_MACHINE := -cpu host -machine virt,gic_version=host,accel=kvm
+ else
+ QEMU_MACHINE := -cpu cortex-a15 -machine virt
+@@ -116,7 +115,7 @@ else ifeq ($(ARCH),x86_64)
+ QEMU_ARCH := x86_64
+ KERNEL_ARCH := x86_64
+ KERNEL_BZIMAGE := $(KERNEL_BUILD_PATH)/arch/x86/boot/bzImage
+-ifeq ($(CHOST),$(CBUILD))
++ifeq ($(HOST_ARCH),$(ARCH))
+ QEMU_MACHINE := -cpu host -machine q35,accel=kvm
+ else
+ QEMU_MACHINE := -cpu Skylake-Server -machine q35
+@@ -126,7 +125,7 @@ else ifeq ($(ARCH),i686)
+ QEMU_ARCH := i386
+ KERNEL_ARCH := x86
+ KERNEL_BZIMAGE := $(KERNEL_BUILD_PATH)/arch/x86/boot/bzImage
+-ifeq ($(subst i686,x86_64,$(CBUILD)),$(CHOST))
++ifeq ($(subst x86_64,i686,$(HOST_ARCH)),$(ARCH))
+ QEMU_MACHINE := -cpu host -machine q35,accel=kvm
+ else
+ QEMU_MACHINE := -cpu coreduo -machine q35
+@@ -136,7 +135,7 @@ else ifeq ($(ARCH),mips64)
+ QEMU_ARCH := mips64
+ KERNEL_ARCH := mips
+ KERNEL_BZIMAGE := $(KERNEL_BUILD_PATH)/vmlinux
+-ifeq ($(CHOST),$(CBUILD))
++ifeq ($(HOST_ARCH),$(ARCH))
+ QEMU_MACHINE := -cpu host -machine malta,accel=kvm
+ CFLAGS += -EB
+ else
+@@ -147,7 +146,7 @@ else ifeq ($(ARCH),mips64el)
+ QEMU_ARCH := mips64el
+ KERNEL_ARCH := mips
+ KERNEL_BZIMAGE := $(KERNEL_BUILD_PATH)/vmlinux
+-ifeq ($(CHOST),$(CBUILD))
++ifeq ($(HOST_ARCH),$(ARCH))
+ QEMU_MACHINE := -cpu host -machine malta,accel=kvm
+ CFLAGS += -EL
+ else
+@@ -158,7 +157,7 @@ else ifeq ($(ARCH),mips)
+ QEMU_ARCH := mips
+ KERNEL_ARCH := mips
+ KERNEL_BZIMAGE := $(KERNEL_BUILD_PATH)/vmlinux
+-ifeq ($(CHOST),$(CBUILD))
++ifeq ($(HOST_ARCH),$(ARCH))
+ QEMU_MACHINE := -cpu host -machine malta,accel=kvm
+ CFLAGS += -EB
+ else
+@@ -169,7 +168,7 @@ else ifeq ($(ARCH),mipsel)
+ QEMU_ARCH := mipsel
+ KERNEL_ARCH := mips
+ KERNEL_BZIMAGE := $(KERNEL_BUILD_PATH)/vmlinux
+-ifeq ($(CHOST),$(CBUILD))
++ifeq ($(HOST_ARCH),$(ARCH))
+ QEMU_MACHINE := -cpu host -machine malta,accel=kvm
+ CFLAGS += -EL
+ else
+@@ -180,7 +179,7 @@ else ifeq ($(ARCH),powerpc64le)
+ QEMU_ARCH := ppc64
+ KERNEL_ARCH := powerpc
+ KERNEL_BZIMAGE := $(KERNEL_BUILD_PATH)/vmlinux
+-ifeq ($(CHOST),$(CBUILD))
++ifeq ($(HOST_ARCH),$(ARCH))
+ QEMU_MACHINE := -cpu host,accel=kvm -machine pseries
+ else
+ QEMU_MACHINE := -machine pseries
+@@ -190,7 +189,7 @@ else ifeq ($(ARCH),powerpc)
+ QEMU_ARCH := ppc
+ KERNEL_ARCH := powerpc
+ KERNEL_BZIMAGE := $(KERNEL_BUILD_PATH)/arch/powerpc/boot/uImage
+-ifeq ($(CHOST),$(CBUILD))
++ifeq ($(HOST_ARCH),$(ARCH))
+ QEMU_MACHINE := -cpu host,accel=kvm -machine ppce500
+ else
+ QEMU_MACHINE := -machine ppce500
+@@ -200,10 +199,11 @@ else ifeq ($(ARCH),m68k)
+ QEMU_ARCH := m68k
+ KERNEL_ARCH := m68k
+ KERNEL_BZIMAGE := $(KERNEL_BUILD_PATH)/vmlinux
+-ifeq ($(CHOST),$(CBUILD))
+-QEMU_MACHINE := -cpu host,accel=kvm -machine q800
++KERNEL_CMDLINE := $(shell sed -n 's/CONFIG_CMDLINE=\(.*\)/\1/p' arch/m68k.config)
++ifeq ($(HOST_ARCH),$(ARCH))
++QEMU_MACHINE := -cpu host,accel=kvm -machine q800 -smp 1 -append $(KERNEL_CMDLINE)
+ else
+-QEMU_MACHINE := -machine q800
++QEMU_MACHINE := -machine q800 -smp 1 -append $(KERNEL_CMDLINE)
+ endif
+ else
+ $(error I only build: x86_64, i686, arm, armeb, aarch64, aarch64_be, mips, mipsel, mips64, mips64el, powerpc64le, powerpc, m68k)
+@@ -238,14 +238,14 @@ $(BUILD_PATH)/init-cpio-spec.txt:
+       echo "nod /dev/console 644 0 0 c 5 1" >> $@
+       echo "dir /bin 755 0 0" >> $@
+       echo "file /bin/iperf3 $(IPERF_PATH)/src/iperf3 755 0 0" >> $@
+-      echo "file /bin/wg $(WIREGUARD_TOOLS_PATH)/src/tools/wg 755 0 0" >> $@
++      echo "file /bin/wg $(WIREGUARD_TOOLS_PATH)/src/wg 755 0 0" >> $@
+       echo "file /bin/bash $(BASH_PATH)/bash 755 0 0" >> $@
+       echo "file /bin/ip $(IPROUTE2_PATH)/ip/ip 755 0 0" >> $@
+       echo "file /bin/ss $(IPROUTE2_PATH)/misc/ss 755 0 0" >> $@
+       echo "file /bin/ping $(IPUTILS_PATH)/ping 755 0 0" >> $@
+       echo "file /bin/ncat $(NMAP_PATH)/ncat/ncat 755 0 0" >> $@
+-      echo "file /bin/xtables-multi $(IPTABLES_PATH)/iptables/xtables-multi 755 0 0" >> $@
+-      echo "slink /bin/iptables xtables-multi 777 0 0" >> $@
++      echo "file /bin/xtables-legacy-multi $(IPTABLES_PATH)/iptables/xtables-legacy-multi 755 0 0" >> $@
++      echo "slink /bin/iptables xtables-legacy-multi 777 0 0" >> $@
+       echo "slink /bin/ping6 ping 777 0 0" >> $@
+       echo "dir /lib 755 0 0" >> $@
+       echo "file /lib/libc.so $(MUSL_PATH)/lib/libc.so 755 0 0" >> $@
+@@ -260,8 +260,8 @@ $(KERNEL_BUILD_PATH)/.config: kernel.con
+       cd $(KERNEL_BUILD_PATH) && ARCH=$(KERNEL_ARCH) $(KERNEL_PATH)/scripts/kconfig/merge_config.sh -n $(KERNEL_BUILD_PATH)/.config $(KERNEL_BUILD_PATH)/minimal.config
+       $(if $(findstring yes,$(DEBUG_KERNEL)),cp debug.config $(KERNEL_BUILD_PATH) && cd $(KERNEL_BUILD_PATH) && ARCH=$(KERNEL_ARCH) $(KERNEL_PATH)/scripts/kconfig/merge_config.sh -n $(KERNEL_BUILD_PATH)/.config debug.config,)
+ 
+-$(KERNEL_BZIMAGE): $(KERNEL_BUILD_PATH)/.config $(BUILD_PATH)/init-cpio-spec.txt $(MUSL_PATH)/lib/libc.so $(IPERF_PATH)/src/iperf3 $(IPUTILS_PATH)/ping $(BASH_PATH)/bash $(IPROUTE2_PATH)/misc/ss $(IPROUTE2_PATH)/ip/ip $(IPTABLES_PATH)/iptables/xtables-multi $(NMAP_PATH)/ncat/ncat $(WIREGUARD_TOOLS_PATH)/src/tools/wg $(BUILD_PATH)/init ../netns.sh $(WIREGUARD_SOURCES)
+-      $(MAKE) -C $(KERNEL_PATH) O=$(KERNEL_BUILD_PATH) ARCH=$(KERNEL_ARCH) CROSS_COMPILE=$(CROSS_COMPILE) CC="$(NOPIE_GCC)"
++$(KERNEL_BZIMAGE): $(KERNEL_BUILD_PATH)/.config $(BUILD_PATH)/init-cpio-spec.txt $(MUSL_PATH)/lib/libc.so $(IPERF_PATH)/src/iperf3 $(IPUTILS_PATH)/ping $(BASH_PATH)/bash $(IPROUTE2_PATH)/misc/ss $(IPROUTE2_PATH)/ip/ip $(IPTABLES_PATH)/iptables/xtables-legacy-multi $(NMAP_PATH)/ncat/ncat $(WIREGUARD_TOOLS_PATH)/src/wg $(BUILD_PATH)/init ../netns.sh $(WIREGUARD_SOURCES)
++      $(MAKE) -C $(KERNEL_PATH) O=$(KERNEL_BUILD_PATH) ARCH=$(KERNEL_ARCH) CROSS_COMPILE=$(CROSS_COMPILE)
+ 
+ $(BUILD_PATH)/include/linux/.installed: | $(KERNEL_BUILD_PATH)/.config
+       $(MAKE) -C $(KERNEL_PATH) O=$(KERNEL_BUILD_PATH) INSTALL_HDR_PATH=$(BUILD_PATH) ARCH=$(KERNEL_ARCH) CROSS_COMPILE=$(CROSS_COMPILE) headers_install
+@@ -280,7 +280,7 @@ $(BUILD_PATH)/include/.installed: $(MUSL
+ 
+ $(MUSL_CC): $(MUSL_PATH)/lib/libc.so
+       sh $(MUSL_PATH)/tools/musl-gcc.specs.sh $(BUILD_PATH)/include $(MUSL_PATH)/lib /lib/ld-linux.so.1 > $(BUILD_PATH)/musl-gcc.specs
+-      printf '#!/bin/sh\nexec "$(REAL_CC)" --specs="$(BUILD_PATH)/musl-gcc.specs" -fno-stack-protector -no-pie "$$@"\n' > $(BUILD_PATH)/musl-gcc
++      printf '#!/bin/sh\nexec "$(REAL_CC)" --specs="$(BUILD_PATH)/musl-gcc.specs" "$$@"\n' > $(BUILD_PATH)/musl-gcc
+       chmod +x $(BUILD_PATH)/musl-gcc
+ 
+ $(IPERF_PATH)/.installed: $(IPERF_TAR)
+@@ -291,7 +291,7 @@ $(IPERF_PATH)/.installed: $(IPERF_TAR)
+       touch $@
+ 
+ $(IPERF_PATH)/src/iperf3: | $(IPERF_PATH)/.installed $(USERSPACE_DEPS)
+-      cd $(IPERF_PATH) && CFLAGS="$(CFLAGS) -D_GNU_SOURCE" ./configure --prefix=/ $(CROSS_COMPILE_FLAG) --enable-static --disable-shared
++      cd $(IPERF_PATH) && CFLAGS="$(CFLAGS) -D_GNU_SOURCE" ./configure --prefix=/ $(CROSS_COMPILE_FLAG) --enable-static --disable-shared --with-openssl=no
+       $(MAKE) -C $(IPERF_PATH)
+       $(STRIP) -s $@
+ 
+@@ -308,8 +308,8 @@ $(WIREGUARD_TOOLS_PATH)/.installed: $(WI
+       flock -s $<.lock tar -C $(BUILD_PATH) -xf $<
+       touch $@
+ 
+-$(WIREGUARD_TOOLS_PATH)/src/tools/wg: | $(WIREGUARD_TOOLS_PATH)/.installed $(LIBMNL_PATH)/src/.libs/libmnl.a $(USERSPACE_DEPS)
+-      LDFLAGS="$(LDFLAGS) -L$(LIBMNL_PATH)/src/.libs" $(MAKE) -C $(WIREGUARD_TOOLS_PATH)/src/tools LIBMNL_CFLAGS="-I$(LIBMNL_PATH)/include" LIBMNL_LDLIBS="-lmnl" wg
++$(WIREGUARD_TOOLS_PATH)/src/wg: | $(WIREGUARD_TOOLS_PATH)/.installed $(LIBMNL_PATH)/src/.libs/libmnl.a $(USERSPACE_DEPS)
++      LDFLAGS="$(LDFLAGS) -L$(LIBMNL_PATH)/src/.libs" $(MAKE) -C $(WIREGUARD_TOOLS_PATH)/src LIBMNL_CFLAGS="-I$(LIBMNL_PATH)/include" LIBMNL_LDLIBS="-lmnl" wg
+       $(STRIP) -s $@
+ 
+ $(BUILD_PATH)/init: init.c | $(USERSPACE_DEPS)
+@@ -323,7 +323,8 @@ $(IPUTILS_PATH)/.installed: $(IPUTILS_TA
+       touch $@
+ 
+ $(IPUTILS_PATH)/ping: | $(IPUTILS_PATH)/.installed $(USERSPACE_DEPS)
+-      $(MAKE) -C $(IPUTILS_PATH) USE_CAP=no USE_IDN=no USE_NETTLE=no USE_CRYPTO=no ping
++      sed -i /atexit/d $(IPUTILS_PATH)/ping.c
++      cd $(IPUTILS_PATH) && $(CC) $(CFLAGS) -std=c99 -o $@ ping.c ping_common.c ping6_common.c iputils_common.c -D_GNU_SOURCE -D'IPUTILS_VERSION(f)=f' -lresolv $(LDFLAGS)
+       $(STRIP) -s $@
+ 
+ $(BASH_PATH)/.installed: $(BASH_TAR)
+@@ -357,7 +358,7 @@ $(IPTABLES_PATH)/.installed: $(IPTABLES_
+       sed -i -e "/nfnetlink=[01]/s:=[01]:=0:" -e "/nfconntrack=[01]/s:=[01]:=0:" $(IPTABLES_PATH)/configure
+       touch $@
+ 
+-$(IPTABLES_PATH)/iptables/xtables-multi: | $(IPTABLES_PATH)/.installed $(LIBMNL_PATH)/src/.libs/libmnl.a $(USERSPACE_DEPS)
++$(IPTABLES_PATH)/iptables/xtables-legacy-multi: | $(IPTABLES_PATH)/.installed $(LIBMNL_PATH)/src/.libs/libmnl.a $(USERSPACE_DEPS)
+       cd $(IPTABLES_PATH) && PKG_CONFIG_LIBDIR="$(LIBMNL_PATH)" ./configure --prefix=/ $(CROSS_COMPILE_FLAG) --enable-static --disable-shared --disable-nftables --disable-bpf-compiler --disable-nfsynproxy --disable-libipq --with-kernel=$(BUILD_PATH)/include
+       $(MAKE) -C $(IPTABLES_PATH)
+       $(STRIP) -s $@
+@@ -368,8 +369,9 @@ $(NMAP_PATH)/.installed: $(NMAP_TAR)
+       touch $@
+ 
+ $(NMAP_PATH)/ncat/ncat: | $(NMAP_PATH)/.installed $(USERSPACE_DEPS)
+-      cd $(NMAP_PATH) && ./configure --prefix=/ $(CROSS_COMPILE_FLAG) --enable-static --disable-shared --without-ndiff --without-zenmap --without-nping --with-libpcap=included --with-libpcre=included --with-libdnet=included --without-liblua --with-liblinear=included --without-nmap-update --without-openssl --with-pcap=linux
+-      $(MAKE) -C $(NMAP_PATH) build-ncat
++      cd $(NMAP_PATH) && ./configure --prefix=/ $(CROSS_COMPILE_FLAG) --enable-static --disable-shared --without-ndiff --without-zenmap --without-nping --with-libpcap=included --with-libpcre=included --with-libdnet=included --without-liblua --with-liblinear=included --without-nmap-update --without-openssl --with-pcap=linux --without-libssh
++      $(MAKE) -C $(NMAP_PATH)/libpcap
++      $(MAKE) -C $(NMAP_PATH)/ncat
+       $(STRIP) -s $@
+ 
+ clean:
+@@ -379,7 +381,7 @@ distclean: clean
+       rm -rf $(DISTFILES_PATH)
+ 
+ menuconfig: $(KERNEL_BUILD_PATH)/.config
+-      $(MAKE) -C $(KERNEL_PATH) O=$(KERNEL_BUILD_PATH) ARCH=$(KERNEL_ARCH) CROSS_COMPILE=$(CROSS_COMPILE) CC="$(NOPIE_GCC)" menuconfig
++      $(MAKE) -C $(KERNEL_PATH) O=$(KERNEL_BUILD_PATH) ARCH=$(KERNEL_ARCH) CROSS_COMPILE=$(CROSS_COMPILE) menuconfig
+ 
+ .PHONY: qemu build clean distclean menuconfig
+ .DELETE_ON_ERROR:
+--- a/tools/testing/selftests/wireguard/qemu/arch/m68k.config
++++ b/tools/testing/selftests/wireguard/qemu/arch/m68k.config
+@@ -1,9 +1,9 @@
+ CONFIG_MMU=y
++CONFIG_M68KCLASSIC=y
+ CONFIG_M68040=y
+ CONFIG_MAC=y
+ CONFIG_SERIAL_PMACZILOG=y
+ CONFIG_SERIAL_PMACZILOG_TTYS=y
+ CONFIG_SERIAL_PMACZILOG_CONSOLE=y
+-CONFIG_CMDLINE_BOOL=y
+ CONFIG_CMDLINE="console=ttyS0 wg.success=ttyS1"
+ CONFIG_FRAME_WARN=1024
+--- a/tools/testing/selftests/wireguard/qemu/init.c
++++ b/tools/testing/selftests/wireguard/qemu/init.c
+@@ -21,6 +21,7 @@
+ #include <sys/reboot.h>
+ #include <sys/utsname.h>
+ #include <sys/sendfile.h>
++#include <sys/sysmacros.h>
+ #include <linux/random.h>
+ #include <linux/version.h>
+ 
+--- a/tools/testing/selftests/wireguard/qemu/kernel.config
++++ b/tools/testing/selftests/wireguard/qemu/kernel.config
+@@ -39,6 +39,7 @@ CONFIG_PRINTK=y
+ CONFIG_KALLSYMS=y
+ CONFIG_BUG=y
+ CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE=y
++CONFIG_JUMP_LABEL=y
+ CONFIG_EMBEDDED=n
+ CONFIG_BASE_FULL=y
+ CONFIG_FUTEX=y
+@@ -55,6 +56,7 @@ CONFIG_NO_HZ_IDLE=y
+ CONFIG_NO_HZ_FULL=n
+ CONFIG_HZ_PERIODIC=n
+ CONFIG_HIGH_RES_TIMERS=y
++CONFIG_COMPAT_32BIT_TIME=y
+ CONFIG_ARCH_RANDOM=y
+ CONFIG_FILE_LOCKING=y
+ CONFIG_POSIX_TIMERS=y
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0080-wireguard-queueing-do-not-account-for-pfmemalloc-whe.patch b/target/linux/generic/backport-5.4/080-wireguard-0080-wireguard-queueing-do-not-account-for-pfmemalloc-whe.patch

new file mode 100644 (file)

index 0000000..edbca28
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0080-wireguard-queueing-do-not-account-for-pfmemalloc-whe.patch
@@ -0,0 +1,39 @@
+From 2b7c5a4a57e1f5cc37877f838293173994e028c6 Mon Sep 17 00:00:00 2001
+From: "Jason A. Donenfeld" <Jason@zx2c4.com>
+Date: Thu, 2 Jan 2020 17:47:50 +0100
+Subject: [PATCH 080/124] wireguard: queueing: do not account for pfmemalloc
+ when clearing skb header
+
+commit 04d2ea92a18417619182cbb79063f154892b0150 upstream.
+
+Before 8b7008620b84 ("net: Don't copy pfmemalloc flag in __copy_skb_
+header()"), the pfmemalloc flag used to be between headers_start and
+headers_end, which is a region we clear when preparing the packet for
+encryption/decryption. This is a parameter we certainly want to
+preserve, which is why 8b7008620b84 moved it out of there. The code here
+was written in a world before 8b7008620b84, though, where we had to
+manually account for it. This commit brings things up to speed.
+
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ drivers/net/wireguard/queueing.h | 3 ---
+ 1 file changed, 3 deletions(-)
+
+--- a/drivers/net/wireguard/queueing.h
++++ b/drivers/net/wireguard/queueing.h
+@@ -83,13 +83,10 @@ static inline __be16 wg_skb_examine_untr
+ 
+ static inline void wg_reset_packet(struct sk_buff *skb)
+ {
+-      const int pfmemalloc = skb->pfmemalloc;
+-
+       skb_scrub_packet(skb, true);
+       memset(&skb->headers_start, 0,
+              offsetof(struct sk_buff, headers_end) -
+                      offsetof(struct sk_buff, headers_start));
+-      skb->pfmemalloc = pfmemalloc;
+       skb->queue_mapping = 0;
+       skb->nohdr = 0;
+       skb->peeked = 0;
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0081-wireguard-socket-mark-skbs-as-not-on-list-when-recei.patch b/target/linux/generic/backport-5.4/080-wireguard-0081-wireguard-socket-mark-skbs-as-not-on-list-when-recei.patch

new file mode 100644 (file)

index 0000000..a347246
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0081-wireguard-socket-mark-skbs-as-not-on-list-when-recei.patch
@@ -0,0 +1,34 @@
+From 6d8e17ad1b4b019c61403a88377e731491de409c Mon Sep 17 00:00:00 2001
+From: "Jason A. Donenfeld" <Jason@zx2c4.com>
+Date: Thu, 2 Jan 2020 17:47:51 +0100
+Subject: [PATCH 081/124] wireguard: socket: mark skbs as not on list when
+ receiving via gro
+
+commit 736775d06bac60d7a353e405398b48b2bd8b1e54 upstream.
+
+Certain drivers will pass gro skbs to udp, at which point the udp driver
+simply iterates through them and passes them off to encap_rcv, which is
+where we pick up. At the moment, we're not attempting to coalesce these
+into bundles, but we also don't want to wind up having cascaded lists of
+skbs treated separately. The right behavior here, then, is to just mark
+each incoming one as not on a list. This can be seen in practice, for
+example, with Qualcomm's rmnet_perf driver.
+
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Tested-by: Yaroslav Furman <yaro330@gmail.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ drivers/net/wireguard/socket.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/drivers/net/wireguard/socket.c
++++ b/drivers/net/wireguard/socket.c
+@@ -333,6 +333,7 @@ static int wg_receive(struct sock *sk, s
+       wg = sk->sk_user_data;
+       if (unlikely(!wg))
+               goto err;
++      skb_mark_not_on_list(skb);
+       wg_packet_receive(wg, skb);
+       return 0;
+ 
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0082-wireguard-allowedips-fix-use-after-free-in-root_remo.patch b/target/linux/generic/backport-5.4/080-wireguard-0082-wireguard-allowedips-fix-use-after-free-in-root_remo.patch

new file mode 100644 (file)

index 0000000..f027539
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0082-wireguard-allowedips-fix-use-after-free-in-root_remo.patch
@@ -0,0 +1,164 @@
+From 13696b0d3219c3ca9ff4ce6a580c53fab6284312 Mon Sep 17 00:00:00 2001
+From: Eric Dumazet <edumazet@google.com>
+Date: Tue, 4 Feb 2020 22:17:25 +0100
+Subject: [PATCH 082/124] wireguard: allowedips: fix use-after-free in
+ root_remove_peer_lists
+
+commit 9981159fc3b677b357f84e069a11de5a5ec8a2a8 upstream.
+
+In the unlikely case a new node could not be allocated, we need to
+remove @newnode from @peer->allowedips_list before freeing it.
+
+syzbot reported:
+
+BUG: KASAN: use-after-free in __list_del_entry_valid+0xdc/0xf5 lib/list_debug.c:54
+Read of size 8 at addr ffff88809881a538 by task syz-executor.4/30133
+
+CPU: 0 PID: 30133 Comm: syz-executor.4 Not tainted 5.5.0-syzkaller #0
+Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
+Call Trace:
+ __dump_stack lib/dump_stack.c:77 [inline]
+ dump_stack+0x197/0x210 lib/dump_stack.c:118
+ print_address_description.constprop.0.cold+0xd4/0x30b mm/kasan/report.c:374
+ __kasan_report.cold+0x1b/0x32 mm/kasan/report.c:506
+ kasan_report+0x12/0x20 mm/kasan/common.c:639
+ __asan_report_load8_noabort+0x14/0x20 mm/kasan/generic_report.c:135
+ __list_del_entry_valid+0xdc/0xf5 lib/list_debug.c:54
+ __list_del_entry include/linux/list.h:132 [inline]
+ list_del include/linux/list.h:146 [inline]
+ root_remove_peer_lists+0x24f/0x4b0 drivers/net/wireguard/allowedips.c:65
+ wg_allowedips_free+0x232/0x390 drivers/net/wireguard/allowedips.c:300
+ wg_peer_remove_all+0xd5/0x620 drivers/net/wireguard/peer.c:187
+ wg_set_device+0xd01/0x1350 drivers/net/wireguard/netlink.c:542
+ genl_family_rcv_msg_doit net/netlink/genetlink.c:672 [inline]
+ genl_family_rcv_msg net/netlink/genetlink.c:717 [inline]
+ genl_rcv_msg+0x67d/0xea0 net/netlink/genetlink.c:734
+ netlink_rcv_skb+0x177/0x450 net/netlink/af_netlink.c:2477
+ genl_rcv+0x29/0x40 net/netlink/genetlink.c:745
+ netlink_unicast_kernel net/netlink/af_netlink.c:1302 [inline]
+ netlink_unicast+0x59e/0x7e0 net/netlink/af_netlink.c:1328
+ netlink_sendmsg+0x91c/0xea0 net/netlink/af_netlink.c:1917
+ sock_sendmsg_nosec net/socket.c:652 [inline]
+ sock_sendmsg+0xd7/0x130 net/socket.c:672
+ ____sys_sendmsg+0x753/0x880 net/socket.c:2343
+ ___sys_sendmsg+0x100/0x170 net/socket.c:2397
+ __sys_sendmsg+0x105/0x1d0 net/socket.c:2430
+ __do_sys_sendmsg net/socket.c:2439 [inline]
+ __se_sys_sendmsg net/socket.c:2437 [inline]
+ __x64_sys_sendmsg+0x78/0xb0 net/socket.c:2437
+ do_syscall_64+0xfa/0x790 arch/x86/entry/common.c:294
+ entry_SYSCALL_64_after_hwframe+0x49/0xbe
+RIP: 0033:0x45b399
+Code: ad b6 fb ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 7b b6 fb ff c3 66 2e 0f 1f 84 00 00 00 00
+RSP: 002b:00007f99a9bcdc78 EFLAGS: 00000246 ORIG_RAX: 000000000000002e
+RAX: ffffffffffffffda RBX: 00007f99a9bce6d4 RCX: 000000000045b399
+RDX: 0000000000000000 RSI: 0000000020001340 RDI: 0000000000000003
+RBP: 000000000075bf20 R08: 0000000000000000 R09: 0000000000000000
+R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000004
+R13: 00000000000009ba R14: 00000000004cb2b8 R15: 0000000000000009
+
+Allocated by task 30103:
+ save_stack+0x23/0x90 mm/kasan/common.c:72
+ set_track mm/kasan/common.c:80 [inline]
+ __kasan_kmalloc mm/kasan/common.c:513 [inline]
+ __kasan_kmalloc.constprop.0+0xcf/0xe0 mm/kasan/common.c:486
+ kasan_kmalloc+0x9/0x10 mm/kasan/common.c:527
+ kmem_cache_alloc_trace+0x158/0x790 mm/slab.c:3551
+ kmalloc include/linux/slab.h:556 [inline]
+ kzalloc include/linux/slab.h:670 [inline]
+ add+0x70a/0x1970 drivers/net/wireguard/allowedips.c:236
+ wg_allowedips_insert_v4+0xf6/0x160 drivers/net/wireguard/allowedips.c:320
+ set_allowedip drivers/net/wireguard/netlink.c:343 [inline]
+ set_peer+0xfb9/0x1150 drivers/net/wireguard/netlink.c:468
+ wg_set_device+0xbd4/0x1350 drivers/net/wireguard/netlink.c:591
+ genl_family_rcv_msg_doit net/netlink/genetlink.c:672 [inline]
+ genl_family_rcv_msg net/netlink/genetlink.c:717 [inline]
+ genl_rcv_msg+0x67d/0xea0 net/netlink/genetlink.c:734
+ netlink_rcv_skb+0x177/0x450 net/netlink/af_netlink.c:2477
+ genl_rcv+0x29/0x40 net/netlink/genetlink.c:745
+ netlink_unicast_kernel net/netlink/af_netlink.c:1302 [inline]
+ netlink_unicast+0x59e/0x7e0 net/netlink/af_netlink.c:1328
+ netlink_sendmsg+0x91c/0xea0 net/netlink/af_netlink.c:1917
+ sock_sendmsg_nosec net/socket.c:652 [inline]
+ sock_sendmsg+0xd7/0x130 net/socket.c:672
+ ____sys_sendmsg+0x753/0x880 net/socket.c:2343
+ ___sys_sendmsg+0x100/0x170 net/socket.c:2397
+ __sys_sendmsg+0x105/0x1d0 net/socket.c:2430
+ __do_sys_sendmsg net/socket.c:2439 [inline]
+ __se_sys_sendmsg net/socket.c:2437 [inline]
+ __x64_sys_sendmsg+0x78/0xb0 net/socket.c:2437
+ do_syscall_64+0xfa/0x790 arch/x86/entry/common.c:294
+ entry_SYSCALL_64_after_hwframe+0x49/0xbe
+
+Freed by task 30103:
+ save_stack+0x23/0x90 mm/kasan/common.c:72
+ set_track mm/kasan/common.c:80 [inline]
+ kasan_set_free_info mm/kasan/common.c:335 [inline]
+ __kasan_slab_free+0x102/0x150 mm/kasan/common.c:474
+ kasan_slab_free+0xe/0x10 mm/kasan/common.c:483
+ __cache_free mm/slab.c:3426 [inline]
+ kfree+0x10a/0x2c0 mm/slab.c:3757
+ add+0x12d2/0x1970 drivers/net/wireguard/allowedips.c:266
+ wg_allowedips_insert_v4+0xf6/0x160 drivers/net/wireguard/allowedips.c:320
+ set_allowedip drivers/net/wireguard/netlink.c:343 [inline]
+ set_peer+0xfb9/0x1150 drivers/net/wireguard/netlink.c:468
+ wg_set_device+0xbd4/0x1350 drivers/net/wireguard/netlink.c:591
+ genl_family_rcv_msg_doit net/netlink/genetlink.c:672 [inline]
+ genl_family_rcv_msg net/netlink/genetlink.c:717 [inline]
+ genl_rcv_msg+0x67d/0xea0 net/netlink/genetlink.c:734
+ netlink_rcv_skb+0x177/0x450 net/netlink/af_netlink.c:2477
+ genl_rcv+0x29/0x40 net/netlink/genetlink.c:745
+ netlink_unicast_kernel net/netlink/af_netlink.c:1302 [inline]
+ netlink_unicast+0x59e/0x7e0 net/netlink/af_netlink.c:1328
+ netlink_sendmsg+0x91c/0xea0 net/netlink/af_netlink.c:1917
+ sock_sendmsg_nosec net/socket.c:652 [inline]
+ sock_sendmsg+0xd7/0x130 net/socket.c:672
+ ____sys_sendmsg+0x753/0x880 net/socket.c:2343
+ ___sys_sendmsg+0x100/0x170 net/socket.c:2397
+ __sys_sendmsg+0x105/0x1d0 net/socket.c:2430
+ __do_sys_sendmsg net/socket.c:2439 [inline]
+ __se_sys_sendmsg net/socket.c:2437 [inline]
+ __x64_sys_sendmsg+0x78/0xb0 net/socket.c:2437
+ do_syscall_64+0xfa/0x790 arch/x86/entry/common.c:294
+ entry_SYSCALL_64_after_hwframe+0x49/0xbe
+
+The buggy address belongs to the object at ffff88809881a500
+ which belongs to the cache kmalloc-64 of size 64
+The buggy address is located 56 bytes inside of
+ 64-byte region [ffff88809881a500, ffff88809881a540)
+The buggy address belongs to the page:
+page:ffffea0002620680 refcount:1 mapcount:0 mapping:ffff8880aa400380 index:0x0
+raw: 00fffe0000000200 ffffea000250b748 ffffea000254bac8 ffff8880aa400380
+raw: 0000000000000000 ffff88809881a000 0000000100000020 0000000000000000
+page dumped because: kasan: bad access detected
+
+Memory state around the buggy address:
+ ffff88809881a400: fb fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc
+ ffff88809881a480: 00 00 00 00 00 fc fc fc fc fc fc fc fc fc fc fc
+>ffff88809881a500: fb fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc
+                                        ^
+ ffff88809881a580: fb fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc
+ ffff88809881a600: 00 00 00 00 00 00 fc fc fc fc fc fc fc fc fc fc
+
+Fixes: e7096c131e51 ("net: WireGuard secure network tunnel")
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Reported-by: syzbot <syzkaller@googlegroups.com>
+Cc: Jason A. Donenfeld <Jason@zx2c4.com>
+Cc: wireguard@lists.zx2c4.com
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ drivers/net/wireguard/allowedips.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/drivers/net/wireguard/allowedips.c
++++ b/drivers/net/wireguard/allowedips.c
+@@ -263,6 +263,7 @@ static int add(struct allowedips_node __
+       } else {
+               node = kzalloc(sizeof(*node), GFP_KERNEL);
+               if (unlikely(!node)) {
++                      list_del(&newnode->peer_list);
+                       kfree(newnode);
+                       return -ENOMEM;
+               }
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0083-wireguard-noise-reject-peers-with-low-order-public-k.patch b/target/linux/generic/backport-5.4/080-wireguard-0083-wireguard-noise-reject-peers-with-low-order-public-k.patch

new file mode 100644 (file)

index 0000000..113678d
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0083-wireguard-noise-reject-peers-with-low-order-public-k.patch
@@ -0,0 +1,234 @@
+From 1da05ad0bbc51cd226a2297e66b3cc8499803306 Mon Sep 17 00:00:00 2001
+From: "Jason A. Donenfeld" <Jason@zx2c4.com>
+Date: Tue, 4 Feb 2020 22:17:26 +0100
+Subject: [PATCH 083/124] wireguard: noise: reject peers with low order public
+ keys
+
+commit ec31c2676a10e064878927b243fada8c2fb0c03c upstream.
+
+Our static-static calculation returns a failure if the public key is of
+low order. We check for this when peers are added, and don't allow them
+to be added if they're low order, except in the case where we haven't
+yet been given a private key. In that case, we would defer the removal
+of the peer until we're given a private key, since at that point we're
+doing new static-static calculations which incur failures we can act on.
+This meant, however, that we wound up removing peers rather late in the
+configuration flow.
+
+Syzkaller points out that peer_remove calls flush_workqueue, which in
+turn might then wait for sending a handshake initiation to complete.
+Since handshake initiation needs the static identity lock, holding the
+static identity lock while calling peer_remove can result in a rare
+deadlock. We have precisely this case in this situation of late-stage
+peer removal based on an invalid public key. We can't drop the lock when
+removing, because then incoming handshakes might interact with a bogus
+static-static calculation.
+
+While the band-aid patch for this would involve breaking up the peer
+removal into two steps like wg_peer_remove_all does, in order to solve
+the locking issue, there's actually a much more elegant way of fixing
+this:
+
+If the static-static calculation succeeds with one private key, it
+*must* succeed with all others, because all 32-byte strings map to valid
+private keys, thanks to clamping. That means we can get rid of this
+silly dance and locking headaches of removing peers late in the
+configuration flow, and instead just reject them early on, regardless of
+whether the device has yet been assigned a private key. For the case
+where the device doesn't yet have a private key, we safely use zeros
+just for the purposes of checking for low order points by way of
+checking the output of the calculation.
+
+The following PoC will trigger the deadlock:
+
+ip link add wg0 type wireguard
+ip addr add 10.0.0.1/24 dev wg0
+ip link set wg0 up
+ping -f 10.0.0.2 &
+while true; do
+        wg set wg0 private-key /dev/null peer AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA= allowed-ips 10.0.0.0/24 endpoint 10.0.0.3:1234
+        wg set wg0 private-key <(echo AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=)
+done
+
+[    0.949105] ======================================================
+[    0.949550] WARNING: possible circular locking dependency detected
+[    0.950143] 5.5.0-debug+ #18 Not tainted
+[    0.950431] ------------------------------------------------------
+[    0.950959] wg/89 is trying to acquire lock:
+[    0.951252] ffff8880333e2128 ((wq_completion)wg-kex-wg0){+.+.}, at: flush_workqueue+0xe3/0x12f0
+[    0.951865]
+[    0.951865] but task is already holding lock:
+[    0.952280] ffff888032819bc0 (&wg->static_identity.lock){++++}, at: wg_set_device+0x95d/0xcc0
+[    0.953011]
+[    0.953011] which lock already depends on the new lock.
+[    0.953011]
+[    0.953651]
+[    0.953651] the existing dependency chain (in reverse order) is:
+[    0.954292]
+[    0.954292] -> #2 (&wg->static_identity.lock){++++}:
+[    0.954804]        lock_acquire+0x127/0x350
+[    0.955133]        down_read+0x83/0x410
+[    0.955428]        wg_noise_handshake_create_initiation+0x97/0x700
+[    0.955885]        wg_packet_send_handshake_initiation+0x13a/0x280
+[    0.956401]        wg_packet_handshake_send_worker+0x10/0x20
+[    0.956841]        process_one_work+0x806/0x1500
+[    0.957167]        worker_thread+0x8c/0xcb0
+[    0.957549]        kthread+0x2ee/0x3b0
+[    0.957792]        ret_from_fork+0x24/0x30
+[    0.958234]
+[    0.958234] -> #1 ((work_completion)(&peer->transmit_handshake_work)){+.+.}:
+[    0.958808]        lock_acquire+0x127/0x350
+[    0.959075]        process_one_work+0x7ab/0x1500
+[    0.959369]        worker_thread+0x8c/0xcb0
+[    0.959639]        kthread+0x2ee/0x3b0
+[    0.959896]        ret_from_fork+0x24/0x30
+[    0.960346]
+[    0.960346] -> #0 ((wq_completion)wg-kex-wg0){+.+.}:
+[    0.960945]        check_prev_add+0x167/0x1e20
+[    0.961351]        __lock_acquire+0x2012/0x3170
+[    0.961725]        lock_acquire+0x127/0x350
+[    0.961990]        flush_workqueue+0x106/0x12f0
+[    0.962280]        peer_remove_after_dead+0x160/0x220
+[    0.962600]        wg_set_device+0xa24/0xcc0
+[    0.962994]        genl_rcv_msg+0x52f/0xe90
+[    0.963298]        netlink_rcv_skb+0x111/0x320
+[    0.963618]        genl_rcv+0x1f/0x30
+[    0.963853]        netlink_unicast+0x3f6/0x610
+[    0.964245]        netlink_sendmsg+0x700/0xb80
+[    0.964586]        __sys_sendto+0x1dd/0x2c0
+[    0.964854]        __x64_sys_sendto+0xd8/0x1b0
+[    0.965141]        do_syscall_64+0x90/0xd9a
+[    0.965408]        entry_SYSCALL_64_after_hwframe+0x49/0xbe
+[    0.965769]
+[    0.965769] other info that might help us debug this:
+[    0.965769]
+[    0.966337] Chain exists of:
+[    0.966337]   (wq_completion)wg-kex-wg0 --> (work_completion)(&peer->transmit_handshake_work) --> &wg->static_identity.lock
+[    0.966337]
+[    0.967417]  Possible unsafe locking scenario:
+[    0.967417]
+[    0.967836]        CPU0                    CPU1
+[    0.968155]        ----                    ----
+[    0.968497]   lock(&wg->static_identity.lock);
+[    0.968779]                                lock((work_completion)(&peer->transmit_handshake_work));
+[    0.969345]                                lock(&wg->static_identity.lock);
+[    0.969809]   lock((wq_completion)wg-kex-wg0);
+[    0.970146]
+[    0.970146]  *** DEADLOCK ***
+[    0.970146]
+[    0.970531] 5 locks held by wg/89:
+[    0.970908]  #0: ffffffff827433c8 (cb_lock){++++}, at: genl_rcv+0x10/0x30
+[    0.971400]  #1: ffffffff82743480 (genl_mutex){+.+.}, at: genl_rcv_msg+0x642/0xe90
+[    0.971924]  #2: ffffffff827160c0 (rtnl_mutex){+.+.}, at: wg_set_device+0x9f/0xcc0
+[    0.972488]  #3: ffff888032819de0 (&wg->device_update_lock){+.+.}, at: wg_set_device+0xb0/0xcc0
+[    0.973095]  #4: ffff888032819bc0 (&wg->static_identity.lock){++++}, at: wg_set_device+0x95d/0xcc0
+[    0.973653]
+[    0.973653] stack backtrace:
+[    0.973932] CPU: 1 PID: 89 Comm: wg Not tainted 5.5.0-debug+ #18
+[    0.974476] Call Trace:
+[    0.974638]  dump_stack+0x97/0xe0
+[    0.974869]  check_noncircular+0x312/0x3e0
+[    0.975132]  ? print_circular_bug+0x1f0/0x1f0
+[    0.975410]  ? __kernel_text_address+0x9/0x30
+[    0.975727]  ? unwind_get_return_address+0x51/0x90
+[    0.976024]  check_prev_add+0x167/0x1e20
+[    0.976367]  ? graph_lock+0x70/0x160
+[    0.976682]  __lock_acquire+0x2012/0x3170
+[    0.976998]  ? register_lock_class+0x1140/0x1140
+[    0.977323]  lock_acquire+0x127/0x350
+[    0.977627]  ? flush_workqueue+0xe3/0x12f0
+[    0.977890]  flush_workqueue+0x106/0x12f0
+[    0.978147]  ? flush_workqueue+0xe3/0x12f0
+[    0.978410]  ? find_held_lock+0x2c/0x110
+[    0.978662]  ? lock_downgrade+0x6e0/0x6e0
+[    0.978919]  ? queue_rcu_work+0x60/0x60
+[    0.979166]  ? netif_napi_del+0x151/0x3b0
+[    0.979501]  ? peer_remove_after_dead+0x160/0x220
+[    0.979871]  peer_remove_after_dead+0x160/0x220
+[    0.980232]  wg_set_device+0xa24/0xcc0
+[    0.980516]  ? deref_stack_reg+0x8e/0xc0
+[    0.980801]  ? set_peer+0xe10/0xe10
+[    0.981040]  ? __ww_mutex_check_waiters+0x150/0x150
+[    0.981430]  ? __nla_validate_parse+0x163/0x270
+[    0.981719]  ? genl_family_rcv_msg_attrs_parse+0x13f/0x310
+[    0.982078]  genl_rcv_msg+0x52f/0xe90
+[    0.982348]  ? genl_family_rcv_msg_attrs_parse+0x310/0x310
+[    0.982690]  ? register_lock_class+0x1140/0x1140
+[    0.983049]  netlink_rcv_skb+0x111/0x320
+[    0.983298]  ? genl_family_rcv_msg_attrs_parse+0x310/0x310
+[    0.983645]  ? netlink_ack+0x880/0x880
+[    0.983888]  genl_rcv+0x1f/0x30
+[    0.984168]  netlink_unicast+0x3f6/0x610
+[    0.984443]  ? netlink_detachskb+0x60/0x60
+[    0.984729]  ? find_held_lock+0x2c/0x110
+[    0.984976]  netlink_sendmsg+0x700/0xb80
+[    0.985220]  ? netlink_broadcast_filtered+0xa60/0xa60
+[    0.985533]  __sys_sendto+0x1dd/0x2c0
+[    0.985763]  ? __x64_sys_getpeername+0xb0/0xb0
+[    0.986039]  ? sockfd_lookup_light+0x17/0x160
+[    0.986397]  ? __sys_recvmsg+0x8c/0xf0
+[    0.986711]  ? __sys_recvmsg_sock+0xd0/0xd0
+[    0.987018]  __x64_sys_sendto+0xd8/0x1b0
+[    0.987283]  ? lockdep_hardirqs_on+0x39b/0x5a0
+[    0.987666]  do_syscall_64+0x90/0xd9a
+[    0.987903]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
+[    0.988223] RIP: 0033:0x7fe77c12003e
+[    0.988508] Code: c3 8b 07 85 c0 75 24 49 89 fb 48 89 f0 48 89 d7 48 89 ce 4c 89 c2 4d 89 ca 4c 8b 44 24 08 4c 8b 4c 24 10 4c 4
+[    0.989666] RSP: 002b:00007fffada2ed58 EFLAGS: 00000246 ORIG_RAX: 000000000000002c
+[    0.990137] RAX: ffffffffffffffda RBX: 00007fe77c159d48 RCX: 00007fe77c12003e
+[    0.990583] RDX: 0000000000000040 RSI: 000055fd1d38e020 RDI: 0000000000000004
+[    0.991091] RBP: 000055fd1d38e020 R08: 000055fd1cb63358 R09: 000000000000000c
+[    0.991568] R10: 0000000000000000 R11: 0000000000000246 R12: 000000000000002c
+[    0.992014] R13: 0000000000000004 R14: 000055fd1d38e020 R15: 0000000000000001
+
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Reported-by: syzbot <syzkaller@googlegroups.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ drivers/net/wireguard/netlink.c |  6 ++----
+ drivers/net/wireguard/noise.c   | 10 +++++++---
+ 2 files changed, 9 insertions(+), 7 deletions(-)
+
+--- a/drivers/net/wireguard/netlink.c
++++ b/drivers/net/wireguard/netlink.c
+@@ -575,10 +575,8 @@ static int wg_set_device(struct sk_buff
+                                                        private_key);
+               list_for_each_entry_safe(peer, temp, &wg->peer_list,
+                                        peer_list) {
+-                      if (wg_noise_precompute_static_static(peer))
+-                              wg_noise_expire_current_peer_keypairs(peer);
+-                      else
+-                              wg_peer_remove(peer);
++                      BUG_ON(!wg_noise_precompute_static_static(peer));
++                      wg_noise_expire_current_peer_keypairs(peer);
+               }
+               wg_cookie_checker_precompute_device_keys(&wg->cookie_checker);
+               up_write(&wg->static_identity.lock);
+--- a/drivers/net/wireguard/noise.c
++++ b/drivers/net/wireguard/noise.c
+@@ -46,17 +46,21 @@ void __init wg_noise_init(void)
+ /* Must hold peer->handshake.static_identity->lock */
+ bool wg_noise_precompute_static_static(struct wg_peer *peer)
+ {
+-      bool ret = true;
++      bool ret;
+ 
+       down_write(&peer->handshake.lock);
+-      if (peer->handshake.static_identity->has_identity)
++      if (peer->handshake.static_identity->has_identity) {
+               ret = curve25519(
+                       peer->handshake.precomputed_static_static,
+                       peer->handshake.static_identity->static_private,
+                       peer->handshake.remote_static);
+-      else
++      } else {
++              u8 empty[NOISE_PUBLIC_KEY_LEN] = { 0 };
++
++              ret = curve25519(empty, empty, peer->handshake.remote_static);
+               memset(peer->handshake.precomputed_static_static, 0,
+                      NOISE_PUBLIC_KEY_LEN);
++      }
+       up_write(&peer->handshake.lock);
+       return ret;
+ }
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0084-wireguard-selftests-ensure-non-addition-of-peers-wit.patch b/target/linux/generic/backport-5.4/080-wireguard-0084-wireguard-selftests-ensure-non-addition-of-peers-wit.patch

new file mode 100644 (file)

index 0000000..d6ad3be
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0084-wireguard-selftests-ensure-non-addition-of-peers-wit.patch
@@ -0,0 +1,34 @@
+From eb6a11e6d69912d8bb0b951b08f6871785cfe0e9 Mon Sep 17 00:00:00 2001
+From: "Jason A. Donenfeld" <Jason@zx2c4.com>
+Date: Tue, 4 Feb 2020 22:17:27 +0100
+Subject: [PATCH 084/124] wireguard: selftests: ensure non-addition of peers
+ with failed precomputation
+
+commit f9398acba6a4ae9cb98bfe4d56414d376eff8d57 upstream.
+
+Ensure that peers with low order points are ignored, both in the case
+where we already have a device private key and in the case where we do
+not. This adds points that naturally give a zero output.
+
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ tools/testing/selftests/wireguard/netns.sh | 6 ++++++
+ 1 file changed, 6 insertions(+)
+
+--- a/tools/testing/selftests/wireguard/netns.sh
++++ b/tools/testing/selftests/wireguard/netns.sh
+@@ -516,6 +516,12 @@ n0 wg set wg0 peer "$pub2" allowed-ips 0
+ n0 wg set wg0 peer "$pub2" allowed-ips 0.0.0.0/0
+ n0 wg set wg0 peer "$pub2" allowed-ips ::/0,1700::/111,5000::/4,e000::/37,9000::/75
+ n0 wg set wg0 peer "$pub2" allowed-ips ::/0
++n0 wg set wg0 peer "$pub2" remove
++low_order_points=( AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA= AQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA= 4Ot6fDtBuK4WVuP68Z/EatoJjeucMrH9hmIFFl9JuAA= X5yVvKNQjCSx0LFVnIPvWwREXMRYHI6G2CJO3dCfEVc= 7P///////////////////////////////////////38= 7f///////////////////////////////////////38= 7v///////////////////////////////////////38= )
++n0 wg set wg0 private-key /dev/null ${low_order_points[@]/#/peer }
++[[ -z $(n0 wg show wg0 peers) ]]
++n0 wg set wg0 private-key <(echo "$key1") ${low_order_points[@]/#/peer }
++[[ -z $(n0 wg show wg0 peers) ]]
+ ip0 link del wg0
+ 
+ declare -A objects
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0085-wireguard-selftests-tie-socket-waiting-to-target-pid.patch b/target/linux/generic/backport-5.4/080-wireguard-0085-wireguard-selftests-tie-socket-waiting-to-target-pid.patch

new file mode 100644 (file)

index 0000000..c891f8f
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0085-wireguard-selftests-tie-socket-waiting-to-target-pid.patch
@@ -0,0 +1,78 @@
+From d95179eade4bc805455dd5e6617db5e387004d13 Mon Sep 17 00:00:00 2001
+From: "Jason A. Donenfeld" <Jason@zx2c4.com>
+Date: Tue, 4 Feb 2020 22:17:29 +0100
+Subject: [PATCH 085/124] wireguard: selftests: tie socket waiting to target
+ pid
+
+commit 88f404a9b1d75388225b1c67b6dd327cb2182777 upstream.
+
+Without this, we wind up proceeding too early sometimes when the
+previous process has just used the same listening port. So, we tie the
+listening socket query to the specific pid we're interested in.
+
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ tools/testing/selftests/wireguard/netns.sh | 17 ++++++++---------
+ 1 file changed, 8 insertions(+), 9 deletions(-)
+
+--- a/tools/testing/selftests/wireguard/netns.sh
++++ b/tools/testing/selftests/wireguard/netns.sh
+@@ -38,9 +38,8 @@ ip0() { pretty 0 "ip $*"; ip -n $netns0
+ ip1() { pretty 1 "ip $*"; ip -n $netns1 "$@"; }
+ ip2() { pretty 2 "ip $*"; ip -n $netns2 "$@"; }
+ sleep() { read -t "$1" -N 1 || true; }
+-waitiperf() { pretty "${1//*-}" "wait for iperf:5201"; while [[ $(ss -N "$1" -tlp 'sport = 5201') != *iperf3* ]]; do sleep 0.1; done; }
+-waitncatudp() { pretty "${1//*-}" "wait for udp:1111"; while [[ $(ss -N "$1" -ulp 'sport = 1111') != *ncat* ]]; do sleep 0.1; done; }
+-waitncattcp() { pretty "${1//*-}" "wait for tcp:1111"; while [[ $(ss -N "$1" -tlp 'sport = 1111') != *ncat* ]]; do sleep 0.1; done; }
++waitiperf() { pretty "${1//*-}" "wait for iperf:5201 pid $2"; while [[ $(ss -N "$1" -tlpH 'sport = 5201') != *\"iperf3\",pid=$2,fd=* ]]; do sleep 0.1; done; }
++waitncatudp() { pretty "${1//*-}" "wait for udp:1111 pid $2"; while [[ $(ss -N "$1" -ulpH 'sport = 1111') != *\"ncat\",pid=$2,fd=* ]]; do sleep 0.1; done; }
+ waitiface() { pretty "${1//*-}" "wait for $2 to come up"; ip netns exec "$1" bash -c "while [[ \$(< \"/sys/class/net/$2/operstate\") != up ]]; do read -t .1 -N 0 || true; done;"; }
+ 
+ cleanup() {
+@@ -119,22 +118,22 @@ tests() {
+ 
+       # TCP over IPv4
+       n2 iperf3 -s -1 -B 192.168.241.2 &
+-      waitiperf $netns2
++      waitiperf $netns2 $!
+       n1 iperf3 -Z -t 3 -c 192.168.241.2
+ 
+       # TCP over IPv6
+       n1 iperf3 -s -1 -B fd00::1 &
+-      waitiperf $netns1
++      waitiperf $netns1 $!
+       n2 iperf3 -Z -t 3 -c fd00::1
+ 
+       # UDP over IPv4
+       n1 iperf3 -s -1 -B 192.168.241.1 &
+-      waitiperf $netns1
++      waitiperf $netns1 $!
+       n2 iperf3 -Z -t 3 -b 0 -u -c 192.168.241.1
+ 
+       # UDP over IPv6
+       n2 iperf3 -s -1 -B fd00::2 &
+-      waitiperf $netns2
++      waitiperf $netns2 $!
+       n1 iperf3 -Z -t 3 -b 0 -u -c fd00::2
+ }
+ 
+@@ -207,7 +206,7 @@ n1 ping -W 1 -c 1 192.168.241.2
+ n1 wg set wg0 peer "$pub2" allowed-ips 192.168.241.0/24
+ exec 4< <(n1 ncat -l -u -p 1111)
+ ncat_pid=$!
+-waitncatudp $netns1
++waitncatudp $netns1 $ncat_pid
+ n2 ncat -u 192.168.241.1 1111 <<<"X"
+ read -r -N 1 -t 1 out <&4 && [[ $out == "X" ]]
+ kill $ncat_pid
+@@ -216,7 +215,7 @@ n1 wg set wg0 peer "$more_specific_key"
+ n2 wg set wg0 listen-port 9997
+ exec 4< <(n1 ncat -l -u -p 1111)
+ ncat_pid=$!
+-waitncatudp $netns1
++waitncatudp $netns1 $ncat_pid
+ n2 ncat -u 192.168.241.1 1111 <<<"X"
+ ! read -r -N 1 -t 1 out <&4 || false
+ kill $ncat_pid
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0086-wireguard-device-use-icmp_ndo_send-helper.patch b/target/linux/generic/backport-5.4/080-wireguard-0086-wireguard-device-use-icmp_ndo_send-helper.patch

new file mode 100644 (file)

index 0000000..32bb799
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0086-wireguard-device-use-icmp_ndo_send-helper.patch
@@ -0,0 +1,64 @@
+From a16efc93a9c12bbfbff6d50811332e687cc527a9 Mon Sep 17 00:00:00 2001
+From: "Jason A. Donenfeld" <Jason@zx2c4.com>
+Date: Tue, 11 Feb 2020 20:47:08 +0100
+Subject: [PATCH 086/124] wireguard: device: use icmp_ndo_send helper
+
+commit a12d7f3cbdc72c7625881c8dc2660fc2c979fdf2 upstream.
+
+Because wireguard is calling icmp from network device context, it should
+use the ndo helper so that the rate limiting applies correctly.  This
+commit adds a small test to the wireguard test suite to ensure that the
+new functions continue doing the right thing in the context of
+wireguard. It does this by setting up a condition that will definitely
+evoke an icmp error message from the driver, but along a nat'd path.
+
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ drivers/net/wireguard/device.c             |  4 ++--
+ tools/testing/selftests/wireguard/netns.sh | 11 +++++++++++
+ 2 files changed, 13 insertions(+), 2 deletions(-)
+
+--- a/drivers/net/wireguard/device.c
++++ b/drivers/net/wireguard/device.c
+@@ -203,9 +203,9 @@ err_peer:
+ err:
+       ++dev->stats.tx_errors;
+       if (skb->protocol == htons(ETH_P_IP))
+-              icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
++              icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
+       else if (skb->protocol == htons(ETH_P_IPV6))
+-              icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
++              icmpv6_ndo_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
+       kfree_skb(skb);
+       return ret;
+ }
+--- a/tools/testing/selftests/wireguard/netns.sh
++++ b/tools/testing/selftests/wireguard/netns.sh
+@@ -24,6 +24,7 @@
+ set -e
+ 
+ exec 3>&1
++export LANG=C
+ export WG_HIDE_KEYS=never
+ netns0="wg-test-$$-0"
+ netns1="wg-test-$$-1"
+@@ -297,7 +298,17 @@ ip1 -4 rule add table main suppress_pref
+ n1 ping -W 1 -c 100 -f 192.168.99.7
+ n1 ping -W 1 -c 100 -f abab::1111
+ 
++# Have ns2 NAT into wg0 packets from ns0, but return an icmp error along the right route.
++n2 iptables -t nat -A POSTROUTING -s 10.0.0.0/24 -d 192.168.241.0/24 -j SNAT --to 192.168.241.2
++n0 iptables -t filter -A INPUT \! -s 10.0.0.0/24 -i vethrs -j DROP # Manual rpfilter just to be explicit.
++n2 bash -c 'printf 1 > /proc/sys/net/ipv4/ip_forward'
++ip0 -4 route add 192.168.241.1 via 10.0.0.100
++n2 wg set wg0 peer "$pub1" remove
++[[ $(! n0 ping -W 1 -c 1 192.168.241.1 || false) == *"From 10.0.0.100 icmp_seq=1 Destination Host Unreachable"* ]]
++
+ n0 iptables -t nat -F
++n0 iptables -t filter -F
++n2 iptables -t nat -F
+ ip0 link del vethrc
+ ip0 link del vethrs
+ ip1 link del wg0
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0087-wireguard-selftests-reduce-complexity-and-fix-make-r.patch b/target/linux/generic/backport-5.4/080-wireguard-0087-wireguard-selftests-reduce-complexity-and-fix-make-r.patch

new file mode 100644 (file)

index 0000000..6ef752c
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0087-wireguard-selftests-reduce-complexity-and-fix-make-r.patch
@@ -0,0 +1,105 @@
+From 871a6ff0cd8f9edad483b8f467c0abe6cff32390 Mon Sep 17 00:00:00 2001
+From: "Jason A. Donenfeld" <Jason@zx2c4.com>
+Date: Fri, 14 Feb 2020 23:57:20 +0100
+Subject: [PATCH 087/124] wireguard: selftests: reduce complexity and fix make
+ races
+
+commit 04ddf1208f03e1dbc39a4619c40eba640051b950 upstream.
+
+This gives us fewer dependencies and shortens build time, fixes up some
+hash checking race conditions, and also fixes missing directory creation
+that caused issues on massively parallel builds.
+
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ .../testing/selftests/wireguard/qemu/Makefile | 38 +++++++------------
+ 1 file changed, 14 insertions(+), 24 deletions(-)
+
+--- a/tools/testing/selftests/wireguard/qemu/Makefile
++++ b/tools/testing/selftests/wireguard/qemu/Makefile
+@@ -38,19 +38,17 @@ endef
+ define file_download =
+ $(DISTFILES_PATH)/$(1):
+       mkdir -p $(DISTFILES_PATH)
+-      flock -x $$@.lock -c '[ -f $$@ ] && exit 0; wget -O $$@.tmp $(MIRROR)$(1) || wget -O $$@.tmp $(2)$(1) || rm -f $$@.tmp'
+-      if echo "$(3)  $$@.tmp" | sha256sum -c -; then mv $$@.tmp $$@; else rm -f $$@.tmp; exit 71; fi
++      flock -x $$@.lock -c '[ -f $$@ ] && exit 0; wget -O $$@.tmp $(MIRROR)$(1) || wget -O $$@.tmp $(2)$(1) || rm -f $$@.tmp; [ -f $$@.tmp ] || exit 1; if echo "$(3)  $$@.tmp" | sha256sum -c -; then mv $$@.tmp $$@; else rm -f $$@.tmp; exit 71; fi'
+ endef
+ 
+ $(eval $(call tar_download,MUSL,musl,1.1.24,.tar.gz,https://www.musl-libc.org/releases/,1370c9a812b2cf2a7d92802510cca0058cc37e66a7bedd70051f0a34015022a3))
+-$(eval $(call tar_download,LIBMNL,libmnl,1.0.4,.tar.bz2,https://www.netfilter.org/projects/libmnl/files/,171f89699f286a5854b72b91d06e8f8e3683064c5901fb09d954a9ab6f551f81))
+ $(eval $(call tar_download,IPERF,iperf,3.7,.tar.gz,https://downloads.es.net/pub/iperf/,d846040224317caf2f75c843d309a950a7db23f9b44b94688ccbe557d6d1710c))
+ $(eval $(call tar_download,BASH,bash,5.0,.tar.gz,https://ftp.gnu.org/gnu/bash/,b4a80f2ac66170b2913efbfb9f2594f1f76c7b1afd11f799e22035d63077fb4d))
+ $(eval $(call tar_download,IPROUTE2,iproute2,5.4.0,.tar.xz,https://www.kernel.org/pub/linux/utils/net/iproute2/,fe97aa60a0d4c5ac830be18937e18dc3400ca713a33a89ad896ff1e3d46086ae))
+ $(eval $(call tar_download,IPTABLES,iptables,1.8.4,.tar.bz2,https://www.netfilter.org/projects/iptables/files/,993a3a5490a544c2cbf2ef15cf7e7ed21af1845baf228318d5c36ef8827e157c))
+ $(eval $(call tar_download,NMAP,nmap,7.80,.tar.bz2,https://nmap.org/dist/,fcfa5a0e42099e12e4bf7a68ebe6fde05553383a682e816a7ec9256ab4773faa))
+ $(eval $(call tar_download,IPUTILS,iputils,s20190709,.tar.gz,https://github.com/iputils/iputils/archive/s20190709.tar.gz/#,a15720dd741d7538dd2645f9f516d193636ae4300ff7dbc8bfca757bf166490a))
+-$(eval $(call tar_download,WIREGUARD_TOOLS,wireguard-tools,1.0.20191226,.tar.xz,https://git.zx2c4.com/wireguard-tools/snapshot/,aa8af0fdc9872d369d8c890a84dbc2a2466b55795dccd5b47721b2d97644b04f))
++$(eval $(call tar_download,WIREGUARD_TOOLS,wireguard-tools,1.0.20200206,.tar.xz,https://git.zx2c4.com/wireguard-tools/snapshot/,f5207248c6a3c3e3bfc9ab30b91c1897b00802ed861e1f9faaed873366078c64))
+ 
+ KERNEL_BUILD_PATH := $(BUILD_PATH)/kernel$(if $(findstring yes,$(DEBUG_KERNEL)),-debug)
+ rwildcard=$(foreach d,$(wildcard $1*),$(call rwildcard,$d/,$2) $(filter $(subst *,%,$2),$d))
+@@ -295,21 +293,13 @@ $(IPERF_PATH)/src/iperf3: | $(IPERF_PATH
+       $(MAKE) -C $(IPERF_PATH)
+       $(STRIP) -s $@
+ 
+-$(LIBMNL_PATH)/.installed: $(LIBMNL_TAR)
+-      flock -s $<.lock tar -C $(BUILD_PATH) -xf $<
+-      touch $@
+-
+-$(LIBMNL_PATH)/src/.libs/libmnl.a: | $(LIBMNL_PATH)/.installed $(USERSPACE_DEPS)
+-      cd $(LIBMNL_PATH) && ./configure --prefix=/ $(CROSS_COMPILE_FLAG) --enable-static --disable-shared
+-      $(MAKE) -C $(LIBMNL_PATH)
+-      sed -i 's:prefix=.*:prefix=$(LIBMNL_PATH):' $(LIBMNL_PATH)/libmnl.pc
+-
+ $(WIREGUARD_TOOLS_PATH)/.installed: $(WIREGUARD_TOOLS_TAR)
++      mkdir -p $(BUILD_PATH)
+       flock -s $<.lock tar -C $(BUILD_PATH) -xf $<
+       touch $@
+ 
+-$(WIREGUARD_TOOLS_PATH)/src/wg: | $(WIREGUARD_TOOLS_PATH)/.installed $(LIBMNL_PATH)/src/.libs/libmnl.a $(USERSPACE_DEPS)
+-      LDFLAGS="$(LDFLAGS) -L$(LIBMNL_PATH)/src/.libs" $(MAKE) -C $(WIREGUARD_TOOLS_PATH)/src LIBMNL_CFLAGS="-I$(LIBMNL_PATH)/include" LIBMNL_LDLIBS="-lmnl" wg
++$(WIREGUARD_TOOLS_PATH)/src/wg: | $(WIREGUARD_TOOLS_PATH)/.installed $(USERSPACE_DEPS)
++      $(MAKE) -C $(WIREGUARD_TOOLS_PATH)/src wg
+       $(STRIP) -s $@
+ 
+ $(BUILD_PATH)/init: init.c | $(USERSPACE_DEPS)
+@@ -340,17 +330,17 @@ $(BASH_PATH)/bash: | $(BASH_PATH)/.insta
+ $(IPROUTE2_PATH)/.installed: $(IPROUTE2_TAR)
+       mkdir -p $(BUILD_PATH)
+       flock -s $<.lock tar -C $(BUILD_PATH) -xf $<
+-      printf 'CC:=$(CC)\nPKG_CONFIG:=pkg-config\nTC_CONFIG_XT:=n\nTC_CONFIG_ATM:=n\nTC_CONFIG_IPSET:=n\nIP_CONFIG_SETNS:=y\nHAVE_ELF:=n\nHAVE_MNL:=y\nHAVE_BERKELEY_DB:=n\nHAVE_LATEX:=n\nHAVE_PDFLATEX:=n\nCFLAGS+=-DHAVE_SETNS -DHAVE_LIBMNL -I$(LIBMNL_PATH)/include\nLDLIBS+=-lmnl' > $(IPROUTE2_PATH)/config.mk
++      printf 'CC:=$(CC)\nPKG_CONFIG:=pkg-config\nTC_CONFIG_XT:=n\nTC_CONFIG_ATM:=n\nTC_CONFIG_IPSET:=n\nIP_CONFIG_SETNS:=y\nHAVE_ELF:=n\nHAVE_MNL:=n\nHAVE_BERKELEY_DB:=n\nHAVE_LATEX:=n\nHAVE_PDFLATEX:=n\nCFLAGS+=-DHAVE_SETNS\n' > $(IPROUTE2_PATH)/config.mk
+       printf 'lib: snapshot\n\t$$(MAKE) -C lib\nip/ip: lib\n\t$$(MAKE) -C ip ip\nmisc/ss: lib\n\t$$(MAKE) -C misc ss\n' >> $(IPROUTE2_PATH)/Makefile
+       touch $@
+ 
+-$(IPROUTE2_PATH)/ip/ip: | $(IPROUTE2_PATH)/.installed $(LIBMNL_PATH)/src/.libs/libmnl.a $(USERSPACE_DEPS)
+-      LDFLAGS="$(LDFLAGS) -L$(LIBMNL_PATH)/src/.libs" PKG_CONFIG_LIBDIR="$(LIBMNL_PATH)" $(MAKE) -C $(IPROUTE2_PATH) PREFIX=/ ip/ip
+-      $(STRIP) -s $(IPROUTE2_PATH)/ip/ip
+-
+-$(IPROUTE2_PATH)/misc/ss: | $(IPROUTE2_PATH)/.installed $(LIBMNL_PATH)/src/.libs/libmnl.a $(USERSPACE_DEPS)
+-      LDFLAGS="$(LDFLAGS) -L$(LIBMNL_PATH)/src/.libs" PKG_CONFIG_LIBDIR="$(LIBMNL_PATH)" $(MAKE) -C $(IPROUTE2_PATH) PREFIX=/ misc/ss
+-      $(STRIP) -s $(IPROUTE2_PATH)/misc/ss
++$(IPROUTE2_PATH)/ip/ip: | $(IPROUTE2_PATH)/.installed $(USERSPACE_DEPS)
++      $(MAKE) -C $(IPROUTE2_PATH) PREFIX=/ ip/ip
++      $(STRIP) -s $@
++
++$(IPROUTE2_PATH)/misc/ss: | $(IPROUTE2_PATH)/.installed $(USERSPACE_DEPS)
++      $(MAKE) -C $(IPROUTE2_PATH) PREFIX=/ misc/ss
++      $(STRIP) -s $@
+ 
+ $(IPTABLES_PATH)/.installed: $(IPTABLES_TAR)
+       mkdir -p $(BUILD_PATH)
+@@ -358,8 +348,8 @@ $(IPTABLES_PATH)/.installed: $(IPTABLES_
+       sed -i -e "/nfnetlink=[01]/s:=[01]:=0:" -e "/nfconntrack=[01]/s:=[01]:=0:" $(IPTABLES_PATH)/configure
+       touch $@
+ 
+-$(IPTABLES_PATH)/iptables/xtables-legacy-multi: | $(IPTABLES_PATH)/.installed $(LIBMNL_PATH)/src/.libs/libmnl.a $(USERSPACE_DEPS)
+-      cd $(IPTABLES_PATH) && PKG_CONFIG_LIBDIR="$(LIBMNL_PATH)" ./configure --prefix=/ $(CROSS_COMPILE_FLAG) --enable-static --disable-shared --disable-nftables --disable-bpf-compiler --disable-nfsynproxy --disable-libipq --with-kernel=$(BUILD_PATH)/include
++$(IPTABLES_PATH)/iptables/xtables-legacy-multi: | $(IPTABLES_PATH)/.installed $(USERSPACE_DEPS)
++      cd $(IPTABLES_PATH) && ./configure --prefix=/ $(CROSS_COMPILE_FLAG) --enable-static --disable-shared --disable-nftables --disable-bpf-compiler --disable-nfsynproxy --disable-libipq --disable-connlabel --with-kernel=$(BUILD_PATH)/include
+       $(MAKE) -C $(IPTABLES_PATH)
+       $(STRIP) -s $@
+ 
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0088-wireguard-receive-reset-last_under_load-to-zero.patch b/target/linux/generic/backport-5.4/080-wireguard-0088-wireguard-receive-reset-last_under_load-to-zero.patch

new file mode 100644 (file)

index 0000000..2f3e6a3
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0088-wireguard-receive-reset-last_under_load-to-zero.patch
@@ -0,0 +1,38 @@
+From b3969f204f6f3e1b712d4892050abf35ad178ccc Mon Sep 17 00:00:00 2001
+From: "Jason A. Donenfeld" <Jason@zx2c4.com>
+Date: Fri, 14 Feb 2020 23:57:21 +0100
+Subject: [PATCH 088/124] wireguard: receive: reset last_under_load to zero
+
+commit 2a8a4df36462aa85b0db87b7c5ea145ba67e34a8 upstream.
+
+This is a small optimization that prevents more expensive comparisons
+from happening when they are no longer necessary, by clearing the
+last_under_load variable whenever we wind up in a state where we were
+under load but we no longer are.
+
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Suggested-by: Matt Dunwoodie <ncon@noconroy.net>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ drivers/net/wireguard/receive.c | 7 +++++--
+ 1 file changed, 5 insertions(+), 2 deletions(-)
+
+--- a/drivers/net/wireguard/receive.c
++++ b/drivers/net/wireguard/receive.c
+@@ -118,10 +118,13 @@ static void wg_receive_handshake_packet(
+ 
+       under_load = skb_queue_len(&wg->incoming_handshakes) >=
+                    MAX_QUEUED_INCOMING_HANDSHAKES / 8;
+-      if (under_load)
++      if (under_load) {
+               last_under_load = ktime_get_coarse_boottime_ns();
+-      else if (last_under_load)
++      } else if (last_under_load) {
+               under_load = !wg_birthdate_has_expired(last_under_load, 1);
++              if (!under_load)
++                      last_under_load = 0;
++      }
+       mac_state = wg_cookie_validate_packet(&wg->cookie_checker, skb,
+                                             under_load);
+       if ((under_load && mac_state == VALID_MAC_WITH_COOKIE) ||
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0089-wireguard-send-account-for-mtu-0-devices.patch b/target/linux/generic/backport-5.4/080-wireguard-0089-wireguard-send-account-for-mtu-0-devices.patch

new file mode 100644 (file)

index 0000000..012a6a1
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0089-wireguard-send-account-for-mtu-0-devices.patch
@@ -0,0 +1,95 @@
+From 6e82ecb98d019209c77c73d0460535f1fcb3d8cc Mon Sep 17 00:00:00 2001
+From: "Jason A. Donenfeld" <Jason@zx2c4.com>
+Date: Fri, 14 Feb 2020 23:57:22 +0100
+Subject: [PATCH 089/124] wireguard: send: account for mtu=0 devices
+
+commit 175f1ca9a9ed8689d2028da1a7c624bb4fb4ff7e upstream.
+
+It turns out there's an easy way to get packets queued up while still
+having an MTU of zero, and that's via persistent keep alive. This commit
+makes sure that in whatever condition, we don't wind up dividing by
+zero. Note that an MTU of zero for a wireguard interface is something
+quasi-valid, so I don't think the correct fix is to limit it via
+min_mtu. This can be reproduced easily with:
+
+ip link add wg0 type wireguard
+ip link add wg1 type wireguard
+ip link set wg0 up mtu 0
+ip link set wg1 up
+wg set wg0 private-key <(wg genkey)
+wg set wg1 listen-port 1 private-key <(wg genkey) peer $(wg show wg0 public-key)
+wg set wg0 peer $(wg show wg1 public-key) persistent-keepalive 1 endpoint 127.0.0.1:1
+
+However, while min_mtu=0 seems fine, it makes sense to restrict the
+max_mtu. This commit also restricts the maximum MTU to the greatest
+number for which rounding up to the padding multiple won't overflow a
+signed integer. Packets this large were always rejected anyway
+eventually, due to checks deeper in, but it seems more sound not to even
+let the administrator configure something that won't work anyway.
+
+We use this opportunity to clean up this function a bit so that it's
+clear which paths we're expecting.
+
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Cc: Eric Dumazet <eric.dumazet@gmail.com>
+Reviewed-by: Eric Dumazet <edumazet@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ drivers/net/wireguard/device.c |  7 ++++---
+ drivers/net/wireguard/send.c   | 16 +++++++++++-----
+ 2 files changed, 15 insertions(+), 8 deletions(-)
+
+--- a/drivers/net/wireguard/device.c
++++ b/drivers/net/wireguard/device.c
+@@ -258,6 +258,8 @@ static void wg_setup(struct net_device *
+       enum { WG_NETDEV_FEATURES = NETIF_F_HW_CSUM | NETIF_F_RXCSUM |
+                                   NETIF_F_SG | NETIF_F_GSO |
+                                   NETIF_F_GSO_SOFTWARE | NETIF_F_HIGHDMA };
++      const int overhead = MESSAGE_MINIMUM_LENGTH + sizeof(struct udphdr) +
++                           max(sizeof(struct ipv6hdr), sizeof(struct iphdr));
+ 
+       dev->netdev_ops = &netdev_ops;
+       dev->hard_header_len = 0;
+@@ -271,9 +273,8 @@ static void wg_setup(struct net_device *
+       dev->features |= WG_NETDEV_FEATURES;
+       dev->hw_features |= WG_NETDEV_FEATURES;
+       dev->hw_enc_features |= WG_NETDEV_FEATURES;
+-      dev->mtu = ETH_DATA_LEN - MESSAGE_MINIMUM_LENGTH -
+-                 sizeof(struct udphdr) -
+-                 max(sizeof(struct ipv6hdr), sizeof(struct iphdr));
++      dev->mtu = ETH_DATA_LEN - overhead;
++      dev->max_mtu = round_down(INT_MAX, MESSAGE_PADDING_MULTIPLE) - overhead;
+ 
+       SET_NETDEV_DEVTYPE(dev, &device_type);
+ 
+--- a/drivers/net/wireguard/send.c
++++ b/drivers/net/wireguard/send.c
+@@ -143,16 +143,22 @@ static void keep_key_fresh(struct wg_pee
+ 
+ static unsigned int calculate_skb_padding(struct sk_buff *skb)
+ {
++      unsigned int padded_size, last_unit = skb->len;
++
++      if (unlikely(!PACKET_CB(skb)->mtu))
++              return ALIGN(last_unit, MESSAGE_PADDING_MULTIPLE) - last_unit;
++
+       /* We do this modulo business with the MTU, just in case the networking
+        * layer gives us a packet that's bigger than the MTU. In that case, we
+        * wouldn't want the final subtraction to overflow in the case of the
+-       * padded_size being clamped.
++       * padded_size being clamped. Fortunately, that's very rarely the case,
++       * so we optimize for that not happening.
+        */
+-      unsigned int last_unit = skb->len % PACKET_CB(skb)->mtu;
+-      unsigned int padded_size = ALIGN(last_unit, MESSAGE_PADDING_MULTIPLE);
++      if (unlikely(last_unit > PACKET_CB(skb)->mtu))
++              last_unit %= PACKET_CB(skb)->mtu;
+ 
+-      if (padded_size > PACKET_CB(skb)->mtu)
+-              padded_size = PACKET_CB(skb)->mtu;
++      padded_size = min(PACKET_CB(skb)->mtu,
++                        ALIGN(last_unit, MESSAGE_PADDING_MULTIPLE));
+       return padded_size - last_unit;
+ }
+ 
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0090-wireguard-socket-remove-extra-call-to-synchronize_ne.patch b/target/linux/generic/backport-5.4/080-wireguard-0090-wireguard-socket-remove-extra-call-to-synchronize_ne.patch

new file mode 100644 (file)

index 0000000..542a9ca
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0090-wireguard-socket-remove-extra-call-to-synchronize_ne.patch
@@ -0,0 +1,33 @@
+From ce6c6fa0d2dd4ca9c500e6240e4f22c48018a0ae Mon Sep 17 00:00:00 2001
+From: "Jason A. Donenfeld" <Jason@zx2c4.com>
+Date: Fri, 14 Feb 2020 23:57:23 +0100
+Subject: [PATCH 090/124] wireguard: socket: remove extra call to
+ synchronize_net
+
+commit 1fbc33b0a7feb6ca72bf7dc8a05d81485ee8ee2e upstream.
+
+synchronize_net() is a wrapper around synchronize_rcu(), so there's no
+point in having synchronize_net and synchronize_rcu back to back,
+despite the documentation comment suggesting maybe it's somewhat useful,
+"Wait for packets currently being received to be done." This commit
+removes the extra call.
+
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Suggested-by: Eric Dumazet <eric.dumazet@gmail.com>
+Reviewed-by: Eric Dumazet <edumazet@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ drivers/net/wireguard/socket.c | 1 -
+ 1 file changed, 1 deletion(-)
+
+--- a/drivers/net/wireguard/socket.c
++++ b/drivers/net/wireguard/socket.c
+@@ -432,7 +432,6 @@ void wg_socket_reinit(struct wg_device *
+               wg->incoming_port = ntohs(inet_sk(new4)->inet_sport);
+       mutex_unlock(&wg->socket_update_lock);
+       synchronize_rcu();
+-      synchronize_net();
+       sock_free(old4);
+       sock_free(old6);
+ }
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0091-wireguard-selftests-remove-duplicated-include-sys-ty.patch b/target/linux/generic/backport-5.4/080-wireguard-0091-wireguard-selftests-remove-duplicated-include-sys-ty.patch

new file mode 100644 (file)

index 0000000..a7b2d70
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0091-wireguard-selftests-remove-duplicated-include-sys-ty.patch
@@ -0,0 +1,28 @@
+From 4c680d3ca400288018c9b9fff0c5df4dbed96e84 Mon Sep 17 00:00:00 2001
+From: YueHaibing <yuehaibing@huawei.com>
+Date: Wed, 18 Mar 2020 18:30:43 -0600
+Subject: [PATCH 091/124] wireguard: selftests: remove duplicated include
+ <sys/types.h>
+
+commit 166391159c5deb84795d2ff46e95f276177fa5fb upstream.
+
+This commit removes a duplicated include.
+
+Signed-off-by: YueHaibing <yuehaibing@huawei.com>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ tools/testing/selftests/wireguard/qemu/init.c | 1 -
+ 1 file changed, 1 deletion(-)
+
+--- a/tools/testing/selftests/wireguard/qemu/init.c
++++ b/tools/testing/selftests/wireguard/qemu/init.c
+@@ -13,7 +13,6 @@
+ #include <fcntl.h>
+ #include <sys/wait.h>
+ #include <sys/mount.h>
+-#include <sys/types.h>
+ #include <sys/stat.h>
+ #include <sys/types.h>
+ #include <sys/io.h>
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0092-wireguard-queueing-account-for-skb-protocol-0.patch b/target/linux/generic/backport-5.4/080-wireguard-0092-wireguard-queueing-account-for-skb-protocol-0.patch

new file mode 100644 (file)

index 0000000..7826e34
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0092-wireguard-queueing-account-for-skb-protocol-0.patch
@@ -0,0 +1,100 @@
+From db7e2e9ced3df1fb9286946914183f6a074a2b92 Mon Sep 17 00:00:00 2001
+From: "Jason A. Donenfeld" <Jason@zx2c4.com>
+Date: Wed, 18 Mar 2020 18:30:45 -0600
+Subject: [PATCH 092/124] wireguard: queueing: account for skb->protocol==0
+
+commit a5588604af448664e796daf3c1d5a4523c60667b upstream.
+
+We carry out checks to the effect of:
+
+  if (skb->protocol != wg_examine_packet_protocol(skb))
+    goto err;
+
+By having wg_skb_examine_untrusted_ip_hdr return 0 on failure, this
+means that the check above still passes in the case where skb->protocol
+is zero, which is possible to hit with AF_PACKET:
+
+  struct sockaddr_pkt saddr = { .spkt_device = "wg0" };
+  unsigned char buffer[5] = { 0 };
+  sendto(socket(AF_PACKET, SOCK_PACKET, /* skb->protocol = */ 0),
+         buffer, sizeof(buffer), 0, (const struct sockaddr *)&saddr, sizeof(saddr));
+
+Additional checks mean that this isn't actually a problem in the code
+base, but I could imagine it becoming a problem later if the function is
+used more liberally.
+
+I would prefer to fix this by having wg_examine_packet_protocol return a
+32-bit ~0 value on failure, which will never match any value of
+skb->protocol, which would simply change the generated code from a mov
+to a movzx. However, sparse complains, and adding __force casts doesn't
+seem like a good idea, so instead we just add a simple helper function
+to check for the zero return value. Since wg_examine_packet_protocol
+itself gets inlined, this winds up not adding an additional branch to
+the generated code, since the 0 return value already happens in a
+mergeable branch.
+
+Reported-by: Fabian Freyer <fabianfreyer@radicallyopensecurity.com>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ drivers/net/wireguard/device.c   | 2 +-
+ drivers/net/wireguard/queueing.h | 8 +++++++-
+ drivers/net/wireguard/receive.c  | 4 ++--
+ 3 files changed, 10 insertions(+), 4 deletions(-)
+
+--- a/drivers/net/wireguard/device.c
++++ b/drivers/net/wireguard/device.c
+@@ -122,7 +122,7 @@ static netdev_tx_t wg_xmit(struct sk_buf
+       u32 mtu;
+       int ret;
+ 
+-      if (unlikely(wg_skb_examine_untrusted_ip_hdr(skb) != skb->protocol)) {
++      if (unlikely(!wg_check_packet_protocol(skb))) {
+               ret = -EPROTONOSUPPORT;
+               net_dbg_ratelimited("%s: Invalid IP packet\n", dev->name);
+               goto err;
+--- a/drivers/net/wireguard/queueing.h
++++ b/drivers/net/wireguard/queueing.h
+@@ -66,7 +66,7 @@ struct packet_cb {
+ #define PACKET_PEER(skb) (PACKET_CB(skb)->keypair->entry.peer)
+ 
+ /* Returns either the correct skb->protocol value, or 0 if invalid. */
+-static inline __be16 wg_skb_examine_untrusted_ip_hdr(struct sk_buff *skb)
++static inline __be16 wg_examine_packet_protocol(struct sk_buff *skb)
+ {
+       if (skb_network_header(skb) >= skb->head &&
+           (skb_network_header(skb) + sizeof(struct iphdr)) <=
+@@ -81,6 +81,12 @@ static inline __be16 wg_skb_examine_untr
+       return 0;
+ }
+ 
++static inline bool wg_check_packet_protocol(struct sk_buff *skb)
++{
++      __be16 real_protocol = wg_examine_packet_protocol(skb);
++      return real_protocol && skb->protocol == real_protocol;
++}
++
+ static inline void wg_reset_packet(struct sk_buff *skb)
+ {
+       skb_scrub_packet(skb, true);
+--- a/drivers/net/wireguard/receive.c
++++ b/drivers/net/wireguard/receive.c
+@@ -56,7 +56,7 @@ static int prepare_skb_header(struct sk_
+       size_t data_offset, data_len, header_len;
+       struct udphdr *udp;
+ 
+-      if (unlikely(wg_skb_examine_untrusted_ip_hdr(skb) != skb->protocol ||
++      if (unlikely(!wg_check_packet_protocol(skb) ||
+                    skb_transport_header(skb) < skb->head ||
+                    (skb_transport_header(skb) + sizeof(struct udphdr)) >
+                            skb_tail_pointer(skb)))
+@@ -388,7 +388,7 @@ static void wg_packet_consume_data_done(
+        */
+       skb->ip_summed = CHECKSUM_UNNECESSARY;
+       skb->csum_level = ~0; /* All levels */
+-      skb->protocol = wg_skb_examine_untrusted_ip_hdr(skb);
++      skb->protocol = wg_examine_packet_protocol(skb);
+       if (skb->protocol == htons(ETH_P_IP)) {
+               len = ntohs(ip_hdr(skb)->tot_len);
+               if (unlikely(len < sizeof(struct iphdr)))
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0093-wireguard-receive-remove-dead-code-from-default-pack.patch b/target/linux/generic/backport-5.4/080-wireguard-0093-wireguard-receive-remove-dead-code-from-default-pack.patch

new file mode 100644 (file)

index 0000000..ed4c4a0
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0093-wireguard-receive-remove-dead-code-from-default-pack.patch
@@ -0,0 +1,35 @@
+From 827489b9186ac53ed1e162c7d9b0f7b19d1a5995 Mon Sep 17 00:00:00 2001
+From: "Jason A. Donenfeld" <Jason@zx2c4.com>
+Date: Wed, 18 Mar 2020 18:30:46 -0600
+Subject: [PATCH 093/124] wireguard: receive: remove dead code from default
+ packet type case
+
+commit 2b8765c52db24c0fbcc81bac9b5e8390f2c7d3c8 upstream.
+
+The situation in which we wind up hitting the default case here
+indicates a major bug in earlier parsing code. It is not a usual thing
+that should ever happen, which means a "friendly" message for it doesn't
+make sense. Rather, replace this with a WARN_ON, just like we do earlier
+in the file for a similar situation, so that somebody sends us a bug
+report and we can fix it.
+
+Reported-by: Fabian Freyer <fabianfreyer@radicallyopensecurity.com>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ drivers/net/wireguard/receive.c | 3 +--
+ 1 file changed, 1 insertion(+), 2 deletions(-)
+
+--- a/drivers/net/wireguard/receive.c
++++ b/drivers/net/wireguard/receive.c
+@@ -587,8 +587,7 @@ void wg_packet_receive(struct wg_device
+               wg_packet_consume_data(wg, skb);
+               break;
+       default:
+-              net_dbg_skb_ratelimited("%s: Invalid packet from %pISpfsc\n",
+-                                      wg->dev->name, skb);
++              WARN(1, "Non-exhaustive parsing of packet header lead to unknown packet type!\n");
+               goto err;
+       }
+       return;
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0094-wireguard-noise-error-out-precomputed-DH-during-hand.patch b/target/linux/generic/backport-5.4/080-wireguard-0094-wireguard-noise-error-out-precomputed-DH-during-hand.patch

new file mode 100644 (file)

index 0000000..c014fc3
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0094-wireguard-noise-error-out-precomputed-DH-during-hand.patch
@@ -0,0 +1,224 @@
+From 3c5c9d96cea67a8dc381e6ca0f5a894f1ce099ea Mon Sep 17 00:00:00 2001
+From: "Jason A. Donenfeld" <Jason@zx2c4.com>
+Date: Wed, 18 Mar 2020 18:30:47 -0600
+Subject: [PATCH 094/124] wireguard: noise: error out precomputed DH during
+ handshake rather than config
+
+commit 11a7686aa99c7fe4b3f80f6dcccd54129817984d upstream.
+
+We precompute the static-static ECDH during configuration time, in order
+to save an expensive computation later when receiving network packets.
+However, not all ECDH computations yield a contributory result. Prior,
+we were just not letting those peers be added to the interface. However,
+this creates a strange inconsistency, since it was still possible to add
+other weird points, like a valid public key plus a low-order point, and,
+like points that result in zeros, a handshake would not complete. In
+order to make the behavior more uniform and less surprising, simply
+allow all peers to be added. Then, we'll error out later when doing the
+crypto if there's an issue. This also adds more separation between the
+crypto layer and the configuration layer.
+
+Discussed-with: Mathias Hall-Andersen <mathias@hall-andersen.dk>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ drivers/net/wireguard/netlink.c            |  8 +---
+ drivers/net/wireguard/noise.c              | 55 ++++++++++++----------
+ drivers/net/wireguard/noise.h              | 12 ++---
+ drivers/net/wireguard/peer.c               |  7 +--
+ tools/testing/selftests/wireguard/netns.sh | 15 ++++--
+ 5 files changed, 49 insertions(+), 48 deletions(-)
+
+--- a/drivers/net/wireguard/netlink.c
++++ b/drivers/net/wireguard/netlink.c
+@@ -417,11 +417,7 @@ static int set_peer(struct wg_device *wg
+ 
+               peer = wg_peer_create(wg, public_key, preshared_key);
+               if (IS_ERR(peer)) {
+-                      /* Similar to the above, if the key is invalid, we skip
+-                       * it without fanfare, so that services don't need to
+-                       * worry about doing key validation themselves.
+-                       */
+-                      ret = PTR_ERR(peer) == -EKEYREJECTED ? 0 : PTR_ERR(peer);
++                      ret = PTR_ERR(peer);
+                       peer = NULL;
+                       goto out;
+               }
+@@ -575,7 +571,7 @@ static int wg_set_device(struct sk_buff
+                                                        private_key);
+               list_for_each_entry_safe(peer, temp, &wg->peer_list,
+                                        peer_list) {
+-                      BUG_ON(!wg_noise_precompute_static_static(peer));
++                      wg_noise_precompute_static_static(peer);
+                       wg_noise_expire_current_peer_keypairs(peer);
+               }
+               wg_cookie_checker_precompute_device_keys(&wg->cookie_checker);
+--- a/drivers/net/wireguard/noise.c
++++ b/drivers/net/wireguard/noise.c
+@@ -44,32 +44,23 @@ void __init wg_noise_init(void)
+ }
+ 
+ /* Must hold peer->handshake.static_identity->lock */
+-bool wg_noise_precompute_static_static(struct wg_peer *peer)
++void wg_noise_precompute_static_static(struct wg_peer *peer)
+ {
+-      bool ret;
+-
+       down_write(&peer->handshake.lock);
+-      if (peer->handshake.static_identity->has_identity) {
+-              ret = curve25519(
+-                      peer->handshake.precomputed_static_static,
++      if (!peer->handshake.static_identity->has_identity ||
++          !curve25519(peer->handshake.precomputed_static_static,
+                       peer->handshake.static_identity->static_private,
+-                      peer->handshake.remote_static);
+-      } else {
+-              u8 empty[NOISE_PUBLIC_KEY_LEN] = { 0 };
+-
+-              ret = curve25519(empty, empty, peer->handshake.remote_static);
++                      peer->handshake.remote_static))
+               memset(peer->handshake.precomputed_static_static, 0,
+                      NOISE_PUBLIC_KEY_LEN);
+-      }
+       up_write(&peer->handshake.lock);
+-      return ret;
+ }
+ 
+-bool wg_noise_handshake_init(struct noise_handshake *handshake,
+-                         struct noise_static_identity *static_identity,
+-                         const u8 peer_public_key[NOISE_PUBLIC_KEY_LEN],
+-                         const u8 peer_preshared_key[NOISE_SYMMETRIC_KEY_LEN],
+-                         struct wg_peer *peer)
++void wg_noise_handshake_init(struct noise_handshake *handshake,
++                           struct noise_static_identity *static_identity,
++                           const u8 peer_public_key[NOISE_PUBLIC_KEY_LEN],
++                           const u8 peer_preshared_key[NOISE_SYMMETRIC_KEY_LEN],
++                           struct wg_peer *peer)
+ {
+       memset(handshake, 0, sizeof(*handshake));
+       init_rwsem(&handshake->lock);
+@@ -81,7 +72,7 @@ bool wg_noise_handshake_init(struct nois
+                      NOISE_SYMMETRIC_KEY_LEN);
+       handshake->static_identity = static_identity;
+       handshake->state = HANDSHAKE_ZEROED;
+-      return wg_noise_precompute_static_static(peer);
++      wg_noise_precompute_static_static(peer);
+ }
+ 
+ static void handshake_zero(struct noise_handshake *handshake)
+@@ -403,6 +394,19 @@ static bool __must_check mix_dh(u8 chain
+       return true;
+ }
+ 
++static bool __must_check mix_precomputed_dh(u8 chaining_key[NOISE_HASH_LEN],
++                                          u8 key[NOISE_SYMMETRIC_KEY_LEN],
++                                          const u8 precomputed[NOISE_PUBLIC_KEY_LEN])
++{
++      static u8 zero_point[NOISE_PUBLIC_KEY_LEN];
++      if (unlikely(!crypto_memneq(precomputed, zero_point, NOISE_PUBLIC_KEY_LEN)))
++              return false;
++      kdf(chaining_key, key, NULL, precomputed, NOISE_HASH_LEN,
++          NOISE_SYMMETRIC_KEY_LEN, 0, NOISE_PUBLIC_KEY_LEN,
++          chaining_key);
++      return true;
++}
++
+ static void mix_hash(u8 hash[NOISE_HASH_LEN], const u8 *src, size_t src_len)
+ {
+       struct blake2s_state blake;
+@@ -531,10 +535,9 @@ wg_noise_handshake_create_initiation(str
+                       NOISE_PUBLIC_KEY_LEN, key, handshake->hash);
+ 
+       /* ss */
+-      kdf(handshake->chaining_key, key, NULL,
+-          handshake->precomputed_static_static, NOISE_HASH_LEN,
+-          NOISE_SYMMETRIC_KEY_LEN, 0, NOISE_PUBLIC_KEY_LEN,
+-          handshake->chaining_key);
++      if (!mix_precomputed_dh(handshake->chaining_key, key,
++                              handshake->precomputed_static_static))
++              goto out;
+ 
+       /* {t} */
+       tai64n_now(timestamp);
+@@ -595,9 +598,9 @@ wg_noise_handshake_consume_initiation(st
+       handshake = &peer->handshake;
+ 
+       /* ss */
+-      kdf(chaining_key, key, NULL, handshake->precomputed_static_static,
+-          NOISE_HASH_LEN, NOISE_SYMMETRIC_KEY_LEN, 0, NOISE_PUBLIC_KEY_LEN,
+-          chaining_key);
++      if (!mix_precomputed_dh(chaining_key, key,
++                              handshake->precomputed_static_static))
++          goto out;
+ 
+       /* {t} */
+       if (!message_decrypt(t, src->encrypted_timestamp,
+--- a/drivers/net/wireguard/noise.h
++++ b/drivers/net/wireguard/noise.h
+@@ -94,11 +94,11 @@ struct noise_handshake {
+ struct wg_device;
+ 
+ void wg_noise_init(void);
+-bool wg_noise_handshake_init(struct noise_handshake *handshake,
+-                         struct noise_static_identity *static_identity,
+-                         const u8 peer_public_key[NOISE_PUBLIC_KEY_LEN],
+-                         const u8 peer_preshared_key[NOISE_SYMMETRIC_KEY_LEN],
+-                         struct wg_peer *peer);
++void wg_noise_handshake_init(struct noise_handshake *handshake,
++                           struct noise_static_identity *static_identity,
++                           const u8 peer_public_key[NOISE_PUBLIC_KEY_LEN],
++                           const u8 peer_preshared_key[NOISE_SYMMETRIC_KEY_LEN],
++                           struct wg_peer *peer);
+ void wg_noise_handshake_clear(struct noise_handshake *handshake);
+ static inline void wg_noise_reset_last_sent_handshake(atomic64_t *handshake_ns)
+ {
+@@ -116,7 +116,7 @@ void wg_noise_expire_current_peer_keypai
+ void wg_noise_set_static_identity_private_key(
+       struct noise_static_identity *static_identity,
+       const u8 private_key[NOISE_PUBLIC_KEY_LEN]);
+-bool wg_noise_precompute_static_static(struct wg_peer *peer);
++void wg_noise_precompute_static_static(struct wg_peer *peer);
+ 
+ bool
+ wg_noise_handshake_create_initiation(struct message_handshake_initiation *dst,
+--- a/drivers/net/wireguard/peer.c
++++ b/drivers/net/wireguard/peer.c
+@@ -34,11 +34,8 @@ struct wg_peer *wg_peer_create(struct wg
+               return ERR_PTR(ret);
+       peer->device = wg;
+ 
+-      if (!wg_noise_handshake_init(&peer->handshake, &wg->static_identity,
+-                                   public_key, preshared_key, peer)) {
+-              ret = -EKEYREJECTED;
+-              goto err_1;
+-      }
++      wg_noise_handshake_init(&peer->handshake, &wg->static_identity,
++                              public_key, preshared_key, peer);
+       if (dst_cache_init(&peer->endpoint_cache, GFP_KERNEL))
+               goto err_1;
+       if (wg_packet_queue_init(&peer->tx_queue, wg_packet_tx_worker, false,
+--- a/tools/testing/selftests/wireguard/netns.sh
++++ b/tools/testing/selftests/wireguard/netns.sh
+@@ -527,11 +527,16 @@ n0 wg set wg0 peer "$pub2" allowed-ips 0
+ n0 wg set wg0 peer "$pub2" allowed-ips ::/0,1700::/111,5000::/4,e000::/37,9000::/75
+ n0 wg set wg0 peer "$pub2" allowed-ips ::/0
+ n0 wg set wg0 peer "$pub2" remove
+-low_order_points=( AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA= AQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA= 4Ot6fDtBuK4WVuP68Z/EatoJjeucMrH9hmIFFl9JuAA= X5yVvKNQjCSx0LFVnIPvWwREXMRYHI6G2CJO3dCfEVc= 7P///////////////////////////////////////38= 7f///////////////////////////////////////38= 7v///////////////////////////////////////38= )
+-n0 wg set wg0 private-key /dev/null ${low_order_points[@]/#/peer }
+-[[ -z $(n0 wg show wg0 peers) ]]
+-n0 wg set wg0 private-key <(echo "$key1") ${low_order_points[@]/#/peer }
+-[[ -z $(n0 wg show wg0 peers) ]]
++for low_order_point in AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA= AQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA= 4Ot6fDtBuK4WVuP68Z/EatoJjeucMrH9hmIFFl9JuAA= X5yVvKNQjCSx0LFVnIPvWwREXMRYHI6G2CJO3dCfEVc= 7P///////////////////////////////////////38= 7f///////////////////////////////////////38= 7v///////////////////////////////////////38=; do
++      n0 wg set wg0 peer "$low_order_point" persistent-keepalive 1 endpoint 127.0.0.1:1111
++done
++[[ -n $(n0 wg show wg0 peers) ]]
++exec 4< <(n0 ncat -l -u -p 1111)
++ncat_pid=$!
++waitncatudp $netns0 $ncat_pid
++ip0 link set wg0 up
++! read -r -n 1 -t 2 <&4 || false
++kill $ncat_pid
+ ip0 link del wg0
+ 
+ declare -A objects
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0095-wireguard-send-remove-errant-newline-from-packet_enc.patch b/target/linux/generic/backport-5.4/080-wireguard-0095-wireguard-send-remove-errant-newline-from-packet_enc.patch

new file mode 100644 (file)

index 0000000..d546cd6
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0095-wireguard-send-remove-errant-newline-from-packet_enc.patch
@@ -0,0 +1,29 @@
+From 271fd6b0c4c81b844b81bbf4dd49d9e1de2827c2 Mon Sep 17 00:00:00 2001
+From: Sultan Alsawaf <sultan@kerneltoast.com>
+Date: Wed, 29 Apr 2020 14:59:20 -0600
+Subject: [PATCH 095/124] wireguard: send: remove errant newline from
+ packet_encrypt_worker
+
+commit d6833e42786e050e7522d6a91a9361e54085897d upstream.
+
+This commit removes a useless newline at the end of a scope, which
+doesn't add anything in the way of organization or readability.
+
+Signed-off-by: Sultan Alsawaf <sultan@kerneltoast.com>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ drivers/net/wireguard/send.c | 1 -
+ 1 file changed, 1 deletion(-)
+
+--- a/drivers/net/wireguard/send.c
++++ b/drivers/net/wireguard/send.c
+@@ -304,7 +304,6 @@ void wg_packet_encrypt_worker(struct wor
+               }
+               wg_queue_enqueue_per_peer(&PACKET_PEER(first)->tx_queue, first,
+                                         state);
+-
+       }
+ }
+ 
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0096-wireguard-queueing-cleanup-ptr_ring-in-error-path-of.patch b/target/linux/generic/backport-5.4/080-wireguard-0096-wireguard-queueing-cleanup-ptr_ring-in-error-path-of.patch

new file mode 100644 (file)

index 0000000..98ff785
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0096-wireguard-queueing-cleanup-ptr_ring-in-error-path-of.patch
@@ -0,0 +1,35 @@
+From 08d5c8fc96361389fdd982477aaf6d7c9311f5e0 Mon Sep 17 00:00:00 2001
+From: "Jason A. Donenfeld" <Jason@zx2c4.com>
+Date: Wed, 29 Apr 2020 14:59:21 -0600
+Subject: [PATCH 096/124] wireguard: queueing: cleanup ptr_ring in error path
+ of packet_queue_init
+
+commit 130c58606171326c81841a49cc913cd354113dd9 upstream.
+
+Prior, if the alloc_percpu of packet_percpu_multicore_worker_alloc
+failed, the previously allocated ptr_ring wouldn't be freed. This commit
+adds the missing call to ptr_ring_cleanup in the error case.
+
+Reported-by: Sultan Alsawaf <sultan@kerneltoast.com>
+Fixes: e7096c131e51 ("net: WireGuard secure network tunnel")
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ drivers/net/wireguard/queueing.c | 4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+--- a/drivers/net/wireguard/queueing.c
++++ b/drivers/net/wireguard/queueing.c
+@@ -35,8 +35,10 @@ int wg_packet_queue_init(struct crypt_qu
+               if (multicore) {
+                       queue->worker = wg_packet_percpu_multicore_worker_alloc(
+                               function, queue);
+-                      if (!queue->worker)
++                      if (!queue->worker) {
++                              ptr_ring_cleanup(&queue->ring, NULL);
+                               return -ENOMEM;
++                      }
+               } else {
+                       INIT_WORK(&queue->work, function);
+               }
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0097-wireguard-receive-use-tunnel-helpers-for-decapsulati.patch b/target/linux/generic/backport-5.4/080-wireguard-0097-wireguard-receive-use-tunnel-helpers-for-decapsulati.patch

new file mode 100644 (file)

index 0000000..300cc01
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0097-wireguard-receive-use-tunnel-helpers-for-decapsulati.patch
@@ -0,0 +1,50 @@
+From 274c356580ec1b077ad10212c59a05b6e0b90d97 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= <toke@redhat.com>
+Date: Wed, 29 Apr 2020 14:59:22 -0600
+Subject: [PATCH 097/124] wireguard: receive: use tunnel helpers for
+ decapsulating ECN markings
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+commit eebabcb26ea1e3295704477c6cd4e772c96a9559 upstream.
+
+WireGuard currently only propagates ECN markings on tunnel decap according
+to the old RFC3168 specification. However, the spec has since been updated
+in RFC6040 to recommend slightly different decapsulation semantics. This
+was implemented in the kernel as a set of common helpers for ECN
+decapsulation, so let's just switch over WireGuard to using those, so it
+can benefit from this enhancement and any future tweaks. We do not drop
+packets with invalid ECN marking combinations, because WireGuard is
+frequently used to work around broken ISPs, which could be doing that.
+
+Fixes: e7096c131e51 ("net: WireGuard secure network tunnel")
+Reported-by: Olivier Tilmans <olivier.tilmans@nokia-bell-labs.com>
+Cc: Dave Taht <dave.taht@gmail.com>
+Cc: Rodney W. Grimes <ietf@gndrsh.dnsmgr.net>
+Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ drivers/net/wireguard/receive.c | 6 ++----
+ 1 file changed, 2 insertions(+), 4 deletions(-)
+
+--- a/drivers/net/wireguard/receive.c
++++ b/drivers/net/wireguard/receive.c
+@@ -393,13 +393,11 @@ static void wg_packet_consume_data_done(
+               len = ntohs(ip_hdr(skb)->tot_len);
+               if (unlikely(len < sizeof(struct iphdr)))
+                       goto dishonest_packet_size;
+-              if (INET_ECN_is_ce(PACKET_CB(skb)->ds))
+-                      IP_ECN_set_ce(ip_hdr(skb));
++              INET_ECN_decapsulate(skb, PACKET_CB(skb)->ds, ip_hdr(skb)->tos);
+       } else if (skb->protocol == htons(ETH_P_IPV6)) {
+               len = ntohs(ipv6_hdr(skb)->payload_len) +
+                     sizeof(struct ipv6hdr);
+-              if (INET_ECN_is_ce(PACKET_CB(skb)->ds))
+-                      IP6_ECN_set_ce(skb, ipv6_hdr(skb));
++              INET_ECN_decapsulate(skb, PACKET_CB(skb)->ds, ipv6_get_dsfield(ipv6_hdr(skb)));
+       } else {
+               goto dishonest_packet_type;
+       }
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0098-wireguard-selftests-use-normal-kernel-stack-size-on-.patch b/target/linux/generic/backport-5.4/080-wireguard-0098-wireguard-selftests-use-normal-kernel-stack-size-on-.patch

new file mode 100644 (file)

index 0000000..6aa6c2e
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0098-wireguard-selftests-use-normal-kernel-stack-size-on-.patch
@@ -0,0 +1,29 @@
+From abf11efb5187c0aaa57c37f36db035c840c9c90d Mon Sep 17 00:00:00 2001
+From: "Jason A. Donenfeld" <Jason@zx2c4.com>
+Date: Wed, 6 May 2020 15:33:02 -0600
+Subject: [PATCH 098/124] wireguard: selftests: use normal kernel stack size on
+ ppc64
+
+commit a0fd7cc87a018df1a17f9d3f0bd994c1f22c6b34 upstream.
+
+While at some point it might have made sense to be running these tests
+on ppc64 with 4k stacks, the kernel hasn't actually used 4k stacks on
+64-bit powerpc in a long time, and more interesting things that we test
+don't really work when we deviate from the default (16k). So, we stop
+pushing our luck in this commit, and return to the default instead of
+the minimum.
+
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ tools/testing/selftests/wireguard/qemu/arch/powerpc64le.config | 1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/tools/testing/selftests/wireguard/qemu/arch/powerpc64le.config
++++ b/tools/testing/selftests/wireguard/qemu/arch/powerpc64le.config
+@@ -10,3 +10,4 @@ CONFIG_CMDLINE_BOOL=y
+ CONFIG_CMDLINE="console=hvc0 wg.success=hvc1"
+ CONFIG_SECTION_MISMATCH_WARN_ONLY=y
+ CONFIG_FRAME_WARN=1280
++CONFIG_THREAD_SHIFT=14
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0099-wireguard-socket-remove-errant-restriction-on-loopin.patch b/target/linux/generic/backport-5.4/080-wireguard-0099-wireguard-socket-remove-errant-restriction-on-loopin.patch

new file mode 100644 (file)

index 0000000..887cb52
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0099-wireguard-socket-remove-errant-restriction-on-loopin.patch
@@ -0,0 +1,162 @@
+From 81676eb0adad9931279470559107f75741ba957c Mon Sep 17 00:00:00 2001
+From: "Jason A. Donenfeld" <Jason@zx2c4.com>
+Date: Wed, 6 May 2020 15:33:03 -0600
+Subject: [PATCH 099/124] wireguard: socket: remove errant restriction on
+ looping to self
+
+commit b673e24aad36981f327a6570412ffa7754de8911 upstream.
+
+It's already possible to create two different interfaces and loop
+packets between them. This has always been possible with tunnels in the
+kernel, and isn't specific to wireguard. Therefore, the networking stack
+already needs to deal with that. At the very least, the packet winds up
+exceeding the MTU and is discarded at that point. So, since this is
+already something that happens, there's no need to forbid the not very
+exceptional case of routing a packet back to the same interface; this
+loop is no different than others, and we shouldn't special case it, but
+rather rely on generic handling of loops in general. This also makes it
+easier to do interesting things with wireguard such as onion routing.
+
+At the same time, we add a selftest for this, ensuring that both onion
+routing works and infinite routing loops do not crash the kernel. We
+also add a test case for wireguard interfaces nesting packets and
+sending traffic between each other, as well as the loop in this case
+too. We make sure to send some throughput-heavy traffic for this use
+case, to stress out any possible recursion issues with the locks around
+workqueues.
+
+Fixes: e7096c131e51 ("net: WireGuard secure network tunnel")
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ drivers/net/wireguard/socket.c             | 12 -----
+ tools/testing/selftests/wireguard/netns.sh | 54 ++++++++++++++++++++--
+ 2 files changed, 51 insertions(+), 15 deletions(-)
+
+--- a/drivers/net/wireguard/socket.c
++++ b/drivers/net/wireguard/socket.c
+@@ -76,12 +76,6 @@ static int send4(struct wg_device *wg, s
+                       net_dbg_ratelimited("%s: No route to %pISpfsc, error %d\n",
+                                           wg->dev->name, &endpoint->addr, ret);
+                       goto err;
+-              } else if (unlikely(rt->dst.dev == skb->dev)) {
+-                      ip_rt_put(rt);
+-                      ret = -ELOOP;
+-                      net_dbg_ratelimited("%s: Avoiding routing loop to %pISpfsc\n",
+-                                          wg->dev->name, &endpoint->addr);
+-                      goto err;
+               }
+               if (cache)
+                       dst_cache_set_ip4(cache, &rt->dst, fl.saddr);
+@@ -149,12 +143,6 @@ static int send6(struct wg_device *wg, s
+                       net_dbg_ratelimited("%s: No route to %pISpfsc, error %d\n",
+                                           wg->dev->name, &endpoint->addr, ret);
+                       goto err;
+-              } else if (unlikely(dst->dev == skb->dev)) {
+-                      dst_release(dst);
+-                      ret = -ELOOP;
+-                      net_dbg_ratelimited("%s: Avoiding routing loop to %pISpfsc\n",
+-                                          wg->dev->name, &endpoint->addr);
+-                      goto err;
+               }
+               if (cache)
+                       dst_cache_set_ip6(cache, dst, &fl.saddr);
+--- a/tools/testing/selftests/wireguard/netns.sh
++++ b/tools/testing/selftests/wireguard/netns.sh
+@@ -48,8 +48,11 @@ cleanup() {
+       exec 2>/dev/null
+       printf "$orig_message_cost" > /proc/sys/net/core/message_cost
+       ip0 link del dev wg0
++      ip0 link del dev wg1
+       ip1 link del dev wg0
++      ip1 link del dev wg1
+       ip2 link del dev wg0
++      ip2 link del dev wg1
+       local to_kill="$(ip netns pids $netns0) $(ip netns pids $netns1) $(ip netns pids $netns2)"
+       [[ -n $to_kill ]] && kill $to_kill
+       pp ip netns del $netns1
+@@ -77,18 +80,20 @@ ip0 link set wg0 netns $netns2
+ key1="$(pp wg genkey)"
+ key2="$(pp wg genkey)"
+ key3="$(pp wg genkey)"
++key4="$(pp wg genkey)"
+ pub1="$(pp wg pubkey <<<"$key1")"
+ pub2="$(pp wg pubkey <<<"$key2")"
+ pub3="$(pp wg pubkey <<<"$key3")"
++pub4="$(pp wg pubkey <<<"$key4")"
+ psk="$(pp wg genpsk)"
+ [[ -n $key1 && -n $key2 && -n $psk ]]
+ 
+ configure_peers() {
+       ip1 addr add 192.168.241.1/24 dev wg0
+-      ip1 addr add fd00::1/24 dev wg0
++      ip1 addr add fd00::1/112 dev wg0
+ 
+       ip2 addr add 192.168.241.2/24 dev wg0
+-      ip2 addr add fd00::2/24 dev wg0
++      ip2 addr add fd00::2/112 dev wg0
+ 
+       n1 wg set wg0 \
+               private-key <(echo "$key1") \
+@@ -230,9 +235,38 @@ n1 ping -W 1 -c 1 192.168.241.2
+ n1 wg set wg0 private-key <(echo "$key3")
+ n2 wg set wg0 peer "$pub3" preshared-key <(echo "$psk") allowed-ips 192.168.241.1/32 peer "$pub1" remove
+ n1 ping -W 1 -c 1 192.168.241.2
++n2 wg set wg0 peer "$pub3" remove
+ 
+-ip1 link del wg0
++# Test that we can route wg through wg
++ip1 addr flush dev wg0
++ip2 addr flush dev wg0
++ip1 addr add fd00::5:1/112 dev wg0
++ip2 addr add fd00::5:2/112 dev wg0
++n1 wg set wg0 private-key <(echo "$key1") peer "$pub2" preshared-key <(echo "$psk") allowed-ips fd00::5:2/128 endpoint 127.0.0.1:2
++n2 wg set wg0 private-key <(echo "$key2") listen-port 2 peer "$pub1" preshared-key <(echo "$psk") allowed-ips fd00::5:1/128 endpoint 127.212.121.99:9998
++ip1 link add wg1 type wireguard
++ip2 link add wg1 type wireguard
++ip1 addr add 192.168.241.1/24 dev wg1
++ip1 addr add fd00::1/112 dev wg1
++ip2 addr add 192.168.241.2/24 dev wg1
++ip2 addr add fd00::2/112 dev wg1
++ip1 link set mtu 1340 up dev wg1
++ip2 link set mtu 1340 up dev wg1
++n1 wg set wg1 listen-port 5 private-key <(echo "$key3") peer "$pub4" allowed-ips 192.168.241.2/32,fd00::2/128 endpoint [fd00::5:2]:5
++n2 wg set wg1 listen-port 5 private-key <(echo "$key4") peer "$pub3" allowed-ips 192.168.241.1/32,fd00::1/128 endpoint [fd00::5:1]:5
++tests
++# Try to set up a routing loop between the two namespaces
++ip1 link set netns $netns0 dev wg1
++ip0 addr add 192.168.241.1/24 dev wg1
++ip0 link set up dev wg1
++n0 ping -W 1 -c 1 192.168.241.2
++n1 wg set wg0 peer "$pub2" endpoint 192.168.241.2:7
+ ip2 link del wg0
++ip2 link del wg1
++! n0 ping -W 1 -c 10 -f 192.168.241.2 || false # Should not crash kernel
++
++ip0 link del wg1
++ip1 link del wg0
+ 
+ # Test using NAT. We now change the topology to this:
+ # ┌────────────────────────────────────────┐    ┌────────────────────────────────────────────────┐     ┌────────────────────────────────────────┐
+@@ -282,6 +316,20 @@ pp sleep 3
+ n2 ping -W 1 -c 1 192.168.241.1
+ n1 wg set wg0 peer "$pub2" persistent-keepalive 0
+ 
++# Test that onion routing works, even when it loops
++n1 wg set wg0 peer "$pub3" allowed-ips 192.168.242.2/32 endpoint 192.168.241.2:5
++ip1 addr add 192.168.242.1/24 dev wg0
++ip2 link add wg1 type wireguard
++ip2 addr add 192.168.242.2/24 dev wg1
++n2 wg set wg1 private-key <(echo "$key3") listen-port 5 peer "$pub1" allowed-ips 192.168.242.1/32
++ip2 link set wg1 up
++n1 ping -W 1 -c 1 192.168.242.2
++ip2 link del wg1
++n1 wg set wg0 peer "$pub3" endpoint 192.168.242.2:5
++! n1 ping -W 1 -c 1 192.168.242.2 || false # Should not crash kernel
++n1 wg set wg0 peer "$pub3" remove
++ip1 addr del 192.168.242.1/24 dev wg0
++
+ # Do a wg-quick(8)-style policy routing for the default route, making sure vethc has a v6 address to tease out bugs.
+ ip1 -6 addr add fc00::9/96 dev vethc
+ ip1 -6 route add default via fc00::1
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0100-wireguard-send-receive-cond_resched-when-processing-.patch b/target/linux/generic/backport-5.4/080-wireguard-0100-wireguard-send-receive-cond_resched-when-processing-.patch

new file mode 100644 (file)

index 0000000..a87a383
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0100-wireguard-send-receive-cond_resched-when-processing-.patch
@@ -0,0 +1,58 @@
+From 3943211e0997b04f1e2ca1a6624391cc72a176bc Mon Sep 17 00:00:00 2001
+From: "Jason A. Donenfeld" <Jason@zx2c4.com>
+Date: Wed, 6 May 2020 15:33:04 -0600
+Subject: [PATCH 100/124] wireguard: send/receive: cond_resched() when
+ processing worker ringbuffers
+
+commit 4005f5c3c9d006157ba716594e0d70c88a235c5e upstream.
+
+Users with pathological hardware reported CPU stalls on
+CONFIG_PREEMPT_VOLUNTARY=y, because the ringbuffers would stay full, meaning
+these workers would never terminate. That turned out not to be okay on
+systems without forced preemption, which Sultan observed. This commit
+adds a cond_resched() to the bottom of each loop iteration, so that
+these workers don't hog the core. Note that we don't need this on the
+napi poll worker, since that terminates after its budget is expended.
+
+Suggested-by: Sultan Alsawaf <sultan@kerneltoast.com>
+Reported-by: Wang Jian <larkwang@gmail.com>
+Fixes: e7096c131e51 ("net: WireGuard secure network tunnel")
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ drivers/net/wireguard/receive.c | 2 ++
+ drivers/net/wireguard/send.c    | 4 ++++
+ 2 files changed, 6 insertions(+)
+
+--- a/drivers/net/wireguard/receive.c
++++ b/drivers/net/wireguard/receive.c
+@@ -516,6 +516,8 @@ void wg_packet_decrypt_worker(struct wor
+                               &PACKET_CB(skb)->keypair->receiving)) ?
+                               PACKET_STATE_CRYPTED : PACKET_STATE_DEAD;
+               wg_queue_enqueue_per_peer_napi(skb, state);
++              if (need_resched())
++                      cond_resched();
+       }
+ }
+ 
+--- a/drivers/net/wireguard/send.c
++++ b/drivers/net/wireguard/send.c
+@@ -281,6 +281,8 @@ void wg_packet_tx_worker(struct work_str
+ 
+               wg_noise_keypair_put(keypair, false);
+               wg_peer_put(peer);
++              if (need_resched())
++                      cond_resched();
+       }
+ }
+ 
+@@ -304,6 +306,8 @@ void wg_packet_encrypt_worker(struct wor
+               }
+               wg_queue_enqueue_per_peer(&PACKET_PEER(first)->tx_queue, first,
+                                         state);
++              if (need_resched())
++                      cond_resched();
+       }
+ }
+ 
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0101-wireguard-selftests-initalize-ipv6-members-to-NULL-t.patch b/target/linux/generic/backport-5.4/080-wireguard-0101-wireguard-selftests-initalize-ipv6-members-to-NULL-t.patch

new file mode 100644 (file)

index 0000000..3d57857
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0101-wireguard-selftests-initalize-ipv6-members-to-NULL-t.patch
@@ -0,0 +1,51 @@
+From 7b7da251149dd5fd070255dbf45f8e4f5c2110b8 Mon Sep 17 00:00:00 2001
+From: "Jason A. Donenfeld" <Jason@zx2c4.com>
+Date: Wed, 6 May 2020 15:33:05 -0600
+Subject: [PATCH 101/124] wireguard: selftests: initalize ipv6 members to NULL
+ to squelch clang warning
+
+commit 4fed818ef54b08d4b29200e416cce65546ad5312 upstream.
+
+Without setting these to NULL, clang complains in certain
+configurations that have CONFIG_IPV6=n:
+
+In file included from drivers/net/wireguard/ratelimiter.c:223:
+drivers/net/wireguard/selftest/ratelimiter.c:173:34: error: variable 'skb6' is uninitialized when used here [-Werror,-Wuninitialized]
+                ret = timings_test(skb4, hdr4, skb6, hdr6, &test_count);
+                                               ^~~~
+drivers/net/wireguard/selftest/ratelimiter.c:123:29: note: initialize the variable 'skb6' to silence this warning
+        struct sk_buff *skb4, *skb6;
+                                   ^
+                                    = NULL
+drivers/net/wireguard/selftest/ratelimiter.c:173:40: error: variable 'hdr6' is uninitialized when used here [-Werror,-Wuninitialized]
+                ret = timings_test(skb4, hdr4, skb6, hdr6, &test_count);
+                                                     ^~~~
+drivers/net/wireguard/selftest/ratelimiter.c:125:22: note: initialize the variable 'hdr6' to silence this warning
+        struct ipv6hdr *hdr6;
+                            ^
+
+We silence this warning by setting the variables to NULL as the warning
+suggests.
+
+Reported-by: Arnd Bergmann <arnd@arndb.de>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ drivers/net/wireguard/selftest/ratelimiter.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/drivers/net/wireguard/selftest/ratelimiter.c
++++ b/drivers/net/wireguard/selftest/ratelimiter.c
+@@ -120,9 +120,9 @@ bool __init wg_ratelimiter_selftest(void
+       enum { TRIALS_BEFORE_GIVING_UP = 5000 };
+       bool success = false;
+       int test = 0, trials;
+-      struct sk_buff *skb4, *skb6;
++      struct sk_buff *skb4, *skb6 = NULL;
+       struct iphdr *hdr4;
+-      struct ipv6hdr *hdr6;
++      struct ipv6hdr *hdr6 = NULL;
+ 
+       if (IS_ENABLED(CONFIG_KASAN) || IS_ENABLED(CONFIG_UBSAN))
+               return true;
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0102-wireguard-send-receive-use-explicit-unlikely-branch-.patch b/target/linux/generic/backport-5.4/080-wireguard-0102-wireguard-send-receive-use-explicit-unlikely-branch-.patch

new file mode 100644 (file)

index 0000000..d8ea890
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0102-wireguard-send-receive-use-explicit-unlikely-branch-.patch
@@ -0,0 +1,88 @@
+From 8df862b663b026d61b4c463caece77f1f127771f Mon Sep 17 00:00:00 2001
+From: "Jason A. Donenfeld" <Jason@zx2c4.com>
+Date: Wed, 6 May 2020 15:33:06 -0600
+Subject: [PATCH 102/124] wireguard: send/receive: use explicit unlikely branch
+ instead of implicit coalescing
+
+commit 243f2148937adc72bcaaa590d482d599c936efde upstream.
+
+It's very unlikely that send will become true. It's nearly always false
+between 0 and 120 seconds of a session, and in most cases becomes true
+only between 120 and 121 seconds before becoming false again. So,
+unlikely(send) is clearly the right option here.
+
+What happened before was that we had this complex boolean expression
+with multiple likely and unlikely clauses nested. Since this is
+evaluated left-to-right anyway, the whole thing got converted to
+unlikely. So, we can clean this up to better represent what's going on.
+
+The generated code is the same.
+
+Suggested-by: Sultan Alsawaf <sultan@kerneltoast.com>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ drivers/net/wireguard/receive.c | 13 ++++++-------
+ drivers/net/wireguard/send.c    | 15 ++++++---------
+ 2 files changed, 12 insertions(+), 16 deletions(-)
+
+--- a/drivers/net/wireguard/receive.c
++++ b/drivers/net/wireguard/receive.c
+@@ -226,21 +226,20 @@ void wg_packet_handshake_receive_worker(
+ static void keep_key_fresh(struct wg_peer *peer)
+ {
+       struct noise_keypair *keypair;
+-      bool send = false;
++      bool send;
+ 
+       if (peer->sent_lastminute_handshake)
+               return;
+ 
+       rcu_read_lock_bh();
+       keypair = rcu_dereference_bh(peer->keypairs.current_keypair);
+-      if (likely(keypair && READ_ONCE(keypair->sending.is_valid)) &&
+-          keypair->i_am_the_initiator &&
+-          unlikely(wg_birthdate_has_expired(keypair->sending.birthdate,
+-                      REJECT_AFTER_TIME - KEEPALIVE_TIMEOUT - REKEY_TIMEOUT)))
+-              send = true;
++      send = keypair && READ_ONCE(keypair->sending.is_valid) &&
++             keypair->i_am_the_initiator &&
++             wg_birthdate_has_expired(keypair->sending.birthdate,
++                      REJECT_AFTER_TIME - KEEPALIVE_TIMEOUT - REKEY_TIMEOUT);
+       rcu_read_unlock_bh();
+ 
+-      if (send) {
++      if (unlikely(send)) {
+               peer->sent_lastminute_handshake = true;
+               wg_packet_send_queued_handshake_initiation(peer, false);
+       }
+--- a/drivers/net/wireguard/send.c
++++ b/drivers/net/wireguard/send.c
+@@ -124,20 +124,17 @@ void wg_packet_send_handshake_cookie(str
+ static void keep_key_fresh(struct wg_peer *peer)
+ {
+       struct noise_keypair *keypair;
+-      bool send = false;
++      bool send;
+ 
+       rcu_read_lock_bh();
+       keypair = rcu_dereference_bh(peer->keypairs.current_keypair);
+-      if (likely(keypair && READ_ONCE(keypair->sending.is_valid)) &&
+-          (unlikely(atomic64_read(&keypair->sending.counter.counter) >
+-                    REKEY_AFTER_MESSAGES) ||
+-           (keypair->i_am_the_initiator &&
+-            unlikely(wg_birthdate_has_expired(keypair->sending.birthdate,
+-                                              REKEY_AFTER_TIME)))))
+-              send = true;
++      send = keypair && READ_ONCE(keypair->sending.is_valid) &&
++             (atomic64_read(&keypair->sending.counter.counter) > REKEY_AFTER_MESSAGES ||
++              (keypair->i_am_the_initiator &&
++               wg_birthdate_has_expired(keypair->sending.birthdate, REKEY_AFTER_TIME)));
+       rcu_read_unlock_bh();
+ 
+-      if (send)
++      if (unlikely(send))
+               wg_packet_send_queued_handshake_initiation(peer, false);
+ }
+ 
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0103-wireguard-selftests-use-newer-iproute2-for-gcc-10.patch b/target/linux/generic/backport-5.4/080-wireguard-0103-wireguard-selftests-use-newer-iproute2-for-gcc-10.patch

new file mode 100644 (file)

index 0000000..4a4d8e8
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0103-wireguard-selftests-use-newer-iproute2-for-gcc-10.patch
@@ -0,0 +1,31 @@
+From 8c0f0162352081c875a7aa86d897e2bb50f6e46d Mon Sep 17 00:00:00 2001
+From: "Jason A. Donenfeld" <Jason@zx2c4.com>
+Date: Tue, 19 May 2020 22:49:27 -0600
+Subject: [PATCH 103/124] wireguard: selftests: use newer iproute2 for gcc-10
+
+commit ee3c1aa3f34b7842c1557cfe5d8c3f7b8c692de8 upstream.
+
+gcc-10 switched to defaulting to -fno-common, which broke iproute2-5.4.
+This was fixed in iproute2-5.6, so switch to that. Because we're after a
+stable testing surface, we generally don't like to bump these
+unnecessarily, but in this case, being able to actually build is a basic
+necessity.
+
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ tools/testing/selftests/wireguard/qemu/Makefile | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/tools/testing/selftests/wireguard/qemu/Makefile
++++ b/tools/testing/selftests/wireguard/qemu/Makefile
+@@ -44,7 +44,7 @@ endef
+ $(eval $(call tar_download,MUSL,musl,1.1.24,.tar.gz,https://www.musl-libc.org/releases/,1370c9a812b2cf2a7d92802510cca0058cc37e66a7bedd70051f0a34015022a3))
+ $(eval $(call tar_download,IPERF,iperf,3.7,.tar.gz,https://downloads.es.net/pub/iperf/,d846040224317caf2f75c843d309a950a7db23f9b44b94688ccbe557d6d1710c))
+ $(eval $(call tar_download,BASH,bash,5.0,.tar.gz,https://ftp.gnu.org/gnu/bash/,b4a80f2ac66170b2913efbfb9f2594f1f76c7b1afd11f799e22035d63077fb4d))
+-$(eval $(call tar_download,IPROUTE2,iproute2,5.4.0,.tar.xz,https://www.kernel.org/pub/linux/utils/net/iproute2/,fe97aa60a0d4c5ac830be18937e18dc3400ca713a33a89ad896ff1e3d46086ae))
++$(eval $(call tar_download,IPROUTE2,iproute2,5.6.0,.tar.xz,https://www.kernel.org/pub/linux/utils/net/iproute2/,1b5b0e25ce6e23da7526ea1da044e814ad85ba761b10dd29c2b027c056b04692))
+ $(eval $(call tar_download,IPTABLES,iptables,1.8.4,.tar.bz2,https://www.netfilter.org/projects/iptables/files/,993a3a5490a544c2cbf2ef15cf7e7ed21af1845baf228318d5c36ef8827e157c))
+ $(eval $(call tar_download,NMAP,nmap,7.80,.tar.bz2,https://nmap.org/dist/,fcfa5a0e42099e12e4bf7a68ebe6fde05553383a682e816a7ec9256ab4773faa))
+ $(eval $(call tar_download,IPUTILS,iputils,s20190709,.tar.gz,https://github.com/iputils/iputils/archive/s20190709.tar.gz/#,a15720dd741d7538dd2645f9f516d193636ae4300ff7dbc8bfca757bf166490a))
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0104-wireguard-noise-read-preshared-key-while-taking-lock.patch b/target/linux/generic/backport-5.4/080-wireguard-0104-wireguard-noise-read-preshared-key-while-taking-lock.patch

new file mode 100644 (file)

index 0000000..33ad677
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0104-wireguard-noise-read-preshared-key-while-taking-lock.patch
@@ -0,0 +1,62 @@
+From 5e29ad069218c486737729f88d15e4fe0ca7eb45 Mon Sep 17 00:00:00 2001
+From: "Jason A. Donenfeld" <Jason@zx2c4.com>
+Date: Tue, 19 May 2020 22:49:28 -0600
+Subject: [PATCH 104/124] wireguard: noise: read preshared key while taking
+ lock
+
+commit bc67d371256f5c47d824e2eec51e46c8d62d022e upstream.
+
+Previously we read the preshared key after dropping the handshake lock, which
+isn't an actual crypto issue if it races, but it's still not quite
+correct. So copy that part of the state into a temporary like we do with
+the rest of the handshake state variables. Then we can release the lock,
+operate on the temporary, and zero it out at the end of the function. In
+performance tests, the impact of this was entirely unnoticeable, probably
+because those bytes are coming from the same cacheline as other things
+that are being copied out in the same manner.
+
+Reported-by: Matt Dunwoodie <ncon@noconroy.net>
+Fixes: e7096c131e51 ("net: WireGuard secure network tunnel")
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ drivers/net/wireguard/noise.c | 6 +++++-
+ 1 file changed, 5 insertions(+), 1 deletion(-)
+
+--- a/drivers/net/wireguard/noise.c
++++ b/drivers/net/wireguard/noise.c
+@@ -715,6 +715,7 @@ wg_noise_handshake_consume_response(stru
+       u8 e[NOISE_PUBLIC_KEY_LEN];
+       u8 ephemeral_private[NOISE_PUBLIC_KEY_LEN];
+       u8 static_private[NOISE_PUBLIC_KEY_LEN];
++      u8 preshared_key[NOISE_SYMMETRIC_KEY_LEN];
+ 
+       down_read(&wg->static_identity.lock);
+ 
+@@ -733,6 +734,8 @@ wg_noise_handshake_consume_response(stru
+       memcpy(chaining_key, handshake->chaining_key, NOISE_HASH_LEN);
+       memcpy(ephemeral_private, handshake->ephemeral_private,
+              NOISE_PUBLIC_KEY_LEN);
++      memcpy(preshared_key, handshake->preshared_key,
++             NOISE_SYMMETRIC_KEY_LEN);
+       up_read(&handshake->lock);
+ 
+       if (state != HANDSHAKE_CREATED_INITIATION)
+@@ -750,7 +753,7 @@ wg_noise_handshake_consume_response(stru
+               goto fail;
+ 
+       /* psk */
+-      mix_psk(chaining_key, hash, key, handshake->preshared_key);
++      mix_psk(chaining_key, hash, key, preshared_key);
+ 
+       /* {} */
+       if (!message_decrypt(NULL, src->encrypted_nothing,
+@@ -783,6 +786,7 @@ out:
+       memzero_explicit(chaining_key, NOISE_HASH_LEN);
+       memzero_explicit(ephemeral_private, NOISE_PUBLIC_KEY_LEN);
+       memzero_explicit(static_private, NOISE_PUBLIC_KEY_LEN);
++      memzero_explicit(preshared_key, NOISE_SYMMETRIC_KEY_LEN);
+       up_read(&wg->static_identity.lock);
+       return ret_peer;
+ }
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0105-wireguard-queueing-preserve-flow-hash-across-packet-.patch b/target/linux/generic/backport-5.4/080-wireguard-0105-wireguard-queueing-preserve-flow-hash-across-packet-.patch

new file mode 100644 (file)

index 0000000..5834425
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0105-wireguard-queueing-preserve-flow-hash-across-packet-.patch
@@ -0,0 +1,116 @@
+From a6fedb7ce9e487edae4c35b70e2d3a5bb2342fec Mon Sep 17 00:00:00 2001
+From: "Jason A. Donenfeld" <Jason@zx2c4.com>
+Date: Tue, 19 May 2020 22:49:29 -0600
+Subject: [PATCH 105/124] wireguard: queueing: preserve flow hash across packet
+ scrubbing
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+commit c78a0b4a78839d572d8a80f6a62221c0d7843135 upstream.
+
+It's important that we clear most header fields during encapsulation and
+decapsulation, because the packet is substantially changed, and we don't
+want any info leak or logic bug due to an accidental correlation. But,
+for encapsulation, it's wrong to clear skb->hash, since it's used by
+fq_codel and flow dissection in general. Without it, classification does
+not proceed as usual. This change might make it easier to estimate the
+number of inner flows by examining clustering of out-of-order packets,
+but this shouldn't open up anything that can't already be inferred
+otherwise (e.g. syn packet size inference), and fq_codel can be disabled
+anyway.
+
+Furthermore, it might be the case that the hash isn't used or queried at
+all until after wireguard transmits the encrypted UDP packet, which
+means skb->hash might still be zero at this point, and thus no hash
+taken over the inner packet data. In order to address this situation, we
+force a calculation of skb->hash before encrypting packet data.
+
+Of course this means that fq_codel might transmit packets slightly more
+out of order than usual. Toke did some testing on beefy machines with
+high quantities of parallel flows and found that increasing the
+replay-attack counter to 8192 takes care of the most pathological cases
+pretty well.
+
+Reported-by: Dave Taht <dave.taht@gmail.com>
+Reviewed-and-tested-by: Toke Høiland-Jørgensen <toke@toke.dk>
+Fixes: e7096c131e51 ("net: WireGuard secure network tunnel")
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ drivers/net/wireguard/messages.h |  2 +-
+ drivers/net/wireguard/queueing.h | 10 +++++++++-
+ drivers/net/wireguard/receive.c  |  2 +-
+ drivers/net/wireguard/send.c     |  7 ++++++-
+ 4 files changed, 17 insertions(+), 4 deletions(-)
+
+--- a/drivers/net/wireguard/messages.h
++++ b/drivers/net/wireguard/messages.h
+@@ -32,7 +32,7 @@ enum cookie_values {
+ };
+ 
+ enum counter_values {
+-      COUNTER_BITS_TOTAL = 2048,
++      COUNTER_BITS_TOTAL = 8192,
+       COUNTER_REDUNDANT_BITS = BITS_PER_LONG,
+       COUNTER_WINDOW_SIZE = COUNTER_BITS_TOTAL - COUNTER_REDUNDANT_BITS
+ };
+--- a/drivers/net/wireguard/queueing.h
++++ b/drivers/net/wireguard/queueing.h
+@@ -87,12 +87,20 @@ static inline bool wg_check_packet_proto
+       return real_protocol && skb->protocol == real_protocol;
+ }
+ 
+-static inline void wg_reset_packet(struct sk_buff *skb)
++static inline void wg_reset_packet(struct sk_buff *skb, bool encapsulating)
+ {
++      u8 l4_hash = skb->l4_hash;
++      u8 sw_hash = skb->sw_hash;
++      u32 hash = skb->hash;
+       skb_scrub_packet(skb, true);
+       memset(&skb->headers_start, 0,
+              offsetof(struct sk_buff, headers_end) -
+                      offsetof(struct sk_buff, headers_start));
++      if (encapsulating) {
++              skb->l4_hash = l4_hash;
++              skb->sw_hash = sw_hash;
++              skb->hash = hash;
++      }
+       skb->queue_mapping = 0;
+       skb->nohdr = 0;
+       skb->peeked = 0;
+--- a/drivers/net/wireguard/receive.c
++++ b/drivers/net/wireguard/receive.c
+@@ -484,7 +484,7 @@ int wg_packet_rx_poll(struct napi_struct
+               if (unlikely(wg_socket_endpoint_from_skb(&endpoint, skb)))
+                       goto next;
+ 
+-              wg_reset_packet(skb);
++              wg_reset_packet(skb, false);
+               wg_packet_consume_data_done(peer, skb, &endpoint);
+               free = false;
+ 
+--- a/drivers/net/wireguard/send.c
++++ b/drivers/net/wireguard/send.c
+@@ -167,6 +167,11 @@ static bool encrypt_packet(struct sk_buf
+       struct sk_buff *trailer;
+       int num_frags;
+ 
++      /* Force hash calculation before encryption so that flow analysis is
++       * consistent over the inner packet.
++       */
++      skb_get_hash(skb);
++
+       /* Calculate lengths. */
+       padding_len = calculate_skb_padding(skb);
+       trailer_len = padding_len + noise_encrypted_len(0);
+@@ -295,7 +300,7 @@ void wg_packet_encrypt_worker(struct wor
+               skb_list_walk_safe(first, skb, next) {
+                       if (likely(encrypt_packet(skb,
+                                       PACKET_CB(first)->keypair))) {
+-                              wg_reset_packet(skb);
++                              wg_reset_packet(skb, true);
+                       } else {
+                               state = PACKET_STATE_DEAD;
+                               break;
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0106-wireguard-noise-separate-receive-counter-from-send-c.patch b/target/linux/generic/backport-5.4/080-wireguard-0106-wireguard-noise-separate-receive-counter-from-send-c.patch

new file mode 100644 (file)

index 0000000..d72e9f8
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0106-wireguard-noise-separate-receive-counter-from-send-c.patch
@@ -0,0 +1,331 @@
+From 044b98abbb08fabca5c2cff426023f1f52448efc Mon Sep 17 00:00:00 2001
+From: "Jason A. Donenfeld" <Jason@zx2c4.com>
+Date: Tue, 19 May 2020 22:49:30 -0600
+Subject: [PATCH 106/124] wireguard: noise: separate receive counter from send
+ counter
+
+commit a9e90d9931f3a474f04bab782ccd9d77904941e9 upstream.
+
+In "wireguard: queueing: preserve flow hash across packet scrubbing", we
+were required to slightly increase the size of the receive replay
+counter to something still fairly small, but an increase nonetheless.
+It turns out that we can recoup some of the additional memory overhead
+by splitting up the prior union type into two distinct types. Before, we
+used the same "noise_counter" union for both sending and receiving, with
+sending just using a simple atomic64_t, while receiving used the full
+replay counter checker. This meant that most of the memory being
+allocated for the sending counter was being wasted. Since the old
+"noise_counter" type increased in size in the prior commit, now is a
+good time to split up that union type into a distinct "noise_replay_
+counter" for receiving and a boring atomic64_t for sending, each using
+neither more nor less memory than required.
+
+Also, since sometimes the replay counter is accessed without
+necessitating additional accesses to the bitmap, we can reduce cache
+misses by hoisting the always-necessary lock above the bitmap in the
+struct layout. We also change a "noise_replay_counter" stack allocation
+to kmalloc in a -DDEBUG selftest so that KASAN doesn't trigger a stack
+frame warning.
+
+All in all, removing a bit of abstraction in this commit makes the code
+simpler and smaller, in addition to the motivating memory usage
+recuperation. For example, passing around raw "noise_symmetric_key"
+structs is something that really only makes sense within noise.c, in the
+one place where the sending and receiving keys can safely be thought of
+as the same type of object; subsequent to that, it's important that we
+uniformly access these through keypair->{sending,receiving}, where their
+distinct roles are always made explicit. So this patch allows us to draw
+that distinction clearly as well.
+
+Fixes: e7096c131e51 ("net: WireGuard secure network tunnel")
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ drivers/net/wireguard/noise.c            | 16 +++------
+ drivers/net/wireguard/noise.h            | 14 ++++----
+ drivers/net/wireguard/receive.c          | 42 ++++++++++++------------
+ drivers/net/wireguard/selftest/counter.c | 17 +++++++---
+ drivers/net/wireguard/send.c             | 12 +++----
+ 5 files changed, 48 insertions(+), 53 deletions(-)
+
+--- a/drivers/net/wireguard/noise.c
++++ b/drivers/net/wireguard/noise.c
+@@ -104,6 +104,7 @@ static struct noise_keypair *keypair_cre
+ 
+       if (unlikely(!keypair))
+               return NULL;
++      spin_lock_init(&keypair->receiving_counter.lock);
+       keypair->internal_id = atomic64_inc_return(&keypair_counter);
+       keypair->entry.type = INDEX_HASHTABLE_KEYPAIR;
+       keypair->entry.peer = peer;
+@@ -358,25 +359,16 @@ out:
+       memzero_explicit(output, BLAKE2S_HASH_SIZE + 1);
+ }
+ 
+-static void symmetric_key_init(struct noise_symmetric_key *key)
+-{
+-      spin_lock_init(&key->counter.receive.lock);
+-      atomic64_set(&key->counter.counter, 0);
+-      memset(key->counter.receive.backtrack, 0,
+-             sizeof(key->counter.receive.backtrack));
+-      key->birthdate = ktime_get_coarse_boottime_ns();
+-      key->is_valid = true;
+-}
+-
+ static void derive_keys(struct noise_symmetric_key *first_dst,
+                       struct noise_symmetric_key *second_dst,
+                       const u8 chaining_key[NOISE_HASH_LEN])
+ {
++      u64 birthdate = ktime_get_coarse_boottime_ns();
+       kdf(first_dst->key, second_dst->key, NULL, NULL,
+           NOISE_SYMMETRIC_KEY_LEN, NOISE_SYMMETRIC_KEY_LEN, 0, 0,
+           chaining_key);
+-      symmetric_key_init(first_dst);
+-      symmetric_key_init(second_dst);
++      first_dst->birthdate = second_dst->birthdate = birthdate;
++      first_dst->is_valid = second_dst->is_valid = true;
+ }
+ 
+ static bool __must_check mix_dh(u8 chaining_key[NOISE_HASH_LEN],
+--- a/drivers/net/wireguard/noise.h
++++ b/drivers/net/wireguard/noise.h
+@@ -15,18 +15,14 @@
+ #include <linux/mutex.h>
+ #include <linux/kref.h>
+ 
+-union noise_counter {
+-      struct {
+-              u64 counter;
+-              unsigned long backtrack[COUNTER_BITS_TOTAL / BITS_PER_LONG];
+-              spinlock_t lock;
+-      } receive;
+-      atomic64_t counter;
++struct noise_replay_counter {
++      u64 counter;
++      spinlock_t lock;
++      unsigned long backtrack[COUNTER_BITS_TOTAL / BITS_PER_LONG];
+ };
+ 
+ struct noise_symmetric_key {
+       u8 key[NOISE_SYMMETRIC_KEY_LEN];
+-      union noise_counter counter;
+       u64 birthdate;
+       bool is_valid;
+ };
+@@ -34,7 +30,9 @@ struct noise_symmetric_key {
+ struct noise_keypair {
+       struct index_hashtable_entry entry;
+       struct noise_symmetric_key sending;
++      atomic64_t sending_counter;
+       struct noise_symmetric_key receiving;
++      struct noise_replay_counter receiving_counter;
+       __le32 remote_index;
+       bool i_am_the_initiator;
+       struct kref refcount;
+--- a/drivers/net/wireguard/receive.c
++++ b/drivers/net/wireguard/receive.c
+@@ -245,20 +245,20 @@ static void keep_key_fresh(struct wg_pee
+       }
+ }
+ 
+-static bool decrypt_packet(struct sk_buff *skb, struct noise_symmetric_key *key)
++static bool decrypt_packet(struct sk_buff *skb, struct noise_keypair *keypair)
+ {
+       struct scatterlist sg[MAX_SKB_FRAGS + 8];
+       struct sk_buff *trailer;
+       unsigned int offset;
+       int num_frags;
+ 
+-      if (unlikely(!key))
++      if (unlikely(!keypair))
+               return false;
+ 
+-      if (unlikely(!READ_ONCE(key->is_valid) ||
+-                wg_birthdate_has_expired(key->birthdate, REJECT_AFTER_TIME) ||
+-                key->counter.receive.counter >= REJECT_AFTER_MESSAGES)) {
+-              WRITE_ONCE(key->is_valid, false);
++      if (unlikely(!READ_ONCE(keypair->receiving.is_valid) ||
++                wg_birthdate_has_expired(keypair->receiving.birthdate, REJECT_AFTER_TIME) ||
++                keypair->receiving_counter.counter >= REJECT_AFTER_MESSAGES)) {
++              WRITE_ONCE(keypair->receiving.is_valid, false);
+               return false;
+       }
+ 
+@@ -283,7 +283,7 @@ static bool decrypt_packet(struct sk_buf
+ 
+       if (!chacha20poly1305_decrypt_sg_inplace(sg, skb->len, NULL, 0,
+                                                PACKET_CB(skb)->nonce,
+-                                               key->key))
++                                               keypair->receiving.key))
+               return false;
+ 
+       /* Another ugly situation of pushing and pulling the header so as to
+@@ -298,41 +298,41 @@ static bool decrypt_packet(struct sk_buf
+ }
+ 
+ /* This is RFC6479, a replay detection bitmap algorithm that avoids bitshifts */
+-static bool counter_validate(union noise_counter *counter, u64 their_counter)
++static bool counter_validate(struct noise_replay_counter *counter, u64 their_counter)
+ {
+       unsigned long index, index_current, top, i;
+       bool ret = false;
+ 
+-      spin_lock_bh(&counter->receive.lock);
++      spin_lock_bh(&counter->lock);
+ 
+-      if (unlikely(counter->receive.counter >= REJECT_AFTER_MESSAGES + 1 ||
++      if (unlikely(counter->counter >= REJECT_AFTER_MESSAGES + 1 ||
+                    their_counter >= REJECT_AFTER_MESSAGES))
+               goto out;
+ 
+       ++their_counter;
+ 
+       if (unlikely((COUNTER_WINDOW_SIZE + their_counter) <
+-                   counter->receive.counter))
++                   counter->counter))
+               goto out;
+ 
+       index = their_counter >> ilog2(BITS_PER_LONG);
+ 
+-      if (likely(their_counter > counter->receive.counter)) {
+-              index_current = counter->receive.counter >> ilog2(BITS_PER_LONG);
++      if (likely(their_counter > counter->counter)) {
++              index_current = counter->counter >> ilog2(BITS_PER_LONG);
+               top = min_t(unsigned long, index - index_current,
+                           COUNTER_BITS_TOTAL / BITS_PER_LONG);
+               for (i = 1; i <= top; ++i)
+-                      counter->receive.backtrack[(i + index_current) &
++                      counter->backtrack[(i + index_current) &
+                               ((COUNTER_BITS_TOTAL / BITS_PER_LONG) - 1)] = 0;
+-              counter->receive.counter = their_counter;
++              counter->counter = their_counter;
+       }
+ 
+       index &= (COUNTER_BITS_TOTAL / BITS_PER_LONG) - 1;
+       ret = !test_and_set_bit(their_counter & (BITS_PER_LONG - 1),
+-                              &counter->receive.backtrack[index]);
++                              &counter->backtrack[index]);
+ 
+ out:
+-      spin_unlock_bh(&counter->receive.lock);
++      spin_unlock_bh(&counter->lock);
+       return ret;
+ }
+ 
+@@ -472,12 +472,12 @@ int wg_packet_rx_poll(struct napi_struct
+               if (unlikely(state != PACKET_STATE_CRYPTED))
+                       goto next;
+ 
+-              if (unlikely(!counter_validate(&keypair->receiving.counter,
++              if (unlikely(!counter_validate(&keypair->receiving_counter,
+                                              PACKET_CB(skb)->nonce))) {
+                       net_dbg_ratelimited("%s: Packet has invalid nonce %llu (max %llu)\n",
+                                           peer->device->dev->name,
+                                           PACKET_CB(skb)->nonce,
+-                                          keypair->receiving.counter.receive.counter);
++                                          keypair->receiving_counter.counter);
+                       goto next;
+               }
+ 
+@@ -511,8 +511,8 @@ void wg_packet_decrypt_worker(struct wor
+       struct sk_buff *skb;
+ 
+       while ((skb = ptr_ring_consume_bh(&queue->ring)) != NULL) {
+-              enum packet_state state = likely(decrypt_packet(skb,
+-                              &PACKET_CB(skb)->keypair->receiving)) ?
++              enum packet_state state =
++                      likely(decrypt_packet(skb, PACKET_CB(skb)->keypair)) ?
+                               PACKET_STATE_CRYPTED : PACKET_STATE_DEAD;
+               wg_queue_enqueue_per_peer_napi(skb, state);
+               if (need_resched())
+--- a/drivers/net/wireguard/selftest/counter.c
++++ b/drivers/net/wireguard/selftest/counter.c
+@@ -6,18 +6,24 @@
+ #ifdef DEBUG
+ bool __init wg_packet_counter_selftest(void)
+ {
++      struct noise_replay_counter *counter;
+       unsigned int test_num = 0, i;
+-      union noise_counter counter;
+       bool success = true;
+ 
+-#define T_INIT do {                                               \
+-              memset(&counter, 0, sizeof(union noise_counter)); \
+-              spin_lock_init(&counter.receive.lock);            \
++      counter = kmalloc(sizeof(*counter), GFP_KERNEL);
++      if (unlikely(!counter)) {
++              pr_err("nonce counter self-test malloc: FAIL\n");
++              return false;
++      }
++
++#define T_INIT do {                                    \
++              memset(counter, 0, sizeof(*counter));  \
++              spin_lock_init(&counter->lock);        \
+       } while (0)
+ #define T_LIM (COUNTER_WINDOW_SIZE + 1)
+ #define T(n, v) do {                                                  \
+               ++test_num;                                           \
+-              if (counter_validate(&counter, n) != (v)) {           \
++              if (counter_validate(counter, n) != (v)) {            \
+                       pr_err("nonce counter self-test %u: FAIL\n",  \
+                              test_num);                             \
+                       success = false;                              \
+@@ -99,6 +105,7 @@ bool __init wg_packet_counter_selftest(v
+ 
+       if (success)
+               pr_info("nonce counter self-tests: pass\n");
++      kfree(counter);
+       return success;
+ }
+ #endif
+--- a/drivers/net/wireguard/send.c
++++ b/drivers/net/wireguard/send.c
+@@ -129,7 +129,7 @@ static void keep_key_fresh(struct wg_pee
+       rcu_read_lock_bh();
+       keypair = rcu_dereference_bh(peer->keypairs.current_keypair);
+       send = keypair && READ_ONCE(keypair->sending.is_valid) &&
+-             (atomic64_read(&keypair->sending.counter.counter) > REKEY_AFTER_MESSAGES ||
++             (atomic64_read(&keypair->sending_counter) > REKEY_AFTER_MESSAGES ||
+               (keypair->i_am_the_initiator &&
+                wg_birthdate_has_expired(keypair->sending.birthdate, REKEY_AFTER_TIME)));
+       rcu_read_unlock_bh();
+@@ -349,7 +349,6 @@ void wg_packet_purge_staged_packets(stru
+ 
+ void wg_packet_send_staged_packets(struct wg_peer *peer)
+ {
+-      struct noise_symmetric_key *key;
+       struct noise_keypair *keypair;
+       struct sk_buff_head packets;
+       struct sk_buff *skb;
+@@ -369,10 +368,9 @@ void wg_packet_send_staged_packets(struc
+       rcu_read_unlock_bh();
+       if (unlikely(!keypair))
+               goto out_nokey;
+-      key = &keypair->sending;
+-      if (unlikely(!READ_ONCE(key->is_valid)))
++      if (unlikely(!READ_ONCE(keypair->sending.is_valid)))
+               goto out_nokey;
+-      if (unlikely(wg_birthdate_has_expired(key->birthdate,
++      if (unlikely(wg_birthdate_has_expired(keypair->sending.birthdate,
+                                             REJECT_AFTER_TIME)))
+               goto out_invalid;
+ 
+@@ -387,7 +385,7 @@ void wg_packet_send_staged_packets(struc
+                */
+               PACKET_CB(skb)->ds = ip_tunnel_ecn_encap(0, ip_hdr(skb), skb);
+               PACKET_CB(skb)->nonce =
+-                              atomic64_inc_return(&key->counter.counter) - 1;
++                              atomic64_inc_return(&keypair->sending_counter) - 1;
+               if (unlikely(PACKET_CB(skb)->nonce >= REJECT_AFTER_MESSAGES))
+                       goto out_invalid;
+       }
+@@ -399,7 +397,7 @@ void wg_packet_send_staged_packets(struc
+       return;
+ 
+ out_invalid:
+-      WRITE_ONCE(key->is_valid, false);
++      WRITE_ONCE(keypair->sending.is_valid, false);
+ out_nokey:
+       wg_noise_keypair_put(keypair, false);
+ 
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0107-wireguard-noise-do-not-assign-initiation-time-in-if-.patch b/target/linux/generic/backport-5.4/080-wireguard-0107-wireguard-noise-do-not-assign-initiation-time-in-if-.patch

new file mode 100644 (file)

index 0000000..b549b32
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0107-wireguard-noise-do-not-assign-initiation-time-in-if-.patch
@@ -0,0 +1,33 @@
+From adb4079f56d1f6c7d4dc827d7eba14e3436512f9 Mon Sep 17 00:00:00 2001
+From: Frank Werner-Krippendorf <mail@hb9fxq.ch>
+Date: Tue, 23 Jun 2020 03:59:44 -0600
+Subject: [PATCH 107/124] wireguard: noise: do not assign initiation time in if
+ condition
+
+commit 558b353c9c2a717509f291c066c6bd8f5f5e21be upstream.
+
+Fixes an error condition reported by checkpatch.pl which is caused by
+assigning a variable in an if condition in wg_noise_handshake_consume_
+initiation().
+
+Signed-off-by: Frank Werner-Krippendorf <mail@hb9fxq.ch>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ drivers/net/wireguard/noise.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/drivers/net/wireguard/noise.c
++++ b/drivers/net/wireguard/noise.c
+@@ -617,8 +617,8 @@ wg_noise_handshake_consume_initiation(st
+       memcpy(handshake->hash, hash, NOISE_HASH_LEN);
+       memcpy(handshake->chaining_key, chaining_key, NOISE_HASH_LEN);
+       handshake->remote_index = src->sender_index;
+-      if ((s64)(handshake->last_initiation_consumption -
+-          (initiation_consumption = ktime_get_coarse_boottime_ns())) < 0)
++      initiation_consumption = ktime_get_coarse_boottime_ns();
++      if ((s64)(handshake->last_initiation_consumption - initiation_consumption) < 0)
+               handshake->last_initiation_consumption = initiation_consumption;
+       handshake->state = HANDSHAKE_CONSUMED_INITIATION;
+       up_write(&handshake->lock);
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0108-wireguard-device-avoid-circular-netns-references.patch b/target/linux/generic/backport-5.4/080-wireguard-0108-wireguard-device-avoid-circular-netns-references.patch

new file mode 100644 (file)

index 0000000..8021b9b
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0108-wireguard-device-avoid-circular-netns-references.patch
@@ -0,0 +1,296 @@
+From 40d881393cfc6953778691444ab27a29d51d24aa Mon Sep 17 00:00:00 2001
+From: "Jason A. Donenfeld" <Jason@zx2c4.com>
+Date: Tue, 23 Jun 2020 03:59:45 -0600
+Subject: [PATCH 108/124] wireguard: device: avoid circular netns references
+
+commit 900575aa33a3eaaef802b31de187a85c4a4b4bd0 upstream.
+
+Before, we took a reference to the creating netns if the new netns was
+different. This caused issues with circular references, with two
+wireguard interfaces swapping namespaces. The solution is to rather not
+take any extra references at all, but instead simply invalidate the
+creating netns pointer when that netns is deleted.
+
+In order to prevent this from happening again, this commit improves the
+rough object leak tracking by allowing it to account for created and
+destroyed interfaces, aside from just peers and keys. That then makes it
+possible to check for the object leak when having two interfaces take a
+reference to each other's namespaces.
+
+Fixes: e7096c131e51 ("net: WireGuard secure network tunnel")
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ drivers/net/wireguard/device.c             | 58 ++++++++++------------
+ drivers/net/wireguard/device.h             |  3 +-
+ drivers/net/wireguard/netlink.c            | 14 ++++--
+ drivers/net/wireguard/socket.c             | 25 +++++++---
+ tools/testing/selftests/wireguard/netns.sh | 13 ++++-
+ 5 files changed, 67 insertions(+), 46 deletions(-)
+
+--- a/drivers/net/wireguard/device.c
++++ b/drivers/net/wireguard/device.c
+@@ -45,17 +45,18 @@ static int wg_open(struct net_device *de
+       if (dev_v6)
+               dev_v6->cnf.addr_gen_mode = IN6_ADDR_GEN_MODE_NONE;
+ 
++      mutex_lock(&wg->device_update_lock);
+       ret = wg_socket_init(wg, wg->incoming_port);
+       if (ret < 0)
+-              return ret;
+-      mutex_lock(&wg->device_update_lock);
++              goto out;
+       list_for_each_entry(peer, &wg->peer_list, peer_list) {
+               wg_packet_send_staged_packets(peer);
+               if (peer->persistent_keepalive_interval)
+                       wg_packet_send_keepalive(peer);
+       }
++out:
+       mutex_unlock(&wg->device_update_lock);
+-      return 0;
++      return ret;
+ }
+ 
+ #ifdef CONFIG_PM_SLEEP
+@@ -225,6 +226,7 @@ static void wg_destruct(struct net_devic
+       list_del(&wg->device_list);
+       rtnl_unlock();
+       mutex_lock(&wg->device_update_lock);
++      rcu_assign_pointer(wg->creating_net, NULL);
+       wg->incoming_port = 0;
+       wg_socket_reinit(wg, NULL, NULL);
+       /* The final references are cleared in the below calls to destroy_workqueue. */
+@@ -240,13 +242,11 @@ static void wg_destruct(struct net_devic
+       skb_queue_purge(&wg->incoming_handshakes);
+       free_percpu(dev->tstats);
+       free_percpu(wg->incoming_handshakes_worker);
+-      if (wg->have_creating_net_ref)
+-              put_net(wg->creating_net);
+       kvfree(wg->index_hashtable);
+       kvfree(wg->peer_hashtable);
+       mutex_unlock(&wg->device_update_lock);
+ 
+-      pr_debug("%s: Interface deleted\n", dev->name);
++      pr_debug("%s: Interface destroyed\n", dev->name);
+       free_netdev(dev);
+ }
+ 
+@@ -292,7 +292,7 @@ static int wg_newlink(struct net *src_ne
+       struct wg_device *wg = netdev_priv(dev);
+       int ret = -ENOMEM;
+ 
+-      wg->creating_net = src_net;
++      rcu_assign_pointer(wg->creating_net, src_net);
+       init_rwsem(&wg->static_identity.lock);
+       mutex_init(&wg->socket_update_lock);
+       mutex_init(&wg->device_update_lock);
+@@ -393,30 +393,26 @@ static struct rtnl_link_ops link_ops __r
+       .newlink                = wg_newlink,
+ };
+ 
+-static int wg_netdevice_notification(struct notifier_block *nb,
+-                                   unsigned long action, void *data)
++static void wg_netns_pre_exit(struct net *net)
+ {
+-      struct net_device *dev = ((struct netdev_notifier_info *)data)->dev;
+-      struct wg_device *wg = netdev_priv(dev);
+-
+-      ASSERT_RTNL();
+-
+-      if (action != NETDEV_REGISTER || dev->netdev_ops != &netdev_ops)
+-              return 0;
++      struct wg_device *wg;
+ 
+-      if (dev_net(dev) == wg->creating_net && wg->have_creating_net_ref) {
+-              put_net(wg->creating_net);
+-              wg->have_creating_net_ref = false;
+-      } else if (dev_net(dev) != wg->creating_net &&
+-                 !wg->have_creating_net_ref) {
+-              wg->have_creating_net_ref = true;
+-              get_net(wg->creating_net);
++      rtnl_lock();
++      list_for_each_entry(wg, &device_list, device_list) {
++              if (rcu_access_pointer(wg->creating_net) == net) {
++                      pr_debug("%s: Creating namespace exiting\n", wg->dev->name);
++                      netif_carrier_off(wg->dev);
++                      mutex_lock(&wg->device_update_lock);
++                      rcu_assign_pointer(wg->creating_net, NULL);
++                      wg_socket_reinit(wg, NULL, NULL);
++                      mutex_unlock(&wg->device_update_lock);
++              }
+       }
+-      return 0;
++      rtnl_unlock();
+ }
+ 
+-static struct notifier_block netdevice_notifier = {
+-      .notifier_call = wg_netdevice_notification
++static struct pernet_operations pernet_ops = {
++      .pre_exit = wg_netns_pre_exit
+ };
+ 
+ int __init wg_device_init(void)
+@@ -429,18 +425,18 @@ int __init wg_device_init(void)
+               return ret;
+ #endif
+ 
+-      ret = register_netdevice_notifier(&netdevice_notifier);
++      ret = register_pernet_device(&pernet_ops);
+       if (ret)
+               goto error_pm;
+ 
+       ret = rtnl_link_register(&link_ops);
+       if (ret)
+-              goto error_netdevice;
++              goto error_pernet;
+ 
+       return 0;
+ 
+-error_netdevice:
+-      unregister_netdevice_notifier(&netdevice_notifier);
++error_pernet:
++      unregister_pernet_device(&pernet_ops);
+ error_pm:
+ #ifdef CONFIG_PM_SLEEP
+       unregister_pm_notifier(&pm_notifier);
+@@ -451,7 +447,7 @@ error_pm:
+ void wg_device_uninit(void)
+ {
+       rtnl_link_unregister(&link_ops);
+-      unregister_netdevice_notifier(&netdevice_notifier);
++      unregister_pernet_device(&pernet_ops);
+ #ifdef CONFIG_PM_SLEEP
+       unregister_pm_notifier(&pm_notifier);
+ #endif
+--- a/drivers/net/wireguard/device.h
++++ b/drivers/net/wireguard/device.h
+@@ -40,7 +40,7 @@ struct wg_device {
+       struct net_device *dev;
+       struct crypt_queue encrypt_queue, decrypt_queue;
+       struct sock __rcu *sock4, *sock6;
+-      struct net *creating_net;
++      struct net __rcu *creating_net;
+       struct noise_static_identity static_identity;
+       struct workqueue_struct *handshake_receive_wq, *handshake_send_wq;
+       struct workqueue_struct *packet_crypt_wq;
+@@ -56,7 +56,6 @@ struct wg_device {
+       unsigned int num_peers, device_update_gen;
+       u32 fwmark;
+       u16 incoming_port;
+-      bool have_creating_net_ref;
+ };
+ 
+ int wg_device_init(void);
+--- a/drivers/net/wireguard/netlink.c
++++ b/drivers/net/wireguard/netlink.c
+@@ -517,11 +517,15 @@ static int wg_set_device(struct sk_buff
+       if (flags & ~__WGDEVICE_F_ALL)
+               goto out;
+ 
+-      ret = -EPERM;
+-      if ((info->attrs[WGDEVICE_A_LISTEN_PORT] ||
+-           info->attrs[WGDEVICE_A_FWMARK]) &&
+-          !ns_capable(wg->creating_net->user_ns, CAP_NET_ADMIN))
+-              goto out;
++      if (info->attrs[WGDEVICE_A_LISTEN_PORT] || info->attrs[WGDEVICE_A_FWMARK]) {
++              struct net *net;
++              rcu_read_lock();
++              net = rcu_dereference(wg->creating_net);
++              ret = !net || !ns_capable(net->user_ns, CAP_NET_ADMIN) ? -EPERM : 0;
++              rcu_read_unlock();
++              if (ret)
++                      goto out;
++      }
+ 
+       ++wg->device_update_gen;
+ 
+--- a/drivers/net/wireguard/socket.c
++++ b/drivers/net/wireguard/socket.c
+@@ -347,6 +347,7 @@ static void set_sock_opts(struct socket
+ 
+ int wg_socket_init(struct wg_device *wg, u16 port)
+ {
++      struct net *net;
+       int ret;
+       struct udp_tunnel_sock_cfg cfg = {
+               .sk_user_data = wg,
+@@ -371,37 +372,47 @@ int wg_socket_init(struct wg_device *wg,
+       };
+ #endif
+ 
++      rcu_read_lock();
++      net = rcu_dereference(wg->creating_net);
++      net = net ? maybe_get_net(net) : NULL;
++      rcu_read_unlock();
++      if (unlikely(!net))
++              return -ENONET;
++
+ #if IS_ENABLED(CONFIG_IPV6)
+ retry:
+ #endif
+ 
+-      ret = udp_sock_create(wg->creating_net, &port4, &new4);
++      ret = udp_sock_create(net, &port4, &new4);
+       if (ret < 0) {
+               pr_err("%s: Could not create IPv4 socket\n", wg->dev->name);
+-              return ret;
++              goto out;
+       }
+       set_sock_opts(new4);
+-      setup_udp_tunnel_sock(wg->creating_net, new4, &cfg);
++      setup_udp_tunnel_sock(net, new4, &cfg);
+ 
+ #if IS_ENABLED(CONFIG_IPV6)
+       if (ipv6_mod_enabled()) {
+               port6.local_udp_port = inet_sk(new4->sk)->inet_sport;
+-              ret = udp_sock_create(wg->creating_net, &port6, &new6);
++              ret = udp_sock_create(net, &port6, &new6);
+               if (ret < 0) {
+                       udp_tunnel_sock_release(new4);
+                       if (ret == -EADDRINUSE && !port && retries++ < 100)
+                               goto retry;
+                       pr_err("%s: Could not create IPv6 socket\n",
+                              wg->dev->name);
+-                      return ret;
++                      goto out;
+               }
+               set_sock_opts(new6);
+-              setup_udp_tunnel_sock(wg->creating_net, new6, &cfg);
++              setup_udp_tunnel_sock(net, new6, &cfg);
+       }
+ #endif
+ 
+       wg_socket_reinit(wg, new4->sk, new6 ? new6->sk : NULL);
+-      return 0;
++      ret = 0;
++out:
++      put_net(net);
++      return ret;
+ }
+ 
+ void wg_socket_reinit(struct wg_device *wg, struct sock *new4,
+--- a/tools/testing/selftests/wireguard/netns.sh
++++ b/tools/testing/selftests/wireguard/netns.sh
+@@ -587,9 +587,20 @@ ip0 link set wg0 up
+ kill $ncat_pid
+ ip0 link del wg0
+ 
++# Ensure there aren't circular reference loops
++ip1 link add wg1 type wireguard
++ip2 link add wg2 type wireguard
++ip1 link set wg1 netns $netns2
++ip2 link set wg2 netns $netns1
++pp ip netns delete $netns1
++pp ip netns delete $netns2
++pp ip netns add $netns1
++pp ip netns add $netns2
++
++sleep 2 # Wait for cleanup and grace periods
+ declare -A objects
+ while read -t 0.1 -r line 2>/dev/null || [[ $? -ne 142 ]]; do
+-      [[ $line =~ .*(wg[0-9]+:\ [A-Z][a-z]+\ [0-9]+)\ .*(created|destroyed).* ]] || continue
++      [[ $line =~ .*(wg[0-9]+:\ [A-Z][a-z]+\ ?[0-9]*)\ .*(created|destroyed).* ]] || continue
+       objects["${BASH_REMATCH[1]}"]+="${BASH_REMATCH[2]}"
+ done < /dev/kmsg
+ alldeleted=1
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0109-wireguard-receive-account-for-napi_gro_receive-never.patch b/target/linux/generic/backport-5.4/080-wireguard-0109-wireguard-receive-account-for-napi_gro_receive-never.patch

new file mode 100644 (file)

index 0000000..edcbc8a
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0109-wireguard-receive-account-for-napi_gro_receive-never.patch
@@ -0,0 +1,42 @@
+From b7077a2f4d374d3f2108af9d0a1b94fd2c346ba7 Mon Sep 17 00:00:00 2001
+From: "Jason A. Donenfeld" <Jason@zx2c4.com>
+Date: Wed, 24 Jun 2020 16:06:03 -0600
+Subject: [PATCH 109/124] wireguard: receive: account for napi_gro_receive
+ never returning GRO_DROP
+
+commit df08126e3833e9dca19e2407db5f5860a7c194fb upstream.
+
+The napi_gro_receive function no longer returns GRO_DROP ever, making
+handling GRO_DROP dead code. This commit removes that dead code.
+Further, it's not even clear that device drivers have any business in
+taking action after passing off received packets; that's arguably out of
+their hands.
+
+Fixes: e7096c131e51 ("net: WireGuard secure network tunnel")
+Fixes: 6570bc79c0df ("net: core: use listified Rx for GRO_NORMAL in napi_gro_receive()")
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ drivers/net/wireguard/receive.c | 10 ++--------
+ 1 file changed, 2 insertions(+), 8 deletions(-)
+
+--- a/drivers/net/wireguard/receive.c
++++ b/drivers/net/wireguard/receive.c
+@@ -414,14 +414,8 @@ static void wg_packet_consume_data_done(
+       if (unlikely(routed_peer != peer))
+               goto dishonest_packet_peer;
+ 
+-      if (unlikely(napi_gro_receive(&peer->napi, skb) == GRO_DROP)) {
+-              ++dev->stats.rx_dropped;
+-              net_dbg_ratelimited("%s: Failed to give packet to userspace from peer %llu (%pISpfsc)\n",
+-                                  dev->name, peer->internal_id,
+-                                  &peer->endpoint.addr);
+-      } else {
+-              update_rx_stats(peer, message_data_len(len_before_trim));
+-      }
++      napi_gro_receive(&peer->napi, skb);
++      update_rx_stats(peer, message_data_len(len_before_trim));
+       return;
+ 
+ dishonest_packet_peer:
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0110-net-ip_tunnel-add-header_ops-for-layer-3-devices.patch b/target/linux/generic/backport-5.4/080-wireguard-0110-net-ip_tunnel-add-header_ops-for-layer-3-devices.patch

new file mode 100644 (file)

index 0000000..4e925d7
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0110-net-ip_tunnel-add-header_ops-for-layer-3-devices.patch
@@ -0,0 +1,58 @@
+From 5effaa566cea8d862bf00ff81d2e3fa40521d296 Mon Sep 17 00:00:00 2001
+From: "Jason A. Donenfeld" <Jason@zx2c4.com>
+Date: Mon, 29 Jun 2020 19:06:18 -0600
+Subject: [PATCH 110/124] net: ip_tunnel: add header_ops for layer 3 devices
+
+commit 2606aff916854b61234bf85001be9777bab2d5f8 upstream.
+
+Some devices that take straight up layer 3 packets benefit from having a
+shared header_ops so that AF_PACKET sockets can inject packets that are
+recognized. This shared infrastructure will be used by other drivers
+that currently can't inject packets using AF_PACKET. It also exposes the
+parser function, as it is useful in standalone form too.
+
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Acked-by: Willem de Bruijn <willemb@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ include/net/ip_tunnels.h  |  3 +++
+ net/ipv4/ip_tunnel_core.c | 18 ++++++++++++++++++
+ 2 files changed, 21 insertions(+)
+
+--- a/include/net/ip_tunnels.h
++++ b/include/net/ip_tunnels.h
+@@ -289,6 +289,9 @@ int ip_tunnel_newlink(struct net_device
+                     struct ip_tunnel_parm *p, __u32 fwmark);
+ void ip_tunnel_setup(struct net_device *dev, unsigned int net_id);
+ 
++extern const struct header_ops ip_tunnel_header_ops;
++__be16 ip_tunnel_parse_protocol(const struct sk_buff *skb);
++
+ struct ip_tunnel_encap_ops {
+       size_t (*encap_hlen)(struct ip_tunnel_encap *e);
+       int (*build_header)(struct sk_buff *skb, struct ip_tunnel_encap *e,
+--- a/net/ipv4/ip_tunnel_core.c
++++ b/net/ipv4/ip_tunnel_core.c
+@@ -446,3 +446,21 @@ void ip_tunnel_unneed_metadata(void)
+       static_branch_dec(&ip_tunnel_metadata_cnt);
+ }
+ EXPORT_SYMBOL_GPL(ip_tunnel_unneed_metadata);
++
++/* Returns either the correct skb->protocol value, or 0 if invalid. */
++__be16 ip_tunnel_parse_protocol(const struct sk_buff *skb)
++{
++      if (skb_network_header(skb) >= skb->head &&
++          (skb_network_header(skb) + sizeof(struct iphdr)) <= skb_tail_pointer(skb) &&
++          ip_hdr(skb)->version == 4)
++              return htons(ETH_P_IP);
++      if (skb_network_header(skb) >= skb->head &&
++          (skb_network_header(skb) + sizeof(struct ipv6hdr)) <= skb_tail_pointer(skb) &&
++          ipv6_hdr(skb)->version == 6)
++              return htons(ETH_P_IPV6);
++      return 0;
++}
++EXPORT_SYMBOL(ip_tunnel_parse_protocol);
++
++const struct header_ops ip_tunnel_header_ops = { .parse_protocol = ip_tunnel_parse_protocol };
++EXPORT_SYMBOL(ip_tunnel_header_ops);
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0111-wireguard-implement-header_ops-parse_protocol-for-AF.patch b/target/linux/generic/backport-5.4/080-wireguard-0111-wireguard-implement-header_ops-parse_protocol-for-AF.patch

new file mode 100644 (file)

index 0000000..4cc67a7
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0111-wireguard-implement-header_ops-parse_protocol-for-AF.patch
@@ -0,0 +1,37 @@
+From cf413ab742788eeb47e789934d492bb546aa4aa8 Mon Sep 17 00:00:00 2001
+From: "Jason A. Donenfeld" <Jason@zx2c4.com>
+Date: Mon, 29 Jun 2020 19:06:20 -0600
+Subject: [PATCH 111/124] wireguard: implement header_ops->parse_protocol for
+ AF_PACKET
+
+commit 01a4967c71c004f8ecad4ab57021348636502fa9 upstream.
+
+WireGuard uses skb->protocol to determine packet type, and bails out if
+it's not set or set to something it's not expecting. For AF_PACKET
+injection, we need to support its call chain of:
+
+    packet_sendmsg -> packet_snd -> packet_parse_headers ->
+      dev_parse_header_protocol -> parse_protocol
+
+Without a valid parse_protocol, this returns zero, and wireguard then
+rejects the skb. So, this wires up the ip_tunnel handler for layer 3
+packets for that case.
+
+Reported-by: Hans Wippel <ndev@hwipl.net>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ drivers/net/wireguard/device.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/drivers/net/wireguard/device.c
++++ b/drivers/net/wireguard/device.c
+@@ -262,6 +262,7 @@ static void wg_setup(struct net_device *
+                            max(sizeof(struct ipv6hdr), sizeof(struct iphdr));
+ 
+       dev->netdev_ops = &netdev_ops;
++      dev->header_ops = &ip_tunnel_header_ops;
+       dev->hard_header_len = 0;
+       dev->addr_len = 0;
+       dev->needed_headroom = DATA_PACKET_HEAD_ROOM;
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0112-wireguard-queueing-make-use-of-ip_tunnel_parse_proto.patch b/target/linux/generic/backport-5.4/080-wireguard-0112-wireguard-queueing-make-use-of-ip_tunnel_parse_proto.patch

new file mode 100644 (file)

index 0000000..1f8766c
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0112-wireguard-queueing-make-use-of-ip_tunnel_parse_proto.patch
@@ -0,0 +1,69 @@
+From 83313326c87e7c1aacebce4f8411505e2b68bf25 Mon Sep 17 00:00:00 2001
+From: "Jason A. Donenfeld" <Jason@zx2c4.com>
+Date: Mon, 29 Jun 2020 19:06:21 -0600
+Subject: [PATCH 112/124] wireguard: queueing: make use of
+ ip_tunnel_parse_protocol
+
+commit 1a574074ae7d1d745c16f7710655f38a53174c27 upstream.
+
+Now that wg_examine_packet_protocol has been added for general
+consumption as ip_tunnel_parse_protocol, it's possible to remove
+wg_examine_packet_protocol and simply use the new
+ip_tunnel_parse_protocol function directly.
+
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ drivers/net/wireguard/queueing.h | 19 ++-----------------
+ drivers/net/wireguard/receive.c  |  2 +-
+ 2 files changed, 3 insertions(+), 18 deletions(-)
+
+--- a/drivers/net/wireguard/queueing.h
++++ b/drivers/net/wireguard/queueing.h
+@@ -11,6 +11,7 @@
+ #include <linux/skbuff.h>
+ #include <linux/ip.h>
+ #include <linux/ipv6.h>
++#include <net/ip_tunnels.h>
+ 
+ struct wg_device;
+ struct wg_peer;
+@@ -65,25 +66,9 @@ struct packet_cb {
+ #define PACKET_CB(skb) ((struct packet_cb *)((skb)->cb))
+ #define PACKET_PEER(skb) (PACKET_CB(skb)->keypair->entry.peer)
+ 
+-/* Returns either the correct skb->protocol value, or 0 if invalid. */
+-static inline __be16 wg_examine_packet_protocol(struct sk_buff *skb)
+-{
+-      if (skb_network_header(skb) >= skb->head &&
+-          (skb_network_header(skb) + sizeof(struct iphdr)) <=
+-                  skb_tail_pointer(skb) &&
+-          ip_hdr(skb)->version == 4)
+-              return htons(ETH_P_IP);
+-      if (skb_network_header(skb) >= skb->head &&
+-          (skb_network_header(skb) + sizeof(struct ipv6hdr)) <=
+-                  skb_tail_pointer(skb) &&
+-          ipv6_hdr(skb)->version == 6)
+-              return htons(ETH_P_IPV6);
+-      return 0;
+-}
+-
+ static inline bool wg_check_packet_protocol(struct sk_buff *skb)
+ {
+-      __be16 real_protocol = wg_examine_packet_protocol(skb);
++      __be16 real_protocol = ip_tunnel_parse_protocol(skb);
+       return real_protocol && skb->protocol == real_protocol;
+ }
+ 
+--- a/drivers/net/wireguard/receive.c
++++ b/drivers/net/wireguard/receive.c
+@@ -387,7 +387,7 @@ static void wg_packet_consume_data_done(
+        */
+       skb->ip_summed = CHECKSUM_UNNECESSARY;
+       skb->csum_level = ~0; /* All levels */
+-      skb->protocol = wg_examine_packet_protocol(skb);
++      skb->protocol = ip_tunnel_parse_protocol(skb);
+       if (skb->protocol == htons(ETH_P_IP)) {
+               len = ntohs(ip_hdr(skb)->tot_len);
+               if (unlikely(len < sizeof(struct iphdr)))
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0113-netlink-consistently-use-NLA_POLICY_EXACT_LEN.patch b/target/linux/generic/backport-5.4/080-wireguard-0113-netlink-consistently-use-NLA_POLICY_EXACT_LEN.patch

new file mode 100644 (file)

index 0000000..f343ed8
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0113-netlink-consistently-use-NLA_POLICY_EXACT_LEN.patch
@@ -0,0 +1,49 @@
+From 605843f571026155514f83127885ae81b83479ae Mon Sep 17 00:00:00 2001
+From: Johannes Berg <johannes.berg@intel.com>
+Date: Tue, 18 Aug 2020 10:17:31 +0200
+Subject: [PATCH 113/124] netlink: consistently use NLA_POLICY_EXACT_LEN()
+
+commit 8140860c817f3e9f78bcd1e420b9777ddcbaa629 upstream.
+
+Change places that open-code NLA_POLICY_EXACT_LEN() to
+use the macro instead, giving us flexibility in how we
+handle the details of the macro.
+
+Signed-off-by: Johannes Berg <johannes.berg@intel.com>
+Acked-by: Matthieu Baerts <matthieu.baerts@tessares.net>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+[Jason: only picked the drivers/net/wireguard/* part]
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ drivers/net/wireguard/netlink.c | 10 +++++-----
+ 1 file changed, 5 insertions(+), 5 deletions(-)
+
+--- a/drivers/net/wireguard/netlink.c
++++ b/drivers/net/wireguard/netlink.c
+@@ -22,8 +22,8 @@ static struct genl_family genl_family;
+ static const struct nla_policy device_policy[WGDEVICE_A_MAX + 1] = {
+       [WGDEVICE_A_IFINDEX]            = { .type = NLA_U32 },
+       [WGDEVICE_A_IFNAME]             = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 },
+-      [WGDEVICE_A_PRIVATE_KEY]        = { .type = NLA_EXACT_LEN, .len = NOISE_PUBLIC_KEY_LEN },
+-      [WGDEVICE_A_PUBLIC_KEY]         = { .type = NLA_EXACT_LEN, .len = NOISE_PUBLIC_KEY_LEN },
++      [WGDEVICE_A_PRIVATE_KEY]        = NLA_POLICY_EXACT_LEN(NOISE_PUBLIC_KEY_LEN),
++      [WGDEVICE_A_PUBLIC_KEY]         = NLA_POLICY_EXACT_LEN(NOISE_PUBLIC_KEY_LEN),
+       [WGDEVICE_A_FLAGS]              = { .type = NLA_U32 },
+       [WGDEVICE_A_LISTEN_PORT]        = { .type = NLA_U16 },
+       [WGDEVICE_A_FWMARK]             = { .type = NLA_U32 },
+@@ -31,12 +31,12 @@ static const struct nla_policy device_po
+ };
+ 
+ static const struct nla_policy peer_policy[WGPEER_A_MAX + 1] = {
+-      [WGPEER_A_PUBLIC_KEY]                           = { .type = NLA_EXACT_LEN, .len = NOISE_PUBLIC_KEY_LEN },
+-      [WGPEER_A_PRESHARED_KEY]                        = { .type = NLA_EXACT_LEN, .len = NOISE_SYMMETRIC_KEY_LEN },
++      [WGPEER_A_PUBLIC_KEY]                           = NLA_POLICY_EXACT_LEN(NOISE_PUBLIC_KEY_LEN),
++      [WGPEER_A_PRESHARED_KEY]                        = NLA_POLICY_EXACT_LEN(NOISE_SYMMETRIC_KEY_LEN),
+       [WGPEER_A_FLAGS]                                = { .type = NLA_U32 },
+       [WGPEER_A_ENDPOINT]                             = { .type = NLA_MIN_LEN, .len = sizeof(struct sockaddr) },
+       [WGPEER_A_PERSISTENT_KEEPALIVE_INTERVAL]        = { .type = NLA_U16 },
+-      [WGPEER_A_LAST_HANDSHAKE_TIME]                  = { .type = NLA_EXACT_LEN, .len = sizeof(struct __kernel_timespec) },
++      [WGPEER_A_LAST_HANDSHAKE_TIME]                  = NLA_POLICY_EXACT_LEN(sizeof(struct __kernel_timespec)),
+       [WGPEER_A_RX_BYTES]                             = { .type = NLA_U64 },
+       [WGPEER_A_TX_BYTES]                             = { .type = NLA_U64 },
+       [WGPEER_A_ALLOWEDIPS]                           = { .type = NLA_NESTED },
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0114-netlink-consistently-use-NLA_POLICY_MIN_LEN.patch b/target/linux/generic/backport-5.4/080-wireguard-0114-netlink-consistently-use-NLA_POLICY_MIN_LEN.patch

new file mode 100644 (file)

index 0000000..a859e7c
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0114-netlink-consistently-use-NLA_POLICY_MIN_LEN.patch
@@ -0,0 +1,39 @@
+From 2c778b2cd59a12f0dcba236e6441a318d1c6486c Mon Sep 17 00:00:00 2001
+From: Johannes Berg <johannes.berg@intel.com>
+Date: Tue, 18 Aug 2020 10:17:32 +0200
+Subject: [PATCH 114/124] netlink: consistently use NLA_POLICY_MIN_LEN()
+
+commit bc0435855041d7fff0b83dd992fc4be34aa11afb upstream.
+
+Change places that open-code NLA_POLICY_MIN_LEN() to
+use the macro instead, giving us flexibility in how we
+handle the details of the macro.
+
+Signed-off-by: Johannes Berg <johannes.berg@intel.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+[Jason: only picked the drivers/net/wireguard/* part]
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ drivers/net/wireguard/netlink.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/drivers/net/wireguard/netlink.c
++++ b/drivers/net/wireguard/netlink.c
+@@ -34,7 +34,7 @@ static const struct nla_policy peer_poli
+       [WGPEER_A_PUBLIC_KEY]                           = NLA_POLICY_EXACT_LEN(NOISE_PUBLIC_KEY_LEN),
+       [WGPEER_A_PRESHARED_KEY]                        = NLA_POLICY_EXACT_LEN(NOISE_SYMMETRIC_KEY_LEN),
+       [WGPEER_A_FLAGS]                                = { .type = NLA_U32 },
+-      [WGPEER_A_ENDPOINT]                             = { .type = NLA_MIN_LEN, .len = sizeof(struct sockaddr) },
++      [WGPEER_A_ENDPOINT]                             = NLA_POLICY_MIN_LEN(sizeof(struct sockaddr)),
+       [WGPEER_A_PERSISTENT_KEEPALIVE_INTERVAL]        = { .type = NLA_U16 },
+       [WGPEER_A_LAST_HANDSHAKE_TIME]                  = NLA_POLICY_EXACT_LEN(sizeof(struct __kernel_timespec)),
+       [WGPEER_A_RX_BYTES]                             = { .type = NLA_U64 },
+@@ -45,7 +45,7 @@ static const struct nla_policy peer_poli
+ 
+ static const struct nla_policy allowedip_policy[WGALLOWEDIP_A_MAX + 1] = {
+       [WGALLOWEDIP_A_FAMILY]          = { .type = NLA_U16 },
+-      [WGALLOWEDIP_A_IPADDR]          = { .type = NLA_MIN_LEN, .len = sizeof(struct in_addr) },
++      [WGALLOWEDIP_A_IPADDR]          = NLA_POLICY_MIN_LEN(sizeof(struct in_addr)),
+       [WGALLOWEDIP_A_CIDR_MASK]       = { .type = NLA_U8 }
+ };
+ 
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0115-wireguard-noise-take-lock-when-removing-handshake-en.patch b/target/linux/generic/backport-5.4/080-wireguard-0115-wireguard-noise-take-lock-when-removing-handshake-en.patch

new file mode 100644 (file)

index 0000000..74448ed
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0115-wireguard-noise-take-lock-when-removing-handshake-en.patch
@@ -0,0 +1,127 @@
+From 9d4c0f8cd4cca2c65c7927f839469d6c1bef088f Mon Sep 17 00:00:00 2001
+From: "Jason A. Donenfeld" <Jason@zx2c4.com>
+Date: Wed, 9 Sep 2020 13:58:14 +0200
+Subject: [PATCH 115/124] wireguard: noise: take lock when removing handshake
+ entry from table
+
+commit 9179ba31367bcf481c3c79b5f028c94faad9f30a upstream.
+
+Eric reported that syzkaller found a race of this variety:
+
+CPU 1                                       CPU 2
+-------------------------------------------|---------------------------------------
+wg_index_hashtable_replace(old, ...)       |
+  if (hlist_unhashed(&old->index_hash))    |
+                                           | wg_index_hashtable_remove(old)
+                                           |   hlist_del_init_rcu(&old->index_hash)
+                                          |     old->index_hash.pprev = NULL
+  hlist_replace_rcu(&old->index_hash, ...) |
+    *old->index_hash.pprev                 |
+
+Syzbot wasn't actually able to reproduce this more than once or create a
+reproducer, because the race window between checking "hlist_unhashed" and
+calling "hlist_replace_rcu" is just so small. Adding an mdelay(5) or
+similar there helps make this demonstrable using this simple script:
+
+    #!/bin/bash
+    set -ex
+    trap 'kill $pid1; kill $pid2; ip link del wg0; ip link del wg1' EXIT
+    ip link add wg0 type wireguard
+    ip link add wg1 type wireguard
+    wg set wg0 private-key <(wg genkey) listen-port 9999
+    wg set wg1 private-key <(wg genkey) peer $(wg show wg0 public-key) endpoint 127.0.0.1:9999 persistent-keepalive 1
+    wg set wg0 peer $(wg show wg1 public-key)
+    ip link set wg0 up
+    yes link set wg1 up | ip -force -batch - &
+    pid1=$!
+    yes link set wg1 down | ip -force -batch - &
+    pid2=$!
+    wait
+
+The fundamental underlying problem is that we permit calls to wg_index_
+hashtable_remove(handshake.entry) without requiring the caller to take
+the handshake mutex that is intended to protect members of handshake
+during mutations. This is consistently the case with calls to wg_index_
+hashtable_insert(handshake.entry) and wg_index_hashtable_replace(
+handshake.entry), but it's missing from a pertinent callsite of wg_
+index_hashtable_remove(handshake.entry). So, this patch makes sure that
+mutex is taken.
+
+The original code was a little bit funky though, in the form of:
+
+    remove(handshake.entry)
+    lock(), memzero(handshake.some_members), unlock()
+    remove(handshake.entry)
+
+The original intention of that double removal pattern outside the lock
+appears to be some attempt to prevent insertions that might happen while
+locks are dropped during expensive crypto operations, but actually, all
+callers of wg_index_hashtable_insert(handshake.entry) take the write
+lock and then explicitly check handshake.state, as they should, which
+the aforementioned memzero clears, which means an insertion should
+already be impossible. And regardless, the original intention was
+necessarily racy, since it wasn't guaranteed that something else would
+run after the unlock() instead of after the remove(). So, from a
+soundness perspective, it seems positive to remove what looks like a
+hack at best.
+
+The crash from both syzbot and from the script above is as follows:
+
+  general protection fault, probably for non-canonical address 0xdffffc0000000000: 0000 [#1] PREEMPT SMP KASAN
+  KASAN: null-ptr-deref in range [0x0000000000000000-0x0000000000000007]
+  CPU: 0 PID: 7395 Comm: kworker/0:3 Not tainted 5.9.0-rc4-syzkaller #0
+  Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
+  Workqueue: wg-kex-wg1 wg_packet_handshake_receive_worker
+  RIP: 0010:hlist_replace_rcu include/linux/rculist.h:505 [inline]
+  RIP: 0010:wg_index_hashtable_replace+0x176/0x330 drivers/net/wireguard/peerlookup.c:174
+  Code: 00 fc ff df 48 89 f9 48 c1 e9 03 80 3c 01 00 0f 85 44 01 00 00 48 b9 00 00 00 00 00 fc ff df 48 8b 45 10 48 89 c6 48 c1 ee 03 <80> 3c 0e 00 0f 85 06 01 00 00 48 85 d2 4c 89 28 74 47 e8 a3 4f b5
+  RSP: 0018:ffffc90006a97bf8 EFLAGS: 00010246
+  RAX: 0000000000000000 RBX: ffff888050ffc4f8 RCX: dffffc0000000000
+  RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff88808e04e010
+  RBP: ffff88808e04e000 R08: 0000000000000001 R09: ffff8880543d0000
+  R10: ffffed100a87a000 R11: 000000000000016e R12: ffff8880543d0000
+  R13: ffff88808e04e008 R14: ffff888050ffc508 R15: ffff888050ffc500
+  FS:  0000000000000000(0000) GS:ffff8880ae600000(0000) knlGS:0000000000000000
+  CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+  CR2: 00000000f5505db0 CR3: 0000000097cf7000 CR4: 00000000001526f0
+  DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
+  DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
+  Call Trace:
+  wg_noise_handshake_begin_session+0x752/0xc9a drivers/net/wireguard/noise.c:820
+  wg_receive_handshake_packet drivers/net/wireguard/receive.c:183 [inline]
+  wg_packet_handshake_receive_worker+0x33b/0x730 drivers/net/wireguard/receive.c:220
+  process_one_work+0x94c/0x1670 kernel/workqueue.c:2269
+  worker_thread+0x64c/0x1120 kernel/workqueue.c:2415
+  kthread+0x3b5/0x4a0 kernel/kthread.c:292
+  ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:294
+
+Reported-by: syzbot <syzkaller@googlegroups.com>
+Reported-by: Eric Dumazet <edumazet@google.com>
+Link: https://lore.kernel.org/wireguard/20200908145911.4090480-1-edumazet@google.com/
+Fixes: e7096c131e51 ("net: WireGuard secure network tunnel")
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ drivers/net/wireguard/noise.c | 5 +----
+ 1 file changed, 1 insertion(+), 4 deletions(-)
+
+--- a/drivers/net/wireguard/noise.c
++++ b/drivers/net/wireguard/noise.c
+@@ -87,15 +87,12 @@ static void handshake_zero(struct noise_
+ 
+ void wg_noise_handshake_clear(struct noise_handshake *handshake)
+ {
++      down_write(&handshake->lock);
+       wg_index_hashtable_remove(
+                       handshake->entry.peer->device->index_hashtable,
+                       &handshake->entry);
+-      down_write(&handshake->lock);
+       handshake_zero(handshake);
+       up_write(&handshake->lock);
+-      wg_index_hashtable_remove(
+-                      handshake->entry.peer->device->index_hashtable,
+-                      &handshake->entry);
+ }
+ 
+ static struct noise_keypair *keypair_create(struct wg_peer *peer)
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0116-wireguard-peerlookup-take-lock-before-checking-hash-.patch b/target/linux/generic/backport-5.4/080-wireguard-0116-wireguard-peerlookup-take-lock-before-checking-hash-.patch

new file mode 100644 (file)

index 0000000..b329d41
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0116-wireguard-peerlookup-take-lock-before-checking-hash-.patch
@@ -0,0 +1,62 @@
+From 1f5495019fce5680d54f94204500ee59d43fa15a Mon Sep 17 00:00:00 2001
+From: "Jason A. Donenfeld" <Jason@zx2c4.com>
+Date: Wed, 9 Sep 2020 13:58:15 +0200
+Subject: [PATCH 116/124] wireguard: peerlookup: take lock before checking hash
+ in replace operation
+
+commit 6147f7b1e90ff09bd52afc8b9206a7fcd133daf7 upstream.
+
+Eric's suggested fix for the previous commit's mentioned race condition
+was to simply take the table->lock in wg_index_hashtable_replace(). The
+table->lock of the hash table is supposed to protect the bucket heads,
+not the entries, but actually, since all the mutator functions are
+already taking it, it makes sense to take it too for the test to
+hlist_unhashed, as a defense in depth measure, so that it no longer
+races with deletions, regardless of what other locks are protecting
+individual entries. This is sensible from a performance perspective
+because, as Eric pointed out, the case of being unhashed is already the
+unlikely case, so this won't add common contention. And comparing
+instructions, this basically doesn't make much of a difference other
+than pushing and popping %r13, used by the new `bool ret`. More
+generally, I like the idea of locking consistency across table mutator
+functions, and this might let me rest slightly easier at night.
+
+Suggested-by: Eric Dumazet <edumazet@google.com>
+Link: https://lore.kernel.org/wireguard/20200908145911.4090480-1-edumazet@google.com/
+Fixes: e7096c131e51 ("net: WireGuard secure network tunnel")
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ drivers/net/wireguard/peerlookup.c | 11 ++++++++---
+ 1 file changed, 8 insertions(+), 3 deletions(-)
+
+--- a/drivers/net/wireguard/peerlookup.c
++++ b/drivers/net/wireguard/peerlookup.c
+@@ -167,9 +167,13 @@ bool wg_index_hashtable_replace(struct i
+                               struct index_hashtable_entry *old,
+                               struct index_hashtable_entry *new)
+ {
+-      if (unlikely(hlist_unhashed(&old->index_hash)))
+-              return false;
++      bool ret;
++
+       spin_lock_bh(&table->lock);
++      ret = !hlist_unhashed(&old->index_hash);
++      if (unlikely(!ret))
++              goto out;
++
+       new->index = old->index;
+       hlist_replace_rcu(&old->index_hash, &new->index_hash);
+ 
+@@ -180,8 +184,9 @@ bool wg_index_hashtable_replace(struct i
+        * simply gets dropped, which isn't terrible.
+        */
+       INIT_HLIST_NODE(&old->index_hash);
++out:
+       spin_unlock_bh(&table->lock);
+-      return true;
++      return ret;
+ }
+ 
+ void wg_index_hashtable_remove(struct index_hashtable *table,
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0117-wireguard-selftests-check-that-route_me_harder-packe.patch b/target/linux/generic/backport-5.4/080-wireguard-0117-wireguard-selftests-check-that-route_me_harder-packe.patch

new file mode 100644 (file)

index 0000000..8a6e75b
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0117-wireguard-selftests-check-that-route_me_harder-packe.patch
@@ -0,0 +1,56 @@
+From 7e687dff94e8acf478f787c75007d180c9c2dcc0 Mon Sep 17 00:00:00 2001
+From: "Jason A. Donenfeld" <Jason@zx2c4.com>
+Date: Thu, 29 Oct 2020 03:56:05 +0100
+Subject: [PATCH 117/124] wireguard: selftests: check that route_me_harder
+ packets use the right sk
+
+commit af8afcf1fdd5f365f70e2386c2d8c7a1abd853d7 upstream.
+
+If netfilter changes the packet mark, the packet is rerouted. The
+ip_route_me_harder family of functions fails to use the right sk, opting
+to instead use skb->sk, resulting in a routing loop when used with
+tunnels. With the next change fixing this issue in netfilter, test for
+the relevant condition inside our test suite, since wireguard was where
+the bug was discovered.
+
+Reported-by: Chen Minqiang <ptpt52@gmail.com>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ tools/testing/selftests/wireguard/netns.sh           | 8 ++++++++
+ tools/testing/selftests/wireguard/qemu/kernel.config | 2 ++
+ 2 files changed, 10 insertions(+)
+
+--- a/tools/testing/selftests/wireguard/netns.sh
++++ b/tools/testing/selftests/wireguard/netns.sh
+@@ -316,6 +316,14 @@ pp sleep 3
+ n2 ping -W 1 -c 1 192.168.241.1
+ n1 wg set wg0 peer "$pub2" persistent-keepalive 0
+ 
++# Test that sk_bound_dev_if works
++n1 ping -I wg0 -c 1 -W 1 192.168.241.2
++# What about when the mark changes and the packet must be rerouted?
++n1 iptables -t mangle -I OUTPUT -j MARK --set-xmark 1
++n1 ping -c 1 -W 1 192.168.241.2 # First the boring case
++n1 ping -I wg0 -c 1 -W 1 192.168.241.2 # Then the sk_bound_dev_if case
++n1 iptables -t mangle -D OUTPUT -j MARK --set-xmark 1
++
+ # Test that onion routing works, even when it loops
+ n1 wg set wg0 peer "$pub3" allowed-ips 192.168.242.2/32 endpoint 192.168.241.2:5
+ ip1 addr add 192.168.242.1/24 dev wg0
+--- a/tools/testing/selftests/wireguard/qemu/kernel.config
++++ b/tools/testing/selftests/wireguard/qemu/kernel.config
+@@ -18,10 +18,12 @@ CONFIG_NF_NAT=y
+ CONFIG_NETFILTER_XTABLES=y
+ CONFIG_NETFILTER_XT_NAT=y
+ CONFIG_NETFILTER_XT_MATCH_LENGTH=y
++CONFIG_NETFILTER_XT_MARK=y
+ CONFIG_NF_CONNTRACK_IPV4=y
+ CONFIG_NF_NAT_IPV4=y
+ CONFIG_IP_NF_IPTABLES=y
+ CONFIG_IP_NF_FILTER=y
++CONFIG_IP_NF_MANGLE=y
+ CONFIG_IP_NF_NAT=y
+ CONFIG_IP_ADVANCED_ROUTER=y
+ CONFIG_IP_MULTIPLE_TABLES=y
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0118-wireguard-avoid-double-unlikely-notation-when-using-.patch b/target/linux/generic/backport-5.4/080-wireguard-0118-wireguard-avoid-double-unlikely-notation-when-using-.patch

new file mode 100644 (file)

index 0000000..b461b77
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0118-wireguard-avoid-double-unlikely-notation-when-using-.patch
@@ -0,0 +1,55 @@
+From 6f5f2660a44325a75ff2ccbf799103c3613e78bb Mon Sep 17 00:00:00 2001
+From: Antonio Quartulli <a@unstable.cc>
+Date: Mon, 22 Feb 2021 17:25:43 +0100
+Subject: [PATCH 118/124] wireguard: avoid double unlikely() notation when
+ using IS_ERR()
+
+commit 30ac4e2f54ec067b7b9ca0db27e75681581378d6 upstream.
+
+The definition of IS_ERR() already applies the unlikely() notation
+when checking the error status of the passed pointer. For this
+reason there is no need to have the same notation outside of
+IS_ERR() itself.
+
+Clean up code by removing redundant notation.
+
+Signed-off-by: Antonio Quartulli <a@unstable.cc>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ drivers/net/wireguard/device.c | 2 +-
+ drivers/net/wireguard/socket.c | 4 ++--
+ 2 files changed, 3 insertions(+), 3 deletions(-)
+
+--- a/drivers/net/wireguard/device.c
++++ b/drivers/net/wireguard/device.c
+@@ -157,7 +157,7 @@ static netdev_tx_t wg_xmit(struct sk_buf
+       } else {
+               struct sk_buff *segs = skb_gso_segment(skb, 0);
+ 
+-              if (unlikely(IS_ERR(segs))) {
++              if (IS_ERR(segs)) {
+                       ret = PTR_ERR(segs);
+                       goto err_peer;
+               }
+--- a/drivers/net/wireguard/socket.c
++++ b/drivers/net/wireguard/socket.c
+@@ -71,7 +71,7 @@ static int send4(struct wg_device *wg, s
+                               ip_rt_put(rt);
+                       rt = ip_route_output_flow(sock_net(sock), &fl, sock);
+               }
+-              if (unlikely(IS_ERR(rt))) {
++              if (IS_ERR(rt)) {
+                       ret = PTR_ERR(rt);
+                       net_dbg_ratelimited("%s: No route to %pISpfsc, error %d\n",
+                                           wg->dev->name, &endpoint->addr, ret);
+@@ -138,7 +138,7 @@ static int send6(struct wg_device *wg, s
+               }
+               dst = ipv6_stub->ipv6_dst_lookup_flow(sock_net(sock), sock, &fl,
+                                                     NULL);
+-              if (unlikely(IS_ERR(dst))) {
++              if (IS_ERR(dst)) {
+                       ret = PTR_ERR(dst);
+                       net_dbg_ratelimited("%s: No route to %pISpfsc, error %d\n",
+                                           wg->dev->name, &endpoint->addr, ret);
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0119-wireguard-socket-remove-bogus-__be32-annotation.patch b/target/linux/generic/backport-5.4/080-wireguard-0119-wireguard-socket-remove-bogus-__be32-annotation.patch

new file mode 100644 (file)

index 0000000..c497ce5
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0119-wireguard-socket-remove-bogus-__be32-annotation.patch
@@ -0,0 +1,52 @@
+From 03928cbd7e0c7906c7ab2a490e31d89d6ae3965a Mon Sep 17 00:00:00 2001
+From: Jann Horn <jannh@google.com>
+Date: Mon, 22 Feb 2021 17:25:44 +0100
+Subject: [PATCH 119/124] wireguard: socket: remove bogus __be32 annotation
+
+commit 7f57bd8dc22de35ddd895294aa554003e4f19a72 upstream.
+
+The endpoint->src_if4 has nothing to do with fixed-endian numbers; remove
+the bogus annotation.
+
+This was introduced in
+https://git.zx2c4.com/wireguard-monolithic-historical/commit?id=14e7d0a499a676ec55176c0de2f9fcbd34074a82
+in the historical WireGuard repo because the old code used to
+zero-initialize multiple members as follows:
+
+    endpoint->src4.s_addr = endpoint->src_if4 = fl.saddr = 0;
+
+Because fl.saddr is fixed-endian and an assignment returns a value with the
+type of its left operand, this meant that sparse detected an assignment
+between values of different endianness.
+
+Since then, this assignment was already split up into separate statements;
+just the cast survived.
+
+Signed-off-by: Jann Horn <jannh@google.com>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ drivers/net/wireguard/socket.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/drivers/net/wireguard/socket.c
++++ b/drivers/net/wireguard/socket.c
+@@ -53,7 +53,7 @@ static int send4(struct wg_device *wg, s
+               if (unlikely(!inet_confirm_addr(sock_net(sock), NULL, 0,
+                                               fl.saddr, RT_SCOPE_HOST))) {
+                       endpoint->src4.s_addr = 0;
+-                      *(__force __be32 *)&endpoint->src_if4 = 0;
++                      endpoint->src_if4 = 0;
+                       fl.saddr = 0;
+                       if (cache)
+                               dst_cache_reset(cache);
+@@ -63,7 +63,7 @@ static int send4(struct wg_device *wg, s
+                            PTR_ERR(rt) == -EINVAL) || (!IS_ERR(rt) &&
+                            rt->dst.dev->ifindex != endpoint->src_if4)))) {
+                       endpoint->src4.s_addr = 0;
+-                      *(__force __be32 *)&endpoint->src_if4 = 0;
++                      endpoint->src_if4 = 0;
+                       fl.saddr = 0;
+                       if (cache)
+                               dst_cache_reset(cache);
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0120-wireguard-selftests-test-multiple-parallel-streams.patch b/target/linux/generic/backport-5.4/080-wireguard-0120-wireguard-selftests-test-multiple-parallel-streams.patch

new file mode 100644 (file)

index 0000000..269f30f
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0120-wireguard-selftests-test-multiple-parallel-streams.patch
@@ -0,0 +1,52 @@
+From 5c4e6ed057bcaa6ece0386344ba787d88c8307d2 Mon Sep 17 00:00:00 2001
+From: "Jason A. Donenfeld" <Jason@zx2c4.com>
+Date: Mon, 22 Feb 2021 17:25:45 +0100
+Subject: [PATCH 120/124] wireguard: selftests: test multiple parallel streams
+
+commit d5a49aa6c3e264a93a7d08485d66e346be0969dd upstream.
+
+In order to test ndo_start_xmit being called in parallel, explicitly add
+separate tests, which should all run on different cores. This should
+help tease out bugs associated with queueing up packets from different
+cores in parallel. Currently, it hasn't found those types of bugs, but
+given future planned work, this is a useful regression to avoid.
+
+Fixes: e7096c131e51 ("net: WireGuard secure network tunnel")
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ tools/testing/selftests/wireguard/netns.sh | 15 ++++++++++++++-
+ 1 file changed, 14 insertions(+), 1 deletion(-)
+
+--- a/tools/testing/selftests/wireguard/netns.sh
++++ b/tools/testing/selftests/wireguard/netns.sh
+@@ -39,7 +39,7 @@ ip0() { pretty 0 "ip $*"; ip -n $netns0
+ ip1() { pretty 1 "ip $*"; ip -n $netns1 "$@"; }
+ ip2() { pretty 2 "ip $*"; ip -n $netns2 "$@"; }
+ sleep() { read -t "$1" -N 1 || true; }
+-waitiperf() { pretty "${1//*-}" "wait for iperf:5201 pid $2"; while [[ $(ss -N "$1" -tlpH 'sport = 5201') != *\"iperf3\",pid=$2,fd=* ]]; do sleep 0.1; done; }
++waitiperf() { pretty "${1//*-}" "wait for iperf:${3:-5201} pid $2"; while [[ $(ss -N "$1" -tlpH "sport = ${3:-5201}") != *\"iperf3\",pid=$2,fd=* ]]; do sleep 0.1; done; }
+ waitncatudp() { pretty "${1//*-}" "wait for udp:1111 pid $2"; while [[ $(ss -N "$1" -ulpH 'sport = 1111') != *\"ncat\",pid=$2,fd=* ]]; do sleep 0.1; done; }
+ waitiface() { pretty "${1//*-}" "wait for $2 to come up"; ip netns exec "$1" bash -c "while [[ \$(< \"/sys/class/net/$2/operstate\") != up ]]; do read -t .1 -N 0 || true; done;"; }
+ 
+@@ -141,6 +141,19 @@ tests() {
+       n2 iperf3 -s -1 -B fd00::2 &
+       waitiperf $netns2 $!
+       n1 iperf3 -Z -t 3 -b 0 -u -c fd00::2
++
++      # TCP over IPv4, in parallel
++      for max in 4 5 50; do
++              local pids=( )
++              for ((i=0; i < max; ++i)) do
++                      n2 iperf3 -p $(( 5200 + i )) -s -1 -B 192.168.241.2 &
++                      pids+=( $! ); waitiperf $netns2 $! $(( 5200 + i ))
++              done
++              for ((i=0; i < max; ++i)) do
++                      n1 iperf3 -Z -t 3 -p $(( 5200 + i )) -c 192.168.241.2 &
++              done
++              wait "${pids[@]}"
++      done
+ }
+ 
+ [[ $(ip1 link show dev wg0) =~ mtu\ ([0-9]+) ]] && orig_mtu="${BASH_REMATCH[1]}"
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0121-wireguard-peer-put-frequently-used-members-above-cac.patch b/target/linux/generic/backport-5.4/080-wireguard-0121-wireguard-peer-put-frequently-used-members-above-cac.patch

new file mode 100644 (file)

index 0000000..bd4fd77
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0121-wireguard-peer-put-frequently-used-members-above-cac.patch
@@ -0,0 +1,42 @@
+From a13827e9091c07e25cdeec9a402d74a27e2a1111 Mon Sep 17 00:00:00 2001
+From: "Jason A. Donenfeld" <Jason@zx2c4.com>
+Date: Mon, 22 Feb 2021 17:25:46 +0100
+Subject: [PATCH 121/124] wireguard: peer: put frequently used members above
+ cache lines
+
+commit 5a0598695634a6bb4126818902dd9140cd9df8b6 upstream.
+
+The is_dead boolean is checked for every single packet, while the
+internal_id member is used basically only for pr_debug messages. So it
+makes sense to hoist up is_dead into some space formerly unused by a
+struct hole, while demoting internal_id to below the lowest struct
+cache line.
+
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ drivers/net/wireguard/peer.h | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/drivers/net/wireguard/peer.h
++++ b/drivers/net/wireguard/peer.h
+@@ -39,6 +39,7 @@ struct wg_peer {
+       struct crypt_queue tx_queue, rx_queue;
+       struct sk_buff_head staged_packet_queue;
+       int serial_work_cpu;
++      bool is_dead;
+       struct noise_keypairs keypairs;
+       struct endpoint endpoint;
+       struct dst_cache endpoint_cache;
+@@ -61,9 +62,8 @@ struct wg_peer {
+       struct rcu_head rcu;
+       struct list_head peer_list;
+       struct list_head allowedips_list;
+-      u64 internal_id;
+       struct napi_struct napi;
+-      bool is_dead;
++      u64 internal_id;
+ };
+ 
+ struct wg_peer *wg_peer_create(struct wg_device *wg,
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0122-wireguard-device-do-not-generate-ICMP-for-non-IP-pac.patch b/target/linux/generic/backport-5.4/080-wireguard-0122-wireguard-device-do-not-generate-ICMP-for-non-IP-pac.patch

new file mode 100644 (file)

index 0000000..07a3662
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0122-wireguard-device-do-not-generate-ICMP-for-non-IP-pac.patch
@@ -0,0 +1,48 @@
+From 49da2a610d63cef849f0095e601821ad6edfbef7 Mon Sep 17 00:00:00 2001
+From: "Jason A. Donenfeld" <Jason@zx2c4.com>
+Date: Mon, 22 Feb 2021 17:25:47 +0100
+Subject: [PATCH 122/124] wireguard: device: do not generate ICMP for non-IP
+ packets
+
+commit 99fff5264e7ab06f45b0ad60243475be0a8d0559 upstream.
+
+If skb->protocol doesn't match the actual skb->data header, it's
+probably not a good idea to pass it off to icmp{,v6}_ndo_send, which is
+expecting to reply to a valid IP packet. So this commit has that early
+mismatch case jump to a later error label.
+
+Fixes: e7096c131e51 ("net: WireGuard secure network tunnel")
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ drivers/net/wireguard/device.c | 7 ++++---
+ 1 file changed, 4 insertions(+), 3 deletions(-)
+
+--- a/drivers/net/wireguard/device.c
++++ b/drivers/net/wireguard/device.c
+@@ -138,7 +138,7 @@ static netdev_tx_t wg_xmit(struct sk_buf
+               else if (skb->protocol == htons(ETH_P_IPV6))
+                       net_dbg_ratelimited("%s: No peer has allowed IPs matching %pI6\n",
+                                           dev->name, &ipv6_hdr(skb)->daddr);
+-              goto err;
++              goto err_icmp;
+       }
+ 
+       family = READ_ONCE(peer->endpoint.addr.sa_family);
+@@ -201,12 +201,13 @@ static netdev_tx_t wg_xmit(struct sk_buf
+ 
+ err_peer:
+       wg_peer_put(peer);
+-err:
+-      ++dev->stats.tx_errors;
++err_icmp:
+       if (skb->protocol == htons(ETH_P_IP))
+               icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
+       else if (skb->protocol == htons(ETH_P_IPV6))
+               icmpv6_ndo_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
++err:
++      ++dev->stats.tx_errors;
+       kfree_skb(skb);
+       return ret;
+ }
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0123-wireguard-queueing-get-rid-of-per-peer-ring-buffers.patch b/target/linux/generic/backport-5.4/080-wireguard-0123-wireguard-queueing-get-rid-of-per-peer-ring-buffers.patch

new file mode 100644 (file)

index 0000000..147c133
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0123-wireguard-queueing-get-rid-of-per-peer-ring-buffers.patch
@@ -0,0 +1,560 @@
+From 1771bbcc5bc99f569dd82ec9e1b7c397a2fb50ac Mon Sep 17 00:00:00 2001
+From: "Jason A. Donenfeld" <Jason@zx2c4.com>
+Date: Mon, 22 Feb 2021 17:25:48 +0100
+Subject: [PATCH 123/124] wireguard: queueing: get rid of per-peer ring buffers
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+commit 8b5553ace83cced775eefd0f3f18b5c6214ccf7a upstream.
+
+Having two ring buffers per-peer means that every peer results in two
+massive ring allocations. On an 8-core x86_64 machine, this commit
+reduces the per-peer allocation from 18,688 bytes to 1,856 bytes, which
+is a 90% reduction. Ninety percent! With some single-machine
+deployments approaching 500,000 peers, we're talking about a reduction
+from 7 gigs of memory down to 700 megs of memory.
+
+In order to get rid of these per-peer allocations, this commit switches
+to using a list-based queueing approach. Currently GSO fragments are
+chained together using the skb->next pointer (the skb_list_* singly
+linked list approach), so we form the per-peer queue around the unused
+skb->prev pointer (which sort of makes sense because the links are
+pointing backwards). Use of skb_queue_* is not possible here, because
+that is based on doubly linked lists and spinlocks. Multiple cores can
+write into the queue at any given time, because its writes occur in the
+start_xmit path or in the udp_recv path. But reads happen in a single
+workqueue item per-peer, amounting to a multi-producer, single-consumer
+paradigm.
+
+The MPSC queue is implemented locklessly and never blocks. However, it
+is not linearizable (though it is serializable), with a very tight and
+unlikely race on writes, which, when hit (some tiny fraction of the
+0.15% of partial adds on a fully loaded 16-core x86_64 system), causes
+the queue reader to terminate early. However, because every packet sent
+queues up the same workqueue item after it is fully added, the worker
+resumes again, and stopping early isn't actually a problem, since at
+that point the packet wouldn't have yet been added to the encryption
+queue. These properties allow us to avoid disabling interrupts or
+spinning. The design is based on Dmitry Vyukov's algorithm [1].
+
+Performance-wise, ordinarily list-based queues aren't preferable to
+ringbuffers, because of cache misses when following pointers around.
+However, we *already* have to follow the adjacent pointers when working
+through fragments, so there shouldn't actually be any change there. A
+potential downside is that dequeueing is a bit more complicated, but the
+ptr_ring structure used prior had a spinlock when dequeueing, so all in
+all, the difference appears to be a wash.
+
+Actually, from profiling, the biggest performance hit, by far, of this
+commit winds up being atomic_add_unless(count, 1, max) and
+atomic_dec(count), which account for the majority of CPU time, according to
+perf. In that sense, the previous ring buffer was superior in that it
+could check if it was full by head==tail, which the list-based approach
+cannot do.
+
+But all in all, this enables us to get massive memory savings, allowing
+WireGuard to scale for real world deployments, without taking much of a
+performance hit.
+
+[1] http://www.1024cores.net/home/lock-free-algorithms/queues/intrusive-mpsc-node-based-queue
+
+Reviewed-by: Dmitry Vyukov <dvyukov@google.com>
+Reviewed-by: Toke Høiland-Jørgensen <toke@redhat.com>
+Fixes: e7096c131e51 ("net: WireGuard secure network tunnel")
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ drivers/net/wireguard/device.c   | 12 ++---
+ drivers/net/wireguard/device.h   | 15 +++---
+ drivers/net/wireguard/peer.c     | 28 ++++-------
+ drivers/net/wireguard/peer.h     |  4 +-
+ drivers/net/wireguard/queueing.c | 86 +++++++++++++++++++++++++-------
+ drivers/net/wireguard/queueing.h | 45 ++++++++++++-----
+ drivers/net/wireguard/receive.c  | 16 +++---
+ drivers/net/wireguard/send.c     | 31 ++++--------
+ 8 files changed, 144 insertions(+), 93 deletions(-)
+
+--- a/drivers/net/wireguard/device.c
++++ b/drivers/net/wireguard/device.c
+@@ -235,8 +235,8 @@ static void wg_destruct(struct net_devic
+       destroy_workqueue(wg->handshake_receive_wq);
+       destroy_workqueue(wg->handshake_send_wq);
+       destroy_workqueue(wg->packet_crypt_wq);
+-      wg_packet_queue_free(&wg->decrypt_queue, true);
+-      wg_packet_queue_free(&wg->encrypt_queue, true);
++      wg_packet_queue_free(&wg->decrypt_queue);
++      wg_packet_queue_free(&wg->encrypt_queue);
+       rcu_barrier(); /* Wait for all the peers to be actually freed. */
+       wg_ratelimiter_uninit();
+       memzero_explicit(&wg->static_identity, sizeof(wg->static_identity));
+@@ -338,12 +338,12 @@ static int wg_newlink(struct net *src_ne
+               goto err_destroy_handshake_send;
+ 
+       ret = wg_packet_queue_init(&wg->encrypt_queue, wg_packet_encrypt_worker,
+-                                 true, MAX_QUEUED_PACKETS);
++                                 MAX_QUEUED_PACKETS);
+       if (ret < 0)
+               goto err_destroy_packet_crypt;
+ 
+       ret = wg_packet_queue_init(&wg->decrypt_queue, wg_packet_decrypt_worker,
+-                                 true, MAX_QUEUED_PACKETS);
++                                 MAX_QUEUED_PACKETS);
+       if (ret < 0)
+               goto err_free_encrypt_queue;
+ 
+@@ -368,9 +368,9 @@ static int wg_newlink(struct net *src_ne
+ err_uninit_ratelimiter:
+       wg_ratelimiter_uninit();
+ err_free_decrypt_queue:
+-      wg_packet_queue_free(&wg->decrypt_queue, true);
++      wg_packet_queue_free(&wg->decrypt_queue);
+ err_free_encrypt_queue:
+-      wg_packet_queue_free(&wg->encrypt_queue, true);
++      wg_packet_queue_free(&wg->encrypt_queue);
+ err_destroy_packet_crypt:
+       destroy_workqueue(wg->packet_crypt_wq);
+ err_destroy_handshake_send:
+--- a/drivers/net/wireguard/device.h
++++ b/drivers/net/wireguard/device.h
+@@ -27,13 +27,14 @@ struct multicore_worker {
+ 
+ struct crypt_queue {
+       struct ptr_ring ring;
+-      union {
+-              struct {
+-                      struct multicore_worker __percpu *worker;
+-                      int last_cpu;
+-              };
+-              struct work_struct work;
+-      };
++      struct multicore_worker __percpu *worker;
++      int last_cpu;
++};
++
++struct prev_queue {
++      struct sk_buff *head, *tail, *peeked;
++      struct { struct sk_buff *next, *prev; } empty; // Match first 2 members of struct sk_buff.
++      atomic_t count;
+ };
+ 
+ struct wg_device {
+--- a/drivers/net/wireguard/peer.c
++++ b/drivers/net/wireguard/peer.c
+@@ -32,27 +32,22 @@ struct wg_peer *wg_peer_create(struct wg
+       peer = kzalloc(sizeof(*peer), GFP_KERNEL);
+       if (unlikely(!peer))
+               return ERR_PTR(ret);
+-      peer->device = wg;
++      if (dst_cache_init(&peer->endpoint_cache, GFP_KERNEL))
++              goto err;
+ 
++      peer->device = wg;
+       wg_noise_handshake_init(&peer->handshake, &wg->static_identity,
+                               public_key, preshared_key, peer);
+-      if (dst_cache_init(&peer->endpoint_cache, GFP_KERNEL))
+-              goto err_1;
+-      if (wg_packet_queue_init(&peer->tx_queue, wg_packet_tx_worker, false,
+-                               MAX_QUEUED_PACKETS))
+-              goto err_2;
+-      if (wg_packet_queue_init(&peer->rx_queue, NULL, false,
+-                               MAX_QUEUED_PACKETS))
+-              goto err_3;
+-
+       peer->internal_id = atomic64_inc_return(&peer_counter);
+       peer->serial_work_cpu = nr_cpumask_bits;
+       wg_cookie_init(&peer->latest_cookie);
+       wg_timers_init(peer);
+       wg_cookie_checker_precompute_peer_keys(peer);
+       spin_lock_init(&peer->keypairs.keypair_update_lock);
+-      INIT_WORK(&peer->transmit_handshake_work,
+-                wg_packet_handshake_send_worker);
++      INIT_WORK(&peer->transmit_handshake_work, wg_packet_handshake_send_worker);
++      INIT_WORK(&peer->transmit_packet_work, wg_packet_tx_worker);
++      wg_prev_queue_init(&peer->tx_queue);
++      wg_prev_queue_init(&peer->rx_queue);
+       rwlock_init(&peer->endpoint_lock);
+       kref_init(&peer->refcount);
+       skb_queue_head_init(&peer->staged_packet_queue);
+@@ -68,11 +63,7 @@ struct wg_peer *wg_peer_create(struct wg
+       pr_debug("%s: Peer %llu created\n", wg->dev->name, peer->internal_id);
+       return peer;
+ 
+-err_3:
+-      wg_packet_queue_free(&peer->tx_queue, false);
+-err_2:
+-      dst_cache_destroy(&peer->endpoint_cache);
+-err_1:
++err:
+       kfree(peer);
+       return ERR_PTR(ret);
+ }
+@@ -197,8 +188,7 @@ static void rcu_release(struct rcu_head
+       struct wg_peer *peer = container_of(rcu, struct wg_peer, rcu);
+ 
+       dst_cache_destroy(&peer->endpoint_cache);
+-      wg_packet_queue_free(&peer->rx_queue, false);
+-      wg_packet_queue_free(&peer->tx_queue, false);
++      WARN_ON(wg_prev_queue_peek(&peer->tx_queue) || wg_prev_queue_peek(&peer->rx_queue));
+ 
+       /* The final zeroing takes care of clearing any remaining handshake key
+        * material and other potentially sensitive information.
+--- a/drivers/net/wireguard/peer.h
++++ b/drivers/net/wireguard/peer.h
+@@ -36,7 +36,7 @@ struct endpoint {
+ 
+ struct wg_peer {
+       struct wg_device *device;
+-      struct crypt_queue tx_queue, rx_queue;
++      struct prev_queue tx_queue, rx_queue;
+       struct sk_buff_head staged_packet_queue;
+       int serial_work_cpu;
+       bool is_dead;
+@@ -46,7 +46,7 @@ struct wg_peer {
+       rwlock_t endpoint_lock;
+       struct noise_handshake handshake;
+       atomic64_t last_sent_handshake;
+-      struct work_struct transmit_handshake_work, clear_peer_work;
++      struct work_struct transmit_handshake_work, clear_peer_work, transmit_packet_work;
+       struct cookie latest_cookie;
+       struct hlist_node pubkey_hash;
+       u64 rx_bytes, tx_bytes;
+--- a/drivers/net/wireguard/queueing.c
++++ b/drivers/net/wireguard/queueing.c
+@@ -9,8 +9,7 @@ struct multicore_worker __percpu *
+ wg_packet_percpu_multicore_worker_alloc(work_func_t function, void *ptr)
+ {
+       int cpu;
+-      struct multicore_worker __percpu *worker =
+-              alloc_percpu(struct multicore_worker);
++      struct multicore_worker __percpu *worker = alloc_percpu(struct multicore_worker);
+ 
+       if (!worker)
+               return NULL;
+@@ -23,7 +22,7 @@ wg_packet_percpu_multicore_worker_alloc(
+ }
+ 
+ int wg_packet_queue_init(struct crypt_queue *queue, work_func_t function,
+-                       bool multicore, unsigned int len)
++                       unsigned int len)
+ {
+       int ret;
+ 
+@@ -31,25 +30,78 @@ int wg_packet_queue_init(struct crypt_qu
+       ret = ptr_ring_init(&queue->ring, len, GFP_KERNEL);
+       if (ret)
+               return ret;
+-      if (function) {
+-              if (multicore) {
+-                      queue->worker = wg_packet_percpu_multicore_worker_alloc(
+-                              function, queue);
+-                      if (!queue->worker) {
+-                              ptr_ring_cleanup(&queue->ring, NULL);
+-                              return -ENOMEM;
+-                      }
+-              } else {
+-                      INIT_WORK(&queue->work, function);
+-              }
++      queue->worker = wg_packet_percpu_multicore_worker_alloc(function, queue);
++      if (!queue->worker) {
++              ptr_ring_cleanup(&queue->ring, NULL);
++              return -ENOMEM;
+       }
+       return 0;
+ }
+ 
+-void wg_packet_queue_free(struct crypt_queue *queue, bool multicore)
++void wg_packet_queue_free(struct crypt_queue *queue)
+ {
+-      if (multicore)
+-              free_percpu(queue->worker);
++      free_percpu(queue->worker);
+       WARN_ON(!__ptr_ring_empty(&queue->ring));
+       ptr_ring_cleanup(&queue->ring, NULL);
+ }
++
++#define NEXT(skb) ((skb)->prev)
++#define STUB(queue) ((struct sk_buff *)&queue->empty)
++
++void wg_prev_queue_init(struct prev_queue *queue)
++{
++      NEXT(STUB(queue)) = NULL;
++      queue->head = queue->tail = STUB(queue);
++      queue->peeked = NULL;
++      atomic_set(&queue->count, 0);
++      BUILD_BUG_ON(
++              offsetof(struct sk_buff, next) != offsetof(struct prev_queue, empty.next) -
++                                                      offsetof(struct prev_queue, empty) ||
++              offsetof(struct sk_buff, prev) != offsetof(struct prev_queue, empty.prev) -
++                                                       offsetof(struct prev_queue, empty));
++}
++
++static void __wg_prev_queue_enqueue(struct prev_queue *queue, struct sk_buff *skb)
++{
++      WRITE_ONCE(NEXT(skb), NULL);
++      WRITE_ONCE(NEXT(xchg_release(&queue->head, skb)), skb);
++}
++
++bool wg_prev_queue_enqueue(struct prev_queue *queue, struct sk_buff *skb)
++{
++      if (!atomic_add_unless(&queue->count, 1, MAX_QUEUED_PACKETS))
++              return false;
++      __wg_prev_queue_enqueue(queue, skb);
++      return true;
++}
++
++struct sk_buff *wg_prev_queue_dequeue(struct prev_queue *queue)
++{
++      struct sk_buff *tail = queue->tail, *next = smp_load_acquire(&NEXT(tail));
++
++      if (tail == STUB(queue)) {
++              if (!next)
++                      return NULL;
++              queue->tail = next;
++              tail = next;
++              next = smp_load_acquire(&NEXT(next));
++      }
++      if (next) {
++              queue->tail = next;
++              atomic_dec(&queue->count);
++              return tail;
++      }
++      if (tail != READ_ONCE(queue->head))
++              return NULL;
++      __wg_prev_queue_enqueue(queue, STUB(queue));
++      next = smp_load_acquire(&NEXT(tail));
++      if (next) {
++              queue->tail = next;
++              atomic_dec(&queue->count);
++              return tail;
++      }
++      return NULL;
++}
++
++#undef NEXT
++#undef STUB
+--- a/drivers/net/wireguard/queueing.h
++++ b/drivers/net/wireguard/queueing.h
+@@ -17,12 +17,13 @@ struct wg_device;
+ struct wg_peer;
+ struct multicore_worker;
+ struct crypt_queue;
++struct prev_queue;
+ struct sk_buff;
+ 
+ /* queueing.c APIs: */
+ int wg_packet_queue_init(struct crypt_queue *queue, work_func_t function,
+-                       bool multicore, unsigned int len);
+-void wg_packet_queue_free(struct crypt_queue *queue, bool multicore);
++                       unsigned int len);
++void wg_packet_queue_free(struct crypt_queue *queue);
+ struct multicore_worker __percpu *
+ wg_packet_percpu_multicore_worker_alloc(work_func_t function, void *ptr);
+ 
+@@ -135,8 +136,31 @@ static inline int wg_cpumask_next_online
+       return cpu;
+ }
+ 
++void wg_prev_queue_init(struct prev_queue *queue);
++
++/* Multi producer */
++bool wg_prev_queue_enqueue(struct prev_queue *queue, struct sk_buff *skb);
++
++/* Single consumer */
++struct sk_buff *wg_prev_queue_dequeue(struct prev_queue *queue);
++
++/* Single consumer */
++static inline struct sk_buff *wg_prev_queue_peek(struct prev_queue *queue)
++{
++      if (queue->peeked)
++              return queue->peeked;
++      queue->peeked = wg_prev_queue_dequeue(queue);
++      return queue->peeked;
++}
++
++/* Single consumer */
++static inline void wg_prev_queue_drop_peeked(struct prev_queue *queue)
++{
++      queue->peeked = NULL;
++}
++
+ static inline int wg_queue_enqueue_per_device_and_peer(
+-      struct crypt_queue *device_queue, struct crypt_queue *peer_queue,
++      struct crypt_queue *device_queue, struct prev_queue *peer_queue,
+       struct sk_buff *skb, struct workqueue_struct *wq, int *next_cpu)
+ {
+       int cpu;
+@@ -145,8 +169,9 @@ static inline int wg_queue_enqueue_per_d
+       /* We first queue this up for the peer ingestion, but the consumer
+        * will wait for the state to change to CRYPTED or DEAD before.
+        */
+-      if (unlikely(ptr_ring_produce_bh(&peer_queue->ring, skb)))
++      if (unlikely(!wg_prev_queue_enqueue(peer_queue, skb)))
+               return -ENOSPC;
++
+       /* Then we queue it up in the device queue, which consumes the
+        * packet as soon as it can.
+        */
+@@ -157,9 +182,7 @@ static inline int wg_queue_enqueue_per_d
+       return 0;
+ }
+ 
+-static inline void wg_queue_enqueue_per_peer(struct crypt_queue *queue,
+-                                           struct sk_buff *skb,
+-                                           enum packet_state state)
++static inline void wg_queue_enqueue_per_peer_tx(struct sk_buff *skb, enum packet_state state)
+ {
+       /* We take a reference, because as soon as we call atomic_set, the
+        * peer can be freed from below us.
+@@ -167,14 +190,12 @@ static inline void wg_queue_enqueue_per_
+       struct wg_peer *peer = wg_peer_get(PACKET_PEER(skb));
+ 
+       atomic_set_release(&PACKET_CB(skb)->state, state);
+-      queue_work_on(wg_cpumask_choose_online(&peer->serial_work_cpu,
+-                                             peer->internal_id),
+-                    peer->device->packet_crypt_wq, &queue->work);
++      queue_work_on(wg_cpumask_choose_online(&peer->serial_work_cpu, peer->internal_id),
++                    peer->device->packet_crypt_wq, &peer->transmit_packet_work);
+       wg_peer_put(peer);
+ }
+ 
+-static inline void wg_queue_enqueue_per_peer_napi(struct sk_buff *skb,
+-                                                enum packet_state state)
++static inline void wg_queue_enqueue_per_peer_rx(struct sk_buff *skb, enum packet_state state)
+ {
+       /* We take a reference, because as soon as we call atomic_set, the
+        * peer can be freed from below us.
+--- a/drivers/net/wireguard/receive.c
++++ b/drivers/net/wireguard/receive.c
+@@ -444,7 +444,6 @@ packet_processed:
+ int wg_packet_rx_poll(struct napi_struct *napi, int budget)
+ {
+       struct wg_peer *peer = container_of(napi, struct wg_peer, napi);
+-      struct crypt_queue *queue = &peer->rx_queue;
+       struct noise_keypair *keypair;
+       struct endpoint endpoint;
+       enum packet_state state;
+@@ -455,11 +454,10 @@ int wg_packet_rx_poll(struct napi_struct
+       if (unlikely(budget <= 0))
+               return 0;
+ 
+-      while ((skb = __ptr_ring_peek(&queue->ring)) != NULL &&
++      while ((skb = wg_prev_queue_peek(&peer->rx_queue)) != NULL &&
+              (state = atomic_read_acquire(&PACKET_CB(skb)->state)) !=
+                      PACKET_STATE_UNCRYPTED) {
+-              __ptr_ring_discard_one(&queue->ring);
+-              peer = PACKET_PEER(skb);
++              wg_prev_queue_drop_peeked(&peer->rx_queue);
+               keypair = PACKET_CB(skb)->keypair;
+               free = true;
+ 
+@@ -508,7 +506,7 @@ void wg_packet_decrypt_worker(struct wor
+               enum packet_state state =
+                       likely(decrypt_packet(skb, PACKET_CB(skb)->keypair)) ?
+                               PACKET_STATE_CRYPTED : PACKET_STATE_DEAD;
+-              wg_queue_enqueue_per_peer_napi(skb, state);
++              wg_queue_enqueue_per_peer_rx(skb, state);
+               if (need_resched())
+                       cond_resched();
+       }
+@@ -531,12 +529,10 @@ static void wg_packet_consume_data(struc
+       if (unlikely(READ_ONCE(peer->is_dead)))
+               goto err;
+ 
+-      ret = wg_queue_enqueue_per_device_and_peer(&wg->decrypt_queue,
+-                                                 &peer->rx_queue, skb,
+-                                                 wg->packet_crypt_wq,
+-                                                 &wg->decrypt_queue.last_cpu);
++      ret = wg_queue_enqueue_per_device_and_peer(&wg->decrypt_queue, &peer->rx_queue, skb,
++                                                 wg->packet_crypt_wq, &wg->decrypt_queue.last_cpu);
+       if (unlikely(ret == -EPIPE))
+-              wg_queue_enqueue_per_peer_napi(skb, PACKET_STATE_DEAD);
++              wg_queue_enqueue_per_peer_rx(skb, PACKET_STATE_DEAD);
+       if (likely(!ret || ret == -EPIPE)) {
+               rcu_read_unlock_bh();
+               return;
+--- a/drivers/net/wireguard/send.c
++++ b/drivers/net/wireguard/send.c
+@@ -239,8 +239,7 @@ void wg_packet_send_keepalive(struct wg_
+       wg_packet_send_staged_packets(peer);
+ }
+ 
+-static void wg_packet_create_data_done(struct sk_buff *first,
+-                                     struct wg_peer *peer)
++static void wg_packet_create_data_done(struct wg_peer *peer, struct sk_buff *first)
+ {
+       struct sk_buff *skb, *next;
+       bool is_keepalive, data_sent = false;
+@@ -262,22 +261,19 @@ static void wg_packet_create_data_done(s
+ 
+ void wg_packet_tx_worker(struct work_struct *work)
+ {
+-      struct crypt_queue *queue = container_of(work, struct crypt_queue,
+-                                               work);
++      struct wg_peer *peer = container_of(work, struct wg_peer, transmit_packet_work);
+       struct noise_keypair *keypair;
+       enum packet_state state;
+       struct sk_buff *first;
+-      struct wg_peer *peer;
+ 
+-      while ((first = __ptr_ring_peek(&queue->ring)) != NULL &&
++      while ((first = wg_prev_queue_peek(&peer->tx_queue)) != NULL &&
+              (state = atomic_read_acquire(&PACKET_CB(first)->state)) !=
+                      PACKET_STATE_UNCRYPTED) {
+-              __ptr_ring_discard_one(&queue->ring);
+-              peer = PACKET_PEER(first);
++              wg_prev_queue_drop_peeked(&peer->tx_queue);
+               keypair = PACKET_CB(first)->keypair;
+ 
+               if (likely(state == PACKET_STATE_CRYPTED))
+-                      wg_packet_create_data_done(first, peer);
++                      wg_packet_create_data_done(peer, first);
+               else
+                       kfree_skb_list(first);
+ 
+@@ -306,16 +302,14 @@ void wg_packet_encrypt_worker(struct wor
+                               break;
+                       }
+               }
+-              wg_queue_enqueue_per_peer(&PACKET_PEER(first)->tx_queue, first,
+-                                        state);
++              wg_queue_enqueue_per_peer_tx(first, state);
+               if (need_resched())
+                       cond_resched();
+       }
+ }
+ 
+-static void wg_packet_create_data(struct sk_buff *first)
++static void wg_packet_create_data(struct wg_peer *peer, struct sk_buff *first)
+ {
+-      struct wg_peer *peer = PACKET_PEER(first);
+       struct wg_device *wg = peer->device;
+       int ret = -EINVAL;
+ 
+@@ -323,13 +317,10 @@ static void wg_packet_create_data(struct
+       if (unlikely(READ_ONCE(peer->is_dead)))
+               goto err;
+ 
+-      ret = wg_queue_enqueue_per_device_and_peer(&wg->encrypt_queue,
+-                                                 &peer->tx_queue, first,
+-                                                 wg->packet_crypt_wq,
+-                                                 &wg->encrypt_queue.last_cpu);
++      ret = wg_queue_enqueue_per_device_and_peer(&wg->encrypt_queue, &peer->tx_queue, first,
++                                                 wg->packet_crypt_wq, &wg->encrypt_queue.last_cpu);
+       if (unlikely(ret == -EPIPE))
+-              wg_queue_enqueue_per_peer(&peer->tx_queue, first,
+-                                        PACKET_STATE_DEAD);
++              wg_queue_enqueue_per_peer_tx(first, PACKET_STATE_DEAD);
+ err:
+       rcu_read_unlock_bh();
+       if (likely(!ret || ret == -EPIPE))
+@@ -393,7 +384,7 @@ void wg_packet_send_staged_packets(struc
+       packets.prev->next = NULL;
+       wg_peer_get(keypair->entry.peer);
+       PACKET_CB(packets.next)->keypair = keypair;
+-      wg_packet_create_data(packets.next);
++      wg_packet_create_data(peer, packets.next);
+       return;
+ 
+ out_invalid:
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0124-wireguard-kconfig-use-arm-chacha-even-with-no-neon.patch b/target/linux/generic/backport-5.4/080-wireguard-0124-wireguard-kconfig-use-arm-chacha-even-with-no-neon.patch

new file mode 100644 (file)

index 0000000..3c62dc6
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0124-wireguard-kconfig-use-arm-chacha-even-with-no-neon.patch
@@ -0,0 +1,30 @@
+From 514091206bc055a159348ae8575276dc925aea24 Mon Sep 17 00:00:00 2001
+From: "Jason A. Donenfeld" <Jason@zx2c4.com>
+Date: Mon, 22 Feb 2021 17:25:49 +0100
+Subject: [PATCH 124/124] wireguard: kconfig: use arm chacha even with no neon
+
+commit bce2473927af8de12ad131a743f55d69d358c0b9 upstream.
+
+The condition here was incorrect: a non-neon fallback implementation is
+available on arm32 when NEON is not supported.
+
+Reported-by: Ilya Lipnitskiy <ilya.lipnitskiy@gmail.com>
+Fixes: e7096c131e51 ("net: WireGuard secure network tunnel")
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ drivers/net/Kconfig | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/drivers/net/Kconfig
++++ b/drivers/net/Kconfig
+@@ -87,7 +87,7 @@ config WIREGUARD
+       select CRYPTO_CURVE25519_X86 if X86 && 64BIT
+       select ARM_CRYPTO if ARM
+       select ARM64_CRYPTO if ARM64
+-      select CRYPTO_CHACHA20_NEON if (ARM || ARM64) && KERNEL_MODE_NEON
++      select CRYPTO_CHACHA20_NEON if ARM || (ARM64 && KERNEL_MODE_NEON)
+       select CRYPTO_POLY1305_NEON if ARM64 && KERNEL_MODE_NEON
+       select CRYPTO_POLY1305_ARM if ARM
+       select CRYPTO_CURVE25519_NEON if ARM && KERNEL_MODE_NEON
author	Jason A. Donenfeld <Jason@zx2c4.com>
	Fri, 19 Feb 2021 13:29:04 +0000 (14:29 +0100)
committer	David Bauer <mail@david-bauer.net>
	Fri, 26 Feb 2021 19:41:01 +0000 (20:41 +0100)
target/linux/generic/backport-5.4/080-wireguard-0001-crypto-lib-tidy-up-lib-crypto-Kconfig-and-Makefile.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0002-crypto-chacha-move-existing-library-code-into-lib-cr.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0003-crypto-x86-chacha-depend-on-generic-chacha-library-i.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0004-crypto-x86-chacha-expose-SIMD-ChaCha-routine-as-libr.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0005-crypto-arm64-chacha-depend-on-generic-chacha-library.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0006-crypto-arm64-chacha-expose-arm64-ChaCha-routine-as-l.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0007-crypto-arm-chacha-import-Eric-Biggers-s-scalar-accel.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0008-crypto-arm-chacha-remove-dependency-on-generic-ChaCh.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0009-crypto-arm-chacha-expose-ARM-ChaCha-routine-as-libra.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0010-crypto-mips-chacha-import-32r2-ChaCha-code-from-Zinc.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0011-crypto-mips-chacha-wire-up-accelerated-32r2-code-fro.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0012-crypto-chacha-unexport-chacha_generic-routines.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0013-crypto-poly1305-move-core-routines-into-a-separate-l.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0014-crypto-x86-poly1305-unify-Poly1305-state-struct-with.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0015-crypto-poly1305-expose-init-update-final-library-int.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0016-crypto-x86-poly1305-depend-on-generic-library-not-ge.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0017-crypto-x86-poly1305-expose-existing-driver-as-poly13.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0018-crypto-arm64-poly1305-incorporate-OpenSSL-CRYPTOGAMS.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0019-crypto-arm-poly1305-incorporate-OpenSSL-CRYPTOGAMS-N.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0020-crypto-mips-poly1305-incorporate-OpenSSL-CRYPTOGAMS-.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0021-crypto-blake2s-generic-C-library-implementation-and-.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0022-crypto-testmgr-add-test-cases-for-Blake2s.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0023-crypto-blake2s-implement-generic-shash-driver.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0024-crypto-blake2s-x86_64-SIMD-implementation.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0025-crypto-curve25519-generic-C-library-implementations.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0026-crypto-curve25519-add-kpp-selftest.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0027-crypto-curve25519-implement-generic-KPP-driver.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0028-crypto-lib-curve25519-work-around-Clang-stack-spilli.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0029-crypto-curve25519-x86_64-library-and-KPP-implementat.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0030-crypto-arm-curve25519-import-Bernstein-and-Schwabe-s.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0031-crypto-arm-curve25519-wire-up-NEON-implementation.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0032-crypto-chacha20poly1305-import-construction-and-self.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0033-crypto-lib-chacha20poly1305-reimplement-crypt_from_s.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0034-crypto-chacha_generic-remove-unnecessary-setkey-func.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0035-crypto-x86-chacha-only-unregister-algorithms-if-regi.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0036-crypto-lib-chacha20poly1305-use-chacha20_crypt.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0037-crypto-arch-conditionalize-crypto-api-in-arch-glue-f.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0038-crypto-chacha-fix-warning-message-in-header-file.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0039-crypto-arm-curve25519-add-arch-specific-key-generati.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0040-crypto-lib-curve25519-re-add-selftests.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0041-crypto-poly1305-add-new-32-and-64-bit-generic-versio.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0042-crypto-x86-poly1305-import-unmodified-cryptogams-imp.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0043-crypto-x86-poly1305-wire-up-faster-implementations-f.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0044-crypto-arm-arm64-mips-poly1305-remove-redundant-non-.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0045-crypto-curve25519-Fix-selftest-build-error.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0046-crypto-x86-poly1305-fix-.gitignore-typo.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0047-crypto-chacha20poly1305-add-back-missing-test-vector.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0048-crypto-x86-poly1305-emit-does-base-conversion-itself.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0049-crypto-arm-chacha-fix-build-failured-when-kernel-mod.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0050-crypto-Kconfig-allow-tests-to-be-disabled-when-manag.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0051-crypto-chacha20poly1305-prevent-integer-overflow-on-.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0052-crypto-x86-curve25519-support-assemblers-with-no-adx.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0053-crypto-arm64-chacha-correctly-walk-through-blocks.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0054-crypto-x86-curve25519-replace-with-formally-verified.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0055-crypto-x86-curve25519-leave-r12-as-spare-register.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0056-crypto-arm-64-poly1305-add-artifact-to-.gitignore-fi.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0057-crypto-arch-lib-limit-simd-usage-to-4k-chunks.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0058-crypto-lib-chacha20poly1305-Add-missing-function-dec.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0059-crypto-x86-chacha-sse3-use-unaligned-loads-for-state.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0060-crypto-x86-curve25519-Remove-unused-carry-variables.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0061-crypto-arm-curve25519-include-linux-scatterlist.h.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0062-crypto-arm-poly1305-Add-prototype-for-poly1305_block.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0063-crypto-curve25519-x86_64-Use-XORL-r32-32.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0064-crypto-poly1305-x86_64-Use-XORL-r32-32.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0065-crypto-x86-poly1305-Remove-assignments-with-no-effec.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0066-crypto-x86-poly1305-add-back-a-needed-assignment.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0067-crypto-Kconfig-CRYPTO_MANAGER_EXTRA_TESTS-requires-t.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0068-crypto-arm-chacha-neon-optimize-for-non-block-size-m.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0069-crypto-arm64-chacha-simplify-tail-block-handling.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0070-crypto-lib-chacha20poly1305-define-empty-module-exit.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0071-icmp-introduce-helper-for-nat-d-source-address-in-ne.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0072-net-icmp-pass-zeroed-opts-from-icmp-v6-_ndo_send-bef.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0073-net-WireGuard-secure-network-tunnel.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0074-wireguard-selftests-import-harness-makefile-for-test.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0075-wireguard-Kconfig-select-parent-dependency-for-crypt.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0076-wireguard-global-fix-spelling-mistakes-in-comments.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0077-wireguard-main-remove-unused-include-linux-version.h.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0078-wireguard-allowedips-use-kfree_rcu-instead-of-call_r.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0079-wireguard-selftests-remove-ancient-kernel-compatibil.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0080-wireguard-queueing-do-not-account-for-pfmemalloc-whe.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0081-wireguard-socket-mark-skbs-as-not-on-list-when-recei.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0082-wireguard-allowedips-fix-use-after-free-in-root_remo.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0083-wireguard-noise-reject-peers-with-low-order-public-k.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0084-wireguard-selftests-ensure-non-addition-of-peers-wit.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0085-wireguard-selftests-tie-socket-waiting-to-target-pid.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0086-wireguard-device-use-icmp_ndo_send-helper.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0087-wireguard-selftests-reduce-complexity-and-fix-make-r.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0088-wireguard-receive-reset-last_under_load-to-zero.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0089-wireguard-send-account-for-mtu-0-devices.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0090-wireguard-socket-remove-extra-call-to-synchronize_ne.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0091-wireguard-selftests-remove-duplicated-include-sys-ty.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0092-wireguard-queueing-account-for-skb-protocol-0.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0093-wireguard-receive-remove-dead-code-from-default-pack.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0094-wireguard-noise-error-out-precomputed-DH-during-hand.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0095-wireguard-send-remove-errant-newline-from-packet_enc.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0096-wireguard-queueing-cleanup-ptr_ring-in-error-path-of.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0097-wireguard-receive-use-tunnel-helpers-for-decapsulati.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0098-wireguard-selftests-use-normal-kernel-stack-size-on-.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0099-wireguard-socket-remove-errant-restriction-on-loopin.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0100-wireguard-send-receive-cond_resched-when-processing-.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0101-wireguard-selftests-initalize-ipv6-members-to-NULL-t.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0102-wireguard-send-receive-use-explicit-unlikely-branch-.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0103-wireguard-selftests-use-newer-iproute2-for-gcc-10.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0104-wireguard-noise-read-preshared-key-while-taking-lock.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0105-wireguard-queueing-preserve-flow-hash-across-packet-.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0106-wireguard-noise-separate-receive-counter-from-send-c.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0107-wireguard-noise-do-not-assign-initiation-time-in-if-.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0108-wireguard-device-avoid-circular-netns-references.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0109-wireguard-receive-account-for-napi_gro_receive-never.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0110-net-ip_tunnel-add-header_ops-for-layer-3-devices.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0111-wireguard-implement-header_ops-parse_protocol-for-AF.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0112-wireguard-queueing-make-use-of-ip_tunnel_parse_proto.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0113-netlink-consistently-use-NLA_POLICY_EXACT_LEN.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0114-netlink-consistently-use-NLA_POLICY_MIN_LEN.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0115-wireguard-noise-take-lock-when-removing-handshake-en.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0116-wireguard-peerlookup-take-lock-before-checking-hash-.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0117-wireguard-selftests-check-that-route_me_harder-packe.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0118-wireguard-avoid-double-unlikely-notation-when-using-.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0119-wireguard-socket-remove-bogus-__be32-annotation.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0120-wireguard-selftests-test-multiple-parallel-streams.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0121-wireguard-peer-put-frequently-used-members-above-cac.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0122-wireguard-device-do-not-generate-ICMP-for-non-IP-pac.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0123-wireguard-queueing-get-rid-of-per-peer-ring-buffers.patch	[new file with mode: 0644]	patch \| blob
target/linux/generic/backport-5.4/080-wireguard-0124-wireguard-kconfig-use-arm-chacha-even-with-no-neon.patch	[new file with mode: 0644]	patch \| blob