diff options
author | Ard Biesheuvel <ard.biesheuvel@linaro.org> | 2014-06-16 11:02:16 +0100 |
---|---|---|
committer | Catalin Marinas <catalin.marinas@arm.com> | 2014-06-18 12:40:54 +0100 |
commit | b913a6404ce2b7d10a735834218d3c1e1bceff2a (patch) | |
tree | 5d2736f6de8a7600c99e471dfddc6d9a5ed43b2d /arch/arm64/crypto | |
parent | 6aa8b209f5ef3610d470c519ddd6e6b47e9f6248 (diff) |
arm64/crypto: improve performance of GHASH algorithm
This patches modifies the GHASH secure hash implementation to switch to a
faster, polynomial multiplication based reduction instead of one that uses
shifts and rotates.
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
Diffstat (limited to 'arch/arm64/crypto')
-rw-r--r-- | arch/arm64/crypto/ghash-ce-core.S | 92 | ||||
-rw-r--r-- | arch/arm64/crypto/ghash-ce-glue.c | 4 |
2 files changed, 40 insertions, 56 deletions
diff --git a/arch/arm64/crypto/ghash-ce-core.S b/arch/arm64/crypto/ghash-ce-core.S index b9e6eaf41c9..dc457015884 100644 --- a/arch/arm64/crypto/ghash-ce-core.S +++ b/arch/arm64/crypto/ghash-ce-core.S @@ -3,14 +3,6 @@ * * Copyright (C) 2014 Linaro Ltd. <ard.biesheuvel@linaro.org> * - * Based on arch/x86/crypto/ghash-pmullni-intel_asm.S - * - * Copyright (c) 2009 Intel Corp. - * Author: Huang Ying <ying.huang@intel.com> - * Vinodh Gopal - * Erdinc Ozturk - * Deniz Karakoyunlu - * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 as published * by the Free Software Foundation. @@ -19,13 +11,15 @@ #include <linux/linkage.h> #include <asm/assembler.h> - DATA .req v0 - SHASH .req v1 - IN1 .req v2 + SHASH .req v0 + SHASH2 .req v1 T1 .req v2 T2 .req v3 - T3 .req v4 - VZR .req v5 + MASK .req v4 + XL .req v5 + XM .req v6 + XH .req v7 + IN1 .req v7 .text .arch armv8-a+crypto @@ -35,61 +29,51 @@ * struct ghash_key const *k, const char *head) */ ENTRY(pmull_ghash_update) - ld1 {DATA.16b}, [x1] ld1 {SHASH.16b}, [x3] - eor VZR.16b, VZR.16b, VZR.16b + ld1 {XL.16b}, [x1] + movi MASK.16b, #0xe1 + ext SHASH2.16b, SHASH.16b, SHASH.16b, #8 + shl MASK.2d, MASK.2d, #57 + eor SHASH2.16b, SHASH2.16b, SHASH.16b /* do the head block first, if supplied */ cbz x4, 0f - ld1 {IN1.2d}, [x4] + ld1 {T1.2d}, [x4] b 1f -0: ld1 {IN1.2d}, [x2], #16 +0: ld1 {T1.2d}, [x2], #16 sub w0, w0, #1 -1: ext IN1.16b, IN1.16b, IN1.16b, #8 -CPU_LE( rev64 IN1.16b, IN1.16b ) - eor DATA.16b, DATA.16b, IN1.16b - /* multiply DATA by SHASH in GF(2^128) */ - ext T2.16b, DATA.16b, DATA.16b, #8 - ext T3.16b, SHASH.16b, SHASH.16b, #8 - eor T2.16b, T2.16b, DATA.16b - eor T3.16b, T3.16b, SHASH.16b +1: /* multiply XL by SHASH in GF(2^128) */ +CPU_LE( rev64 T1.16b, T1.16b ) - pmull2 T1.1q, SHASH.2d, DATA.2d // a1 * b1 - pmull DATA.1q, SHASH.1d, DATA.1d // a0 * b0 - pmull T2.1q, T2.1d, T3.1d // (a1 + a0)(b1 + b0) - eor T2.16b, T2.16b, T1.16b // (a0 * b1) + (a1 * b0) - eor T2.16b, T2.16b, DATA.16b + ext T2.16b, XL.16b, XL.16b, #8 + ext IN1.16b, T1.16b, T1.16b, #8 + eor T1.16b, T1.16b, T2.16b + eor XL.16b, XL.16b, IN1.16b - ext T3.16b, VZR.16b, T2.16b, #8 - ext T2.16b, T2.16b, VZR.16b, #8 - eor DATA.16b, DATA.16b, T3.16b - eor T1.16b, T1.16b, T2.16b // <T1:DATA> is result of - // carry-less multiplication + pmull2 XH.1q, SHASH.2d, XL.2d // a1 * b1 + eor T1.16b, T1.16b, XL.16b + pmull XL.1q, SHASH.1d, XL.1d // a0 * b0 + pmull XM.1q, SHASH2.1d, T1.1d // (a1 + a0)(b1 + b0) - /* first phase of the reduction */ - shl T3.2d, DATA.2d, #1 - eor T3.16b, T3.16b, DATA.16b - shl T3.2d, T3.2d, #5 - eor T3.16b, T3.16b, DATA.16b - shl T3.2d, T3.2d, #57 - ext T2.16b, VZR.16b, T3.16b, #8 - ext T3.16b, T3.16b, VZR.16b, #8 - eor DATA.16b, DATA.16b, T2.16b - eor T1.16b, T1.16b, T3.16b + ext T1.16b, XL.16b, XH.16b, #8 + eor T2.16b, XL.16b, XH.16b + eor XM.16b, XM.16b, T1.16b + eor XM.16b, XM.16b, T2.16b + pmull T2.1q, XL.1d, MASK.1d - /* second phase of the reduction */ - ushr T2.2d, DATA.2d, #5 - eor T2.16b, T2.16b, DATA.16b - ushr T2.2d, T2.2d, #1 - eor T2.16b, T2.16b, DATA.16b - ushr T2.2d, T2.2d, #1 - eor T1.16b, T1.16b, T2.16b - eor DATA.16b, DATA.16b, T1.16b + mov XH.d[0], XM.d[1] + mov XM.d[1], XL.d[0] + + eor XL.16b, XM.16b, T2.16b + ext T2.16b, XL.16b, XL.16b, #8 + pmull XL.1q, XL.1d, MASK.1d + eor T2.16b, T2.16b, XH.16b + eor XL.16b, XL.16b, T2.16b cbnz w0, 0b - st1 {DATA.16b}, [x1] + st1 {XL.16b}, [x1] ret ENDPROC(pmull_ghash_update) diff --git a/arch/arm64/crypto/ghash-ce-glue.c b/arch/arm64/crypto/ghash-ce-glue.c index ef6aa69c4e0..833ec1e3f3e 100644 --- a/arch/arm64/crypto/ghash-ce-glue.c +++ b/arch/arm64/crypto/ghash-ce-glue.c @@ -67,7 +67,7 @@ static int ghash_update(struct shash_desc *desc, const u8 *src, blocks = len / GHASH_BLOCK_SIZE; len %= GHASH_BLOCK_SIZE; - kernel_neon_begin_partial(6); + kernel_neon_begin_partial(8); pmull_ghash_update(blocks, ctx->digest, src, key, partial ? ctx->buf : NULL); kernel_neon_end(); @@ -89,7 +89,7 @@ static int ghash_final(struct shash_desc *desc, u8 *dst) memset(ctx->buf + partial, 0, GHASH_BLOCK_SIZE - partial); - kernel_neon_begin_partial(6); + kernel_neon_begin_partial(8); pmull_ghash_update(1, ctx->digest, ctx->buf, key, NULL); kernel_neon_end(); } |