linux/arch/x86/crypto/ghash-clmulni-intel_asm.S
<<
>>
Prefs
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Accelerated GHASH implementation with Intel PCLMULQDQ-NI
 * instructions. This file contains the accelerated part of the ghash
 * implementation. More information about PCLMULQDQ can be found at:
 *
 * http://software.intel.com/en-us/articles/carry-less-multiplication-and-its-usage-for-computing-the-gcm-mode/
 *
 * Copyright (c) 2009 Intel Corp.
 *   Author: Huang Ying <ying.huang@intel.com>
 *           Vinodh Gopal
 *           Erdinc Ozturk
 *           Deniz Karakoyunlu
 */
  15
  16#include <linux/linkage.h>
  17#include <asm/frame.h>
  18
  19.section        .rodata.cst16.bswap_mask, "aM", @progbits, 16
  20.align 16
  21.Lbswap_mask:
  22        .octa 0x000102030405060708090a0b0c0d0e0f
  23
  24#define DATA    %xmm0
  25#define SHASH   %xmm1
  26#define T1      %xmm2
  27#define T2      %xmm3
  28#define T3      %xmm4
  29#define BSWAP   %xmm5
  30#define IN1     %xmm6
  31
  32.text
  33
  34/*
  35 * __clmul_gf128mul_ble:        internal ABI
  36 * input:
  37 *      DATA:                   operand1
  38 *      SHASH:                  operand2, hash_key << 1 mod poly
  39 * output:
  40 *      DATA:                   operand1 * operand2 mod poly
  41 * changed:
  42 *      T1
  43 *      T2
  44 *      T3
  45 */
  46SYM_FUNC_START_LOCAL(__clmul_gf128mul_ble)
  47        movaps DATA, T1
  48        pshufd $0b01001110, DATA, T2
  49        pshufd $0b01001110, SHASH, T3
  50        pxor DATA, T2
  51        pxor SHASH, T3
  52
  53        pclmulqdq $0x00, SHASH, DATA    # DATA = a0 * b0
  54        pclmulqdq $0x11, SHASH, T1      # T1 = a1 * b1
  55        pclmulqdq $0x00, T3, T2         # T2 = (a1 + a0) * (b1 + b0)
  56        pxor DATA, T2
  57        pxor T1, T2                     # T2 = a0 * b1 + a1 * b0
  58
  59        movaps T2, T3
  60        pslldq $8, T3
  61        psrldq $8, T2
  62        pxor T3, DATA
  63        pxor T2, T1                     # <T1:DATA> is result of
  64                                        # carry-less multiplication
  65
  66        # first phase of the reduction
  67        movaps DATA, T3
  68        psllq $1, T3
  69        pxor DATA, T3
  70        psllq $5, T3
  71        pxor DATA, T3
  72        psllq $57, T3
  73        movaps T3, T2
  74        pslldq $8, T2
  75        psrldq $8, T3
  76        pxor T2, DATA
  77        pxor T3, T1
  78
  79        # second phase of the reduction
  80        movaps DATA, T2
  81        psrlq $5, T2
  82        pxor DATA, T2
  83        psrlq $1, T2
  84        pxor DATA, T2
  85        psrlq $1, T2
  86        pxor T2, T1
  87        pxor T1, DATA
  88        ret
  89SYM_FUNC_END(__clmul_gf128mul_ble)
  90
  91/* void clmul_ghash_mul(char *dst, const u128 *shash) */
  92SYM_FUNC_START(clmul_ghash_mul)
  93        FRAME_BEGIN
  94        movups (%rdi), DATA
  95        movups (%rsi), SHASH
  96        movaps .Lbswap_mask, BSWAP
  97        pshufb BSWAP, DATA
  98        call __clmul_gf128mul_ble
  99        pshufb BSWAP, DATA
 100        movups DATA, (%rdi)
 101        FRAME_END
 102        ret
 103SYM_FUNC_END(clmul_ghash_mul)
 104
 105/*
 106 * void clmul_ghash_update(char *dst, const char *src, unsigned int srclen,
 107 *                         const u128 *shash);
 108 */
 109SYM_FUNC_START(clmul_ghash_update)
 110        FRAME_BEGIN
 111        cmp $16, %rdx
 112        jb .Lupdate_just_ret    # check length
 113        movaps .Lbswap_mask, BSWAP
 114        movups (%rdi), DATA
 115        movups (%rcx), SHASH
 116        pshufb BSWAP, DATA
 117.align 4
 118.Lupdate_loop:
 119        movups (%rsi), IN1
 120        pshufb BSWAP, IN1
 121        pxor IN1, DATA
 122        call __clmul_gf128mul_ble
 123        sub $16, %rdx
 124        add $16, %rsi
 125        cmp $16, %rdx
 126        jge .Lupdate_loop
 127        pshufb BSWAP, DATA
 128        movups DATA, (%rdi)
 129.Lupdate_just_ret:
 130        FRAME_END
 131        ret
 132SYM_FUNC_END(clmul_ghash_update)
 133