linux/arch/x86/crypto/ghash-clmulni-intel_asm.S
<<
>>
Prefs
   1/*
   2 * Accelerated GHASH implementation with Intel PCLMULQDQ-NI
   3 * instructions. This file contains accelerated part of ghash
   4 * implementation. More information about PCLMULQDQ can be found at:
   5 *
   6 * http://software.intel.com/en-us/articles/carry-less-multiplication-and-its-usage-for-computing-the-gcm-mode/
   7 *
   8 * Copyright (c) 2009 Intel Corp.
   9 *   Author: Huang Ying <ying.huang@intel.com>
  10 *           Vinodh Gopal
  11 *           Erdinc Ozturk
  12 *           Deniz Karakoyunlu
  13 *
  14 * This program is free software; you can redistribute it and/or modify it
  15 * under the terms of the GNU General Public License version 2 as published
  16 * by the Free Software Foundation.
  17 */
  18
  19#include <linux/linkage.h>
  20#include <asm/inst.h>
  21#include <asm/frame.h>
  22
  23.data
  24
  25.align 16
  26.Lbswap_mask:
  27        .octa 0x000102030405060708090a0b0c0d0e0f
  28.Lpoly:
  29        .octa 0xc2000000000000000000000000000001
  30.Ltwo_one:
  31        .octa 0x00000001000000000000000000000001
  32
  33#define DATA    %xmm0
  34#define SHASH   %xmm1
  35#define T1      %xmm2
  36#define T2      %xmm3
  37#define T3      %xmm4
  38#define BSWAP   %xmm5
  39#define IN1     %xmm6
  40
  41.text
  42
  43/*
  44 * __clmul_gf128mul_ble:        internal ABI
  45 * input:
  46 *      DATA:                   operand1
  47 *      SHASH:                  operand2, hash_key << 1 mod poly
  48 * output:
  49 *      DATA:                   operand1 * operand2 mod poly
  50 * changed:
  51 *      T1
  52 *      T2
  53 *      T3
  54 */
  55__clmul_gf128mul_ble:
  56        movaps DATA, T1
  57        pshufd $0b01001110, DATA, T2
  58        pshufd $0b01001110, SHASH, T3
  59        pxor DATA, T2
  60        pxor SHASH, T3
  61
  62        PCLMULQDQ 0x00 SHASH DATA       # DATA = a0 * b0
  63        PCLMULQDQ 0x11 SHASH T1         # T1 = a1 * b1
  64        PCLMULQDQ 0x00 T3 T2            # T2 = (a1 + a0) * (b1 + b0)
  65        pxor DATA, T2
  66        pxor T1, T2                     # T2 = a0 * b1 + a1 * b0
  67
  68        movaps T2, T3
  69        pslldq $8, T3
  70        psrldq $8, T2
  71        pxor T3, DATA
  72        pxor T2, T1                     # <T1:DATA> is result of
  73                                        # carry-less multiplication
  74
  75        # first phase of the reduction
  76        movaps DATA, T3
  77        psllq $1, T3
  78        pxor DATA, T3
  79        psllq $5, T3
  80        pxor DATA, T3
  81        psllq $57, T3
  82        movaps T3, T2
  83        pslldq $8, T2
  84        psrldq $8, T3
  85        pxor T2, DATA
  86        pxor T3, T1
  87
  88        # second phase of the reduction
  89        movaps DATA, T2
  90        psrlq $5, T2
  91        pxor DATA, T2
  92        psrlq $1, T2
  93        pxor DATA, T2
  94        psrlq $1, T2
  95        pxor T2, T1
  96        pxor T1, DATA
  97        ret
  98ENDPROC(__clmul_gf128mul_ble)
  99
 100/* void clmul_ghash_mul(char *dst, const be128 *shash) */
 101ENTRY(clmul_ghash_mul)
 102        FRAME_BEGIN
 103        movups (%rdi), DATA
 104        movups (%rsi), SHASH
 105        movaps .Lbswap_mask, BSWAP
 106        PSHUFB_XMM BSWAP DATA
 107        call __clmul_gf128mul_ble
 108        PSHUFB_XMM BSWAP DATA
 109        movups DATA, (%rdi)
 110        FRAME_END
 111        ret
 112ENDPROC(clmul_ghash_mul)
 113
 114/*
 115 * void clmul_ghash_update(char *dst, const char *src, unsigned int srclen,
 116 *                         const be128 *shash);
 117 */
 118ENTRY(clmul_ghash_update)
 119        FRAME_BEGIN
 120        cmp $16, %rdx
 121        jb .Lupdate_just_ret    # check length
 122        movaps .Lbswap_mask, BSWAP
 123        movups (%rdi), DATA
 124        movups (%rcx), SHASH
 125        PSHUFB_XMM BSWAP DATA
 126.align 4
 127.Lupdate_loop:
 128        movups (%rsi), IN1
 129        PSHUFB_XMM BSWAP IN1
 130        pxor IN1, DATA
 131        call __clmul_gf128mul_ble
 132        sub $16, %rdx
 133        add $16, %rsi
 134        cmp $16, %rdx
 135        jge .Lupdate_loop
 136        PSHUFB_XMM BSWAP DATA
 137        movups DATA, (%rdi)
 138.Lupdate_just_ret:
 139        FRAME_END
 140        ret
 141ENDPROC(clmul_ghash_update)
 142
 143/*
 144 * void clmul_ghash_setkey(be128 *shash, const u8 *key);
 145 *
 146 * Calculate hash_key << 1 mod poly
 147 */
 148ENTRY(clmul_ghash_setkey)
 149        movaps .Lbswap_mask, BSWAP
 150        movups (%rsi), %xmm0
 151        PSHUFB_XMM BSWAP %xmm0
 152        movaps %xmm0, %xmm1
 153        psllq $1, %xmm0
 154        psrlq $63, %xmm1
 155        movaps %xmm1, %xmm2
 156        pslldq $8, %xmm1
 157        psrldq $8, %xmm2
 158        por %xmm1, %xmm0
 159        # reduction
 160        pshufd $0b00100100, %xmm2, %xmm1
 161        pcmpeqd .Ltwo_one, %xmm1
 162        pand .Lpoly, %xmm1
 163        pxor %xmm1, %xmm0
 164        movups %xmm0, (%rdi)
 165        ret
 166ENDPROC(clmul_ghash_setkey)
 167