linux/arch/x86/crypto/sha256-mb/sha256_mb_mgr_submit_avx2.S
<<
>>
Prefs
   1/*
   2 * Buffer submit code for multi buffer SHA256 algorithm
   3 *
   4 * This file is provided under a dual BSD/GPLv2 license.  When using or
   5 * redistributing this file, you may do so under either license.
   6 *
   7 * GPL LICENSE SUMMARY
   8 *
   9 *  Copyright(c) 2016 Intel Corporation.
  10 *
  11 *  This program is free software; you can redistribute it and/or modify
  12 *  it under the terms of version 2 of the GNU General Public License as
  13 *  published by the Free Software Foundation.
  14 *
  15 *  This program is distributed in the hope that it will be useful, but
  16 *  WITHOUT ANY WARRANTY; without even the implied warranty of
  17 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18 *  General Public License for more details.
  19 *
  20 *  Contact Information:
  21 *      Megha Dey <megha.dey@linux.intel.com>
  22 *
  23 *  BSD LICENSE
  24 *
  25 *  Copyright(c) 2016 Intel Corporation.
  26 *
  27 *  Redistribution and use in source and binary forms, with or without
  28 *  modification, are permitted provided that the following conditions
  29 *  are met:
  30 *
  31 *    * Redistributions of source code must retain the above copyright
  32 *      notice, this list of conditions and the following disclaimer.
  33 *    * Redistributions in binary form must reproduce the above copyright
  34 *      notice, this list of conditions and the following disclaimer in
  35 *      the documentation and/or other materials provided with the
  36 *      distribution.
  37 *    * Neither the name of Intel Corporation nor the names of its
  38 *      contributors may be used to endorse or promote products derived
  39 *      from this software without specific prior written permission.
  40 *
  41 *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  42 *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  43 *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  44 *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  45 *  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  46 *  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  47 *  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  48 *  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  49 *  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  50 *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  51 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  52 */
  53
  54#include <linux/linkage.h>
  55#include <asm/frame.h>
  56#include "sha256_mb_mgr_datastruct.S"
  57
  58.extern sha256_x8_avx2
  59
  60# LINUX register definitions
  61arg1            = %rdi
  62arg2            = %rsi
  63size_offset     = %rcx
  64tmp2            = %rcx
  65extra_blocks    = %rdx
  66
  67# Common definitions
  68#define state   arg1
  69#define job     %rsi
  70#define len2    arg2
  71#define p2      arg2
  72
  73# idx must be a register not clobberred by sha1_x8_avx2
  74idx             = %r8
  75DWORD_idx       = %r8d
  76last_len        = %r8
  77
  78p               = %r11
  79start_offset    = %r11
  80
  81unused_lanes    = %rbx
  82BYTE_unused_lanes = %bl
  83
  84job_rax         = %rax
  85len             = %rax
  86DWORD_len       = %eax
  87
  88lane            = %r12
  89tmp3            = %r12
  90
  91tmp             = %r9
  92DWORD_tmp       = %r9d
  93
  94lane_data       = %r10
  95
  96# JOB* sha256_mb_mgr_submit_avx2(MB_MGR *state, JOB_SHA256 *job)
  97# arg 1 : rcx : state
  98# arg 2 : rdx : job
  99ENTRY(sha256_mb_mgr_submit_avx2)
 100        FRAME_BEGIN
 101        push    %rbx
 102        push    %r12
 103
 104        mov     _unused_lanes(state), unused_lanes
 105        mov     unused_lanes, lane
 106        and     $0xF, lane
 107        shr     $4, unused_lanes
 108        imul    $_LANE_DATA_size, lane, lane_data
 109        movl    $STS_BEING_PROCESSED, _status(job)
 110        lea     _ldata(state, lane_data), lane_data
 111        mov     unused_lanes, _unused_lanes(state)
 112        movl    _len(job),  DWORD_len
 113
 114        mov     job, _job_in_lane(lane_data)
 115        shl     $4, len
 116        or      lane, len
 117
 118        movl    DWORD_len,  _lens(state , lane, 4)
 119
 120        # Load digest words from result_digest
 121        vmovdqu _result_digest(job), %xmm0
 122        vmovdqu _result_digest+1*16(job), %xmm1
 123        vmovd   %xmm0, _args_digest(state, lane, 4)
 124        vpextrd $1, %xmm0, _args_digest+1*32(state , lane, 4)
 125        vpextrd $2, %xmm0, _args_digest+2*32(state , lane, 4)
 126        vpextrd $3, %xmm0, _args_digest+3*32(state , lane, 4)
 127        vmovd   %xmm1, _args_digest+4*32(state , lane, 4)
 128
 129        vpextrd $1, %xmm1, _args_digest+5*32(state , lane, 4)
 130        vpextrd $2, %xmm1, _args_digest+6*32(state , lane, 4)
 131        vpextrd $3, %xmm1, _args_digest+7*32(state , lane, 4)
 132
 133        mov     _buffer(job), p
 134        mov     p, _args_data_ptr(state, lane, 8)
 135
 136        cmp     $0xF, unused_lanes
 137        jne     return_null
 138
 139start_loop:
 140        # Find min length
 141        vmovdqa _lens(state), %xmm0
 142        vmovdqa _lens+1*16(state), %xmm1
 143
 144        vpminud %xmm1, %xmm0, %xmm2             # xmm2 has {D,C,B,A}
 145        vpalignr $8, %xmm2, %xmm3, %xmm3        # xmm3 has {x,x,D,C}
 146        vpminud %xmm3, %xmm2, %xmm2             # xmm2 has {x,x,E,F}
 147        vpalignr $4, %xmm2, %xmm3, %xmm3        # xmm3 has {x,x,x,E}
 148        vpminud %xmm3, %xmm2, %xmm2             # xmm2 has min val in low dword
 149
 150        vmovd   %xmm2, DWORD_idx
 151        mov     idx, len2
 152        and     $0xF, idx
 153        shr     $4, len2
 154        jz      len_is_0
 155
 156        vpand   clear_low_nibble(%rip), %xmm2, %xmm2
 157        vpshufd $0, %xmm2, %xmm2
 158
 159        vpsubd  %xmm2, %xmm0, %xmm0
 160        vpsubd  %xmm2, %xmm1, %xmm1
 161
 162        vmovdqa %xmm0, _lens + 0*16(state)
 163        vmovdqa %xmm1, _lens + 1*16(state)
 164
 165        # "state" and "args" are the same address, arg1
 166        # len is arg2
 167        call    sha256_x8_avx2
 168
 169        # state and idx are intact
 170
 171len_is_0:
 172        # process completed job "idx"
 173        imul    $_LANE_DATA_size, idx, lane_data
 174        lea     _ldata(state, lane_data), lane_data
 175
 176        mov     _job_in_lane(lane_data), job_rax
 177        mov     _unused_lanes(state), unused_lanes
 178        movq    $0, _job_in_lane(lane_data)
 179        movl    $STS_COMPLETED, _status(job_rax)
 180        shl     $4, unused_lanes
 181        or      idx, unused_lanes
 182        mov     unused_lanes, _unused_lanes(state)
 183
 184        movl    $0xFFFFFFFF, _lens(state,idx,4)
 185
 186        vmovd   _args_digest(state, idx, 4), %xmm0
 187        vpinsrd $1, _args_digest+1*32(state , idx, 4), %xmm0, %xmm0
 188        vpinsrd $2, _args_digest+2*32(state , idx, 4), %xmm0, %xmm0
 189        vpinsrd $3, _args_digest+3*32(state , idx, 4), %xmm0, %xmm0
 190        vmovd   _args_digest+4*32(state, idx, 4), %xmm1
 191
 192        vpinsrd $1, _args_digest+5*32(state , idx, 4), %xmm1, %xmm1
 193        vpinsrd $2, _args_digest+6*32(state , idx, 4), %xmm1, %xmm1
 194        vpinsrd $3, _args_digest+7*32(state , idx, 4), %xmm1, %xmm1
 195
 196        vmovdqu %xmm0, _result_digest(job_rax)
 197        vmovdqu %xmm1, _result_digest+1*16(job_rax)
 198
 199return:
 200        pop     %r12
 201        pop     %rbx
 202        FRAME_END
 203        ret
 204
 205return_null:
 206        xor     job_rax, job_rax
 207        jmp     return
 208
 209ENDPROC(sha256_mb_mgr_submit_avx2)
 210
 211.data
 212
 213.align 16
 214clear_low_nibble:
 215        .octa   0x000000000000000000000000FFFFFFF0
 216