linux/arch/x86/crypto/sha256-mb/sha256_mb_mgr_flush_avx2.S
<<
>>
Prefs
   1/*
   2 * Flush routine for SHA256 multibuffer
   3 *
   4 * This file is provided under a dual BSD/GPLv2 license.  When using or
   5 * redistributing this file, you may do so under either license.
   6 *
   7 * GPL LICENSE SUMMARY
   8 *
   9 *  Copyright(c) 2016 Intel Corporation.
  10 *
  11 *  This program is free software; you can redistribute it and/or modify
  12 *  it under the terms of version 2 of the GNU General Public License as
  13 *  published by the Free Software Foundation.
  14 *
  15 *  This program is distributed in the hope that it will be useful, but
  16 *  WITHOUT ANY WARRANTY; without even the implied warranty of
  17 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18 *  General Public License for more details.
  19 *
  20 *  Contact Information:
  21 *      Megha Dey <megha.dey@linux.intel.com>
  22 *
  23 *  BSD LICENSE
  24 *
  25 *  Copyright(c) 2016 Intel Corporation.
  26 *
  27 *  Redistribution and use in source and binary forms, with or without
  28 *  modification, are permitted provided that the following conditions
  29 *  are met:
  30 *
  31 *    * Redistributions of source code must retain the above copyright
  32 *      notice, this list of conditions and the following disclaimer.
  33 *    * Redistributions in binary form must reproduce the above copyright
  34 *      notice, this list of conditions and the following disclaimer in
  35 *      the documentation and/or other materials provided with the
  36 *      distribution.
  37 *    * Neither the name of Intel Corporation nor the names of its
  38 *      contributors may be used to endorse or promote products derived
  39 *      from this software without specific prior written permission.
  40 *
  41 *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  42 *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  43 *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  44 *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  45 *  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  46 *  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  47 *  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  48 *  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  49 *  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  50 *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  51 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  52 */
  53#include <linux/linkage.h>
  54#include <asm/frame.h>
  55#include "sha256_mb_mgr_datastruct.S"
  56
  57.extern sha256_x8_avx2
  58
  59#LINUX register definitions
  60#define arg1    %rdi
  61#define arg2    %rsi
  62
  63# Common register definitions
  64#define state   arg1
  65#define job     arg2
  66#define len2    arg2
  67
  68# idx must be a register not clobberred by sha1_mult
  69#define idx             %r8
  70#define DWORD_idx       %r8d
  71
  72#define unused_lanes    %rbx
  73#define lane_data       %rbx
  74#define tmp2            %rbx
  75#define tmp2_w          %ebx
  76
  77#define job_rax         %rax
  78#define tmp1            %rax
  79#define size_offset     %rax
  80#define tmp             %rax
  81#define start_offset    %rax
  82
  83#define tmp3            %arg1
  84
  85#define extra_blocks    %arg2
  86#define p               %arg2
  87
/*
 * LABEL prefix n
 * Defines a label whose name is "prefix" immediately followed by "n".
 * The \() after \n terminates the parameter name so that the colon is
 * emitted right after the substituted value.
 */
.macro LABEL prefix n
\prefix\n\():
.endm

/*
 * JNE_SKIP i
 * Emits a "jne" to the label skip_<i>; pairs with "LABEL skip_ <i>".
 */
.macro JNE_SKIP i
jne     skip_\i
.endm

/*
 * SET_OFFSET _offset
 * Assigns the given value to the assembler symbol named offset.
 * NOTE(review): no invocation of this macro is visible in this file.
 */
.altmacro
.macro SET_OFFSET _offset
offset = \_offset
.endm
.noaltmacro
 101
 102# JOB_SHA256* sha256_mb_mgr_flush_avx2(MB_MGR *state)
 103# arg 1 : rcx : state
 104ENTRY(sha256_mb_mgr_flush_avx2)
 105        FRAME_BEGIN
 106        push    %rbx
 107
 108        # If bit (32+3) is set, then all lanes are empty
 109        mov     _unused_lanes(state), unused_lanes
 110        bt      $32+3, unused_lanes
 111        jc      return_null
 112
 113        # find a lane with a non-null job
 114        xor     idx, idx
 115        offset = (_ldata + 1 * _LANE_DATA_size + _job_in_lane)
 116        cmpq    $0, offset(state)
 117        cmovne  one(%rip), idx
 118        offset = (_ldata + 2 * _LANE_DATA_size + _job_in_lane)
 119        cmpq    $0, offset(state)
 120        cmovne  two(%rip), idx
 121        offset = (_ldata + 3 * _LANE_DATA_size + _job_in_lane)
 122        cmpq    $0, offset(state)
 123        cmovne  three(%rip), idx
 124        offset = (_ldata + 4 * _LANE_DATA_size + _job_in_lane)
 125        cmpq    $0, offset(state)
 126        cmovne  four(%rip), idx
 127        offset = (_ldata + 5 * _LANE_DATA_size + _job_in_lane)
 128        cmpq    $0, offset(state)
 129        cmovne  five(%rip), idx
 130        offset = (_ldata + 6 * _LANE_DATA_size + _job_in_lane)
 131        cmpq    $0, offset(state)
 132        cmovne  six(%rip), idx
 133        offset = (_ldata + 7 * _LANE_DATA_size + _job_in_lane)
 134        cmpq    $0, offset(state)
 135        cmovne  seven(%rip), idx
 136
 137        # copy idx to empty lanes
 138copy_lane_data:
 139        offset =  (_args + _data_ptr)
 140        mov     offset(state,idx,8), tmp
 141
 142        I = 0
 143.rep 8
 144        offset = (_ldata + I * _LANE_DATA_size + _job_in_lane)
 145        cmpq    $0, offset(state)
 146.altmacro
 147        JNE_SKIP %I
 148        offset =  (_args + _data_ptr + 8*I)
 149        mov     tmp, offset(state)
 150        offset =  (_lens + 4*I)
 151        movl    $0xFFFFFFFF, offset(state)
 152LABEL skip_ %I
 153        I = (I+1)
 154.noaltmacro
 155.endr
 156
 157        # Find min length
 158        vmovdqu _lens+0*16(state), %xmm0
 159        vmovdqu _lens+1*16(state), %xmm1
 160
 161        vpminud %xmm1, %xmm0, %xmm2             # xmm2 has {D,C,B,A}
 162        vpalignr $8, %xmm2, %xmm3, %xmm3        # xmm3 has {x,x,D,C}
 163        vpminud %xmm3, %xmm2, %xmm2             # xmm2 has {x,x,E,F}
 164        vpalignr $4, %xmm2, %xmm3, %xmm3        # xmm3 has {x,x,x,E}
 165        vpminud %xmm3, %xmm2, %xmm2             # xmm2 has min val in low dword
 166
 167        vmovd   %xmm2, DWORD_idx
 168        mov     idx, len2
 169        and     $0xF, idx
 170        shr     $4, len2
 171        jz      len_is_0
 172
 173        vpand   clear_low_nibble(%rip), %xmm2, %xmm2
 174        vpshufd $0, %xmm2, %xmm2
 175
 176        vpsubd  %xmm2, %xmm0, %xmm0
 177        vpsubd  %xmm2, %xmm1, %xmm1
 178
 179        vmovdqu %xmm0, _lens+0*16(state)
 180        vmovdqu %xmm1, _lens+1*16(state)
 181
 182        # "state" and "args" are the same address, arg1
 183        # len is arg2
 184        call    sha256_x8_avx2
 185        # state and idx are intact
 186
 187len_is_0:
 188        # process completed job "idx"
 189        imul    $_LANE_DATA_size, idx, lane_data
 190        lea     _ldata(state, lane_data), lane_data
 191
 192        mov     _job_in_lane(lane_data), job_rax
 193        movq    $0, _job_in_lane(lane_data)
 194        movl    $STS_COMPLETED, _status(job_rax)
 195        mov     _unused_lanes(state), unused_lanes
 196        shl     $4, unused_lanes
 197        or      idx, unused_lanes
 198
 199        mov     unused_lanes, _unused_lanes(state)
 200        movl    $0xFFFFFFFF, _lens(state,idx,4)
 201
 202        vmovd   _args_digest(state , idx, 4) , %xmm0
 203        vpinsrd $1, _args_digest+1*32(state, idx, 4), %xmm0, %xmm0
 204        vpinsrd $2, _args_digest+2*32(state, idx, 4), %xmm0, %xmm0
 205        vpinsrd $3, _args_digest+3*32(state, idx, 4), %xmm0, %xmm0
 206        vmovd   _args_digest+4*32(state, idx, 4), %xmm1
 207        vpinsrd $1, _args_digest+5*32(state, idx, 4), %xmm1, %xmm1
 208        vpinsrd $2, _args_digest+6*32(state, idx, 4), %xmm1, %xmm1
 209        vpinsrd $3, _args_digest+7*32(state, idx, 4), %xmm1, %xmm1
 210
 211        vmovdqu %xmm0, _result_digest(job_rax)
 212        offset =  (_result_digest + 1*16)
 213        vmovdqu %xmm1, offset(job_rax)
 214
 215return:
 216        pop     %rbx
 217        FRAME_END
 218        ret
 219
 220return_null:
 221        xor     job_rax, job_rax
 222        jmp     return
 223ENDPROC(sha256_mb_mgr_flush_avx2)
 224
 225##############################################################################
 226
 227.align 16
 228ENTRY(sha256_mb_mgr_get_comp_job_avx2)
 229        push    %rbx
 230
 231        ## if bit 32+3 is set, then all lanes are empty
 232        mov     _unused_lanes(state), unused_lanes
 233        bt      $(32+3), unused_lanes
 234        jc      .return_null
 235
 236        # Find min length
 237        vmovdqu _lens(state), %xmm0
 238        vmovdqu _lens+1*16(state), %xmm1
 239
 240        vpminud %xmm1, %xmm0, %xmm2             # xmm2 has {D,C,B,A}
 241        vpalignr $8, %xmm2, %xmm3, %xmm3        # xmm3 has {x,x,D,C}
 242        vpminud %xmm3, %xmm2, %xmm2             # xmm2 has {x,x,E,F}
 243        vpalignr $4, %xmm2, %xmm3, %xmm3        # xmm3 has {x,x,x,E}
 244        vpminud %xmm3, %xmm2, %xmm2             # xmm2 has min val in low dword
 245
 246        vmovd   %xmm2, DWORD_idx
 247        test    $~0xF, idx
 248        jnz     .return_null
 249
 250        # process completed job "idx"
 251        imul    $_LANE_DATA_size, idx, lane_data
 252        lea     _ldata(state, lane_data), lane_data
 253
 254        mov     _job_in_lane(lane_data), job_rax
 255        movq    $0,  _job_in_lane(lane_data)
 256        movl    $STS_COMPLETED, _status(job_rax)
 257        mov     _unused_lanes(state), unused_lanes
 258        shl     $4, unused_lanes
 259        or      idx, unused_lanes
 260        mov     unused_lanes, _unused_lanes(state)
 261
 262        movl    $0xFFFFFFFF, _lens(state,  idx, 4)
 263
 264        vmovd   _args_digest(state, idx, 4), %xmm0
 265        vpinsrd $1, _args_digest+1*32(state, idx, 4), %xmm0, %xmm0
 266        vpinsrd $2, _args_digest+2*32(state, idx, 4), %xmm0, %xmm0
 267        vpinsrd $3, _args_digest+3*32(state, idx, 4), %xmm0, %xmm0
 268        vmovd   _args_digest(state , idx, 4) , %xmm0
 269        vpinsrd $1, _args_digest+5*32(state, idx, 4), %xmm1, %xmm1
 270        vpinsrd $2, _args_digest+6*32(state, idx, 4), %xmm1, %xmm1
 271        vpinsrd $3, _args_digest+7*32(state, idx, 4), %xmm1, %xmm1
 272
 273        vmovdqu %xmm0, _result_digest(job_rax)
 274        offset =  (_result_digest + 1*16)
 275        vmovdqu %xmm1, offset(job_rax)
 276
 277        pop     %rbx
 278
 279        ret
 280
 281.return_null:
 282        xor     job_rax, job_rax
 283        pop     %rbx
 284        ret
 285ENDPROC(sha256_mb_mgr_get_comp_job_avx2)
 286
 287.section        .rodata.cst16.clear_low_nibble, "aM", @progbits, 16
 288.align 16
 289clear_low_nibble:
 290.octa   0x000000000000000000000000FFFFFFF0
 291
 292.section        .rodata.cst8, "aM", @progbits, 8
 293.align 8
 294one:
 295.quad   1
 296two:
 297.quad   2
 298three:
 299.quad   3
 300four:
 301.quad   4
 302five:
 303.quad   5
 304six:
 305.quad   6
 306seven:
 307.quad  7
 308