linux/arch/arm/crypto/sha1-armv7-neon.S
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* sha1-armv7-neon.S - ARM/NEON accelerated SHA-1 transform function
 *
 * Copyright © 2013-2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

.syntax unified
.fpu neon

.text


/* Context structure */

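/* Byte offsets of the five 32-bit chaining values h0..h4 within the
 * SHA-1 state that ctx (r0) points to.
 */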
#define state_h0 0
#define state_h1 4
#define state_h2 8
#define state_h3 12
#define state_h4 16


/* Constants */

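/* The four SHA-1 round constants (FIPS 180-4): K1 covers rounds 0-19,
 * K2 rounds 20-39, K3 rounds 40-59 and K4 rounds 60-79.  Each constant
 * is replicated into all four lanes of a q register (.LK_VEC below) so
 * that a single vadd.u32 adds it to four message words at once.
 */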
#define K1  0x5A827999
#define K2  0x6ED9EBA1
#define K3  0x8F1BBCDC
#define K4  0xCA62C1D6
.align 4
.LK_VEC:
.LK1:   .long K1, K1, K1, K1
.LK2:   .long K2, K2, K2, K2
.LK3:   .long K3, K3, K3, K3
.LK4:   .long K4, K4, K4, K4


/* Register macros */

#define RSTATE r0
#define RDATA r1
#define RNBLKS r2
#define ROLDSTACK r3
#define RWK lr

#define _a r4
#define _b r5
#define _c r6
#define _d r7
#define _e r8

#define RT0 r9
#define RT1 r10
#define RT2 r11
#define RT3 r12

#define W0 q0
#define W1 q7
#define W2 q2
#define W3 q3
#define W4 q4
#define W5 q6
#define W6 q5
#define W7 q1

#define tmp0 q8
#define tmp1 q9
#define tmp2 q10
#define tmp3 q11

#define qK1 q12
#define qK2 q13
#define qK3 q14
#define qK4 q15

#ifdef CONFIG_CPU_BIG_ENDIAN
#define ARM_LE(code...)
#else
#define ARM_LE(code...)         code
#endif

/* Round function macros. */

#define WK_offs(i) (((i) & 15) * 4)

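/* The function keeps a 16-entry ring of precomputed W[i]+K words on
 * the stack; WK_offs(i) maps round number i to its byte offset.  Each
 * scalar round macro below fetches its W+K with a single ldr, while
 * the pre1/pre2/pre3 slots interleave NEON instructions that
 * precompute message words for later rounds.
 *
 * _R_F1 implements rounds 0-19.  Its boolean function is
 * F1(b,c,d) = (b AND c) OR ((NOT b) AND d); the two terms have no bits
 * in common, so the OR can be computed as two separate adds into e.
 */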
#define _R_F1(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
              W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        ldr RT3, [sp, WK_offs(i)]; \
                pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
        bic RT0, d, b; \
        add e, e, a, ror #(32 - 5); \
        and RT1, c, b; \
                pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
        add RT0, RT0, RT3; \
        add e, e, RT1; \
        ror b, #(32 - 30); \
                pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
        add e, e, RT0;

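/* _R_F2 implements rounds 20-39 with the parity function
 * F2(b,c,d) = b XOR c XOR d.
 */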
#define _R_F2(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
              W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        ldr RT3, [sp, WK_offs(i)]; \
                pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
        eor RT0, d, b; \
        add e, e, a, ror #(32 - 5); \
        eor RT0, RT0, c; \
                pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
        add e, e, RT3; \
        ror b, #(32 - 30); \
                pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
        add e, e, RT0; \

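/* _R_F3 implements rounds 40-59 with the majority function
 * F3(b,c,d) = (b AND c) OR (b AND d) OR (c AND d), computed in the
 * equivalent two-term form (b AND c) + ((b XOR c) AND d); the two
 * terms are bitwise disjoint, so here too the OR becomes an add.
 */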
#define _R_F3(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
              W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        ldr RT3, [sp, WK_offs(i)]; \
                pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
        eor RT0, b, c; \
        and RT1, b, c; \
        add e, e, a, ror #(32 - 5); \
                pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
        and RT0, RT0, d; \
        add RT1, RT1, RT3; \
        add e, e, RT0; \
        ror b, #(32 - 30); \
                pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
        add e, e, RT1;

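/* Rounds 60-79 use the same parity function as rounds 20-39, so _R_F4
 * simply expands to _R_F2.
 */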
#define _R_F4(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
              W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        _R_F2(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
              W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28)

#define _R(a,b,c,d,e,f,i,pre1,pre2,pre3,i16,\
           W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        _R_##f(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
               W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28)

#define R(a,b,c,d,e,f,i) \
        _R_##f(a,b,c,d,e,i,dummy,dummy,dummy,i16,\
               W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28)

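/* dummy() expands to nothing; R() uses it to leave all three pre-macro
 * slots empty for rounds that have no NEON work to interleave.
 */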
#define dummy(...)


/* Input expansion macros. */

/********* Precalc macros for rounds 0-15 *************************************/

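/* For rounds 0-15, W[i] is simply the input block: load four q
 * registers (16 words), byte-swap them on little-endian builds (SHA-1
 * message words are big endian), add curK and store the W+K values to
 * the stack ring.  The WPRECALC_00_15_x single-step variants below
 * split the same work into pieces that can be interleaved with rounds
 * 64-79 of the previous block.
 */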
#define W_PRECALC_00_15() \
        add       RWK, sp, #(WK_offs(0));                       \
        \
        vld1.32   {W0, W7}, [RDATA]!;                           \
 ARM_LE(vrev32.8  W0, W0;       )       /* big => little */     \
        vld1.32   {W6, W5}, [RDATA]!;                           \
        vadd.u32  tmp0, W0, curK;                               \
 ARM_LE(vrev32.8  W7, W7;       )       /* big => little */     \
 ARM_LE(vrev32.8  W6, W6;       )       /* big => little */     \
        vadd.u32  tmp1, W7, curK;                               \
 ARM_LE(vrev32.8  W5, W5;       )       /* big => little */     \
        vadd.u32  tmp2, W6, curK;                               \
        vst1.32   {tmp0, tmp1}, [RWK]!;                         \
        vadd.u32  tmp3, W5, curK;                               \
        vst1.32   {tmp2, tmp3}, [RWK];                          \

#define WPRECALC_00_15_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        vld1.32   {W0, W7}, [RDATA]!;                           \

#define WPRECALC_00_15_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        add       RWK, sp, #(WK_offs(0));                       \

#define WPRECALC_00_15_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
 ARM_LE(vrev32.8  W0, W0;       )       /* big => little */     \

#define WPRECALC_00_15_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        vld1.32   {W6, W5}, [RDATA]!;                           \

#define WPRECALC_00_15_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        vadd.u32  tmp0, W0, curK;                               \

#define WPRECALC_00_15_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
 ARM_LE(vrev32.8  W7, W7;       )       /* big => little */     \

#define WPRECALC_00_15_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
 ARM_LE(vrev32.8  W6, W6;       )       /* big => little */     \

#define WPRECALC_00_15_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        vadd.u32  tmp1, W7, curK;                               \

#define WPRECALC_00_15_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
 ARM_LE(vrev32.8  W5, W5;       )       /* big => little */     \

#define WPRECALC_00_15_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        vadd.u32  tmp2, W6, curK;                               \

#define WPRECALC_00_15_10(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        vst1.32   {tmp0, tmp1}, [RWK]!;                         \

#define WPRECALC_00_15_11(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        vadd.u32  tmp3, W5, curK;                               \

#define WPRECALC_00_15_12(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        vst1.32   {tmp2, tmp3}, [RWK];                          \


/********* Precalc macros for rounds 16-31 ************************************/

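/* For 16 <= i < 32 the message schedule is
 *
 *   W[i] = rol(W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16], 1)
 *
 * computed four words at a time.  Lane 3 needs W[i], which is being
 * produced in the same vector, so it is first computed with 0
 * substituted for W[i] and then corrected by XORing in rol(W[i], 1),
 * which the code obtains as a rotate-left-by-2 (via tmp1) of the
 * not-yet-rotated lane 0 value.
 */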
#define WPRECALC_16_31_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        veor      tmp0, tmp0;                   \
        vext.8    W, W_m16, W_m12, #8;          \

#define WPRECALC_16_31_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        add       RWK, sp, #(WK_offs(i));       \
        vext.8    tmp0, W_m04, tmp0, #4;        \

#define WPRECALC_16_31_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        veor      tmp0, tmp0, W_m16;            \
        veor.32   W, W, W_m08;                  \

#define WPRECALC_16_31_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        veor      tmp1, tmp1;                   \
        veor      W, W, tmp0;                   \

#define WPRECALC_16_31_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        vshl.u32  tmp0, W, #1;                  \

#define WPRECALC_16_31_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        vext.8    tmp1, tmp1, W, #(16-12);      \
        vshr.u32  W, W, #31;                    \

#define WPRECALC_16_31_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        vorr      tmp0, tmp0, W;                \
        vshr.u32  W, tmp1, #30;                 \

#define WPRECALC_16_31_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        vshl.u32  tmp1, tmp1, #2;               \

#define WPRECALC_16_31_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        veor      tmp0, tmp0, W;                \

#define WPRECALC_16_31_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        veor      W, tmp0, tmp1;                \

#define WPRECALC_16_31_10(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        vadd.u32  tmp0, W, curK;                \

#define WPRECALC_16_31_11(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        vst1.32   {tmp0}, [RWK];


/********* Precalc macros for rounds 32-79 ************************************/

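/* For i >= 32 the recurrence can be applied to itself, giving the
 * well-known rewrite of the SHA-1 schedule
 *
 *   W[i] = rol(W[i-6] ^ W[i-16] ^ W[i-28] ^ W[i-32], 2)
 *
 * in which every source word lies at least six positions back, so four
 * W values are produced per group with no intra-vector dependency;
 * vext builds the unaligned W[i-6] operand from W_m08 and W_m04.
 */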
#define WPRECALC_32_79_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        veor W, W_m28; \

#define WPRECALC_32_79_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        vext.8 tmp0, W_m08, W_m04, #8; \

#define WPRECALC_32_79_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        veor W, W_m16; \

#define WPRECALC_32_79_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        veor W, tmp0; \

#define WPRECALC_32_79_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        add RWK, sp, #(WK_offs(i&~3)); \

#define WPRECALC_32_79_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        vshl.u32 tmp1, W, #2; \

#define WPRECALC_32_79_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        vshr.u32 tmp0, W, #30; \

#define WPRECALC_32_79_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        vorr W, tmp0, tmp1; \

#define WPRECALC_32_79_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        vadd.u32 tmp0, W, curK; \

#define WPRECALC_32_79_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        vst1.32 {tmp0}, [RWK];


/*
 * Transform nblks*64 bytes (nblks*16 32-bit words) at DATA.
 *
 * unsigned int
 * sha1_transform_neon (void *ctx, const unsigned char *data,
 *                      unsigned int nblks)
 */
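/* ctx must point at five consecutive 32-bit chaining values laid out
 * as described by the state_h0..state_h4 offsets above; data must
 * contain nblks complete 64-byte blocks.
 */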
.align 3
ENTRY(sha1_transform_neon)
  /* input:
   *    r0: ctx, CTX
   *    r1: data (64*nblks bytes)
   *    r2: nblks
   */

  cmp RNBLKS, #0;
  beq .Ldo_nothing;

  push {r4-r12, lr};
  /*vpush {q4-q7};*/

  adr RT3, .LK_VEC;

  mov ROLDSTACK, sp;

  /* Align stack. */
  sub RT0, sp, #(16*4);
  and RT0, #(~(16-1));
  mov sp, RT0;
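  /* sp now points at a 16-byte aligned, 16*4 = 64-byte area holding
   * the W+K ring addressed through WK_offs(); the original sp is
   * restored from ROLDSTACK on exit. */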

  vld1.32 {qK1-qK2}, [RT3]!; /* Load K1,K2 */

  /* Get the values of the chaining variables. */
  ldm RSTATE, {_a-_e};

  vld1.32 {qK3-qK4}, [RT3]; /* Load K3,K4 */

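  /* curK selects the round-constant vector used by the W+K
   * precalculation.  The precalculation runs roughly 16 rounds ahead
   * of the scalar rounds, so curK is redefined below whenever the
   * precalculated W index crosses a 20-round constant boundary. */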
#undef curK
#define curK qK1
  /* Precalc 0-15. */
  W_PRECALC_00_15();

.Loop:
  /* Transform 0-15 + Precalc 16-31. */
  _R( _a, _b, _c, _d, _e, F1,  0,
      WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 16,
      W4, W5, W6, W7, W0, _, _, _ );
  _R( _e, _a, _b, _c, _d, F1,  1,
      WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 16,
      W4, W5, W6, W7, W0, _, _, _ );
  _R( _d, _e, _a, _b, _c, F1,  2,
      WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 16,
      W4, W5, W6, W7, W0, _, _, _ );
  _R( _c, _d, _e, _a, _b, F1,  3,
      WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,16,
      W4, W5, W6, W7, W0, _, _, _ );

#undef curK
#define curK qK2
  _R( _b, _c, _d, _e, _a, F1,  4,
      WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 20,
      W3, W4, W5, W6, W7, _, _, _ );
  _R( _a, _b, _c, _d, _e, F1,  5,
      WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 20,
      W3, W4, W5, W6, W7, _, _, _ );
  _R( _e, _a, _b, _c, _d, F1,  6,
      WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 20,
      W3, W4, W5, W6, W7, _, _, _ );
  _R( _d, _e, _a, _b, _c, F1,  7,
      WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,20,
      W3, W4, W5, W6, W7, _, _, _ );

  _R( _c, _d, _e, _a, _b, F1,  8,
      WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 24,
      W2, W3, W4, W5, W6, _, _, _ );
  _R( _b, _c, _d, _e, _a, F1,  9,
      WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 24,
      W2, W3, W4, W5, W6, _, _, _ );
  _R( _a, _b, _c, _d, _e, F1, 10,
      WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 24,
      W2, W3, W4, W5, W6, _, _, _ );
  _R( _e, _a, _b, _c, _d, F1, 11,
      WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,24,
      W2, W3, W4, W5, W6, _, _, _ );

  _R( _d, _e, _a, _b, _c, F1, 12,
      WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 28,
      W1, W2, W3, W4, W5, _, _, _ );
  _R( _c, _d, _e, _a, _b, F1, 13,
      WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 28,
      W1, W2, W3, W4, W5, _, _, _ );
  _R( _b, _c, _d, _e, _a, F1, 14,
      WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 28,
      W1, W2, W3, W4, W5, _, _, _ );
  _R( _a, _b, _c, _d, _e, F1, 15,
      WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,28,
      W1, W2, W3, W4, W5, _, _, _ );

  /* Transform 16-63 + Precalc 32-79. */
  _R( _e, _a, _b, _c, _d, F1, 16,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 32,
      W0, W1, W2, W3, W4, W5, W6, W7);
  _R( _d, _e, _a, _b, _c, F1, 17,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 32,
      W0, W1, W2, W3, W4, W5, W6, W7);
  _R( _c, _d, _e, _a, _b, F1, 18,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 32,
      W0, W1, W2, W3, W4, W5, W6, W7);
  _R( _b, _c, _d, _e, _a, F1, 19,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 32,
      W0, W1, W2, W3, W4, W5, W6, W7);

  _R( _a, _b, _c, _d, _e, F2, 20,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 36,
      W7, W0, W1, W2, W3, W4, W5, W6);
  _R( _e, _a, _b, _c, _d, F2, 21,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 36,
      W7, W0, W1, W2, W3, W4, W5, W6);
  _R( _d, _e, _a, _b, _c, F2, 22,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 36,
      W7, W0, W1, W2, W3, W4, W5, W6);
  _R( _c, _d, _e, _a, _b, F2, 23,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 36,
      W7, W0, W1, W2, W3, W4, W5, W6);

#undef curK
#define curK qK3
  _R( _b, _c, _d, _e, _a, F2, 24,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 40,
      W6, W7, W0, W1, W2, W3, W4, W5);
  _R( _a, _b, _c, _d, _e, F2, 25,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 40,
      W6, W7, W0, W1, W2, W3, W4, W5);
  _R( _e, _a, _b, _c, _d, F2, 26,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 40,
      W6, W7, W0, W1, W2, W3, W4, W5);
  _R( _d, _e, _a, _b, _c, F2, 27,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 40,
      W6, W7, W0, W1, W2, W3, W4, W5);

  _R( _c, _d, _e, _a, _b, F2, 28,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 44,
      W5, W6, W7, W0, W1, W2, W3, W4);
  _R( _b, _c, _d, _e, _a, F2, 29,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 44,
      W5, W6, W7, W0, W1, W2, W3, W4);
  _R( _a, _b, _c, _d, _e, F2, 30,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 44,
      W5, W6, W7, W0, W1, W2, W3, W4);
  _R( _e, _a, _b, _c, _d, F2, 31,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 44,
      W5, W6, W7, W0, W1, W2, W3, W4);

  _R( _d, _e, _a, _b, _c, F2, 32,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 48,
      W4, W5, W6, W7, W0, W1, W2, W3);
  _R( _c, _d, _e, _a, _b, F2, 33,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 48,
      W4, W5, W6, W7, W0, W1, W2, W3);
  _R( _b, _c, _d, _e, _a, F2, 34,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 48,
      W4, W5, W6, W7, W0, W1, W2, W3);
  _R( _a, _b, _c, _d, _e, F2, 35,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 48,
      W4, W5, W6, W7, W0, W1, W2, W3);

  _R( _e, _a, _b, _c, _d, F2, 36,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 52,
      W3, W4, W5, W6, W7, W0, W1, W2);
  _R( _d, _e, _a, _b, _c, F2, 37,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 52,
      W3, W4, W5, W6, W7, W0, W1, W2);
  _R( _c, _d, _e, _a, _b, F2, 38,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 52,
      W3, W4, W5, W6, W7, W0, W1, W2);
  _R( _b, _c, _d, _e, _a, F2, 39,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 52,
      W3, W4, W5, W6, W7, W0, W1, W2);

  _R( _a, _b, _c, _d, _e, F3, 40,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 56,
      W2, W3, W4, W5, W6, W7, W0, W1);
  _R( _e, _a, _b, _c, _d, F3, 41,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 56,
      W2, W3, W4, W5, W6, W7, W0, W1);
  _R( _d, _e, _a, _b, _c, F3, 42,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 56,
      W2, W3, W4, W5, W6, W7, W0, W1);
  _R( _c, _d, _e, _a, _b, F3, 43,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 56,
      W2, W3, W4, W5, W6, W7, W0, W1);

#undef curK
#define curK qK4
  _R( _b, _c, _d, _e, _a, F3, 44,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 60,
      W1, W2, W3, W4, W5, W6, W7, W0);
  _R( _a, _b, _c, _d, _e, F3, 45,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 60,
      W1, W2, W3, W4, W5, W6, W7, W0);
  _R( _e, _a, _b, _c, _d, F3, 46,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 60,
      W1, W2, W3, W4, W5, W6, W7, W0);
  _R( _d, _e, _a, _b, _c, F3, 47,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 60,
      W1, W2, W3, W4, W5, W6, W7, W0);

  _R( _c, _d, _e, _a, _b, F3, 48,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 64,
      W0, W1, W2, W3, W4, W5, W6, W7);
  _R( _b, _c, _d, _e, _a, F3, 49,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 64,
      W0, W1, W2, W3, W4, W5, W6, W7);
  _R( _a, _b, _c, _d, _e, F3, 50,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 64,
      W0, W1, W2, W3, W4, W5, W6, W7);
  _R( _e, _a, _b, _c, _d, F3, 51,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 64,
      W0, W1, W2, W3, W4, W5, W6, W7);

  _R( _d, _e, _a, _b, _c, F3, 52,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 68,
      W7, W0, W1, W2, W3, W4, W5, W6);
  _R( _c, _d, _e, _a, _b, F3, 53,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 68,
      W7, W0, W1, W2, W3, W4, W5, W6);
  _R( _b, _c, _d, _e, _a, F3, 54,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 68,
      W7, W0, W1, W2, W3, W4, W5, W6);
  _R( _a, _b, _c, _d, _e, F3, 55,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 68,
      W7, W0, W1, W2, W3, W4, W5, W6);

  _R( _e, _a, _b, _c, _d, F3, 56,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 72,
      W6, W7, W0, W1, W2, W3, W4, W5);
  _R( _d, _e, _a, _b, _c, F3, 57,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 72,
      W6, W7, W0, W1, W2, W3, W4, W5);
  _R( _c, _d, _e, _a, _b, F3, 58,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 72,
      W6, W7, W0, W1, W2, W3, W4, W5);
  _R( _b, _c, _d, _e, _a, F3, 59,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 72,
      W6, W7, W0, W1, W2, W3, W4, W5);

  subs RNBLKS, #1;
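  /* The block counter is decremented here; none of the instructions in
   * rounds 60-63 modify the flags, so they are still valid at the beq
   * below. */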

  _R( _a, _b, _c, _d, _e, F4, 60,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 76,
      W5, W6, W7, W0, W1, W2, W3, W4);
  _R( _e, _a, _b, _c, _d, F4, 61,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 76,
      W5, W6, W7, W0, W1, W2, W3, W4);
  _R( _d, _e, _a, _b, _c, F4, 62,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 76,
      W5, W6, W7, W0, W1, W2, W3, W4);
  _R( _c, _d, _e, _a, _b, F4, 63,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 76,
      W5, W6, W7, W0, W1, W2, W3, W4);

  beq .Lend;

  /* Transform 64-79 + Precalc 0-15 of next block. */
#undef curK
#define curK qK1
  _R( _b, _c, _d, _e, _a, F4, 64,
      WPRECALC_00_15_0, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _a, _b, _c, _d, _e, F4, 65,
      WPRECALC_00_15_1, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _e, _a, _b, _c, _d, F4, 66,
      WPRECALC_00_15_2, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _d, _e, _a, _b, _c, F4, 67,
      WPRECALC_00_15_3, dummy, dummy, _, _, _, _, _, _, _, _, _ );

  _R( _c, _d, _e, _a, _b, F4, 68,
      dummy,            dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _b, _c, _d, _e, _a, F4, 69,
      dummy,            dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _a, _b, _c, _d, _e, F4, 70,
      WPRECALC_00_15_4, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _e, _a, _b, _c, _d, F4, 71,
      WPRECALC_00_15_5, dummy, dummy, _, _, _, _, _, _, _, _, _ );

  _R( _d, _e, _a, _b, _c, F4, 72,
      dummy,            dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _c, _d, _e, _a, _b, F4, 73,
      dummy,            dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _b, _c, _d, _e, _a, F4, 74,
      WPRECALC_00_15_6, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _a, _b, _c, _d, _e, F4, 75,
      WPRECALC_00_15_7, dummy, dummy, _, _, _, _, _, _, _, _, _ );

  _R( _e, _a, _b, _c, _d, F4, 76,
      WPRECALC_00_15_8, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _d, _e, _a, _b, _c, F4, 77,
      WPRECALC_00_15_9, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _c, _d, _e, _a, _b, F4, 78,
      WPRECALC_00_15_10, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _b, _c, _d, _e, _a, F4, 79,
      WPRECALC_00_15_11, dummy, WPRECALC_00_15_12, _, _, _, _, _, _, _, _, _ );

  /* Update the chaining variables. */
  ldm RSTATE, {RT0-RT3};
  add _a, RT0;
  ldr RT0, [RSTATE, #state_h4];
  add _b, RT1;
  add _c, RT2;
  add _d, RT3;
  add _e, RT0;
  stm RSTATE, {_a-_e};

  b .Loop;

.Lend:
  /* Transform 64-79 */
  R( _b, _c, _d, _e, _a, F4, 64 );
  R( _a, _b, _c, _d, _e, F4, 65 );
  R( _e, _a, _b, _c, _d, F4, 66 );
  R( _d, _e, _a, _b, _c, F4, 67 );
  R( _c, _d, _e, _a, _b, F4, 68 );
  R( _b, _c, _d, _e, _a, F4, 69 );
  R( _a, _b, _c, _d, _e, F4, 70 );
  R( _e, _a, _b, _c, _d, F4, 71 );
  R( _d, _e, _a, _b, _c, F4, 72 );
  R( _c, _d, _e, _a, _b, F4, 73 );
  R( _b, _c, _d, _e, _a, F4, 74 );
  R( _a, _b, _c, _d, _e, F4, 75 );
  R( _e, _a, _b, _c, _d, F4, 76 );
  R( _d, _e, _a, _b, _c, F4, 77 );
  R( _c, _d, _e, _a, _b, F4, 78 );
  R( _b, _c, _d, _e, _a, F4, 79 );

  mov sp, ROLDSTACK;

  /* Update the chaining variables. */
  ldm RSTATE, {RT0-RT3};
  add _a, RT0;
  ldr RT0, [RSTATE, #state_h4];
  add _b, RT1;
  add _c, RT2;
  add _d, RT3;
  /*vpop {q4-q7};*/
  add _e, RT0;
  stm RSTATE, {_a-_e};

  pop {r4-r12, pc};

.Ldo_nothing:
  bx lr
ENDPROC(sha1_transform_neon)