/* sha1-armv7-neon.S - ARM/NEON accelerated SHA-1 transform function
 *
 * Copyright © 2013-2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the Free
 * Software Foundation; either version 2 of the License, or (at your option)
 * any later version.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

.syntax unified
.fpu neon

.text


/* Context structure */

#define state_h0 0
#define state_h1 4
#define state_h2 8
#define state_h3 12
#define state_h4 16
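/* Byte offsets of the five 32-bit SHA-1 chaining values h0..h4 within
 * the context pointed to by r0 (RSTATE). */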


/* Constants */

#define K1  0x5A827999
#define K2  0x6ED9EBA1
#define K3  0x8F1BBCDC
#define K4  0xCA62C1D6
.align 4
.LK_VEC:
.LK1:   .long K1, K1, K1, K1
.LK2:   .long K2, K2, K2, K2
.LK3:   .long K3, K3, K3, K3
.LK4:   .long K4, K4, K4, K4
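/* Each round constant is replicated across all four 32-bit lanes of a
 * q register (loaded into qK1..qK4 below), so a single vadd.u32 with
 * curK adds K to four message-schedule words at once. */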


/* Register macros */

#define RSTATE r0
#define RDATA r1
#define RNBLKS r2
#define ROLDSTACK r3
#define RWK lr

#define _a r4
#define _b r5
#define _c r6
#define _d r7
#define _e r8

#define RT0 r9
#define RT1 r10
#define RT2 r11
#define RT3 r12

#define W0 q0
#define W1 q7
#define W2 q2
#define W3 q3
#define W4 q4
#define W5 q6
#define W6 q5
#define W7 q1

#define tmp0 q8
#define tmp1 q9
#define tmp2 q10
#define tmp3 q11

#define qK1 q12
#define qK2 q13
#define qK3 q14
#define qK4 q15

#ifdef CONFIG_CPU_BIG_ENDIAN
#define ARM_LE(code...)
#else
#define ARM_LE(code...)         code
#endif
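/* ARM_LE() emits its argument only on little-endian builds; it wraps
 * the vrev32.8 byte swaps of the big-endian message words, which are
 * not needed when CONFIG_CPU_BIG_ENDIAN is set. */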

/* Round function macros. */

#define WK_offs(i) (((i) & 15) * 4)

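/*
 * _R_F1.._R_F4 each implement one scalar SHA-1 round, using the four
 * standard round functions (FIPS 180-4): F1 = Ch(b,c,d),
 * F2 = Parity(b,c,d), F3 = Maj(b,c,d) and F4 = Parity(b,c,d) again,
 * paired with the constants K1..K4 for rounds 0-19, 20-39, 40-59 and
 * 60-79 respectively.  WK_offs(i) gives the offset of the precomputed
 * W[i]+K value in the 16-entry ring buffer kept on the stack.  The
 * pre1/pre2/pre3 slots let NEON message-expansion steps be
 * interleaved between the scalar instructions of a round.
 */
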
#define _R_F1(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
              W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        ldr RT3, [sp, WK_offs(i)]; \
                pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
        bic RT0, d, b; \
        add e, e, a, ror #(32 - 5); \
        and RT1, c, b; \
                pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
        add RT0, RT0, RT3; \
        add e, e, RT1; \
        ror b, #(32 - 30); \
                pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
        add e, e, RT0;

#define _R_F2(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
              W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        ldr RT3, [sp, WK_offs(i)]; \
                pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
        eor RT0, d, b; \
        add e, e, a, ror #(32 - 5); \
        eor RT0, RT0, c; \
                pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
        add e, e, RT3; \
        ror b, #(32 - 30); \
                pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
        add e, e, RT0; \

#define _R_F3(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
              W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        ldr RT3, [sp, WK_offs(i)]; \
                pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
        eor RT0, b, c; \
        and RT1, b, c; \
        add e, e, a, ror #(32 - 5); \
                pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
        and RT0, RT0, d; \
        add RT1, RT1, RT3; \
        add e, e, RT0; \
        ror b, #(32 - 30); \
                pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
        add e, e, RT1;

#define _R_F4(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
              W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        _R_F2(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
              W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28)

#define _R(a,b,c,d,e,f,i,pre1,pre2,pre3,i16,\
           W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        _R_##f(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
               W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28)

#define R(a,b,c,d,e,f,i) \
        _R_##f(a,b,c,d,e,i,dummy,dummy,dummy,i16,\
               W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28)

#define dummy(...)


/* Input expansion macros. */
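
/*
 * The SHA-1 message schedule is W[i] = rol1(W[i-3] ^ W[i-8] ^ W[i-14]
 * ^ W[i-16]).  Rounds 16-31 use this form directly; the last lane of
 * each 4-word vector, whose W[i-3] input is only just being produced,
 * is patched up afterwards with an extra rol2 correction term.
 * Rounds 32-79 use the equivalent form
 * W[i] = rol2(W[i-6] ^ W[i-16] ^ W[i-28] ^ W[i-32]), which has no
 * intra-vector dependency and therefore yields four schedule words
 * per pass.
 */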

/********* Precalc macros for rounds 0-15 *************************************/

#define W_PRECALC_00_15() \
        add       RWK, sp, #(WK_offs(0));                       \
        \
        vld1.32   {W0, W7}, [RDATA]!;                           \
 ARM_LE(vrev32.8  W0, W0;       )       /* big => little */     \
        vld1.32   {W6, W5}, [RDATA]!;                           \
        vadd.u32  tmp0, W0, curK;                               \
 ARM_LE(vrev32.8  W7, W7;       )       /* big => little */     \
 ARM_LE(vrev32.8  W6, W6;       )       /* big => little */     \
        vadd.u32  tmp1, W7, curK;                               \
 ARM_LE(vrev32.8  W5, W5;       )       /* big => little */     \
        vadd.u32  tmp2, W6, curK;                               \
        vst1.32   {tmp0, tmp1}, [RWK]!;                         \
        vadd.u32  tmp3, W5, curK;                               \
        vst1.32   {tmp2, tmp3}, [RWK];                          \

#define WPRECALC_00_15_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        vld1.32   {W0, W7}, [RDATA]!;                           \

#define WPRECALC_00_15_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        add       RWK, sp, #(WK_offs(0));                       \

#define WPRECALC_00_15_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
 ARM_LE(vrev32.8  W0, W0;       )       /* big => little */     \

#define WPRECALC_00_15_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        vld1.32   {W6, W5}, [RDATA]!;                           \

#define WPRECALC_00_15_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        vadd.u32  tmp0, W0, curK;                               \

#define WPRECALC_00_15_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
 ARM_LE(vrev32.8  W7, W7;       )       /* big => little */     \

#define WPRECALC_00_15_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
 ARM_LE(vrev32.8  W6, W6;       )       /* big => little */     \

#define WPRECALC_00_15_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        vadd.u32  tmp1, W7, curK;                               \

#define WPRECALC_00_15_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
 ARM_LE(vrev32.8  W5, W5;       )       /* big => little */     \

#define WPRECALC_00_15_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        vadd.u32  tmp2, W6, curK;                               \

#define WPRECALC_00_15_10(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        vst1.32   {tmp0, tmp1}, [RWK]!;                         \

#define WPRECALC_00_15_11(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        vadd.u32  tmp3, W5, curK;                               \

#define WPRECALC_00_15_12(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        vst1.32   {tmp2, tmp3}, [RWK];                          \


/********* Precalc macros for rounds 16-31 ************************************/

#define WPRECALC_16_31_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        veor      tmp0, tmp0;                   \
        vext.8    W, W_m16, W_m12, #8;          \

#define WPRECALC_16_31_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        add       RWK, sp, #(WK_offs(i));       \
        vext.8    tmp0, W_m04, tmp0, #4;        \

#define WPRECALC_16_31_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        veor      tmp0, tmp0, W_m16;            \
        veor.32   W, W, W_m08;                  \

#define WPRECALC_16_31_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        veor      tmp1, tmp1;                   \
        veor      W, W, tmp0;                   \

#define WPRECALC_16_31_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        vshl.u32  tmp0, W, #1;                  \

#define WPRECALC_16_31_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        vext.8    tmp1, tmp1, W, #(16-12);      \
        vshr.u32  W, W, #31;                    \

#define WPRECALC_16_31_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        vorr      tmp0, tmp0, W;                \
        vshr.u32  W, tmp1, #30;                 \

#define WPRECALC_16_31_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        vshl.u32  tmp1, tmp1, #2;               \

#define WPRECALC_16_31_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        veor      tmp0, tmp0, W;                \

#define WPRECALC_16_31_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        veor      W, tmp0, tmp1;                \

#define WPRECALC_16_31_10(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        vadd.u32  tmp0, W, curK;                \

#define WPRECALC_16_31_11(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        vst1.32   {tmp0}, [RWK];


/********* Precalc macros for rounds 32-79 ************************************/

#define WPRECALC_32_79_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        veor W, W_m28; \

#define WPRECALC_32_79_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        vext.8 tmp0, W_m08, W_m04, #8; \

#define WPRECALC_32_79_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        veor W, W_m16; \

#define WPRECALC_32_79_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        veor W, tmp0; \

#define WPRECALC_32_79_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        add RWK, sp, #(WK_offs(i&~3)); \

#define WPRECALC_32_79_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        vshl.u32 tmp1, W, #2; \

#define WPRECALC_32_79_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        vshr.u32 tmp0, W, #30; \

#define WPRECALC_32_79_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        vorr W, tmp0, tmp1; \

#define WPRECALC_32_79_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        vadd.u32 tmp0, W, curK; \

#define WPRECALC_32_79_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        vst1.32 {tmp0}, [RWK];


/*
 * Transform nblks*64 bytes (nblks*16 32-bit words) at DATA.
 *
 * unsigned int
 * sha1_transform_neon (void *ctx, const unsigned char *data,
 *                      unsigned int nblks)
 */
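/*
 * Per 64-byte block, the main loop below runs rounds 0-63 with the
 * NEON expansion of W[16..79] interleaved into the scalar rounds via
 * the pre1/pre2/pre3 slots; rounds 64-79 are then interleaved with
 * the W[0..15]+K precalculation for the next block, or run without
 * precalculation for the last block at .Lend.
 */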
.align 3
ENTRY(sha1_transform_neon)
  /* input:
   *    r0: ctx, CTX
   *    r1: data (64*nblks bytes)
   *    r2: nblks
   */

  cmp RNBLKS, #0;
  beq .Ldo_nothing;

  push {r4-r12, lr};
  /*vpush {q4-q7};*/

  adr RT3, .LK_VEC;

  mov ROLDSTACK, sp;

  /* Align stack. */
  sub RT0, sp, #(16*4);
  and RT0, #(~(16-1));
  mov sp, RT0;
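  /* sp now points at a 16-byte-aligned, 64-byte scratch area used as
   * the 16-entry W[i]+K ring buffer addressed through WK_offs(). */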

  vld1.32 {qK1-qK2}, [RT3]!; /* Load K1,K2 */

  /* Get the values of the chaining variables. */
  ldm RSTATE, {_a-_e};

  vld1.32 {qK3-qK4}, [RT3]; /* Load K3,K4 */

#undef curK
#define curK qK1
  /* Precalc 0-15. */
  W_PRECALC_00_15();

.Loop:
  /* Transform 0-15 + Precalc 16-31. */
  _R( _a, _b, _c, _d, _e, F1,  0,
      WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 16,
      W4, W5, W6, W7, W0, _, _, _ );
  _R( _e, _a, _b, _c, _d, F1,  1,
      WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 16,
      W4, W5, W6, W7, W0, _, _, _ );
  _R( _d, _e, _a, _b, _c, F1,  2,
      WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 16,
      W4, W5, W6, W7, W0, _, _, _ );
  _R( _c, _d, _e, _a, _b, F1,  3,
      WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,16,
      W4, W5, W6, W7, W0, _, _, _ );

#undef curK
#define curK qK2
  _R( _b, _c, _d, _e, _a, F1,  4,
      WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 20,
      W3, W4, W5, W6, W7, _, _, _ );
  _R( _a, _b, _c, _d, _e, F1,  5,
      WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 20,
      W3, W4, W5, W6, W7, _, _, _ );
  _R( _e, _a, _b, _c, _d, F1,  6,
      WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 20,
      W3, W4, W5, W6, W7, _, _, _ );
  _R( _d, _e, _a, _b, _c, F1,  7,
      WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,20,
      W3, W4, W5, W6, W7, _, _, _ );

  _R( _c, _d, _e, _a, _b, F1,  8,
      WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 24,
      W2, W3, W4, W5, W6, _, _, _ );
  _R( _b, _c, _d, _e, _a, F1,  9,
      WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 24,
      W2, W3, W4, W5, W6, _, _, _ );
  _R( _a, _b, _c, _d, _e, F1, 10,
      WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 24,
      W2, W3, W4, W5, W6, _, _, _ );
  _R( _e, _a, _b, _c, _d, F1, 11,
      WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,24,
      W2, W3, W4, W5, W6, _, _, _ );

  _R( _d, _e, _a, _b, _c, F1, 12,
      WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 28,
      W1, W2, W3, W4, W5, _, _, _ );
  _R( _c, _d, _e, _a, _b, F1, 13,
      WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 28,
      W1, W2, W3, W4, W5, _, _, _ );
  _R( _b, _c, _d, _e, _a, F1, 14,
      WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 28,
      W1, W2, W3, W4, W5, _, _, _ );
  _R( _a, _b, _c, _d, _e, F1, 15,
      WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,28,
      W1, W2, W3, W4, W5, _, _, _ );

  /* Transform 16-63 + Precalc 32-79. */
  _R( _e, _a, _b, _c, _d, F1, 16,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 32,
      W0, W1, W2, W3, W4, W5, W6, W7);
  _R( _d, _e, _a, _b, _c, F1, 17,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 32,
      W0, W1, W2, W3, W4, W5, W6, W7);
  _R( _c, _d, _e, _a, _b, F1, 18,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 32,
      W0, W1, W2, W3, W4, W5, W6, W7);
  _R( _b, _c, _d, _e, _a, F1, 19,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 32,
      W0, W1, W2, W3, W4, W5, W6, W7);

  _R( _a, _b, _c, _d, _e, F2, 20,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 36,
      W7, W0, W1, W2, W3, W4, W5, W6);
  _R( _e, _a, _b, _c, _d, F2, 21,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 36,
      W7, W0, W1, W2, W3, W4, W5, W6);
  _R( _d, _e, _a, _b, _c, F2, 22,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 36,
      W7, W0, W1, W2, W3, W4, W5, W6);
  _R( _c, _d, _e, _a, _b, F2, 23,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 36,
      W7, W0, W1, W2, W3, W4, W5, W6);

#undef curK
#define curK qK3
  _R( _b, _c, _d, _e, _a, F2, 24,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 40,
      W6, W7, W0, W1, W2, W3, W4, W5);
  _R( _a, _b, _c, _d, _e, F2, 25,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 40,
      W6, W7, W0, W1, W2, W3, W4, W5);
  _R( _e, _a, _b, _c, _d, F2, 26,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 40,
      W6, W7, W0, W1, W2, W3, W4, W5);
  _R( _d, _e, _a, _b, _c, F2, 27,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 40,
      W6, W7, W0, W1, W2, W3, W4, W5);

  _R( _c, _d, _e, _a, _b, F2, 28,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 44,
      W5, W6, W7, W0, W1, W2, W3, W4);
  _R( _b, _c, _d, _e, _a, F2, 29,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 44,
      W5, W6, W7, W0, W1, W2, W3, W4);
  _R( _a, _b, _c, _d, _e, F2, 30,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 44,
      W5, W6, W7, W0, W1, W2, W3, W4);
  _R( _e, _a, _b, _c, _d, F2, 31,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 44,
      W5, W6, W7, W0, W1, W2, W3, W4);

  _R( _d, _e, _a, _b, _c, F2, 32,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 48,
      W4, W5, W6, W7, W0, W1, W2, W3);
  _R( _c, _d, _e, _a, _b, F2, 33,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 48,
      W4, W5, W6, W7, W0, W1, W2, W3);
  _R( _b, _c, _d, _e, _a, F2, 34,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 48,
      W4, W5, W6, W7, W0, W1, W2, W3);
  _R( _a, _b, _c, _d, _e, F2, 35,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 48,
      W4, W5, W6, W7, W0, W1, W2, W3);

  _R( _e, _a, _b, _c, _d, F2, 36,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 52,
      W3, W4, W5, W6, W7, W0, W1, W2);
  _R( _d, _e, _a, _b, _c, F2, 37,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 52,
      W3, W4, W5, W6, W7, W0, W1, W2);
  _R( _c, _d, _e, _a, _b, F2, 38,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 52,
      W3, W4, W5, W6, W7, W0, W1, W2);
  _R( _b, _c, _d, _e, _a, F2, 39,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 52,
      W3, W4, W5, W6, W7, W0, W1, W2);

  _R( _a, _b, _c, _d, _e, F3, 40,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 56,
      W2, W3, W4, W5, W6, W7, W0, W1);
  _R( _e, _a, _b, _c, _d, F3, 41,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 56,
      W2, W3, W4, W5, W6, W7, W0, W1);
  _R( _d, _e, _a, _b, _c, F3, 42,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 56,
      W2, W3, W4, W5, W6, W7, W0, W1);
  _R( _c, _d, _e, _a, _b, F3, 43,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 56,
      W2, W3, W4, W5, W6, W7, W0, W1);

#undef curK
#define curK qK4
  _R( _b, _c, _d, _e, _a, F3, 44,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 60,
      W1, W2, W3, W4, W5, W6, W7, W0);
  _R( _a, _b, _c, _d, _e, F3, 45,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 60,
      W1, W2, W3, W4, W5, W6, W7, W0);
  _R( _e, _a, _b, _c, _d, F3, 46,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 60,
      W1, W2, W3, W4, W5, W6, W7, W0);
  _R( _d, _e, _a, _b, _c, F3, 47,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 60,
      W1, W2, W3, W4, W5, W6, W7, W0);

  _R( _c, _d, _e, _a, _b, F3, 48,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 64,
      W0, W1, W2, W3, W4, W5, W6, W7);
  _R( _b, _c, _d, _e, _a, F3, 49,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 64,
      W0, W1, W2, W3, W4, W5, W6, W7);
  _R( _a, _b, _c, _d, _e, F3, 50,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 64,
      W0, W1, W2, W3, W4, W5, W6, W7);
  _R( _e, _a, _b, _c, _d, F3, 51,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 64,
      W0, W1, W2, W3, W4, W5, W6, W7);

  _R( _d, _e, _a, _b, _c, F3, 52,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 68,
      W7, W0, W1, W2, W3, W4, W5, W6);
  _R( _c, _d, _e, _a, _b, F3, 53,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 68,
      W7, W0, W1, W2, W3, W4, W5, W6);
  _R( _b, _c, _d, _e, _a, F3, 54,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 68,
      W7, W0, W1, W2, W3, W4, W5, W6);
  _R( _a, _b, _c, _d, _e, F3, 55,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 68,
      W7, W0, W1, W2, W3, W4, W5, W6);

  _R( _e, _a, _b, _c, _d, F3, 56,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 72,
      W6, W7, W0, W1, W2, W3, W4, W5);
  _R( _d, _e, _a, _b, _c, F3, 57,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 72,
      W6, W7, W0, W1, W2, W3, W4, W5);
  _R( _c, _d, _e, _a, _b, F3, 58,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 72,
      W6, W7, W0, W1, W2, W3, W4, W5);
  _R( _b, _c, _d, _e, _a, F3, 59,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 72,
      W6, W7, W0, W1, W2, W3, W4, W5);

  subs RNBLKS, #1;
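  /* The flags set here survive rounds 60-63 (the round macros never
   * update flags) and are tested by the 'beq .Lend' below. */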

  _R( _a, _b, _c, _d, _e, F4, 60,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 76,
      W5, W6, W7, W0, W1, W2, W3, W4);
  _R( _e, _a, _b, _c, _d, F4, 61,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 76,
      W5, W6, W7, W0, W1, W2, W3, W4);
  _R( _d, _e, _a, _b, _c, F4, 62,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 76,
      W5, W6, W7, W0, W1, W2, W3, W4);
  _R( _c, _d, _e, _a, _b, F4, 63,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 76,
      W5, W6, W7, W0, W1, W2, W3, W4);

  beq .Lend;

  /* Transform 64-79 + Precalc 0-15 of next block. */
#undef curK
#define curK qK1
  _R( _b, _c, _d, _e, _a, F4, 64,
      WPRECALC_00_15_0, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _a, _b, _c, _d, _e, F4, 65,
      WPRECALC_00_15_1, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _e, _a, _b, _c, _d, F4, 66,
      WPRECALC_00_15_2, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _d, _e, _a, _b, _c, F4, 67,
      WPRECALC_00_15_3, dummy, dummy, _, _, _, _, _, _, _, _, _ );

  _R( _c, _d, _e, _a, _b, F4, 68,
      dummy,            dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _b, _c, _d, _e, _a, F4, 69,
      dummy,            dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _a, _b, _c, _d, _e, F4, 70,
      WPRECALC_00_15_4, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _e, _a, _b, _c, _d, F4, 71,
      WPRECALC_00_15_5, dummy, dummy, _, _, _, _, _, _, _, _, _ );

  _R( _d, _e, _a, _b, _c, F4, 72,
      dummy,            dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _c, _d, _e, _a, _b, F4, 73,
      dummy,            dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _b, _c, _d, _e, _a, F4, 74,
      WPRECALC_00_15_6, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _a, _b, _c, _d, _e, F4, 75,
      WPRECALC_00_15_7, dummy, dummy, _, _, _, _, _, _, _, _, _ );

  _R( _e, _a, _b, _c, _d, F4, 76,
      WPRECALC_00_15_8, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _d, _e, _a, _b, _c, F4, 77,
      WPRECALC_00_15_9, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _c, _d, _e, _a, _b, F4, 78,
      WPRECALC_00_15_10, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _b, _c, _d, _e, _a, F4, 79,
      WPRECALC_00_15_11, dummy, WPRECALC_00_15_12, _, _, _, _, _, _, _, _, _ );

  /* Update the chaining variables. */
  ldm RSTATE, {RT0-RT3};
  add _a, RT0;
  ldr RT0, [RSTATE, #state_h4];
  add _b, RT1;
  add _c, RT2;
  add _d, RT3;
  add _e, RT0;
  stm RSTATE, {_a-_e};

  b .Loop;

.Lend:
  /* Transform 64-79 */
  R( _b, _c, _d, _e, _a, F4, 64 );
  R( _a, _b, _c, _d, _e, F4, 65 );
  R( _e, _a, _b, _c, _d, F4, 66 );
  R( _d, _e, _a, _b, _c, F4, 67 );
  R( _c, _d, _e, _a, _b, F4, 68 );
  R( _b, _c, _d, _e, _a, F4, 69 );
  R( _a, _b, _c, _d, _e, F4, 70 );
  R( _e, _a, _b, _c, _d, F4, 71 );
  R( _d, _e, _a, _b, _c, F4, 72 );
  R( _c, _d, _e, _a, _b, F4, 73 );
  R( _b, _c, _d, _e, _a, F4, 74 );
  R( _a, _b, _c, _d, _e, F4, 75 );
  R( _e, _a, _b, _c, _d, F4, 76 );
  R( _d, _e, _a, _b, _c, F4, 77 );
  R( _c, _d, _e, _a, _b, F4, 78 );
  R( _b, _c, _d, _e, _a, F4, 79 );

  mov sp, ROLDSTACK;

  /* Update the chaining variables. */
  ldm RSTATE, {RT0-RT3};
  add _a, RT0;
  ldr RT0, [RSTATE, #state_h4];
  add _b, RT1;
  add _c, RT2;
  add _d, RT3;
  /*vpop {q4-q7};*/
  add _e, RT0;
  stm RSTATE, {_a-_e};

  pop {r4-r12, pc};

.Ldo_nothing:
  bx lr
ENDPROC(sha1_transform_neon)