busybox/libbb/hash_md5_sha.c
<<
>>
Prefs
   1/* vi: set sw=4 ts=4: */
   2/*
   3 * Utility routines.
   4 *
   5 * Copyright (C) 2010 Denys Vlasenko
   6 *
   7 * Licensed under GPLv2 or later, see file LICENSE in this source tree.
   8 */
   9#include "libbb.h"
  10
  11#define NEED_SHA512 (ENABLE_SHA512SUM || ENABLE_USE_BB_CRYPT_SHA)
  12
  13/* gcc 4.2.1 optimizes rotr64 better with inline than with macro
  14 * (for rotX32, there is no difference). Why? My guess is that
  15 * macro requires clever common subexpression elimination heuristics
  16 * in gcc, while inline basically forces it to happen.
  17 */
  18//#define rotl32(x,n) (((x) << (n)) | ((x) >> (32 - (n))))
  19static ALWAYS_INLINE uint32_t rotl32(uint32_t x, unsigned n)
  20{
  21        return (x << n) | (x >> (32 - n));
  22}
  23//#define rotr32(x,n) (((x) >> (n)) | ((x) << (32 - (n))))
  24static ALWAYS_INLINE uint32_t rotr32(uint32_t x, unsigned n)
  25{
  26        return (x >> n) | (x << (32 - n));
  27}
  28/* rotr64 in needed for sha512 only: */
  29//#define rotr64(x,n) (((x) >> (n)) | ((x) << (64 - (n))))
  30static ALWAYS_INLINE uint64_t rotr64(uint64_t x, unsigned n)
  31{
  32        return (x >> n) | (x << (64 - n));
  33}
  34
  35/* rotl64 only used for sha3 currently */
  36static ALWAYS_INLINE uint64_t rotl64(uint64_t x, unsigned n)
  37{
  38        return (x << n) | (x >> (64 - n));
  39}
  40
  41/* Feed data through a temporary buffer.
  42 * The internal buffer remembers previous data until it has 64
  43 * bytes worth to pass on.
  44 */
  45static void FAST_FUNC common64_hash(md5_ctx_t *ctx, const void *buffer, size_t len)
  46{
  47        unsigned bufpos = ctx->total64 & 63;
  48
  49        ctx->total64 += len;
  50
  51        while (1) {
  52                unsigned remaining = 64 - bufpos;
  53                if (remaining > len)
  54                        remaining = len;
  55                /* Copy data into aligned buffer */
  56                memcpy(ctx->wbuffer + bufpos, buffer, remaining);
  57                len -= remaining;
  58                buffer = (const char *)buffer + remaining;
  59                bufpos += remaining;
  60                /* Clever way to do "if (bufpos != N) break; ... ; bufpos = 0;" */
  61                bufpos -= 64;
  62                if (bufpos != 0)
  63                        break;
  64                /* Buffer is filled up, process it */
  65                ctx->process_block(ctx);
  66                /*bufpos = 0; - already is */
  67        }
  68}
  69
  70/* Process the remaining bytes in the buffer */
  71static void FAST_FUNC common64_end(md5_ctx_t *ctx, int swap_needed)
  72{
  73        unsigned bufpos = ctx->total64 & 63;
  74        /* Pad the buffer to the next 64-byte boundary with 0x80,0,0,0... */
  75        ctx->wbuffer[bufpos++] = 0x80;
  76
  77        /* This loop iterates either once or twice, no more, no less */
  78        while (1) {
  79                unsigned remaining = 64 - bufpos;
  80                memset(ctx->wbuffer + bufpos, 0, remaining);
  81                /* Do we have enough space for the length count? */
  82                if (remaining >= 8) {
  83                        /* Store the 64-bit counter of bits in the buffer */
  84                        uint64_t t = ctx->total64 << 3;
  85                        if (swap_needed)
  86                                t = bb_bswap_64(t);
  87                        /* wbuffer is suitably aligned for this */
  88                        *(bb__aliased_uint64_t *) (&ctx->wbuffer[64 - 8]) = t;
  89                }
  90                ctx->process_block(ctx);
  91                if (remaining >= 8)
  92                        break;
  93                bufpos = 0;
  94        }
  95}
  96
  97
  98/*
  99 * Compute MD5 checksum of strings according to the
 100 * definition of MD5 in RFC 1321 from April 1992.
 101 *
 102 * Written by Ulrich Drepper <drepper@gnu.ai.mit.edu>, 1995.
 103 *
 104 * Copyright (C) 1995-1999 Free Software Foundation, Inc.
 105 * Copyright (C) 2001 Manuel Novoa III
 106 * Copyright (C) 2003 Glenn L. McGrath
 107 * Copyright (C) 2003 Erik Andersen
 108 *
 109 * Licensed under GPLv2 or later, see file LICENSE in this source tree.
 110 */
 111
 112/* 0: fastest, 3: smallest */
 113#if CONFIG_MD5_SMALL < 0
 114# define MD5_SMALL 0
 115#elif CONFIG_MD5_SMALL > 3
 116# define MD5_SMALL 3
 117#else
 118# define MD5_SMALL CONFIG_MD5_SMALL
 119#endif
 120
 121/* These are the four functions used in the four steps of the MD5 algorithm
 122 * and defined in the RFC 1321.  The first function is a little bit optimized
 123 * (as found in Colin Plumbs public domain implementation).
 124 * #define FF(b, c, d) ((b & c) | (~b & d))
 125 */
 126#undef FF
 127#undef FG
 128#undef FH
 129#undef FI
 130#define FF(b, c, d) (d ^ (b & (c ^ d)))
 131#define FG(b, c, d) FF(d, b, c)
 132#define FH(b, c, d) (b ^ c ^ d)
 133#define FI(b, c, d) (c ^ (b | ~d))
 134
 135/* Hash a single block, 64 bytes long and 4-byte aligned */
 136static void FAST_FUNC md5_process_block64(md5_ctx_t *ctx)
 137{
 138#if MD5_SMALL > 0
 139        /* Before we start, one word to the strange constants.
 140           They are defined in RFC 1321 as
 141           T[i] = (int)(2^32 * fabs(sin(i))), i=1..64
 142         */
 143        static const uint32_t C_array[] = {
 144                /* round 1 */
 145                0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee,
 146                0xf57c0faf, 0x4787c62a, 0xa8304613, 0xfd469501,
 147                0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be,
 148                0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821,
 149                /* round 2 */
 150                0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa,
 151                0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8,
 152                0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed,
 153                0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a,
 154                /* round 3 */
 155                0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c,
 156                0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70,
 157                0x289b7ec6, 0xeaa127fa, 0xd4ef3085, 0x4881d05,
 158                0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665,
 159                /* round 4 */
 160                0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039,
 161                0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1,
 162                0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1,
 163                0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391
 164        };
 165        static const char P_array[] ALIGN1 = {
 166# if MD5_SMALL > 1
 167                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, /* 1 */
 168# endif
 169                1, 6, 11, 0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, /* 2 */
 170                5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, /* 3 */
 171                0, 7, 14, 5, 12, 3, 10, 1, 8, 15, 6, 13, 4, 11, 2, 9  /* 4 */
 172        };
 173#endif
 174        uint32_t *words = (void*) ctx->wbuffer;
 175        uint32_t A = ctx->hash[0];
 176        uint32_t B = ctx->hash[1];
 177        uint32_t C = ctx->hash[2];
 178        uint32_t D = ctx->hash[3];
 179
 180#if MD5_SMALL >= 2  /* 2 or 3 */
 181
 182        static const char S_array[] ALIGN1 = {
 183                7, 12, 17, 22,
 184                5, 9, 14, 20,
 185                4, 11, 16, 23,
 186                6, 10, 15, 21
 187        };
 188        const uint32_t *pc;
 189        const char *pp;
 190        const char *ps;
 191        int i;
 192        uint32_t temp;
 193
 194        if (BB_BIG_ENDIAN)
 195                for (i = 0; i < 16; i++)
 196                        words[i] = SWAP_LE32(words[i]);
 197
 198# if MD5_SMALL == 3
 199        pc = C_array;
 200        pp = P_array;
 201        ps = S_array - 4;
 202
 203        for (i = 0; i < 64; i++) {
 204                if ((i & 0x0f) == 0)
 205                        ps += 4;
 206                temp = A;
 207                switch (i >> 4) {
 208                case 0:
 209                        temp += FF(B, C, D);
 210                        break;
 211                case 1:
 212                        temp += FG(B, C, D);
 213                        break;
 214                case 2:
 215                        temp += FH(B, C, D);
 216                        break;
 217                default: /* case 3 */
 218                        temp += FI(B, C, D);
 219                }
 220                temp += words[(int) (*pp++)] + *pc++;
 221                temp = rotl32(temp, ps[i & 3]);
 222                temp += B;
 223                A = D;
 224                D = C;
 225                C = B;
 226                B = temp;
 227        }
 228# else  /* MD5_SMALL == 2 */
 229        pc = C_array;
 230        pp = P_array;
 231        ps = S_array;
 232
 233        for (i = 0; i < 16; i++) {
 234                temp = A + FF(B, C, D) + words[(int) (*pp++)] + *pc++;
 235                temp = rotl32(temp, ps[i & 3]);
 236                temp += B;
 237                A = D;
 238                D = C;
 239                C = B;
 240                B = temp;
 241        }
 242        ps += 4;
 243        for (i = 0; i < 16; i++) {
 244                temp = A + FG(B, C, D) + words[(int) (*pp++)] + *pc++;
 245                temp = rotl32(temp, ps[i & 3]);
 246                temp += B;
 247                A = D;
 248                D = C;
 249                C = B;
 250                B = temp;
 251        }
 252        ps += 4;
 253        for (i = 0; i < 16; i++) {
 254                temp = A + FH(B, C, D) + words[(int) (*pp++)] + *pc++;
 255                temp = rotl32(temp, ps[i & 3]);
 256                temp += B;
 257                A = D;
 258                D = C;
 259                C = B;
 260                B = temp;
 261        }
 262        ps += 4;
 263        for (i = 0; i < 16; i++) {
 264                temp = A + FI(B, C, D) + words[(int) (*pp++)] + *pc++;
 265                temp = rotl32(temp, ps[i & 3]);
 266                temp += B;
 267                A = D;
 268                D = C;
 269                C = B;
 270                B = temp;
 271        }
 272# endif
 273        /* Add checksum to the starting values */
 274        ctx->hash[0] += A;
 275        ctx->hash[1] += B;
 276        ctx->hash[2] += C;
 277        ctx->hash[3] += D;
 278
 279#else  /* MD5_SMALL == 0 or 1 */
 280
 281# if MD5_SMALL == 1
 282        const uint32_t *pc;
 283        const char *pp;
 284        int i;
 285# endif
 286
 287        /* First round: using the given function, the context and a constant
 288           the next context is computed.  Because the algorithm's processing
 289           unit is a 32-bit word and it is determined to work on words in
 290           little endian byte order we perhaps have to change the byte order
 291           before the computation.  To reduce the work for the next steps
 292           we save swapped words in WORDS array.  */
 293# undef OP
 294# define OP(a, b, c, d, s, T) \
 295        do { \
 296                a += FF(b, c, d) + (*words IF_BIG_ENDIAN(= SWAP_LE32(*words))) + T; \
 297                words++; \
 298                a = rotl32(a, s); \
 299                a += b; \
 300        } while (0)
 301
 302        /* Round 1 */
 303# if MD5_SMALL == 1
 304        pc = C_array;
 305        for (i = 0; i < 4; i++) {
 306                OP(A, B, C, D, 7, *pc++);
 307                OP(D, A, B, C, 12, *pc++);
 308                OP(C, D, A, B, 17, *pc++);
 309                OP(B, C, D, A, 22, *pc++);
 310        }
 311# else
 312        OP(A, B, C, D, 7, 0xd76aa478);
 313        OP(D, A, B, C, 12, 0xe8c7b756);
 314        OP(C, D, A, B, 17, 0x242070db);
 315        OP(B, C, D, A, 22, 0xc1bdceee);
 316        OP(A, B, C, D, 7, 0xf57c0faf);
 317        OP(D, A, B, C, 12, 0x4787c62a);
 318        OP(C, D, A, B, 17, 0xa8304613);
 319        OP(B, C, D, A, 22, 0xfd469501);
 320        OP(A, B, C, D, 7, 0x698098d8);
 321        OP(D, A, B, C, 12, 0x8b44f7af);
 322        OP(C, D, A, B, 17, 0xffff5bb1);
 323        OP(B, C, D, A, 22, 0x895cd7be);
 324        OP(A, B, C, D, 7, 0x6b901122);
 325        OP(D, A, B, C, 12, 0xfd987193);
 326        OP(C, D, A, B, 17, 0xa679438e);
 327        OP(B, C, D, A, 22, 0x49b40821);
 328# endif
 329        words -= 16;
 330
 331        /* For the second to fourth round we have the possibly swapped words
 332           in WORDS.  Redefine the macro to take an additional first
 333           argument specifying the function to use.  */
 334# undef OP
 335# define OP(f, a, b, c, d, k, s, T) \
 336        do { \
 337                a += f(b, c, d) + words[k] + T; \
 338                a = rotl32(a, s); \
 339                a += b; \
 340        } while (0)
 341
 342        /* Round 2 */
 343# if MD5_SMALL == 1
 344        pp = P_array;
 345        for (i = 0; i < 4; i++) {
 346                OP(FG, A, B, C, D, (int) (*pp++), 5, *pc++);
 347                OP(FG, D, A, B, C, (int) (*pp++), 9, *pc++);
 348                OP(FG, C, D, A, B, (int) (*pp++), 14, *pc++);
 349                OP(FG, B, C, D, A, (int) (*pp++), 20, *pc++);
 350        }
 351# else
 352        OP(FG, A, B, C, D, 1, 5, 0xf61e2562);
 353        OP(FG, D, A, B, C, 6, 9, 0xc040b340);
 354        OP(FG, C, D, A, B, 11, 14, 0x265e5a51);
 355        OP(FG, B, C, D, A, 0, 20, 0xe9b6c7aa);
 356        OP(FG, A, B, C, D, 5, 5, 0xd62f105d);
 357        OP(FG, D, A, B, C, 10, 9, 0x02441453);
 358        OP(FG, C, D, A, B, 15, 14, 0xd8a1e681);
 359        OP(FG, B, C, D, A, 4, 20, 0xe7d3fbc8);
 360        OP(FG, A, B, C, D, 9, 5, 0x21e1cde6);
 361        OP(FG, D, A, B, C, 14, 9, 0xc33707d6);
 362        OP(FG, C, D, A, B, 3, 14, 0xf4d50d87);
 363        OP(FG, B, C, D, A, 8, 20, 0x455a14ed);
 364        OP(FG, A, B, C, D, 13, 5, 0xa9e3e905);
 365        OP(FG, D, A, B, C, 2, 9, 0xfcefa3f8);
 366        OP(FG, C, D, A, B, 7, 14, 0x676f02d9);
 367        OP(FG, B, C, D, A, 12, 20, 0x8d2a4c8a);
 368# endif
 369
 370        /* Round 3 */
 371# if MD5_SMALL == 1
 372        for (i = 0; i < 4; i++) {
 373                OP(FH, A, B, C, D, (int) (*pp++), 4, *pc++);
 374                OP(FH, D, A, B, C, (int) (*pp++), 11, *pc++);
 375                OP(FH, C, D, A, B, (int) (*pp++), 16, *pc++);
 376                OP(FH, B, C, D, A, (int) (*pp++), 23, *pc++);
 377        }
 378# else
 379        OP(FH, A, B, C, D, 5, 4, 0xfffa3942);
 380        OP(FH, D, A, B, C, 8, 11, 0x8771f681);
 381        OP(FH, C, D, A, B, 11, 16, 0x6d9d6122);
 382        OP(FH, B, C, D, A, 14, 23, 0xfde5380c);
 383        OP(FH, A, B, C, D, 1, 4, 0xa4beea44);
 384        OP(FH, D, A, B, C, 4, 11, 0x4bdecfa9);
 385        OP(FH, C, D, A, B, 7, 16, 0xf6bb4b60);
 386        OP(FH, B, C, D, A, 10, 23, 0xbebfbc70);
 387        OP(FH, A, B, C, D, 13, 4, 0x289b7ec6);
 388        OP(FH, D, A, B, C, 0, 11, 0xeaa127fa);
 389        OP(FH, C, D, A, B, 3, 16, 0xd4ef3085);
 390        OP(FH, B, C, D, A, 6, 23, 0x04881d05);
 391        OP(FH, A, B, C, D, 9, 4, 0xd9d4d039);
 392        OP(FH, D, A, B, C, 12, 11, 0xe6db99e5);
 393        OP(FH, C, D, A, B, 15, 16, 0x1fa27cf8);
 394        OP(FH, B, C, D, A, 2, 23, 0xc4ac5665);
 395# endif
 396
 397        /* Round 4 */
 398# if MD5_SMALL == 1
 399        for (i = 0; i < 4; i++) {
 400                OP(FI, A, B, C, D, (int) (*pp++), 6, *pc++);
 401                OP(FI, D, A, B, C, (int) (*pp++), 10, *pc++);
 402                OP(FI, C, D, A, B, (int) (*pp++), 15, *pc++);
 403                OP(FI, B, C, D, A, (int) (*pp++), 21, *pc++);
 404        }
 405# else
 406        OP(FI, A, B, C, D, 0, 6, 0xf4292244);
 407        OP(FI, D, A, B, C, 7, 10, 0x432aff97);
 408        OP(FI, C, D, A, B, 14, 15, 0xab9423a7);
 409        OP(FI, B, C, D, A, 5, 21, 0xfc93a039);
 410        OP(FI, A, B, C, D, 12, 6, 0x655b59c3);
 411        OP(FI, D, A, B, C, 3, 10, 0x8f0ccc92);
 412        OP(FI, C, D, A, B, 10, 15, 0xffeff47d);
 413        OP(FI, B, C, D, A, 1, 21, 0x85845dd1);
 414        OP(FI, A, B, C, D, 8, 6, 0x6fa87e4f);
 415        OP(FI, D, A, B, C, 15, 10, 0xfe2ce6e0);
 416        OP(FI, C, D, A, B, 6, 15, 0xa3014314);
 417        OP(FI, B, C, D, A, 13, 21, 0x4e0811a1);
 418        OP(FI, A, B, C, D, 4, 6, 0xf7537e82);
 419        OP(FI, D, A, B, C, 11, 10, 0xbd3af235);
 420        OP(FI, C, D, A, B, 2, 15, 0x2ad7d2bb);
 421        OP(FI, B, C, D, A, 9, 21, 0xeb86d391);
 422# undef OP
 423# endif
 424        /* Add checksum to the starting values */
 425        ctx->hash[0] += A;
 426        ctx->hash[1] += B;
 427        ctx->hash[2] += C;
 428        ctx->hash[3] += D;
 429#endif
 430}
 431#undef FF
 432#undef FG
 433#undef FH
 434#undef FI
 435
 436/* Initialize structure containing state of computation.
 437 * (RFC 1321, 3.3: Step 3)
 438 */
 439void FAST_FUNC md5_begin(md5_ctx_t *ctx)
 440{
 441        ctx->hash[0] = 0x67452301;
 442        ctx->hash[1] = 0xefcdab89;
 443        ctx->hash[2] = 0x98badcfe;
 444        ctx->hash[3] = 0x10325476;
 445        ctx->total64 = 0;
 446        ctx->process_block = md5_process_block64;
 447}
 448
 449/* Used also for sha1 and sha256 */
 450void FAST_FUNC md5_hash(md5_ctx_t *ctx, const void *buffer, size_t len)
 451{
 452        common64_hash(ctx, buffer, len);
 453}
 454
 455/* Process the remaining bytes in the buffer and put result from CTX
 456 * in first 16 bytes following RESBUF.  The result is always in little
 457 * endian byte order, so that a byte-wise output yields to the wanted
 458 * ASCII representation of the message digest.
 459 */
 460unsigned FAST_FUNC md5_end(md5_ctx_t *ctx, void *resbuf)
 461{
 462        /* MD5 stores total in LE, need to swap on BE arches: */
 463        common64_end(ctx, /*swap_needed:*/ BB_BIG_ENDIAN);
 464
 465        /* The MD5 result is in little endian byte order */
 466        if (BB_BIG_ENDIAN) {
 467                ctx->hash[0] = SWAP_LE32(ctx->hash[0]);
 468                ctx->hash[1] = SWAP_LE32(ctx->hash[1]);
 469                ctx->hash[2] = SWAP_LE32(ctx->hash[2]);
 470                ctx->hash[3] = SWAP_LE32(ctx->hash[3]);
 471        }
 472
 473        memcpy(resbuf, ctx->hash, sizeof(ctx->hash[0]) * 4);
 474        return sizeof(ctx->hash[0]) * 4;
 475}
 476
 477
 478/*
 479 * SHA1 part is:
 480 * Copyright 2007 Rob Landley <rob@landley.net>
 481 *
 482 * Based on the public domain SHA-1 in C by Steve Reid <steve@edmweb.com>
 483 * from http://www.mirrors.wiretapped.net/security/cryptography/hashes/sha1/
 484 *
 485 * Licensed under GPLv2, see file LICENSE in this source tree.
 486 *
 487 * ---------------------------------------------------------------------------
 488 *
 489 * SHA256 and SHA512 parts are:
 490 * Released into the Public Domain by Ulrich Drepper <drepper@redhat.com>.
 491 * Shrank by Denys Vlasenko.
 492 *
 493 * ---------------------------------------------------------------------------
 494 *
 495 * The best way to test random blocksizes is to go to coreutils/md5_sha1_sum.c
 496 * and replace "4096" with something like "2000 + time(NULL) % 2097",
 497 * then rebuild and compare "shaNNNsum bigfile" results.
 498 */
 499
 500static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
 501{
 502        static const uint32_t rconsts[] = {
 503                0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6
 504        };
 505        int i, j;
 506        int cnt;
 507        uint32_t W[16+16];
 508        uint32_t a, b, c, d, e;
 509
 510        /* On-stack work buffer frees up one register in the main loop
 511         * which otherwise will be needed to hold ctx pointer */
 512        for (i = 0; i < 16; i++)
 513                W[i] = W[i+16] = SWAP_BE32(((uint32_t*)ctx->wbuffer)[i]);
 514
 515        a = ctx->hash[0];
 516        b = ctx->hash[1];
 517        c = ctx->hash[2];
 518        d = ctx->hash[3];
 519        e = ctx->hash[4];
 520
 521        /* 4 rounds of 20 operations each */
 522        cnt = 0;
 523        for (i = 0; i < 4; i++) {
 524                j = 19;
 525                do {
 526                        uint32_t work;
 527
 528                        work = c ^ d;
 529                        if (i == 0) {
 530                                work = (work & b) ^ d;
 531                                if (j <= 3)
 532                                        goto ge16;
 533                                /* Used to do SWAP_BE32 here, but this
 534                                 * requires ctx (see comment above) */
 535                                work += W[cnt];
 536                        } else {
 537                                if (i == 2)
 538                                        work = ((b | c) & d) | (b & c);
 539                                else /* i = 1 or 3 */
 540                                        work ^= b;
 541 ge16:
 542                                W[cnt] = W[cnt+16] = rotl32(W[cnt+13] ^ W[cnt+8] ^ W[cnt+2] ^ W[cnt], 1);
 543                                work += W[cnt];
 544                        }
 545                        work += e + rotl32(a, 5) + rconsts[i];
 546
 547                        /* Rotate by one for next time */
 548                        e = d;
 549                        d = c;
 550                        c = /* b = */ rotl32(b, 30);
 551                        b = a;
 552                        a = work;
 553                        cnt = (cnt + 1) & 15;
 554                } while (--j >= 0);
 555        }
 556
 557        ctx->hash[0] += a;
 558        ctx->hash[1] += b;
 559        ctx->hash[2] += c;
 560        ctx->hash[3] += d;
 561        ctx->hash[4] += e;
 562}
 563
 564/* Constants for SHA512 from FIPS 180-2:4.2.3.
 565 * SHA256 constants from FIPS 180-2:4.2.2
 566 * are the most significant half of first 64 elements
 567 * of the same array.
 568 */
 569#undef K
 570#if NEED_SHA512
 571typedef uint64_t sha_K_int;
 572# define K(v) v
 573#else
 574typedef uint32_t sha_K_int;
 575# define K(v) (uint32_t)(v >> 32)
 576#endif
 577static const sha_K_int sha_K[] = {
 578        K(0x428a2f98d728ae22ULL), K(0x7137449123ef65cdULL),
 579        K(0xb5c0fbcfec4d3b2fULL), K(0xe9b5dba58189dbbcULL),
 580        K(0x3956c25bf348b538ULL), K(0x59f111f1b605d019ULL),
 581        K(0x923f82a4af194f9bULL), K(0xab1c5ed5da6d8118ULL),
 582        K(0xd807aa98a3030242ULL), K(0x12835b0145706fbeULL),
 583        K(0x243185be4ee4b28cULL), K(0x550c7dc3d5ffb4e2ULL),
 584        K(0x72be5d74f27b896fULL), K(0x80deb1fe3b1696b1ULL),
 585        K(0x9bdc06a725c71235ULL), K(0xc19bf174cf692694ULL),
 586        K(0xe49b69c19ef14ad2ULL), K(0xefbe4786384f25e3ULL),
 587        K(0x0fc19dc68b8cd5b5ULL), K(0x240ca1cc77ac9c65ULL),
 588        K(0x2de92c6f592b0275ULL), K(0x4a7484aa6ea6e483ULL),
 589        K(0x5cb0a9dcbd41fbd4ULL), K(0x76f988da831153b5ULL),
 590        K(0x983e5152ee66dfabULL), K(0xa831c66d2db43210ULL),
 591        K(0xb00327c898fb213fULL), K(0xbf597fc7beef0ee4ULL),
 592        K(0xc6e00bf33da88fc2ULL), K(0xd5a79147930aa725ULL),
 593        K(0x06ca6351e003826fULL), K(0x142929670a0e6e70ULL),
 594        K(0x27b70a8546d22ffcULL), K(0x2e1b21385c26c926ULL),
 595        K(0x4d2c6dfc5ac42aedULL), K(0x53380d139d95b3dfULL),
 596        K(0x650a73548baf63deULL), K(0x766a0abb3c77b2a8ULL),
 597        K(0x81c2c92e47edaee6ULL), K(0x92722c851482353bULL),
 598        K(0xa2bfe8a14cf10364ULL), K(0xa81a664bbc423001ULL),
 599        K(0xc24b8b70d0f89791ULL), K(0xc76c51a30654be30ULL),
 600        K(0xd192e819d6ef5218ULL), K(0xd69906245565a910ULL),
 601        K(0xf40e35855771202aULL), K(0x106aa07032bbd1b8ULL),
 602        K(0x19a4c116b8d2d0c8ULL), K(0x1e376c085141ab53ULL),
 603        K(0x2748774cdf8eeb99ULL), K(0x34b0bcb5e19b48a8ULL),
 604        K(0x391c0cb3c5c95a63ULL), K(0x4ed8aa4ae3418acbULL),
 605        K(0x5b9cca4f7763e373ULL), K(0x682e6ff3d6b2b8a3ULL),
 606        K(0x748f82ee5defb2fcULL), K(0x78a5636f43172f60ULL),
 607        K(0x84c87814a1f0ab72ULL), K(0x8cc702081a6439ecULL),
 608        K(0x90befffa23631e28ULL), K(0xa4506cebde82bde9ULL),
 609        K(0xbef9a3f7b2c67915ULL), K(0xc67178f2e372532bULL),
 610#if NEED_SHA512  /* [64]+ are used for sha512 only */
 611        K(0xca273eceea26619cULL), K(0xd186b8c721c0c207ULL),
 612        K(0xeada7dd6cde0eb1eULL), K(0xf57d4f7fee6ed178ULL),
 613        K(0x06f067aa72176fbaULL), K(0x0a637dc5a2c898a6ULL),
 614        K(0x113f9804bef90daeULL), K(0x1b710b35131c471bULL),
 615        K(0x28db77f523047d84ULL), K(0x32caab7b40c72493ULL),
 616        K(0x3c9ebe0a15c9bebcULL), K(0x431d67c49c100d4cULL),
 617        K(0x4cc5d4becb3e42b6ULL), K(0x597f299cfc657e2aULL),
 618        K(0x5fcb6fab3ad6faecULL), K(0x6c44198c4a475817ULL),
 619#endif
 620};
 621#undef K
 622
 623#undef Ch
 624#undef Maj
 625#undef S0
 626#undef S1
 627#undef R0
 628#undef R1
 629
 630static void FAST_FUNC sha256_process_block64(sha256_ctx_t *ctx)
 631{
 632        unsigned t;
 633        uint32_t W[64], a, b, c, d, e, f, g, h;
 634        const uint32_t *words = (uint32_t*) ctx->wbuffer;
 635
 636        /* Operators defined in FIPS 180-2:4.1.2.  */
 637#define Ch(x, y, z) ((x & y) ^ (~x & z))
 638#define Maj(x, y, z) ((x & y) ^ (x & z) ^ (y & z))
 639#define S0(x) (rotr32(x, 2) ^ rotr32(x, 13) ^ rotr32(x, 22))
 640#define S1(x) (rotr32(x, 6) ^ rotr32(x, 11) ^ rotr32(x, 25))
 641#define R0(x) (rotr32(x, 7) ^ rotr32(x, 18) ^ (x >> 3))
 642#define R1(x) (rotr32(x, 17) ^ rotr32(x, 19) ^ (x >> 10))
 643
 644        /* Compute the message schedule according to FIPS 180-2:6.2.2 step 2.  */
 645        for (t = 0; t < 16; ++t)
 646                W[t] = SWAP_BE32(words[t]);
 647        for (/*t = 16*/; t < 64; ++t)
 648                W[t] = R1(W[t - 2]) + W[t - 7] + R0(W[t - 15]) + W[t - 16];
 649
 650        a = ctx->hash[0];
 651        b = ctx->hash[1];
 652        c = ctx->hash[2];
 653        d = ctx->hash[3];
 654        e = ctx->hash[4];
 655        f = ctx->hash[5];
 656        g = ctx->hash[6];
 657        h = ctx->hash[7];
 658
 659        /* The actual computation according to FIPS 180-2:6.2.2 step 3.  */
 660        for (t = 0; t < 64; ++t) {
 661                /* Need to fetch upper half of sha_K[t]
 662                 * (I hope compiler is clever enough to just fetch
 663                 * upper half)
 664                 */
 665                uint32_t K_t = NEED_SHA512 ? (sha_K[t] >> 32) : sha_K[t];
 666                uint32_t T1 = h + S1(e) + Ch(e, f, g) + K_t + W[t];
 667                uint32_t T2 = S0(a) + Maj(a, b, c);
 668                h = g;
 669                g = f;
 670                f = e;
 671                e = d + T1;
 672                d = c;
 673                c = b;
 674                b = a;
 675                a = T1 + T2;
 676        }
 677#undef Ch
 678#undef Maj
 679#undef S0
 680#undef S1
 681#undef R0
 682#undef R1
 683        /* Add the starting values of the context according to FIPS 180-2:6.2.2
 684           step 4.  */
 685        ctx->hash[0] += a;
 686        ctx->hash[1] += b;
 687        ctx->hash[2] += c;
 688        ctx->hash[3] += d;
 689        ctx->hash[4] += e;
 690        ctx->hash[5] += f;
 691        ctx->hash[6] += g;
 692        ctx->hash[7] += h;
 693}
 694
 695#if NEED_SHA512
 696static void FAST_FUNC sha512_process_block128(sha512_ctx_t *ctx)
 697{
 698        unsigned t;
 699        uint64_t W[80];
 700        /* On i386, having assignments here (not later as sha256 does)
 701         * produces 99 bytes smaller code with gcc 4.3.1
 702         */
 703        uint64_t a = ctx->hash[0];
 704        uint64_t b = ctx->hash[1];
 705        uint64_t c = ctx->hash[2];
 706        uint64_t d = ctx->hash[3];
 707        uint64_t e = ctx->hash[4];
 708        uint64_t f = ctx->hash[5];
 709        uint64_t g = ctx->hash[6];
 710        uint64_t h = ctx->hash[7];
 711        const uint64_t *words = (uint64_t*) ctx->wbuffer;
 712
 713        /* Operators defined in FIPS 180-2:4.1.2.  */
 714#define Ch(x, y, z) ((x & y) ^ (~x & z))
 715#define Maj(x, y, z) ((x & y) ^ (x & z) ^ (y & z))
 716#define S0(x) (rotr64(x, 28) ^ rotr64(x, 34) ^ rotr64(x, 39))
 717#define S1(x) (rotr64(x, 14) ^ rotr64(x, 18) ^ rotr64(x, 41))
 718#define R0(x) (rotr64(x, 1) ^ rotr64(x, 8) ^ (x >> 7))
 719#define R1(x) (rotr64(x, 19) ^ rotr64(x, 61) ^ (x >> 6))
 720
 721        /* Compute the message schedule according to FIPS 180-2:6.3.2 step 2.  */
 722        for (t = 0; t < 16; ++t)
 723                W[t] = SWAP_BE64(words[t]);
 724        for (/*t = 16*/; t < 80; ++t)
 725                W[t] = R1(W[t - 2]) + W[t - 7] + R0(W[t - 15]) + W[t - 16];
 726
 727        /* The actual computation according to FIPS 180-2:6.3.2 step 3.  */
 728        for (t = 0; t < 80; ++t) {
 729                uint64_t T1 = h + S1(e) + Ch(e, f, g) + sha_K[t] + W[t];
 730                uint64_t T2 = S0(a) + Maj(a, b, c);
 731                h = g;
 732                g = f;
 733                f = e;
 734                e = d + T1;
 735                d = c;
 736                c = b;
 737                b = a;
 738                a = T1 + T2;
 739        }
 740#undef Ch
 741#undef Maj
 742#undef S0
 743#undef S1
 744#undef R0
 745#undef R1
 746        /* Add the starting values of the context according to FIPS 180-2:6.3.2
 747           step 4.  */
 748        ctx->hash[0] += a;
 749        ctx->hash[1] += b;
 750        ctx->hash[2] += c;
 751        ctx->hash[3] += d;
 752        ctx->hash[4] += e;
 753        ctx->hash[5] += f;
 754        ctx->hash[6] += g;
 755        ctx->hash[7] += h;
 756}
 757#endif /* NEED_SHA512 */
 758
 759void FAST_FUNC sha1_begin(sha1_ctx_t *ctx)
 760{
 761        ctx->hash[0] = 0x67452301;
 762        ctx->hash[1] = 0xefcdab89;
 763        ctx->hash[2] = 0x98badcfe;
 764        ctx->hash[3] = 0x10325476;
 765        ctx->hash[4] = 0xc3d2e1f0;
 766        ctx->total64 = 0;
 767        ctx->process_block = sha1_process_block64;
 768}
 769
 770static const uint32_t init256[] = {
 771        0,
 772        0,
 773        0x6a09e667,
 774        0xbb67ae85,
 775        0x3c6ef372,
 776        0xa54ff53a,
 777        0x510e527f,
 778        0x9b05688c,
 779        0x1f83d9ab,
 780        0x5be0cd19,
 781};
 782#if NEED_SHA512
 783static const uint32_t init512_lo[] = {
 784        0,
 785        0,
 786        0xf3bcc908,
 787        0x84caa73b,
 788        0xfe94f82b,
 789        0x5f1d36f1,
 790        0xade682d1,
 791        0x2b3e6c1f,
 792        0xfb41bd6b,
 793        0x137e2179,
 794};
 795#endif /* NEED_SHA512 */
 796
 797// Note: SHA-384 is identical to SHA-512, except that initial hash values are
 798// 0xcbbb9d5dc1059ed8, 0x629a292a367cd507, 0x9159015a3070dd17, 0x152fecd8f70e5939,
 799// 0x67332667ffc00b31, 0x8eb44a8768581511, 0xdb0c2e0d64f98fa7, 0x47b5481dbefa4fa4,
 800// and the output is constructed by omitting last two 64-bit words of it.
 801
 802/* Initialize structure containing state of computation.
 803   (FIPS 180-2:5.3.2)  */
 804void FAST_FUNC sha256_begin(sha256_ctx_t *ctx)
 805{
 806        memcpy(&ctx->total64, init256, sizeof(init256));
 807        /*ctx->total64 = 0; - done by prepending two 32-bit zeros to init256 */
 808        ctx->process_block = sha256_process_block64;
 809}
 810
 811#if NEED_SHA512
 812/* Initialize structure containing state of computation.
 813   (FIPS 180-2:5.3.3)  */
 814void FAST_FUNC sha512_begin(sha512_ctx_t *ctx)
 815{
 816        int i;
 817        /* Two extra iterations zero out ctx->total64[2] */
 818        uint64_t *tp = ctx->total64;
 819        for (i = 0; i < 2+8; i++)
 820                tp[i] = ((uint64_t)(init256[i]) << 32) + init512_lo[i];
 821        /*ctx->total64[0] = ctx->total64[1] = 0; - already done */
 822}
 823
 824void FAST_FUNC sha512_hash(sha512_ctx_t *ctx, const void *buffer, size_t len)
 825{
 826        unsigned bufpos = ctx->total64[0] & 127;
 827        unsigned remaining;
 828
 829        /* First increment the byte count.  FIPS 180-2 specifies the possible
 830           length of the file up to 2^128 _bits_.
 831           We compute the number of _bytes_ and convert to bits later.  */
 832        ctx->total64[0] += len;
 833        if (ctx->total64[0] < len)
 834                ctx->total64[1]++;
 835# if 0
 836        remaining = 128 - bufpos;
 837
 838        /* Hash whole blocks */
 839        while (len >= remaining) {
 840                memcpy(ctx->wbuffer + bufpos, buffer, remaining);
 841                buffer = (const char *)buffer + remaining;
 842                len -= remaining;
 843                remaining = 128;
 844                bufpos = 0;
 845                sha512_process_block128(ctx);
 846        }
 847
 848        /* Save last, partial blosk */
 849        memcpy(ctx->wbuffer + bufpos, buffer, len);
 850# else
 851        while (1) {
 852                remaining = 128 - bufpos;
 853                if (remaining > len)
 854                        remaining = len;
 855                /* Copy data into aligned buffer */
 856                memcpy(ctx->wbuffer + bufpos, buffer, remaining);
 857                len -= remaining;
 858                buffer = (const char *)buffer + remaining;
 859                bufpos += remaining;
 860                /* Clever way to do "if (bufpos != N) break; ... ; bufpos = 0;" */
 861                bufpos -= 128;
 862                if (bufpos != 0)
 863                        break;
 864                /* Buffer is filled up, process it */
 865                sha512_process_block128(ctx);
 866                /*bufpos = 0; - already is */
 867        }
 868# endif
 869}
 870#endif /* NEED_SHA512 */
 871
 872/* Used also for sha256 */
 873unsigned FAST_FUNC sha1_end(sha1_ctx_t *ctx, void *resbuf)
 874{
 875        unsigned hash_size;
 876
 877        /* SHA stores total in BE, need to swap on LE arches: */
 878        common64_end(ctx, /*swap_needed:*/ BB_LITTLE_ENDIAN);
 879
 880        hash_size = (ctx->process_block == sha1_process_block64) ? 5 : 8;
 881        /* This way we do not impose alignment constraints on resbuf: */
 882        if (BB_LITTLE_ENDIAN) {
 883                unsigned i;
 884                for (i = 0; i < hash_size; ++i)
 885                        ctx->hash[i] = SWAP_BE32(ctx->hash[i]);
 886        }
 887        hash_size *= sizeof(ctx->hash[0]);
 888        memcpy(resbuf, ctx->hash, hash_size);
 889        return hash_size;
 890}
 891
 892#if NEED_SHA512
 893unsigned FAST_FUNC sha512_end(sha512_ctx_t *ctx, void *resbuf)
 894{
 895        unsigned bufpos = ctx->total64[0] & 127;
 896
 897        /* Pad the buffer to the next 128-byte boundary with 0x80,0,0,0... */
 898        ctx->wbuffer[bufpos++] = 0x80;
 899
 900        while (1) {
 901                unsigned remaining = 128 - bufpos;
 902                memset(ctx->wbuffer + bufpos, 0, remaining);
 903                if (remaining >= 16) {
 904                        /* Store the 128-bit counter of bits in the buffer in BE format */
 905                        uint64_t t;
 906                        t = ctx->total64[0] << 3;
 907                        t = SWAP_BE64(t);
 908                        *(bb__aliased_uint64_t *) (&ctx->wbuffer[128 - 8]) = t;
 909                        t = (ctx->total64[1] << 3) | (ctx->total64[0] >> 61);
 910                        t = SWAP_BE64(t);
 911                        *(bb__aliased_uint64_t *) (&ctx->wbuffer[128 - 16]) = t;
 912                }
 913                sha512_process_block128(ctx);
 914                if (remaining >= 16)
 915                        break;
 916                bufpos = 0;
 917        }
 918
 919        if (BB_LITTLE_ENDIAN) {
 920                unsigned i;
 921                for (i = 0; i < ARRAY_SIZE(ctx->hash); ++i)
 922                        ctx->hash[i] = SWAP_BE64(ctx->hash[i]);
 923        }
 924        memcpy(resbuf, ctx->hash, sizeof(ctx->hash));
 925        return sizeof(ctx->hash);
 926}
 927#endif /* NEED_SHA512 */
 928
 929
 930/*
 931 * The Keccak sponge function, designed by Guido Bertoni, Joan Daemen,
 932 * Michael Peeters and Gilles Van Assche. For more information, feedback or
 933 * questions, please refer to our website: http://keccak.noekeon.org/
 934 *
 935 * Implementation by Ronny Van Keer,
 936 * hereby denoted as "the implementer".
 937 *
 938 * To the extent possible under law, the implementer has waived all copyright
 939 * and related or neighboring rights to the source code in this file.
 940 * http://creativecommons.org/publicdomain/zero/1.0/
 941 *
 942 * Busybox modifications (C) Lauri Kasanen, under the GPLv2.
 943 */
 944
 945#if CONFIG_SHA3_SMALL < 0
 946# define SHA3_SMALL 0
 947#elif CONFIG_SHA3_SMALL > 1
 948# define SHA3_SMALL 1
 949#else
 950# define SHA3_SMALL CONFIG_SHA3_SMALL
 951#endif
 952
 953#define OPTIMIZE_SHA3_FOR_32 0
 954/*
 955 * SHA3 can be optimized for 32-bit CPUs with bit-slicing:
 956 * every 64-bit word of state[] can be split into two 32-bit words
 957 * by even/odd bits. In this form, all rotations of sha3 round
 958 * are 32-bit - and there are lots of them.
 959 * However, it requires either splitting/combining state words
 960 * before/after sha3 round (code does this now)
 961 * or shuffling bits before xor'ing them into state and in sha3_end.
 962 * Without shuffling, bit-slicing results in -130 bytes of code
 963 * and marginal speedup (but of course it gives wrong result).
 964 * With shuffling it works, but +260 code bytes, and slower.
 965 * Disabled for now:
 966 */
 967#if 0 /* LONG_MAX == 0x7fffffff */
 968# undef OPTIMIZE_SHA3_FOR_32
 969# define OPTIMIZE_SHA3_FOR_32 1
 970#endif
 971
 972#if OPTIMIZE_SHA3_FOR_32
 973/* This splits every 64-bit word into a pair of 32-bit words,
 974 * even bits go into first word, odd bits go to second one.
 975 * The conversion is done in-place.
 976 */
 977static void split_halves(uint64_t *state)
 978{
 979        /* Credit: Henry S. Warren, Hacker's Delight, Addison-Wesley, 2002 */
 980        uint32_t *s32 = (uint32_t*)state;
 981        uint32_t t, x0, x1;
 982        int i;
 983        for (i = 24; i >= 0; --i) {
 984                x0 = s32[0];
 985                t = (x0 ^ (x0 >> 1)) & 0x22222222; x0 = x0 ^ t ^ (t << 1);
 986                t = (x0 ^ (x0 >> 2)) & 0x0C0C0C0C; x0 = x0 ^ t ^ (t << 2);
 987                t = (x0 ^ (x0 >> 4)) & 0x00F000F0; x0 = x0 ^ t ^ (t << 4);
 988                t = (x0 ^ (x0 >> 8)) & 0x0000FF00; x0 = x0 ^ t ^ (t << 8);
 989                x1 = s32[1];
 990                t = (x1 ^ (x1 >> 1)) & 0x22222222; x1 = x1 ^ t ^ (t << 1);
 991                t = (x1 ^ (x1 >> 2)) & 0x0C0C0C0C; x1 = x1 ^ t ^ (t << 2);
 992                t = (x1 ^ (x1 >> 4)) & 0x00F000F0; x1 = x1 ^ t ^ (t << 4);
 993                t = (x1 ^ (x1 >> 8)) & 0x0000FF00; x1 = x1 ^ t ^ (t << 8);
 994                *s32++ = (x0 & 0x0000FFFF) | (x1 << 16);
 995                *s32++ = (x0 >> 16) | (x1 & 0xFFFF0000);
 996        }
 997}
 998/* The reverse operation */
 999static void combine_halves(uint64_t *state)
1000{
1001        uint32_t *s32 = (uint32_t*)state;
1002        uint32_t t, x0, x1;
1003        int i;
1004        for (i = 24; i >= 0; --i) {
1005                x0 = s32[0];
1006                x1 = s32[1];
1007                t = (x0 & 0x0000FFFF) | (x1 << 16);
1008                x1 = (x0 >> 16) | (x1 & 0xFFFF0000);
1009                x0 = t;
1010                t = (x0 ^ (x0 >> 8)) & 0x0000FF00; x0 = x0 ^ t ^ (t << 8);
1011                t = (x0 ^ (x0 >> 4)) & 0x00F000F0; x0 = x0 ^ t ^ (t << 4);
1012                t = (x0 ^ (x0 >> 2)) & 0x0C0C0C0C; x0 = x0 ^ t ^ (t << 2);
1013                t = (x0 ^ (x0 >> 1)) & 0x22222222; x0 = x0 ^ t ^ (t << 1);
1014                *s32++ = x0;
1015                t = (x1 ^ (x1 >> 8)) & 0x0000FF00; x1 = x1 ^ t ^ (t << 8);
1016                t = (x1 ^ (x1 >> 4)) & 0x00F000F0; x1 = x1 ^ t ^ (t << 4);
1017                t = (x1 ^ (x1 >> 2)) & 0x0C0C0C0C; x1 = x1 ^ t ^ (t << 2);
1018                t = (x1 ^ (x1 >> 1)) & 0x22222222; x1 = x1 ^ t ^ (t << 1);
1019                *s32++ = x1;
1020        }
1021}
1022#endif
1023
1024/*
1025 * In the crypto literature this function is usually called Keccak-f().
1026 */
1027static void sha3_process_block72(uint64_t *state)
1028{
1029        enum { NROUNDS = 24 };
1030
1031#if OPTIMIZE_SHA3_FOR_32
1032        /*
1033        static const uint32_t IOTA_CONST_0[NROUNDS] = {
1034                0x00000001UL,
1035                0x00000000UL,
1036                0x00000000UL,
1037                0x00000000UL,
1038                0x00000001UL,
1039                0x00000001UL,
1040                0x00000001UL,
1041                0x00000001UL,
1042                0x00000000UL,
1043                0x00000000UL,
1044                0x00000001UL,
1045                0x00000000UL,
1046                0x00000001UL,
1047                0x00000001UL,
1048                0x00000001UL,
1049                0x00000001UL,
1050                0x00000000UL,
1051                0x00000000UL,
1052                0x00000000UL,
1053                0x00000000UL,
1054                0x00000001UL,
1055                0x00000000UL,
1056                0x00000001UL,
1057                0x00000000UL,
1058        };
1059        ** bits are in lsb: 0101 0000 1111 0100 1111 0001
1060        */
1061        uint32_t IOTA_CONST_0bits = (uint32_t)(0x0050f4f1);
1062        static const uint32_t IOTA_CONST_1[NROUNDS] = {
1063                0x00000000UL,
1064                0x00000089UL,
1065                0x8000008bUL,
1066                0x80008080UL,
1067                0x0000008bUL,
1068                0x00008000UL,
1069                0x80008088UL,
1070                0x80000082UL,
1071                0x0000000bUL,
1072                0x0000000aUL,
1073                0x00008082UL,
1074                0x00008003UL,
1075                0x0000808bUL,
1076                0x8000000bUL,
1077                0x8000008aUL,
1078                0x80000081UL,
1079                0x80000081UL,
1080                0x80000008UL,
1081                0x00000083UL,
1082                0x80008003UL,
1083                0x80008088UL,
1084                0x80000088UL,
1085                0x00008000UL,
1086                0x80008082UL,
1087        };
1088
1089        uint32_t *const s32 = (uint32_t*)state;
1090        unsigned round;
1091
1092        split_halves(state);
1093
1094        for (round = 0; round < NROUNDS; round++) {
1095                unsigned x;
1096
1097                /* Theta */
1098                {
1099                        uint32_t BC[20];
1100                        for (x = 0; x < 10; ++x) {
1101                                BC[x+10] = BC[x] = s32[x]^s32[x+10]^s32[x+20]^s32[x+30]^s32[x+40];
1102                        }
1103                        for (x = 0; x < 10; x += 2) {
1104                                uint32_t ta, tb;
1105                                ta = BC[x+8] ^ rotl32(BC[x+3], 1);
1106                                tb = BC[x+9] ^ BC[x+2];
1107                                s32[x+0] ^= ta;
1108                                s32[x+1] ^= tb;
1109                                s32[x+10] ^= ta;
1110                                s32[x+11] ^= tb;
1111                                s32[x+20] ^= ta;
1112                                s32[x+21] ^= tb;
1113                                s32[x+30] ^= ta;
1114                                s32[x+31] ^= tb;
1115                                s32[x+40] ^= ta;
1116                                s32[x+41] ^= tb;
1117                        }
1118                }
1119                /* RhoPi */
1120                {
1121                        uint32_t t0a,t0b, t1a,t1b;
1122                        t1a = s32[1*2+0];
1123                        t1b = s32[1*2+1];
1124
1125#define RhoPi(PI_LANE, ROT_CONST) \
1126        t0a = s32[PI_LANE*2+0];\
1127        t0b = s32[PI_LANE*2+1];\
1128        if (ROT_CONST & 1) {\
1129                s32[PI_LANE*2+0] = rotl32(t1b, ROT_CONST/2+1);\
1130                s32[PI_LANE*2+1] = ROT_CONST == 1 ? t1a : rotl32(t1a, ROT_CONST/2+0);\
1131        } else {\
1132                s32[PI_LANE*2+0] = rotl32(t1a, ROT_CONST/2);\
1133                s32[PI_LANE*2+1] = rotl32(t1b, ROT_CONST/2);\
1134        }\
1135        t1a = t0a; t1b = t0b;
1136
1137                        RhoPi(10, 1)
1138                        RhoPi( 7, 3)
1139                        RhoPi(11, 6)
1140                        RhoPi(17,10)
1141                        RhoPi(18,15)
1142                        RhoPi( 3,21)
1143                        RhoPi( 5,28)
1144                        RhoPi(16,36)
1145                        RhoPi( 8,45)
1146                        RhoPi(21,55)
1147                        RhoPi(24, 2)
1148                        RhoPi( 4,14)
1149                        RhoPi(15,27)
1150                        RhoPi(23,41)
1151                        RhoPi(19,56)
1152                        RhoPi(13, 8)
1153                        RhoPi(12,25)
1154                        RhoPi( 2,43)
1155                        RhoPi(20,62)
1156                        RhoPi(14,18)
1157                        RhoPi(22,39)
1158                        RhoPi( 9,61)
1159                        RhoPi( 6,20)
1160                        RhoPi( 1,44)
1161#undef RhoPi
1162                }
1163                /* Chi */
1164                for (x = 0; x <= 40;) {
1165                        uint32_t BC0, BC1, BC2, BC3, BC4;
1166                        BC0 = s32[x + 0*2];
1167                        BC1 = s32[x + 1*2];
1168                        BC2 = s32[x + 2*2];
1169                        s32[x + 0*2] = BC0 ^ ((~BC1) & BC2);
1170                        BC3 = s32[x + 3*2];
1171                        s32[x + 1*2] = BC1 ^ ((~BC2) & BC3);
1172                        BC4 = s32[x + 4*2];
1173                        s32[x + 2*2] = BC2 ^ ((~BC3) & BC4);
1174                        s32[x + 3*2] = BC3 ^ ((~BC4) & BC0);
1175                        s32[x + 4*2] = BC4 ^ ((~BC0) & BC1);
1176                        x++;
1177                        BC0 = s32[x + 0*2];
1178                        BC1 = s32[x + 1*2];
1179                        BC2 = s32[x + 2*2];
1180                        s32[x + 0*2] = BC0 ^ ((~BC1) & BC2);
1181                        BC3 = s32[x + 3*2];
1182                        s32[x + 1*2] = BC1 ^ ((~BC2) & BC3);
1183                        BC4 = s32[x + 4*2];
1184                        s32[x + 2*2] = BC2 ^ ((~BC3) & BC4);
1185                        s32[x + 3*2] = BC3 ^ ((~BC4) & BC0);
1186                        s32[x + 4*2] = BC4 ^ ((~BC0) & BC1);
1187                        x += 9;
1188                }
1189                /* Iota */
1190                s32[0] ^= IOTA_CONST_0bits & 1;
1191                IOTA_CONST_0bits >>= 1;
1192                s32[1] ^= IOTA_CONST_1[round];
1193        }
1194
1195        combine_halves(state);
1196#else
1197        /* Native 64-bit algorithm */
1198        static const uint16_t IOTA_CONST[NROUNDS] = {
1199                /* Elements should be 64-bit, but top half is always zero
1200                 * or 0x80000000. We encode 63rd bits in a separate word below.
1201                 * Same is true for 31th bits, which lets us use 16-bit table
1202                 * instead of 64-bit. The speed penalty is lost in the noise.
1203                 */
1204                0x0001,
1205                0x8082,
1206                0x808a,
1207                0x8000,
1208                0x808b,
1209                0x0001,
1210                0x8081,
1211                0x8009,
1212                0x008a,
1213                0x0088,
1214                0x8009,
1215                0x000a,
1216                0x808b,
1217                0x008b,
1218                0x8089,
1219                0x8003,
1220                0x8002,
1221                0x0080,
1222                0x800a,
1223                0x000a,
1224                0x8081,
1225                0x8080,
1226                0x0001,
1227                0x8008,
1228        };
1229        /* bit for CONST[0] is in msb: 0011 0011 0000 0111 1101 1101 */
1230        const uint32_t IOTA_CONST_bit63 = (uint32_t)(0x3307dd00);
1231        /* bit for CONST[0] is in msb: 0001 0110 0011 1000 0001 1011 */
1232        const uint32_t IOTA_CONST_bit31 = (uint32_t)(0x16381b00);
1233
1234        static const uint8_t ROT_CONST[24] = {
1235                1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 2, 14,
1236                27, 41, 56, 8, 25, 43, 62, 18, 39, 61, 20, 44,
1237        };
1238        static const uint8_t PI_LANE[24] = {
1239                10, 7, 11, 17, 18, 3, 5, 16, 8, 21, 24, 4,
1240                15, 23, 19, 13, 12, 2, 20, 14, 22, 9, 6, 1,
1241        };
1242        /*static const uint8_t MOD5[10] = { 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, };*/
1243
1244        unsigned x;
1245        unsigned round;
1246
1247        if (BB_BIG_ENDIAN) {
1248                for (x = 0; x < 25; x++) {
1249                        state[x] = SWAP_LE64(state[x]);
1250                }
1251        }
1252
1253        for (round = 0; round < NROUNDS; ++round) {
1254                /* Theta */
1255                {
1256                        uint64_t BC[10];
1257                        for (x = 0; x < 5; ++x) {
1258                                BC[x + 5] = BC[x] = state[x]
1259                                        ^ state[x + 5] ^ state[x + 10]
1260                                        ^ state[x + 15] ^ state[x + 20];
1261                        }
1262                        /* Using 2x5 vector above eliminates the need to use
1263                         * BC[MOD5[x+N]] trick below to fetch BC[(x+N) % 5],
1264                         * and the code is a bit _smaller_.
1265                         */
1266                        for (x = 0; x < 5; ++x) {
1267                                uint64_t temp = BC[x + 4] ^ rotl64(BC[x + 1], 1);
1268                                state[x] ^= temp;
1269                                state[x + 5] ^= temp;
1270                                state[x + 10] ^= temp;
1271                                state[x + 15] ^= temp;
1272                                state[x + 20] ^= temp;
1273                        }
1274                }
1275
1276                /* Rho Pi */
1277                if (SHA3_SMALL) {
1278                        uint64_t t1 = state[1];
1279                        for (x = 0; x < 24; ++x) {
1280                                uint64_t t0 = state[PI_LANE[x]];
1281                                state[PI_LANE[x]] = rotl64(t1, ROT_CONST[x]);
1282                                t1 = t0;
1283                        }
1284                } else {
1285                        /* Especially large benefit for 32-bit arch (75% faster):
1286                         * 64-bit rotations by non-constant usually are SLOW on those.
1287                         * We resort to unrolling here.
1288                         * This optimizes out PI_LANE[] and ROT_CONST[],
1289                         * but generates 300-500 more bytes of code.
1290                         */
1291                        uint64_t t0;
1292                        uint64_t t1 = state[1];
1293#define RhoPi_twice(x) \
1294        t0 = state[PI_LANE[x  ]]; \
1295        state[PI_LANE[x  ]] = rotl64(t1, ROT_CONST[x  ]); \
1296        t1 = state[PI_LANE[x+1]]; \
1297        state[PI_LANE[x+1]] = rotl64(t0, ROT_CONST[x+1]);
1298                        RhoPi_twice(0); RhoPi_twice(2);
1299                        RhoPi_twice(4); RhoPi_twice(6);
1300                        RhoPi_twice(8); RhoPi_twice(10);
1301                        RhoPi_twice(12); RhoPi_twice(14);
1302                        RhoPi_twice(16); RhoPi_twice(18);
1303                        RhoPi_twice(20); RhoPi_twice(22);
1304#undef RhoPi_twice
1305                }
1306                /* Chi */
1307# if LONG_MAX > 0x7fffffff
1308                for (x = 0; x <= 20; x += 5) {
1309                        uint64_t BC0, BC1, BC2, BC3, BC4;
1310                        BC0 = state[x + 0];
1311                        BC1 = state[x + 1];
1312                        BC2 = state[x + 2];
1313                        state[x + 0] = BC0 ^ ((~BC1) & BC2);
1314                        BC3 = state[x + 3];
1315                        state[x + 1] = BC1 ^ ((~BC2) & BC3);
1316                        BC4 = state[x + 4];
1317                        state[x + 2] = BC2 ^ ((~BC3) & BC4);
1318                        state[x + 3] = BC3 ^ ((~BC4) & BC0);
1319                        state[x + 4] = BC4 ^ ((~BC0) & BC1);
1320                }
1321# else
1322                /* Reduced register pressure version
1323                 * for register-starved 32-bit arches
1324                 * (i386: -95 bytes, and it is _faster_)
1325                 */
1326                for (x = 0; x <= 40;) {
1327                        uint32_t BC0, BC1, BC2, BC3, BC4;
1328                        uint32_t *const s32 = (uint32_t*)state;
1329#  if SHA3_SMALL
1330 do_half:
1331#  endif
1332                        BC0 = s32[x + 0*2];
1333                        BC1 = s32[x + 1*2];
1334                        BC2 = s32[x + 2*2];
1335                        s32[x + 0*2] = BC0 ^ ((~BC1) & BC2);
1336                        BC3 = s32[x + 3*2];
1337                        s32[x + 1*2] = BC1 ^ ((~BC2) & BC3);
1338                        BC4 = s32[x + 4*2];
1339                        s32[x + 2*2] = BC2 ^ ((~BC3) & BC4);
1340                        s32[x + 3*2] = BC3 ^ ((~BC4) & BC0);
1341                        s32[x + 4*2] = BC4 ^ ((~BC0) & BC1);
1342                        x++;
1343#  if SHA3_SMALL
1344                        if (x & 1)
1345                                goto do_half;
1346                        x += 8;
1347#  else
1348                        BC0 = s32[x + 0*2];
1349                        BC1 = s32[x + 1*2];
1350                        BC2 = s32[x + 2*2];
1351                        s32[x + 0*2] = BC0 ^ ((~BC1) & BC2);
1352                        BC3 = s32[x + 3*2];
1353                        s32[x + 1*2] = BC1 ^ ((~BC2) & BC3);
1354                        BC4 = s32[x + 4*2];
1355                        s32[x + 2*2] = BC2 ^ ((~BC3) & BC4);
1356                        s32[x + 3*2] = BC3 ^ ((~BC4) & BC0);
1357                        s32[x + 4*2] = BC4 ^ ((~BC0) & BC1);
1358                        x += 9;
1359#  endif
1360                }
1361# endif /* long is 32-bit */
1362                /* Iota */
1363                state[0] ^= IOTA_CONST[round]
1364                        | (uint32_t)((IOTA_CONST_bit31 << round) & 0x80000000)
1365                        | (uint64_t)((IOTA_CONST_bit63 << round) & 0x80000000) << 32;
1366        }
1367
1368        if (BB_BIG_ENDIAN) {
1369                for (x = 0; x < 25; x++) {
1370                        state[x] = SWAP_LE64(state[x]);
1371                }
1372        }
1373#endif
1374}
1375
1376void FAST_FUNC sha3_begin(sha3_ctx_t *ctx)
1377{
1378        memset(ctx, 0, sizeof(*ctx));
1379        /* SHA3-512, user can override */
1380        ctx->input_block_bytes = (1600 - 512*2) / 8; /* 72 bytes */
1381}
1382
1383void FAST_FUNC sha3_hash(sha3_ctx_t *ctx, const void *buffer, size_t len)
1384{
1385#if SHA3_SMALL
1386        const uint8_t *data = buffer;
1387        unsigned bufpos = ctx->bytes_queued;
1388
1389        while (1) {
1390                unsigned remaining = ctx->input_block_bytes - bufpos;
1391                if (remaining > len)
1392                        remaining = len;
1393                len -= remaining;
1394                /* XOR data into buffer */
1395                while (remaining != 0) {
1396                        uint8_t *buf = (uint8_t*)ctx->state;
1397                        buf[bufpos] ^= *data++;
1398                        bufpos++;
1399                        remaining--;
1400                }
1401                /* Clever way to do "if (bufpos != N) break; ... ; bufpos = 0;" */
1402                bufpos -= ctx->input_block_bytes;
1403                if (bufpos != 0)
1404                        break;
1405                /* Buffer is filled up, process it */
1406                sha3_process_block72(ctx->state);
1407                /*bufpos = 0; - already is */
1408        }
1409        ctx->bytes_queued = bufpos + ctx->input_block_bytes;
1410#else
1411        /* +50 bytes code size, but a bit faster because of long-sized XORs */
1412        const uint8_t *data = buffer;
1413        unsigned bufpos = ctx->bytes_queued;
1414        unsigned iblk_bytes = ctx->input_block_bytes;
1415
1416        /* If already data in queue, continue queuing first */
1417        if (bufpos != 0) {
1418                while (len != 0) {
1419                        uint8_t *buf = (uint8_t*)ctx->state;
1420                        buf[bufpos] ^= *data++;
1421                        len--;
1422                        bufpos++;
1423                        if (bufpos == iblk_bytes) {
1424                                bufpos = 0;
1425                                goto do_block;
1426                        }
1427                }
1428        }
1429
1430        /* Absorb complete blocks */
1431        while (len >= iblk_bytes) {
1432                /* XOR data onto beginning of state[].
1433                 * We try to be efficient - operate one word at a time, not byte.
1434                 * Careful wrt unaligned access: can't just use "*(long*)data"!
1435                 */
1436                unsigned count = iblk_bytes / sizeof(long);
1437                long *buf = (long*)ctx->state;
1438                do {
1439                        long v;
1440                        move_from_unaligned_long(v, (long*)data);
1441                        *buf++ ^= v;
1442                        data += sizeof(long);
1443                } while (--count);
1444                len -= iblk_bytes;
1445 do_block:
1446                sha3_process_block72(ctx->state);
1447        }
1448
1449        /* Queue remaining data bytes */
1450        while (len != 0) {
1451                uint8_t *buf = (uint8_t*)ctx->state;
1452                buf[bufpos] ^= *data++;
1453                bufpos++;
1454                len--;
1455        }
1456
1457        ctx->bytes_queued = bufpos;
1458#endif
1459}
1460
1461unsigned FAST_FUNC sha3_end(sha3_ctx_t *ctx, void *resbuf)
1462{
1463        /* Padding */
1464        uint8_t *buf = (uint8_t*)ctx->state;
1465        /*
1466         * Keccak block padding is: add 1 bit after last bit of input,
1467         * then add zero bits until the end of block, and add the last 1 bit
1468         * (the last bit in the block) - the "10*1" pattern.
1469         * SHA3 standard appends additional two bits, 01,  before that padding:
1470         *
1471         * SHA3-224(M) = KECCAK[448](M||01, 224)
1472         * SHA3-256(M) = KECCAK[512](M||01, 256)
1473         * SHA3-384(M) = KECCAK[768](M||01, 384)
1474         * SHA3-512(M) = KECCAK[1024](M||01, 512)
1475         * (M is the input, || is bit concatenation)
1476         *
1477         * The 6 below contains 01 "SHA3" bits and the first 1 "Keccak" bit:
1478         */
1479        buf[ctx->bytes_queued]          ^= 6; /* bit pattern 00000110 */
1480        buf[ctx->input_block_bytes - 1] ^= 0x80;
1481
1482        sha3_process_block72(ctx->state);
1483
1484        /* Output */
1485        memcpy(resbuf, ctx->state, 64);
1486        return 64;
1487}
1488