/* busybox/libbb/hash_md5_sha.c */
   1/* vi: set sw=4 ts=4: */
   2/*
   3 * Utility routines.
   4 *
   5 * Copyright (C) 2010 Denys Vlasenko
   6 *
   7 * Licensed under GPLv2 or later, see file LICENSE in this source tree.
   8 */
   9#include "libbb.h"
  10
  11#define NEED_SHA512 (ENABLE_SHA512SUM || ENABLE_USE_BB_CRYPT_SHA)
  12
  13/* gcc 4.2.1 optimizes rotr64 better with inline than with macro
  14 * (for rotX32, there is no difference). Why? My guess is that
  15 * macro requires clever common subexpression elimination heuristics
  16 * in gcc, while inline basically forces it to happen.
  17 */
  18//#define rotl32(x,n) (((x) << (n)) | ((x) >> (32 - (n))))
  19static ALWAYS_INLINE uint32_t rotl32(uint32_t x, unsigned n)
  20{
  21        return (x << n) | (x >> (32 - n));
  22}
  23//#define rotr32(x,n) (((x) >> (n)) | ((x) << (32 - (n))))
  24static ALWAYS_INLINE uint32_t rotr32(uint32_t x, unsigned n)
  25{
  26        return (x >> n) | (x << (32 - n));
  27}
  28/* rotr64 in needed for sha512 only: */
  29//#define rotr64(x,n) (((x) >> (n)) | ((x) << (64 - (n))))
  30static ALWAYS_INLINE uint64_t rotr64(uint64_t x, unsigned n)
  31{
  32        return (x >> n) | (x << (64 - n));
  33}
  34
  35/* rotl64 only used for sha3 currently */
  36static ALWAYS_INLINE uint64_t rotl64(uint64_t x, unsigned n)
  37{
  38        return (x << n) | (x >> (64 - n));
  39}
  40
/* Process the remaining bytes in the buffer.
 * Shared finalization for md5/sha1/sha256 (all use a 64-byte block):
 * appends the mandatory 0x80 pad byte, zero-fills up to the length
 * field, stores the total message length in BITS in the last 8 bytes,
 * and runs process_block on the final block (or two blocks, when the
 * length field does not fit after the pad byte).
 * swap_needed: nonzero if the on-disk length byte order differs from
 * the host's (MD5 stores it little-endian, SHA big-endian).
 */
static void FAST_FUNC common64_end(md5_ctx_t *ctx, int swap_needed)
{
	unsigned bufpos = ctx->total64 & 63;
	/* Pad the buffer to the next 64-byte boundary with 0x80,0,0,0... */
	ctx->wbuffer[bufpos++] = 0x80;

	/* This loop iterates either once or twice, no more, no less */
	while (1) {
		unsigned remaining = 64 - bufpos;
		memset(ctx->wbuffer + bufpos, 0, remaining);
		/* Do we have enough space for the length count? */
		if (remaining >= 8) {
			/* Store the 64-bit counter of bits in the buffer */
			uint64_t t = ctx->total64 << 3;
			if (swap_needed)
				t = bb_bswap_64(t);
			/* wbuffer is suitably aligned for this */
			*(bb__aliased_uint64_t *) (&ctx->wbuffer[64 - 8]) = t;
		}
		ctx->process_block(ctx);
		if (remaining >= 8)
			break;
		/* Length did not fit: emit an all-padding second block */
		bufpos = 0;
	}
}
  67
  68
  69/*
  70 * Compute MD5 checksum of strings according to the
  71 * definition of MD5 in RFC 1321 from April 1992.
  72 *
  73 * Written by Ulrich Drepper <drepper@gnu.ai.mit.edu>, 1995.
  74 *
  75 * Copyright (C) 1995-1999 Free Software Foundation, Inc.
  76 * Copyright (C) 2001 Manuel Novoa III
  77 * Copyright (C) 2003 Glenn L. McGrath
  78 * Copyright (C) 2003 Erik Andersen
  79 *
  80 * Licensed under GPLv2 or later, see file LICENSE in this source tree.
  81 */
  82
  83/* 0: fastest, 3: smallest */
  84#if CONFIG_MD5_SMALL < 0
  85# define MD5_SMALL 0
  86#elif CONFIG_MD5_SMALL > 3
  87# define MD5_SMALL 3
  88#else
  89# define MD5_SMALL CONFIG_MD5_SMALL
  90#endif
  91
  92/* These are the four functions used in the four steps of the MD5 algorithm
  93 * and defined in the RFC 1321.  The first function is a little bit optimized
  94 * (as found in Colin Plumbs public domain implementation).
  95 * #define FF(b, c, d) ((b & c) | (~b & d))
  96 */
  97#undef FF
  98#undef FG
  99#undef FH
 100#undef FI
 101#define FF(b, c, d) (d ^ (b & (c ^ d)))
 102#define FG(b, c, d) FF(d, b, c)
 103#define FH(b, c, d) (b ^ c ^ d)
 104#define FI(b, c, d) (c ^ (b | ~d))
 105
/* Hash a single block, 64 bytes long and 4-byte aligned.
 * The input block is ctx->wbuffer; the running digest is ctx->hash[0..3].
 * Three size/speed variants are selected by MD5_SMALL (0..3): the larger
 * the value, the more of the round constants/orders are table-driven.
 */
static void FAST_FUNC md5_process_block64(md5_ctx_t *ctx)
{
#if MD5_SMALL > 0
	/* Before we start, one word to the strange constants.
	   They are defined in RFC 1321 as
	   T[i] = (int)(2^32 * fabs(sin(i))), i=1..64
	 */
	static const uint32_t C_array[] ALIGN4 = {
		/* round 1 */
		0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee,
		0xf57c0faf, 0x4787c62a, 0xa8304613, 0xfd469501,
		0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be,
		0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821,
		/* round 2 */
		0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa,
		0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8,
		0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed,
		0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a,
		/* round 3 */
		0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c,
		0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70,
		0x289b7ec6, 0xeaa127fa, 0xd4ef3085, 0x4881d05,
		0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665,
		/* round 4 */
		0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039,
		0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1,
		0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1,
		0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391
	};
	/* Index of the message word consumed by each step, per round */
	static const char P_array[] ALIGN1 = {
# if MD5_SMALL > 1
		0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, /* 1 */
# endif
		1, 6, 11, 0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, /* 2 */
		5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, /* 3 */
		0, 7, 14, 5, 12, 3, 10, 1, 8, 15, 6, 13, 4, 11, 2, 9  /* 4 */
	};
#endif
	uint32_t *words = (void*) ctx->wbuffer;
	uint32_t A = ctx->hash[0];
	uint32_t B = ctx->hash[1];
	uint32_t C = ctx->hash[2];
	uint32_t D = ctx->hash[3];

#if MD5_SMALL >= 2  /* 2 or 3 */

	/* Per-round left-rotation amounts (four per round) */
	static const char S_array[] ALIGN1 = {
		7, 12, 17, 22,
		5, 9, 14, 20,
		4, 11, 16, 23,
		6, 10, 15, 21
	};
	const uint32_t *pc;
	const char *pp;
	const char *ps;
	int i;
	uint32_t temp;

	/* MD5 consumes the message little-endian; swap in place on BE hosts */
	if (BB_BIG_ENDIAN)
		for (i = 0; i < 16; i++)
			words[i] = SWAP_LE32(words[i]);

# if MD5_SMALL == 3
	pc = C_array;
	pp = P_array;
	ps = S_array - 4; /* pre-decremented: bumped by 4 on step 0 below */

	for (i = 0; i < 64; i++) {
		if ((i & 0x0f) == 0)
			ps += 4;
		temp = A;
		switch (i >> 4) {
		case 0:
			temp += FF(B, C, D);
			break;
		case 1:
			temp += FG(B, C, D);
			break;
		case 2:
			temp += FH(B, C, D);
			break;
		default: /* case 3 */
			temp += FI(B, C, D);
		}
		temp += words[(int) (*pp++)] + *pc++;
		temp = rotl32(temp, ps[i & 3]);
		temp += B;
		/* Rotate the four state registers for the next step */
		A = D;
		D = C;
		C = B;
		B = temp;
	}
# else  /* MD5_SMALL == 2 */
	pc = C_array;
	pp = P_array;
	ps = S_array;

	/* Four explicit round loops, 16 steps each */
	for (i = 0; i < 16; i++) {
		temp = A + FF(B, C, D) + words[(int) (*pp++)] + *pc++;
		temp = rotl32(temp, ps[i & 3]);
		temp += B;
		A = D;
		D = C;
		C = B;
		B = temp;
	}
	ps += 4;
	for (i = 0; i < 16; i++) {
		temp = A + FG(B, C, D) + words[(int) (*pp++)] + *pc++;
		temp = rotl32(temp, ps[i & 3]);
		temp += B;
		A = D;
		D = C;
		C = B;
		B = temp;
	}
	ps += 4;
	for (i = 0; i < 16; i++) {
		temp = A + FH(B, C, D) + words[(int) (*pp++)] + *pc++;
		temp = rotl32(temp, ps[i & 3]);
		temp += B;
		A = D;
		D = C;
		C = B;
		B = temp;
	}
	ps += 4;
	for (i = 0; i < 16; i++) {
		temp = A + FI(B, C, D) + words[(int) (*pp++)] + *pc++;
		temp = rotl32(temp, ps[i & 3]);
		temp += B;
		A = D;
		D = C;
		C = B;
		B = temp;
	}
# endif
	/* Add checksum to the starting values */
	ctx->hash[0] += A;
	ctx->hash[1] += B;
	ctx->hash[2] += C;
	ctx->hash[3] += D;

#else  /* MD5_SMALL == 0 or 1 */

# if MD5_SMALL == 1
	const uint32_t *pc;
	const char *pp;
	int i;
# endif

	/* First round: using the given function, the context and a constant
	   the next context is computed.  Because the algorithm's processing
	   unit is a 32-bit word and it is determined to work on words in
	   little endian byte order we perhaps have to change the byte order
	   before the computation.  To reduce the work for the next steps
	   we save swapped words in WORDS array.  */
# undef OP
# define OP(a, b, c, d, s, T) \
	do { \
		a += FF(b, c, d) + (*words IF_BIG_ENDIAN(= SWAP_LE32(*words))) + T; \
		words++; \
		a = rotl32(a, s); \
		a += b; \
	} while (0)

	/* Round 1 */
# if MD5_SMALL == 1
	pc = C_array;
	for (i = 0; i < 4; i++) {
		OP(A, B, C, D, 7, *pc++);
		OP(D, A, B, C, 12, *pc++);
		OP(C, D, A, B, 17, *pc++);
		OP(B, C, D, A, 22, *pc++);
	}
# else
	OP(A, B, C, D, 7, 0xd76aa478);
	OP(D, A, B, C, 12, 0xe8c7b756);
	OP(C, D, A, B, 17, 0x242070db);
	OP(B, C, D, A, 22, 0xc1bdceee);
	OP(A, B, C, D, 7, 0xf57c0faf);
	OP(D, A, B, C, 12, 0x4787c62a);
	OP(C, D, A, B, 17, 0xa8304613);
	OP(B, C, D, A, 22, 0xfd469501);
	OP(A, B, C, D, 7, 0x698098d8);
	OP(D, A, B, C, 12, 0x8b44f7af);
	OP(C, D, A, B, 17, 0xffff5bb1);
	OP(B, C, D, A, 22, 0x895cd7be);
	OP(A, B, C, D, 7, 0x6b901122);
	OP(D, A, B, C, 12, 0xfd987193);
	OP(C, D, A, B, 17, 0xa679438e);
	OP(B, C, D, A, 22, 0x49b40821);
# endif
	/* Round 1 advanced 'words' over all 16 input words; rewind it */
	words -= 16;

	/* For the second to fourth round we have the possibly swapped words
	   in WORDS.  Redefine the macro to take an additional first
	   argument specifying the function to use.  */
# undef OP
# define OP(f, a, b, c, d, k, s, T) \
	do { \
		a += f(b, c, d) + words[k] + T; \
		a = rotl32(a, s); \
		a += b; \
	} while (0)

	/* Round 2 */
# if MD5_SMALL == 1
	pp = P_array;
	for (i = 0; i < 4; i++) {
		OP(FG, A, B, C, D, (int) (*pp++), 5, *pc++);
		OP(FG, D, A, B, C, (int) (*pp++), 9, *pc++);
		OP(FG, C, D, A, B, (int) (*pp++), 14, *pc++);
		OP(FG, B, C, D, A, (int) (*pp++), 20, *pc++);
	}
# else
	OP(FG, A, B, C, D, 1, 5, 0xf61e2562);
	OP(FG, D, A, B, C, 6, 9, 0xc040b340);
	OP(FG, C, D, A, B, 11, 14, 0x265e5a51);
	OP(FG, B, C, D, A, 0, 20, 0xe9b6c7aa);
	OP(FG, A, B, C, D, 5, 5, 0xd62f105d);
	OP(FG, D, A, B, C, 10, 9, 0x02441453);
	OP(FG, C, D, A, B, 15, 14, 0xd8a1e681);
	OP(FG, B, C, D, A, 4, 20, 0xe7d3fbc8);
	OP(FG, A, B, C, D, 9, 5, 0x21e1cde6);
	OP(FG, D, A, B, C, 14, 9, 0xc33707d6);
	OP(FG, C, D, A, B, 3, 14, 0xf4d50d87);
	OP(FG, B, C, D, A, 8, 20, 0x455a14ed);
	OP(FG, A, B, C, D, 13, 5, 0xa9e3e905);
	OP(FG, D, A, B, C, 2, 9, 0xfcefa3f8);
	OP(FG, C, D, A, B, 7, 14, 0x676f02d9);
	OP(FG, B, C, D, A, 12, 20, 0x8d2a4c8a);
# endif

	/* Round 3 */
# if MD5_SMALL == 1
	for (i = 0; i < 4; i++) {
		OP(FH, A, B, C, D, (int) (*pp++), 4, *pc++);
		OP(FH, D, A, B, C, (int) (*pp++), 11, *pc++);
		OP(FH, C, D, A, B, (int) (*pp++), 16, *pc++);
		OP(FH, B, C, D, A, (int) (*pp++), 23, *pc++);
	}
# else
	OP(FH, A, B, C, D, 5, 4, 0xfffa3942);
	OP(FH, D, A, B, C, 8, 11, 0x8771f681);
	OP(FH, C, D, A, B, 11, 16, 0x6d9d6122);
	OP(FH, B, C, D, A, 14, 23, 0xfde5380c);
	OP(FH, A, B, C, D, 1, 4, 0xa4beea44);
	OP(FH, D, A, B, C, 4, 11, 0x4bdecfa9);
	OP(FH, C, D, A, B, 7, 16, 0xf6bb4b60);
	OP(FH, B, C, D, A, 10, 23, 0xbebfbc70);
	OP(FH, A, B, C, D, 13, 4, 0x289b7ec6);
	OP(FH, D, A, B, C, 0, 11, 0xeaa127fa);
	OP(FH, C, D, A, B, 3, 16, 0xd4ef3085);
	OP(FH, B, C, D, A, 6, 23, 0x04881d05);
	OP(FH, A, B, C, D, 9, 4, 0xd9d4d039);
	OP(FH, D, A, B, C, 12, 11, 0xe6db99e5);
	OP(FH, C, D, A, B, 15, 16, 0x1fa27cf8);
	OP(FH, B, C, D, A, 2, 23, 0xc4ac5665);
# endif

	/* Round 4 */
# if MD5_SMALL == 1
	for (i = 0; i < 4; i++) {
		OP(FI, A, B, C, D, (int) (*pp++), 6, *pc++);
		OP(FI, D, A, B, C, (int) (*pp++), 10, *pc++);
		OP(FI, C, D, A, B, (int) (*pp++), 15, *pc++);
		OP(FI, B, C, D, A, (int) (*pp++), 21, *pc++);
	}
# else
	OP(FI, A, B, C, D, 0, 6, 0xf4292244);
	OP(FI, D, A, B, C, 7, 10, 0x432aff97);
	OP(FI, C, D, A, B, 14, 15, 0xab9423a7);
	OP(FI, B, C, D, A, 5, 21, 0xfc93a039);
	OP(FI, A, B, C, D, 12, 6, 0x655b59c3);
	OP(FI, D, A, B, C, 3, 10, 0x8f0ccc92);
	OP(FI, C, D, A, B, 10, 15, 0xffeff47d);
	OP(FI, B, C, D, A, 1, 21, 0x85845dd1);
	OP(FI, A, B, C, D, 8, 6, 0x6fa87e4f);
	OP(FI, D, A, B, C, 15, 10, 0xfe2ce6e0);
	OP(FI, C, D, A, B, 6, 15, 0xa3014314);
	OP(FI, B, C, D, A, 13, 21, 0x4e0811a1);
	OP(FI, A, B, C, D, 4, 6, 0xf7537e82);
	OP(FI, D, A, B, C, 11, 10, 0xbd3af235);
	OP(FI, C, D, A, B, 2, 15, 0x2ad7d2bb);
	OP(FI, B, C, D, A, 9, 21, 0xeb86d391);
# undef OP
# endif
	/* Add checksum to the starting values */
	ctx->hash[0] += A;
	ctx->hash[1] += B;
	ctx->hash[2] += C;
	ctx->hash[3] += D;
#endif
}
 402#undef FF
 403#undef FG
 404#undef FH
 405#undef FI
 406
 407/* Initialize structure containing state of computation.
 408 * (RFC 1321, 3.3: Step 3)
 409 */
 410void FAST_FUNC md5_begin(md5_ctx_t *ctx)
 411{
 412        ctx->hash[0] = 0x67452301;
 413        ctx->hash[1] = 0xefcdab89;
 414        ctx->hash[2] = 0x98badcfe;
 415        ctx->hash[3] = 0x10325476;
 416        ctx->total64 = 0;
 417        ctx->process_block = md5_process_block64;
 418}
 419
/* Used also for sha1 and sha256.
 * Feed 'len' bytes from 'buffer' into the hash: accumulate into the
 * 64-byte work buffer and run process_block on each full block.
 */
void FAST_FUNC md5_hash(md5_ctx_t *ctx, const void *buffer, size_t len)
{
	unsigned bufpos = ctx->total64 & 63;

	ctx->total64 += len;

	while (1) {
		unsigned remaining = 64 - bufpos;
		if (remaining > len)
			remaining = len;
		/* Copy data into aligned buffer */
		memcpy(ctx->wbuffer + bufpos, buffer, remaining);
		len -= remaining;
		buffer = (const char *)buffer + remaining;
		bufpos += remaining;

		/* Clever way to do "if (bufpos != N) break; ... ; bufpos = 0;" */
		/* (unsigned wraparound: a partial block gives a nonzero value) */
		bufpos -= 64;
		if (bufpos != 0)
			break;

		/* Buffer is filled up, process it */
		ctx->process_block(ctx);
		/*bufpos = 0; - already is */
	}
}
 447
 448/* Process the remaining bytes in the buffer and put result from CTX
 449 * in first 16 bytes following RESBUF.  The result is always in little
 450 * endian byte order, so that a byte-wise output yields to the wanted
 451 * ASCII representation of the message digest.
 452 */
 453unsigned FAST_FUNC md5_end(md5_ctx_t *ctx, void *resbuf)
 454{
 455        /* MD5 stores total in LE, need to swap on BE arches: */
 456        common64_end(ctx, /*swap_needed:*/ BB_BIG_ENDIAN);
 457
 458        /* The MD5 result is in little endian byte order */
 459        if (BB_BIG_ENDIAN) {
 460                ctx->hash[0] = SWAP_LE32(ctx->hash[0]);
 461                ctx->hash[1] = SWAP_LE32(ctx->hash[1]);
 462                ctx->hash[2] = SWAP_LE32(ctx->hash[2]);
 463                ctx->hash[3] = SWAP_LE32(ctx->hash[3]);
 464        }
 465
 466        memcpy(resbuf, ctx->hash, sizeof(ctx->hash[0]) * 4);
 467        return sizeof(ctx->hash[0]) * 4;
 468}
 469
 470
 471/*
 472 * SHA1 part is:
 473 * Copyright 2007 Rob Landley <rob@landley.net>
 474 *
 475 * Based on the public domain SHA-1 in C by Steve Reid <steve@edmweb.com>
 476 * from http://www.mirrors.wiretapped.net/security/cryptography/hashes/sha1/
 477 *
 478 * Licensed under GPLv2, see file LICENSE in this source tree.
 479 *
 480 * ---------------------------------------------------------------------------
 481 *
 482 * SHA256 and SHA512 parts are:
 483 * Released into the Public Domain by Ulrich Drepper <drepper@redhat.com>.
 484 * Shrank by Denys Vlasenko.
 485 *
 486 * ---------------------------------------------------------------------------
 487 *
 488 * The best way to test random blocksizes is to go to coreutils/md5_sha1_sum.c
 489 * and replace "4096" with something like "2000 + time(NULL) % 2097",
 490 * then rebuild and compare "shaNNNsum bigfile" results.
 491 */
 492
/* Hash one 64-byte block into ctx->hash[0..4] (FIPS 180 SHA-1).
 * The 80-step schedule is kept in a 16-entry circular window; W[] is
 * doubled to 32 entries so W[cnt+13], W[cnt+8], W[cnt+2] can be read
 * without masking each index.
 */
static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
{
	static const uint32_t rconsts[] ALIGN4 = {
		0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6
	};
	int i, j;
	int cnt;
	uint32_t W[16+16];
	uint32_t a, b, c, d, e;

	/* On-stack work buffer frees up one register in the main loop
	 * which otherwise will be needed to hold ctx pointer */
	for (i = 0; i < 16; i++)
		W[i] = W[i+16] = SWAP_BE32(((uint32_t*)ctx->wbuffer)[i]);

	a = ctx->hash[0];
	b = ctx->hash[1];
	c = ctx->hash[2];
	d = ctx->hash[3];
	e = ctx->hash[4];

	/* 4 rounds of 20 operations each */
	cnt = 0;
	for (i = 0; i < 4; i++) {
		j = 19;
		do {
			uint32_t work;

			work = c ^ d;
			if (i == 0) {
				/* Round 0 F-function: Ch(b,c,d) */
				work = (work & b) ^ d;
				/* Last 4 steps of round 0 already need
				 * schedule expansion, like rounds 1-3 */
				if (j <= 3)
					goto ge16;
				/* Used to do SWAP_BE32 here, but this
				 * requires ctx (see comment above) */
				work += W[cnt];
			} else {
				if (i == 2)
					/* Round 2 F-function: Maj(b,c,d) */
					work = ((b | c) & d) | (b & c);
				else /* i = 1 or 3 */
					/* Parity F-function: b^c^d */
					work ^= b;
 ge16:
				/* Schedule expansion, mirrored into both
				 * halves of the doubled window */
				W[cnt] = W[cnt+16] = rotl32(W[cnt+13] ^ W[cnt+8] ^ W[cnt+2] ^ W[cnt], 1);
				work += W[cnt];
			}
			work += e + rotl32(a, 5) + rconsts[i];

			/* Rotate by one for next time */
			e = d;
			d = c;
			c = /* b = */ rotl32(b, 30);
			b = a;
			a = work;
			cnt = (cnt + 1) & 15;
		} while (--j >= 0);
	}

	ctx->hash[0] += a;
	ctx->hash[1] += b;
	ctx->hash[2] += c;
	ctx->hash[3] += d;
	ctx->hash[4] += e;
}
 556
/* Constants for SHA512 from FIPS 180-2:4.2.3.
 * SHA256 constants from FIPS 180-2:4.2.2
 * are the most significant half of first 64 elements
 * of the same array.
 */
#undef K
#if NEED_SHA512
typedef uint64_t sha_K_int;
# define K(v) v
#else
/* sha512 not built: store only the upper halves (= sha256 constants)
 * to halve the table size */
typedef uint32_t sha_K_int;
# define K(v) (uint32_t)(v >> 32)
#endif
static const sha_K_int sha_K[] ALIGN8 = {
	K(0x428a2f98d728ae22ULL), K(0x7137449123ef65cdULL),
	K(0xb5c0fbcfec4d3b2fULL), K(0xe9b5dba58189dbbcULL),
	K(0x3956c25bf348b538ULL), K(0x59f111f1b605d019ULL),
	K(0x923f82a4af194f9bULL), K(0xab1c5ed5da6d8118ULL),
	K(0xd807aa98a3030242ULL), K(0x12835b0145706fbeULL),
	K(0x243185be4ee4b28cULL), K(0x550c7dc3d5ffb4e2ULL),
	K(0x72be5d74f27b896fULL), K(0x80deb1fe3b1696b1ULL),
	K(0x9bdc06a725c71235ULL), K(0xc19bf174cf692694ULL),
	K(0xe49b69c19ef14ad2ULL), K(0xefbe4786384f25e3ULL),
	K(0x0fc19dc68b8cd5b5ULL), K(0x240ca1cc77ac9c65ULL),
	K(0x2de92c6f592b0275ULL), K(0x4a7484aa6ea6e483ULL),
	K(0x5cb0a9dcbd41fbd4ULL), K(0x76f988da831153b5ULL),
	K(0x983e5152ee66dfabULL), K(0xa831c66d2db43210ULL),
	K(0xb00327c898fb213fULL), K(0xbf597fc7beef0ee4ULL),
	K(0xc6e00bf33da88fc2ULL), K(0xd5a79147930aa725ULL),
	K(0x06ca6351e003826fULL), K(0x142929670a0e6e70ULL),
	K(0x27b70a8546d22ffcULL), K(0x2e1b21385c26c926ULL),
	K(0x4d2c6dfc5ac42aedULL), K(0x53380d139d95b3dfULL),
	K(0x650a73548baf63deULL), K(0x766a0abb3c77b2a8ULL),
	K(0x81c2c92e47edaee6ULL), K(0x92722c851482353bULL),
	K(0xa2bfe8a14cf10364ULL), K(0xa81a664bbc423001ULL),
	K(0xc24b8b70d0f89791ULL), K(0xc76c51a30654be30ULL),
	K(0xd192e819d6ef5218ULL), K(0xd69906245565a910ULL),
	K(0xf40e35855771202aULL), K(0x106aa07032bbd1b8ULL),
	K(0x19a4c116b8d2d0c8ULL), K(0x1e376c085141ab53ULL),
	K(0x2748774cdf8eeb99ULL), K(0x34b0bcb5e19b48a8ULL),
	K(0x391c0cb3c5c95a63ULL), K(0x4ed8aa4ae3418acbULL),
	K(0x5b9cca4f7763e373ULL), K(0x682e6ff3d6b2b8a3ULL),
	K(0x748f82ee5defb2fcULL), K(0x78a5636f43172f60ULL),
	K(0x84c87814a1f0ab72ULL), K(0x8cc702081a6439ecULL),
	K(0x90befffa23631e28ULL), K(0xa4506cebde82bde9ULL),
	K(0xbef9a3f7b2c67915ULL), K(0xc67178f2e372532bULL),
#if NEED_SHA512  /* [64]+ are used for sha512 only */
	K(0xca273eceea26619cULL), K(0xd186b8c721c0c207ULL),
	K(0xeada7dd6cde0eb1eULL), K(0xf57d4f7fee6ed178ULL),
	K(0x06f067aa72176fbaULL), K(0x0a637dc5a2c898a6ULL),
	K(0x113f9804bef90daeULL), K(0x1b710b35131c471bULL),
	K(0x28db77f523047d84ULL), K(0x32caab7b40c72493ULL),
	K(0x3c9ebe0a15c9bebcULL), K(0x431d67c49c100d4cULL),
	K(0x4cc5d4becb3e42b6ULL), K(0x597f299cfc657e2aULL),
	K(0x5fcb6fab3ad6faecULL), K(0x6c44198c4a475817ULL),
#endif
};
#undef K
 615
 616#undef Ch
 617#undef Maj
 618#undef S0
 619#undef S1
 620#undef R0
 621#undef R1
 622
/* Hash one 64-byte block into ctx->hash[0..7] (FIPS 180-2 SHA-256).
 * The helper macros are defined here and #undef'd at the end so the
 * sha512 version below can reuse the same names with 64-bit rotates.
 */
static void FAST_FUNC sha256_process_block64(sha256_ctx_t *ctx)
{
	unsigned t;
	uint32_t W[64], a, b, c, d, e, f, g, h;
	const uint32_t *words = (uint32_t*) ctx->wbuffer;

	/* Operators defined in FIPS 180-2:4.1.2.  */
#define Ch(x, y, z) ((x & y) ^ (~x & z))
#define Maj(x, y, z) ((x & y) ^ (x & z) ^ (y & z))
#define S0(x) (rotr32(x, 2) ^ rotr32(x, 13) ^ rotr32(x, 22))
#define S1(x) (rotr32(x, 6) ^ rotr32(x, 11) ^ rotr32(x, 25))
#define R0(x) (rotr32(x, 7) ^ rotr32(x, 18) ^ (x >> 3))
#define R1(x) (rotr32(x, 17) ^ rotr32(x, 19) ^ (x >> 10))

	/* Compute the message schedule according to FIPS 180-2:6.2.2 step 2.  */
	for (t = 0; t < 16; ++t)
		W[t] = SWAP_BE32(words[t]);
	for (/*t = 16*/; t < 64; ++t)
		W[t] = R1(W[t - 2]) + W[t - 7] + R0(W[t - 15]) + W[t - 16];

	a = ctx->hash[0];
	b = ctx->hash[1];
	c = ctx->hash[2];
	d = ctx->hash[3];
	e = ctx->hash[4];
	f = ctx->hash[5];
	g = ctx->hash[6];
	h = ctx->hash[7];

	/* The actual computation according to FIPS 180-2:6.2.2 step 3.  */
	for (t = 0; t < 64; ++t) {
		/* Need to fetch upper half of sha_K[t]
		 * (I hope compiler is clever enough to just fetch
		 * upper half)
		 */
		uint32_t K_t = NEED_SHA512 ? (sha_K[t] >> 32) : sha_K[t];
		uint32_t T1 = h + S1(e) + Ch(e, f, g) + K_t + W[t];
		uint32_t T2 = S0(a) + Maj(a, b, c);
		/* Shift the eight working registers down one slot */
		h = g;
		g = f;
		f = e;
		e = d + T1;
		d = c;
		c = b;
		b = a;
		a = T1 + T2;
	}
#undef Ch
#undef Maj
#undef S0
#undef S1
#undef R0
#undef R1
	/* Add the starting values of the context according to FIPS 180-2:6.2.2
	   step 4.  */
	ctx->hash[0] += a;
	ctx->hash[1] += b;
	ctx->hash[2] += c;
	ctx->hash[3] += d;
	ctx->hash[4] += e;
	ctx->hash[5] += f;
	ctx->hash[6] += g;
	ctx->hash[7] += h;
}
 687
 688#if NEED_SHA512
/* Hash one 128-byte block into ctx->hash[0..7] (FIPS 180-2 SHA-512).
 * Same structure as sha256_process_block64 above, with 64-bit words,
 * 80 schedule entries and 64-bit rotate amounts.
 */
static void FAST_FUNC sha512_process_block128(sha512_ctx_t *ctx)
{
	unsigned t;
	uint64_t W[80];
	/* On i386, having assignments here (not later as sha256 does)
	 * produces 99 bytes smaller code with gcc 4.3.1
	 */
	uint64_t a = ctx->hash[0];
	uint64_t b = ctx->hash[1];
	uint64_t c = ctx->hash[2];
	uint64_t d = ctx->hash[3];
	uint64_t e = ctx->hash[4];
	uint64_t f = ctx->hash[5];
	uint64_t g = ctx->hash[6];
	uint64_t h = ctx->hash[7];
	const uint64_t *words = (uint64_t*) ctx->wbuffer;

	/* Operators defined in FIPS 180-2:4.1.2.  */
#define Ch(x, y, z) ((x & y) ^ (~x & z))
#define Maj(x, y, z) ((x & y) ^ (x & z) ^ (y & z))
#define S0(x) (rotr64(x, 28) ^ rotr64(x, 34) ^ rotr64(x, 39))
#define S1(x) (rotr64(x, 14) ^ rotr64(x, 18) ^ rotr64(x, 41))
#define R0(x) (rotr64(x, 1) ^ rotr64(x, 8) ^ (x >> 7))
#define R1(x) (rotr64(x, 19) ^ rotr64(x, 61) ^ (x >> 6))

	/* Compute the message schedule according to FIPS 180-2:6.3.2 step 2.  */
	for (t = 0; t < 16; ++t)
		W[t] = SWAP_BE64(words[t]);
	for (/*t = 16*/; t < 80; ++t)
		W[t] = R1(W[t - 2]) + W[t - 7] + R0(W[t - 15]) + W[t - 16];

	/* The actual computation according to FIPS 180-2:6.3.2 step 3.  */
	for (t = 0; t < 80; ++t) {
		uint64_t T1 = h + S1(e) + Ch(e, f, g) + sha_K[t] + W[t];
		uint64_t T2 = S0(a) + Maj(a, b, c);
		/* Shift the eight working registers down one slot */
		h = g;
		g = f;
		f = e;
		e = d + T1;
		d = c;
		c = b;
		b = a;
		a = T1 + T2;
	}
#undef Ch
#undef Maj
#undef S0
#undef S1
#undef R0
#undef R1
	/* Add the starting values of the context according to FIPS 180-2:6.3.2
	   step 4.  */
	ctx->hash[0] += a;
	ctx->hash[1] += b;
	ctx->hash[2] += c;
	ctx->hash[3] += d;
	ctx->hash[4] += e;
	ctx->hash[5] += f;
	ctx->hash[6] += g;
	ctx->hash[7] += h;
}
 750#endif /* NEED_SHA512 */
 751
 752void FAST_FUNC sha1_begin(sha1_ctx_t *ctx)
 753{
 754        ctx->hash[0] = 0x67452301;
 755        ctx->hash[1] = 0xefcdab89;
 756        ctx->hash[2] = 0x98badcfe;
 757        ctx->hash[3] = 0x10325476;
 758        ctx->hash[4] = 0xc3d2e1f0;
 759        ctx->total64 = 0;
 760        ctx->process_block = sha1_process_block64;
 761}
 762
/* SHA-256 initial hash values (FIPS 180-2:5.3.2), prefixed with two
 * 32-bit zeros so that sha256_begin() can initialize total64 and
 * hash[] together with one memcpy (see comment there).
 */
static const uint32_t init256[] ALIGN4 = {
	0,
	0,
	0x6a09e667,
	0xbb67ae85,
	0x3c6ef372,
	0xa54ff53a,
	0x510e527f,
	0x9b05688c,
	0x1f83d9ab,
	0x5be0cd19,
};
#if NEED_SHA512
/* Low 32-bit halves of the SHA-512 initial hash values; sha512_begin()
 * combines them with init256[] (the high halves) into 64-bit words.
 */
static const uint32_t init512_lo[] ALIGN4 = {
	0,
	0,
	0xf3bcc908,
	0x84caa73b,
	0xfe94f82b,
	0x5f1d36f1,
	0xade682d1,
	0x2b3e6c1f,
	0xfb41bd6b,
	0x137e2179,
};
#endif /* NEED_SHA512 */
 789
 790// Note: SHA-384 is identical to SHA-512, except that initial hash values are
 791// 0xcbbb9d5dc1059ed8, 0x629a292a367cd507, 0x9159015a3070dd17, 0x152fecd8f70e5939,
 792// 0x67332667ffc00b31, 0x8eb44a8768581511, 0xdb0c2e0d64f98fa7, 0x47b5481dbefa4fa4,
 793// and the output is constructed by omitting last two 64-bit words of it.
 794
/* Initialize structure containing state of computation.
   (FIPS 180-2:5.3.2)  */
void FAST_FUNC sha256_begin(sha256_ctx_t *ctx)
{
	/* One memcpy covers both the total64 counter (the two leading
	 * zeros in init256) and hash[0..7] — relies on hash[] directly
	 * following total64 in the ctx layout (see libbb.h). */
	memcpy(&ctx->total64, init256, sizeof(init256));
	/*ctx->total64 = 0; - done by prepending two 32-bit zeros to init256 */
	ctx->process_block = sha256_process_block64;
}
 803
 804#if NEED_SHA512
/* Initialize structure containing state of computation.
   (FIPS 180-2:5.3.3)  */
void FAST_FUNC sha512_begin(sha512_ctx_t *ctx)
{
	int i;
	/* Two extra iterations zero out ctx->total64[2] */
	uint64_t *tp = ctx->total64;
	/* Build each 64-bit initial value from its two 32-bit halves:
	 * init256[] holds the high halves, init512_lo[] the low halves.
	 * Writing tp[2..9] fills hash[0..7], which follows total64[2]
	 * in the ctx layout (see libbb.h). */
	for (i = 0; i < 8 + 2; i++)
		tp[i] = ((uint64_t)(init256[i]) << 32) + init512_lo[i];
	/*ctx->total64[0] = ctx->total64[1] = 0; - already done */
}
 816
/* Feed 'len' bytes into the SHA-512 state: accumulate into the
 * 128-byte work buffer, processing each full block. Same structure
 * as md5_hash() above, but with a 128-bit byte counter.
 */
void FAST_FUNC sha512_hash(sha512_ctx_t *ctx, const void *buffer, size_t len)
{
	unsigned bufpos = ctx->total64[0] & 127;
	unsigned remaining;

	/* First increment the byte count.  FIPS 180-2 specifies the possible
	   length of the file up to 2^128 _bits_.
	   We compute the number of _bytes_ and convert to bits later.  */
	ctx->total64[0] += len;
	if (ctx->total64[0] < len)
		ctx->total64[1]++;

	while (1) {
		remaining = 128 - bufpos;
		if (remaining > len)
			remaining = len;
		/* Copy data into aligned buffer */
		memcpy(ctx->wbuffer + bufpos, buffer, remaining);
		len -= remaining;
		buffer = (const char *)buffer + remaining;
		bufpos += remaining;

		/* Clever way to do "if (bufpos != N) break; ... ; bufpos = 0;" */
		/* (unsigned wraparound: a partial block gives a nonzero value) */
		bufpos -= 128;
		if (bufpos != 0)
			break;

		/* Buffer is filled up, process it */
		sha512_process_block128(ctx);
		/*bufpos = 0; - already is */
	}
}
 849#endif /* NEED_SHA512 */
 850
 851/* Used also for sha256 */
 852unsigned FAST_FUNC sha1_end(sha1_ctx_t *ctx, void *resbuf)
 853{
 854        unsigned hash_size;
 855
 856        /* SHA stores total in BE, need to swap on LE arches: */
 857        common64_end(ctx, /*swap_needed:*/ BB_LITTLE_ENDIAN);
 858
 859        hash_size = (ctx->process_block == sha1_process_block64) ? 5 : 8;
 860        /* This way we do not impose alignment constraints on resbuf: */
 861        if (BB_LITTLE_ENDIAN) {
 862                unsigned i;
 863                for (i = 0; i < hash_size; ++i)
 864                        ctx->hash[i] = SWAP_BE32(ctx->hash[i]);
 865        }
 866        hash_size *= sizeof(ctx->hash[0]);
 867        memcpy(resbuf, ctx->hash, hash_size);
 868        return hash_size;
 869}
 870
 871#if NEED_SHA512
unsigned FAST_FUNC sha512_end(sha512_ctx_t *ctx, void *resbuf)
{
        unsigned bufpos = ctx->total64[0] & 127;

        /* Pad the buffer to the next 128-byte boundary with 0x80,0,0,0... */
        ctx->wbuffer[bufpos++] = 0x80;

        /* This loop iterates either once or twice, no more, no less:
         * twice only when fewer than 16 bytes remain for the counter */
        while (1) {
                unsigned remaining = 128 - bufpos;
                memset(ctx->wbuffer + bufpos, 0, remaining);
                if (remaining >= 16) {
                        /* Store the 128-bit counter of bits in the buffer in BE format */
                        uint64_t t;
                        t = ctx->total64[0] << 3; /* bytes -> bits */
                        t = SWAP_BE64(t);
                        *(bb__aliased_uint64_t *) (&ctx->wbuffer[128 - 8]) = t;
                        /* High 64 bits of the bit count */
                        t = (ctx->total64[1] << 3) | (ctx->total64[0] >> 61);
                        t = SWAP_BE64(t);
                        *(bb__aliased_uint64_t *) (&ctx->wbuffer[128 - 16]) = t;
                }
                sha512_process_block128(ctx);
                if (remaining >= 16)
                        break;
                bufpos = 0;
        }

        /* Hash words are stored big-endian; swap in place on LE arches */
        if (BB_LITTLE_ENDIAN) {
                unsigned i;
                for (i = 0; i < ARRAY_SIZE(ctx->hash); ++i)
                        ctx->hash[i] = SWAP_BE64(ctx->hash[i]);
        }
        memcpy(resbuf, ctx->hash, sizeof(ctx->hash));
        return sizeof(ctx->hash);
}
 906#endif /* NEED_SHA512 */
 907
 908
 909/*
 910 * The Keccak sponge function, designed by Guido Bertoni, Joan Daemen,
 911 * Michael Peeters and Gilles Van Assche. For more information, feedback or
 912 * questions, please refer to our website: http://keccak.noekeon.org/
 913 *
 914 * Implementation by Ronny Van Keer,
 915 * hereby denoted as "the implementer".
 916 *
 917 * To the extent possible under law, the implementer has waived all copyright
 918 * and related or neighboring rights to the source code in this file.
 919 * http://creativecommons.org/publicdomain/zero/1.0/
 920 *
 921 * Busybox modifications (C) Lauri Kasanen, under the GPLv2.
 922 */
 923
 924#if CONFIG_SHA3_SMALL < 0
 925# define SHA3_SMALL 0
 926#elif CONFIG_SHA3_SMALL > 1
 927# define SHA3_SMALL 1
 928#else
 929# define SHA3_SMALL CONFIG_SHA3_SMALL
 930#endif
 931
 932#define OPTIMIZE_SHA3_FOR_32 0
 933/*
 934 * SHA3 can be optimized for 32-bit CPUs with bit-slicing:
 935 * every 64-bit word of state[] can be split into two 32-bit words
 936 * by even/odd bits. In this form, all rotations of sha3 round
 937 * are 32-bit - and there are lots of them.
 938 * However, it requires either splitting/combining state words
 939 * before/after sha3 round (code does this now)
 940 * or shuffling bits before xor'ing them into state and in sha3_end.
 941 * Without shuffling, bit-slicing results in -130 bytes of code
 942 * and marginal speedup (but of course it gives wrong result).
 943 * With shuffling it works, but +260 code bytes, and slower.
 944 * Disabled for now:
 945 */
 946#if 0 /* LONG_MAX == 0x7fffffff */
 947# undef OPTIMIZE_SHA3_FOR_32
 948# define OPTIMIZE_SHA3_FOR_32 1
 949#endif
 950
 951#if OPTIMIZE_SHA3_FOR_32
 952/* This splits every 64-bit word into a pair of 32-bit words,
 953 * even bits go into first word, odd bits go to second one.
 954 * The conversion is done in-place.
 955 */
 956static void split_halves(uint64_t *state)
 957{
 958        /* Credit: Henry S. Warren, Hacker's Delight, Addison-Wesley, 2002 */
 959        uint32_t *s32 = (uint32_t*)state;
 960        uint32_t t, x0, x1;
 961        int i;
 962        for (i = 24; i >= 0; --i) {
 963                x0 = s32[0];
 964                t = (x0 ^ (x0 >> 1)) & 0x22222222; x0 = x0 ^ t ^ (t << 1);
 965                t = (x0 ^ (x0 >> 2)) & 0x0C0C0C0C; x0 = x0 ^ t ^ (t << 2);
 966                t = (x0 ^ (x0 >> 4)) & 0x00F000F0; x0 = x0 ^ t ^ (t << 4);
 967                t = (x0 ^ (x0 >> 8)) & 0x0000FF00; x0 = x0 ^ t ^ (t << 8);
 968                x1 = s32[1];
 969                t = (x1 ^ (x1 >> 1)) & 0x22222222; x1 = x1 ^ t ^ (t << 1);
 970                t = (x1 ^ (x1 >> 2)) & 0x0C0C0C0C; x1 = x1 ^ t ^ (t << 2);
 971                t = (x1 ^ (x1 >> 4)) & 0x00F000F0; x1 = x1 ^ t ^ (t << 4);
 972                t = (x1 ^ (x1 >> 8)) & 0x0000FF00; x1 = x1 ^ t ^ (t << 8);
 973                *s32++ = (x0 & 0x0000FFFF) | (x1 << 16);
 974                *s32++ = (x0 >> 16) | (x1 & 0xFFFF0000);
 975        }
 976}
 977/* The reverse operation */
 978static void combine_halves(uint64_t *state)
 979{
 980        uint32_t *s32 = (uint32_t*)state;
 981        uint32_t t, x0, x1;
 982        int i;
 983        for (i = 24; i >= 0; --i) {
 984                x0 = s32[0];
 985                x1 = s32[1];
 986                t = (x0 & 0x0000FFFF) | (x1 << 16);
 987                x1 = (x0 >> 16) | (x1 & 0xFFFF0000);
 988                x0 = t;
 989                t = (x0 ^ (x0 >> 8)) & 0x0000FF00; x0 = x0 ^ t ^ (t << 8);
 990                t = (x0 ^ (x0 >> 4)) & 0x00F000F0; x0 = x0 ^ t ^ (t << 4);
 991                t = (x0 ^ (x0 >> 2)) & 0x0C0C0C0C; x0 = x0 ^ t ^ (t << 2);
 992                t = (x0 ^ (x0 >> 1)) & 0x22222222; x0 = x0 ^ t ^ (t << 1);
 993                *s32++ = x0;
 994                t = (x1 ^ (x1 >> 8)) & 0x0000FF00; x1 = x1 ^ t ^ (t << 8);
 995                t = (x1 ^ (x1 >> 4)) & 0x00F000F0; x1 = x1 ^ t ^ (t << 4);
 996                t = (x1 ^ (x1 >> 2)) & 0x0C0C0C0C; x1 = x1 ^ t ^ (t << 2);
 997                t = (x1 ^ (x1 >> 1)) & 0x22222222; x1 = x1 ^ t ^ (t << 1);
 998                *s32++ = x1;
 999        }
1000}
1001#endif
1002
1003/*
1004 * In the crypto literature this function is usually called Keccak-f().
1005 */
static void sha3_process_block72(uint64_t *state)
{
        enum { NROUNDS = 24 };
        /* Keccak-f permutation: NROUNDS rounds of Theta, Rho+Pi, Chi and
         * Iota steps, applied in place to the 25-lane (25 x 64-bit) state. */

#if OPTIMIZE_SHA3_FOR_32
        /*
        static const uint32_t IOTA_CONST_0[NROUNDS] ALIGN4 = {
                0x00000001UL,
                0x00000000UL,
                0x00000000UL,
                0x00000000UL,
                0x00000001UL,
                0x00000001UL,
                0x00000001UL,
                0x00000001UL,
                0x00000000UL,
                0x00000000UL,
                0x00000001UL,
                0x00000000UL,
                0x00000001UL,
                0x00000001UL,
                0x00000001UL,
                0x00000001UL,
                0x00000000UL,
                0x00000000UL,
                0x00000000UL,
                0x00000000UL,
                0x00000001UL,
                0x00000000UL,
                0x00000001UL,
                0x00000000UL,
        };
        ** bits are in lsb: 0101 0000 1111 0100 1111 0001
        */
        uint32_t IOTA_CONST_0bits = (uint32_t)(0x0050f4f1);
        static const uint32_t IOTA_CONST_1[NROUNDS] ALIGN4 = {
                0x00000000UL,
                0x00000089UL,
                0x8000008bUL,
                0x80008080UL,
                0x0000008bUL,
                0x00008000UL,
                0x80008088UL,
                0x80000082UL,
                0x0000000bUL,
                0x0000000aUL,
                0x00008082UL,
                0x00008003UL,
                0x0000808bUL,
                0x8000000bUL,
                0x8000008aUL,
                0x80000081UL,
                0x80000081UL,
                0x80000008UL,
                0x00000083UL,
                0x80008003UL,
                0x80008088UL,
                0x80000088UL,
                0x00008000UL,
                0x80008082UL,
        };

        /* Bit-sliced view: lane k is kept as its even bits in s32[2k]
         * and its odd bits in s32[2k+1] (see split_halves) */
        uint32_t *const s32 = (uint32_t*)state;
        unsigned round;

        split_halves(state);

        for (round = 0; round < NROUNDS; round++) {
                unsigned x;

                /* Theta */
                {
                        uint32_t BC[20];
                        for (x = 0; x < 10; ++x) {
                                BC[x+10] = BC[x] = s32[x]^s32[x+10]^s32[x+20]^s32[x+30]^s32[x+40];
                        }
                        for (x = 0; x < 10; x += 2) {
                                uint32_t ta, tb;
                                ta = BC[x+8] ^ rotl32(BC[x+3], 1);
                                tb = BC[x+9] ^ BC[x+2];
                                s32[x+0] ^= ta;
                                s32[x+1] ^= tb;
                                s32[x+10] ^= ta;
                                s32[x+11] ^= tb;
                                s32[x+20] ^= ta;
                                s32[x+21] ^= tb;
                                s32[x+30] ^= ta;
                                s32[x+31] ^= tb;
                                s32[x+40] ^= ta;
                                s32[x+41] ^= tb;
                        }
                }
                /* RhoPi */
                {
                        uint32_t t0a,t0b, t1a,t1b;
                        t1a = s32[1*2+0];
                        t1b = s32[1*2+1];

/* In bit-sliced form a 64-bit rotate by ROT_CONST becomes two 32-bit
 * rotates of the half-words; an odd rotate count also swaps them. */
#define RhoPi(PI_LANE, ROT_CONST) \
        t0a = s32[PI_LANE*2+0];\
        t0b = s32[PI_LANE*2+1];\
        if (ROT_CONST & 1) {\
                s32[PI_LANE*2+0] = rotl32(t1b, ROT_CONST/2+1);\
                s32[PI_LANE*2+1] = ROT_CONST == 1 ? t1a : rotl32(t1a, ROT_CONST/2+0);\
        } else {\
                s32[PI_LANE*2+0] = rotl32(t1a, ROT_CONST/2);\
                s32[PI_LANE*2+1] = rotl32(t1b, ROT_CONST/2);\
        }\
        t1a = t0a; t1b = t0b;

                        RhoPi(10, 1)
                        RhoPi( 7, 3)
                        RhoPi(11, 6)
                        RhoPi(17,10)
                        RhoPi(18,15)
                        RhoPi( 3,21)
                        RhoPi( 5,28)
                        RhoPi(16,36)
                        RhoPi( 8,45)
                        RhoPi(21,55)
                        RhoPi(24, 2)
                        RhoPi( 4,14)
                        RhoPi(15,27)
                        RhoPi(23,41)
                        RhoPi(19,56)
                        RhoPi(13, 8)
                        RhoPi(12,25)
                        RhoPi( 2,43)
                        RhoPi(20,62)
                        RhoPi(14,18)
                        RhoPi(22,39)
                        RhoPi( 9,61)
                        RhoPi( 6,20)
                        RhoPi( 1,44)
#undef RhoPi
                }
                /* Chi: one 5-lane row per iteration; even-bit words first
                 * (x even), then odd-bit words (x odd), then next row (+9) */
                for (x = 0; x <= 40;) {
                        uint32_t BC0, BC1, BC2, BC3, BC4;
                        BC0 = s32[x + 0*2];
                        BC1 = s32[x + 1*2];
                        BC2 = s32[x + 2*2];
                        s32[x + 0*2] = BC0 ^ ((~BC1) & BC2);
                        BC3 = s32[x + 3*2];
                        s32[x + 1*2] = BC1 ^ ((~BC2) & BC3);
                        BC4 = s32[x + 4*2];
                        s32[x + 2*2] = BC2 ^ ((~BC3) & BC4);
                        s32[x + 3*2] = BC3 ^ ((~BC4) & BC0);
                        s32[x + 4*2] = BC4 ^ ((~BC0) & BC1);
                        x++;
                        BC0 = s32[x + 0*2];
                        BC1 = s32[x + 1*2];
                        BC2 = s32[x + 2*2];
                        s32[x + 0*2] = BC0 ^ ((~BC1) & BC2);
                        BC3 = s32[x + 3*2];
                        s32[x + 1*2] = BC1 ^ ((~BC2) & BC3);
                        BC4 = s32[x + 4*2];
                        s32[x + 2*2] = BC2 ^ ((~BC3) & BC4);
                        s32[x + 3*2] = BC3 ^ ((~BC4) & BC0);
                        s32[x + 4*2] = BC4 ^ ((~BC0) & BC1);
                        x += 9;
                }
                /* Iota: even and odd bits of the round constant applied
                 * to the two half-words separately */
                s32[0] ^= IOTA_CONST_0bits & 1;
                IOTA_CONST_0bits >>= 1;
                s32[1] ^= IOTA_CONST_1[round];
        }

        combine_halves(state);
#else
        /* Native 64-bit algorithm */
        static const uint16_t IOTA_CONST[NROUNDS] ALIGN2 = {
                /* Elements should be 64-bit, but top half is always zero
                 * or 0x80000000. We encode 63rd bits in a separate word below.
                 * Same is true for 31st bits, which lets us use 16-bit table
                 * instead of 64-bit. The speed penalty is lost in the noise.
                 */
                0x0001,
                0x8082,
                0x808a,
                0x8000,
                0x808b,
                0x0001,
                0x8081,
                0x8009,
                0x008a,
                0x0088,
                0x8009,
                0x000a,
                0x808b,
                0x008b,
                0x8089,
                0x8003,
                0x8002,
                0x0080,
                0x800a,
                0x000a,
                0x8081,
                0x8080,
                0x0001,
                0x8008,
        };
        /* bit for CONST[0] is in msb: 0011 0011 0000 0111 1101 1101 */
        const uint32_t IOTA_CONST_bit63 = (uint32_t)(0x3307dd00);
        /* bit for CONST[0] is in msb: 0001 0110 0011 1000 0001 1011 */
        const uint32_t IOTA_CONST_bit31 = (uint32_t)(0x16381b00);

        /* Rho rotation amounts, stored in Pi-permutation order */
        static const uint8_t ROT_CONST[24] ALIGN1 = {
                1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 2, 14,
                27, 41, 56, 8, 25, 43, 62, 18, 39, 61, 20, 44,
        };
        /* Order in which Pi moves data through the lanes */
        static const uint8_t PI_LANE[24] ALIGN1 = {
                10, 7, 11, 17, 18, 3, 5, 16, 8, 21, 24, 4,
                15, 23, 19, 13, 12, 2, 20, 14, 22, 9, 6, 1,
        };
        /*static const uint8_t MOD5[10] ALIGN1 = { 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, };*/

        unsigned x;
        unsigned round;

        /* Lanes are kept little-endian; convert on big-endian hosts */
        if (BB_BIG_ENDIAN) {
                for (x = 0; x < 25; x++) {
                        state[x] = SWAP_LE64(state[x]);
                }
        }

        for (round = 0; round < NROUNDS; ++round) {
                /* Theta */
                {
                        uint64_t BC[10];
                        for (x = 0; x < 5; ++x) {
                                BC[x + 5] = BC[x] = state[x]
                                        ^ state[x + 5] ^ state[x + 10]
                                        ^ state[x + 15] ^ state[x + 20];
                        }
                        /* Using 2x5 vector above eliminates the need to use
                         * BC[MOD5[x+N]] trick below to fetch BC[(x+N) % 5],
                         * and the code is a bit _smaller_.
                         */
                        for (x = 0; x < 5; ++x) {
                                uint64_t temp = BC[x + 4] ^ rotl64(BC[x + 1], 1);
                                state[x] ^= temp;
                                state[x + 5] ^= temp;
                                state[x + 10] ^= temp;
                                state[x + 15] ^= temp;
                                state[x + 20] ^= temp;
                        }
                }

                /* Rho Pi */
                if (SHA3_SMALL) {
                        uint64_t t1 = state[1];
                        for (x = 0; x < 24; ++x) {
                                uint64_t t0 = state[PI_LANE[x]];
                                state[PI_LANE[x]] = rotl64(t1, ROT_CONST[x]);
                                t1 = t0;
                        }
                } else {
                        /* Especially large benefit for 32-bit arch (75% faster):
                         * 64-bit rotations by non-constant usually are SLOW on those.
                         * We resort to unrolling here.
                         * This optimizes out PI_LANE[] and ROT_CONST[],
                         * but generates 300-500 more bytes of code.
                         */
                        uint64_t t0;
                        uint64_t t1 = state[1];
#define RhoPi_twice(x) \
        t0 = state[PI_LANE[x  ]]; \
        state[PI_LANE[x  ]] = rotl64(t1, ROT_CONST[x  ]); \
        t1 = state[PI_LANE[x+1]]; \
        state[PI_LANE[x+1]] = rotl64(t0, ROT_CONST[x+1]);
                        RhoPi_twice(0); RhoPi_twice(2);
                        RhoPi_twice(4); RhoPi_twice(6);
                        RhoPi_twice(8); RhoPi_twice(10);
                        RhoPi_twice(12); RhoPi_twice(14);
                        RhoPi_twice(16); RhoPi_twice(18);
                        RhoPi_twice(20); RhoPi_twice(22);
#undef RhoPi_twice
                }
                /* Chi */
# if LONG_MAX > 0x7fffffff
                for (x = 0; x <= 20; x += 5) {
                        uint64_t BC0, BC1, BC2, BC3, BC4;
                        BC0 = state[x + 0];
                        BC1 = state[x + 1];
                        BC2 = state[x + 2];
                        state[x + 0] = BC0 ^ ((~BC1) & BC2);
                        BC3 = state[x + 3];
                        state[x + 1] = BC1 ^ ((~BC2) & BC3);
                        BC4 = state[x + 4];
                        state[x + 2] = BC2 ^ ((~BC3) & BC4);
                        state[x + 3] = BC3 ^ ((~BC4) & BC0);
                        state[x + 4] = BC4 ^ ((~BC0) & BC1);
                }
# else
                /* Reduced register pressure version
                 * for register-starved 32-bit arches
                 * (i386: -95 bytes, and it is _faster_)
                 */
                for (x = 0; x <= 40;) {
                        uint32_t BC0, BC1, BC2, BC3, BC4;
                        uint32_t *const s32 = (uint32_t*)state;
#  if SHA3_SMALL
 do_half:
#  endif
                        BC0 = s32[x + 0*2];
                        BC1 = s32[x + 1*2];
                        BC2 = s32[x + 2*2];
                        s32[x + 0*2] = BC0 ^ ((~BC1) & BC2);
                        BC3 = s32[x + 3*2];
                        s32[x + 1*2] = BC1 ^ ((~BC2) & BC3);
                        BC4 = s32[x + 4*2];
                        s32[x + 2*2] = BC2 ^ ((~BC3) & BC4);
                        s32[x + 3*2] = BC3 ^ ((~BC4) & BC0);
                        s32[x + 4*2] = BC4 ^ ((~BC0) & BC1);
                        x++;
#  if SHA3_SMALL
                        if (x & 1)
                                goto do_half;
                        x += 8;
#  else
                        BC0 = s32[x + 0*2];
                        BC1 = s32[x + 1*2];
                        BC2 = s32[x + 2*2];
                        s32[x + 0*2] = BC0 ^ ((~BC1) & BC2);
                        BC3 = s32[x + 3*2];
                        s32[x + 1*2] = BC1 ^ ((~BC2) & BC3);
                        BC4 = s32[x + 4*2];
                        s32[x + 2*2] = BC2 ^ ((~BC3) & BC4);
                        s32[x + 3*2] = BC3 ^ ((~BC4) & BC0);
                        s32[x + 4*2] = BC4 ^ ((~BC0) & BC1);
                        x += 9;
#  endif
                }
# endif /* long is 32-bit */
                /* Iota: bits 31 and 63 of the 64-bit round constant are
                 * reconstructed from the packed bitmap words above */
                state[0] ^= IOTA_CONST[round]
                        | (uint32_t)((IOTA_CONST_bit31 << round) & 0x80000000)
                        | (uint64_t)((IOTA_CONST_bit63 << round) & 0x80000000) << 32;
        }

        /* Swap back to host order on big-endian */
        if (BB_BIG_ENDIAN) {
                for (x = 0; x < 25; x++) {
                        state[x] = SWAP_LE64(state[x]);
                }
        }
#endif
}
1354
1355void FAST_FUNC sha3_begin(sha3_ctx_t *ctx)
1356{
1357        memset(ctx, 0, sizeof(*ctx));
1358        /* SHA3-512, user can override */
1359        ctx->input_block_bytes = (1600 - 512*2) / 8; /* 72 bytes */
1360}
1361
void FAST_FUNC sha3_hash(sha3_ctx_t *ctx, const void *buffer, size_t len)
{
#if SHA3_SMALL
        const uint8_t *data = buffer;
        unsigned bufpos = ctx->bytes_queued;

        while (1) {
                /* How much of the current block can we absorb? */
                unsigned remaining = ctx->input_block_bytes - bufpos;
                if (remaining > len)
                        remaining = len;
                len -= remaining;
                /* XOR data into buffer */
                while (remaining != 0) {
                        uint8_t *buf = (uint8_t*)ctx->state;
                        buf[bufpos] ^= *data++;
                        bufpos++;
                        remaining--;
                }

                /* Clever way to do "if (bufpos != N) break; ... ; bufpos = 0;" */
                bufpos -= ctx->input_block_bytes;
                if (bufpos != 0)
                        break;

                /* Buffer is filled up, process it */
                sha3_process_block72(ctx->state);
                /*bufpos = 0; - already is */
        }
        /* bufpos "wrapped negative" in the subtraction above;
         * adding input_block_bytes back restores the true queue length */
        ctx->bytes_queued = bufpos + ctx->input_block_bytes;
#else
        /* +50 bytes code size, but a bit faster because of long-sized XORs */
        const uint8_t *data = buffer;
        unsigned bufpos = ctx->bytes_queued;
        unsigned iblk_bytes = ctx->input_block_bytes;

        /* If already data in queue, continue queuing first */
        if (bufpos != 0) {
                while (len != 0) {
                        uint8_t *buf = (uint8_t*)ctx->state;
                        buf[bufpos] ^= *data++;
                        len--;
                        bufpos++;
                        if (bufpos == iblk_bytes) {
                                bufpos = 0;
                                goto do_block;
                        }
                }
        }

        /* Absorb complete blocks */
        while (len >= iblk_bytes) {
                /* XOR data onto beginning of state[].
                 * We try to be efficient - operate one word at a time, not byte.
                 * Careful wrt unaligned access: can't just use "*(long*)data"!
                 */
                unsigned count = iblk_bytes / sizeof(long);
                long *buf = (long*)ctx->state;
                do {
                        long v;
                        move_from_unaligned_long(v, (long*)data);
                        *buf++ ^= v;
                        data += sizeof(long);
                } while (--count);
                len -= iblk_bytes;
 do_block:
                sha3_process_block72(ctx->state);
        }

        /* Queue remaining data bytes */
        while (len != 0) {
                uint8_t *buf = (uint8_t*)ctx->state;
                buf[bufpos] ^= *data++;
                bufpos++;
                len--;
        }

        ctx->bytes_queued = bufpos;
#endif
}
1441
1442unsigned FAST_FUNC sha3_end(sha3_ctx_t *ctx, void *resbuf)
1443{
1444        /* Padding */
1445        uint8_t *buf = (uint8_t*)ctx->state;
1446        /*
1447         * Keccak block padding is: add 1 bit after last bit of input,
1448         * then add zero bits until the end of block, and add the last 1 bit
1449         * (the last bit in the block) - the "10*1" pattern.
1450         * SHA3 standard appends additional two bits, 01,  before that padding:
1451         *
1452         * SHA3-224(M) = KECCAK[448](M||01, 224)
1453         * SHA3-256(M) = KECCAK[512](M||01, 256)
1454         * SHA3-384(M) = KECCAK[768](M||01, 384)
1455         * SHA3-512(M) = KECCAK[1024](M||01, 512)
1456         * (M is the input, || is bit concatenation)
1457         *
1458         * The 6 below contains 01 "SHA3" bits and the first 1 "Keccak" bit:
1459         */
1460        buf[ctx->bytes_queued]          ^= 6; /* bit pattern 00000110 */
1461        buf[ctx->input_block_bytes - 1] ^= 0x80;
1462
1463        sha3_process_block72(ctx->state);
1464
1465        /* Output */
1466        memcpy(resbuf, ctx->state, 64);
1467        return 64;
1468}
1469