busybox/networking/tls_pstm_montgomery_reduce.c
/*
 * Copyright (C) 2017 Denys Vlasenko
 *
 * Licensed under GPLv2, see file LICENSE in this source tree.
 */
#include "tls.h"

/* The file is taken almost verbatim from matrixssl-3-7-2b-open/crypto/math/.
 * Changes are flagged with //bbox
 */

/**
 *      @file    pstm_montgomery_reduce.c
 *      @version 33ef80f (HEAD, tag: MATRIXSSL-3-7-2-OPEN, tag: MATRIXSSL-3-7-2-COMM, origin/master, origin/HEAD, master)
 *
 *      Multiprecision Montgomery Reduction.
 */
/*
 *      Copyright (c) 2013-2015 INSIDE Secure Corporation
 *      Copyright (c) PeerSec Networks, 2002-2011
 *      All Rights Reserved
 *
 *      The latest version of this code is available at http://www.matrixssl.org
 *
 *      This software is open source; you can redistribute it and/or modify
 *      it under the terms of the GNU General Public License as published by
 *      the Free Software Foundation; either version 2 of the License, or
 *      (at your option) any later version.
 *
 *      This General Public License does NOT permit incorporating this software
 *      into proprietary programs.  If you are unable to comply with the GPL, a
 *      commercial license for this software may be purchased from INSIDE at
 *      http://www.insidesecure.com/eng/Company/Locations
 */
/*
 *      This program is distributed in the hope that it will be useful, but
 *      WITHOUT ANY WARRANTY; without even the implied warranty of
 *      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 *      See the GNU General Public License for more details.
 *
 *      You should have received a copy of the GNU General Public License
 *      along with this program; if not, write to the Free Software
 *      Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *      http://www.gnu.org/copyleft/gpl.html
 */
/******************************************************************************/

//bbox
//#include "../cryptoApi.h"
#ifndef DISABLE_PSTM

/******************************************************************************/

#if defined(PSTM_X86)
/* x86-32 optimized for 32 bit platforms. For 64 bit mode use X86_64 instead */
#if !defined(__GNUC__) || !defined(__i386__) || !defined(PSTM_32BIT)
#error "PSTM_X86 option requires GCC and 32 bit mode x86 processor"
#endif
//#pragma message ("Using 32 bit x86 Assembly Optimizations")

#define MONT_START
#define MONT_FINI
#define LOOP_END
#define LOOP_START \
   mu = c[x] * mp

#if 0
#define INNERMUL                                          \
asm(                                                      \
   "movl %5,%%eax \n\t"                                   \
   "mull %4       \n\t"                                   \
   "addl %1,%%eax \n\t"                                   \
   "adcl $0,%%edx \n\t"                                   \
   "addl %%eax,%0 \n\t"                                   \
   "adcl $0,%%edx \n\t"                                   \
   "movl %%edx,%1 \n\t"                                   \
:"=g"(_c[LO]), "=r"(cy)                                   \
:"0"(_c[LO]), "1"(cy), "g"(mu), "g"(*tmpm++)              \
: "%eax", "%edx", "cc")
/*
 * The above generated "error: 'asm' operand has impossible constraints" on Android.
 * Does their ABI reserve a register for something, leaving too few to satisfy
 * the constraints?
 */
#else
/* Let's avoid two explicit "movl"s by telling the compiler to put the input
 * value of *tmpm++ into EAX, and to expect the cy result in EDX:
 */
#define INNERMUL                                          \
asm(                                                      \
   "mull %4       \n\t"                                   \
   "addl %3,%%eax \n\t"                                   \
   "adcl $0,%%edx \n\t"                                   \
   "addl %%eax,%0 \n\t"                                   \
   "adcl $0,%%edx \n\t"                                   \
:"=g"(_c[LO]), "=&d"(cy)                                  \
:"0"(_c[LO]), "g"(cy), "g"(mu), "a"(*tmpm++)              \
:"cc")
/* This doesn't tell the compiler that we clobber EAX, but it probably won't
 * need the value of *tmpm anyway, and thus won't try to reuse the EAX contents.
 * TODO: fix it with dummy "=a"(clobbered_eax) output?
 */
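/* One possible shape for that TODO (an untested sketch, kept disabled):
 * a dummy output operand marks EAX as clobbered, and a matching "2"
 * constraint ties the *tmpm++ input to it. The name clobbered_eax is
 * purely illustrative.
 */
#if 0 //bbox: illustrative sketch only
#define INNERMUL                                          \
do { pstm_digit clobbered_eax;                            \
asm(                                                      \
   "mull %5       \n\t"                                   \
   "addl %4,%%eax \n\t"                                   \
   "adcl $0,%%edx \n\t"                                   \
   "addl %%eax,%0 \n\t"                                   \
   "adcl $0,%%edx \n\t"                                   \
:"=g"(_c[LO]), "=&d"(cy), "=a"(clobbered_eax)             \
:"0"(_c[LO]), "g"(cy), "g"(mu), "2"(*tmpm++)              \
:"cc");                                                   \
} while (0)
#endif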
#endif

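/* How PROPCARRY recreates the carry flag below: "addl" sets CF when the
 * sum wraps, "sbb %1,%1" turns CF into 0 or -1, and "neg" maps that to
 * 0 or 1, so cy ends up holding exactly the carry out of the addition.
 */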
#define PROPCARRY                           \
asm(                                        \
   "addl   %1,%0    \n\t"                   \
   "sbb    %1,%1    \n\t"                   \
   "neg    %1       \n\t"                   \
:"=g"(_c[LO]), "=r"(cy)                     \
:"0"(_c[LO]), "1"(cy)                       \
:"cc")

/******************************************************************************/
#elif defined(PSTM_X86_64)
/* x86-64 optimized */
#if !defined(__GNUC__) || !defined(__x86_64__) || !defined(PSTM_64BIT)
#error "PSTM_X86_64 option requires PSTM_64BIT, GCC and 64 bit mode x86 processor"
#endif
//#pragma message ("Using 64 bit x86_64 Assembly Optimizations")

#define MONT_START
#define MONT_FINI
#define LOOP_END
#define LOOP_START \
mu = c[x] * mp

#define INNERMUL                                           \
asm(                                                       \
        "movq %5,%%rax \n\t"                               \
        "mulq %4       \n\t"                               \
        "addq %1,%%rax \n\t"                               \
        "adcq $0,%%rdx \n\t"                               \
        "addq %%rax,%0 \n\t"                               \
        "adcq $0,%%rdx \n\t"                               \
        "movq %%rdx,%1 \n\t"                               \
        :"=g"(_c[LO]), "=r"(cy)                            \
        :"0"(_c[LO]), "1"(cy), "r"(mu), "r"(*tmpm++)       \
        : "%rax", "%rdx", "cc")

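/* INNERMUL8: the INNERMUL step unrolled 8 times; %%r11 pre-loads the
 * next tmpm[] digit and %%r10 the next _c[] digit one iteration ahead,
 * so the loads can overlap the latency of the mulq.
 */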
#define INNERMUL8                   \
asm(                                \
        "movq 0(%5),%%rax    \n\t"  \
        "movq 0(%2),%%r10    \n\t"  \
        "movq 0x8(%5),%%r11  \n\t"  \
        "mulq %4             \n\t"  \
        "addq %%r10,%%rax    \n\t"  \
        "adcq $0,%%rdx       \n\t"  \
        "movq 0x8(%2),%%r10  \n\t"  \
        "addq %3,%%rax       \n\t"  \
        "adcq $0,%%rdx       \n\t"  \
        "movq %%rax,0(%0)    \n\t"  \
        "movq %%rdx,%1       \n\t"  \
        \
        "movq %%r11,%%rax    \n\t"  \
        "movq 0x10(%5),%%r11 \n\t"  \
        "mulq %4             \n\t"  \
        "addq %%r10,%%rax    \n\t"  \
        "adcq $0,%%rdx       \n\t"  \
        "movq 0x10(%2),%%r10 \n\t"  \
        "addq %3,%%rax       \n\t"  \
        "adcq $0,%%rdx       \n\t"  \
        "movq %%rax,0x8(%0)  \n\t"  \
        "movq %%rdx,%1       \n\t"  \
        \
        "movq %%r11,%%rax    \n\t"  \
        "movq 0x18(%5),%%r11 \n\t"  \
        "mulq %4             \n\t"  \
        "addq %%r10,%%rax    \n\t"  \
        "adcq $0,%%rdx       \n\t"  \
        "movq 0x18(%2),%%r10 \n\t"  \
        "addq %3,%%rax       \n\t"  \
        "adcq $0,%%rdx       \n\t"  \
        "movq %%rax,0x10(%0) \n\t"  \
        "movq %%rdx,%1       \n\t"  \
        \
        "movq %%r11,%%rax    \n\t"  \
        "movq 0x20(%5),%%r11 \n\t"  \
        "mulq %4             \n\t"  \
        "addq %%r10,%%rax    \n\t"  \
        "adcq $0,%%rdx       \n\t"  \
        "movq 0x20(%2),%%r10 \n\t"  \
        "addq %3,%%rax       \n\t"  \
        "adcq $0,%%rdx       \n\t"  \
        "movq %%rax,0x18(%0) \n\t"  \
        "movq %%rdx,%1       \n\t"  \
        \
        "movq %%r11,%%rax    \n\t"  \
        "movq 0x28(%5),%%r11 \n\t"  \
        "mulq %4             \n\t"  \
        "addq %%r10,%%rax    \n\t"  \
        "adcq $0,%%rdx       \n\t"  \
        "movq 0x28(%2),%%r10 \n\t"  \
        "addq %3,%%rax       \n\t"  \
        "adcq $0,%%rdx       \n\t"  \
        "movq %%rax,0x20(%0) \n\t"  \
        "movq %%rdx,%1       \n\t"  \
        \
        "movq %%r11,%%rax    \n\t"  \
        "movq 0x30(%5),%%r11 \n\t"  \
        "mulq %4             \n\t"  \
        "addq %%r10,%%rax    \n\t"  \
        "adcq $0,%%rdx       \n\t"  \
        "movq 0x30(%2),%%r10 \n\t"  \
        "addq %3,%%rax       \n\t"  \
        "adcq $0,%%rdx       \n\t"  \
        "movq %%rax,0x28(%0) \n\t"  \
        "movq %%rdx,%1       \n\t"  \
        \
        "movq %%r11,%%rax    \n\t"  \
        "movq 0x38(%5),%%r11 \n\t"  \
        "mulq %4             \n\t"  \
        "addq %%r10,%%rax    \n\t"  \
        "adcq $0,%%rdx       \n\t"  \
        "movq 0x38(%2),%%r10 \n\t"  \
        "addq %3,%%rax       \n\t"  \
        "adcq $0,%%rdx       \n\t"  \
        "movq %%rax,0x30(%0) \n\t"  \
        "movq %%rdx,%1       \n\t"  \
        \
        "movq %%r11,%%rax    \n\t"  \
        "mulq %4             \n\t"  \
        "addq %%r10,%%rax    \n\t"  \
        "adcq $0,%%rdx       \n\t"  \
        "addq %3,%%rax       \n\t"  \
        "adcq $0,%%rdx       \n\t"  \
        "movq %%rax,0x38(%0) \n\t"  \
        "movq %%rdx,%1       \n\t"  \
        \
        :"=r"(_c), "=r"(cy)                    \
        : "0"(_c),  "1"(cy), "g"(mu), "r"(tmpm)\
        : "%rax", "%rdx", "%r10", "%r11", "cc")

#define PROPCARRY                          \
asm(                                       \
        "addq   %1,%0    \n\t"                 \
        "setb   %%al     \n\t"                 \
        "movzbq %%al,%1  \n\t"                 \
        :"=g"(_c[LO]), "=r"(cy)                \
        :"0"(_c[LO]), "1"(cy)                  \
        : "%rax", "cc")

/******************************************************************************/
#elif defined(PSTM_ARM)

#define MONT_START
#define MONT_FINI
#define LOOP_END
#define LOOP_START \
mu = c[x] * mp

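/* In both INNERMUL variants below: r0 = _c[0] + cy with the adder's
 * carry-out captured in cy as 0 or 1, then UMLAL accumulates mu * *tmpm
 * into the 64-bit pair {cy:r0}; the low word is stored back to _c[0]
 * and the high word becomes the next carry.
 */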
#ifdef __thumb2__
//#pragma message ("Using 32 bit ARM Thumb2 Assembly Optimizations")
#define INNERMUL                    \
asm(                                \
        " LDR    r0,%1            \n\t" \
        " ADDS   r0,r0,%0         \n\t" \
        " ITE CS                  \n\t" \
        " MOVCS  %0,#1            \n\t" \
        " MOVCC  %0,#0            \n\t" \
        " UMLAL  r0,%0,%3,%4      \n\t" \
        " STR    r0,%1            \n\t" \
        :"=r"(cy),"=m"(_c[0])\
        :"0"(cy),"r"(mu),"r"(*tmpm++),"m"(_c[0])\
        :"r0","cc");
#define PROPCARRY                  \
asm(                               \
        " LDR   r0,%1            \n\t" \
        " ADDS  r0,r0,%0         \n\t" \
        " STR   r0,%1            \n\t" \
        " ITE CS                 \n\t" \
        " MOVCS %0,#1            \n\t" \
        " MOVCC %0,#0            \n\t" \
        :"=r"(cy),"=m"(_c[0])\
        :"0"(cy),"m"(_c[0])\
        :"r0","cc");
#else /* Non-Thumb2 code */
//#pragma message ("Using 32 bit ARM Assembly Optimizations")
#define INNERMUL                    \
asm(                                \
        " LDR    r0,%1            \n\t" \
        " ADDS   r0,r0,%0         \n\t" \
        " MOVCS  %0,#1            \n\t" \
        " MOVCC  %0,#0            \n\t" \
        " UMLAL  r0,%0,%3,%4      \n\t" \
        " STR    r0,%1            \n\t" \
        :"=r"(cy),"=m"(_c[0])\
        :"0"(cy),"r"(mu),"r"(*tmpm++),"m"(_c[0])\
        :"r0","cc");
#define PROPCARRY                  \
asm(                               \
        " LDR   r0,%1            \n\t" \
        " ADDS  r0,r0,%0         \n\t" \
        " STR   r0,%1            \n\t" \
        " MOVCS %0,#1            \n\t" \
        " MOVCC %0,#0            \n\t" \
        :"=r"(cy),"=m"(_c[0])\
        :"0"(cy),"m"(_c[0])\
        :"r0","cc");
#endif /* __thumb2__ */


/******************************************************************************/
#elif defined(PSTM_MIPS)
/* MIPS32 */
//#pragma message ("Using 32 bit MIPS Assembly Optimizations")
#define MONT_START
#define MONT_FINI
#define LOOP_END
#define LOOP_START \
mu = c[x] * mp

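/* MIPS32 has no carry flag: after each addu, "sltu $10,sum,addend" sets
 * $10 to 1 exactly when the addition wrapped, which recreates the carry.
 */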
#define INNERMUL                      \
asm(                                  \
        " multu    %3,%4          \n\t"   \
        " mflo     $12            \n\t"   \
        " mfhi     $13            \n\t"   \
        " addu     $12,$12,%0     \n\t"   \
        " sltu     $10,$12,%0     \n\t"   \
        " addu     $13,$13,$10    \n\t"   \
        " lw       $10,%1         \n\t"   \
        " addu     $12,$12,$10    \n\t"   \
        " sltu     $10,$12,$10    \n\t"   \
        " addu     %0,$13,$10     \n\t"   \
        " sw       $12,%1         \n\t"   \
        :"=r"(cy),"=m"(_c[0])\
        :"r"(cy),"r"(mu),"r"(tmpm[0]),"r"(_c[0])\
        :"$10","$12","$13")\
; ++tmpm;

#define PROPCARRY                     \
asm(                                  \
        " lw       $10,%1        \n\t"    \
        " addu     $10,$10,%0    \n\t"    \
        " sw       $10,%1        \n\t"    \
        " sltu     %0,$10,%0     \n\t"    \
        :"=r"(cy),"=m"(_c[0])\
        :"r"(cy),"r"(_c[0])\
        :"$10");


/******************************************************************************/
#else

/* ISO C code */
#define MONT_START
#define MONT_FINI
#define LOOP_END
#define LOOP_START \
   mu = c[x] * mp

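/* Reference semantics that the asm variants above implement: pstm_word
 * is twice as wide as pstm_digit, so _c[0] + cy + mu * *tmpm cannot
 * overflow; the low digit is stored back and the high digit becomes the
 * carry. In PROPCARRY, (t < cy) recreates the carry flag, since the sum
 * is smaller than an addend exactly when the addition wrapped.
 */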
#define INNERMUL                                            \
        do { pstm_word t;                                   \
                t = ((pstm_word)_c[0] + (pstm_word)cy) +    \
                        (((pstm_word)mu) * ((pstm_word)*tmpm++)); \
                _c[0] = (pstm_digit)t;                      \
                cy = (pstm_digit)(t >> DIGIT_BIT);          \
        } while (0)

#define PROPCARRY \
   do { pstm_digit t = _c[0] += cy; cy = (t < cy); } while (0)

#endif

/******************************************************************************/

#define LO 0

/* computes x/R == x (mod N) via Montgomery Reduction */
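/* In digit terms: with b = 2^DIGIT_BIT, R = b^pa and mp = -1/m (mod b),
 * round x computes mu = c[x] * mp (mod b) and adds mu * m * b^x to c,
 * which zeroes digit x. After pa rounds the low pa digits are all zero,
 * so the result is c / b^pa, fixed up by at most one subtraction of m.
 */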
int32 FAST_FUNC pstm_montgomery_reduce(psPool_t *pool, pstm_int *a, pstm_int *m,
                pstm_digit mp, pstm_digit *paD, uint32 paDlen)
{
        pstm_digit      *c, *_c, *tmpm, mu;
        int32           oldused, x, y;
        int             pa; //bbox: was int16

        pa = m->used;
        if (pa > a->alloc) {
                /* Sanity test for bad numbers.  This will confirm no buffer overruns */
                return PS_LIMIT_FAIL;
        }

        if (paD && paDlen >= (uint32)2*pa+1) {
                c = paD;
                memset(c, 0x0, paDlen);
        } else {
                c = xzalloc(2*pa+1);//bbox
        }
        /* copy the input */
        oldused = a->used;
        for (x = 0; x < oldused; x++) {
                c[x] = a->dp[x];
        }

        MONT_START;

        for (x = 0; x < pa; x++) {
                pstm_digit cy = 0;
                /* get Mu for this round */
                LOOP_START;
                _c   = c + x;
                tmpm = m->dp;
                y = 0;
#ifdef PSTM_X86_64
                for (; y < (pa & ~7); y += 8) {
                        INNERMUL8;
                        _c   += 8;
                        tmpm += 8;
                }
#endif /* PSTM_X86_64 */
                for (; y < pa; y++) {
                        INNERMUL;
                        ++_c;
                }
                LOOP_END;
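                /* mu * m * b^x has been folded in and digit x of c is now
                 * zero; ripple the leftover carry up through the digits
                 * above _c until it is absorbed */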
                while (cy) {
                        PROPCARRY;
                        ++_c;
                }
        }

        /* now copy out: the result is c / b^pa, i.e. the top pa+1 digits of c */
        _c   = c + pa;
        tmpm = a->dp;
        for (x = 0; x < pa+1; x++) {
                *tmpm++ = *_c++;
        }

        for (; x < oldused; x++) {
                *tmpm++ = 0;
        }

        MONT_FINI;

        a->used = pa+1;
        pstm_clamp(a);

        /* reuse x as return code */
        x = PSTM_OKAY;

        /* if A >= m then A = A - m */
        if (pstm_cmp_mag (a, m) != PSTM_LT) {
                if (s_pstm_sub (a, m, a) != PSTM_OKAY) {
                        x = PS_MEM_FAIL;
                }
        }
        if (paDlen < (uint32)2*pa+1) {
                psFree(c, pool);
        }
        return x;
}
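
/* Typical call sequence (a sketch): the caller, e.g. pstm_exptmod in
 * tls_pstm.c, computes mp once per modulus with
 * pstm_montgomery_setup(m, &mp), maps the operands into the Montgomery
 * domain, and applies this reduction after each multiply or square to
 * bring the product back below m.
 */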

#endif /* !DISABLE_PSTM */
/******************************************************************************/