busybox/networking/tls_pstm_sqr_comba.c
<<
>>
Prefs
   1/*
   2 * Copyright (C) 2017 Denys Vlasenko
   3 *
   4 * Licensed under GPLv2, see file LICENSE in this source tree.
   5 */
   6#include "tls.h"
   7
   8/* The file is taken almost verbatim from matrixssl-3-7-2b-open/crypto/math/.
   9 * Changes are flagged with //bbox
  10 */
  11
  12/**
  13 *      @file    pstm_sqr_comba.c
  14 *      @version 33ef80f (HEAD, tag: MATRIXSSL-3-7-2-OPEN, tag: MATRIXSSL-3-7-2-COMM, origin/master, origin/HEAD, master)
  15 *
  16 *      Multiprecision Squaring with Comba technique.
  17 */
  18/*
  19 *      Copyright (c) 2013-2015 INSIDE Secure Corporation
  20 *      Copyright (c) PeerSec Networks, 2002-2011
  21 *      All Rights Reserved
  22 *
  23 *      The latest version of this code is available at http://www.matrixssl.org
  24 *
  25 *      This software is open source; you can redistribute it and/or modify
  26 *      it under the terms of the GNU General Public License as published by
  27 *      the Free Software Foundation; either version 2 of the License, or
  28 *      (at your option) any later version.
  29 *
  30 *      This General Public License does NOT permit incorporating this software
  31 *      into proprietary programs.  If you are unable to comply with the GPL, a
  32 *      commercial license for this software may be purchased from INSIDE at
  33 *      http://www.insidesecure.com/eng/Company/Locations
  34 *
  35 *      This program is distributed in WITHOUT ANY WARRANTY; without even the
  36 *      implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  37 *      See the GNU General Public License for more details.
  38 *
  39 *      You should have received a copy of the GNU General Public License
  40 *      along with this program; if not, write to the Free Software
  41 *      Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  42 *      http://www.gnu.org/copyleft/gpl.html
  43 */
  44/******************************************************************************/
  45
  46//bbox
  47//#include "../cryptoApi.h"
  48#ifndef DISABLE_PSTM
  49
  50/******************************************************************************/
  51#if defined(PSTM_X86)
  52/* x86-32 optimized for 32 bit platforms. For 64 bit mode use X86_64 instead */
  53#if !defined(__GNUC__) || !defined(__i386__)
  54#error "PSTM_X86 option requires GCC and 32 bit mode x86 processor"
  55#endif
  56//#pragma message ("Using 32 bit x86 Assembly Optimizations")
  57
  58#define COMBA_START
  59
/*
 * Comba column helpers for 32-bit x86, written as GCC inline asm.
 *
 * Every macro below works on variables that must already exist in the scope
 * where it is expanded: c0, c1, c2 (the three-word column accumulator, c0 being
 * the lowest word) and, for the SQRADDSC/SQRADDAC/SQRADDDB trio, sc0, sc1, sc2
 * (a second three-word accumulator).  Each expansion already ends in ';'.
 */

/* Zero the column accumulator c2:c1:c0. */
  60#define CLEAR_CARRY \
  61   c0 = c1 = c2 = 0;
  62
/* Store the low accumulator word (c0) into x. */
  63#define COMBA_STORE(x) \
  64   x = c0;
  65
/* Store the middle accumulator word (c1) into x. */
  66#define COMBA_STORE2(x) \
  67   x = c1;
  68
/* Step to the next column: c0 <- c1, c1 <- c2, c2 <- 0. */
  69#define CARRY_FORWARD \
  70   do { c0 = c1; c1 = c2; c2 = 0; } while (0);
  71
  72#define COMBA_FINI
  73
/*
 * c2:c1:c0 += i * i.
 * The asm loads i into %eax and multiplies %eax by itself, so the j argument
 * is never referenced; i must be an lvalue in memory ("m" constraint).
 */
  74#define SQRADD(i, j)                                      \
  75asm(                                            \
  76         "movl  %6,%%eax     \n\t"                            \
  77         "mull  %%eax        \n\t"                            \
  78         "addl  %%eax,%0     \n\t"                            \
  79         "adcl  %%edx,%1     \n\t"                            \
  80         "adcl  $0,%2        \n\t"                            \
  81         :"=rm"(c0), "=rm"(c1), "=rm"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%eax","%edx","cc");
  82        //bbox: ^^^ replaced "=r" with "=rm": %ebx is not available on shared build
  83
/*
 * c2:c1:c0 += 2 * (i * j).
 * The product is computed once into %edx:%eax and then added twice.
 */
  84#define SQRADD2(i, j)                                     \
  85asm(                                            \
  86         "movl  %6,%%eax     \n\t"                            \
  87         "mull  %7           \n\t"                            \
  88         "addl  %%eax,%0     \n\t"                            \
  89         "adcl  %%edx,%1     \n\t"                            \
  90         "adcl  $0,%2        \n\t"                            \
  91         "addl  %%eax,%0     \n\t"                            \
  92         "adcl  %%edx,%1     \n\t"                            \
  93         "adcl  $0,%2        \n\t"                            \
  94         :"=rm"(c0), "=rm"(c1), "=rm"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j)  :"%eax","%edx","cc");
  95        //bbox: ^^^ replaced "=r" with "=rm": %ebx is not available on shared build
  96
/* Start a secondary sum: sc1:sc0 = i * j, sc2 = 0 (previous sc values are overwritten). */
  97#define SQRADDSC(i, j)                                    \
  98asm(                                                     \
  99         "movl  %6,%%eax     \n\t"                            \
 100         "mull  %7           \n\t"                            \
 101         "movl  %%eax,%0     \n\t"                            \
 102         "movl  %%edx,%1     \n\t"                            \
 103         "xorl  %2,%2        \n\t"                            \
 104         :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%eax","%edx","cc");
 105
/* Accumulate into the secondary sum: sc2:sc1:sc0 += i * j. */
 106#define SQRADDAC(i, j)                                    \
 107asm(                                                     \
 108         "movl  %6,%%eax     \n\t"                            \
 109         "mull  %7           \n\t"                            \
 110         "addl  %%eax,%0     \n\t"                            \
 111         "adcl  %%edx,%1     \n\t"                            \
 112         "adcl  $0,%2        \n\t"                            \
 113         :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%eax","%edx","cc");
 114
/* Fold the doubled secondary sum into the column: c2:c1:c0 += 2 * sc2:sc1:sc0 (added twice). */
 115#define SQRADDDB                                          \
 116asm(                                                     \
 117         "addl %6,%0         \n\t"                            \
 118         "adcl %7,%1         \n\t"                            \
 119         "adcl %8,%2         \n\t"                            \
 120         "addl %6,%0         \n\t"                            \
 121         "adcl %7,%1         \n\t"                            \
 122         "adcl %8,%2         \n\t"                            \
 123         :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(sc0), "r"(sc1), "r"(sc2) : "cc");
 124
 125/******************************************************************************/
 126#elif defined(PSTM_X86_64)
 127/* x86-64 optimized */
 128#if !defined(__GNUC__) || !defined(__x86_64__) || !defined(PSTM_64BIT)
 129#error "PSTM_X86_64 option requires PSTM_64BIT, GCC and 64 bit mode x86 processor"
 130#endif
 131//#pragma message ("Using 64 bit x86_64 Assembly Optimizations")
 132
 133#define COMBA_START
 134
/*
 * Comba column helpers for x86-64, written as GCC inline asm.
 *
 * The macros operate on variables named c0, c1, c2 (three-word column
 * accumulator, c0 lowest) and sc0, sc1, sc2 (secondary accumulator) that must
 * be declared in the expanding scope.  Each expansion already ends in ';'.
 */

/* Zero the column accumulator c2:c1:c0. */
 135#define CLEAR_CARRY \
 136c0 = c1 = c2 = 0;
 137
/* Store the low accumulator word (c0) into x. */
 138#define COMBA_STORE(x) \
 139x = c0;
 140
/* Store the middle accumulator word (c1) into x. */
 141#define COMBA_STORE2(x) \
 142x = c1;
 143
/* Step to the next column: c0 <- c1, c1 <- c2, c2 <- 0. */
 144#define CARRY_FORWARD \
 145do { c0 = c1; c1 = c2; c2 = 0; } while (0);
 146
 147#define COMBA_FINI
 148
/*
 * c2:c1:c0 += i * i.
 * i is loaded into %rax and %rax is multiplied by itself; the j argument is
 * never referenced by the asm.
 */
 149#define SQRADD(i, j)                                     \
 150asm(                                                     \
 151        "movq  %6,%%rax     \n\t"                            \
 152        "mulq  %%rax        \n\t"                            \
 153        "addq  %%rax,%0     \n\t"                            \
 154        "adcq  %%rdx,%1     \n\t"                            \
 155        "adcq  $0,%2        \n\t"                            \
 156        :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i) :"%rax","%rdx","cc");
 157
/* c2:c1:c0 += 2 * (i * j): the product in %rdx:%rax is added twice. */
 158#define SQRADD2(i, j)                                    \
 159asm(                                                     \
 160        "movq  %6,%%rax     \n\t"                            \
 161        "mulq  %7           \n\t"                            \
 162        "addq  %%rax,%0     \n\t"                            \
 163        "adcq  %%rdx,%1     \n\t"                            \
 164        "adcq  $0,%2        \n\t"                            \
 165        "addq  %%rax,%0     \n\t"                            \
 166        "adcq  %%rdx,%1     \n\t"                            \
 167        "adcq  $0,%2        \n\t"                            \
 168        :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i), "g"(j)  :"%rax","%rdx","cc");
 169
/* Start a secondary sum: sc1:sc0 = i * j, sc2 = 0 (previous sc values are overwritten). */
 170#define SQRADDSC(i, j)                                   \
 171asm(                                                     \
 172        "movq  %6,%%rax     \n\t"                            \
 173        "mulq  %7           \n\t"                            \
 174        "movq  %%rax,%0     \n\t"                            \
 175        "movq  %%rdx,%1     \n\t"                            \
 176        "xorq  %2,%2        \n\t"                            \
 177        :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%rax","%rdx","cc");
 178
/* Accumulate into the secondary sum: sc2:sc1:sc0 += i * j. */
 179#define SQRADDAC(i, j)                                   \
 180asm(                                                     \
 181        "movq  %6,%%rax     \n\t"                            \
 182        "mulq  %7           \n\t"                            \
 183        "addq  %%rax,%0     \n\t"                            \
 184        "adcq  %%rdx,%1     \n\t"                            \
 185        "adcq  $0,%2        \n\t"                            \
 186        :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%rax","%rdx","cc");
 187
/* Fold the doubled secondary sum into the column: c2:c1:c0 += 2 * sc2:sc1:sc0 (added twice). */
 188#define SQRADDDB                                         \
 189asm(                                                     \
 190        "addq %6,%0         \n\t"                            \
 191        "adcq %7,%1         \n\t"                            \
 192        "adcq %8,%2         \n\t"                            \
 193        "addq %6,%0         \n\t"                            \
 194        "adcq %7,%1         \n\t"                            \
 195        "adcq %8,%2         \n\t"                            \
 196        :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(sc0), "r"(sc1), "r"(sc2) : "cc");
 197
 198/******************************************************************************/
 199#elif defined(PSTM_ARM)
 200/* ARM code */
 201//#pragma message ("Using 32 bit ARM Assembly Optimizations")
 202
 203#define COMBA_START
 204
/*
 * Comba column helpers for 32-bit ARM, written as GCC inline asm.
 *
 * The macros operate on variables named c0, c1, c2 (three-word column
 * accumulator, c0 lowest) and sc0, sc1, sc2 (secondary accumulator) that must
 * be declared in the expanding scope.  Each expansion already ends in ';'.
 */

/* Zero the column accumulator c2:c1:c0. */
 205#define CLEAR_CARRY \
 206c0 = c1 = c2 = 0;
 207
/* Store the low accumulator word (c0) into x. */
 208#define COMBA_STORE(x) \
 209x = c0;
 210
/* Store the middle accumulator word (c1) into x. */
 211#define COMBA_STORE2(x) \
 212x = c1;
 213
/* Step to the next column: c0 <- c1, c1 <- c2, c2 <- 0. */
 214#define CARRY_FORWARD \
 215do { c0 = c1; c1 = c2; c2 = 0; } while (0);
 216
 217#define COMBA_FINI
 218
 219/* c2:c1:c0 += i * i; UMULL is given i for both factors, so j is never referenced. r0/r1 hold the product. */
 220#define SQRADD(i, j)                                             \
 221asm(                                                             \
 222"  UMULL  r0,r1,%6,%6              \n\t"                         \
 223"  ADDS   %0,%0,r0                 \n\t"                         \
 224"  ADCS   %1,%1,r1                 \n\t"                         \
 225"  ADC    %2,%2,#0                 \n\t"                         \
 226:"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(i) : "r0", "r1", "cc");
 227
 228/* c2:c1:c0 += 2 * (i * j): cross terms of a square occur twice, so the product in r1:r0 is added twice. */
 229#define SQRADD2(i, j)                                            \
 230asm(                                                             \
 231"  UMULL  r0,r1,%6,%7              \n\t"                         \
 232"  ADDS   %0,%0,r0                 \n\t"                         \
 233"  ADCS   %1,%1,r1                 \n\t"                         \
 234"  ADC    %2,%2,#0                 \n\t"                         \
 235"  ADDS   %0,%0,r0                 \n\t"                         \
 236"  ADCS   %1,%1,r1                 \n\t"                         \
 237"  ADC    %2,%2,#0                 \n\t"                         \
 238:"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j) : "r0", "r1", "cc");
 239
/* Start a secondary sum: sc1:sc0 = i * j; SUB of sc2 from itself leaves sc2 = 0. */
 240#define SQRADDSC(i, j)                                           \
 241asm(                                                             \
 242"  UMULL  %0,%1,%6,%7              \n\t"                         \
 243"  SUB    %2,%2,%2                 \n\t"                         \
 244:"=r"(sc0), "=r"(sc1), "=r"(sc2) : "0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j) : "cc");
 245
/* Accumulate into the secondary sum: sc2:sc1:sc0 += i * j. */
 246#define SQRADDAC(i, j)                                           \
 247asm(                                                             \
 248"  UMULL  r0,r1,%6,%7              \n\t"                         \
 249"  ADDS   %0,%0,r0                 \n\t"                         \
 250"  ADCS   %1,%1,r1                 \n\t"                         \
 251"  ADC    %2,%2,#0                 \n\t"                         \
 252:"=r"(sc0), "=r"(sc1), "=r"(sc2) : "0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j) : "r0", "r1", "cc");
 253
/*
 * Fold the doubled secondary sum into the column: c2:c1:c0 += 2 * sc2:sc1:sc0.
 * Note the operand order differs from the other macros here: %3..%5 are
 * sc0..sc2 and the tied c0..c2 inputs come last.
 */
 254#define SQRADDDB                                                 \
 255asm(                                                             \
 256"  ADDS  %0,%0,%3                     \n\t"                      \
 257"  ADCS  %1,%1,%4                     \n\t"                      \
 258"  ADC   %2,%2,%5                     \n\t"                      \
 259"  ADDS  %0,%0,%3                     \n\t"                      \
 260"  ADCS  %1,%1,%4                     \n\t"                      \
 261"  ADC   %2,%2,%5                     \n\t"                      \
 262:"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "cc");
 263
 264/******************************************************************************/
 265#elif defined(PSTM_MIPS)
 266/* MIPS32 */
 267//#pragma message ("Using 32 bit MIPS Assembly Optimizations")
 268
 269#define COMBA_START
 270
/*
 * Comba column helpers for MIPS32, written as GCC inline asm.
 *
 * The macros operate on variables named c0, c1, c2 (three-word column
 * accumulator, c0 lowest) and sc0, sc1, sc2 (secondary accumulator) that must
 * be declared in the expanding scope.  Each expansion already ends in ';'.
 * There is no carry flag in this code: after each addu, an sltu comparing the
 * sum with the addend produces the carry-out (1 if the sum wrapped).
 *
 * NOTE(review): multu writes the HI/LO registers, but none of the clobber
 * lists below name them - confirm whether the target compiler requires
 * "hi"/"lo" clobbers here.
 */

/* Zero the column accumulator c2:c1:c0. */
 271#define CLEAR_CARRY \
 272c0 = c1 = c2 = 0;
 273
/* Store the low accumulator word (c0) into x. */
 274#define COMBA_STORE(x) \
 275x = c0;
 276
/* Store the middle accumulator word (c1) into x. */
 277#define COMBA_STORE2(x) \
 278x = c1;
 279
/* Step to the next column: c0 <- c1, c1 <- c2, c2 <- 0. */
 280#define CARRY_FORWARD \
 281do { c0 = c1; c1 = c2; c2 = 0; } while (0);
 282
 283#define COMBA_FINI
 284
 285/* c2:c1:c0 += i * i; multu is given i for both factors, so j is never referenced. $12/$13 are scratch. */
 286#define SQRADD(i, j)               \
 287asm(                               \
 288        " multu  %6,%6          \n\t"  \
 289        " mflo   $12            \n\t"  \
 290        " mfhi   $13            \n\t"  \
 291        " addu    %0,%0,$12     \n\t"  \
 292        " sltu   $12,%0,$12     \n\t"  \
 293        " addu    %1,%1,$13     \n\t"  \
 294        " sltu   $13,%1,$13     \n\t"  \
 295        " addu    %1,%1,$12     \n\t"  \
 296        " sltu   $12,%1,$12     \n\t"  \
 297        " addu    %2,%2,$13     \n\t"  \
 298        " addu    %2,%2,$12     \n\t"  \
 299        :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i):"$12","$13");
 300
 301/* c2:c1:c0 += 2 * (i * j): the product stays in $12/$13 and is added twice, using $14/$15 for the carries. */
 302#define SQRADD2(i, j)             \
 303asm(                              \
 304        " multu  %6,%7          \n\t" \
 305        " mflo   $12            \n\t" \
 306        " mfhi   $13            \n\t" \
 307        \
 308        " addu    %0,%0,$12     \n\t" \
 309        " sltu   $14,%0,$12     \n\t" \
 310        " addu    %1,%1,$13     \n\t" \
 311        " sltu   $15,%1,$13     \n\t" \
 312        " addu    %1,%1,$14     \n\t" \
 313        " sltu   $14,%1,$14     \n\t" \
 314        " addu    %2,%2,$15     \n\t" \
 315        " addu    %2,%2,$14     \n\t" \
 316        \
 317        " addu    %0,%0,$12     \n\t" \
 318        " sltu   $14,%0,$12     \n\t" \
 319        " addu    %1,%1,$13     \n\t" \
 320        " sltu   $15,%1,$13     \n\t" \
 321        " addu    %1,%1,$14     \n\t" \
 322        " sltu   $14,%1,$14     \n\t" \
 323        " addu    %2,%2,$15     \n\t" \
 324        " addu    %2,%2,$14     \n\t" \
 325        :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"$12", "$13", "$14", "$15");
 326
/* Start a secondary sum: sc0 = LO, sc1 = HI of i * j; xor of sc2 with itself leaves sc2 = 0. */
 327#define SQRADDSC(i, j)             \
 328asm(                               \
 329        " multu  %6,%7          \n\t"  \
 330        " mflo   %0             \n\t"  \
 331        " mfhi   %1             \n\t"  \
 332        " xor    %2,%2,%2       \n\t"  \
 333        :"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i),"r"(j) : "cc");
 334
/*
 * Accumulate into the secondary sum: sc2:sc1:sc0 += i * j.
 * The asm body only touches $12 and $13; "$14" is listed as clobbered but not used.
 */
 335#define SQRADDAC(i, j)            \
 336asm(                              \
 337        " multu  %6,%7          \n\t" \
 338        " mflo   $12            \n\t" \
 339        " mfhi   $13            \n\t" \
 340        " addu    %0,%0,$12     \n\t" \
 341        " sltu   $12,%0,$12     \n\t" \
 342        " addu    %1,%1,$13     \n\t" \
 343        " sltu   $13,%1,$13     \n\t" \
 344        " addu    %1,%1,$12     \n\t" \
 345        " sltu   $12,%1,$12     \n\t" \
 346        " addu    %2,%2,$13     \n\t" \
 347        " addu    %2,%2,$12     \n\t" \
 348        :"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j):"$12", "$13", "$14");
 349
/*
 * Fold the doubled secondary sum into the column: c2:c1:c0 += 2 * sc2:sc1:sc0.
 * The same nine-instruction add-with-carry sequence is issued twice; %3..%5
 * are sc0..sc2 and $10/$11 hold the carries.
 */
 350#define SQRADDDB                   \
 351asm(                               \
 352        " addu    %0,%0,%3       \n\t" \
 353        " sltu   $10,%0,%3       \n\t" \
 354        " addu    %1,%1,$10      \n\t" \
 355        " sltu   $10,%1,$10      \n\t" \
 356        " addu    %1,%1,%4       \n\t" \
 357        " sltu   $11,%1,%4       \n\t" \
 358        " addu    %2,%2,$10      \n\t" \
 359        " addu    %2,%2,$11      \n\t" \
 360        " addu    %2,%2,%5       \n\t" \
 361        \
 362        " addu    %0,%0,%3       \n\t" \
 363        " sltu   $10,%0,%3       \n\t" \
 364        " addu    %1,%1,$10      \n\t" \
 365        " sltu   $10,%1,$10      \n\t" \
 366        " addu    %1,%1,%4       \n\t" \
 367        " sltu   $11,%1,%4       \n\t" \
 368        " addu    %2,%2,$10      \n\t" \
 369        " addu    %2,%2,$11      \n\t" \
 370        " addu    %2,%2,%5       \n\t" \
 371        :"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "$10", "$11");
 372
 373#else
 374/******************************************************************************/
 #define PSTM_ISO
/*
 * ISO C portable comba helpers (used when no assembly variant is selected).
 *
 * The macros operate on variables that must be declared in the expanding
 * scope:
 *   c0, c1, c2    - pstm_digit column accumulator, c0 lowest
 *   sc0, sc1, sc2 - pstm_digit secondary accumulator (SQRADDSC/AC/DB only)
 *   tt            - pstm_word scratch used by SQRADD2
 * pstm_word must be able to hold the product of two pstm_digit values, and
 * DIGIT_BIT is the shift that extracts its high digit.
 *
 * Macro parameters are parenthesized so that an argument which is itself an
 * expression (e.g. "a + b") is evaluated as a whole before being cast or
 * assigned.  Each argument is evaluated exactly once, so arguments with side
 * effects such as "*p++" are safe.  The expansions keep their historical
 * trailing ';', so "MACRO(...);" at a call site yields an extra empty
 * statement.
 */

#define COMBA_START

/* Zero the column accumulator c2:c1:c0. */
#define CLEAR_CARRY \
   c0 = c1 = c2 = 0;

/* Store the low accumulator digit (c0) into x. */
#define COMBA_STORE(x) \
   (x) = c0;

/* Store the middle accumulator digit (c1) into x. */
#define COMBA_STORE2(x) \
   (x) = c1;

/* Step to the next column: c0 <- c1, c1 <- c2, c2 <- 0. */
#define CARRY_FORWARD \
   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

#define COMBA_FINI

/* c2:c1:c0 += i * j (callers pass the same digit for both to add a square term) */
#define SQRADD(i, j)                                                  \
   do { pstm_word t;                                                  \
   t = c0 + ((pstm_word)(i)) * ((pstm_word)(j));  c0 = (pstm_digit)t; \
   t = c1 + (t >> DIGIT_BIT);                                         \
   c1 = (pstm_digit)t; c2 += (pstm_digit)(t >> DIGIT_BIT);            \
   } while (0);


/* c2:c1:c0 += 2 * (i * j): cross terms of a square occur twice, so the product is added twice */
#define SQRADD2(i, j)                                                 \
   do { pstm_word t;                                                  \
   t  = ((pstm_word)(i)) * ((pstm_word)(j));                          \
   tt = (pstm_word)c0 + t; c0 = (pstm_digit)tt;                       \
   tt = (pstm_word)c1 + (tt >> DIGIT_BIT);                            \
   c1 = (pstm_digit)tt; c2 += (pstm_digit)(tt >> DIGIT_BIT);          \
   tt = (pstm_word)c0 + t; c0 = (pstm_digit)tt;                       \
   tt = (pstm_word)c1 + (tt >> DIGIT_BIT);                            \
   c1 = (pstm_digit)tt; c2 += (pstm_digit)(tt >> DIGIT_BIT);          \
   } while (0);

/* Start a secondary sum: sc1:sc0 = i * j, sc2 = 0 (previous sc values are overwritten) */
#define SQRADDSC(i, j)                                                \
   do { pstm_word t;                                                  \
	  t =  ((pstm_word)(i)) * ((pstm_word)(j));                       \
	  sc0 = (pstm_digit)t; sc1 = (pstm_digit)(t >> DIGIT_BIT); sc2 = 0; \
   } while (0);

/* Accumulate into the secondary sum: sc2:sc1:sc0 += i * j */
#define SQRADDAC(i, j)                                                \
   do { pstm_word t;                                                  \
   t = ((pstm_word)sc0) + ((pstm_word)(i)) * ((pstm_word)(j));        \
   sc0 = (pstm_digit)t;                                               \
   t = ((pstm_word)sc1) + (t >> DIGIT_BIT); sc1 = (pstm_digit)t;      \
   sc2 += (pstm_digit)(t >> DIGIT_BIT);                               \
   } while (0);

/* Fold the doubled secondary sum into the column: c2:c1:c0 += 2 * sc2:sc1:sc0 */
#define SQRADDDB                                                      \
   do { pstm_word t;                                                  \
   t = ((pstm_word)sc0) + ((pstm_word)sc0) + ((pstm_word)c0);         \
   c0 = (pstm_digit)t;                                                \
   t = ((pstm_word)sc1) + ((pstm_word)sc1) + c1 + (t >> DIGIT_BIT);   \
   c1 = (pstm_digit)t;                                                \
   c2 = c2 + sc2 + sc2 + (pstm_digit)(t >> DIGIT_BIT);                \
   } while (0);
 437
 438#endif /* ISO_C */
 439
 440/******************************************************************************/
 441/*
 442        Non-unrolled comba squarer
 443 */
 444//bbox: pool unused
 445#define pstm_sqr_comba_gen(pool, A, B, paD, paDlen) \
 446        pstm_sqr_comba_gen(      A, B, paD, paDlen)
 447static int32 pstm_sqr_comba_gen(psPool_t *pool, pstm_int *A, pstm_int *B,
 448                        pstm_digit *paD, uint32 paDlen)
 449{
 450        int             paDfail, pa; //bbox: was int16
 451        int32       ix, iz;
 452        pstm_digit  c0, c1, c2, *dst;
 453#ifdef PSTM_ISO
 454        pstm_word   tt;
 455#endif
 456
 457        paDfail = 0;
 458        /* get size of output and trim */
 459        pa = A->used + A->used;
 460
 461        /* number of output digits to produce */
 462        COMBA_START;
 463        CLEAR_CARRY;
 464/*
 465        If b is not large enough grow it and continue
 466*/
 467        if (B->alloc < pa) {
 468                if (pstm_grow(B, pa) != PSTM_OKAY) {
 469                        return PS_MEM_FAIL;
 470                }
 471        }
 472        if (paD != NULL) {
 473                if (paDlen < (sizeof(pstm_digit) * pa)) {
 474                        paDfail = 1; /* have a paD, but it's not big enough */
 475                        dst = xzalloc(sizeof(pstm_digit) * pa);//bbox
 476                } else {
 477                        dst = paD;
 478                        memset(dst, 0x0, paDlen);
 479                }
 480        } else {
 481                dst = xzalloc(sizeof(pstm_digit) * pa);//bbox
 482        }
 483
 484        for (ix = 0; ix < pa; ix++) {
 485                int32      tx, ty, iy;
 486                pstm_digit *tmpy, *tmpx;
 487
 488                /* get offsets into the two bignums */
 489                ty = min(A->used-1, ix);
 490                tx = ix - ty;
 491
 492                /* setup temp aliases */
 493                tmpx = A->dp + tx;
 494                tmpy = A->dp + ty;
 495
 496/*
 497                        This is the number of times the loop will iterate,
 498                                while (tx++ < a->used && ty-- >= 0) { ... }
 499*/
 500                iy = min(A->used-tx, ty+1);
 501
 502/*
 503                now for squaring tx can never equal ty. We halve the distance since
 504                they approach at a rate of 2x and we have to round because odd cases
 505                need to be executed
 506*/
 507                iy = min(iy, (ty-tx+1)>>1);
 508
 509                /* forward carries */
 510                CARRY_FORWARD;
 511
 512                /* execute loop */
 513                for (iz = 0; iz < iy; iz++) {
 514                        SQRADD2(*tmpx++, *tmpy--);
 515                }
 516
 517                /* even columns have the square term in them */
 518                if ((ix&1) == 0) {
 519                        SQRADD(A->dp[ix>>1], A->dp[ix>>1]);
 520                }
 521
 522                /* store it */
 523                COMBA_STORE(dst[ix]);
 524        }
 525
 526        COMBA_FINI;
 527/*
 528        setup dest
 529 */
 530        iz  = B->used;
 531        B->used = pa;
 532        {
 533                pstm_digit *tmpc;
 534                tmpc = B->dp;
 535                for (ix = 0; ix < pa; ix++) {
 536                        *tmpc++ = dst[ix];
 537                }
 538                /*      clear unused digits (that existed in the old copy of c) */
 539                for (; ix < iz; ix++) {
 540                        *tmpc++ = 0;
 541                }
 542        }
 543        pstm_clamp(B);
 544
 545        if ((paD == NULL) || paDfail == 1) {
 546                psFree(dst, pool);
 547        }
 548        return PS_SUCCESS;
 549}
 550
 551/******************************************************************************/
 552/*
 553        Unrolled Comba loop for 1024 bit keys
 554 */
 555#ifdef USE_1024_KEY_SPEED_OPTIMIZATIONS
/*
 * Fully unrolled comba squarer for a 16-digit operand: B = A * A.
 *
 * Digits a[0]..a[15] of A are read unconditionally and exactly 32 result
 * digits are produced, so this is only correct for operands that occupy 16
 * digits (NOTE(review): presumably the caller dispatches here only when
 * A->used == 16 - confirm at the call site).
 *
 * The result is built in the local array b[] and copied to B at the end, so
 * A's digits are not overwritten while they are still being read.  B is
 * grown to 32 digits if needed, marked non-negative, and clamped.
 *
 * Returns PSTM_OKAY, or PS_MEM_FAIL when growing B fails.
 */
 556static int32 pstm_sqr_comba16(pstm_int *A, pstm_int *B)
 557{
 558        pstm_digit *a, b[32], c0, c1, c2, sc0, sc1, sc2;
 559#ifdef PSTM_ISO
 560        pstm_word   tt;
 561#endif
 562
 563        if (B->alloc < 32) {
 564                if (pstm_grow(B, 32) != PSTM_OKAY) {
 565                        return PS_MEM_FAIL;
 566                }
 567        }
 568        a = A->dp;
 569        sc0 = sc1 = sc2 = 0;
 570
 571        COMBA_START;
 572
 573   /* clear carries */
 574   CLEAR_CARRY;
 575
/*
 * Each "output k" below computes result digit k as the sum of all a[x]*a[y]
 * with x + y == k.  Cross terms (x != y) count twice, the square term a[k/2]^2
 * (even k only) counts once.
 */
 576   /* output 0 */
 577   SQRADD(a[0],a[0]);
 578   COMBA_STORE(b[0]);
 579
 580   /* output 1 */
 581   CARRY_FORWARD;
 582   SQRADD2(a[0], a[1]);
 583   COMBA_STORE(b[1]);
 584
 585   /* output 2 */
 586   CARRY_FORWARD;
 587   SQRADD2(a[0], a[2]); SQRADD(a[1], a[1]);
 588   COMBA_STORE(b[2]);
 589
 590   /* output 3 */
 591   CARRY_FORWARD;
 592   SQRADD2(a[0], a[3]); SQRADD2(a[1], a[2]);
 593   COMBA_STORE(b[3]);
 594
 595   /* output 4 */
 596   CARRY_FORWARD;
 597   SQRADD2(a[0], a[4]); SQRADD2(a[1], a[3]); SQRADD(a[2], a[2]);
 598   COMBA_STORE(b[4]);
 599
/*
 * Columns 5..25 have three or more cross terms.  They are summed once into
 * sc2:sc1:sc0 (SQRADDSC starts the sum, SQRADDAC adds to it) and SQRADDDB then
 * adds that sum to the column twice, instead of doubling each product.
 */
 600   /* output 5 */
 601   CARRY_FORWARD;
 602   SQRADDSC(a[0], a[5]); SQRADDAC(a[1], a[4]); SQRADDAC(a[2], a[3]); SQRADDDB;
 603   COMBA_STORE(b[5]);
 604
 605   /* output 6 */
 606   CARRY_FORWARD;
 607   SQRADDSC(a[0], a[6]); SQRADDAC(a[1], a[5]); SQRADDAC(a[2], a[4]); SQRADDDB; SQRADD(a[3], a[3]);
 608   COMBA_STORE(b[6]);
 609
 610   /* output 7 */
 611   CARRY_FORWARD;
 612   SQRADDSC(a[0], a[7]); SQRADDAC(a[1], a[6]); SQRADDAC(a[2], a[5]); SQRADDAC(a[3], a[4]); SQRADDDB;
 613   COMBA_STORE(b[7]);
 614
 615   /* output 8 */
 616   CARRY_FORWARD;
 617   SQRADDSC(a[0], a[8]); SQRADDAC(a[1], a[7]); SQRADDAC(a[2], a[6]); SQRADDAC(a[3], a[5]); SQRADDDB; SQRADD(a[4], a[4]);
 618   COMBA_STORE(b[8]);
 619
 620   /* output 9 */
 621   CARRY_FORWARD;
 622   SQRADDSC(a[0], a[9]); SQRADDAC(a[1], a[8]); SQRADDAC(a[2], a[7]); SQRADDAC(a[3], a[6]); SQRADDAC(a[4], a[5]); SQRADDDB;
 623   COMBA_STORE(b[9]);
 624
 625   /* output 10 */
 626   CARRY_FORWARD;
 627   SQRADDSC(a[0], a[10]); SQRADDAC(a[1], a[9]); SQRADDAC(a[2], a[8]); SQRADDAC(a[3], a[7]); SQRADDAC(a[4], a[6]); SQRADDDB; SQRADD(a[5], a[5]);
 628   COMBA_STORE(b[10]);
 629
 630   /* output 11 */
 631   CARRY_FORWARD;
 632   SQRADDSC(a[0], a[11]); SQRADDAC(a[1], a[10]); SQRADDAC(a[2], a[9]); SQRADDAC(a[3], a[8]); SQRADDAC(a[4], a[7]); SQRADDAC(a[5], a[6]); SQRADDDB;
 633   COMBA_STORE(b[11]);
 634
 635   /* output 12 */
 636   CARRY_FORWARD;
 637   SQRADDSC(a[0], a[12]); SQRADDAC(a[1], a[11]); SQRADDAC(a[2], a[10]); SQRADDAC(a[3], a[9]); SQRADDAC(a[4], a[8]); SQRADDAC(a[5], a[7]); SQRADDDB; SQRADD(a[6], a[6]);
 638   COMBA_STORE(b[12]);
 639
 640   /* output 13 */
 641   CARRY_FORWARD;
 642   SQRADDSC(a[0], a[13]); SQRADDAC(a[1], a[12]); SQRADDAC(a[2], a[11]); SQRADDAC(a[3], a[10]); SQRADDAC(a[4], a[9]); SQRADDAC(a[5], a[8]); SQRADDAC(a[6], a[7]); SQRADDDB;
 643   COMBA_STORE(b[13]);
 644
 645   /* output 14 */
 646   CARRY_FORWARD;
 647   SQRADDSC(a[0], a[14]); SQRADDAC(a[1], a[13]); SQRADDAC(a[2], a[12]); SQRADDAC(a[3], a[11]); SQRADDAC(a[4], a[10]); SQRADDAC(a[5], a[9]); SQRADDAC(a[6], a[8]); SQRADDDB; SQRADD(a[7], a[7]);
 648   COMBA_STORE(b[14]);
 649
 650   /* output 15 */
 651   CARRY_FORWARD;
 652   SQRADDSC(a[0], a[15]); SQRADDAC(a[1], a[14]); SQRADDAC(a[2], a[13]); SQRADDAC(a[3], a[12]); SQRADDAC(a[4], a[11]); SQRADDAC(a[5], a[10]); SQRADDAC(a[6], a[9]); SQRADDAC(a[7], a[8]); SQRADDDB;
 653   COMBA_STORE(b[15]);
 654
 655   /* output 16 */
 656   CARRY_FORWARD;
 657   SQRADDSC(a[1], a[15]); SQRADDAC(a[2], a[14]); SQRADDAC(a[3], a[13]); SQRADDAC(a[4], a[12]); SQRADDAC(a[5], a[11]); SQRADDAC(a[6], a[10]); SQRADDAC(a[7], a[9]); SQRADDDB; SQRADD(a[8], a[8]);
 658   COMBA_STORE(b[16]);
 659
 660   /* output 17 */
 661   CARRY_FORWARD;
 662   SQRADDSC(a[2], a[15]); SQRADDAC(a[3], a[14]); SQRADDAC(a[4], a[13]); SQRADDAC(a[5], a[12]); SQRADDAC(a[6], a[11]); SQRADDAC(a[7], a[10]); SQRADDAC(a[8], a[9]); SQRADDDB;
 663   COMBA_STORE(b[17]);
 664
 665   /* output 18 */
 666   CARRY_FORWARD;
 667   SQRADDSC(a[3], a[15]); SQRADDAC(a[4], a[14]); SQRADDAC(a[5], a[13]); SQRADDAC(a[6], a[12]); SQRADDAC(a[7], a[11]); SQRADDAC(a[8], a[10]); SQRADDDB; SQRADD(a[9], a[9]);
 668   COMBA_STORE(b[18]);
 669
 670   /* output 19 */
 671   CARRY_FORWARD;
 672   SQRADDSC(a[4], a[15]); SQRADDAC(a[5], a[14]); SQRADDAC(a[6], a[13]); SQRADDAC(a[7], a[12]); SQRADDAC(a[8], a[11]); SQRADDAC(a[9], a[10]); SQRADDDB;
 673   COMBA_STORE(b[19]);
 674
 675   /* output 20 */
 676   CARRY_FORWARD;
 677   SQRADDSC(a[5], a[15]); SQRADDAC(a[6], a[14]); SQRADDAC(a[7], a[13]); SQRADDAC(a[8], a[12]); SQRADDAC(a[9], a[11]); SQRADDDB; SQRADD(a[10], a[10]);
 678   COMBA_STORE(b[20]);
 679
 680   /* output 21 */
 681   CARRY_FORWARD;
 682   SQRADDSC(a[6], a[15]); SQRADDAC(a[7], a[14]); SQRADDAC(a[8], a[13]); SQRADDAC(a[9], a[12]); SQRADDAC(a[10], a[11]); SQRADDDB;
 683   COMBA_STORE(b[21]);
 684
 685   /* output 22 */
 686   CARRY_FORWARD;
 687   SQRADDSC(a[7], a[15]); SQRADDAC(a[8], a[14]); SQRADDAC(a[9], a[13]); SQRADDAC(a[10], a[12]); SQRADDDB; SQRADD(a[11], a[11]);
 688   COMBA_STORE(b[22]);
 689
 690   /* output 23 */
 691   CARRY_FORWARD;
 692   SQRADDSC(a[8], a[15]); SQRADDAC(a[9], a[14]); SQRADDAC(a[10], a[13]); SQRADDAC(a[11], a[12]); SQRADDDB;
 693   COMBA_STORE(b[23]);
 694
 695   /* output 24 */
 696   CARRY_FORWARD;
 697   SQRADDSC(a[9], a[15]); SQRADDAC(a[10], a[14]); SQRADDAC(a[11], a[13]); SQRADDDB; SQRADD(a[12], a[12]);
 698   COMBA_STORE(b[24]);
 699
 700   /* output 25 */
 701   CARRY_FORWARD;
 702   SQRADDSC(a[10], a[15]); SQRADDAC(a[11], a[14]); SQRADDAC(a[12], a[13]); SQRADDDB;
 703   COMBA_STORE(b[25]);
 704
/* from here on at most two cross terms remain per column, so SQRADD2 is used directly again */
 705   /* output 26 */
 706   CARRY_FORWARD;
 707   SQRADD2(a[11], a[15]); SQRADD2(a[12], a[14]); SQRADD(a[13], a[13]);
 708   COMBA_STORE(b[26]);
 709
 710   /* output 27 */
 711   CARRY_FORWARD;
 712   SQRADD2(a[12], a[15]); SQRADD2(a[13], a[14]);
 713   COMBA_STORE(b[27]);
 714
 715   /* output 28 */
 716   CARRY_FORWARD;
 717   SQRADD2(a[13], a[15]); SQRADD(a[14], a[14]);
 718   COMBA_STORE(b[28]);
 719
 720   /* output 29 */
 721   CARRY_FORWARD;
 722   SQRADD2(a[14], a[15]);
 723   COMBA_STORE(b[29]);
 724
 725   /* output 30 */
 726   CARRY_FORWARD;
 727   SQRADD(a[15], a[15]);
 728   COMBA_STORE(b[30]);
/* the carry out of column 30 (held in c1) is the top digit of the result */
 729   COMBA_STORE2(b[31]);
 730   COMBA_FINI;
 731
/* publish the result: 32 digits, non-negative, then strip leading zero digits */
 732   B->used = 32;
 733   B->sign = PSTM_ZPOS;
 734   memcpy(B->dp, b, 32 * sizeof(pstm_digit));
 735   pstm_clamp(B);
 736   return PSTM_OKAY;
 737}
 738#endif /* USE_1024_KEY_SPEED_OPTIMIZATIONS */
 739
 740
 741#ifdef USE_2048_KEY_SPEED_OPTIMIZATIONS
    /*
     * pstm_sqr_comba32() - fully unrolled Comba squaring of a 32-digit operand.
     *
     * Reads exactly a[0]..a[31] from A->dp and produces 64 result digits, so it
     * must only be used when A holds 32 digits (pstm_sqr_comba() dispatches
     * here on A->used == 32).  B is grown to 64 digits when its allocation is
     * smaller.  The result digits are staged in the local array b[] and copied
     * into B->dp only after every read of a[] has been done.
     *
     * Each "output k" section feeds the products a[i]*a[j] with i + j == k into
     * the running carry chain and then stores digit k.  The SQRADD*, COMBA_*
     * and CARRY_* macros are defined outside this function; presumably SQRADD
     * adds one product, SQRADD2 adds a product twice, and the
     * SQRADDSC/SQRADDAC/SQRADDDB sequence sums the cross products in sc0..sc2
     * and then adds that sum doubled -- confirm against the macro definitions.
     *
     * Returns PS_MEM_FAIL if B cannot be grown, PSTM_OKAY otherwise.
     */
 742static int32 pstm_sqr_comba32(pstm_int *A, pstm_int *B)
 743{
 744   pstm_digit *a, b[64], c0, c1, c2, sc0, sc1, sc2;
 745#ifdef PSTM_ISO
       /* presumably scratch for the PSTM_ISO (portable C) macro variants -- confirm */
 746   pstm_word tt;
 747#endif
 748
            /* The result written below is always 64 digits long. */
 749        if (B->alloc < 64) {
 750                if (pstm_grow(B, 64) != PSTM_OKAY) {
 751                        return PS_MEM_FAIL;
 752                }
 753        }
 754        sc0 = sc1 = sc2 = 0;
       /* A->dp is fetched only after the possible pstm_grow() above. */
 755   a = A->dp;
 756   COMBA_START;
 757
 758   /* clear carries */
 759   CLEAR_CARRY;
 760
 761   /* output 0 */
 762   SQRADD(a[0],a[0]);
 763   COMBA_STORE(b[0]);
 764
 765   /* output 1 */
 766   CARRY_FORWARD;
 767   SQRADD2(a[0], a[1]);
 768   COMBA_STORE(b[1]);
 769
 770   /* output 2 */
 771   CARRY_FORWARD;
 772   SQRADD2(a[0], a[2]); SQRADD(a[1], a[1]);
 773   COMBA_STORE(b[2]);
 774
 775   /* output 3 */
 776   CARRY_FORWARD;
 777   SQRADD2(a[0], a[3]); SQRADD2(a[1], a[2]);
 778   COMBA_STORE(b[3]);
 779
 780   /* output 4 */
 781   CARRY_FORWARD;
 782   SQRADD2(a[0], a[4]); SQRADD2(a[1], a[3]); SQRADD(a[2], a[2]);
 783   COMBA_STORE(b[4]);
 784
 785   /* output 5 */
 786   CARRY_FORWARD;
 787   SQRADDSC(a[0], a[5]); SQRADDAC(a[1], a[4]); SQRADDAC(a[2], a[3]); SQRADDDB;
 788   COMBA_STORE(b[5]);
 789
 790   /* output 6 */
 791   CARRY_FORWARD;
 792   SQRADDSC(a[0], a[6]); SQRADDAC(a[1], a[5]); SQRADDAC(a[2], a[4]); SQRADDDB; SQRADD(a[3], a[3]);
 793   COMBA_STORE(b[6]);
 794
 795   /* output 7 */
 796   CARRY_FORWARD;
 797   SQRADDSC(a[0], a[7]); SQRADDAC(a[1], a[6]); SQRADDAC(a[2], a[5]); SQRADDAC(a[3], a[4]); SQRADDDB;
 798   COMBA_STORE(b[7]);
 799
 800   /* output 8 */
 801   CARRY_FORWARD;
 802   SQRADDSC(a[0], a[8]); SQRADDAC(a[1], a[7]); SQRADDAC(a[2], a[6]); SQRADDAC(a[3], a[5]); SQRADDDB; SQRADD(a[4], a[4]);
 803   COMBA_STORE(b[8]);
 804
 805   /* output 9 */
 806   CARRY_FORWARD;
 807   SQRADDSC(a[0], a[9]); SQRADDAC(a[1], a[8]); SQRADDAC(a[2], a[7]); SQRADDAC(a[3], a[6]); SQRADDAC(a[4], a[5]); SQRADDDB;
 808   COMBA_STORE(b[9]);
 809
 810   /* output 10 */
 811   CARRY_FORWARD;
 812   SQRADDSC(a[0], a[10]); SQRADDAC(a[1], a[9]); SQRADDAC(a[2], a[8]); SQRADDAC(a[3], a[7]); SQRADDAC(a[4], a[6]); SQRADDDB; SQRADD(a[5], a[5]);
 813   COMBA_STORE(b[10]);
 814
 815   /* output 11 */
 816   CARRY_FORWARD;
 817   SQRADDSC(a[0], a[11]); SQRADDAC(a[1], a[10]); SQRADDAC(a[2], a[9]); SQRADDAC(a[3], a[8]); SQRADDAC(a[4], a[7]); SQRADDAC(a[5], a[6]); SQRADDDB;
 818   COMBA_STORE(b[11]);
 819
 820   /* output 12 */
 821   CARRY_FORWARD;
 822   SQRADDSC(a[0], a[12]); SQRADDAC(a[1], a[11]); SQRADDAC(a[2], a[10]); SQRADDAC(a[3], a[9]); SQRADDAC(a[4], a[8]); SQRADDAC(a[5], a[7]); SQRADDDB; SQRADD(a[6], a[6]);
 823   COMBA_STORE(b[12]);
 824
 825   /* output 13 */
 826   CARRY_FORWARD;
 827   SQRADDSC(a[0], a[13]); SQRADDAC(a[1], a[12]); SQRADDAC(a[2], a[11]); SQRADDAC(a[3], a[10]); SQRADDAC(a[4], a[9]); SQRADDAC(a[5], a[8]); SQRADDAC(a[6], a[7]); SQRADDDB;
 828   COMBA_STORE(b[13]);
 829
 830   /* output 14 */
 831   CARRY_FORWARD;
 832   SQRADDSC(a[0], a[14]); SQRADDAC(a[1], a[13]); SQRADDAC(a[2], a[12]); SQRADDAC(a[3], a[11]); SQRADDAC(a[4], a[10]); SQRADDAC(a[5], a[9]); SQRADDAC(a[6], a[8]); SQRADDDB; SQRADD(a[7], a[7]);
 833   COMBA_STORE(b[14]);
 834
 835   /* output 15 */
 836   CARRY_FORWARD;
 837   SQRADDSC(a[0], a[15]); SQRADDAC(a[1], a[14]); SQRADDAC(a[2], a[13]); SQRADDAC(a[3], a[12]); SQRADDAC(a[4], a[11]); SQRADDAC(a[5], a[10]); SQRADDAC(a[6], a[9]); SQRADDAC(a[7], a[8]); SQRADDDB;
 838   COMBA_STORE(b[15]);
 839
 840   /* output 16 */
 841   CARRY_FORWARD;
 842   SQRADDSC(a[0], a[16]); SQRADDAC(a[1], a[15]); SQRADDAC(a[2], a[14]); SQRADDAC(a[3], a[13]); SQRADDAC(a[4], a[12]); SQRADDAC(a[5], a[11]); SQRADDAC(a[6], a[10]); SQRADDAC(a[7], a[9]); SQRADDDB; SQRADD(a[8], a[8]);
 843   COMBA_STORE(b[16]);
 844
 845   /* output 17 */
 846   CARRY_FORWARD;
 847   SQRADDSC(a[0], a[17]); SQRADDAC(a[1], a[16]); SQRADDAC(a[2], a[15]); SQRADDAC(a[3], a[14]); SQRADDAC(a[4], a[13]); SQRADDAC(a[5], a[12]); SQRADDAC(a[6], a[11]); SQRADDAC(a[7], a[10]); SQRADDAC(a[8], a[9]); SQRADDDB;
 848   COMBA_STORE(b[17]);
 849
 850   /* output 18 */
 851   CARRY_FORWARD;
 852   SQRADDSC(a[0], a[18]); SQRADDAC(a[1], a[17]); SQRADDAC(a[2], a[16]); SQRADDAC(a[3], a[15]); SQRADDAC(a[4], a[14]); SQRADDAC(a[5], a[13]); SQRADDAC(a[6], a[12]); SQRADDAC(a[7], a[11]); SQRADDAC(a[8], a[10]); SQRADDDB; SQRADD(a[9], a[9]);
 853   COMBA_STORE(b[18]);
 854
 855   /* output 19 */
 856   CARRY_FORWARD;
 857   SQRADDSC(a[0], a[19]); SQRADDAC(a[1], a[18]); SQRADDAC(a[2], a[17]); SQRADDAC(a[3], a[16]); SQRADDAC(a[4], a[15]); SQRADDAC(a[5], a[14]); SQRADDAC(a[6], a[13]); SQRADDAC(a[7], a[12]); SQRADDAC(a[8], a[11]); SQRADDAC(a[9], a[10]); SQRADDDB;
 858   COMBA_STORE(b[19]);
 859
 860   /* output 20 */
 861   CARRY_FORWARD;
 862   SQRADDSC(a[0], a[20]); SQRADDAC(a[1], a[19]); SQRADDAC(a[2], a[18]); SQRADDAC(a[3], a[17]); SQRADDAC(a[4], a[16]); SQRADDAC(a[5], a[15]); SQRADDAC(a[6], a[14]); SQRADDAC(a[7], a[13]); SQRADDAC(a[8], a[12]); SQRADDAC(a[9], a[11]); SQRADDDB; SQRADD(a[10], a[10]);
 863   COMBA_STORE(b[20]);
 864
 865   /* output 21 */
 866   CARRY_FORWARD;
 867   SQRADDSC(a[0], a[21]); SQRADDAC(a[1], a[20]); SQRADDAC(a[2], a[19]); SQRADDAC(a[3], a[18]); SQRADDAC(a[4], a[17]); SQRADDAC(a[5], a[16]); SQRADDAC(a[6], a[15]); SQRADDAC(a[7], a[14]); SQRADDAC(a[8], a[13]); SQRADDAC(a[9], a[12]); SQRADDAC(a[10], a[11]); SQRADDDB;
 868   COMBA_STORE(b[21]);
 869
 870   /* output 22 */
 871   CARRY_FORWARD;
 872   SQRADDSC(a[0], a[22]); SQRADDAC(a[1], a[21]); SQRADDAC(a[2], a[20]); SQRADDAC(a[3], a[19]); SQRADDAC(a[4], a[18]); SQRADDAC(a[5], a[17]); SQRADDAC(a[6], a[16]); SQRADDAC(a[7], a[15]); SQRADDAC(a[8], a[14]); SQRADDAC(a[9], a[13]); SQRADDAC(a[10], a[12]); SQRADDDB; SQRADD(a[11], a[11]);
 873   COMBA_STORE(b[22]);
 874
 875   /* output 23 */
 876   CARRY_FORWARD;
 877   SQRADDSC(a[0], a[23]); SQRADDAC(a[1], a[22]); SQRADDAC(a[2], a[21]); SQRADDAC(a[3], a[20]); SQRADDAC(a[4], a[19]); SQRADDAC(a[5], a[18]); SQRADDAC(a[6], a[17]); SQRADDAC(a[7], a[16]); SQRADDAC(a[8], a[15]); SQRADDAC(a[9], a[14]); SQRADDAC(a[10], a[13]); SQRADDAC(a[11], a[12]); SQRADDDB;
 878   COMBA_STORE(b[23]);
 879
 880   /* output 24 */
 881   CARRY_FORWARD;
 882   SQRADDSC(a[0], a[24]); SQRADDAC(a[1], a[23]); SQRADDAC(a[2], a[22]); SQRADDAC(a[3], a[21]); SQRADDAC(a[4], a[20]); SQRADDAC(a[5], a[19]); SQRADDAC(a[6], a[18]); SQRADDAC(a[7], a[17]); SQRADDAC(a[8], a[16]); SQRADDAC(a[9], a[15]); SQRADDAC(a[10], a[14]); SQRADDAC(a[11], a[13]); SQRADDDB; SQRADD(a[12], a[12]);
 883   COMBA_STORE(b[24]);
 884
 885   /* output 25 */
 886   CARRY_FORWARD;
 887   SQRADDSC(a[0], a[25]); SQRADDAC(a[1], a[24]); SQRADDAC(a[2], a[23]); SQRADDAC(a[3], a[22]); SQRADDAC(a[4], a[21]); SQRADDAC(a[5], a[20]); SQRADDAC(a[6], a[19]); SQRADDAC(a[7], a[18]); SQRADDAC(a[8], a[17]); SQRADDAC(a[9], a[16]); SQRADDAC(a[10], a[15]); SQRADDAC(a[11], a[14]); SQRADDAC(a[12], a[13]); SQRADDDB;
 888   COMBA_STORE(b[25]);
 889
 890   /* output 26 */
 891   CARRY_FORWARD;
 892   SQRADDSC(a[0], a[26]); SQRADDAC(a[1], a[25]); SQRADDAC(a[2], a[24]); SQRADDAC(a[3], a[23]); SQRADDAC(a[4], a[22]); SQRADDAC(a[5], a[21]); SQRADDAC(a[6], a[20]); SQRADDAC(a[7], a[19]); SQRADDAC(a[8], a[18]); SQRADDAC(a[9], a[17]); SQRADDAC(a[10], a[16]); SQRADDAC(a[11], a[15]); SQRADDAC(a[12], a[14]); SQRADDDB; SQRADD(a[13], a[13]);
 893   COMBA_STORE(b[26]);
 894
 895   /* output 27 */
 896   CARRY_FORWARD;
 897   SQRADDSC(a[0], a[27]); SQRADDAC(a[1], a[26]); SQRADDAC(a[2], a[25]); SQRADDAC(a[3], a[24]); SQRADDAC(a[4], a[23]); SQRADDAC(a[5], a[22]); SQRADDAC(a[6], a[21]); SQRADDAC(a[7], a[20]); SQRADDAC(a[8], a[19]); SQRADDAC(a[9], a[18]); SQRADDAC(a[10], a[17]); SQRADDAC(a[11], a[16]); SQRADDAC(a[12], a[15]); SQRADDAC(a[13], a[14]); SQRADDDB;
 898   COMBA_STORE(b[27]);
 899
 900   /* output 28 */
 901   CARRY_FORWARD;
 902   SQRADDSC(a[0], a[28]); SQRADDAC(a[1], a[27]); SQRADDAC(a[2], a[26]); SQRADDAC(a[3], a[25]); SQRADDAC(a[4], a[24]); SQRADDAC(a[5], a[23]); SQRADDAC(a[6], a[22]); SQRADDAC(a[7], a[21]); SQRADDAC(a[8], a[20]); SQRADDAC(a[9], a[19]); SQRADDAC(a[10], a[18]); SQRADDAC(a[11], a[17]); SQRADDAC(a[12], a[16]); SQRADDAC(a[13], a[15]); SQRADDDB; SQRADD(a[14], a[14]);
 903   COMBA_STORE(b[28]);
 904
 905   /* output 29 */
 906   CARRY_FORWARD;
 907   SQRADDSC(a[0], a[29]); SQRADDAC(a[1], a[28]); SQRADDAC(a[2], a[27]); SQRADDAC(a[3], a[26]); SQRADDAC(a[4], a[25]); SQRADDAC(a[5], a[24]); SQRADDAC(a[6], a[23]); SQRADDAC(a[7], a[22]); SQRADDAC(a[8], a[21]); SQRADDAC(a[9], a[20]); SQRADDAC(a[10], a[19]); SQRADDAC(a[11], a[18]); SQRADDAC(a[12], a[17]); SQRADDAC(a[13], a[16]); SQRADDAC(a[14], a[15]); SQRADDDB;
 908   COMBA_STORE(b[29]);
 909
 910   /* output 30 */
 911   CARRY_FORWARD;
 912   SQRADDSC(a[0], a[30]); SQRADDAC(a[1], a[29]); SQRADDAC(a[2], a[28]); SQRADDAC(a[3], a[27]); SQRADDAC(a[4], a[26]); SQRADDAC(a[5], a[25]); SQRADDAC(a[6], a[24]); SQRADDAC(a[7], a[23]); SQRADDAC(a[8], a[22]); SQRADDAC(a[9], a[21]); SQRADDAC(a[10], a[20]); SQRADDAC(a[11], a[19]); SQRADDAC(a[12], a[18]); SQRADDAC(a[13], a[17]); SQRADDAC(a[14], a[16]); SQRADDDB; SQRADD(a[15], a[15]);
 913   COMBA_STORE(b[30]);
 914
 915   /* output 31 */
 916   CARRY_FORWARD;
 917   SQRADDSC(a[0], a[31]); SQRADDAC(a[1], a[30]); SQRADDAC(a[2], a[29]); SQRADDAC(a[3], a[28]); SQRADDAC(a[4], a[27]); SQRADDAC(a[5], a[26]); SQRADDAC(a[6], a[25]); SQRADDAC(a[7], a[24]); SQRADDAC(a[8], a[23]); SQRADDAC(a[9], a[22]); SQRADDAC(a[10], a[21]); SQRADDAC(a[11], a[20]); SQRADDAC(a[12], a[19]); SQRADDAC(a[13], a[18]); SQRADDAC(a[14], a[17]); SQRADDAC(a[15], a[16]); SQRADDDB;
 918   COMBA_STORE(b[31]);
 919
       /*
        * Upper half: a[] has only 32 digits, so from output 32 onwards the
        * pairs start at a[k - 31] and each column has fewer terms.
        */
 920   /* output 32 */
 921   CARRY_FORWARD;
 922   SQRADDSC(a[1], a[31]); SQRADDAC(a[2], a[30]); SQRADDAC(a[3], a[29]); SQRADDAC(a[4], a[28]); SQRADDAC(a[5], a[27]); SQRADDAC(a[6], a[26]); SQRADDAC(a[7], a[25]); SQRADDAC(a[8], a[24]); SQRADDAC(a[9], a[23]); SQRADDAC(a[10], a[22]); SQRADDAC(a[11], a[21]); SQRADDAC(a[12], a[20]); SQRADDAC(a[13], a[19]); SQRADDAC(a[14], a[18]); SQRADDAC(a[15], a[17]); SQRADDDB; SQRADD(a[16], a[16]);
 923   COMBA_STORE(b[32]);
 924
 925   /* output 33 */
 926   CARRY_FORWARD;
 927   SQRADDSC(a[2], a[31]); SQRADDAC(a[3], a[30]); SQRADDAC(a[4], a[29]); SQRADDAC(a[5], a[28]); SQRADDAC(a[6], a[27]); SQRADDAC(a[7], a[26]); SQRADDAC(a[8], a[25]); SQRADDAC(a[9], a[24]); SQRADDAC(a[10], a[23]); SQRADDAC(a[11], a[22]); SQRADDAC(a[12], a[21]); SQRADDAC(a[13], a[20]); SQRADDAC(a[14], a[19]); SQRADDAC(a[15], a[18]); SQRADDAC(a[16], a[17]); SQRADDDB;
 928   COMBA_STORE(b[33]);
 929
 930   /* output 34 */
 931   CARRY_FORWARD;
 932   SQRADDSC(a[3], a[31]); SQRADDAC(a[4], a[30]); SQRADDAC(a[5], a[29]); SQRADDAC(a[6], a[28]); SQRADDAC(a[7], a[27]); SQRADDAC(a[8], a[26]); SQRADDAC(a[9], a[25]); SQRADDAC(a[10], a[24]); SQRADDAC(a[11], a[23]); SQRADDAC(a[12], a[22]); SQRADDAC(a[13], a[21]); SQRADDAC(a[14], a[20]); SQRADDAC(a[15], a[19]); SQRADDAC(a[16], a[18]); SQRADDDB; SQRADD(a[17], a[17]);
 933   COMBA_STORE(b[34]);
 934
 935   /* output 35 */
 936   CARRY_FORWARD;
 937   SQRADDSC(a[4], a[31]); SQRADDAC(a[5], a[30]); SQRADDAC(a[6], a[29]); SQRADDAC(a[7], a[28]); SQRADDAC(a[8], a[27]); SQRADDAC(a[9], a[26]); SQRADDAC(a[10], a[25]); SQRADDAC(a[11], a[24]); SQRADDAC(a[12], a[23]); SQRADDAC(a[13], a[22]); SQRADDAC(a[14], a[21]); SQRADDAC(a[15], a[20]); SQRADDAC(a[16], a[19]); SQRADDAC(a[17], a[18]); SQRADDDB;
 938   COMBA_STORE(b[35]);
 939
 940   /* output 36 */
 941   CARRY_FORWARD;
 942   SQRADDSC(a[5], a[31]); SQRADDAC(a[6], a[30]); SQRADDAC(a[7], a[29]); SQRADDAC(a[8], a[28]); SQRADDAC(a[9], a[27]); SQRADDAC(a[10], a[26]); SQRADDAC(a[11], a[25]); SQRADDAC(a[12], a[24]); SQRADDAC(a[13], a[23]); SQRADDAC(a[14], a[22]); SQRADDAC(a[15], a[21]); SQRADDAC(a[16], a[20]); SQRADDAC(a[17], a[19]); SQRADDDB; SQRADD(a[18], a[18]);
 943   COMBA_STORE(b[36]);
 944
 945   /* output 37 */
 946   CARRY_FORWARD;
 947   SQRADDSC(a[6], a[31]); SQRADDAC(a[7], a[30]); SQRADDAC(a[8], a[29]); SQRADDAC(a[9], a[28]); SQRADDAC(a[10], a[27]); SQRADDAC(a[11], a[26]); SQRADDAC(a[12], a[25]); SQRADDAC(a[13], a[24]); SQRADDAC(a[14], a[23]); SQRADDAC(a[15], a[22]); SQRADDAC(a[16], a[21]); SQRADDAC(a[17], a[20]); SQRADDAC(a[18], a[19]); SQRADDDB;
 948   COMBA_STORE(b[37]);
 949
 950   /* output 38 */
 951   CARRY_FORWARD;
 952   SQRADDSC(a[7], a[31]); SQRADDAC(a[8], a[30]); SQRADDAC(a[9], a[29]); SQRADDAC(a[10], a[28]); SQRADDAC(a[11], a[27]); SQRADDAC(a[12], a[26]); SQRADDAC(a[13], a[25]); SQRADDAC(a[14], a[24]); SQRADDAC(a[15], a[23]); SQRADDAC(a[16], a[22]); SQRADDAC(a[17], a[21]); SQRADDAC(a[18], a[20]); SQRADDDB; SQRADD(a[19], a[19]);
 953   COMBA_STORE(b[38]);
 954
 955   /* output 39 */
 956   CARRY_FORWARD;
 957   SQRADDSC(a[8], a[31]); SQRADDAC(a[9], a[30]); SQRADDAC(a[10], a[29]); SQRADDAC(a[11], a[28]); SQRADDAC(a[12], a[27]); SQRADDAC(a[13], a[26]); SQRADDAC(a[14], a[25]); SQRADDAC(a[15], a[24]); SQRADDAC(a[16], a[23]); SQRADDAC(a[17], a[22]); SQRADDAC(a[18], a[21]); SQRADDAC(a[19], a[20]); SQRADDDB;
 958   COMBA_STORE(b[39]);
 959
 960   /* output 40 */
 961   CARRY_FORWARD;
 962   SQRADDSC(a[9], a[31]); SQRADDAC(a[10], a[30]); SQRADDAC(a[11], a[29]); SQRADDAC(a[12], a[28]); SQRADDAC(a[13], a[27]); SQRADDAC(a[14], a[26]); SQRADDAC(a[15], a[25]); SQRADDAC(a[16], a[24]); SQRADDAC(a[17], a[23]); SQRADDAC(a[18], a[22]); SQRADDAC(a[19], a[21]); SQRADDDB; SQRADD(a[20], a[20]);
 963   COMBA_STORE(b[40]);
 964
 965   /* output 41 */
 966   CARRY_FORWARD;
 967   SQRADDSC(a[10], a[31]); SQRADDAC(a[11], a[30]); SQRADDAC(a[12], a[29]); SQRADDAC(a[13], a[28]); SQRADDAC(a[14], a[27]); SQRADDAC(a[15], a[26]); SQRADDAC(a[16], a[25]); SQRADDAC(a[17], a[24]); SQRADDAC(a[18], a[23]); SQRADDAC(a[19], a[22]); SQRADDAC(a[20], a[21]); SQRADDDB;
 968   COMBA_STORE(b[41]);
 969
 970   /* output 42 */
 971   CARRY_FORWARD;
 972   SQRADDSC(a[11], a[31]); SQRADDAC(a[12], a[30]); SQRADDAC(a[13], a[29]); SQRADDAC(a[14], a[28]); SQRADDAC(a[15], a[27]); SQRADDAC(a[16], a[26]); SQRADDAC(a[17], a[25]); SQRADDAC(a[18], a[24]); SQRADDAC(a[19], a[23]); SQRADDAC(a[20], a[22]); SQRADDDB; SQRADD(a[21], a[21]);
 973   COMBA_STORE(b[42]);
 974
 975   /* output 43 */
 976   CARRY_FORWARD;
 977   SQRADDSC(a[12], a[31]); SQRADDAC(a[13], a[30]); SQRADDAC(a[14], a[29]); SQRADDAC(a[15], a[28]); SQRADDAC(a[16], a[27]); SQRADDAC(a[17], a[26]); SQRADDAC(a[18], a[25]); SQRADDAC(a[19], a[24]); SQRADDAC(a[20], a[23]); SQRADDAC(a[21], a[22]); SQRADDDB;
 978   COMBA_STORE(b[43]);
 979
 980   /* output 44 */
 981   CARRY_FORWARD;
 982   SQRADDSC(a[13], a[31]); SQRADDAC(a[14], a[30]); SQRADDAC(a[15], a[29]); SQRADDAC(a[16], a[28]); SQRADDAC(a[17], a[27]); SQRADDAC(a[18], a[26]); SQRADDAC(a[19], a[25]); SQRADDAC(a[20], a[24]); SQRADDAC(a[21], a[23]); SQRADDDB; SQRADD(a[22], a[22]);
 983   COMBA_STORE(b[44]);
 984
 985   /* output 45 */
 986   CARRY_FORWARD;
 987   SQRADDSC(a[14], a[31]); SQRADDAC(a[15], a[30]); SQRADDAC(a[16], a[29]); SQRADDAC(a[17], a[28]); SQRADDAC(a[18], a[27]); SQRADDAC(a[19], a[26]); SQRADDAC(a[20], a[25]); SQRADDAC(a[21], a[24]); SQRADDAC(a[22], a[23]); SQRADDDB;
 988   COMBA_STORE(b[45]);
 989
 990   /* output 46 */
 991   CARRY_FORWARD;
 992   SQRADDSC(a[15], a[31]); SQRADDAC(a[16], a[30]); SQRADDAC(a[17], a[29]); SQRADDAC(a[18], a[28]); SQRADDAC(a[19], a[27]); SQRADDAC(a[20], a[26]); SQRADDAC(a[21], a[25]); SQRADDAC(a[22], a[24]); SQRADDDB; SQRADD(a[23], a[23]);
 993   COMBA_STORE(b[46]);
 994
 995   /* output 47 */
 996   CARRY_FORWARD;
 997   SQRADDSC(a[16], a[31]); SQRADDAC(a[17], a[30]); SQRADDAC(a[18], a[29]); SQRADDAC(a[19], a[28]); SQRADDAC(a[20], a[27]); SQRADDAC(a[21], a[26]); SQRADDAC(a[22], a[25]); SQRADDAC(a[23], a[24]); SQRADDDB;
 998   COMBA_STORE(b[47]);
 999
1000   /* output 48 */
1001   CARRY_FORWARD;
1002   SQRADDSC(a[17], a[31]); SQRADDAC(a[18], a[30]); SQRADDAC(a[19], a[29]); SQRADDAC(a[20], a[28]); SQRADDAC(a[21], a[27]); SQRADDAC(a[22], a[26]); SQRADDAC(a[23], a[25]); SQRADDDB; SQRADD(a[24], a[24]);
1003   COMBA_STORE(b[48]);
1004
1005   /* output 49 */
1006   CARRY_FORWARD;
1007   SQRADDSC(a[18], a[31]); SQRADDAC(a[19], a[30]); SQRADDAC(a[20], a[29]); SQRADDAC(a[21], a[28]); SQRADDAC(a[22], a[27]); SQRADDAC(a[23], a[26]); SQRADDAC(a[24], a[25]); SQRADDDB;
1008   COMBA_STORE(b[49]);
1009
1010   /* output 50 */
1011   CARRY_FORWARD;
1012   SQRADDSC(a[19], a[31]); SQRADDAC(a[20], a[30]); SQRADDAC(a[21], a[29]); SQRADDAC(a[22], a[28]); SQRADDAC(a[23], a[27]); SQRADDAC(a[24], a[26]); SQRADDDB; SQRADD(a[25], a[25]);
1013   COMBA_STORE(b[50]);
1014
1015   /* output 51 */
1016   CARRY_FORWARD;
1017   SQRADDSC(a[20], a[31]); SQRADDAC(a[21], a[30]); SQRADDAC(a[22], a[29]); SQRADDAC(a[23], a[28]); SQRADDAC(a[24], a[27]); SQRADDAC(a[25], a[26]); SQRADDDB;
1018   COMBA_STORE(b[51]);
1019
1020   /* output 52 */
1021   CARRY_FORWARD;
1022   SQRADDSC(a[21], a[31]); SQRADDAC(a[22], a[30]); SQRADDAC(a[23], a[29]); SQRADDAC(a[24], a[28]); SQRADDAC(a[25], a[27]); SQRADDDB; SQRADD(a[26], a[26]);
1023   COMBA_STORE(b[52]);
1024
1025   /* output 53 */
1026   CARRY_FORWARD;
1027   SQRADDSC(a[22], a[31]); SQRADDAC(a[23], a[30]); SQRADDAC(a[24], a[29]); SQRADDAC(a[25], a[28]); SQRADDAC(a[26], a[27]); SQRADDDB;
1028   COMBA_STORE(b[53]);
1029
1030   /* output 54 */
1031   CARRY_FORWARD;
1032   SQRADDSC(a[23], a[31]); SQRADDAC(a[24], a[30]); SQRADDAC(a[25], a[29]); SQRADDAC(a[26], a[28]); SQRADDDB; SQRADD(a[27], a[27]);
1033   COMBA_STORE(b[54]);
1034
1035   /* output 55 */
1036   CARRY_FORWARD;
1037   SQRADDSC(a[24], a[31]); SQRADDAC(a[25], a[30]); SQRADDAC(a[26], a[29]); SQRADDAC(a[27], a[28]); SQRADDDB;
1038   COMBA_STORE(b[55]);
1039
1040   /* output 56 */
1041   CARRY_FORWARD;
1042   SQRADDSC(a[25], a[31]); SQRADDAC(a[26], a[30]); SQRADDAC(a[27], a[29]); SQRADDDB; SQRADD(a[28], a[28]);
1043   COMBA_STORE(b[56]);
1044
1045   /* output 57 */
1046   CARRY_FORWARD;
1047   SQRADDSC(a[26], a[31]); SQRADDAC(a[27], a[30]); SQRADDAC(a[28], a[29]); SQRADDDB;
1048   COMBA_STORE(b[57]);
1049
1050   /* output 58 */
1051   CARRY_FORWARD;
1052   SQRADD2(a[27], a[31]); SQRADD2(a[28], a[30]); SQRADD(a[29], a[29]);
1053   COMBA_STORE(b[58]);
1054
1055   /* output 59 */
1056   CARRY_FORWARD;
1057   SQRADD2(a[28], a[31]); SQRADD2(a[29], a[30]);
1058   COMBA_STORE(b[59]);
1059
1060   /* output 60 */
1061   CARRY_FORWARD;
1062   SQRADD2(a[29], a[31]); SQRADD(a[30], a[30]);
1063   COMBA_STORE(b[60]);
1064
1065   /* output 61 */
1066   CARRY_FORWARD;
1067   SQRADD2(a[30], a[31]);
1068   COMBA_STORE(b[61]);
1069
1070   /* output 62 */
1071   CARRY_FORWARD;
1072   SQRADD(a[31], a[31]);
1073   COMBA_STORE(b[62]);
       /* b[63] is written by COMBA_STORE2 -- presumably the final carry word; confirm in the macro */
1074   COMBA_STORE2(b[63]);
1075   COMBA_FINI;
1076
       /*
        * Publish the staged result: 64 digits, non-negative.  pstm_clamp()
        * presumably trims leading zero digits and adjusts B->used -- confirm.
        */
1077   B->used = 64;
1078   B->sign = PSTM_ZPOS;
1079   memcpy(B->dp, b, 64 * sizeof(pstm_digit));
1080   pstm_clamp(B);
1081   return PSTM_OKAY;
1082}
1083#endif /* USE_2048_KEY_SPEED_OPTIMIZATIONS */
1084
1085/******************************************************************************/
1086/*
1087 */
1088int32 FAST_FUNC pstm_sqr_comba(psPool_t *pool, pstm_int *A, pstm_int *B, pstm_digit *paD,
1089                uint32 paDlen)
1090{
1091#ifdef USE_1024_KEY_SPEED_OPTIMIZATIONS
1092        if (A->used == 16) {
1093                return pstm_sqr_comba16(A, B);
1094        } else {
1095#ifdef USE_2048_KEY_SPEED_OPTIMIZATIONS
1096                if (A->used == 32) {
1097                        return pstm_sqr_comba32(A, B);
1098                }
1099#endif /* USE_2048_KEY_SPEED_OPTIMIZATIONS */
1100                return pstm_sqr_comba_gen(pool, A, B, paD, paDlen);
1101        }
1102#else
1103#ifdef USE_2048_KEY_SPEED_OPTIMIZATIONS
1104        if (A->used == 32) {
1105                return pstm_sqr_comba32(A, B);
1106        }
1107#endif /* USE_2048_KEY_SPEED_OPTIMIZATIONS */
1108        return pstm_sqr_comba_gen(pool, A, B, paD, paDlen);
1109#endif
1110}
1111
1112#endif /* DISABLE_PSTM */
1113/******************************************************************************/
1114