LXR linux/arch/arm/crypto/aes-neonbs-core.S

   1/* SPDX-License-Identifier: GPL-2.0-only */
   2/*
   3 * Bit sliced AES using NEON instructions
   4 *
   5 * Copyright (C) 2017 Linaro Ltd.
   6 * Author: Ard Biesheuvel <ard.biesheuvel@linaro.org>
   7 */
   8
   9/*
  10 * The algorithm implemented here is described in detail by the paper
  11 * 'Faster and Timing-Attack Resistant AES-GCM' by Emilia Kaesper and
  12 * Peter Schwabe (https://eprint.iacr.org/2009/129.pdf)
  13 *
  14 * This implementation is based primarily on the OpenSSL implementation
  15 * for 32-bit ARM written by Andy Polyakov <appro@openssl.org>
  16 */
  17
  18#include <linux/linkage.h>
  19#include <asm/assembler.h>
  20
  21        .text
  22        .fpu            neon
  23
  24        rounds          .req    ip              // AES round counter used by the 8-block helpers
  25        bskey           .req    r4              // address from which round keys are loaded
  26
            /*
             * Names for the low and high 64-bit halves of each quad register
             * (qN overlays d(2N) and d(2N+1)).  Macros such as __tbl and __ldr
             * reach a half by pasting an l or h suffix onto a q register name.
             */
  27        q0l             .req    d0
  28        q0h             .req    d1
  29        q1l             .req    d2
  30        q1h             .req    d3
  31        q2l             .req    d4
  32        q2h             .req    d5
  33        q3l             .req    d6
  34        q3h             .req    d7
  35        q4l             .req    d8
  36        q4h             .req    d9
  37        q5l             .req    d10
  38        q5h             .req    d11
  39        q6l             .req    d12
  40        q6h             .req    d13
  41        q7l             .req    d14
  42        q7h             .req    d15
  43        q8l             .req    d16
  44        q8h             .req    d17
  45        q9l             .req    d18
  46        q9h             .req    d19
  47        q10l            .req    d20
  48        q10h            .req    d21
  49        q11l            .req    d22
  50        q11h            .req    d23
  51        q12l            .req    d24
  52        q12h            .req    d25
  53        q13l            .req    d26
  54        q13h            .req    d27
  55        q14l            .req    d28
  56        q14h            .req    d29
  57        q15l            .req    d30
  58        q15h            .req    d31
  59
  60        .macro          __tbl, out, tbl, in, tmp
            /*
             * 16-byte table lookup, done as two 8-byte vtbl.8 operations: each
             * half of out receives the bytes of the 16-byte table tbl that are
             * selected by the indices in the matching half of in.
             *
             * If out and tbl name the same register, the first vtbl.8 would
             * overwrite half of the table, so the table is copied to tmp
             * beforehand and the second lookup reads from that copy.  tmp is
             * mandatory in that case (assembly-time error otherwise) and is
             * not touched in any other case.
             */
  61        .ifc            \out, \tbl
  62        .ifb            \tmp
  63        .error          __tbl needs temp register if out == tbl
  64        .endif
  65        vmov            \tmp, \out
  66        .endif
  67        vtbl.8          \out\()l, {\tbl}, \in\()l
  68        .ifc            \out, \tbl
  69        vtbl.8          \out\()h, {\tmp}, \in\()h
  70        .else
  71        vtbl.8          \out\()h, {\tbl}, \in\()h
  72        .endif
  73        .endm
  74
  75        .macro          __ldr, out, sym
            /*
             * Load the 16 bytes at sym into quad register out using two 8-byte
             * vldr loads: the low half from sym, the high half from sym + 8.
             */
  76        vldr            \out\()l, \sym
  77        vldr            \out\()h, \sym + 8
  78        .endm
  79
  80        .macro          in_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7
            /*
             * First stage of the sbox macro: an XOR-only (linear) mixing of the
             * eight registers b0-b7, performed in place.  No other register is
             * written.  The name presumably stands for "input basis change"
             * (TODO confirm against the referenced paper).
             * Every veor uses results of earlier ones, so order matters.
             */
  81        veor            \b2, \b2, \b1
  82        veor            \b5, \b5, \b6
  83        veor            \b3, \b3, \b0
  84        veor            \b6, \b6, \b2
  85        veor            \b5, \b5, \b0
  86        veor            \b6, \b6, \b3
  87        veor            \b3, \b3, \b7
  88        veor            \b7, \b7, \b5
  89        veor            \b3, \b3, \b4
  90        veor            \b4, \b4, \b5
  91        veor            \b2, \b2, \b7
  92        veor            \b3, \b3, \b1
  93        veor            \b1, \b1, \b5
  94        .endm
  95
  96        .macro          out_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7
            /*
             * Last stage of the sbox macro: an XOR-only (linear) mixing of the
             * eight registers b0-b7, performed in place.  No other register is
             * written.  The name presumably stands for "output basis change"
             * (TODO confirm against the referenced paper).
             */
  97        veor            \b0, \b0, \b6
  98        veor            \b1, \b1, \b4
  99        veor            \b4, \b4, \b6
 100        veor            \b2, \b2, \b0
 101        veor            \b6, \b6, \b1
 102        veor            \b1, \b1, \b5
 103        veor            \b5, \b5, \b3
 104        veor            \b3, \b3, \b7
 105        veor            \b7, \b7, \b5
 106        veor            \b2, \b2, \b5
 107        veor            \b4, \b4, \b7
 108        .endm
 109
 110        .macro          inv_in_bs_ch, b6, b1, b2, b4, b7, b0, b3, b5
            /*
             * First stage of the inv_sbox macro: an XOR-only (linear) mixing
             * of its eight register arguments, performed in place.
             *
             * The formal parameter names are not in numeric order: positional
             * arguments 1..8 are bound to b6, b1, b2, b4, b7, b0, b3, b5, so
             * the bN used in the body is not the N-th argument of the call.
             */
 111        veor            \b1, \b1, \b7
 112        veor            \b4, \b4, \b7
 113        veor            \b7, \b7, \b5
 114        veor            \b1, \b1, \b3
 115        veor            \b2, \b2, \b5
 116        veor            \b3, \b3, \b7
 117        veor            \b6, \b6, \b1
 118        veor            \b2, \b2, \b0
 119        veor            \b5, \b5, \b3
 120        veor            \b4, \b4, \b6
 121        veor            \b0, \b0, \b6
 122        veor            \b1, \b1, \b4
 123        .endm
 124
 125        .macro          inv_out_bs_ch, b6, b5, b0, b3, b7, b1, b4, b2
            /*
             * Last stage of the inv_sbox macro: an XOR-only (linear) mixing of
             * its eight register arguments, performed in place.
             *
             * The formal parameter names are not in numeric order: positional
             * arguments 1..8 are bound to b6, b5, b0, b3, b7, b1, b4, b2, so
             * the bN used in the body is not the N-th argument of the call.
             */
 126        veor            \b1, \b1, \b5
 127        veor            \b2, \b2, \b7
 128        veor            \b3, \b3, \b1
 129        veor            \b4, \b4, \b5
 130        veor            \b7, \b7, \b5
 131        veor            \b3, \b3, \b4
 132        veor            \b5, \b5, \b0
 133        veor            \b3, \b3, \b7
 134        veor            \b6, \b6, \b2
 135        veor            \b2, \b2, \b1
 136        veor            \b6, \b6, \b3
 137        veor            \b3, \b3, \b0
 138        veor            \b5, \b5, \b6
 139        .endm
 140
 141        .macro          mul_gf4, x0, x1, y0, y1, t0, t1
            /*
             * Bitwise two-register product, presumably a multiplication in
             * GF(2^2) going by the name (TODO confirm).  In terms of the input
             * values the code computes:
             *      x0 = ((x0 ^ x1) & y1) ^ (x1 & y0)
             *      x1 = (x1 & y0) ^ ((y0 ^ y1) & x0)
             * y0 and y1 are only read; t0 and t1 are scratch.
             */
 142        veor            \t0, \y0, \y1
 143        vand            \t0, \t0, \x0
 144        veor            \x0, \x0, \x1
 145        vand            \t1, \x1, \y0
 146        vand            \x0, \x0, \y1
 147        veor            \x1, \t1, \t0
 148        veor            \x0, \x0, \t1
 149        .endm
 150
 151        .macro          mul_gf4_n_gf4, x0, x1, y0, y1, t0, x2, x3, y2, y3, t1
            /*
             * Two independent two-register products, interleaved instruction
             * by instruction.  In terms of the input values:
             *
             *   pair (x0, x1) with (y0, y1), scratch t0:
             *      x0 = ((x0 ^ x1) & y1) ^ ((y0 ^ y1) & x0)
             *      x1 = (x1 & y0) ^ ((x0 ^ x1) & y1)
             *
             *   pair (x2, x3) with (y2, y3), scratch t1 -- the same result as
             *   mul_gf4 x2, x3, y2, y3:
             *      x2 = ((x2 ^ x3) & y3) ^ (x3 & y2)
             *      x3 = (x3 & y2) ^ ((y2 ^ y3) & x2)
             *
             * y0-y3 are only read.
             */
 152        veor            \t0, \y0, \y1
 153        veor            \t1, \y2, \y3
 154        vand            \t0, \t0, \x0
 155        vand            \t1, \t1, \x2
 156        veor            \x0, \x0, \x1
 157        veor            \x2, \x2, \x3
 158        vand            \x1, \x1, \y0
 159        vand            \x3, \x3, \y2
 160        vand            \x0, \x0, \y1
 161        vand            \x2, \x2, \y3
 162        veor            \x1, \x1, \x0
 163        veor            \x2, \x2, \x3
 164        veor            \x0, \x0, \t0
 165        veor            \x3, \x3, \t1
 166        .endm
 167
 168        .macro          mul_gf16_2, x0, x1, x2, x3, x4, x5, x6, x7, \
 169                                    y0, y1, y2, y3, t0, t1, t2, t3
            /*
             * Combines the four-register operand y0-y3 with each of two
             * four-register operands, x0-x3 and x4-x7, using mul_gf4 and
             * mul_gf4_n_gf4 as building blocks (presumably two GF(2^4)
             * multiplications by the same value -- TODO confirm).  Results
             * replace x0-x7.
             *
             * y0 and y1 are XORed with y2 and y3 in the first half and XORed
             * again in the second half, so, provided all arguments are
             * distinct registers, y0-y3 hold their input values again on
             * exit.  t0-t3 are scratch.
             */
 170        veor            \t0, \x0, \x2
 171        veor            \t1, \x1, \x3
 172        mul_gf4         \x0, \x1, \y0, \y1, \t2, \t3
 173        veor            \y0, \y0, \y2
 174        veor            \y1, \y1, \y3
 175        mul_gf4_n_gf4   \t0, \t1, \y0, \y1, \t3, \x2, \x3, \y2, \y3, \t2
 176        veor            \x0, \x0, \t0
 177        veor            \x2, \x2, \t0
 178        veor            \x1, \x1, \t1
 179        veor            \x3, \x3, \t1
            /* second operand, x4-x7; y0/y1 still carry the XOR from above */
 180        veor            \t0, \x4, \x6
 181        veor            \t1, \x5, \x7
 182        mul_gf4_n_gf4   \t0, \t1, \y0, \y1, \t3, \x6, \x7, \y2, \y3, \t2
 183        veor            \y0, \y0, \y2
 184        veor            \y1, \y1, \y3
 185        mul_gf4         \x4, \x5, \y0, \y1, \t2, \t3
 186        veor            \x4, \x4, \t0
 187        veor            \x6, \x6, \t0
 188        veor            \x5, \x5, \t1
 189        veor            \x7, \x7, \t1
 190        .endm
 191
 192        .macro          inv_gf256, x0, x1, x2, x3, x4, x5, x6, x7, \
 193                                   t0, t1, t2, t3, s0, s1, s2, s3
            /*
             * Non-linear core shared by the sbox and inv_sbox macros
             * (presumably the inversion in GF(2^8), going by the name --
             * TODO confirm against the referenced paper).
             *
             * x0-x7 are the operand and receive the result.  They are only
             * read until the final mul_gf16_2: everything before that builds
             * four intermediate values in s3, s2, s1 and t1 out of the
             * inputs, and the final mul_gf16_2 takes those as its y0-y3
             * operands and updates x0-x7 in place.
             * t0-t3 and s0-s3 are scratch.
             */
 194        veor            \t3, \x4, \x6
 195        veor            \t0, \x5, \x7
 196        veor            \t1, \x1, \x3
 197        veor            \s1, \x7, \x6
 198        veor            \s0, \x0, \x2
 199        veor            \s3, \t3, \t0
 200        vorr            \t2, \t0, \t1
 201        vand            \s2, \t3, \s0
 202        vorr            \t3, \t3, \s0
 203        veor            \s0, \s0, \t1
 204        vand            \t0, \t0, \t1
 205        veor            \t1, \x3, \x2
 206        vand            \s3, \s3, \s0
 207        vand            \s1, \s1, \t1
 208        veor            \t1, \x4, \x5
 209        veor            \s0, \x1, \x0
 210        veor            \t3, \t3, \s1
 211        veor            \t2, \t2, \s1
 212        vand            \s1, \t1, \s0
 213        vorr            \t1, \t1, \s0
 214        veor            \t3, \t3, \s3
 215        veor            \t0, \t0, \s1
 216        veor            \t2, \t2, \s2
 217        veor            \t1, \t1, \s3
 218        veor            \t0, \t0, \s2
 219        vand            \s0, \x7, \x3
 220        veor            \t1, \t1, \s2
 221        vand            \s1, \x6, \x2
 222        vand            \s2, \x5, \x1
 223        vorr            \s3, \x4, \x0
 224        veor            \t3, \t3, \s0
 225        veor            \t1, \t1, \s2
 226        veor            \s0, \t0, \s3
 227        veor            \t2, \t2, \s1
 228        vand            \s2, \t3, \t1
 229        veor            \s1, \t2, \s2
 230        veor            \s3, \s0, \s2
            /*
             * vbsl d, n, m computes d = (d & n) | (~d & m): the previous
             * contents of the destination act as the per-bit selector.
             */
 231        vbsl            \s1, \t1, \s0
 232        vmvn            \t0, \s0
 233        vbsl            \s0, \s1, \s3
 234        vbsl            \t0, \s1, \s3
 235        vbsl            \s3, \t3, \t2
 236        veor            \t3, \t3, \t2
 237        vand            \s2, \s0, \s3
 238        veor            \t1, \t1, \t0
 239        veor            \s2, \s2, \t3
 240        mul_gf16_2      \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
 241                        \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
 242        .endm
 243
 244        .macro          sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
 245                              t0, t1, t2, t3, s0, s1, s2, s3
            /*
             * Forward S-box on the eight registers b0-b7, in three stages:
             * in_bs_ch, inv_gf256, out_bs_ch.  Each stage is handed the
             * registers in a different order, so the permutations below are
             * part of the computation and must not be "tidied up".
             *
             * The results do not come back in b0..b7 order: aesbs_encrypt8
             * passes them on to the next step as b0, b1, b4, b6, b3, b7, b2,
             * b5.  t0-t3 and s0-s3 are scratch.
             */
 246        in_bs_ch        \b0, \b1, \b2, \b3, \b4, \b5, \b6, \b7
 247        inv_gf256       \b6, \b5, \b0, \b3, \b7, \b1, \b4, \b2, \
 248                        \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
 249        out_bs_ch       \b7, \b1, \b4, \b2, \b6, \b5, \b0, \b3
 250        .endm
 251
 252        .macro          inv_sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
 253                                  t0, t1, t2, t3, s0, s1, s2, s3
            /*
             * Inverse S-box on the eight registers b0-b7, in three stages:
             * inv_in_bs_ch, inv_gf256, inv_out_bs_ch.  Each stage is handed
             * the registers in a different order, so the permutations below
             * are part of the computation.
             *
             * The results do not come back in b0..b7 order: aesbs_decrypt8
             * passes them on to the next step as b0, b1, b6, b4, b2, b7, b3,
             * b5.  t0-t3 and s0-s3 are scratch.
             */
 254        inv_in_bs_ch    \b0, \b1, \b2, \b3, \b4, \b5, \b6, \b7
 255        inv_gf256       \b5, \b1, \b2, \b6, \b3, \b7, \b0, \b4, \
 256                        \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
 257        inv_out_bs_ch   \b3, \b7, \b0, \b4, \b5, \b1, \b2, \b6
 258        .endm
 259
 260        .macro          shift_rows, x0, x1, x2, x3, x4, x5, x6, x7, \
 261                                    t0, t1, t2, t3, mask
            /*
             * Round-key addition fused with a byte permutation.  For each of
             * x0-x7: load the next 16 bytes of key material from bskey, XOR
             * them with the register, and write back the result permuted
             * through __tbl with mask as the index vector.
             *
             * bskey is read with 32-byte alignment (:256) and post-incremented
             * by 128 bytes in total.  t0-t3 are scratch; the loads for the
             * second half are interleaved with the lookups of the first.
             */
 262        vld1.8          {\t0-\t1}, [bskey, :256]!
 263        veor            \t0, \t0, \x0
 264        vld1.8          {\t2-\t3}, [bskey, :256]!
 265        veor            \t1, \t1, \x1
 266        __tbl           \x0, \t0, \mask
 267        veor            \t2, \t2, \x2
 268        __tbl           \x1, \t1, \mask
 269        vld1.8          {\t0-\t1}, [bskey, :256]!
 270        veor            \t3, \t3, \x3
 271        __tbl           \x2, \t2, \mask
 272        __tbl           \x3, \t3, \mask
 273        vld1.8          {\t2-\t3}, [bskey, :256]!
 274        veor            \t0, \t0, \x4
 275        veor            \t1, \t1, \x5
 276        __tbl           \x4, \t0, \mask
 277        veor            \t2, \t2, \x6
 278        __tbl           \x5, \t1, \mask
 279        veor            \t3, \t3, \x7
 280        __tbl           \x6, \t2, \mask
 281        __tbl           \x7, \t3, \mask
 282        .endm
 283
 284        .macro          inv_shift_rows, x0, x1, x2, x3, x4, x5, x6, x7, \
 285                                        t0, t1, t2, t3, mask
            /*
             * Permute the bytes of each of x0-x7 in place through __tbl with
             * mask as the index vector.  Source and destination are the same
             * register, so __tbl needs a temporary each time; t0-t3 are
             * reused for that.  Unlike shift_rows, no key material is loaded
             * here and bskey is not touched.
             */
 286        __tbl           \x0, \x0, \mask, \t0
 287        __tbl           \x1, \x1, \mask, \t1
 288        __tbl           \x2, \x2, \mask, \t2
 289        __tbl           \x3, \x3, \mask, \t3
 290        __tbl           \x4, \x4, \mask, \t0
 291        __tbl           \x5, \x5, \mask, \t1
 292        __tbl           \x6, \x6, \mask, \t2
 293        __tbl           \x7, \x7, \mask, \t3
 294        .endm
 295
 296        .macro          mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
 297                                  t0, t1, t2, t3, t4, t5, t6, t7, inv
            /*
             * MixColumns step (going by the macro name) on the eight
             * registers x0-x7, in place, with t0-t7 as scratch.  It is built
             * only from XORs and whole-register byte rotations: vext.8 with
             * both source operands equal rotates the 16 bytes, here by 12 and
             * by 8 positions.
             *
             * The optional last argument selects how the final values are
             * assigned to x2-x6: leave it blank for the plain arrangement
             * used by aesbs_encrypt8; inv_mix_cols passes 1 to select the
             * alternative arrangement in the .else branch.
             */
 298        vext.8          \t0, \x0, \x0, #12
 299        vext.8          \t1, \x1, \x1, #12
 300        veor            \x0, \x0, \t0
 301        vext.8          \t2, \x2, \x2, #12
 302        veor            \x1, \x1, \t1
 303        vext.8          \t3, \x3, \x3, #12
 304        veor            \x2, \x2, \t2
 305        vext.8          \t4, \x4, \x4, #12
 306        veor            \x3, \x3, \t3
 307        vext.8          \t5, \x5, \x5, #12
 308        veor            \x4, \x4, \t4
 309        vext.8          \t6, \x6, \x6, #12
 310        veor            \x5, \x5, \t5
 311        vext.8          \t7, \x7, \x7, #12
 312        veor            \x6, \x6, \t6
            /* from here on xN holds xN ^ rot12(xN) and tN holds rot12(xN) */
 313        veor            \t1, \t1, \x0
 314        veor.8          \x7, \x7, \t7
 315        vext.8          \x0, \x0, \x0, #8
 316        veor            \t2, \t2, \x1
 317        veor            \t0, \t0, \x7
 318        veor            \t1, \t1, \x7
 319        vext.8          \x1, \x1, \x1, #8
 320        veor            \t5, \t5, \x4
 321        veor            \x0, \x0, \t0
 322        veor            \t6, \t6, \x5
 323        veor            \x1, \x1, \t1
 324        vext.8          \t0, \x4, \x4, #8
 325        veor            \t4, \t4, \x3
 326        vext.8          \t1, \x5, \x5, #8
 327        veor            \t7, \t7, \x6
 328        vext.8          \x4, \x3, \x3, #8
 329        veor            \t3, \t3, \x2
 330        vext.8          \x5, \x7, \x7, #8
 331        veor            \t4, \t4, \x7
 332        vext.8          \x3, \x6, \x6, #8
 333        veor            \t3, \t3, \x7
 334        vext.8          \x6, \x2, \x2, #8
 335        veor            \x7, \t1, \t5
 336        .ifb            \inv
 337        veor            \x2, \t0, \t4
 338        veor            \x4, \x4, \t3
 339        veor            \x5, \x5, \t7
 340        veor            \x3, \x3, \t6
 341        veor            \x6, \x6, \t2
 342        .else
 343        veor            \t3, \t3, \x4
 344        veor            \x5, \x5, \t7
 345        veor            \x2, \x3, \t6
 346        veor            \x3, \t0, \t4
 347        veor            \x4, \x6, \t2
 348        vmov            \x6, \t3
 349        .endif
 350        .endm
 351
 352        .macro          inv_mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
 353                                      t0, t1, t2, t3, t4, t5, t6, t7
            /*
             * Used by aesbs_decrypt8 in its round loop.  Three parts:
             *
             *  1. XOR the 128 bytes of key material at bskey into x0-x7.
             *     bskey is read with 32-byte alignment (:256).  The first
             *     three loads post-increment it (+96 in total) and the sub
             *     then takes off 224, so bskey ends up 128 bytes below its
             *     value on entry: the key material is walked backwards.
             *  2. Pre-mix x0-x7 by XORing in values of the form
             *     x ^ rot8(x), which are built in t0-t7 (vext.8 #8 with both
             *     sources equal rotates the register by 8 bytes).
             *  3. Finish with mix_cols in its alternative arrangement (last
             *     argument 1).
             *
             * t0-t7 are scratch.
             */
 354        vld1.8          {\t0-\t1}, [bskey, :256]!
 355        veor            \x0, \x0, \t0
 356        vld1.8          {\t2-\t3}, [bskey, :256]!
 357        veor            \x1, \x1, \t1
 358        vld1.8          {\t4-\t5}, [bskey, :256]!
 359        veor            \x2, \x2, \t2
 360        vld1.8          {\t6-\t7}, [bskey, :256]
 361        sub             bskey, bskey, #224
 362        veor            \x3, \x3, \t3
 363        veor            \x4, \x4, \t4
 364        veor            \x5, \x5, \t5
 365        veor            \x6, \x6, \t6
 366        veor            \x7, \x7, \t7
 367        vext.8          \t0, \x0, \x0, #8
 368        vext.8          \t6, \x6, \x6, #8
 369        vext.8          \t7, \x7, \x7, #8
 370        veor            \t0, \t0, \x0
 371        vext.8          \t1, \x1, \x1, #8
 372        veor            \t6, \t6, \x6
 373        vext.8          \t2, \x2, \x2, #8
 374        veor            \t7, \t7, \x7
 375        vext.8          \t3, \x3, \x3, #8
 376        veor            \t1, \t1, \x1
 377        vext.8          \t4, \x4, \x4, #8
 378        veor            \t2, \t2, \x2
 379        vext.8          \t5, \x5, \x5, #8
 380        veor            \t3, \t3, \x3
 381        veor            \t4, \t4, \x4
 382        veor            \t5, \t5, \x5
 383        veor            \x0, \x0, \t6
 384        veor            \x1, \x1, \t6
 385        veor            \x2, \x2, \t0
 386        veor            \x4, \x4, \t2
 387        veor            \x3, \x3, \t1
 388        veor            \x1, \x1, \t7
 389        veor            \x2, \x2, \t7
 390        veor            \x4, \x4, \t6
 391        veor            \x5, \x5, \t3
 392        veor            \x3, \x3, \t6
 393        veor            \x6, \x6, \t4
 394        veor            \x4, \x4, \t7
 395        veor            \x5, \x5, \t7
 396        veor            \x7, \x7, \t5
 397        mix_cols        \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
 398                        \t0, \t1, \t2, \t3, \t4, \t5, \t6, \t7, 1
 399        .endm
 400
 401        .macro          swapmove_2x, a0, b0, a1, b1, n, mask, t0, t1
            /*
             * Two independent swapmove steps, interleaved: one on the pair
             * (a0, b0) and one on (a1, b1).  For each pair, working on 64-bit
             * lanes:
             *      t  = ((b >> n) ^ a) & mask
             *      a ^= t
             *      b ^= t << n
             * This exchanges the bits of a selected by mask with the bits of
             * b that sit n positions higher.  n is an immediate shift count,
             * mask is a register; t0 and t1 are scratch.
             */
 402        vshr.u64        \t0, \b0, #\n
 403        vshr.u64        \t1, \b1, #\n
 404        veor            \t0, \t0, \a0
 405        veor            \t1, \t1, \a1
 406        vand            \t0, \t0, \mask
 407        vand            \t1, \t1, \mask
 408        veor            \a0, \a0, \t0
 409        vshl.s64        \t0, \t0, #\n
 410        veor            \a1, \a1, \t1
 411        vshl.s64        \t1, \t1, #\n
 412        veor            \b0, \b0, \t0
 413        veor            \b1, \b1, \t1
 414        .endm
 415
 416        .macro          bitslice, x7, x6, x5, x4, x3, x2, x1, x0, t0, t1, t2, t3
            /*
             * Three rounds of swapmove_2x over the eight registers, with
             * (shift, mask) = (1, 0x55), (2, 0x33) and (4, 0x0f).  Note that
             * the formal parameters are declared in the order x7 ... x0.
             *
             * aesbs_encrypt8 and aesbs_decrypt8 run this once before the
             * first round and once after the last one, i.e. the same macro
             * serves for the conversion in both directions.
             *
             * t0-t3 are scratch: t0 holds 0x55 and later 0x0f, t1 holds 0x33,
             * and t2/t3 are the swapmove temporaries.
             */
 417        vmov.i8         \t0, #0x55
 418        vmov.i8         \t1, #0x33
 419        swapmove_2x     \x0, \x1, \x2, \x3, 1, \t0, \t2, \t3
 420        swapmove_2x     \x4, \x5, \x6, \x7, 1, \t0, \t2, \t3
 421        vmov.i8         \t0, #0x0f
 422        swapmove_2x     \x0, \x2, \x1, \x3, 2, \t1, \t2, \t3
 423        swapmove_2x     \x4, \x6, \x5, \x7, 2, \t1, \t2, \t3
 424        swapmove_2x     \x0, \x4, \x1, \x5, 4, \t0, \t2, \t3
 425        swapmove_2x     \x2, \x6, \x3, \x7, 4, \t0, \t2, \t3
 426        .endm
 427
 428        .align          4
 429M0:     .quad           0x02060a0e03070b0f, 0x0004080c0105090d
            /* M0: vtbl index vector applied to every inner round key below */
 430
 431        /*
 432         * void aesbs_convert_key(u8 out[], u32 const rk[], int rounds)
 433         */
            /*
             * Register use, following the prototype above: r0 = out, r1 = rk,
             * r2 = rounds.
             *
             * Output written through r0:
             *   - 16 bytes: round 0 key, copied unchanged
             *   - (rounds - 1) x 128 bytes: one expanded key per inner round.
             *     The key is byte-permuted through M0, then each bit position
             *     0..7 is expanded by vtst.8 into its own 16-byte register
             *     (0xff where the bit is set, 0x00 where it is clear).  The
             *     registers for bits 0, 1, 5 and 6 are then inverted, which
             *     is the same as XORing the key bytes with 0x63.
             *   - 16 bytes: last round key with 0x63 XORed into every byte
             *
             * The :128 / :256 qualifiers on the stores are alignment
             * requirements on r0 at that point.
             * All of q0-q15 are written; nothing is saved or restored here.
             */
 434ENTRY(aesbs_convert_key)
 435        vld1.32         {q7}, [r1]!             // load round 0 key
 436        vld1.32         {q15}, [r1]!            // load round 1 key
 437
 438        vmov.i8         q8,  #0x01              // bit masks
 439        vmov.i8         q9,  #0x02
 440        vmov.i8         q10, #0x04
 441        vmov.i8         q11, #0x08
 442        vmov.i8         q12, #0x10
 443        vmov.i8         q13, #0x20
 444        __ldr           q14, M0
 445
 446        sub             r2, r2, #1              // loop runs rounds - 1 times
 447        vst1.8          {q7}, [r0, :128]!       // save round 0 key
 448
 449.Lkey_loop:
 450        __tbl           q7, q15, q14
            /*
             * The masks for bits 6 and 7 have no register of their own: they
             * are rebuilt in q6 and q15 on every pass, and both registers are
             * overwritten again before the pass ends.
             */
 451        vmov.i8         q6, #0x40
 452        vmov.i8         q15, #0x80
 453
 454        vtst.8          q0, q7, q8
 455        vtst.8          q1, q7, q9
 456        vtst.8          q2, q7, q10
 457        vtst.8          q3, q7, q11
 458        vtst.8          q4, q7, q12
 459        vtst.8          q5, q7, q13
 460        vtst.8          q6, q7, q6
 461        vtst.8          q7, q7, q15
 462        vld1.32         {q15}, [r1]!            // load next round key
 463        vmvn            q0, q0                  // invert bits 0, 1, 5, 6 (0x63)
 464        vmvn            q1, q1
 465        vmvn            q5, q5
 466        vmvn            q6, q6
 467
 468        subs            r2, r2, #1
 469        vst1.8          {q0-q1}, [r0, :256]!
 470        vst1.8          {q2-q3}, [r0, :256]!
 471        vst1.8          {q4-q5}, [r0, :256]!
 472        vst1.8          {q6-q7}, [r0, :256]!
 473        bne             .Lkey_loop
 474
            /* q15 now holds the last round key, loaded in the final pass */
 475        vmov.i8         q7, #0x63               // compose .L63
 476        veor            q15, q15, q7
 477        vst1.8          {q15}, [r0, :128]
 478        bx              lr
 479ENDPROC(aesbs_convert_key)
 480
 481        .align          4
 482M0SR:   .quad           0x0a0e02060f03070b, 0x0004080c05090d01
 483
            /*
             * Internal helper (plain label, no ENTRY): encrypts eight blocks
             * in parallel.
             *
             * In:   q0-q7  = eight 16-byte input blocks
             *       bskey  = key material in the layout written by
             *                aesbs_convert_key (16-byte aligned, and 32-byte
             *                aligned once the first 16 bytes are consumed)
             *       rounds = number of AES rounds
             * Out:  the results for inputs q0..q7 are left in
             *       q0, q1, q4, q6, q3, q7, q2, q5 respectively
             * Clobbers: q8-q15, bskey, rounds, condition flags
             */
 484aesbs_encrypt8:
 485        vld1.8          {q9}, [bskey, :128]!    // round 0 key
 486        __ldr           q8, M0SR
 487
 488        veor            q10, q0, q9             // xor with round0 key
 489        veor            q11, q1, q9
 490        __tbl           q0, q10, q8
 491        veor            q12, q2, q9
 492        __tbl           q1, q11, q8
 493        veor            q13, q3, q9
 494        __tbl           q2, q12, q8
 495        veor            q14, q4, q9
 496        __tbl           q3, q13, q8
 497        veor            q15, q5, q9
 498        __tbl           q4, q14, q8
 499        veor            q10, q6, q9
 500        __tbl           q5, q15, q8
 501        veor            q11, q7, q9
 502        __tbl           q6, q10, q8
 503        __tbl           q7, q11, q8
 504
 505        bitslice        q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11
 506
 507        sub             rounds, rounds, #1
 508        b               .Lenc_sbox              // first pass starts at the S-box
 509
            /*
             * Index vectors for shift_rows: SR is used for every pass through
             * .Lenc_loop except the final one, which uses SRM0 instead.
             */
 510        .align          5
 511SR:     .quad           0x0504070600030201, 0x0f0e0d0c0a09080b
 512SRM0:   .quad           0x0304090e00050a0f, 0x01060b0c0207080d
 513
 514.Lenc_last:
 515        __ldr           q12, SRM0
 516.Lenc_loop:
 517        shift_rows      q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12
 518.Lenc_sbox:
 519        sbox            q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12, \
 520                                                                q13, q14, q15
 521        subs            rounds, rounds, #1
 522        bcc             .Lenc_done              // borrow: that was the final S-box
 523
 524        mix_cols        q0, q1, q4, q6, q3, q7, q2, q5, q8, q9, q10, q11, q12, \
 525                                                                q13, q14, q15
 526
            /*
             * The flags tested here are still those of the subs above: the
             * NEON instructions in mix_cols do not write the condition flags.
             */
 527        beq             .Lenc_last
 528        __ldr           q12, SR
 529        b               .Lenc_loop
 530
 531.Lenc_done:
 532        vld1.8          {q12}, [bskey, :128]    // last round key
 533
 534        bitslice        q0, q1, q4, q6, q3, q7, q2, q5, q8, q9, q10, q11
 535
 536        veor            q0, q0, q12
 537        veor            q1, q1, q12
 538        veor            q4, q4, q12
 539        veor            q6, q6, q12
 540        veor            q3, q3, q12
 541        veor            q7, q7, q12
 542        veor            q2, q2, q12
 543        veor            q5, q5, q12
 544        bx              lr
 545ENDPROC(aesbs_encrypt8)
 546
 547        .align          4
 548M0ISR:  .quad           0x0a0e0206070b0f03, 0x0004080c0d010509
 549
            /*
             * Internal helper (plain label, no ENTRY): decrypts eight blocks
             * in parallel.
             *
             * In:   q0-q7  = eight 16-byte input blocks
             *       bskey  = start of the key material written by
             *                aesbs_convert_key
             *       rounds = number of AES rounds
             * Out:  the results for inputs q0..q7 are left in
             *       q0, q1, q6, q4, q2, q7, q3, q5 respectively
             * Clobbers: q8-q15, bskey, rounds, condition flags
             *
             * The key material is consumed back to front.  With the layout
             * 16 + (rounds - 1) * 128 + 16 bytes, the final 16-byte key sits
             * at offset rounds * 128 - 112, which is where bskey is moved
             * first.  Every inv_mix_cols then steps bskey down by 128, and
             * the add of 112 at .Ldec_done brings it back to offset 0.
             */
 550aesbs_decrypt8:
 551        add             bskey, bskey, rounds, lsl #7
 552        sub             bskey, bskey, #112
 553        vld1.8          {q9}, [bskey, :128]     // round 0 key
 554        sub             bskey, bskey, #128      // step to the last 128-byte inner key
 555        __ldr           q8, M0ISR
 556
 557        veor            q10, q0, q9             // xor with round0 key
 558        veor            q11, q1, q9
 559        __tbl           q0, q10, q8
 560        veor            q12, q2, q9
 561        __tbl           q1, q11, q8
 562        veor            q13, q3, q9
 563        __tbl           q2, q12, q8
 564        veor            q14, q4, q9
 565        __tbl           q3, q13, q8
 566        veor            q15, q5, q9
 567        __tbl           q4, q14, q8
 568        veor            q10, q6, q9
 569        __tbl           q5, q15, q8
 570        veor            q11, q7, q9
 571        __tbl           q6, q10, q8
 572        __tbl           q7, q11, q8
 573
 574        bitslice        q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11
 575
 576        sub             rounds, rounds, #1
 577        b               .Ldec_sbox              // first pass starts at the S-box
 578
            /*
             * Index vectors for inv_shift_rows: ISR is used for every pass
             * through .Ldec_loop except the final one, which uses ISRM0.
             */
 579        .align          5
 580ISR:    .quad           0x0504070602010003, 0x0f0e0d0c080b0a09
 581ISRM0:  .quad           0x01040b0e0205080f, 0x0306090c00070a0d
 582
 583.Ldec_last:
 584        __ldr           q12, ISRM0
 585.Ldec_loop:
 586        inv_shift_rows  q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12
 587.Ldec_sbox:
 588        inv_sbox        q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12, \
 589                                                                q13, q14, q15
 590        subs            rounds, rounds, #1
 591        bcc             .Ldec_done              // borrow: that was the final S-box
 592
 593        inv_mix_cols    q0, q1, q6, q4, q2, q7, q3, q5, q8, q9, q10, q11, q12, \
 594                                                                q13, q14, q15
 595
            /*
             * The flags tested here are still those of the subs above:
             * inv_mix_cols adjusts bskey with a plain sub and otherwise uses
             * NEON instructions, none of which write the condition flags.
             */
 596        beq             .Ldec_last
 597        __ldr           q12, ISR
 598        b               .Ldec_loop
 599
 600.Ldec_done:
 601        add             bskey, bskey, #112
 602        vld1.8          {q12}, [bskey, :128]    // last round key
 603
 604        bitslice        q0, q1, q6, q4, q2, q7, q3, q5, q8, q9, q10, q11
 605
 606        veor            q0, q0, q12
 607        veor            q1, q1, q12
 608        veor            q6, q6, q12
 609        veor            q4, q4, q12
 610        veor            q2, q2, q12
 611        veor            q7, q7, q12
 612        veor            q3, q3, q12
 613        veor            q5, q5, q12
 614        bx              lr
 615ENDPROC(aesbs_decrypt8)
 616
 617        /*
 618         * aesbs_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
 619         *                   int blocks)
 620         * aesbs_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
 621         *                   int blocks)
 622         */
 623        .macro          __ecb_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
            /*
             * Function body shared by aesbs_ecb_encrypt and aesbs_ecb_decrypt.
             *   do8     - 8-block primitive to call
             *   o0..o7  - q registers in which do8 leaves the results for the
             *             blocks it was given in q0..q7
             *
             * Register use: r0 = out, r1 = in, r2 = rk, r3 = rounds,
             * r5 = blocks remaining (fifth argument, read from the stack).
             * bskey (r4) and rounds (ip) are set up again before every call
             * because the 8-block primitives modify both.  r6 is saved and
             * restored but not otherwise used here (presumably to keep the
             * stack 8-byte aligned -- TODO confirm).
             *
             * Each pass handles up to eight blocks.  When fewer than eight
             * are left, with n = blocks & 7, the computed gotos enter the
             * load and the store sequence 4 * n bytes before their end, so
             * only the last n one-instruction slots of each sequence run.
             */
 624        push            {r4-r6, lr}
 625        ldr             r5, [sp, #16]           // number of blocks
 626
 62799:     adr             ip, 0f
 628        and             lr, r5, #7
 629        cmp             r5, #8
 630        sub             ip, ip, lr, lsl #2
 631        movlt           pc, ip                  // computed goto if blocks < 8
 632
 633        vld1.8          {q0}, [r1]!
 634        vld1.8          {q1}, [r1]!
 635        vld1.8          {q2}, [r1]!
 636        vld1.8          {q3}, [r1]!
 637        vld1.8          {q4}, [r1]!
 638        vld1.8          {q5}, [r1]!
 639        vld1.8          {q6}, [r1]!
 640        vld1.8          {q7}, [r1]!
 641
 6420:      mov             bskey, r2
 643        mov             rounds, r3
 644        bl              \do8
 645
            /* lr and the flags were lost across the call: recompute them */
 646        adr             ip, 1f
 647        and             lr, r5, #7
 648        cmp             r5, #8
 649        sub             ip, ip, lr, lsl #2
 650        movlt           pc, ip                  // computed goto if blocks < 8
 651
 652        vst1.8          {\o0}, [r0]!
 653        vst1.8          {\o1}, [r0]!
 654        vst1.8          {\o2}, [r0]!
 655        vst1.8          {\o3}, [r0]!
 656        vst1.8          {\o4}, [r0]!
 657        vst1.8          {\o5}, [r0]!
 658        vst1.8          {\o6}, [r0]!
 659        vst1.8          {\o7}, [r0]!
 660
 6611:      subs            r5, r5, #8
 662        bgt             99b                     // more blocks left: next pass
 663
 664        pop             {r4-r6, pc}
 665        .endm
 666
 667        .align          4
 668ENTRY(aesbs_ecb_encrypt)
            /*
             * ECB encryption: __ecb_crypt around aesbs_encrypt8.  The
             * register list is the order in which aesbs_encrypt8 leaves its
             * eight results, so that they are stored in block order.
             */
 669        __ecb_crypt     aesbs_encrypt8, q0, q1, q4, q6, q3, q7, q2, q5
 670ENDPROC(aesbs_ecb_encrypt)
 671
 672        .align          4
 673ENTRY(aesbs_ecb_decrypt)
            /*
             * ECB decryption: __ecb_crypt around aesbs_decrypt8.  The
             * register list is the order in which aesbs_decrypt8 leaves its
             * eight results, so that they are stored in block order.
             */
 674        __ecb_crypt     aesbs_decrypt8, q0, q1, q6, q4, q2, q7, q3, q5
 675ENDPROC(aesbs_ecb_decrypt)
 676
 677        /*
 678         * aesbs_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[],
 679         *                   int rounds, int blocks, u8 iv[])
 680         */
 681        .align          4
 682ENTRY(aesbs_cbc_decrypt)
            /*
             * Register use: r0 = out, r1 = in, r2 = rk, r3 = rounds,
             * r5 = blocks remaining and r6 = iv (both read from the stack
             * arguments through ip).
             *
             * Per pass (up to eight blocks):
             *  - the ciphertext is loaded into q0-q7 through lr, a copy of
             *    r1, so that r1 itself still points at the start of the pass;
             *  - after aesbs_decrypt8, q8 is loaded with the IV and q9-q15
             *    are first filled with copies of it, then overwritten from r1
             *    with the ciphertext blocks that precede blocks 1..7;
             *  - each decrypted block is XORed with its chaining value and
             *    stored; the last ciphertext block of the pass is loaded into
             *    q8 and written back to iv[] for the next pass or call.
             *
             * For a short pass (n = blocks & 7, fewer than eight left) the
             * computed gotos enter each sequence n slots before its end.  A
             * slot is one 4-byte instruction in the two load sequences and
             * 8 bytes (two instructions) in the XOR/store sequence.
             */
 683        mov             ip, sp
 684        push            {r4-r6, lr}
 685        ldm             ip, {r5-r6}             // load args 4-5
 686
 68799:     adr             ip, 0f
 688        and             lr, r5, #7
 689        cmp             r5, #8
 690        sub             ip, ip, lr, lsl #2
 691        mov             lr, r1
 692        movlt           pc, ip                  // computed goto if blocks < 8
 693
 694        vld1.8          {q0}, [lr]!
 695        vld1.8          {q1}, [lr]!
 696        vld1.8          {q2}, [lr]!
 697        vld1.8          {q3}, [lr]!
 698        vld1.8          {q4}, [lr]!
 699        vld1.8          {q5}, [lr]!
 700        vld1.8          {q6}, [lr]!
 701        vld1.8          {q7}, [lr]
 702
 7030:      mov             bskey, r2
 704        mov             rounds, r3
 705        bl              aesbs_decrypt8
 706
            /*
             * Pre-fill every chaining register with the IV: in a short pass
             * the first block that is actually processed must see the IV,
             * whichever register that turns out to be.
             */
 707        vld1.8          {q8}, [r6]
 708        vmov            q9, q8
 709        vmov            q10, q8
 710        vmov            q11, q8
 711        vmov            q12, q8
 712        vmov            q13, q8
 713        vmov            q14, q8
 714        vmov            q15, q8
 715
 716        adr             ip, 1f
 717        and             lr, r5, #7
 718        cmp             r5, #8
 719        sub             ip, ip, lr, lsl #2
 720        movlt           pc, ip                  // computed goto if blocks < 8
 721
 722        vld1.8          {q9}, [r1]!
 723        vld1.8          {q10}, [r1]!
 724        vld1.8          {q11}, [r1]!
 725        vld1.8          {q12}, [r1]!
 726        vld1.8          {q13}, [r1]!
 727        vld1.8          {q14}, [r1]!
 728        vld1.8          {q15}, [r1]!
            /*
             * Filler slot: only n - 1 chaining blocks have to be loaded, but
             * the goto above steps back n slots.  W() is defined outside this
             * file; presumably it forces a 4-byte encoding so that the slot
             * arithmetic also holds in a Thumb-2 build (TODO confirm).
             */
 729        W(nop)
 730
            /* lr and the flags of the cmp above are still valid here */
 7311:      adr             ip, 2f
 732        sub             ip, ip, lr, lsl #3
 733        movlt           pc, ip                  // computed goto if blocks < 8
 734
 735        veor            q0, q0, q8
 736        vst1.8          {q0}, [r0]!
 737        veor            q1, q1, q9
 738        vst1.8          {q1}, [r0]!
 739        veor            q6, q6, q10
 740        vst1.8          {q6}, [r0]!
 741        veor            q4, q4, q11
 742        vst1.8          {q4}, [r0]!
 743        veor            q2, q2, q12
 744        vst1.8          {q2}, [r0]!
 745        veor            q7, q7, q13
 746        vst1.8          {q7}, [r0]!
 747        veor            q3, q3, q14
 748        vst1.8          {q3}, [r0]!
 749        veor            q5, q5, q15
 750        vld1.8          {q8}, [r1]!             // load next round's iv
 7512:      vst1.8          {q5}, [r0]!
 752
 753        subs            r5, r5, #8
 754        vst1.8          {q8}, [r6]              // store next round's iv
 755        bgt             99b
 756
 757        pop             {r4-r6, pc}
 758ENDPROC(aesbs_cbc_decrypt)
 759
 760        .macro          next_ctr, q
            /*
             * Write the current 128-bit counter into q and advance the
             * counter by one.  The counter lives in core registers r7 (most
             * significant word) to r10 (least significant word): r10 is
             * incremented and the carry is propagated through r9, r8 and r7.
             *
             * The vmov.32 lane inserts are interleaved with the add chain;
             * each one copies its register before that register is updated.
             * The final vrev32.8 reverses the bytes within each 32-bit word
             * of q.  Clobbers the condition flags.
             */
 761        vmov.32         \q\()h[1], r10
 762        adds            r10, r10, #1
 763        vmov.32         \q\()h[0], r9
 764        adcs            r9, r9, #0
 765        vmov.32         \q\()l[1], r8
 766        adcs            r8, r8, #0
 767        vmov.32         \q\()l[0], r7
 768        adc             r7, r7, #0
 769        vrev32.8        \q, \q
 770        .endm
 771
 772        /*
 773         * aesbs_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[],
 774         *                   int rounds, int blocks, u8 ctr[], u8 final[])
 775         */
 776ENTRY(aesbs_ctr_encrypt)
            /*
             * Register use, as established by the code below:
             *   r0      - output pointer, post-incremented by every store
             *   r1      - input pointer, post-incremented by every load
             *   r2, r3  - copied into bskey/rounds before each call to
             *             aesbs_encrypt8
             *   r4      - bskey during the call; afterwards 0, or the 'final'
             *             pointer reloaded from the stack in the last round
             *   r5      - blocks still to do (plus one if 'final' != 0)
             *   r6      - pointer to the counter block
             *   r7-r10  - byte-swapped scalar copy of the next counter value,
             *             r7 most significant word, r10 least significant
             *   q0      - counter block for the first slot of each round
             *
             * Each pass of the loop at label 99 handles up to eight blocks.
             * When fewer than eight remain, three computed gotos enter the
             * straight-line instruction tables part-way, so only the trailing
             * table entries run.
             */
 777        mov             ip, sp                  // ip -> stack args, taken before the push
 778        push            {r4-r10, lr}
 779
 780        ldm             ip, {r5-r7}             // load args 4-6
 781        teq             r7, #0
 782        addne           r5, r5, #1              // one extra block if final != 0
 783
 784        vld1.8          {q0}, [r6]              // load counter
 785        vrev32.8        q1, q0                  // byte-swap each 32-bit word
 786        vmov            r9, r10, d3             // low-order half of the counter
 787        vmov            r7, r8, d2              // high-order half (r7 no longer holds 'final')
 788
            // r7-r10 = counter + 1; q0 keeps the value for the first slot
 789        adds            r10, r10, #1
 790        adcs            r9, r9, #0
 791        adcs            r8, r8, #0
 792        adc             r7, r7, #0
 793
            // Start of a round: seed q1-q7 with q0, then let next_ctr
            // overwrite the slots that need their own counter value.
 79499:     vmov            q1, q0
 795        vmov            q2, q0
 796        vmov            q3, q0
 797        vmov            q4, q0
 798        vmov            q5, q0
 799        vmov            q6, q0
 800        vmov            q7, q0
 801
            // ip = 0f - ((blocks - 1) & 7) * 36, i.e. back up over that many
            // next_ctr expansions (nine instructions each; the 36-byte stride
            // assumes every instruction encodes to 4 bytes).  With fewer than
            // 8 blocks only those trailing expansions run, so the slot just
            // before them keeps its copy of q0 and acts as the first block.
 802        adr             ip, 0f
 803        sub             lr, r5, #1
 804        and             lr, lr, #7
 805        cmp             r5, #8
 806        sub             ip, ip, lr, lsl #5
 807        sub             ip, ip, lr, lsl #2
 808        movlt           pc, ip                  // computed goto if blocks < 8
 809
 810        next_ctr        q1
 811        next_ctr        q2
 812        next_ctr        q3
 813        next_ctr        q4
 814        next_ctr        q5
 815        next_ctr        q6
 816        next_ctr        q7
 817
 8180:      mov             bskey, r2
 819        mov             rounds, r3
 820        bl              aesbs_encrypt8
 821
            // r4 (bskey) is free again.  In the last round (blocks <= 8) it is
            // loaded with 'final': eight registers were pushed (32 bytes), so
            // stack args 4, 5 and 6 sit at sp + 32, + 36 and + 40.
            // ip = 1f - (blocks & 7) * 4 selects how many of the instructions
            // before label 1 (the teq included) are executed.
 822        adr             ip, 1f
 823        and             lr, r5, #7
 824        cmp             r5, #8
 825        movgt           r4, #0
 826        ldrle           r4, [sp, #40]           // load final in the last round
 827        sub             ip, ip, lr, lsl #2
 828        movlt           pc, ip                  // computed goto if blocks < 8
 829
 830        vld1.8          {q8}, [r1]!
 831        vld1.8          {q9}, [r1]!
 832        vld1.8          {q10}, [r1]!
 833        vld1.8          {q11}, [r1]!
 834        vld1.8          {q12}, [r1]!
 835        vld1.8          {q13}, [r1]!
 836        vld1.8          {q14}, [r1]!
 837        teq             r4, #0                  // skip last block if 'final'
 8381:      bne             2f
 839        vld1.8          {q15}, [r1]!
 840
            // ip = 3f - (blocks & 7) * 8: every table entry before label 3 is
            // two instructions (veor + vst1, or teq + bne for the last one).
            // The XOR order q0, q1, q4, q6, q3, q7, q2, q5 pairs the cipher
            // output registers with consecutive input blocks q8-q15;
            // presumably this is the order in which aesbs_encrypt8 returns
            // its results (that routine is defined elsewhere in this file).
 8412:      adr             ip, 3f
 842        cmp             r5, #8
 843        sub             ip, ip, lr, lsl #3
 844        movlt           pc, ip                  // computed goto if blocks < 8
 845
 846        veor            q0, q0, q8
 847        vst1.8          {q0}, [r0]!
 848        veor            q1, q1, q9
 849        vst1.8          {q1}, [r0]!
 850        veor            q4, q4, q10
 851        vst1.8          {q4}, [r0]!
 852        veor            q6, q6, q11
 853        vst1.8          {q6}, [r0]!
 854        veor            q3, q3, q12
 855        vst1.8          {q3}, [r0]!
 856        veor            q7, q7, q13
 857        vst1.8          {q7}, [r0]!
 858        veor            q2, q2, q14
 859        vst1.8          {q2}, [r0]!
 860        teq             r4, #0                  // skip last block if 'final'
 861        W(bne)          5f
 8623:      veor            q5, q5, q15
 863        vst1.8          {q5}, [r0]!
 864
            // q0 = counter for the next round (or the value written back
            // below); r7-r10 advance once more.
 8654:      next_ctr        q0
 866
 867        subs            r5, r5, #8
 868        bgt             99b
 869
 870        vst1.8          {q0}, [r6]              // write the updated counter back
 871        pop             {r4-r10, pc}
 872
            // 'final' != 0: the last cipher output block is not XORed with
            // input here but stored as-is to the buffer r4 points to.
 8735:      vst1.8          {q5}, [r4]
 874        b               4b
 875ENDPROC(aesbs_ctr_encrypt)
 876
 877        .macro          next_tweak, out, in, const, tmp
            /*
             * Derive the next tweak: \out = \in doubled as a 128-bit value,
             * with the bit shifted out of each 64-bit half folded back in.
             *
             *   \tmp  = all-ones in every 64-bit lane of \in whose top bit is
             *           set (arithmetic shift by 63), masked with \const
             *   \out  = \in + \in per 64-bit lane (left shift by one)
             *   \tmp's two halves are swapped, then XORed into \out
             *
             * With the \const that __xts_prepare8 builds (1 in the low half,
             * 0x87 in the high half) the top bit of the low half carries
             * into the bottom of the high half, and the top bit of the high
             * half is folded into the low half as 0x87.
             *
             * \tmp is clobbered; \in is left unchanged when \out is a
             * different register.  Expands to five instructions, which the
             * 32-byte stride in __xts_prepare8 relies on.
             */
 878        vshr.s64        \tmp, \in, #63
 879        vand            \tmp, \tmp, \const
 880        vadd.u64        \out, \in, \in
 881        vext.8          \tmp, \tmp, \tmp, #8
 882        veor            \out, \out, \tmp
 883        .endm
 884
 885        /*
 886         * aesbs_xts_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
 887         *                   int blocks, u8 iv[], int reorder_last_tweak)
 888         * aesbs_xts_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
 889         *                   int blocks, u8 iv[], int reorder_last_tweak)
 890         */
 891__xts_prepare8:
            /*
             * Load up to eight input blocks, XOR each with its tweak and save
             * the tweaks for the caller to apply again after the cipher.
             *
             * Reads:   r1 (input pointer, post-incremented), r6 (blocks
             *          left), r7 (pointer to the iv/tweak), r8 (tweak reorder
             *          control, see below), sp (base of the tweak buffer,
             *          accessed with a :128 alignment hint)
             * Writes:  q0-q7 (whitened input), the tweak buffer at sp,
             *          [r7] (tweak to start the next round with)
             * Clobbers r4, ip, q12-q15 and the flags; returns with bx lr.
             */
 892        vld1.8          {q14}, [r7]             // load iv
 893        vmov.i32        d30, #0x87              // compose tweak mask vector
 894        vmovl.u32       q15, d30                // d30 = d31 = 0x87 as 64-bit values
 895        vshr.u64        d30, d31, #7            // d30 = 1, so q15 = { 1, 0x87 }
 896        vmov            q12, q14                // q12 and q14 alternate as current tweak
 897
            // ip = 0f - (blocks & 7) * 32: each group below (vld1, five
            // next_tweak instructions, veor, vst1) is eight instructions.
            // The last group ends with the cmple/ble pair in place of its
            // veor/vst1, which follow label 0 instead.  Because q12 == q14
            // on entry, it does not matter whether the jump lands on a group
            // that reads q12 or one that reads q14.
 898        adr             ip, 0f
 899        and             r4, r6, #7
 900        cmp             r6, #8                  // flags stay live until the cmple below
 901        sub             ip, ip, r4, lsl #5
 902        mov             r4, sp                  // r4 walks the tweak buffer
 903        movlt           pc, ip                  // computed goto if blocks < 8
 904
 905        vld1.8          {q0}, [r1]!
 906        next_tweak      q12, q14, q15, q13
 907        veor            q0, q0, q14
 908        vst1.8          {q14}, [r4, :128]!
 909
 910        vld1.8          {q1}, [r1]!
 911        next_tweak      q14, q12, q15, q13
 912        veor            q1, q1, q12
 913        vst1.8          {q12}, [r4, :128]!
 914
 915        vld1.8          {q2}, [r1]!
 916        next_tweak      q12, q14, q15, q13
 917        veor            q2, q2, q14
 918        vst1.8          {q14}, [r4, :128]!
 919
 920        vld1.8          {q3}, [r1]!
 921        next_tweak      q14, q12, q15, q13
 922        veor            q3, q3, q12
 923        vst1.8          {q12}, [r4, :128]!
 924
 925        vld1.8          {q4}, [r1]!
 926        next_tweak      q12, q14, q15, q13
 927        veor            q4, q4, q14
 928        vst1.8          {q14}, [r4, :128]!
 929
 930        vld1.8          {q5}, [r1]!
 931        next_tweak      q14, q12, q15, q13
 932        veor            q5, q5, q12
 933        vst1.8          {q12}, [r4, :128]!
 934
 935        vld1.8          {q6}, [r1]!
 936        next_tweak      q12, q14, q15, q13
 937        veor            q6, q6, q14
 938        vst1.8          {q14}, [r4, :128]!
 939
 940        vld1.8          {q7}, [r1]!
 941        next_tweak      q14, q12, q15, q13
            // Still using the flags from 'cmp r6, #8' (no instruction in
            // between touches them): in the last round (blocks <= 8) compare
            // r8 with 0 and branch to the swap at label 1 if r8 <= 0.
 942THUMB(  itt             le              )
 943        W(cmple)        r8, #0
 944        ble             1f
 9450:      veor            q7, q7, q12
 946        vst1.8          {q12}, [r4, :128]
 947
 948        vst1.8          {q14}, [r7]             // store next iv
 949        bx              lr
 950
            // Reorder path: the final block is whitened with the following
            // tweak, and the tweak it would normally have used is stored as
            // the next iv.  NOTE(review): presumably in support of ciphertext
            // stealing in the caller - confirm against the C glue code.
 9511:      vswp            q12, q14
 952        b               0b
 953ENDPROC(__xts_prepare8)
 954
 955        .macro          __xts_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
            /*
             * Common body of aesbs_xts_encrypt and aesbs_xts_decrypt.
             *
             *   \do8       - routine called (via bl) to process q0-q7
             *   \o0 - \o7  - the registers holding that routine's results,
             *                listed in the order of the blocks they belong to
             *
             * On entry ip carries the caller's 'reorder last tweak' value;
             * r0-r3 and the stack hold the C arguments.  Register use below:
             * r5 = saved sp, r6 = blocks left, r7 = iv pointer,
             * r8 = 1 - ip (tested by __xts_prepare8), r4 = bskey, then a
             * cursor into the tweak buffer.
             */
 956        push            {r4-r8, lr}             // six registers, 24 bytes
 957        mov             r5, sp                  // preserve sp
 958        ldrd            r6, r7, [sp, #24]       // get blocks and iv args
 959        rsb             r8, ip, #1              // r8 = 1 - reorder flag
 960        sub             ip, sp, #128            // make room for 8x tweak
 961        bic             ip, ip, #0xf            // align sp to 16 bytes
 962        mov             sp, ip
 963
            // One round: whiten up to 8 blocks and record their tweaks at sp
 96499:     bl              __xts_prepare8
 965
 966        mov             bskey, r2
 967        mov             rounds, r3
 968        bl              \do8
 969
            // ip = 0f - (blocks & 7) * 4: reload only as many tweaks as were
            // stored, into the highest-numbered registers of q8-q15.
 970        adr             ip, 0f
 971        and             lr, r6, #7
 972        cmp             r6, #8                  // flags also drive the second movlt
 973        sub             ip, ip, lr, lsl #2
 974        mov             r4, sp
 975        movlt           pc, ip                  // computed goto if blocks < 8
 976
 977        vld1.8          {q8}, [r4, :128]!
 978        vld1.8          {q9}, [r4, :128]!
 979        vld1.8          {q10}, [r4, :128]!
 980        vld1.8          {q11}, [r4, :128]!
 981        vld1.8          {q12}, [r4, :128]!
 982        vld1.8          {q13}, [r4, :128]!
 983        vld1.8          {q14}, [r4, :128]!
 984        vld1.8          {q15}, [r4, :128]
 985
            // ip = 1f - (blocks & 7) * 8: each veor/vst1 pair is one entry
 9860:      adr             ip, 1f
 987        sub             ip, ip, lr, lsl #3
 988        movlt           pc, ip                  // computed goto if blocks < 8
 989
 990        veor            \o0, \o0, q8
 991        vst1.8          {\o0}, [r0]!
 992        veor            \o1, \o1, q9
 993        vst1.8          {\o1}, [r0]!
 994        veor            \o2, \o2, q10
 995        vst1.8          {\o2}, [r0]!
 996        veor            \o3, \o3, q11
 997        vst1.8          {\o3}, [r0]!
 998        veor            \o4, \o4, q12
 999        vst1.8          {\o4}, [r0]!
1000        veor            \o5, \o5, q13

1001        vst1.8          {\o5}, [r0]!
1002        veor            \o6, \o6, q14
1003        vst1.8          {\o6}, [r0]!
1004        veor            \o7, \o7, q15
1005        vst1.8          {\o7}, [r0]!
1006
10071:      subs            r6, r6, #8
1008        bgt             99b
1009
1010        mov             sp, r5                  // drop the tweak buffer
1011        pop             {r4-r8, pc}
1012        .endm
1013
1014ENTRY(aesbs_xts_encrypt)
            // ip = 0 makes __xts_crypt set r8 = 1, so the tweak swap in
            // __xts_prepare8 is never taken.  The register list matches the
            // per-block XOR order used with aesbs_encrypt8 elsewhere in
            // this file.
1015        mov             ip, #0                  // never reorder final tweak
1016        __xts_crypt     aesbs_encrypt8, q0, q1, q4, q6, q3, q7, q2, q5
1017ENDPROC(aesbs_xts_encrypt)
1018
1019ENTRY(aesbs_xts_decrypt)
            // Nothing has been pushed yet, so [sp, #8] is the third stack
            // argument (after blocks and iv), i.e. reorder_last_tweak in the
            // prototype above.  __xts_crypt turns it into r8 = 1 - ip, which
            // __xts_prepare8 tests in the last round.
1020        ldr             ip, [sp, #8]            // reorder final tweak?
1021        __xts_crypt     aesbs_decrypt8, q0, q1, q6, q4, q2, q7, q3, q5
1022ENDPROC(aesbs_xts_decrypt)
1023