linux/arch/powerpc/crypto/aes-spe-core.S
<<
>>
Prefs
   1/* SPDX-License-Identifier: GPL-2.0-or-later */
   2/*
   3 * Fast AES implementation for SPE instruction set (PPC)
   4 *
   5 * This code makes use of the SPE SIMD instruction set as defined in
   6 * http://cache.freescale.com/files/32bit/doc/ref_manual/SPEPIM.pdf
   7 * Implementation is based on optimization guide notes from
   8 * http://cache.freescale.com/files/32bit/doc/app_note/AN2665.pdf
   9 *
  10 * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
  11 */
  12
  13#include <asm/ppc_asm.h>
  14#include "aes-spe-regs.h"
  15
  16#define EAD(in, bpos) \
  17        rlwimi          rT0,in,28-((bpos+3)%4)*8,20,27; /* rT0 bit field 0xff0 = byte bpos of in (0 = least significant), rest of rT0 kept */
  18
  19#define DAD(in, bpos) \
  20        rlwimi          rT1,in,24-((bpos+3)%4)*8,24,31; /* rT1 bit field 0xff = byte bpos of in (0 = least significant), rest of rT1 kept */
  21
  22#define LWH(out, off) \
  23        evlwwsplat      out,off(rT0);   /* load word high               */
  24
  25#define LWL(out, off) \
  26        lwz             out,off(rT0);   /* load word low                */
  27
  28#define LBZ(out, tab, off) \
  29        lbz             out,off(tab);   /* load byte                    */
  30
  31#define LAH(out, in, bpos, off) \
  32        EAD(in, bpos)                   /* calc addr + load word high   */ \
  33        LWH(out, off)
  34
  35#define LAL(out, in, bpos, off) \
  36        EAD(in, bpos)                   /* calc addr + load word low    */ \
  37        LWL(out, off)
  38
  39#define LAE(out, in, bpos) \
  40        EAD(in, bpos)                   /* calc addr + load enc byte    */ \
  41        LBZ(out, rT0, 8)
  42
  43#define LBE(out) \
  44        LBZ(out, rT0, 8)                /* load enc byte                */
  45
  46#define LAD(out, in, bpos) \
  47        DAD(in, bpos)                   /* calc addr + load dec byte    */ \
  48        LBZ(out, rT1, 0)
  49
  50#define LBD(out) \
  51        LBZ(out, rT1, 0)                /* load dec byte                */
  52
  53/*
  54 * ppc_encrypt_block: The central encryption function for a single 16 bytes
  55 * block. It does no stack handling or register saving to support fast calls
  56 * via bl/blr. It expects that caller has pre-xored input data with first
  57 * 4 words of encryption key into rD0-rD3. Pointer/counter registers must
  58 * have also been set up before (rT0, rKP, CTR). Output is stored in rD0-rD3
  59 * and rW0-rW3 and caller must execute a final xor on the output registers.
  60 * All working registers rD0-rD3 & rW0-rW7 are overwritten during processing.
  61 *
     * Layout of the code below:
     *  - the three lookups at entry are the same ones that end every round
     *    inside the loop, i.e. they are issued ahead of the first round
     *  - each loop iteration runs two table-lookup rounds and consumes the key
     *    material at 16..47(rKP); rKP is then advanced by 32 and bdnz counts
     *    the iterations down in CTR
     *  - after the loop follows one more table-lookup round (key material at
     *    16..31(rKP)) and a final round that uses byte loads (LBE/LAE)
     *  - on return rW0-rW3 hold the packed bytes of that final round and
     *    rD0-rD3 the four words loaded from 32..47(rKP)
     *
     * Besides the working registers, rT0 (its 0xff0 bit field is replaced by
     * every EAD) and rKP (advanced, never rewound) are modified as well.
  62 */
  63_GLOBAL(ppc_encrypt_block)
  64        LAH(rW4, rD1, 2, 4)             /* same three lookups as at the end of a round */
  65        LAH(rW6, rD0, 3, 0)
  66        LAH(rW3, rD0, 1, 8)
  67ppc_encrypt_block_loop:                 /* two rounds per iteration */
  68        LAH(rW0, rD3, 0, 12)
  69        LAL(rW0, rD0, 0, 12)
  70        LAH(rW1, rD1, 0, 12)
  71        LAH(rW2, rD2, 1, 8)
  72        LAL(rW2, rD3, 1, 8)
  73        LAL(rW3, rD1, 1, 8)
  74        LAL(rW4, rD2, 2, 4)
  75        LAL(rW6, rD1, 3, 0)
  76        LAH(rW5, rD3, 2, 4)
  77        LAL(rW5, rD0, 2, 4)
  78        LAH(rW7, rD2, 3, 0)
  79        evldw           rD1,16(rKP)     /* key material, xored with the lookups below */
  80        EAD(rD3, 3)
  81        evxor           rW2,rW2,rW4
  82        LWL(rW7, 0)
  83        evxor           rW2,rW2,rW6
  84        EAD(rD2, 0)
  85        evxor           rD1,rD1,rW2
  86        LWL(rW1, 12)
  87        evxor           rD1,rD1,rW0
  88        evldw           rD3,24(rKP)
  89        evmergehi       rD0,rD0,rD1     /* low word of rD0 = high word of rD1 */
  90        EAD(rD1, 2)                     /* lookups for the next round start here */
  91        evxor           rW3,rW3,rW5
  92        LWH(rW4, 4)
  93        evxor           rW3,rW3,rW7
  94        EAD(rD0, 3)
  95        evxor           rD3,rD3,rW3
  96        LWH(rW6, 0)
  97        evxor           rD3,rD3,rW1
  98        EAD(rD0, 1)
  99        evmergehi       rD2,rD2,rD3     /* low word of rD2 = high word of rD3 */
 100        LWH(rW3, 8)
 101        LAH(rW0, rD3, 0, 12)            /* second round: same schedule, key at 32/40(rKP) */
 102        LAL(rW0, rD0, 0, 12)
 103        LAH(rW1, rD1, 0, 12)
 104        LAH(rW2, rD2, 1, 8)
 105        LAL(rW2, rD3, 1, 8)
 106        LAL(rW3, rD1, 1, 8)
 107        LAL(rW4, rD2, 2, 4)
 108        LAL(rW6, rD1, 3, 0)
 109        LAH(rW5, rD3, 2, 4)
 110        LAL(rW5, rD0, 2, 4)
 111        LAH(rW7, rD2, 3, 0)
 112        evldw           rD1,32(rKP)
 113        EAD(rD3, 3)
 114        evxor           rW2,rW2,rW4
 115        LWL(rW7, 0)
 116        evxor           rW2,rW2,rW6
 117        EAD(rD2, 0)
 118        evxor           rD1,rD1,rW2
 119        LWL(rW1, 12)
 120        evxor           rD1,rD1,rW0
 121        evldw           rD3,40(rKP)
 122        evmergehi       rD0,rD0,rD1
 123        EAD(rD1, 2)
 124        evxor           rW3,rW3,rW5
 125        LWH(rW4, 4)
 126        evxor           rW3,rW3,rW7
 127        EAD(rD0, 3)
 128        evxor           rD3,rD3,rW3
 129        LWH(rW6, 0)
 130        evxor           rD3,rD3,rW1
 131        EAD(rD0, 1)
 132        evmergehi       rD2,rD2,rD3
 133        LWH(rW3, 8)
 134        addi            rKP,rKP,32      /* step over the key material of both rounds */
 135        bdnz            ppc_encrypt_block_loop  /* decrement CTR, loop while it is non-zero */
 136        LAH(rW0, rD3, 0, 12)            /* last table-lookup round, key at 16/24(rKP) */
 137        LAL(rW0, rD0, 0, 12)
 138        LAH(rW1, rD1, 0, 12)
 139        LAH(rW2, rD2, 1, 8)
 140        LAL(rW2, rD3, 1, 8)
 141        LAL(rW3, rD1, 1, 8)
 142        LAL(rW4, rD2, 2, 4)
 143        LAH(rW5, rD3, 2, 4)
 144        LAL(rW6, rD1, 3, 0)
 145        LAL(rW5, rD0, 2, 4)
 146        LAH(rW7, rD2, 3, 0)
 147        evldw           rD1,16(rKP)
 148        EAD(rD3, 3)
 149        evxor           rW2,rW2,rW4
 150        LWL(rW7, 0)
 151        evxor           rW2,rW2,rW6
 152        EAD(rD2, 0)
 153        evxor           rD1,rD1,rW2
 154        LWL(rW1, 12)
 155        evxor           rD1,rD1,rW0
 156        evldw           rD3,24(rKP)
 157        evmergehi       rD0,rD0,rD1
 158        EAD(rD1, 0)                     /* from here on byte loads at offset 8 (LBE/LAE) */
 159        evxor           rW3,rW3,rW5
 160        LBE(rW2)
 161        evxor           rW3,rW3,rW7
 162        EAD(rD0, 1)
 163        evxor           rD3,rD3,rW3
 164        LBE(rW6)
 165        evxor           rD3,rD3,rW1
 166        EAD(rD0, 0)
 167        evmergehi       rD2,rD2,rD3
 168        LBE(rW1)
 169        LAE(rW0, rD3, 0)
 170        LAE(rW1, rD0, 0)                /* NOTE(review): same lookup as LBE(rW1) above, rD0 unchanged in between - looks redundant, confirm */
 171        LAE(rW4, rD2, 1)
 172        LAE(rW5, rD3, 1)
 173        LAE(rW3, rD2, 0)
 174        LAE(rW7, rD1, 1)
 175        rlwimi          rW0,rW4,8,16,23 /* pack looked-up bytes: bits 8-15 of the result word */
 176        rlwimi          rW1,rW5,8,16,23
 177        LAE(rW4, rD1, 2)
 178        LAE(rW5, rD2, 2)
 179        rlwimi          rW2,rW6,8,16,23
 180        rlwimi          rW3,rW7,8,16,23
 181        LAE(rW6, rD3, 2)
 182        LAE(rW7, rD0, 2)
 183        rlwimi          rW0,rW4,16,8,15 /* bits 16-23 of the result word */
 184        rlwimi          rW1,rW5,16,8,15
 185        LAE(rW4, rD0, 3)
 186        LAE(rW5, rD1, 3)
 187        rlwimi          rW2,rW6,16,8,15
 188        lwz             rD0,32(rKP)     /* words at 32..47(rKP) are handed back in rD0-rD3 */
 189        rlwimi          rW3,rW7,16,8,15
 190        lwz             rD1,36(rKP)
 191        LAE(rW6, rD2, 3)
 192        LAE(rW7, rD3, 3)
 193        rlwimi          rW0,rW4,24,0,7  /* bits 24-31 of the result word */
 194        lwz             rD2,40(rKP)
 195        rlwimi          rW1,rW5,24,0,7
 196        lwz             rD3,44(rKP)
 197        rlwimi          rW2,rW6,24,0,7
 198        rlwimi          rW3,rW7,24,0,7
 199        blr
 200
 201/*
 202 * ppc_decrypt_block: The central decryption function for a single 16 bytes
 203 * block. It does no stack handling or register saving to support fast calls
 204 * via bl/blr. It expects that caller has pre-xored input data with first
 205 * 4 words of encryption key into rD0-rD3. Pointer/counter registers must
 206 * have also been set up before (rT0, rT1, rKP, CTR). Output is stored in
 207 * rD0-rD3 and rW0-rW3 and caller must execute a final xor on the output
 208 * registers. All working registers rD0-rD3 & rW0-rW7 are overwritten during
 209 * processing.
     *
     * NOTE(review): "encryption key" above reads like a copy of the encrypt
     * header; presumably the decryption key schedule is meant - confirm
     * against the callers.
     *
     * rT1 is needed in addition to rT0: the final round loads its bytes
     * through rT1 (DAD/LBD/LAD) and DAD only replaces the low 8 bits, so the
     * remaining bits must already address the byte table.
     *
     * Layout of the code below:
     *  - the three lookups at entry are the same ones that end every round
     *    inside the loop, i.e. they are issued ahead of the first round
     *  - each loop iteration runs two table-lookup rounds and consumes the key
     *    material at 16..47(rKP); rKP is then advanced by 32 and bdnz counts
     *    the iterations down in CTR
     *  - after the loop follows one more table-lookup round (key material at
     *    16..31(rKP)) and a final round that uses byte loads via rT1
     *  - on return rW0-rW3 hold the packed bytes of that final round and
     *    rD0-rD3 the four words loaded from 32..47(rKP)
     *
     * Besides the working registers, rT0 (0xff0 bit field), rT1 (low 8 bits)
     * and rKP (advanced, never rewound) are modified as well.
 210 */
 211_GLOBAL(ppc_decrypt_block)
 212        LAH(rW0, rD1, 0, 12)            /* same three lookups as at the end of a round */
 213        LAH(rW6, rD0, 3, 0)
 214        LAH(rW3, rD0, 1, 8)
 215ppc_decrypt_block_loop:                 /* two rounds per iteration */
 216        LAH(rW1, rD3, 0, 12)
 217        LAL(rW0, rD2, 0, 12)
 218        LAH(rW2, rD2, 1, 8)
 219        LAL(rW2, rD3, 1, 8)
 220        LAH(rW4, rD3, 2, 4)
 221        LAL(rW4, rD0, 2, 4)
 222        LAL(rW6, rD1, 3, 0)
 223        LAH(rW5, rD1, 2, 4)
 224        LAH(rW7, rD2, 3, 0)
 225        LAL(rW7, rD3, 3, 0)
 226        LAL(rW3, rD1, 1, 8)
 227        evldw           rD1,16(rKP)     /* key material, xored with the lookups below */
 228        EAD(rD0, 0)
 229        evxor           rW4,rW4,rW6
 230        LWL(rW1, 12)
 231        evxor           rW0,rW0,rW4
 232        EAD(rD2, 2)
 233        evxor           rW0,rW0,rW2
 234        LWL(rW5, 4)
 235        evxor           rD1,rD1,rW0
 236        evldw           rD3,24(rKP)
 237        evmergehi       rD0,rD0,rD1     /* low word of rD0 = high word of rD1 */
 238        EAD(rD1, 0)                     /* lookups for the next round start here */
 239        evxor           rW3,rW3,rW7
 240        LWH(rW0, 12)
 241        evxor           rW3,rW3,rW1
 242        EAD(rD0, 3)
 243        evxor           rD3,rD3,rW3
 244        LWH(rW6, 0)
 245        evxor           rD3,rD3,rW5
 246        EAD(rD0, 1)
 247        evmergehi       rD2,rD2,rD3     /* low word of rD2 = high word of rD3 */
 248        LWH(rW3, 8)
 249        LAH(rW1, rD3, 0, 12)            /* second round: same schedule, key at 32/40(rKP) */
 250        LAL(rW0, rD2, 0, 12)
 251        LAH(rW2, rD2, 1, 8)
 252        LAL(rW2, rD3, 1, 8)
 253        LAH(rW4, rD3, 2, 4)
 254        LAL(rW4, rD0, 2, 4)
 255        LAL(rW6, rD1, 3, 0)
 256        LAH(rW5, rD1, 2, 4)
 257        LAH(rW7, rD2, 3, 0)
 258        LAL(rW7, rD3, 3, 0)
 259        LAL(rW3, rD1, 1, 8)
 260        evldw            rD1,32(rKP)
 261        EAD(rD0, 0)
 262        evxor           rW4,rW4,rW6
 263        LWL(rW1, 12)
 264        evxor           rW0,rW0,rW4
 265        EAD(rD2, 2)
 266        evxor           rW0,rW0,rW2
 267        LWL(rW5, 4)
 268        evxor           rD1,rD1,rW0
 269        evldw           rD3,40(rKP)
 270        evmergehi       rD0,rD0,rD1
 271        EAD(rD1, 0)
 272        evxor           rW3,rW3,rW7
 273        LWH(rW0, 12)
 274        evxor           rW3,rW3,rW1
 275        EAD(rD0, 3)
 276        evxor           rD3,rD3,rW3
 277        LWH(rW6, 0)
 278        evxor           rD3,rD3,rW5
 279        EAD(rD0, 1)
 280        evmergehi       rD2,rD2,rD3
 281        LWH(rW3, 8)
 282        addi            rKP,rKP,32      /* step over the key material of both rounds */
 283        bdnz            ppc_decrypt_block_loop  /* decrement CTR, loop while it is non-zero */
 284        LAH(rW1, rD3, 0, 12)            /* last table-lookup round, key at 16/24(rKP) */
 285        LAL(rW0, rD2, 0, 12)
 286        LAH(rW2, rD2, 1, 8)
 287        LAL(rW2, rD3, 1, 8)
 288        LAH(rW4, rD3, 2, 4)
 289        LAL(rW4, rD0, 2, 4)
 290        LAL(rW6, rD1, 3, 0)
 291        LAH(rW5, rD1, 2, 4)
 292        LAH(rW7, rD2, 3, 0)
 293        LAL(rW7, rD3, 3, 0)
 294        LAL(rW3, rD1, 1, 8)
 295        evldw            rD1,16(rKP)
 296        EAD(rD0, 0)
 297        evxor           rW4,rW4,rW6
 298        LWL(rW1, 12)
 299        evxor           rW0,rW0,rW4
 300        EAD(rD2, 2)
 301        evxor           rW0,rW0,rW2
 302        LWL(rW5, 4)
 303        evxor           rD1,rD1,rW0
 304        evldw           rD3,24(rKP)
 305        evmergehi       rD0,rD0,rD1
 306        DAD(rD1, 0)                     /* from here on byte loads through rT1 (DAD/LBD/LAD) */
 307        evxor           rW3,rW3,rW7
 308        LBD(rW0)
 309        evxor           rW3,rW3,rW1
 310        DAD(rD0, 1)
 311        evxor           rD3,rD3,rW3
 312        LBD(rW6)
 313        evxor           rD3,rD3,rW5
 314        DAD(rD0, 0)
 315        evmergehi       rD2,rD2,rD3
 316        LBD(rW3)
 317        LAD(rW2, rD3, 0)
 318        LAD(rW1, rD2, 0)
 319        LAD(rW4, rD2, 1)
 320        LAD(rW5, rD3, 1)
 321        LAD(rW7, rD1, 1)
 322        rlwimi          rW0,rW4,8,16,23 /* pack looked-up bytes: bits 8-15 of the result word */
 323        rlwimi          rW1,rW5,8,16,23
 324        LAD(rW4, rD3, 2)
 325        LAD(rW5, rD0, 2)
 326        rlwimi          rW2,rW6,8,16,23
 327        rlwimi          rW3,rW7,8,16,23
 328        LAD(rW6, rD1, 2)
 329        LAD(rW7, rD2, 2)
 330        rlwimi          rW0,rW4,16,8,15 /* bits 16-23 of the result word */
 331        rlwimi          rW1,rW5,16,8,15
 332        LAD(rW4, rD0, 3)
 333        LAD(rW5, rD1, 3)
 334        rlwimi          rW2,rW6,16,8,15
 335        lwz             rD0,32(rKP)     /* words at 32..47(rKP) are handed back in rD0-rD3 */
 336        rlwimi          rW3,rW7,16,8,15
 337        lwz             rD1,36(rKP)
 338        LAD(rW6, rD2, 3)
 339        LAD(rW7, rD3, 3)
 340        rlwimi          rW0,rW4,24,0,7  /* bits 24-31 of the result word */
 341        lwz             rD2,40(rKP)
 342        rlwimi          rW1,rW5,24,0,7
 343        lwz             rD3,44(rKP)
 344        rlwimi          rW2,rW6,24,0,7
 345        rlwimi          rW3,rW7,24,0,7
 346        blr
 347