linux/arch/hexagon/lib/memset.S
<<
>>
Prefs
   1/* SPDX-License-Identifier: GPL-2.0-only */
   2/*
   3 * Copyright (c) 2011, The Linux Foundation. All rights reserved.
   4 */
   5
   6
   7/* HEXAGON assembly optimized memset */
   8/* Replaces the standard library function memset */
   9
  10
  11        .macro HEXAGON_OPT_FUNC_BEGIN name
            /*
             * Opens a function: selects the .text section, aligns the
             * location counter, exports the symbol passed as the macro
             * argument, marks it as a function and places its label.
             * Pair with HEXAGON_OPT_FUNC_FINISH after the function body.
             */
  12        .text
  13        .p2align 4        /* align to 2^4 = 16 bytes */
  14        .globl \name
  15        .type  \name, @function
  16\name:
  17        .endm
  18
  19        .macro HEXAGON_OPT_FUNC_FINISH name
            /*
             * Closes a function opened with HEXAGON_OPT_FUNC_BEGIN: records
             * the symbol size as the distance from the function label to
             * the current location, so it must follow the last instruction.
             */
  20        .size  \name, . - \name
  21        .endm
  22
  23/* FUNCTION: memset (v2 version) */
    /*
     * Assembled only when __HEXAGON_ARCH__ < 3.
     *
     * Register use, as read from the code below:
     *   r0    destination pointer on entry; never written here, so it
     *         still holds the original pointer at every return
     *   r1    fill value; r4 = vsplatb(r1) is the pattern that is stored
     *   r2    byte count; low word of the r3:2 "bytes remaining" pair
     *   r3    high word of that pair, set to 0
     *   r5:4  doubleword store pattern (r5 is copied from r4)
     *   r7:6  amount subtracted from r3:2 after a store (r7 is 0)
     *   r8    running store pointer (copy of r0)
     *   r9    8 - (r0 & 7); bits 0, 1 and 2 select the byte, halfword
     *         and word stores that bring r8 to 8-byte alignment
     *   r10   doubleword loop count
     *   r31   return address (every exit is "jumpr r31")
     *
     * NOTE(review): the conditional jumps below name p0/p1 without
     * ".new"; they are understood to test the predicate value from
     * before their own packet, so a compare placed in the same packet
     * feeds the NEXT decision.  The comments below rely on that reading;
     * confirm against the Hexagon programmer's reference before
     * reordering anything.
     */
  24#if __HEXAGON_ARCH__ < 3
  25HEXAGON_OPT_FUNC_BEGIN memset
            /*
             * r7 = low three bits of the destination,
             * p0 = (count == 0), p1 = (count > 7, unsigned).
             */
  26        {
  27                r6 = #8
  28                r7 = extractu(r0, #3 , #0)
  29                p0 = cmp.eq(r2, #0)
  30                p1 = cmp.gtu(r2, #7)
  31        }
  32        {
  33                r4 = vsplatb(r1)
  34                r8 = r0           /* leave r0 intact for return val  */
  35                r9 = sub(r6, r7)  /* bytes until double alignment  */
  36                if p0 jumpr r31   /* count == 0, so return  */
  37        }
            /*
             * Clear the high words so r3:2 and r7:6 can be used as pairs.
             * p0 = bit 0 of r9 is prepared for label 2.  Counts above 7
             * (p1) take the alignment path at label 2.
             */
  38        {
  39                r3 = #0
  40                r7 = #0
  41                p0 = tstbit(r9, #0)
  42                if p1 jump 2f /* skip byte loop */
  43        }
  44
  45/* less than 8 bytes to set, so just set a byte at a time and return  */
  46
  47                loop0(1f, r2) /* byte loop */
  48        .falign
  491: /* byte loop */
  50        {
  51                memb(r8++#1) = r4
  52        }:endloop0
  53                jumpr r31
  54        .falign
  552: /* skip byte loop */
            /*
             * Step r6 = 1.  The jump tests bit 0 of r9 (set up before this
             * packet); the p0 written here (bit 1 of r9) is for label 3.
             * p1 = (remaining == 1) would end the function after the store.
             */
  56        {
  57                r6 = #1
  58                p0 = tstbit(r9, #1)
  59                p1 = cmp.eq(r2, #1)
  60                if !p0 jump 3f /* skip initial byte store */
  61        }
            /* Store one byte and subtract the step from the remaining count. */
  62        {
  63                memb(r8++#1) = r4
  64                r3:2 = sub(r3:2, r7:6)
  65                if p1 jumpr r31
  66        }
  67        .falign
  683: /* skip initial byte store */
            /*
             * Step r6 = 2.  The jump tests bit 1 of r9; the p0 written here
             * (bit 2 of r9) is for label 4.
             */
  69        {
  70                r6 = #2
  71                p0 = tstbit(r9, #2)
  72                p1 = cmp.eq(r2, #2)
  73                if !p0 jump 4f /* skip initial half store */
  74        }
            /* Store one halfword and subtract the step. */
  75        {
  76                memh(r8++#2) = r4
  77                r3:2 = sub(r3:2, r7:6)
  78                if p1 jumpr r31
  79        }
  80        .falign
  814: /* skip initial half store */
            /*
             * Step r6 = 4.  The jump tests bit 2 of r9.  If the word store
             * is skipped, p0 = (remaining > 7) is what label 5 sees.
             */
  82        {
  83                r6 = #4
  84                p0 = cmp.gtu(r2, #7)
  85                p1 = cmp.eq(r2, #4)
  86                if !p0 jump 5f /* skip initial word store */
  87        }
            /*
             * Store one word.  The compare reads r2 from before the
             * subtraction, so "> 11" here means "more than 7 bytes left
             * after this word"; that p0 is what label 5 sees.
             */
  88        {
  89                memw(r8++#4) = r4
  90                r3:2 = sub(r3:2, r7:6)
  91                p0 = cmp.gtu(r2, #11)
  92                if p1 jumpr r31
  93        }
  94        .falign
  955: /* skip initial word store */
            /*
             * r10 = number of whole doublewords left.  With p0 clear (7 or
             * fewer bytes left) go straight to the tail at label 7.
             * NOTE(review): r3 was set to 0 above, so the compare with 1
             * looks like a way to leave p1 clear for label 7 -- confirm.
             */
  96        {
  97                r10 = lsr(r2, #3)
  98                p1 = cmp.eq(r3, #1)
  99                if !p0 jump 7f /* skip double loop */
 100        }
            /* Build the doubleword pattern r5:4, set the step to 8, start the loop. */
 101        {
 102                r5 = r4
 103                r6 = #8
 104                loop0(6f, r10) /* double loop */
 105        }
 106
 107/* set bytes a double word at a time  */
 108
 109        .falign
 1106: /* double loop */
            /*
             * p1 compares the count from before this iteration's
             * subtraction, so after the last iteration p1 is set exactly
             * when nothing is left.
             */
 111        {
 112                memd(r8++#8) = r5:4
 113                r3:2 = sub(r3:2, r7:6)
 114                p1 = cmp.eq(r2, #8)
 115        }:endloop0
 116        .falign
 1177: /* skip double loop */
            /* Tail: p0 = bit 2 of the remaining count; return if p1 says done. */
 118        {
 119                p0 = tstbit(r2, #2)
 120                if p1 jumpr r31
 121        }
            /*
             * The jump tests bit 2 (from the packet above); the p0 written
             * here (bit 1 of the remaining count) is used at label 8.
             * p1 = (remaining == 4) ends the function after the word store.
             */
 122        {
 123                r6 = #4
 124                p0 = tstbit(r2, #1)
 125                p1 = cmp.eq(r2, #4)
 126                if !p0 jump 8f /* skip final word store */
 127        }
 128        {
 129                memw(r8++#4) = r4
 130                r3:2 = sub(r3:2, r7:6)
 131                if p1 jumpr r31
 132        }
 133        .falign
 1348: /* skip final word store */
            /*
             * The jump tests bit 1 of the remaining count.
             * p1 = (remaining == 2) ends the function after the half store.
             */
 135        {
 136                p1 = cmp.eq(r2, #2)
 137                if !p0 jump 9f /* skip final half store */
 138        }
            /*
             * r2 is not updated here; p1 from the packet above already says
             * whether the halfword was the last thing to store.
             */
 139        {
 140                memh(r8++#2) = r4
 141                if p1 jumpr r31
 142        }
 143        .falign
 1449: /* skip final half store */
            /*
             * As far as the control flow above shows, every path that has
             * nothing left has already returned, so one byte remains here.
             */
 145        {
 146                memb(r8++#1) = r4
 147                jumpr r31
 148        }
 149HEXAGON_OPT_FUNC_FINISH memset
 150#endif
 151
 152
 153/*  FUNCTION: memset (v3 and higher version)  */
    /*
     * Assembled only when __HEXAGON_ARCH__ >= 3.
     *
     * Register use, as read from the code below:
     *   r0    destination pointer on entry; never written here
     *   r1    fill value; stored directly by the byte stores
     *   r2    bytes remaining
     *   r3    store pointer of the short byte loop; later a scratch
     *         register for the alignment mask and the 32-byte chunk count
     *   r6    running store pointer on every other path (copy of r0)
     *   r7    vsplatb(r1); pattern for halfword and word stores
     *   r5:4  combine(r7,r7); pattern for doubleword stores
     *   r8    doubleword loop count
     *   r31   return address (every exit is "jumpr r31")
     *
     * Outline: a zero count returns at once; counts of 8 or less use a
     * byte loop; larger counts align r6 to 8 bytes with byte, halfword
     * and word stores, send counts above 127 through a loop over 32-byte
     * chunks that uses dczeroa, then finish with a doubleword loop and
     * word, halfword and byte tail stores.
     */
 154#if __HEXAGON_ARCH__ >= 3
 155HEXAGON_OPT_FUNC_BEGIN memset
            /* Build the splat pattern, copy the pointer, return if count == 0. */
 156        {
 157                r7=vsplatb(r1)
 158                r6 = r0
 159                if (r2==#0) jump:nt .L1
 160        }
            /* Counts above 8 take the aligned path at .L3. */
 161        {
 162                r5:4=combine(r7,r7)
 163                p0 = cmp.gtu(r2,#8)
 164                if (p0.new) jump:nt .L3
 165        }
            /* Short fill: r2 single-byte stores through r3, then return. */
 166        {
 167                r3 = r0
 168                loop0(.L47,r2)
 169        }
 170        .falign
 171.L47:
 172        {
 173                memb(r3++#1) = r1
 174        }:endloop0 /* start=.L47 */
 175                jumpr r31
 176.L3:
            /* Odd destination address: one byte store is needed first. */
 177        {
 178                p0 = tstbit(r0,#0)
 179                if (!p0.new) jump:nt .L8
 180                p1 = cmp.eq(r2, #1)
 181        }
            /* Store the byte at r0 and continue from r0 + 1 with count - 1. */
 182        {
 183                r6 = add(r0, #1)
 184                r2 = add(r2,#-1)
 185                memb(r0) = r1
 186                if (p1) jump .L1
 187        }
 188.L8:
            /* Bit 1 of the pointer set: one halfword store is needed. */
 189        {
 190                p0 = tstbit(r6,#1)
 191                if (!p0.new) jump:nt .L10
 192        }
            /*
             * NOTE(review): the compare is understood to read r2 from
             * before the add in this packet, i.e. it returns when the
             * halfword was all that was left -- confirm.
             */
 193        {
 194                r2 = add(r2,#-2)
 195                memh(r6++#2) = r7
 196                p0 = cmp.eq(r2, #2)
 197                if (p0.new) jump:nt .L1
 198        }
 199.L10:
            /* Bit 2 of the pointer set: one word store is needed. */
 200        {
 201                p0 = tstbit(r6,#2)
 202                if (!p0.new) jump:nt .L12
 203        }
 204        {
 205                r2 = add(r2,#-4)
 206                memw(r6++#4) = r7
 207                p0 = cmp.eq(r2, #4)
 208                if (p0.new) jump:nt .L1
 209        }
 210.L12:
            /*
             * r6 is 8-byte aligned from here on.  Only counts above 127 use
             * the 32-byte chunk loops; the rest go to the doubleword loop.
             */
 211        {
 212                p0 = cmp.gtu(r2,#127)
 213                if (!p0.new) jump:nt .L14
 214        }
            /*
             * Advance r6 to a 32-byte boundary with up to three doubleword
             * stores, testing the low five address bits before each one.
             * After a third store the boundary has been reached, so the
             * code falls into .L17 without another test.
             */
 215                r3 = and(r6,#31)
 216                if (r3==#0) jump:nt .L17
 217        {
 218                memd(r6++#8) = r5:4
 219                r2 = add(r2,#-8)
 220        }
 221                r3 = and(r6,#31)
 222                if (r3==#0) jump:nt .L17
 223        {
 224                memd(r6++#8) = r5:4
 225                r2 = add(r2,#-8)
 226        }
 227                r3 = and(r6,#31)
 228                if (r3==#0) jump:nt .L17
 229        {
 230                memd(r6++#8) = r5:4
 231                r2 = add(r2,#-8)
 232        }
 233.L17:
            /*
             * r3 = number of whole 32-byte chunks left.  A non-zero r1
             * (the whole register is tested) uses the store loop at .L18.
             */
 234        {
 235                r3 = lsr(r2,#5)
 236                if (r1!=#0) jump:nt .L18
 237        }
            /*
             * Zero fill.  NOTE(review): loop0 is understood to take the r3
             * from before this packet (the chunk count); the r8 and r3
             * values written here do not appear to be read before they are
             * overwritten or abandoned -- confirm.
             */
 238        {
 239                r8 = r3
 240                r3 = r6
 241                loop0(.L46,r3)
 242        }
 243        .falign
 244.L46:
            /*
             * One chunk per iteration with no ordinary store: the loop
             * advances r6 and reduces r2 by 32 per dczeroa.
             * NOTE(review): dczeroa is taken to zero the 32-byte cache line
             * at r6 -- confirm in the Hexagon reference.
             */
 245        {
 246                dczeroa(r6)
 247                r6 = add(r6,#32)
 248                r2 = add(r2,#-32)
 249        }:endloop0 /* start=.L46 */
 250.L14:
            /*
             * Doubleword loop for what is left.  With 7 or fewer bytes left
             * skip to the tail at .L28; r8 is the doubleword count.
             */
 251        {
 252                p0 = cmp.gtu(r2,#7)
 253                if (!p0.new) jump:nt .L28
 254                r8 = lsr(r2,#3)
 255        }
 256                loop0(.L44,r8)
 257        .falign
 258.L44:
 259        {
 260                memd(r6++#8) = r5:4
 261                r2 = add(r2,#-8)
 262        }:endloop0 /* start=.L44 */
 263.L28:
            /* Tail: bit 2 of the remaining count selects a word store. */
 264        {
 265                p0 = tstbit(r2,#2)
 266                if (!p0.new) jump:nt .L33
 267        }
 268        {
 269                r2 = add(r2,#-4)
 270                memw(r6++#4) = r7
 271        }
 272.L33:
            /* Bit 1 of the remaining count selects a halfword store. */
 273        {
 274                p0 = tstbit(r2,#1)
 275                if (!p0.new) jump:nt .L35
 276        }
 277        {
 278                r2 = add(r2,#-2)
 279                memh(r6++#2) = r7
 280        }
 281.L35:
            /* Store the last byte only if exactly one byte remains. */
 282                p0 = cmp.eq(r2,#1)
 283                if (p0) memb(r6) = r1
 284.L1:
            /* Common return point; r0 still holds the original pointer. */
 285                jumpr r31
 286.L18:
            /*
             * Non-zero fill, one 32-byte chunk per iteration (r3 chunks):
             * dczeroa on the line at r6, then four doubleword stores that
             * cover the same 32 bytes.
             * NOTE(review): dczeroa presumably prepares the cache line so
             * the stores need not fetch it from memory first -- confirm.
             */
 287                loop0(.L45,r3)
 288        .falign
 289.L45:
 290                dczeroa(r6)
 291        {
 292                memd(r6++#8) = r5:4
 293                r2 = add(r2,#-32)
 294        }
 295                memd(r6++#8) = r5:4
 296                memd(r6++#8) = r5:4
 297        {
 298                memd(r6++#8) = r5:4
 299        }:endloop0 /* start=.L45  */
            /* Finish the remaining bytes with the doubleword loop and tail. */
 300                jump .L14
 301HEXAGON_OPT_FUNC_FINISH memset
 302#endif
 303