/* linux/arch/hexagon/lib/memset.S */
/*
 * Copyright (c) 2011, The Linux Foundation. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 and
 * only version 2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 * 02110-1301, USA.
 */


/* HEXAGON assembly optimized memset */
/* Replaces the standard library function memset */


  24        .macro HEXAGON_OPT_FUNC_BEGIN name
  25        .text
  26        .p2align 4
  27        .globl \name
  28        .type  \name, @function
  29\name:
  30        .endm
  31
  32        .macro HEXAGON_OPT_FUNC_FINISH name
  33        .size  \name, . - \name
  34        .endm
  35
  36/* FUNCTION: memset (v2 version) */
  37#if __HEXAGON_ARCH__ < 3
  38HEXAGON_OPT_FUNC_BEGIN memset
  39        {
  40                r6 = #8
  41                r7 = extractu(r0, #3 , #0)
  42                p0 = cmp.eq(r2, #0)
  43                p1 = cmp.gtu(r2, #7)
  44        }
  45        {
  46                r4 = vsplatb(r1)
  47                r8 = r0           /* leave r0 intact for return val  */
  48                r9 = sub(r6, r7)  /* bytes until double alignment  */
  49                if p0 jumpr r31   /* count == 0, so return  */
  50        }
  51        {
  52                r3 = #0
  53                r7 = #0
  54                p0 = tstbit(r9, #0)
  55                if p1 jump 2f /* skip byte loop */
  56        }
  57
  58/* less than 8 bytes to set, so just set a byte at a time and return  */
  59
  60                loop0(1f, r2) /* byte loop */
  61        .falign
  621: /* byte loop */
  63        {
  64                memb(r8++#1) = r4
  65        }:endloop0
  66                jumpr r31
  67        .falign
  682: /* skip byte loop */
  69        {
  70                r6 = #1
  71                p0 = tstbit(r9, #1)
  72                p1 = cmp.eq(r2, #1)
  73                if !p0 jump 3f /* skip initial byte store */
  74        }
  75        {
  76                memb(r8++#1) = r4
  77                r3:2 = sub(r3:2, r7:6)
  78                if p1 jumpr r31
  79        }
  80        .falign
  813: /* skip initial byte store */
  82        {
  83                r6 = #2
  84                p0 = tstbit(r9, #2)
  85                p1 = cmp.eq(r2, #2)
  86                if !p0 jump 4f /* skip initial half store */
  87        }
  88        {
  89                memh(r8++#2) = r4
  90                r3:2 = sub(r3:2, r7:6)
  91                if p1 jumpr r31
  92        }
  93        .falign
  944: /* skip initial half store */
  95        {
  96                r6 = #4
  97                p0 = cmp.gtu(r2, #7)
  98                p1 = cmp.eq(r2, #4)
  99                if !p0 jump 5f /* skip initial word store */
 100        }
 101        {
 102                memw(r8++#4) = r4
 103                r3:2 = sub(r3:2, r7:6)
 104                p0 = cmp.gtu(r2, #11)
 105                if p1 jumpr r31
 106        }
 107        .falign
 1085: /* skip initial word store */
 109        {
 110                r10 = lsr(r2, #3)
 111                p1 = cmp.eq(r3, #1)
 112                if !p0 jump 7f /* skip double loop */
 113        }
 114        {
 115                r5 = r4
 116                r6 = #8
 117                loop0(6f, r10) /* double loop */
 118        }
 119
 120/* set bytes a double word at a time  */
 121
 122        .falign
 1236: /* double loop */
 124        {
 125                memd(r8++#8) = r5:4
 126                r3:2 = sub(r3:2, r7:6)
 127                p1 = cmp.eq(r2, #8)
 128        }:endloop0
 129        .falign
 1307: /* skip double loop */
 131        {
 132                p0 = tstbit(r2, #2)
 133                if p1 jumpr r31
 134        }
 135        {
 136                r6 = #4
 137                p0 = tstbit(r2, #1)
 138                p1 = cmp.eq(r2, #4)
 139                if !p0 jump 8f /* skip final word store */
 140        }
 141        {
 142                memw(r8++#4) = r4
 143                r3:2 = sub(r3:2, r7:6)
 144                if p1 jumpr r31
 145        }
 146        .falign
 1478: /* skip final word store */
 148        {
 149                p1 = cmp.eq(r2, #2)
 150                if !p0 jump 9f /* skip final half store */
 151        }
 152        {
 153                memh(r8++#2) = r4
 154                if p1 jumpr r31
 155        }
 156        .falign
 1579: /* skip final half store */
 158        {
 159                memb(r8++#1) = r4
 160                jumpr r31
 161        }
 162HEXAGON_OPT_FUNC_FINISH memset
 163#endif
 164
 165
 166/*  FUNCTION: memset (v3 and higher version)  */
 167#if __HEXAGON_ARCH__ >= 3
 168HEXAGON_OPT_FUNC_BEGIN memset
 169        {
 170                r7=vsplatb(r1)
 171                r6 = r0
 172                if (r2==#0) jump:nt .L1
 173        }
 174        {
 175                r5:4=combine(r7,r7)
 176                p0 = cmp.gtu(r2,#8)
 177                if (p0.new) jump:nt .L3
 178        }
 179        {
 180                r3 = r0
 181                loop0(.L47,r2)
 182        }
 183        .falign
 184.L47:
 185        {
 186                memb(r3++#1) = r1
 187        }:endloop0 /* start=.L47 */
 188                jumpr r31
 189.L3:
 190        {
 191                p0 = tstbit(r0,#0)
 192                if (!p0.new) jump:nt .L8
 193                p1 = cmp.eq(r2, #1)
 194        }
 195        {
 196                r6 = add(r0, #1)
 197                r2 = add(r2,#-1)
 198                memb(r0) = r1
 199                if (p1) jump .L1
 200        }
 201.L8:
 202        {
 203                p0 = tstbit(r6,#1)
 204                if (!p0.new) jump:nt .L10
 205        }
 206        {
 207                r2 = add(r2,#-2)
 208                memh(r6++#2) = r7
 209                p0 = cmp.eq(r2, #2)
 210                if (p0.new) jump:nt .L1
 211        }
 212.L10:
 213        {
 214                p0 = tstbit(r6,#2)
 215                if (!p0.new) jump:nt .L12
 216        }
 217        {
 218                r2 = add(r2,#-4)
 219                memw(r6++#4) = r7
 220                p0 = cmp.eq(r2, #4)
 221                if (p0.new) jump:nt .L1
 222        }
 223.L12:
 224        {
 225                p0 = cmp.gtu(r2,#127)
 226                if (!p0.new) jump:nt .L14
 227        }
 228                r3 = and(r6,#31)
 229                if (r3==#0) jump:nt .L17
 230        {
 231                memd(r6++#8) = r5:4
 232                r2 = add(r2,#-8)
 233        }
 234                r3 = and(r6,#31)
 235                if (r3==#0) jump:nt .L17
 236        {
 237                memd(r6++#8) = r5:4
 238                r2 = add(r2,#-8)
 239        }
 240                r3 = and(r6,#31)
 241                if (r3==#0) jump:nt .L17
 242        {
 243                memd(r6++#8) = r5:4
 244                r2 = add(r2,#-8)
 245        }
 246.L17:
 247        {
 248                r3 = lsr(r2,#5)
 249                if (r1!=#0) jump:nt .L18
 250        }
 251        {
 252                r8 = r3
 253                r3 = r6
 254                loop0(.L46,r3)
 255        }
 256        .falign
 257.L46:
 258        {
 259                dczeroa(r6)
 260                r6 = add(r6,#32)
 261                r2 = add(r2,#-32)
 262        }:endloop0 /* start=.L46 */
 263.L14:
 264        {
 265                p0 = cmp.gtu(r2,#7)
 266                if (!p0.new) jump:nt .L28
 267                r8 = lsr(r2,#3)
 268        }
 269                loop0(.L44,r8)
 270        .falign
 271.L44:
 272        {
 273                memd(r6++#8) = r5:4
 274                r2 = add(r2,#-8)
 275        }:endloop0 /* start=.L44 */
 276.L28:
 277        {
 278                p0 = tstbit(r2,#2)
 279                if (!p0.new) jump:nt .L33
 280        }
 281        {
 282                r2 = add(r2,#-4)
 283                memw(r6++#4) = r7
 284        }
 285.L33:
 286        {
 287                p0 = tstbit(r2,#1)
 288                if (!p0.new) jump:nt .L35
 289        }
 290        {
 291                r2 = add(r2,#-2)
 292                memh(r6++#2) = r7
 293        }
 294.L35:
 295                p0 = cmp.eq(r2,#1)
 296                if (p0) memb(r6) = r1
 297.L1:
 298                jumpr r31
 299.L18:
 300                loop0(.L45,r3)
 301        .falign
 302.L45:
 303                dczeroa(r6)
 304        {
 305                memd(r6++#8) = r5:4
 306                r2 = add(r2,#-32)
 307        }
 308                memd(r6++#8) = r5:4
 309                memd(r6++#8) = r5:4
 310        {
 311                memd(r6++#8) = r5:4
 312        }:endloop0 /* start=.L45  */
 313                jump .L14
 314HEXAGON_OPT_FUNC_FINISH memset
 315#endif
 316