// linux/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm
/*
 * Copyright 2018 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

  23var SQ_WAVE_STATUS_INST_ATC_SHIFT               = 23
  24var SQ_WAVE_STATUS_INST_ATC_MASK                = 0x00800000
  25var SQ_WAVE_STATUS_SPI_PRIO_MASK                = 0x00000006
  26var SQ_WAVE_STATUS_HALT_MASK                    = 0x2000
  27
  28var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT            = 12
  29var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE             = 9
  30var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT           = 8
  31var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE            = 6
  32var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT           = 24
  33var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE            = 4
  34var SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT    = 24
  35var SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE     = 4
  36var SQ_WAVE_IB_STS2_WAVE64_SHIFT                = 11
  37var SQ_WAVE_IB_STS2_WAVE64_SIZE                 = 1
  38
  39var SQ_WAVE_TRAPSTS_SAVECTX_MASK                = 0x400
  40var SQ_WAVE_TRAPSTS_EXCE_MASK                   = 0x1FF
  41var SQ_WAVE_TRAPSTS_SAVECTX_SHIFT               = 10
  42var SQ_WAVE_TRAPSTS_MEM_VIOL_MASK               = 0x100
  43var SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT              = 8
  44var SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK            = 0x3FF
  45var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT           = 0x0
  46var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE            = 10
  47var SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK           = 0xFFFFF800
  48var SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT          = 11
  49var SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE           = 21
  50var SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK           = 0x800
  51
  52var SQ_WAVE_IB_STS_RCNT_SHIFT                   = 16
  53var SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT           = 15
  54var SQ_WAVE_IB_STS_REPLAY_W64H_SHIFT            = 25
  55var SQ_WAVE_IB_STS_REPLAY_W64H_SIZE             = 1
  56var SQ_WAVE_IB_STS_REPLAY_W64H_MASK             = 0x02000000
  57var SQ_WAVE_IB_STS_FIRST_REPLAY_SIZE            = 1
  58var SQ_WAVE_IB_STS_RCNT_SIZE                    = 6
  59var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK       = 0x003F8000
  60var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG   = 0x00007FFF
  61
  62var SQ_BUF_RSRC_WORD1_ATC_SHIFT                 = 24
  63var SQ_BUF_RSRC_WORD3_MTYPE_SHIFT               = 27
  64
  65// bits [31:24] unused by SPI debug data
  66var TTMP11_SAVE_REPLAY_W64H_SHIFT               = 31
  67var TTMP11_SAVE_REPLAY_W64H_MASK                = 0x80000000
  68var TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT         = 24
  69var TTMP11_SAVE_RCNT_FIRST_REPLAY_MASK          = 0x7F000000
  70
  71// SQ_SEL_X/Y/Z/W, BUF_NUM_FORMAT_FLOAT, (0 for MUBUF stride[17:14]
  72// when ADD_TID_ENABLE and BUF_DATA_FORMAT_32 for MTBUF), ADD_TID_ENABLE
  73var S_SAVE_BUF_RSRC_WORD1_STRIDE                = 0x00040000
  74var S_SAVE_BUF_RSRC_WORD3_MISC                  = 0x10807FAC
  75
  76var S_SAVE_SPI_INIT_ATC_MASK                    = 0x08000000
  77var S_SAVE_SPI_INIT_ATC_SHIFT                   = 27
  78var S_SAVE_SPI_INIT_MTYPE_MASK                  = 0x70000000
  79var S_SAVE_SPI_INIT_MTYPE_SHIFT                 = 28
  80var S_SAVE_SPI_INIT_FIRST_WAVE_MASK             = 0x04000000
  81var S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT            = 26
  82
  83var S_SAVE_PC_HI_RCNT_SHIFT                     = 26
  84var S_SAVE_PC_HI_RCNT_MASK                      = 0xFC000000
  85var S_SAVE_PC_HI_FIRST_REPLAY_SHIFT             = 25
  86var S_SAVE_PC_HI_FIRST_REPLAY_MASK              = 0x02000000
  87var S_SAVE_PC_HI_REPLAY_W64H_SHIFT              = 24
  88var S_SAVE_PC_HI_REPLAY_W64H_MASK               = 0x01000000
  89
  90var s_sgpr_save_num                             = 108
  91
  92var s_save_spi_init_lo                          = exec_lo
  93var s_save_spi_init_hi                          = exec_hi
  94var s_save_pc_lo                                = ttmp0
  95var s_save_pc_hi                                = ttmp1
  96var s_save_exec_lo                              = ttmp2
  97var s_save_exec_hi                              = ttmp3
  98var s_save_status                               = ttmp12
  99var s_save_trapsts                              = ttmp5
 100var s_save_xnack_mask                           = ttmp6
 101var s_wave_size                                 = ttmp7
 102var s_save_buf_rsrc0                            = ttmp8
 103var s_save_buf_rsrc1                            = ttmp9
 104var s_save_buf_rsrc2                            = ttmp10
 105var s_save_buf_rsrc3                            = ttmp11
 106var s_save_mem_offset                           = ttmp14
 107var s_save_alloc_size                           = s_save_trapsts
 108var s_save_tmp                                  = s_save_buf_rsrc2
 109var s_save_m0                                   = ttmp15
 110
 111var S_RESTORE_BUF_RSRC_WORD1_STRIDE             = S_SAVE_BUF_RSRC_WORD1_STRIDE
 112var S_RESTORE_BUF_RSRC_WORD3_MISC               = S_SAVE_BUF_RSRC_WORD3_MISC
 113
 114var S_RESTORE_SPI_INIT_ATC_MASK                 = 0x08000000
 115var S_RESTORE_SPI_INIT_ATC_SHIFT                = 27
 116var S_RESTORE_SPI_INIT_MTYPE_MASK               = 0x70000000
 117var S_RESTORE_SPI_INIT_MTYPE_SHIFT              = 28
 118var S_RESTORE_SPI_INIT_FIRST_WAVE_MASK          = 0x04000000
 119var S_RESTORE_SPI_INIT_FIRST_WAVE_SHIFT         = 26
 120var S_WAVE_SIZE                                 = 25
 121
 122var S_RESTORE_PC_HI_RCNT_SHIFT                  = S_SAVE_PC_HI_RCNT_SHIFT
 123var S_RESTORE_PC_HI_RCNT_MASK                   = S_SAVE_PC_HI_RCNT_MASK
 124var S_RESTORE_PC_HI_FIRST_REPLAY_SHIFT          = S_SAVE_PC_HI_FIRST_REPLAY_SHIFT
 125var S_RESTORE_PC_HI_FIRST_REPLAY_MASK           = S_SAVE_PC_HI_FIRST_REPLAY_MASK
 126
 127var s_restore_spi_init_lo                       = exec_lo
 128var s_restore_spi_init_hi                       = exec_hi
 129var s_restore_mem_offset                        = ttmp12
 130var s_restore_alloc_size                        = ttmp3
 131var s_restore_tmp                               = ttmp6
 132var s_restore_mem_offset_save                   = s_restore_tmp
 133var s_restore_m0                                = s_restore_alloc_size
 134var s_restore_mode                              = ttmp7
 135var s_restore_flat_scratch                      = ttmp2
 136var s_restore_pc_lo                             = ttmp0
 137var s_restore_pc_hi                             = ttmp1
 138var s_restore_exec_lo                           = ttmp14
 139var s_restore_exec_hi                           = ttmp15
 140var s_restore_status                            = ttmp4
 141var s_restore_trapsts                           = ttmp5
 142var s_restore_xnack_mask                        = ttmp13
 143var s_restore_buf_rsrc0                         = ttmp8
 144var s_restore_buf_rsrc1                         = ttmp9
 145var s_restore_buf_rsrc2                         = ttmp10
 146var s_restore_buf_rsrc3                         = ttmp11
 147var s_restore_size                              = ttmp7
 148
 149shader main
 150        asic(DEFAULT)
 151        type(CS)
 152        wave_size(32)
 153
 154        s_branch        L_SKIP_RESTORE                                          //NOT restore. might be a regular trap or save
 155
 156L_JUMP_TO_RESTORE:
 157        s_branch        L_RESTORE
 158
 159L_SKIP_RESTORE:
 160        s_getreg_b32    s_save_status, hwreg(HW_REG_STATUS)                     //save STATUS since we will change SCC
 161        s_andn2_b32     s_save_status, s_save_status, SQ_WAVE_STATUS_SPI_PRIO_MASK
 162        s_getreg_b32    s_save_trapsts, hwreg(HW_REG_TRAPSTS)
 163        s_and_b32       ttmp2, s_save_trapsts, SQ_WAVE_TRAPSTS_SAVECTX_MASK     //check whether this is for save
 164        s_cbranch_scc1  L_SAVE
 165
 166        // If STATUS.MEM_VIOL is asserted then halt the wave to prevent
 167        // the exception raising again and blocking context save.
 168        s_and_b32       ttmp2, s_save_trapsts, SQ_WAVE_TRAPSTS_MEM_VIOL_MASK
 169        s_cbranch_scc0  L_FETCH_2ND_TRAP
 170        s_or_b32        s_save_status, s_save_status, SQ_WAVE_STATUS_HALT_MASK
 171
 172L_FETCH_2ND_TRAP:
 173        // Preserve and clear scalar XNACK state before issuing scalar loads.
 174        // Save IB_STS.REPLAY_W64H[25], RCNT[21:16], FIRST_REPLAY[15] into
 175        // unused space ttmp11[31:24].
 176        s_andn2_b32     ttmp11, ttmp11, (TTMP11_SAVE_REPLAY_W64H_MASK | TTMP11_SAVE_RCNT_FIRST_REPLAY_MASK)
 177        s_getreg_b32    ttmp2, hwreg(HW_REG_IB_STS)
 178        s_and_b32       ttmp3, ttmp2, SQ_WAVE_IB_STS_REPLAY_W64H_MASK
 179        s_lshl_b32      ttmp3, ttmp3, (TTMP11_SAVE_REPLAY_W64H_SHIFT - SQ_WAVE_IB_STS_REPLAY_W64H_SHIFT)
 180        s_or_b32        ttmp11, ttmp11, ttmp3
 181        s_and_b32       ttmp3, ttmp2, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK
 182        s_lshl_b32      ttmp3, ttmp3, (TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT - SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT)
 183        s_or_b32        ttmp11, ttmp11, ttmp3
 184        s_andn2_b32     ttmp2, ttmp2, (SQ_WAVE_IB_STS_REPLAY_W64H_MASK | SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK)
 185        s_setreg_b32    hwreg(HW_REG_IB_STS), ttmp2
 186
 187        // Read second-level TBA/TMA from first-level TMA and jump if available.
 188        // ttmp[2:5] and ttmp12 can be used (others hold SPI-initialized debug data)
 189        // ttmp12 holds SQ_WAVE_STATUS
 190        s_getreg_b32    ttmp14, hwreg(HW_REG_SHADER_TMA_LO)
 191        s_getreg_b32    ttmp15, hwreg(HW_REG_SHADER_TMA_HI)
 192        s_lshl_b64      [ttmp14, ttmp15], [ttmp14, ttmp15], 0x8
 193        s_load_dwordx2  [ttmp2, ttmp3], [ttmp14, ttmp15], 0x0 glc:1             // second-level TBA
 194        s_waitcnt       lgkmcnt(0)
 195        s_load_dwordx2  [ttmp14, ttmp15], [ttmp14, ttmp15], 0x8 glc:1           // second-level TMA
 196        s_waitcnt       lgkmcnt(0)
 197        s_and_b64       [ttmp2, ttmp3], [ttmp2, ttmp3], [ttmp2, ttmp3]
 198        s_cbranch_scc0  L_NO_NEXT_TRAP                                          // second-level trap handler not been set
 199        s_setpc_b64     [ttmp2, ttmp3]                                          // jump to second-level trap handler
 200
 201L_NO_NEXT_TRAP:
 202        s_getreg_b32    s_save_trapsts, hwreg(HW_REG_TRAPSTS)
 203        s_and_b32       s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_EXCE_MASK
 204        s_cbranch_scc1  L_EXCP_CASE                                             // Exception, jump back to the shader program directly.
 205        s_add_u32       ttmp0, ttmp0, 4                                         // S_TRAP case, add 4 to ttmp0
 206        s_addc_u32      ttmp1, ttmp1, 0
 207L_EXCP_CASE:
 208        s_and_b32       ttmp1, ttmp1, 0xFFFF
 209
 210        // Restore SQ_WAVE_IB_STS.
 211        s_lshr_b32      ttmp2, ttmp11, (TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT - SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT)
 212        s_and_b32       ttmp3, ttmp2, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK
 213        s_lshr_b32      ttmp2, ttmp11, (TTMP11_SAVE_REPLAY_W64H_SHIFT - SQ_WAVE_IB_STS_REPLAY_W64H_SHIFT)
 214        s_and_b32       ttmp2, ttmp2, SQ_WAVE_IB_STS_REPLAY_W64H_MASK
 215        s_or_b32        ttmp2, ttmp2, ttmp3
 216        s_setreg_b32    hwreg(HW_REG_IB_STS), ttmp2
 217
 218        // Restore SQ_WAVE_STATUS.
 219        s_and_b64       exec, exec, exec                                        // Restore STATUS.EXECZ, not writable by s_setreg_b32
 220        s_and_b64       vcc, vcc, vcc                                           // Restore STATUS.VCCZ, not writable by s_setreg_b32
 221        s_setreg_b32    hwreg(HW_REG_STATUS), s_save_status
 222
 223        s_rfe_b64       [ttmp0, ttmp1]
 224
 225L_SAVE:
 226        //check whether there is mem_viol
 227        s_getreg_b32    s_save_trapsts, hwreg(HW_REG_TRAPSTS)
 228        s_and_b32       s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_MEM_VIOL_MASK
 229        s_cbranch_scc0  L_NO_PC_REWIND
 230
 231        //if so, need rewind PC assuming GDS operation gets NACKed
 232        s_mov_b32       s_save_tmp, 0
 233        s_setreg_b32    hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT, 1), s_save_tmp    //clear mem_viol bit
 234        s_and_b32       s_save_pc_hi, s_save_pc_hi, 0x0000ffff                  //pc[47:32]
 235        s_sub_u32       s_save_pc_lo, s_save_pc_lo, 8                           //pc[31:0]-8
 236        s_subb_u32      s_save_pc_hi, s_save_pc_hi, 0x0
 237
 238L_NO_PC_REWIND:
 239        s_mov_b32       s_save_tmp, 0
 240        s_setreg_b32    hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_SAVECTX_SHIFT, 1), s_save_tmp     //clear saveCtx bit
 241
 242        s_getreg_b32    s_save_xnack_mask, hwreg(HW_REG_SHADER_XNACK_MASK)
 243        s_getreg_b32    s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_RCNT_SHIFT, SQ_WAVE_IB_STS_RCNT_SIZE)
 244        s_lshl_b32      s_save_tmp, s_save_tmp, S_SAVE_PC_HI_RCNT_SHIFT
 245        s_or_b32        s_save_pc_hi, s_save_pc_hi, s_save_tmp
 246        s_getreg_b32    s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT, SQ_WAVE_IB_STS_FIRST_REPLAY_SIZE)
 247        s_lshl_b32      s_save_tmp, s_save_tmp, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT
 248        s_or_b32        s_save_pc_hi, s_save_pc_hi, s_save_tmp
 249        s_getreg_b32    s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_REPLAY_W64H_SHIFT, SQ_WAVE_IB_STS_REPLAY_W64H_SIZE)
 250        s_lshl_b32      s_save_tmp, s_save_tmp, S_SAVE_PC_HI_REPLAY_W64H_SHIFT
 251        s_or_b32        s_save_pc_hi, s_save_pc_hi, s_save_tmp
 252        s_getreg_b32    s_save_tmp, hwreg(HW_REG_IB_STS)                        //clear RCNT and FIRST_REPLAY and REPLAY_W64H in IB_STS
 253        s_and_b32       s_save_tmp, s_save_tmp, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG
 254
 255        s_setreg_b32    hwreg(HW_REG_IB_STS), s_save_tmp
 256
 257        /* inform SPI the readiness and wait for SPI's go signal */
 258        s_mov_b32       s_save_exec_lo, exec_lo                                 //save EXEC and use EXEC for the go signal from SPI
 259        s_mov_b32       s_save_exec_hi, exec_hi
 260        s_mov_b64       exec, 0x0                                               //clear EXEC to get ready to receive
 261
 262        s_sendmsg       sendmsg(MSG_SAVEWAVE)                                   //send SPI a message and wait for SPI's write to EXEC
 263
 264L_SLEEP:
 265        // sleep 1 (64clk) is not enough for 8 waves per SIMD, which will cause
 266        // SQ hang, since the 7,8th wave could not get arbit to exec inst, while
 267        // other waves are stuck into the sleep-loop and waiting for wrexec!=0
 268        s_sleep         0x2
 269        s_cbranch_execz L_SLEEP
 270
 271        /* setup Resource Contants */
 272        s_mov_b32       s_save_buf_rsrc0, s_save_spi_init_lo                    //base_addr_lo
 273        s_and_b32       s_save_buf_rsrc1, s_save_spi_init_hi, 0x0000FFFF        //base_addr_hi
 274        s_or_b32        s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE
 275        s_mov_b32       s_save_buf_rsrc2, 0                                     //NUM_RECORDS initial value = 0 (in bytes) although not neccessarily inited
 276        s_mov_b32       s_save_buf_rsrc3, S_SAVE_BUF_RSRC_WORD3_MISC
 277        s_and_b32       s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_ATC_MASK
 278        s_lshr_b32      s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT)
 279        s_or_b32        s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp          //or ATC
 280        s_and_b32       s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_MTYPE_MASK
 281        s_lshr_b32      s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT)
 282        s_or_b32        s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp          //or MTYPE
 283
 284        s_mov_b32       s_save_m0, m0
 285
 286        /* global mem offset */
 287        s_mov_b32       s_save_mem_offset, 0x0
 288        s_getreg_b32    s_wave_size, hwreg(HW_REG_IB_STS2,SQ_WAVE_IB_STS2_WAVE64_SHIFT,SQ_WAVE_IB_STS2_WAVE64_SIZE)
 289        s_lshl_b32      s_wave_size, s_wave_size, S_WAVE_SIZE
 290        s_or_b32        s_wave_size, s_save_spi_init_hi, s_wave_size            //share s_wave_size with exec_hi, it's at bit25
 291
 292        /* save HW registers */
 293
 294L_SAVE_HWREG:
 295        // HWREG SR memory offset : size(VGPR)+size(SVGPR)+size(SGPR)
 296        get_vgpr_size_bytes(s_save_mem_offset, s_wave_size)
 297        get_svgpr_size_bytes(s_save_tmp)
 298        s_add_u32       s_save_mem_offset, s_save_mem_offset, s_save_tmp
 299        s_add_u32       s_save_mem_offset, s_save_mem_offset, get_sgpr_size_bytes()
 300
 301        s_mov_b32       s_save_buf_rsrc2, 0x1000000                             //NUM_RECORDS in bytes
 302
 303        write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset)
 304        write_hwreg_to_mem(s_save_pc_lo, s_save_buf_rsrc0, s_save_mem_offset)
 305        write_hwreg_to_mem(s_save_pc_hi, s_save_buf_rsrc0, s_save_mem_offset)
 306        write_hwreg_to_mem(s_save_exec_lo, s_save_buf_rsrc0, s_save_mem_offset)
 307        write_hwreg_to_mem(s_save_exec_hi, s_save_buf_rsrc0, s_save_mem_offset)
 308        write_hwreg_to_mem(s_save_status, s_save_buf_rsrc0, s_save_mem_offset)
 309
 310        s_getreg_b32    s_save_trapsts, hwreg(HW_REG_TRAPSTS)
 311        write_hwreg_to_mem(s_save_trapsts, s_save_buf_rsrc0, s_save_mem_offset)
 312        write_hwreg_to_mem(s_save_xnack_mask, s_save_buf_rsrc0, s_save_mem_offset)
 313
 314        s_getreg_b32    s_save_m0, hwreg(HW_REG_MODE)
 315        write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset)
 316
 317        s_getreg_b32    s_save_m0, hwreg(HW_REG_SHADER_FLAT_SCRATCH_LO)
 318        write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset)
 319
 320        s_getreg_b32    s_save_m0, hwreg(HW_REG_SHADER_FLAT_SCRATCH_HI)
 321        write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset)
 322
 323        /* the first wave in the threadgroup */
 324        s_and_b32       s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK
 325        s_mov_b32       s_save_exec_hi, 0x0
 326        s_or_b32        s_save_exec_hi, s_save_tmp, s_save_exec_hi              // save first wave bit to s_save_exec_hi.bits[26]
 327
 328        /* save SGPRs */
 329        // Save SGPR before LDS save, then the s0 to s4 can be used during LDS save...
 330
 331        // SGPR SR memory offset : size(VGPR)+size(SVGPR)
 332        get_vgpr_size_bytes(s_save_mem_offset, s_wave_size)
 333        get_svgpr_size_bytes(s_save_tmp)
 334        s_add_u32       s_save_mem_offset, s_save_mem_offset, s_save_tmp
 335        s_mov_b32       s_save_buf_rsrc2, 0x1000000                             //NUM_RECORDS in bytes
 336
 337        // backup s_save_buf_rsrc0,1 to s_save_pc_lo/hi, since write_16sgpr_to_mem function will change the rsrc0
 338        s_mov_b32       s_save_xnack_mask, s_save_buf_rsrc0
 339        s_add_u32       s_save_buf_rsrc0, s_save_buf_rsrc0, s_save_mem_offset
 340        s_addc_u32      s_save_buf_rsrc1, s_save_buf_rsrc1, 0
 341
 342        s_mov_b32       m0, 0x0                                                 //SGPR initial index value =0
 343        s_nop           0x0                                                     //Manually inserted wait states
 344L_SAVE_SGPR_LOOP:
 345        // SGPR is allocated in 16 SGPR granularity
 346        s_movrels_b64   s0, s0                                                  //s0 = s[0+m0], s1 = s[1+m0]
 347        s_movrels_b64   s2, s2                                                  //s2 = s[2+m0], s3 = s[3+m0]
 348        s_movrels_b64   s4, s4                                                  //s4 = s[4+m0], s5 = s[5+m0]
 349        s_movrels_b64   s6, s6                                                  //s6 = s[6+m0], s7 = s[7+m0]
 350        s_movrels_b64   s8, s8                                                  //s8 = s[8+m0], s9 = s[9+m0]
 351        s_movrels_b64   s10, s10                                                //s10 = s[10+m0], s11 = s[11+m0]
 352        s_movrels_b64   s12, s12                                                //s12 = s[12+m0], s13 = s[13+m0]
 353        s_movrels_b64   s14, s14                                                //s14 = s[14+m0], s15 = s[15+m0]
 354
 355        write_16sgpr_to_mem(s0, s_save_buf_rsrc0, s_save_mem_offset)
 356        s_add_u32       m0, m0, 16                                              //next sgpr index
 357        s_cmp_lt_u32    m0, 96                                                  //scc = (m0 < first 96 SGPR) ? 1 : 0
 358        s_cbranch_scc1  L_SAVE_SGPR_LOOP                                        //first 96 SGPR save is complete?
 359
 360        //save the rest 12 SGPR
 361        s_movrels_b64   s0, s0                                                  //s0 = s[0+m0], s1 = s[1+m0]
 362        s_movrels_b64   s2, s2                                                  //s2 = s[2+m0], s3 = s[3+m0]
 363        s_movrels_b64   s4, s4                                                  //s4 = s[4+m0], s5 = s[5+m0]
 364        s_movrels_b64   s6, s6                                                  //s6 = s[6+m0], s7 = s[7+m0]
 365        s_movrels_b64   s8, s8                                                  //s8 = s[8+m0], s9 = s[9+m0]
 366        s_movrels_b64   s10, s10                                                //s10 = s[10+m0], s11 = s[11+m0]
 367        write_12sgpr_to_mem(s0, s_save_buf_rsrc0, s_save_mem_offset)
 368
 369        // restore s_save_buf_rsrc0,1
 370        s_mov_b32       s_save_buf_rsrc0, s_save_xnack_mask
 371
 372        /* save first 4 VGPR, then LDS save could use   */
 373        // each wave will alloc 4 vgprs at least...
 374
 375        s_mov_b32       s_save_mem_offset, 0
 376        s_mov_b32       exec_lo, 0xFFFFFFFF                                     //need every thread from now on
 377        s_lshr_b32      m0, s_wave_size, S_WAVE_SIZE
 378        s_and_b32       m0, m0, 1
 379        s_cmp_eq_u32    m0, 1
 380        s_cbranch_scc1  L_ENABLE_SAVE_4VGPR_EXEC_HI
 381        s_mov_b32       exec_hi, 0x00000000
 382        s_branch        L_SAVE_4VGPR_WAVE32
 383L_ENABLE_SAVE_4VGPR_EXEC_HI:
 384        s_mov_b32       exec_hi, 0xFFFFFFFF
 385        s_branch        L_SAVE_4VGPR_WAVE64
 386L_SAVE_4VGPR_WAVE32:
 387        s_mov_b32       s_save_buf_rsrc2, 0x1000000                             //NUM_RECORDS in bytes
 388
 389        // VGPR Allocated in 4-GPR granularity
 390
 391        buffer_store_dword      v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
 392        buffer_store_dword      v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128
 393        buffer_store_dword      v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128*2
 394        buffer_store_dword      v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128*3
 395        s_branch        L_SAVE_LDS
 396
 397L_SAVE_4VGPR_WAVE64:
 398        s_mov_b32       s_save_buf_rsrc2, 0x1000000                             //NUM_RECORDS in bytes
 399
 400        // VGPR Allocated in 4-GPR granularity
 401
 402        buffer_store_dword      v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
 403        buffer_store_dword      v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256
 404        buffer_store_dword      v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2
 405        buffer_store_dword      v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3
 406
 407        /* save LDS */
 408
 409L_SAVE_LDS:
 410        // Change EXEC to all threads...
 411        s_mov_b32       exec_lo, 0xFFFFFFFF                                     //need every thread from now on
 412        s_lshr_b32      m0, s_wave_size, S_WAVE_SIZE
 413        s_and_b32       m0, m0, 1
 414        s_cmp_eq_u32    m0, 1
 415        s_cbranch_scc1  L_ENABLE_SAVE_LDS_EXEC_HI
 416        s_mov_b32       exec_hi, 0x00000000
 417        s_branch        L_SAVE_LDS_NORMAL
 418L_ENABLE_SAVE_LDS_EXEC_HI:
 419        s_mov_b32       exec_hi, 0xFFFFFFFF
 420L_SAVE_LDS_NORMAL:
 421        s_getreg_b32    s_save_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE)
 422        s_and_b32       s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF        //lds_size is zero?
 423        s_cbranch_scc0  L_SAVE_LDS_DONE                                         //no lds used? jump to L_SAVE_DONE
 424
 425        s_barrier                                                               //LDS is used? wait for other waves in the same TG
 426        s_and_b32       s_save_tmp, s_save_exec_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK
 427        s_cbranch_scc0  L_SAVE_LDS_DONE
 428
 429        // first wave do LDS save;
 430
 431        s_lshl_b32      s_save_alloc_size, s_save_alloc_size, 6                 //LDS size in dwords = lds_size * 64dw
 432        s_lshl_b32      s_save_alloc_size, s_save_alloc_size, 2                 //LDS size in bytes
 433        s_mov_b32       s_save_buf_rsrc2, s_save_alloc_size                     //NUM_RECORDS in bytes
 434
 435        // LDS at offset: size(VGPR)+size(SVGPR)+SIZE(SGPR)+SIZE(HWREG)
 436        //
 437        get_vgpr_size_bytes(s_save_mem_offset, s_wave_size)
 438        get_svgpr_size_bytes(s_save_tmp)
 439        s_add_u32       s_save_mem_offset, s_save_mem_offset, s_save_tmp
 440        s_add_u32       s_save_mem_offset, s_save_mem_offset, get_sgpr_size_bytes()
 441        s_add_u32       s_save_mem_offset, s_save_mem_offset, get_hwreg_size_bytes()
 442
 443        s_mov_b32       s_save_buf_rsrc2, 0x1000000                             //NUM_RECORDS in bytes
 444
 445        //load 0~63*4(byte address) to vgpr v0
 446        v_mbcnt_lo_u32_b32      v0, -1, 0
 447        v_mbcnt_hi_u32_b32      v0, -1, v0
 448        v_mul_u32_u24   v0, 4, v0
 449
 450        s_lshr_b32      m0, s_wave_size, S_WAVE_SIZE
 451        s_and_b32       m0, m0, 1
 452        s_cmp_eq_u32    m0, 1
 453        s_mov_b32       m0, 0x0
 454        s_cbranch_scc1  L_SAVE_LDS_W64
 455
 456L_SAVE_LDS_W32:
 457        s_mov_b32       s3, 128
 458        s_nop           0
 459        s_nop           0
 460        s_nop           0
 461L_SAVE_LDS_LOOP_W32:
 462        ds_read_b32     v1, v0
 463        s_waitcnt       0
 464        buffer_store_dword      v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
 465
 466        s_add_u32       m0, m0, s3                                              //every buffer_store_lds does 256 bytes
 467        s_add_u32       s_save_mem_offset, s_save_mem_offset, s3
 468        v_add_nc_u32    v0, v0, 128                                             //mem offset increased by 128 bytes
 469        s_cmp_lt_u32    m0, s_save_alloc_size                                   //scc=(m0 < s_save_alloc_size) ? 1 : 0
 470        s_cbranch_scc1  L_SAVE_LDS_LOOP_W32                                     //LDS save is complete?
 471
 472        s_branch        L_SAVE_LDS_DONE
 473
 474L_SAVE_LDS_W64:
 475        s_mov_b32       s3, 256
 476        s_nop           0
 477        s_nop           0
 478        s_nop           0
 479L_SAVE_LDS_LOOP_W64:
 480        ds_read_b32     v1, v0
 481        s_waitcnt       0
 482        buffer_store_dword      v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
 483
 484        s_add_u32       m0, m0, s3                                              //every buffer_store_lds does 256 bytes
 485        s_add_u32       s_save_mem_offset, s_save_mem_offset, s3
 486        v_add_nc_u32    v0, v0, 256                                             //mem offset increased by 256 bytes
 487        s_cmp_lt_u32    m0, s_save_alloc_size                                   //scc=(m0 < s_save_alloc_size) ? 1 : 0
 488        s_cbranch_scc1  L_SAVE_LDS_LOOP_W64                                     //LDS save is complete?
 489
 490L_SAVE_LDS_DONE:
 491        /* save VGPRs  - set the Rest VGPRs */
 492L_SAVE_VGPR:
 493        // VGPR SR memory offset: 0
 494        s_mov_b32       exec_lo, 0xFFFFFFFF                                     //need every thread from now on
 495        s_lshr_b32      m0, s_wave_size, S_WAVE_SIZE
 496        s_and_b32       m0, m0, 1
 497        s_cmp_eq_u32    m0, 1
 498        s_cbranch_scc1  L_ENABLE_SAVE_VGPR_EXEC_HI
 499        s_mov_b32       s_save_mem_offset, (0+128*4)                            // for the rest VGPRs
 500        s_mov_b32       exec_hi, 0x00000000
 501        s_branch        L_SAVE_VGPR_NORMAL
 502L_ENABLE_SAVE_VGPR_EXEC_HI:
 503        s_mov_b32       s_save_mem_offset, (0+256*4)                            // for the rest VGPRs
        s_mov_b32       exec_hi, 0xFFFFFFFF
L_SAVE_VGPR_NORMAL:
        // Read this wave's VGPR allocation from the GPR_ALLOC hwreg.
        s_getreg_b32    s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE)
        s_add_u32       s_save_alloc_size, s_save_alloc_size, 1
        s_lshl_b32      s_save_alloc_size, s_save_alloc_size, 2                 //Number of VGPRs = (vgpr_size + 1) * 4    (non-zero value)
        //determine it is wave32 or wave64
        s_lshr_b32      m0, s_wave_size, S_WAVE_SIZE
        s_and_b32       m0, m0, 1
        s_cmp_eq_u32    m0, 1
        s_cbranch_scc1  L_SAVE_VGPR_WAVE64

        s_mov_b32       s_save_buf_rsrc2, 0x1000000                             //NUM_RECORDS in bytes

        // VGPR Allocated in 4-GPR granularity

        // VGPR store using dw burst
        // Index starts at 4: v0-v3 are presumably saved earlier (not visible
        // in this part of the handler) — they serve as staging registers here.
        s_mov_b32       m0, 0x4                                                 //VGPR initial index value =4
        s_cmp_lt_u32    m0, s_save_alloc_size
        s_cbranch_scc0  L_SAVE_VGPR_END                                         //nothing beyond v0-v3 to save?

L_SAVE_VGPR_W32_LOOP:
        // m0 is the relative-addressing base: movrels copies v[n+m0] into vn.
        v_movrels_b32   v0, v0                                                  //v0 = v[0+m0]
        v_movrels_b32   v1, v1                                                  //v1 = v[1+m0]
        v_movrels_b32   v2, v2                                                  //v2 = v[2+m0]
        v_movrels_b32   v3, v3                                                  //v3 = v[3+m0]

        // Wave32: one store writes 32 lanes * 4 bytes = 128 bytes per VGPR.
        buffer_store_dword      v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
        buffer_store_dword      v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128
        buffer_store_dword      v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128*2
        buffer_store_dword      v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128*3

        s_add_u32       m0, m0, 4                                               //next vgpr index
        s_add_u32       s_save_mem_offset, s_save_mem_offset, 128*4             //every buffer_store_dword does 128 bytes
        s_cmp_lt_u32    m0, s_save_alloc_size                                   //scc = (m0 < s_save_alloc_size) ? 1 : 0
        s_cbranch_scc1  L_SAVE_VGPR_W32_LOOP                                    //VGPR save is complete?

        s_branch        L_SAVE_VGPR_END
 541
L_SAVE_VGPR_WAVE64:
        s_mov_b32       s_save_buf_rsrc2, 0x1000000                             //NUM_RECORDS in bytes

        // VGPR store using dw burst
        // Index starts at 4: v0-v3 are used as staging registers here and are
        // presumably saved earlier (not visible in this part of the handler).
        s_mov_b32       m0, 0x4                                                 //VGPR initial index value =4
        s_cmp_lt_u32    m0, s_save_alloc_size
        s_cbranch_scc0  L_SAVE_VGPR_END                                         //nothing beyond v0-v3 to save?

L_SAVE_VGPR_W64_LOOP:
        // m0 is the relative-addressing base: movrels copies v[n+m0] into vn.
        v_movrels_b32   v0, v0                                                  //v0 = v[0+m0]
        v_movrels_b32   v1, v1                                                  //v1 = v[1+m0]
        v_movrels_b32   v2, v2                                                  //v2 = v[2+m0]
        v_movrels_b32   v3, v3                                                  //v3 = v[3+m0]

        // Wave64: one store writes 64 lanes * 4 bytes = 256 bytes per VGPR.
        buffer_store_dword      v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
        buffer_store_dword      v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256
        buffer_store_dword      v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2
        buffer_store_dword      v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3

        s_add_u32       m0, m0, 4                                               //next vgpr index
        s_add_u32       s_save_mem_offset, s_save_mem_offset, 256*4             //every buffer_store_dword does 256 bytes
        s_cmp_lt_u32    m0, s_save_alloc_size                                   //scc = (m0 < s_save_alloc_size) ? 1 : 0
        s_cbranch_scc1  L_SAVE_VGPR_W64_LOOP                                    //VGPR save is complete?

        //Below part will be the save shared vgpr part (new for gfx10)
        s_getreg_b32    s_save_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE)
        s_and_b32       s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF        //shared_vgpr_size is zero? (s_and sets SCC)
        s_cbranch_scc0  L_SAVE_VGPR_END                                         //no shared_vgpr used? jump to L_SAVE_VGPR_END
        s_lshl_b32      s_save_alloc_size, s_save_alloc_size, 3                 //Number of SHARED_VGPRs = shared_vgpr_size * 8    (non-zero value)
        //m0 now has the value of normal vgpr count, just add the m0 with shared_vgpr count to get the total count.
        //save shared_vgpr will start from the index of m0
        s_add_u32       s_save_alloc_size, s_save_alloc_size, m0
        // Enable only the low 32 lanes: each shared VGPR is stored as
        // 32 lanes * 4 bytes = 128 bytes (see the 128-byte stride below).
        s_mov_b32       exec_lo, 0xFFFFFFFF
        s_mov_b32       exec_hi, 0x00000000
L_SAVE_SHARED_VGPR_WAVE64_LOOP:
        v_movrels_b32   v0, v0                                                  //v0 = v[0+m0]
        buffer_store_dword      v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
        s_add_u32       m0, m0, 1                                               //next vgpr index
        s_add_u32       s_save_mem_offset, s_save_mem_offset, 128               //32 lanes * 4 bytes per shared VGPR
        s_cmp_lt_u32    m0, s_save_alloc_size                                   //scc = (m0 < s_save_alloc_size) ? 1 : 0
        s_cbranch_scc1  L_SAVE_SHARED_VGPR_WAVE64_LOOP                          //SHARED_VGPR save is complete?

L_SAVE_VGPR_END:
        s_branch        L_END_PGM
 586
L_RESTORE:
        /* Setup Resource Constants */
        // Build the buffer resource descriptor (rsrc0-3) for the save/restore
        // scratch area from the SPI-provided init values.
        s_mov_b32       s_restore_buf_rsrc0, s_restore_spi_init_lo              //base_addr_lo
        s_and_b32       s_restore_buf_rsrc1, s_restore_spi_init_hi, 0x0000FFFF  //base_addr_hi
        s_or_b32        s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE
        s_mov_b32       s_restore_buf_rsrc2, 0                                  //NUM_RECORDS initial value = 0 (in bytes)
        s_mov_b32       s_restore_buf_rsrc3, S_RESTORE_BUF_RSRC_WORD3_MISC
        s_and_b32       s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_ATC_MASK
        s_lshr_b32      s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT)
        s_or_b32        s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp //or ATC
        s_and_b32       s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_MTYPE_MASK
        s_lshr_b32      s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT)
        s_or_b32        s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp //or MTYPE
        //determine it is wave32 or wave64
        // Wave-size flag is placed at bit S_WAVE_SIZE of s_restore_size.
        s_getreg_b32    s_restore_size, hwreg(HW_REG_IB_STS2,SQ_WAVE_IB_STS2_WAVE64_SHIFT,SQ_WAVE_IB_STS2_WAVE64_SIZE)
        s_lshl_b32      s_restore_size, s_restore_size, S_WAVE_SIZE
        // NOTE(review): this also folds all spi_init_hi bits into
        // s_restore_size; downstream only bit S_WAVE_SIZE is read — confirm
        // the extra bits are intentional/harmless.
        s_or_b32        s_restore_size, s_restore_spi_init_hi, s_restore_size

        // Only the first wave of the threadgroup restores LDS (s_and sets SCC).
        s_and_b32       s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_FIRST_WAVE_MASK
        s_cbranch_scc0  L_RESTORE_VGPR

        /* restore LDS */
L_RESTORE_LDS:
        s_mov_b32       exec_lo, 0xFFFFFFFF                                     //need every thread from now on
        s_lshr_b32      m0, s_restore_size, S_WAVE_SIZE
        s_and_b32       m0, m0, 1
        s_cmp_eq_u32    m0, 1
        s_cbranch_scc1  L_ENABLE_RESTORE_LDS_EXEC_HI
        s_mov_b32       exec_hi, 0x00000000                                     //wave32: upper 32 lanes disabled
        s_branch        L_RESTORE_LDS_NORMAL
L_ENABLE_RESTORE_LDS_EXEC_HI:
        s_mov_b32       exec_hi, 0xFFFFFFFF                                     //wave64: all 64 lanes enabled
L_RESTORE_LDS_NORMAL:
        s_getreg_b32    s_restore_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE)
        s_and_b32       s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF  //lds_size is zero? (s_and sets SCC)
        s_cbranch_scc0  L_RESTORE_VGPR                                          //no lds used? jump to L_RESTORE_VGPR
        s_lshl_b32      s_restore_alloc_size, s_restore_alloc_size, 6           //LDS size in dwords = lds_size * 64dw
        s_lshl_b32      s_restore_alloc_size, s_restore_alloc_size, 2           //LDS size in bytes
        s_mov_b32       s_restore_buf_rsrc2, s_restore_alloc_size               //NUM_RECORDS in bytes

        // LDS at offset: size(VGPR)+size(SVGPR)+SIZE(SGPR)+SIZE(HWREG)
        //
        get_vgpr_size_bytes(s_restore_mem_offset, s_restore_size)
        get_svgpr_size_bytes(s_restore_tmp)
        s_add_u32       s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp
        s_add_u32       s_restore_mem_offset, s_restore_mem_offset, get_sgpr_size_bytes()
        s_add_u32       s_restore_mem_offset, s_restore_mem_offset, get_hwreg_size_bytes()

        // NOTE(review): overwrites the LDS-sized NUM_RECORDS set just above —
        // confirm the fixed 0x1000000 limit is intended here.
        s_mov_b32       s_restore_buf_rsrc2, 0x1000000                          //NUM_RECORDS in bytes

        // NOTE(review): reads s_wave_size while the rest of the restore path
        // uses s_restore_size — verify both refer to the same wave-size bit.
        s_lshr_b32      m0, s_wave_size, S_WAVE_SIZE
        s_and_b32       m0, m0, 1
        s_cmp_eq_u32    m0, 1
        s_mov_b32       m0, 0x0                                                 //m0 = LDS write address, start at 0 (s_mov leaves SCC intact)
        s_cbranch_scc1  L_RESTORE_LDS_LOOP_W64

L_RESTORE_LDS_LOOP_W32:
        buffer_load_dword       v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 // one dword per lane written straight to LDS
        s_add_u32       m0, m0, 128                                             // 32 lanes * 4 bytes = 128 bytes per load
        s_add_u32       s_restore_mem_offset, s_restore_mem_offset, 128         //mem offset increased by 128 bytes
        s_cmp_lt_u32    m0, s_restore_alloc_size                                //scc=(m0 < s_restore_alloc_size) ? 1 : 0
        s_cbranch_scc1  L_RESTORE_LDS_LOOP_W32                                  //LDS restore is complete?
        s_branch        L_RESTORE_VGPR

L_RESTORE_LDS_LOOP_W64:
        buffer_load_dword       v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 // one dword per lane written straight to LDS
        s_add_u32       m0, m0, 256                                             // 64 lanes * 4 bytes = 256 bytes per load
        s_add_u32       s_restore_mem_offset, s_restore_mem_offset, 256         //mem offset increased by 256 bytes
        s_cmp_lt_u32    m0, s_restore_alloc_size                                //scc=(m0 < s_restore_alloc_size) ? 1 : 0
        s_cbranch_scc1  L_RESTORE_LDS_LOOP_W64                                  //LDS restore is complete?
 657
        /* restore VGPRs */
L_RESTORE_VGPR:
        // VGPR SR memory offset : 0
        s_mov_b32       s_restore_mem_offset, 0x0
        s_mov_b32       exec_lo, 0xFFFFFFFF                                     //need every thread from now on
        s_lshr_b32      m0, s_restore_size, S_WAVE_SIZE
        s_and_b32       m0, m0, 1
        s_cmp_eq_u32    m0, 1
        s_cbranch_scc1  L_ENABLE_RESTORE_VGPR_EXEC_HI
        s_mov_b32       exec_hi, 0x00000000                                     //wave32: upper 32 lanes disabled
        s_branch        L_RESTORE_VGPR_NORMAL
L_ENABLE_RESTORE_VGPR_EXEC_HI:
        s_mov_b32       exec_hi, 0xFFFFFFFF                                     //wave64: all 64 lanes enabled
L_RESTORE_VGPR_NORMAL:
        // Recompute the VGPR count exactly as the save path did.
        s_getreg_b32    s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE)
        s_add_u32       s_restore_alloc_size, s_restore_alloc_size, 1
        s_lshl_b32      s_restore_alloc_size, s_restore_alloc_size, 2           //Number of VGPRs = (vgpr_size + 1) * 4    (non-zero value)
        //determine it is wave32 or wave64
        s_lshr_b32      m0, s_restore_size, S_WAVE_SIZE
        s_and_b32       m0, m0, 1
        s_cmp_eq_u32    m0, 1
        s_cbranch_scc1  L_RESTORE_VGPR_WAVE64

        s_mov_b32       s_restore_buf_rsrc2, 0x1000000                          //NUM_RECORDS in bytes

        // VGPR load using dw burst
        // v0-v3 are staging registers, so their own saved values (at the start
        // of the area) are reloaded last; remember that start offset here.
        s_mov_b32       s_restore_mem_offset_save, s_restore_mem_offset         // restore start with v1, v0 will be the last
        s_add_u32       s_restore_mem_offset, s_restore_mem_offset, 128*4
        s_mov_b32       m0, 4                                                   //VGPR initial index value = 4

L_RESTORE_VGPR_WAVE32_LOOP:
        // Wave32: each load covers 32 lanes * 4 bytes = 128 bytes per VGPR.
        buffer_load_dword       v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1
        buffer_load_dword       v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:128
        buffer_load_dword       v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:128*2
        buffer_load_dword       v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:128*3
        s_waitcnt       vmcnt(0)                                                //loads must land before movreld consumes v0-v3
        v_movreld_b32   v0, v0                                                  //v[0+m0] = v0
        v_movreld_b32   v1, v1
        v_movreld_b32   v2, v2
        v_movreld_b32   v3, v3
        s_add_u32       m0, m0, 4                                               //next vgpr index
        s_add_u32       s_restore_mem_offset, s_restore_mem_offset, 128*4       //every buffer_load_dword does 128 bytes
        s_cmp_lt_u32    m0, s_restore_alloc_size                                //scc = (m0 < s_restore_alloc_size) ? 1 : 0
        s_cbranch_scc1  L_RESTORE_VGPR_WAVE32_LOOP                              //VGPR restore (except v0) is complete?

        /* VGPR restore on v0 */
        // NOTE(review): these loads are still outstanding when we branch to
        // L_RESTORE_SGPR (no vmcnt wait here, unlike the wave64 path) —
        // confirm a later wait covers them before execution resumes.
        buffer_load_dword       v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1
        buffer_load_dword       v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:128
        buffer_load_dword       v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:128*2
        buffer_load_dword       v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:128*3

        s_branch        L_RESTORE_SGPR

L_RESTORE_VGPR_WAVE64:
        s_mov_b32       s_restore_buf_rsrc2, 0x1000000                          //NUM_RECORDS in bytes

        // VGPR load using dw burst
        s_mov_b32       s_restore_mem_offset_save, s_restore_mem_offset         // restore start with v4, v0 will be the last
        s_add_u32       s_restore_mem_offset, s_restore_mem_offset, 256*4
        s_mov_b32       m0, 4                                                   //VGPR initial index value = 4

L_RESTORE_VGPR_WAVE64_LOOP:
        // Wave64: each load covers 64 lanes * 4 bytes = 256 bytes per VGPR.
        buffer_load_dword       v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1
        buffer_load_dword       v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256
        buffer_load_dword       v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*2
        buffer_load_dword       v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*3
        s_waitcnt       vmcnt(0)                                                //loads must land before movreld consumes v0-v3
        v_movreld_b32   v0, v0                                                  //v[0+m0] = v0
        v_movreld_b32   v1, v1
        v_movreld_b32   v2, v2
        v_movreld_b32   v3, v3
        s_add_u32       m0, m0, 4                                               //next vgpr index
        s_add_u32       s_restore_mem_offset, s_restore_mem_offset, 256*4       //every buffer_load_dword does 256 bytes
        s_cmp_lt_u32    m0, s_restore_alloc_size                                //scc = (m0 < s_restore_alloc_size) ? 1 : 0
        s_cbranch_scc1  L_RESTORE_VGPR_WAVE64_LOOP                              //VGPR restore (except v0) is complete?

        //Below part will be the restore shared vgpr part (new for gfx10)
        s_getreg_b32    s_restore_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE)  //shared_vgpr_size
        s_and_b32       s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF  //shared_vgpr_size is zero? (s_and sets SCC)
        s_cbranch_scc0  L_RESTORE_V0                                            //no shared_vgpr used?
        s_lshl_b32      s_restore_alloc_size, s_restore_alloc_size, 3           //Number of SHARED_VGPRs = shared_vgpr_size * 8    (non-zero value)
        //m0 now has the value of normal vgpr count, just add the m0 with shared_vgpr count to get the total count.
        //restore shared_vgpr will start from the index of m0
        s_add_u32       s_restore_alloc_size, s_restore_alloc_size, m0
        // Shared VGPRs: only the low 32 lanes participate (128-byte stride).
        s_mov_b32       exec_lo, 0xFFFFFFFF
        s_mov_b32       exec_hi, 0x00000000
L_RESTORE_SHARED_VGPR_WAVE64_LOOP:
        buffer_load_dword       v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1
        s_waitcnt       vmcnt(0)
        v_movreld_b32   v0, v0                                                  //v[0+m0] = v0
        s_add_u32       m0, m0, 1                                               //next vgpr index
        s_add_u32       s_restore_mem_offset, s_restore_mem_offset, 128         //32 lanes * 4 bytes per shared VGPR
        s_cmp_lt_u32    m0, s_restore_alloc_size                                //scc = (m0 < s_restore_alloc_size) ? 1 : 0
        s_cbranch_scc1  L_RESTORE_SHARED_VGPR_WAVE64_LOOP                       //VGPR restore (except v0) is complete?

        s_mov_b32       exec_hi, 0xFFFFFFFF                                     //restore back exec_hi before restoring V0!!

        /* VGPR restore on v0 */
L_RESTORE_V0:
        buffer_load_dword       v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1
        buffer_load_dword       v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256
        buffer_load_dword       v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256*2
        buffer_load_dword       v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256*3
        s_waitcnt       vmcnt(0)
 762
        /* restore SGPRs */
        //will be 4+8+16*6
        // SGPR SR memory offset : size(VGPR)+size(SVGPR)
L_RESTORE_SGPR:
        // Point at the end of the SGPR save area; the read_*sgpr_from_mem
        // helpers pre-decrement the offset, so groups are restored from the
        // highest saved SGPR down to s0.
        get_vgpr_size_bytes(s_restore_mem_offset, s_restore_size)
        get_svgpr_size_bytes(s_restore_tmp)
        s_add_u32       s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp
        s_add_u32       s_restore_mem_offset, s_restore_mem_offset, get_sgpr_size_bytes()
        s_sub_u32       s_restore_mem_offset, s_restore_mem_offset, 20*4        //s108~s127 is not saved

        s_mov_b32       s_restore_buf_rsrc2, 0x1000000                          //NUM_RECORDS in bytes

        s_mov_b32       m0, s_sgpr_save_num                                     //m0 counts down: SGPRs remaining to restore

        // First a group of 4, then a group of 8, then 16 at a time.
        read_4sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset)
        s_waitcnt       lgkmcnt(0)

        s_sub_u32       m0, m0, 4                                               // Restore from S[0] to S[104]
        s_nop           0                                                       // hazard SALU M0=> S_MOVREL

        s_movreld_b64   s0, s0                                                  //s[0+m0] = s0
        s_movreld_b64   s2, s2

        read_8sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset)
        s_waitcnt       lgkmcnt(0)

        s_sub_u32       m0, m0, 8                                               // Restore from S[0] to S[96]
        s_nop           0                                                       // hazard SALU M0=> S_MOVREL

        s_movreld_b64   s0, s0                                                  //s[0+m0] = s0
        s_movreld_b64   s2, s2
        s_movreld_b64   s4, s4
        s_movreld_b64   s6, s6

 L_RESTORE_SGPR_LOOP:
        read_16sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset)
        s_waitcnt       lgkmcnt(0)

        s_sub_u32       m0, m0, 16                                              // Restore from S[n] to S[0]
        s_nop           0                                                       // hazard SALU M0=> S_MOVREL

        s_movreld_b64   s0, s0                                                  //s[0+m0] = s0
        s_movreld_b64   s2, s2
        s_movreld_b64   s4, s4
        s_movreld_b64   s6, s6
        s_movreld_b64   s8, s8
        s_movreld_b64   s10, s10
        s_movreld_b64   s12, s12
        s_movreld_b64   s14, s14

        s_cmp_eq_u32    m0, 0                                                   //scc = (m0 == 0) ? 1 : 0
        s_cbranch_scc0  L_RESTORE_SGPR_LOOP                                     //loop until all saved SGPR groups are written back
 815
        /* restore HW registers */
L_RESTORE_HWREG:
        // HWREG SR memory offset : size(VGPR)+size(SVGPR)+size(SGPR)
        get_vgpr_size_bytes(s_restore_mem_offset, s_restore_size)
        get_svgpr_size_bytes(s_restore_tmp)
        s_add_u32       s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp
        s_add_u32       s_restore_mem_offset, s_restore_mem_offset, get_sgpr_size_bytes()

        s_mov_b32       s_restore_buf_rsrc2, 0x1000000                          //NUM_RECORDS in bytes

        // Read back the HW register image; each call advances the offset 4B.
        read_hwreg_from_mem(s_restore_m0, s_restore_buf_rsrc0, s_restore_mem_offset)
        read_hwreg_from_mem(s_restore_pc_lo, s_restore_buf_rsrc0, s_restore_mem_offset)
        read_hwreg_from_mem(s_restore_pc_hi, s_restore_buf_rsrc0, s_restore_mem_offset)
        read_hwreg_from_mem(s_restore_exec_lo, s_restore_buf_rsrc0, s_restore_mem_offset)
        read_hwreg_from_mem(s_restore_exec_hi, s_restore_buf_rsrc0, s_restore_mem_offset)
        read_hwreg_from_mem(s_restore_status, s_restore_buf_rsrc0, s_restore_mem_offset)
        read_hwreg_from_mem(s_restore_trapsts, s_restore_buf_rsrc0, s_restore_mem_offset)
        read_hwreg_from_mem(s_restore_xnack_mask, s_restore_buf_rsrc0, s_restore_mem_offset)
        read_hwreg_from_mem(s_restore_mode, s_restore_buf_rsrc0, s_restore_mem_offset)
        read_hwreg_from_mem(s_restore_flat_scratch, s_restore_buf_rsrc0, s_restore_mem_offset)
        s_waitcnt       lgkmcnt(0)

        s_setreg_b32    hwreg(HW_REG_SHADER_FLAT_SCRATCH_LO), s_restore_flat_scratch

        // s_restore_flat_scratch is reused for the HI half.
        read_hwreg_from_mem(s_restore_flat_scratch, s_restore_buf_rsrc0, s_restore_mem_offset)
        s_waitcnt       lgkmcnt(0)                                              //from now on, it is safe to restore STATUS and IB_STS

        s_setreg_b32    hwreg(HW_REG_SHADER_FLAT_SCRATCH_HI), s_restore_flat_scratch

        // Keep the raw pc_hi (with trap-handler state packed in its upper
        // bits) in s_restore_tmp; s_restore_pc_hi keeps only pc[47:32].
        s_mov_b32       s_restore_tmp, s_restore_pc_hi
        s_and_b32       s_restore_pc_hi, s_restore_tmp, 0x0000ffff              //pc[47:32] //Do it here in order not to affect STATUS

        s_mov_b32       m0, s_restore_m0
        s_mov_b32       exec_lo, s_restore_exec_lo
        s_mov_b32       exec_hi, s_restore_exec_hi

        // Restore TRAPSTS in two halves (pre- and post-savectx fields).
        s_and_b32       s_restore_m0, SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK, s_restore_trapsts
        s_setreg_b32    hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE), s_restore_m0
        s_setreg_b32    hwreg(HW_REG_SHADER_XNACK_MASK), s_restore_xnack_mask
        s_and_b32       s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK, s_restore_trapsts
        s_lshr_b32      s_restore_m0, s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT
        s_setreg_b32    hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE), s_restore_m0
        s_setreg_b32    hwreg(HW_REG_MODE), s_restore_mode
        // Rebuild IB_STS (RCNT / FIRST_REPLAY / REPLAY_W64H) from the fields
        // that the save path packed into pc_hi (now in s_restore_tmp).
        s_and_b32       s_restore_m0, s_restore_tmp, S_SAVE_PC_HI_RCNT_MASK
        s_lshr_b32      s_restore_m0, s_restore_m0, S_SAVE_PC_HI_RCNT_SHIFT
        s_lshl_b32      s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_RCNT_SHIFT
        s_mov_b32       s_restore_mode, 0x0
        s_or_b32        s_restore_mode, s_restore_mode, s_restore_m0
        s_and_b32       s_restore_m0, s_restore_tmp, S_SAVE_PC_HI_FIRST_REPLAY_MASK
        s_lshr_b32      s_restore_m0, s_restore_m0, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT
        s_lshl_b32      s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT
        s_or_b32        s_restore_mode, s_restore_mode, s_restore_m0
        s_and_b32       s_restore_m0, s_restore_tmp, S_SAVE_PC_HI_REPLAY_W64H_MASK
        s_lshr_b32      s_restore_m0, s_restore_m0, S_SAVE_PC_HI_REPLAY_W64H_SHIFT
        s_lshl_b32      s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_REPLAY_W64H_SHIFT
        s_or_b32        s_restore_mode, s_restore_mode, s_restore_m0

        // NOTE(review): the INST_ATC value extracted here is never read
        // afterwards in this block — confirm whether this is dead code.
        s_and_b32       s_restore_m0, s_restore_status, SQ_WAVE_STATUS_INST_ATC_MASK
        s_lshr_b32      s_restore_m0, s_restore_m0, SQ_WAVE_STATUS_INST_ATC_SHIFT
        s_setreg_b32    hwreg(HW_REG_IB_STS), s_restore_mode

        s_and_b64       exec, exec, exec                                        // Restore STATUS.EXECZ, not writable by s_setreg_b32
        s_and_b64       vcc, vcc, vcc                                           // Restore STATUS.VCCZ, not writable by s_setreg_b32
        s_setreg_b32    hwreg(HW_REG_STATUS), s_restore_status                  // SCC is included, which is changed by previous salu

        s_barrier                                                               //barrier to ensure the readiness of LDS before access attempts from any other wave in the same TG

        s_rfe_b64       s_restore_pc_lo                                         //Return to the main shader program and resume execution

L_END_PGM:
        s_endpgm
end
 888
// Store one SGPR (s) at s_mem_offset and advance the offset by 4 bytes.
// m0 is used as the store address register, so its caller value is
// stashed in exec_lo for the duration (exec_lo is clobber-safe here).
function write_hwreg_to_mem(s, s_rsrc, s_mem_offset)
        s_mov_b32       exec_lo, m0
        s_mov_b32       m0, s_mem_offset
        s_buffer_store_dword    s, s_rsrc, m0 glc:1
        s_add_u32       s_mem_offset, s_mem_offset, 4
        s_mov_b32       m0, exec_lo
end
 896
 897
// Store 16 consecutive SGPRs starting at s (four dwordx4 bursts), then
// advance the buffer BASE address in the resource descriptor by 64 bytes.
// NOTE(review): the s_mem_offset parameter is unused — progress is made by
// bumping s_rsrc[0:1] instead; confirm callers depend on that convention.
function write_16sgpr_to_mem(s, s_rsrc, s_mem_offset)
        s_buffer_store_dwordx4  s[0], s_rsrc, 0 glc:1
        s_buffer_store_dwordx4  s[4], s_rsrc, 16 glc:1
        s_buffer_store_dwordx4  s[8], s_rsrc, 32 glc:1
        s_buffer_store_dwordx4  s[12], s_rsrc, 48 glc:1
        s_add_u32       s_rsrc[0], s_rsrc[0], 4*16
        s_addc_u32      s_rsrc[1], s_rsrc[1], 0x0                               //carry into base_addr_hi
end
 906
// Store 12 consecutive SGPRs starting at s (three dwordx4 bursts), then
// advance the buffer BASE address in the resource descriptor by 48 bytes.
// NOTE(review): s_mem_offset is unused here too — see write_16sgpr_to_mem.
function write_12sgpr_to_mem(s, s_rsrc, s_mem_offset)
        s_buffer_store_dwordx4  s[0], s_rsrc, 0 glc:1
        s_buffer_store_dwordx4  s[4], s_rsrc, 16 glc:1
        s_buffer_store_dwordx4  s[8], s_rsrc, 32 glc:1
        s_add_u32       s_rsrc[0], s_rsrc[0], 4*12
        s_addc_u32      s_rsrc[1], s_rsrc[1], 0x0                               //carry into base_addr_hi
end
 914
 915
// Load one dword into s from s_mem_offset and advance the offset by 4 bytes.
function read_hwreg_from_mem(s, s_rsrc, s_mem_offset)
        s_buffer_load_dword     s, s_rsrc, s_mem_offset glc:1
        s_add_u32       s_mem_offset, s_mem_offset, 4
end
 920
// Pre-decrement s_mem_offset by 64 bytes, then load 16 SGPRs into s..s+15.
// The pre-decrement lets the SGPR restore walk the save area backwards.
function read_16sgpr_from_mem(s, s_rsrc, s_mem_offset)
        s_sub_u32       s_mem_offset, s_mem_offset, 4*16
        s_buffer_load_dwordx16  s, s_rsrc, s_mem_offset glc:1
end
 925
// Pre-decrement s_mem_offset by 32 bytes, then load 8 SGPRs into s..s+7.
function read_8sgpr_from_mem(s, s_rsrc, s_mem_offset)
        s_sub_u32       s_mem_offset, s_mem_offset, 4*8
        s_buffer_load_dwordx8   s, s_rsrc, s_mem_offset glc:1
end
 930
// Pre-decrement s_mem_offset by 16 bytes, then load 4 SGPRs into s..s+3.
function read_4sgpr_from_mem(s, s_rsrc, s_mem_offset)
        s_sub_u32       s_mem_offset, s_mem_offset, 4*4
        s_buffer_load_dwordx4   s, s_rsrc, s_mem_offset glc:1
end
 935
 936
// Return (in s_lds_size_byte) the wave's LDS allocation in bytes:
// LDS_ALLOC.LDS_SIZE is in 64-dword granules, so bytes = size * 64 * 4
// = size << 8.
function get_lds_size_bytes(s_lds_size_byte)
        s_getreg_b32    s_lds_size_byte, hwreg(HW_REG_LDS_ALLOC, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE)
        s_lshl_b32      s_lds_size_byte, s_lds_size_byte, 8                     //LDS size in dwords = lds_size * 64 *4Bytes // granularity 64DW
end
 941
// Return (in s_vgpr_size_byte) the size in bytes of the VGPR save area.
// VGPR count = (GPR_ALLOC.VGPR_SIZE + 1) * 4; each VGPR occupies
// lanes * 4 bytes, so shift by 2+7 for wave32 (32 lanes) or 2+8 for
// wave64 (64 lanes). The wave-size flag is bit S_WAVE_SIZE of s_size.
// Clobbers m0 and SCC.
function get_vgpr_size_bytes(s_vgpr_size_byte, s_size)
        s_getreg_b32    s_vgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE)
        s_add_u32       s_vgpr_size_byte, s_vgpr_size_byte, 1
        s_lshr_b32      m0, s_size, S_WAVE_SIZE
        s_and_b32       m0, m0, 1
        s_cmp_eq_u32    m0, 1
        s_cbranch_scc1  L_ENABLE_SHIFT_W64
        s_lshl_b32      s_vgpr_size_byte, s_vgpr_size_byte, (2+7)               //Number of VGPRs = (vgpr_size + 1) * 4 * 32 * 4   (non-zero value)
        s_branch        L_SHIFT_DONE
L_ENABLE_SHIFT_W64:
        s_lshl_b32      s_vgpr_size_byte, s_vgpr_size_byte, (2+8)               //Number of VGPRs = (vgpr_size + 1) * 4 * 64 * 4   (non-zero value)
L_SHIFT_DONE:
end
 955
// Return (in s_svgpr_size_byte) the size in bytes of the shared-VGPR save
// area: shared VGPR count = shared_vgpr_size * 8, each stored as
// 32 lanes * 4 bytes, hence shift by 3+7. Zero when no shared VGPRs.
function get_svgpr_size_bytes(s_svgpr_size_byte)
        s_getreg_b32    s_svgpr_size_byte, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE)
        s_lshl_b32      s_svgpr_size_byte, s_svgpr_size_byte, (3+7)
end
 960
// The SGPR save area is a fixed 512 bytes (128 dwords).
function get_sgpr_size_bytes
        return 512
end
 964
// The HW-register save area is a fixed 128 bytes (32 dwords).
function get_hwreg_size_bytes
        return 128
end
 968