linux/drivers/gpu/drm/i915/gt/gen6_engine_cs.c
// SPDX-License-Identifier: MIT
/*
 * Copyright © 2020 Intel Corporation
 */

#include "gen6_engine_cs.h"
#include "intel_engine.h"
#include "intel_gpu_commands.h"
#include "intel_gt.h"
#include "intel_gt_irq.h"
#include "intel_gt_pm_irq.h"
#include "intel_ring.h"

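/*
 * Byte offset of the reserved scratch dword in the hardware status page
 * (I915_GEM_HWS_SCRATCH is a dword index); used as a dummy post-sync
 * write target for MI_FLUSH_DW below.
 */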
#define HWS_SCRATCH_ADDR        (I915_GEM_HWS_SCRATCH * sizeof(u32))

/*
 * Emits a PIPE_CONTROL with a non-zero post-sync operation, for
 * implementing two workarounds on gen6.  From section 1.4.7.1
 * "PIPE_CONTROL" of the Sandy Bridge PRM volume 2 part 1:
 *
 * [DevSNB-C+{W/A}] Before any depth stall flush (including those
 * produced by non-pipelined state commands), software needs to first
 * send a PIPE_CONTROL with no bits set except Post-Sync Operation !=
 * 0.
 *
 * [Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache Flush Enable
 * =1, a PIPE_CONTROL with any non-zero post-sync-op is required.
 *
 * And the workaround for these two requires this workaround first:
 *
 * [Dev-SNB{W/A}]: Pipe-control with CS-stall bit set must be sent
 * BEFORE the pipe-control with a post-sync op and no write-cache
 * flushes.
 *
 * And this last workaround is tricky because of the requirements on
 * that bit.  From section 1.4.7.2.3 "Stall" of the Sandy Bridge PRM
 * volume 2 part 1:
 *
 *     "1 of the following must also be set:
 *      - Render Target Cache Flush Enable ([12] of DW1)
 *      - Depth Cache Flush Enable ([0] of DW1)
 *      - Stall at Pixel Scoreboard ([1] of DW1)
 *      - Depth Stall ([13] of DW1)
 *      - Post-Sync Operation ([13] of DW1)
 *      - Notify Enable ([8] of DW1)"
 *
 * The cache flushes require the workaround flush that triggered this
 * one, so we can't use it.  Depth stall would trigger the same.
 * Post-sync nonzero is what triggered this second workaround, so we
 * can't use that one either.  Notify enable is IRQs, which aren't
 * really our business.  That leaves only stall at scoreboard.
 */
static int
gen6_emit_post_sync_nonzero_flush(struct i915_request *rq)
{
        u32 scratch_addr =
                intel_gt_scratch_offset(rq->engine->gt,
                                        INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH);
        u32 *cs;

        cs = intel_ring_begin(rq, 6);
        if (IS_ERR(cs))
                return PTR_ERR(cs);

        *cs++ = GFX_OP_PIPE_CONTROL(5);
        *cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD;
        *cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT;
        *cs++ = 0; /* low dword */
        *cs++ = 0; /* high dword */
        *cs++ = MI_NOOP;
        intel_ring_advance(rq, cs);

        cs = intel_ring_begin(rq, 6);
        if (IS_ERR(cs))
                return PTR_ERR(cs);

        *cs++ = GFX_OP_PIPE_CONTROL(5);
        *cs++ = PIPE_CONTROL_QW_WRITE;
        *cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT;
        *cs++ = 0;
        *cs++ = 0;
        *cs++ = MI_NOOP;
        intel_ring_advance(rq, cs);

        return 0;
}

int gen6_emit_flush_rcs(struct i915_request *rq, u32 mode)
{
        u32 scratch_addr =
                intel_gt_scratch_offset(rq->engine->gt,
                                        INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH);
        u32 *cs, flags = 0;
        int ret;

        /* Force SNB workarounds for PIPE_CONTROL flushes */
        ret = gen6_emit_post_sync_nonzero_flush(rq);
        if (ret)
                return ret;

        /*
         * Just flush everything.  Experiments have shown that reducing the
         * number of bits based on the write domains has little performance
         * impact. And when rearranging requests, the order of flushes is
         * unknown.
         */
        if (mode & EMIT_FLUSH) {
                flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
                flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
                /*
                 * Ensure that any following seqno writes only happen
                 * when the render cache is indeed flushed.
                 */
                flags |= PIPE_CONTROL_CS_STALL;
        }
        if (mode & EMIT_INVALIDATE) {
                flags |= PIPE_CONTROL_TLB_INVALIDATE;
                flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
                flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
                flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
                flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
                flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
                /*
                 * TLB invalidate requires a post-sync write.
                 */
                flags |= PIPE_CONTROL_QW_WRITE | PIPE_CONTROL_CS_STALL;
        }

        cs = intel_ring_begin(rq, 4);
        if (IS_ERR(cs))
                return PTR_ERR(cs);

        *cs++ = GFX_OP_PIPE_CONTROL(4);
        *cs++ = flags;
        *cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT;
        *cs++ = 0;
        intel_ring_advance(rq, cs);

        return 0;
}

u32 *gen6_emit_breadcrumb_rcs(struct i915_request *rq, u32 *cs)
{
        /* First we do the gen6_emit_post_sync_nonzero_flush w/a */
        *cs++ = GFX_OP_PIPE_CONTROL(4);
        *cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD;
        *cs++ = 0;
        *cs++ = 0;

        *cs++ = GFX_OP_PIPE_CONTROL(4);
        *cs++ = PIPE_CONTROL_QW_WRITE;
        *cs++ = intel_gt_scratch_offset(rq->engine->gt,
                                        INTEL_GT_SCRATCH_FIELD_DEFAULT) |
                PIPE_CONTROL_GLOBAL_GTT;
        *cs++ = 0;

        /* Finally we can flush and with it emit the breadcrumb */
        *cs++ = GFX_OP_PIPE_CONTROL(4);
        *cs++ = (PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
                 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
                 PIPE_CONTROL_DC_FLUSH_ENABLE |
                 PIPE_CONTROL_QW_WRITE |
                 PIPE_CONTROL_CS_STALL);
        *cs++ = i915_request_active_seqno(rq) |
                PIPE_CONTROL_GLOBAL_GTT;
        *cs++ = rq->fence.seqno;

        *cs++ = MI_USER_INTERRUPT;
        *cs++ = MI_NOOP;

        rq->tail = intel_ring_offset(rq, cs);
        assert_ring_tail_valid(rq->ring, rq->tail);

        return cs;
}

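/*
 * Emit an MI_FLUSH_DW with a dummy post-sync store into the status-page
 * scratch slot; callers OR in any additional invalidation flags.
 */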
static int mi_flush_dw(struct i915_request *rq, u32 flags)
{
        u32 cmd, *cs;

        cs = intel_ring_begin(rq, 4);
        if (IS_ERR(cs))
                return PTR_ERR(cs);

        cmd = MI_FLUSH_DW;

        /*
         * We always require a command barrier so that subsequent
         * commands, such as breadcrumb interrupts, are strictly ordered
         * wrt the contents of the write cache being flushed to memory
         * (and thus being coherent from the CPU).
         */
        cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;

        /*
         * Bspec vol 1c.3 - blitter engine command streamer:
         * "If ENABLED, all TLBs will be invalidated once the flush
         * operation is complete. This bit is only valid when the
         * Post-Sync Operation field is a value of 1h or 3h."
         */
        cmd |= flags;

        *cs++ = cmd;
        *cs++ = HWS_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT;
        *cs++ = 0;
        *cs++ = MI_NOOP;

        intel_ring_advance(rq, cs);

        return 0;
}

static int gen6_flush_dw(struct i915_request *rq, u32 mode, u32 invflags)
{
        return mi_flush_dw(rq, mode & EMIT_INVALIDATE ? invflags : 0);
}

int gen6_emit_flush_xcs(struct i915_request *rq, u32 mode)
{
        return gen6_flush_dw(rq, mode, MI_INVALIDATE_TLB);
}

int gen6_emit_flush_vcs(struct i915_request *rq, u32 mode)
{
        return gen6_flush_dw(rq, mode, MI_INVALIDATE_TLB | MI_INVALIDATE_BSD);
}

int gen6_emit_bb_start(struct i915_request *rq,
                       u64 offset, u32 len,
                       unsigned int dispatch_flags)
{
        u32 security;
        u32 *cs;

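        /*
         * Batches run non-secure unless the caller explicitly asked
         * for a secure (privileged) dispatch.
         */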
        security = MI_BATCH_NON_SECURE_I965;
        if (dispatch_flags & I915_DISPATCH_SECURE)
                security = 0;

        cs = intel_ring_begin(rq, 2);
        if (IS_ERR(cs))
                return PTR_ERR(cs);

        cs = __gen6_emit_bb_start(cs, offset, security);
        intel_ring_advance(rq, cs);

        return 0;
}

int
hsw_emit_bb_start(struct i915_request *rq,
                  u64 offset, u32 len,
                  unsigned int dispatch_flags)
{
        u32 security;
        u32 *cs;

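        /*
         * Same non-secure default as gen6, but Haswell uses different
         * bit encodings and additionally selects PPGTT execution for
         * unprivileged batches.
         */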
        security = MI_BATCH_PPGTT_HSW | MI_BATCH_NON_SECURE_HSW;
        if (dispatch_flags & I915_DISPATCH_SECURE)
                security = 0;

        cs = intel_ring_begin(rq, 2);
        if (IS_ERR(cs))
                return PTR_ERR(cs);

        cs = __gen6_emit_bb_start(cs, offset, security);
        intel_ring_advance(rq, cs);

        return 0;
}

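/*
 * Stand-alone CS stall at the pixel scoreboard, used as a spacer before
 * PIPE_CONTROLs that need a preceding stall (see gen7_emit_flush_rcs).
 */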
static int gen7_stall_cs(struct i915_request *rq)
{
        u32 *cs;

        cs = intel_ring_begin(rq, 4);
        if (IS_ERR(cs))
                return PTR_ERR(cs);

        *cs++ = GFX_OP_PIPE_CONTROL(4);
        *cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD;
        *cs++ = 0;
        *cs++ = 0;
        intel_ring_advance(rq, cs);

        return 0;
}

int gen7_emit_flush_rcs(struct i915_request *rq, u32 mode)
{
        u32 scratch_addr =
                intel_gt_scratch_offset(rq->engine->gt,
                                        INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH);
        u32 *cs, flags = 0;

        /*
         * Ensure that any following seqno writes only happen when the render
         * cache is indeed flushed.
         *
         * Workaround: 4th PIPE_CONTROL command (except the ones with only
         * read-cache invalidate bits set) must have the CS_STALL bit set. We
         * don't try to be clever and just set it unconditionally.
         */
        flags |= PIPE_CONTROL_CS_STALL;

        /*
         * CS_STALL suggests at least a post-sync write.
         */
        flags |= PIPE_CONTROL_QW_WRITE;
        flags |= PIPE_CONTROL_GLOBAL_GTT_IVB;

        /*
         * Just flush everything.  Experiments have shown that reducing the
         * number of bits based on the write domains has little performance
         * impact.
         */
        if (mode & EMIT_FLUSH) {
                flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
                flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
                flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
                flags |= PIPE_CONTROL_FLUSH_ENABLE;
        }
        if (mode & EMIT_INVALIDATE) {
                flags |= PIPE_CONTROL_TLB_INVALIDATE;
                flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
                flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
                flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
                flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
                flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
                flags |= PIPE_CONTROL_MEDIA_STATE_CLEAR;

                /*
                 * Workaround: we must issue a pipe_control with CS-stall bit
                 * set before a pipe_control command that has the state cache
                 * invalidate bit set.
                 */
                gen7_stall_cs(rq);
        }

        cs = intel_ring_begin(rq, 4);
        if (IS_ERR(cs))
                return PTR_ERR(cs);

        *cs++ = GFX_OP_PIPE_CONTROL(4);
        *cs++ = flags;
        *cs++ = scratch_addr;
        *cs++ = 0;
        intel_ring_advance(rq, cs);

        return 0;
}

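/*
 * A single flushing PIPE_CONTROL whose post-sync operation writes the
 * seqno, followed by the user interrupt.
 */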
u32 *gen7_emit_breadcrumb_rcs(struct i915_request *rq, u32 *cs)
{
        *cs++ = GFX_OP_PIPE_CONTROL(4);
        *cs++ = (PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
                 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
                 PIPE_CONTROL_DC_FLUSH_ENABLE |
                 PIPE_CONTROL_FLUSH_ENABLE |
                 PIPE_CONTROL_QW_WRITE |
                 PIPE_CONTROL_GLOBAL_GTT_IVB |
                 PIPE_CONTROL_CS_STALL);
        *cs++ = i915_request_active_seqno(rq);
        *cs++ = rq->fence.seqno;

        *cs++ = MI_USER_INTERRUPT;
        *cs++ = MI_NOOP;

        rq->tail = intel_ring_offset(rq, cs);
        assert_ring_tail_valid(rq->ring, rq->tail);

        return cs;
}

u32 *gen6_emit_breadcrumb_xcs(struct i915_request *rq, u32 *cs)
{
        GEM_BUG_ON(i915_request_active_timeline(rq)->hwsp_ggtt != rq->engine->status_page.vma);
        GEM_BUG_ON(offset_in_page(rq->hwsp_seqno) != I915_GEM_HWS_SEQNO_ADDR);

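        /* Post-sync store of the seqno into the status page, then the interrupt */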
        *cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX;
        *cs++ = I915_GEM_HWS_SEQNO_ADDR | MI_FLUSH_DW_USE_GTT;
        *cs++ = rq->fence.seqno;

        *cs++ = MI_USER_INTERRUPT;

        rq->tail = intel_ring_offset(rq, cs);
        assert_ring_tail_valid(rq->ring, rq->tail);

        return cs;
}

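/*
 * On gen7 the non-render engines repeat the seqno store (32 times) and
 * follow it with a further MI_FLUSH_DW, presumably to ensure the write
 * is visible in memory before the MI_USER_INTERRUPT is raised.
 */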
#define GEN7_XCS_WA 32
u32 *gen7_emit_breadcrumb_xcs(struct i915_request *rq, u32 *cs)
{
        int i;

        GEM_BUG_ON(i915_request_active_timeline(rq)->hwsp_ggtt != rq->engine->status_page.vma);
        GEM_BUG_ON(offset_in_page(rq->hwsp_seqno) != I915_GEM_HWS_SEQNO_ADDR);

        *cs++ = MI_FLUSH_DW | MI_INVALIDATE_TLB |
                MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX;
        *cs++ = I915_GEM_HWS_SEQNO_ADDR | MI_FLUSH_DW_USE_GTT;
        *cs++ = rq->fence.seqno;

        for (i = 0; i < GEN7_XCS_WA; i++) {
                *cs++ = MI_STORE_DWORD_INDEX;
                *cs++ = I915_GEM_HWS_SEQNO_ADDR;
                *cs++ = rq->fence.seqno;
        }

        *cs++ = MI_FLUSH_DW;
        *cs++ = 0;
        *cs++ = 0;

        *cs++ = MI_USER_INTERRUPT;
        *cs++ = MI_NOOP;

        rq->tail = intel_ring_offset(rq, cs);
        assert_ring_tail_valid(rq->ring, rq->tail);

        return cs;
}
#undef GEN7_XCS_WA

void gen6_irq_enable(struct intel_engine_cs *engine)
{
        ENGINE_WRITE(engine, RING_IMR,
                     ~(engine->irq_enable_mask | engine->irq_keep_mask));

        /* Flush/delay to ensure the RING_IMR is active before the GT IMR */
        ENGINE_POSTING_READ(engine, RING_IMR);

        gen5_gt_enable_irq(engine->gt, engine->irq_enable_mask);
}

void gen6_irq_disable(struct intel_engine_cs *engine)
{
        ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask);
        gen5_gt_disable_irq(engine->gt, engine->irq_enable_mask);
}

void hsw_irq_enable_vecs(struct intel_engine_cs *engine)
{
        ENGINE_WRITE(engine, RING_IMR, ~engine->irq_enable_mask);

        /* Flush/delay to ensure the RING_IMR is active before the GT IMR */
        ENGINE_POSTING_READ(engine, RING_IMR);

        gen6_gt_pm_unmask_irq(engine->gt, engine->irq_enable_mask);
}

void hsw_irq_disable_vecs(struct intel_engine_cs *engine)
{
        ENGINE_WRITE(engine, RING_IMR, ~0);
        gen6_gt_pm_mask_irq(engine->gt, engine->irq_enable_mask);
}