linux/drivers/gpu/drm/i915/gt/intel_ringbuffer.c
<<
>>
Prefs
   1/*
   2 * Copyright © 2008-2010 Intel Corporation
   3 *
   4 * Permission is hereby granted, free of charge, to any person obtaining a
   5 * copy of this software and associated documentation files (the "Software"),
   6 * to deal in the Software without restriction, including without limitation
   7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8 * and/or sell copies of the Software, and to permit persons to whom the
   9 * Software is furnished to do so, subject to the following conditions:
  10 *
  11 * The above copyright notice and this permission notice (including the next
  12 * paragraph) shall be included in all copies or substantial portions of the
  13 * Software.
  14 *
  15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21 * IN THE SOFTWARE.
  22 *
  23 * Authors:
  24 *    Eric Anholt <eric@anholt.net>
  25 *    Zou Nan hai <nanhai.zou@intel.com>
  26 *    Xiang Hai hao<haihao.xiang@intel.com>
  27 *
  28 */
  29
  30#include <linux/log2.h>
  31
  32#include <drm/i915_drm.h>
  33
  34#include "gem/i915_gem_context.h"
  35
  36#include "i915_drv.h"
  37#include "i915_gem_render_state.h"
  38#include "i915_trace.h"
  39#include "intel_context.h"
  40#include "intel_reset.h"
  41#include "intel_workarounds.h"
  42
  43/* Rough estimate of the typical request size, performing a flush,
  44 * set-context and then emitting the batch.
  45 */
  46#define LEGACY_REQUEST_SIZE 200
  47
  48unsigned int intel_ring_update_space(struct intel_ring *ring)
  49{
  50        unsigned int space;
  51
  52        space = __intel_ring_space(ring->head, ring->emit, ring->size);
  53
  54        ring->space = space;
  55        return space;
  56}
  57
  58static int
  59gen2_render_ring_flush(struct i915_request *rq, u32 mode)
  60{
  61        unsigned int num_store_dw;
  62        u32 cmd, *cs;
  63
  64        cmd = MI_FLUSH;
  65        num_store_dw = 0;
  66        if (mode & EMIT_INVALIDATE)
  67                cmd |= MI_READ_FLUSH;
  68        if (mode & EMIT_FLUSH)
  69                num_store_dw = 4;
  70
  71        cs = intel_ring_begin(rq, 2 + 3 * num_store_dw);
  72        if (IS_ERR(cs))
  73                return PTR_ERR(cs);
  74
  75        *cs++ = cmd;
  76        while (num_store_dw--) {
  77                *cs++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
  78                *cs++ = i915_scratch_offset(rq->i915);
  79                *cs++ = 0;
  80        }
  81        *cs++ = MI_FLUSH | MI_NO_WRITE_FLUSH;
  82
  83        intel_ring_advance(rq, cs);
  84
  85        return 0;
  86}
  87
  88static int
  89gen4_render_ring_flush(struct i915_request *rq, u32 mode)
  90{
  91        u32 cmd, *cs;
  92        int i;
  93
  94        /*
  95         * read/write caches:
  96         *
  97         * I915_GEM_DOMAIN_RENDER is always invalidated, but is
  98         * only flushed if MI_NO_WRITE_FLUSH is unset.  On 965, it is
  99         * also flushed at 2d versus 3d pipeline switches.
 100         *
 101         * read-only caches:
 102         *
 103         * I915_GEM_DOMAIN_SAMPLER is flushed on pre-965 if
 104         * MI_READ_FLUSH is set, and is always flushed on 965.
 105         *
 106         * I915_GEM_DOMAIN_COMMAND may not exist?
 107         *
 108         * I915_GEM_DOMAIN_INSTRUCTION, which exists on 965, is
 109         * invalidated when MI_EXE_FLUSH is set.
 110         *
 111         * I915_GEM_DOMAIN_VERTEX, which exists on 965, is
 112         * invalidated with every MI_FLUSH.
 113         *
 114         * TLBs:
 115         *
 116         * On 965, TLBs associated with I915_GEM_DOMAIN_COMMAND
 117         * and I915_GEM_DOMAIN_CPU in are invalidated at PTE write and
 118         * I915_GEM_DOMAIN_RENDER and I915_GEM_DOMAIN_SAMPLER
 119         * are flushed at any MI_FLUSH.
 120         */
 121
 122        cmd = MI_FLUSH;
 123        if (mode & EMIT_INVALIDATE) {
 124                cmd |= MI_EXE_FLUSH;
 125                if (IS_G4X(rq->i915) || IS_GEN(rq->i915, 5))
 126                        cmd |= MI_INVALIDATE_ISP;
 127        }
 128
 129        i = 2;
 130        if (mode & EMIT_INVALIDATE)
 131                i += 20;
 132
 133        cs = intel_ring_begin(rq, i);
 134        if (IS_ERR(cs))
 135                return PTR_ERR(cs);
 136
 137        *cs++ = cmd;
 138
 139        /*
 140         * A random delay to let the CS invalidate take effect? Without this
 141         * delay, the GPU relocation path fails as the CS does not see
 142         * the updated contents. Just as important, if we apply the flushes
 143         * to the EMIT_FLUSH branch (i.e. immediately after the relocation
 144         * write and before the invalidate on the next batch), the relocations
 145         * still fail. This implies that is a delay following invalidation
 146         * that is required to reset the caches as opposed to a delay to
 147         * ensure the memory is written.
 148         */
 149        if (mode & EMIT_INVALIDATE) {
 150                *cs++ = GFX_OP_PIPE_CONTROL(4) | PIPE_CONTROL_QW_WRITE;
 151                *cs++ = i915_scratch_offset(rq->i915) | PIPE_CONTROL_GLOBAL_GTT;
 152                *cs++ = 0;
 153                *cs++ = 0;
 154
 155                for (i = 0; i < 12; i++)
 156                        *cs++ = MI_FLUSH;
 157
 158                *cs++ = GFX_OP_PIPE_CONTROL(4) | PIPE_CONTROL_QW_WRITE;
 159                *cs++ = i915_scratch_offset(rq->i915) | PIPE_CONTROL_GLOBAL_GTT;
 160                *cs++ = 0;
 161                *cs++ = 0;
 162        }
 163
 164        *cs++ = cmd;
 165
 166        intel_ring_advance(rq, cs);
 167
 168        return 0;
 169}
 170
 171/*
 172 * Emits a PIPE_CONTROL with a non-zero post-sync operation, for
 173 * implementing two workarounds on gen6.  From section 1.4.7.1
 174 * "PIPE_CONTROL" of the Sandy Bridge PRM volume 2 part 1:
 175 *
 176 * [DevSNB-C+{W/A}] Before any depth stall flush (including those
 177 * produced by non-pipelined state commands), software needs to first
 178 * send a PIPE_CONTROL with no bits set except Post-Sync Operation !=
 179 * 0.
 180 *
 181 * [Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache Flush Enable
 182 * =1, a PIPE_CONTROL with any non-zero post-sync-op is required.
 183 *
 184 * And the workaround for these two requires this workaround first:
 185 *
 186 * [Dev-SNB{W/A}]: Pipe-control with CS-stall bit set must be sent
 187 * BEFORE the pipe-control with a post-sync op and no write-cache
 188 * flushes.
 189 *
 190 * And this last workaround is tricky because of the requirements on
 191 * that bit.  From section 1.4.7.2.3 "Stall" of the Sandy Bridge PRM
 192 * volume 2 part 1:
 193 *
 194 *     "1 of the following must also be set:
 195 *      - Render Target Cache Flush Enable ([12] of DW1)
 196 *      - Depth Cache Flush Enable ([0] of DW1)
 197 *      - Stall at Pixel Scoreboard ([1] of DW1)
 198 *      - Depth Stall ([13] of DW1)
 199 *      - Post-Sync Operation ([13] of DW1)
 200 *      - Notify Enable ([8] of DW1)"
 201 *
 202 * The cache flushes require the workaround flush that triggered this
 203 * one, so we can't use it.  Depth stall would trigger the same.
 204 * Post-sync nonzero is what triggered this second workaround, so we
 205 * can't use that one either.  Notify enable is IRQs, which aren't
 206 * really our business.  That leaves only stall at scoreboard.
 207 */
 208static int
 209gen6_emit_post_sync_nonzero_flush(struct i915_request *rq)
 210{
 211        u32 scratch_addr = i915_scratch_offset(rq->i915) + 2 * CACHELINE_BYTES;
 212        u32 *cs;
 213
 214        cs = intel_ring_begin(rq, 6);
 215        if (IS_ERR(cs))
 216                return PTR_ERR(cs);
 217
 218        *cs++ = GFX_OP_PIPE_CONTROL(5);
 219        *cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD;
 220        *cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT;
 221        *cs++ = 0; /* low dword */
 222        *cs++ = 0; /* high dword */
 223        *cs++ = MI_NOOP;
 224        intel_ring_advance(rq, cs);
 225
 226        cs = intel_ring_begin(rq, 6);
 227        if (IS_ERR(cs))
 228                return PTR_ERR(cs);
 229
 230        *cs++ = GFX_OP_PIPE_CONTROL(5);
 231        *cs++ = PIPE_CONTROL_QW_WRITE;
 232        *cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT;
 233        *cs++ = 0;
 234        *cs++ = 0;
 235        *cs++ = MI_NOOP;
 236        intel_ring_advance(rq, cs);
 237
 238        return 0;
 239}
 240
 241static int
 242gen6_render_ring_flush(struct i915_request *rq, u32 mode)
 243{
 244        u32 scratch_addr = i915_scratch_offset(rq->i915) + 2 * CACHELINE_BYTES;
 245        u32 *cs, flags = 0;
 246        int ret;
 247
 248        /* Force SNB workarounds for PIPE_CONTROL flushes */
 249        ret = gen6_emit_post_sync_nonzero_flush(rq);
 250        if (ret)
 251                return ret;
 252
 253        /* Just flush everything.  Experiments have shown that reducing the
 254         * number of bits based on the write domains has little performance
 255         * impact.
 256         */
 257        if (mode & EMIT_FLUSH) {
 258                flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
 259                flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
 260                /*
 261                 * Ensure that any following seqno writes only happen
 262                 * when the render cache is indeed flushed.
 263                 */
 264                flags |= PIPE_CONTROL_CS_STALL;
 265        }
 266        if (mode & EMIT_INVALIDATE) {
 267                flags |= PIPE_CONTROL_TLB_INVALIDATE;
 268                flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
 269                flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
 270                flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
 271                flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
 272                flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
 273                /*
 274                 * TLB invalidate requires a post-sync write.
 275                 */
 276                flags |= PIPE_CONTROL_QW_WRITE | PIPE_CONTROL_CS_STALL;
 277        }
 278
 279        cs = intel_ring_begin(rq, 4);
 280        if (IS_ERR(cs))
 281                return PTR_ERR(cs);
 282
 283        *cs++ = GFX_OP_PIPE_CONTROL(4);
 284        *cs++ = flags;
 285        *cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT;
 286        *cs++ = 0;
 287        intel_ring_advance(rq, cs);
 288
 289        return 0;
 290}
 291
 292static u32 *gen6_rcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
 293{
 294        /* First we do the gen6_emit_post_sync_nonzero_flush w/a */
 295        *cs++ = GFX_OP_PIPE_CONTROL(4);
 296        *cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD;
 297        *cs++ = 0;
 298        *cs++ = 0;
 299
 300        *cs++ = GFX_OP_PIPE_CONTROL(4);
 301        *cs++ = PIPE_CONTROL_QW_WRITE;
 302        *cs++ = i915_scratch_offset(rq->i915) | PIPE_CONTROL_GLOBAL_GTT;
 303        *cs++ = 0;
 304
 305        /* Finally we can flush and with it emit the breadcrumb */
 306        *cs++ = GFX_OP_PIPE_CONTROL(4);
 307        *cs++ = (PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
 308                 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
 309                 PIPE_CONTROL_DC_FLUSH_ENABLE |
 310                 PIPE_CONTROL_QW_WRITE |
 311                 PIPE_CONTROL_CS_STALL);
 312        *cs++ = rq->timeline->hwsp_offset | PIPE_CONTROL_GLOBAL_GTT;
 313        *cs++ = rq->fence.seqno;
 314
 315        *cs++ = MI_USER_INTERRUPT;
 316        *cs++ = MI_NOOP;
 317
 318        rq->tail = intel_ring_offset(rq, cs);
 319        assert_ring_tail_valid(rq->ring, rq->tail);
 320
 321        return cs;
 322}
 323
 324static int
 325gen7_render_ring_cs_stall_wa(struct i915_request *rq)
 326{
 327        u32 *cs;
 328
 329        cs = intel_ring_begin(rq, 4);
 330        if (IS_ERR(cs))
 331                return PTR_ERR(cs);
 332
 333        *cs++ = GFX_OP_PIPE_CONTROL(4);
 334        *cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD;
 335        *cs++ = 0;
 336        *cs++ = 0;
 337        intel_ring_advance(rq, cs);
 338
 339        return 0;
 340}
 341
 342static int
 343gen7_render_ring_flush(struct i915_request *rq, u32 mode)
 344{
 345        u32 scratch_addr = i915_scratch_offset(rq->i915) + 2 * CACHELINE_BYTES;
 346        u32 *cs, flags = 0;
 347
 348        /*
 349         * Ensure that any following seqno writes only happen when the render
 350         * cache is indeed flushed.
 351         *
 352         * Workaround: 4th PIPE_CONTROL command (except the ones with only
 353         * read-cache invalidate bits set) must have the CS_STALL bit set. We
 354         * don't try to be clever and just set it unconditionally.
 355         */
 356        flags |= PIPE_CONTROL_CS_STALL;
 357
 358        /* Just flush everything.  Experiments have shown that reducing the
 359         * number of bits based on the write domains has little performance
 360         * impact.
 361         */
 362        if (mode & EMIT_FLUSH) {
 363                flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
 364                flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
 365                flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
 366                flags |= PIPE_CONTROL_FLUSH_ENABLE;
 367        }
 368        if (mode & EMIT_INVALIDATE) {
 369                flags |= PIPE_CONTROL_TLB_INVALIDATE;
 370                flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
 371                flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
 372                flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
 373                flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
 374                flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
 375                flags |= PIPE_CONTROL_MEDIA_STATE_CLEAR;
 376                /*
 377                 * TLB invalidate requires a post-sync write.
 378                 */
 379                flags |= PIPE_CONTROL_QW_WRITE;
 380                flags |= PIPE_CONTROL_GLOBAL_GTT_IVB;
 381
 382                flags |= PIPE_CONTROL_STALL_AT_SCOREBOARD;
 383
 384                /* Workaround: we must issue a pipe_control with CS-stall bit
 385                 * set before a pipe_control command that has the state cache
 386                 * invalidate bit set. */
 387                gen7_render_ring_cs_stall_wa(rq);
 388        }
 389
 390        cs = intel_ring_begin(rq, 4);
 391        if (IS_ERR(cs))
 392                return PTR_ERR(cs);
 393
 394        *cs++ = GFX_OP_PIPE_CONTROL(4);
 395        *cs++ = flags;
 396        *cs++ = scratch_addr;
 397        *cs++ = 0;
 398        intel_ring_advance(rq, cs);
 399
 400        return 0;
 401}
 402
 403static u32 *gen7_rcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
 404{
 405        *cs++ = GFX_OP_PIPE_CONTROL(4);
 406        *cs++ = (PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
 407                 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
 408                 PIPE_CONTROL_DC_FLUSH_ENABLE |
 409                 PIPE_CONTROL_FLUSH_ENABLE |
 410                 PIPE_CONTROL_QW_WRITE |
 411                 PIPE_CONTROL_GLOBAL_GTT_IVB |
 412                 PIPE_CONTROL_CS_STALL);
 413        *cs++ = rq->timeline->hwsp_offset;
 414        *cs++ = rq->fence.seqno;
 415
 416        *cs++ = MI_USER_INTERRUPT;
 417        *cs++ = MI_NOOP;
 418
 419        rq->tail = intel_ring_offset(rq, cs);
 420        assert_ring_tail_valid(rq->ring, rq->tail);
 421
 422        return cs;
 423}
 424
 425static u32 *gen6_xcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
 426{
 427        GEM_BUG_ON(rq->timeline->hwsp_ggtt != rq->engine->status_page.vma);
 428        GEM_BUG_ON(offset_in_page(rq->timeline->hwsp_offset) != I915_GEM_HWS_SEQNO_ADDR);
 429
 430        *cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX;
 431        *cs++ = I915_GEM_HWS_SEQNO_ADDR | MI_FLUSH_DW_USE_GTT;
 432        *cs++ = rq->fence.seqno;
 433
 434        *cs++ = MI_USER_INTERRUPT;
 435
 436        rq->tail = intel_ring_offset(rq, cs);
 437        assert_ring_tail_valid(rq->ring, rq->tail);
 438
 439        return cs;
 440}
 441
 442#define GEN7_XCS_WA 32
 443static u32 *gen7_xcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
 444{
 445        int i;
 446
 447        GEM_BUG_ON(rq->timeline->hwsp_ggtt != rq->engine->status_page.vma);
 448        GEM_BUG_ON(offset_in_page(rq->timeline->hwsp_offset) != I915_GEM_HWS_SEQNO_ADDR);
 449
 450        *cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX;
 451        *cs++ = I915_GEM_HWS_SEQNO_ADDR | MI_FLUSH_DW_USE_GTT;
 452        *cs++ = rq->fence.seqno;
 453
 454        for (i = 0; i < GEN7_XCS_WA; i++) {
 455                *cs++ = MI_STORE_DWORD_INDEX;
 456                *cs++ = I915_GEM_HWS_SEQNO_ADDR;
 457                *cs++ = rq->fence.seqno;
 458        }
 459
 460        *cs++ = MI_FLUSH_DW;
 461        *cs++ = 0;
 462        *cs++ = 0;
 463
 464        *cs++ = MI_USER_INTERRUPT;
 465        *cs++ = MI_NOOP;
 466
 467        rq->tail = intel_ring_offset(rq, cs);
 468        assert_ring_tail_valid(rq->ring, rq->tail);
 469
 470        return cs;
 471}
 472#undef GEN7_XCS_WA
 473
 474static void set_hwstam(struct intel_engine_cs *engine, u32 mask)
 475{
 476        /*
 477         * Keep the render interrupt unmasked as this papers over
 478         * lost interrupts following a reset.
 479         */
 480        if (engine->class == RENDER_CLASS) {
 481                if (INTEL_GEN(engine->i915) >= 6)
 482                        mask &= ~BIT(0);
 483                else
 484                        mask &= ~I915_USER_INTERRUPT;
 485        }
 486
 487        intel_engine_set_hwsp_writemask(engine, mask);
 488}
 489
 490static void set_hws_pga(struct intel_engine_cs *engine, phys_addr_t phys)
 491{
 492        struct drm_i915_private *dev_priv = engine->i915;
 493        u32 addr;
 494
 495        addr = lower_32_bits(phys);
 496        if (INTEL_GEN(dev_priv) >= 4)
 497                addr |= (phys >> 28) & 0xf0;
 498
 499        I915_WRITE(HWS_PGA, addr);
 500}
 501
 502static struct page *status_page(struct intel_engine_cs *engine)
 503{
 504        struct drm_i915_gem_object *obj = engine->status_page.vma->obj;
 505
 506        GEM_BUG_ON(!i915_gem_object_has_pinned_pages(obj));
 507        return sg_page(obj->mm.pages->sgl);
 508}
 509
 510static void ring_setup_phys_status_page(struct intel_engine_cs *engine)
 511{
 512        set_hws_pga(engine, PFN_PHYS(page_to_pfn(status_page(engine))));
 513        set_hwstam(engine, ~0u);
 514}
 515
 516static void set_hwsp(struct intel_engine_cs *engine, u32 offset)
 517{
 518        struct drm_i915_private *dev_priv = engine->i915;
 519        i915_reg_t hwsp;
 520
 521        /*
 522         * The ring status page addresses are no longer next to the rest of
 523         * the ring registers as of gen7.
 524         */
 525        if (IS_GEN(dev_priv, 7)) {
 526                switch (engine->id) {
 527                /*
 528                 * No more rings exist on Gen7. Default case is only to shut up
 529                 * gcc switch check warning.
 530                 */
 531                default:
 532                        GEM_BUG_ON(engine->id);
 533                        /* fallthrough */
 534                case RCS0:
 535                        hwsp = RENDER_HWS_PGA_GEN7;
 536                        break;
 537                case BCS0:
 538                        hwsp = BLT_HWS_PGA_GEN7;
 539                        break;
 540                case VCS0:
 541                        hwsp = BSD_HWS_PGA_GEN7;
 542                        break;
 543                case VECS0:
 544                        hwsp = VEBOX_HWS_PGA_GEN7;
 545                        break;
 546                }
 547        } else if (IS_GEN(dev_priv, 6)) {
 548                hwsp = RING_HWS_PGA_GEN6(engine->mmio_base);
 549        } else {
 550                hwsp = RING_HWS_PGA(engine->mmio_base);
 551        }
 552
 553        I915_WRITE(hwsp, offset);
 554        POSTING_READ(hwsp);
 555}
 556
 557static void flush_cs_tlb(struct intel_engine_cs *engine)
 558{
 559        struct drm_i915_private *dev_priv = engine->i915;
 560
 561        if (!IS_GEN_RANGE(dev_priv, 6, 7))
 562                return;
 563
 564        /* ring should be idle before issuing a sync flush*/
 565        WARN_ON((ENGINE_READ(engine, RING_MI_MODE) & MODE_IDLE) == 0);
 566
 567        ENGINE_WRITE(engine, RING_INSTPM,
 568                     _MASKED_BIT_ENABLE(INSTPM_TLB_INVALIDATE |
 569                                        INSTPM_SYNC_FLUSH));
 570        if (intel_wait_for_register(engine->uncore,
 571                                    RING_INSTPM(engine->mmio_base),
 572                                    INSTPM_SYNC_FLUSH, 0,
 573                                    1000))
 574                DRM_ERROR("%s: wait for SyncFlush to complete for TLB invalidation timed out\n",
 575                          engine->name);
 576}
 577
 578static void ring_setup_status_page(struct intel_engine_cs *engine)
 579{
 580        set_hwsp(engine, i915_ggtt_offset(engine->status_page.vma));
 581        set_hwstam(engine, ~0u);
 582
 583        flush_cs_tlb(engine);
 584}
 585
 586static bool stop_ring(struct intel_engine_cs *engine)
 587{
 588        struct drm_i915_private *dev_priv = engine->i915;
 589
 590        if (INTEL_GEN(dev_priv) > 2) {
 591                ENGINE_WRITE(engine,
 592                             RING_MI_MODE, _MASKED_BIT_ENABLE(STOP_RING));
 593                if (intel_wait_for_register(engine->uncore,
 594                                            RING_MI_MODE(engine->mmio_base),
 595                                            MODE_IDLE,
 596                                            MODE_IDLE,
 597                                            1000)) {
 598                        DRM_ERROR("%s : timed out trying to stop ring\n",
 599                                  engine->name);
 600
 601                        /*
 602                         * Sometimes we observe that the idle flag is not
 603                         * set even though the ring is empty. So double
 604                         * check before giving up.
 605                         */
 606                        if (ENGINE_READ(engine, RING_HEAD) !=
 607                            ENGINE_READ(engine, RING_TAIL))
 608                                return false;
 609                }
 610        }
 611
 612        ENGINE_WRITE(engine, RING_HEAD, ENGINE_READ(engine, RING_TAIL));
 613
 614        ENGINE_WRITE(engine, RING_HEAD, 0);
 615        ENGINE_WRITE(engine, RING_TAIL, 0);
 616
 617        /* The ring must be empty before it is disabled */
 618        ENGINE_WRITE(engine, RING_CTL, 0);
 619
 620        return (ENGINE_READ(engine, RING_HEAD) & HEAD_ADDR) == 0;
 621}
 622
 623static int xcs_resume(struct intel_engine_cs *engine)
 624{
 625        struct drm_i915_private *dev_priv = engine->i915;
 626        struct intel_ring *ring = engine->buffer;
 627        int ret = 0;
 628
 629        GEM_TRACE("%s: ring:{HEAD:%04x, TAIL:%04x}\n",
 630                  engine->name, ring->head, ring->tail);
 631
 632        intel_uncore_forcewake_get(engine->uncore, FORCEWAKE_ALL);
 633
 634        if (!stop_ring(engine)) {
 635                /* G45 ring initialization often fails to reset head to zero */
 636                DRM_DEBUG_DRIVER("%s head not reset to zero "
 637                                "ctl %08x head %08x tail %08x start %08x\n",
 638                                engine->name,
 639                                ENGINE_READ(engine, RING_CTL),
 640                                ENGINE_READ(engine, RING_HEAD),
 641                                ENGINE_READ(engine, RING_TAIL),
 642                                ENGINE_READ(engine, RING_START));
 643
 644                if (!stop_ring(engine)) {
 645                        DRM_ERROR("failed to set %s head to zero "
 646                                  "ctl %08x head %08x tail %08x start %08x\n",
 647                                  engine->name,
 648                                  ENGINE_READ(engine, RING_CTL),
 649                                  ENGINE_READ(engine, RING_HEAD),
 650                                  ENGINE_READ(engine, RING_TAIL),
 651                                  ENGINE_READ(engine, RING_START));
 652                        ret = -EIO;
 653                        goto out;
 654                }
 655        }
 656
 657        if (HWS_NEEDS_PHYSICAL(dev_priv))
 658                ring_setup_phys_status_page(engine);
 659        else
 660                ring_setup_status_page(engine);
 661
 662        intel_engine_reset_breadcrumbs(engine);
 663
 664        /* Enforce ordering by reading HEAD register back */
 665        ENGINE_READ(engine, RING_HEAD);
 666
 667        /* Initialize the ring. This must happen _after_ we've cleared the ring
 668         * registers with the above sequence (the readback of the HEAD registers
 669         * also enforces ordering), otherwise the hw might lose the new ring
 670         * register values. */
 671        ENGINE_WRITE(engine, RING_START, i915_ggtt_offset(ring->vma));
 672
 673        /* WaClearRingBufHeadRegAtInit:ctg,elk */
 674        if (ENGINE_READ(engine, RING_HEAD))
 675                DRM_DEBUG_DRIVER("%s initialization failed [head=%08x], fudging\n",
 676                                 engine->name, ENGINE_READ(engine, RING_HEAD));
 677
 678        /* Check that the ring offsets point within the ring! */
 679        GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->head));
 680        GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));
 681        intel_ring_update_space(ring);
 682
 683        /* First wake the ring up to an empty/idle ring */
 684        ENGINE_WRITE(engine, RING_HEAD, ring->head);
 685        ENGINE_WRITE(engine, RING_TAIL, ring->head);
 686        ENGINE_POSTING_READ(engine, RING_TAIL);
 687
 688        ENGINE_WRITE(engine, RING_CTL, RING_CTL_SIZE(ring->size) | RING_VALID);
 689
 690        /* If the head is still not zero, the ring is dead */
 691        if (intel_wait_for_register(engine->uncore,
 692                                    RING_CTL(engine->mmio_base),
 693                                    RING_VALID, RING_VALID,
 694                                    50)) {
 695                DRM_ERROR("%s initialization failed "
 696                          "ctl %08x (valid? %d) head %08x [%08x] tail %08x [%08x] start %08x [expected %08x]\n",
 697                          engine->name,
 698                          ENGINE_READ(engine, RING_CTL),
 699                          ENGINE_READ(engine, RING_CTL) & RING_VALID,
 700                          ENGINE_READ(engine, RING_HEAD), ring->head,
 701                          ENGINE_READ(engine, RING_TAIL), ring->tail,
 702                          ENGINE_READ(engine, RING_START),
 703                          i915_ggtt_offset(ring->vma));
 704                ret = -EIO;
 705                goto out;
 706        }
 707
 708        if (INTEL_GEN(dev_priv) > 2)
 709                ENGINE_WRITE(engine,
 710                             RING_MI_MODE, _MASKED_BIT_DISABLE(STOP_RING));
 711
 712        /* Now awake, let it get started */
 713        if (ring->tail != ring->head) {
 714                ENGINE_WRITE(engine, RING_TAIL, ring->tail);
 715                ENGINE_POSTING_READ(engine, RING_TAIL);
 716        }
 717
 718        /* Papering over lost _interrupts_ immediately following the restart */
 719        intel_engine_queue_breadcrumbs(engine);
 720out:
 721        intel_uncore_forcewake_put(engine->uncore, FORCEWAKE_ALL);
 722
 723        return ret;
 724}
 725
 726static void reset_prepare(struct intel_engine_cs *engine)
 727{
 728        intel_engine_stop_cs(engine);
 729}
 730
 731static void reset_ring(struct intel_engine_cs *engine, bool stalled)
 732{
 733        struct i915_request *pos, *rq;
 734        unsigned long flags;
 735        u32 head;
 736
 737        rq = NULL;
 738        spin_lock_irqsave(&engine->active.lock, flags);
 739        list_for_each_entry(pos, &engine->active.requests, sched.link) {
 740                if (!i915_request_completed(pos)) {
 741                        rq = pos;
 742                        break;
 743                }
 744        }
 745
 746        /*
 747         * The guilty request will get skipped on a hung engine.
 748         *
 749         * Users of client default contexts do not rely on logical
 750         * state preserved between batches so it is safe to execute
 751         * queued requests following the hang. Non default contexts
 752         * rely on preserved state, so skipping a batch loses the
 753         * evolution of the state and it needs to be considered corrupted.
 754         * Executing more queued batches on top of corrupted state is
 755         * risky. But we take the risk by trying to advance through
 756         * the queued requests in order to make the client behaviour
 757         * more predictable around resets, by not throwing away random
 758         * amount of batches it has prepared for execution. Sophisticated
 759         * clients can use gem_reset_stats_ioctl and dma fence status
 760         * (exported via sync_file info ioctl on explicit fences) to observe
 761         * when it loses the context state and should rebuild accordingly.
 762         *
 763         * The context ban, and ultimately the client ban, mechanism are safety
 764         * valves if client submission ends up resulting in nothing more than
 765         * subsequent hangs.
 766         */
 767
 768        if (rq) {
 769                /*
 770                 * Try to restore the logical GPU state to match the
 771                 * continuation of the request queue. If we skip the
 772                 * context/PD restore, then the next request may try to execute
 773                 * assuming that its context is valid and loaded on the GPU and
 774                 * so may try to access invalid memory, prompting repeated GPU
 775                 * hangs.
 776                 *
 777                 * If the request was guilty, we still restore the logical
 778                 * state in case the next request requires it (e.g. the
 779                 * aliasing ppgtt), but skip over the hung batch.
 780                 *
 781                 * If the request was innocent, we try to replay the request
 782                 * with the restored context.
 783                 */
 784                i915_reset_request(rq, stalled);
 785
 786                GEM_BUG_ON(rq->ring != engine->buffer);
 787                head = rq->head;
 788        } else {
 789                head = engine->buffer->tail;
 790        }
 791        engine->buffer->head = intel_ring_wrap(engine->buffer, head);
 792
 793        spin_unlock_irqrestore(&engine->active.lock, flags);
 794}
 795
 796static void reset_finish(struct intel_engine_cs *engine)
 797{
 798}
 799
 800static int intel_rcs_ctx_init(struct i915_request *rq)
 801{
 802        int ret;
 803
 804        ret = intel_engine_emit_ctx_wa(rq);
 805        if (ret != 0)
 806                return ret;
 807
 808        ret = i915_gem_render_state_emit(rq);
 809        if (ret)
 810                return ret;
 811
 812        return 0;
 813}
 814
 815static int rcs_resume(struct intel_engine_cs *engine)
 816{
 817        struct drm_i915_private *dev_priv = engine->i915;
 818
 819        /*
 820         * Disable CONSTANT_BUFFER before it is loaded from the context
 821         * image. For as it is loaded, it is executed and the stored
 822         * address may no longer be valid, leading to a GPU hang.
 823         *
 824         * This imposes the requirement that userspace reload their
 825         * CONSTANT_BUFFER on every batch, fortunately a requirement
 826         * they are already accustomed to from before contexts were
 827         * enabled.
 828         */
 829        if (IS_GEN(dev_priv, 4))
 830                I915_WRITE(ECOSKPD,
 831                           _MASKED_BIT_ENABLE(ECO_CONSTANT_BUFFER_SR_DISABLE));
 832
 833        /* WaTimedSingleVertexDispatch:cl,bw,ctg,elk,ilk,snb */
 834        if (IS_GEN_RANGE(dev_priv, 4, 6))
 835                I915_WRITE(MI_MODE, _MASKED_BIT_ENABLE(VS_TIMER_DISPATCH));
 836
 837        /* We need to disable the AsyncFlip performance optimisations in order
 838         * to use MI_WAIT_FOR_EVENT within the CS. It should already be
 839         * programmed to '1' on all products.
 840         *
 841         * WaDisableAsyncFlipPerfMode:snb,ivb,hsw,vlv
 842         */
 843        if (IS_GEN_RANGE(dev_priv, 6, 7))
 844                I915_WRITE(MI_MODE, _MASKED_BIT_ENABLE(ASYNC_FLIP_PERF_DISABLE));
 845
 846        /* Required for the hardware to program scanline values for waiting */
 847        /* WaEnableFlushTlbInvalidationMode:snb */
 848        if (IS_GEN(dev_priv, 6))
 849                I915_WRITE(GFX_MODE,
 850                           _MASKED_BIT_ENABLE(GFX_TLB_INVALIDATE_EXPLICIT));
 851
 852        /* WaBCSVCSTlbInvalidationMode:ivb,vlv,hsw */
 853        if (IS_GEN(dev_priv, 7))
 854                I915_WRITE(GFX_MODE_GEN7,
 855                           _MASKED_BIT_ENABLE(GFX_TLB_INVALIDATE_EXPLICIT) |
 856                           _MASKED_BIT_ENABLE(GFX_REPLAY_MODE));
 857
 858        if (IS_GEN(dev_priv, 6)) {
 859                /* From the Sandybridge PRM, volume 1 part 3, page 24:
 860                 * "If this bit is set, STCunit will have LRA as replacement
 861                 *  policy. [...] This bit must be reset.  LRA replacement
 862                 *  policy is not supported."
 863                 */
 864                I915_WRITE(CACHE_MODE_0,
 865                           _MASKED_BIT_DISABLE(CM0_STC_EVICT_DISABLE_LRA_SNB));
 866        }
 867
 868        if (IS_GEN_RANGE(dev_priv, 6, 7))
 869                I915_WRITE(INSTPM, _MASKED_BIT_ENABLE(INSTPM_FORCE_ORDERING));
 870
 871        return xcs_resume(engine);
 872}
 873
 874static void cancel_requests(struct intel_engine_cs *engine)
 875{
 876        struct i915_request *request;
 877        unsigned long flags;
 878
 879        spin_lock_irqsave(&engine->active.lock, flags);
 880
 881        /* Mark all submitted requests as skipped. */
 882        list_for_each_entry(request, &engine->active.requests, sched.link) {
 883                if (!i915_request_signaled(request))
 884                        dma_fence_set_error(&request->fence, -EIO);
 885
 886                i915_request_mark_complete(request);
 887        }
 888
 889        /* Remaining _unready_ requests will be nop'ed when submitted */
 890
 891        spin_unlock_irqrestore(&engine->active.lock, flags);
 892}
 893
 894static void i9xx_submit_request(struct i915_request *request)
 895{
 896        i915_request_submit(request);
 897
 898        ENGINE_WRITE(request->engine, RING_TAIL,
 899                     intel_ring_set_tail(request->ring, request->tail));
 900}
 901
 902static u32 *i9xx_emit_breadcrumb(struct i915_request *rq, u32 *cs)
 903{
 904        GEM_BUG_ON(rq->timeline->hwsp_ggtt != rq->engine->status_page.vma);
 905        GEM_BUG_ON(offset_in_page(rq->timeline->hwsp_offset) != I915_GEM_HWS_SEQNO_ADDR);
 906
 907        *cs++ = MI_FLUSH;
 908
 909        *cs++ = MI_STORE_DWORD_INDEX;
 910        *cs++ = I915_GEM_HWS_SEQNO_ADDR;
 911        *cs++ = rq->fence.seqno;
 912
 913        *cs++ = MI_USER_INTERRUPT;
 914        *cs++ = MI_NOOP;
 915
 916        rq->tail = intel_ring_offset(rq, cs);
 917        assert_ring_tail_valid(rq->ring, rq->tail);
 918
 919        return cs;
 920}
 921
 922#define GEN5_WA_STORES 8 /* must be at least 1! */
 923static u32 *gen5_emit_breadcrumb(struct i915_request *rq, u32 *cs)
 924{
 925        int i;
 926
 927        GEM_BUG_ON(rq->timeline->hwsp_ggtt != rq->engine->status_page.vma);
 928        GEM_BUG_ON(offset_in_page(rq->timeline->hwsp_offset) != I915_GEM_HWS_SEQNO_ADDR);
 929
 930        *cs++ = MI_FLUSH;
 931
 932        BUILD_BUG_ON(GEN5_WA_STORES < 1);
 933        for (i = 0; i < GEN5_WA_STORES; i++) {
 934                *cs++ = MI_STORE_DWORD_INDEX;
 935                *cs++ = I915_GEM_HWS_SEQNO_ADDR;
 936                *cs++ = rq->fence.seqno;
 937        }
 938
 939        *cs++ = MI_USER_INTERRUPT;
 940
 941        rq->tail = intel_ring_offset(rq, cs);
 942        assert_ring_tail_valid(rq->ring, rq->tail);
 943
 944        return cs;
 945}
 946#undef GEN5_WA_STORES
 947
 948static void
 949gen5_irq_enable(struct intel_engine_cs *engine)
 950{
 951        gen5_enable_gt_irq(engine->i915, engine->irq_enable_mask);
 952}
 953
 954static void
 955gen5_irq_disable(struct intel_engine_cs *engine)
 956{
 957        gen5_disable_gt_irq(engine->i915, engine->irq_enable_mask);
 958}
 959
 960static void
 961i9xx_irq_enable(struct intel_engine_cs *engine)
 962{
 963        engine->i915->irq_mask &= ~engine->irq_enable_mask;
 964        intel_uncore_write(engine->uncore, GEN2_IMR, engine->i915->irq_mask);
 965        intel_uncore_posting_read_fw(engine->uncore, GEN2_IMR);
 966}
 967
 968static void
 969i9xx_irq_disable(struct intel_engine_cs *engine)
 970{
 971        engine->i915->irq_mask |= engine->irq_enable_mask;
 972        intel_uncore_write(engine->uncore, GEN2_IMR, engine->i915->irq_mask);
 973}
 974
 975static void
 976i8xx_irq_enable(struct intel_engine_cs *engine)
 977{
 978        struct drm_i915_private *i915 = engine->i915;
 979
 980        i915->irq_mask &= ~engine->irq_enable_mask;
 981        intel_uncore_write16(&i915->uncore, GEN2_IMR, i915->irq_mask);
 982        ENGINE_POSTING_READ16(engine, RING_IMR);
 983}
 984
 985static void
 986i8xx_irq_disable(struct intel_engine_cs *engine)
 987{
 988        struct drm_i915_private *i915 = engine->i915;
 989
 990        i915->irq_mask |= engine->irq_enable_mask;
 991        intel_uncore_write16(&i915->uncore, GEN2_IMR, i915->irq_mask);
 992}
 993
 994static int
 995bsd_ring_flush(struct i915_request *rq, u32 mode)
 996{
 997        u32 *cs;
 998
 999        cs = intel_ring_begin(rq, 2);
1000        if (IS_ERR(cs))
1001                return PTR_ERR(cs);
1002
1003        *cs++ = MI_FLUSH;
1004        *cs++ = MI_NOOP;
1005        intel_ring_advance(rq, cs);
1006        return 0;
1007}
1008
1009static void
1010gen6_irq_enable(struct intel_engine_cs *engine)
1011{
1012        ENGINE_WRITE(engine, RING_IMR,
1013                     ~(engine->irq_enable_mask | engine->irq_keep_mask));
1014
1015        /* Flush/delay to ensure the RING_IMR is active before the GT IMR */
1016        ENGINE_POSTING_READ(engine, RING_IMR);
1017
1018        gen5_enable_gt_irq(engine->i915, engine->irq_enable_mask);
1019}
1020
1021static void
1022gen6_irq_disable(struct intel_engine_cs *engine)
1023{
1024        ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask);
1025        gen5_disable_gt_irq(engine->i915, engine->irq_enable_mask);
1026}
1027
1028static void
1029hsw_vebox_irq_enable(struct intel_engine_cs *engine)
1030{
1031        ENGINE_WRITE(engine, RING_IMR, ~engine->irq_enable_mask);
1032
1033        /* Flush/delay to ensure the RING_IMR is active before the GT IMR */
1034        ENGINE_POSTING_READ(engine, RING_IMR);
1035
1036        gen6_unmask_pm_irq(engine->i915, engine->irq_enable_mask);
1037}
1038
1039static void
1040hsw_vebox_irq_disable(struct intel_engine_cs *engine)
1041{
1042        ENGINE_WRITE(engine, RING_IMR, ~0);
1043        gen6_mask_pm_irq(engine->i915, engine->irq_enable_mask);
1044}
1045
1046static int
1047i965_emit_bb_start(struct i915_request *rq,
1048                   u64 offset, u32 length,
1049                   unsigned int dispatch_flags)
1050{
1051        u32 *cs;
1052
1053        cs = intel_ring_begin(rq, 2);
1054        if (IS_ERR(cs))
1055                return PTR_ERR(cs);
1056
1057        *cs++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT | (dispatch_flags &
1058                I915_DISPATCH_SECURE ? 0 : MI_BATCH_NON_SECURE_I965);
1059        *cs++ = offset;
1060        intel_ring_advance(rq, cs);
1061
1062        return 0;
1063}
1064
1065/* Just userspace ABI convention to limit the wa batch bo to a resonable size */
1066#define I830_BATCH_LIMIT SZ_256K
1067#define I830_TLB_ENTRIES (2)
1068#define I830_WA_SIZE max(I830_TLB_ENTRIES*4096, I830_BATCH_LIMIT)
1069static int
1070i830_emit_bb_start(struct i915_request *rq,
1071                   u64 offset, u32 len,
1072                   unsigned int dispatch_flags)
1073{
1074        u32 *cs, cs_offset = i915_scratch_offset(rq->i915);
1075
1076        GEM_BUG_ON(rq->i915->gt.scratch->size < I830_WA_SIZE);
1077
1078        cs = intel_ring_begin(rq, 6);
1079        if (IS_ERR(cs))
1080                return PTR_ERR(cs);
1081
1082        /* Evict the invalid PTE TLBs */
1083        *cs++ = COLOR_BLT_CMD | BLT_WRITE_RGBA;
1084        *cs++ = BLT_DEPTH_32 | BLT_ROP_COLOR_COPY | 4096;
1085        *cs++ = I830_TLB_ENTRIES << 16 | 4; /* load each page */
1086        *cs++ = cs_offset;
1087        *cs++ = 0xdeadbeef;
1088        *cs++ = MI_NOOP;
1089        intel_ring_advance(rq, cs);
1090
1091        if ((dispatch_flags & I915_DISPATCH_PINNED) == 0) {
1092                if (len > I830_BATCH_LIMIT)
1093                        return -ENOSPC;
1094
1095                cs = intel_ring_begin(rq, 6 + 2);
1096                if (IS_ERR(cs))
1097                        return PTR_ERR(cs);
1098
1099                /* Blit the batch (which has now all relocs applied) to the
1100                 * stable batch scratch bo area (so that the CS never
1101                 * stumbles over its tlb invalidation bug) ...
1102                 */
1103                *cs++ = SRC_COPY_BLT_CMD | BLT_WRITE_RGBA;
1104                *cs++ = BLT_DEPTH_32 | BLT_ROP_SRC_COPY | 4096;
1105                *cs++ = DIV_ROUND_UP(len, 4096) << 16 | 4096;
1106                *cs++ = cs_offset;
1107                *cs++ = 4096;
1108                *cs++ = offset;
1109
1110                *cs++ = MI_FLUSH;
1111                *cs++ = MI_NOOP;
1112                intel_ring_advance(rq, cs);
1113
1114                /* ... and execute it. */
1115                offset = cs_offset;
1116        }
1117
1118        cs = intel_ring_begin(rq, 2);
1119        if (IS_ERR(cs))
1120                return PTR_ERR(cs);
1121
1122        *cs++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT;
1123        *cs++ = offset | (dispatch_flags & I915_DISPATCH_SECURE ? 0 :
1124                MI_BATCH_NON_SECURE);
1125        intel_ring_advance(rq, cs);
1126
1127        return 0;
1128}
1129
1130static int
1131i915_emit_bb_start(struct i915_request *rq,
1132                   u64 offset, u32 len,
1133                   unsigned int dispatch_flags)
1134{
1135        u32 *cs;
1136
1137        cs = intel_ring_begin(rq, 2);
1138        if (IS_ERR(cs))
1139                return PTR_ERR(cs);
1140
1141        *cs++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT;
1142        *cs++ = offset | (dispatch_flags & I915_DISPATCH_SECURE ? 0 :
1143                MI_BATCH_NON_SECURE);
1144        intel_ring_advance(rq, cs);
1145
1146        return 0;
1147}
1148
1149int intel_ring_pin(struct intel_ring *ring)
1150{
1151        struct i915_vma *vma = ring->vma;
1152        unsigned int flags;
1153        void *addr;
1154        int ret;
1155
1156        if (atomic_fetch_inc(&ring->pin_count))
1157                return 0;
1158
1159        ret = i915_timeline_pin(ring->timeline);
1160        if (ret)
1161                goto err_unpin;
1162
1163        flags = PIN_GLOBAL;
1164
1165        /* Ring wraparound at offset 0 sometimes hangs. No idea why. */
1166        flags |= PIN_OFFSET_BIAS | i915_ggtt_pin_bias(vma);
1167
1168        if (vma->obj->stolen)
1169                flags |= PIN_MAPPABLE;
1170        else
1171                flags |= PIN_HIGH;
1172
1173        ret = i915_vma_pin(vma, 0, 0, flags);
1174        if (unlikely(ret))
1175                goto err_timeline;
1176
1177        if (i915_vma_is_map_and_fenceable(vma))
1178                addr = (void __force *)i915_vma_pin_iomap(vma);
1179        else
1180                addr = i915_gem_object_pin_map(vma->obj,
1181                                               i915_coherent_map_type(vma->vm->i915));
1182        if (IS_ERR(addr)) {
1183                ret = PTR_ERR(addr);
1184                goto err_ring;
1185        }
1186
1187        vma->obj->pin_global++;
1188
1189        GEM_BUG_ON(ring->vaddr);
1190        ring->vaddr = addr;
1191
1192        return 0;
1193
1194err_ring:
1195        i915_vma_unpin(vma);
1196err_timeline:
1197        i915_timeline_unpin(ring->timeline);
1198err_unpin:
1199        atomic_dec(&ring->pin_count);
1200        return ret;
1201}
1202
1203void intel_ring_reset(struct intel_ring *ring, u32 tail)
1204{
1205        GEM_BUG_ON(!intel_ring_offset_valid(ring, tail));
1206
1207        ring->tail = tail;
1208        ring->head = tail;
1209        ring->emit = tail;
1210        intel_ring_update_space(ring);
1211}
1212
1213void intel_ring_unpin(struct intel_ring *ring)
1214{
1215        if (!atomic_dec_and_test(&ring->pin_count))
1216                return;
1217
1218        /* Discard any unused bytes beyond that submitted to hw. */
1219        intel_ring_reset(ring, ring->tail);
1220
1221        GEM_BUG_ON(!ring->vma);
1222        if (i915_vma_is_map_and_fenceable(ring->vma))
1223                i915_vma_unpin_iomap(ring->vma);
1224        else
1225                i915_gem_object_unpin_map(ring->vma->obj);
1226
1227        GEM_BUG_ON(!ring->vaddr);
1228        ring->vaddr = NULL;
1229
1230        ring->vma->obj->pin_global--;
1231        i915_vma_unpin(ring->vma);
1232
1233        i915_timeline_unpin(ring->timeline);
1234}
1235
1236static struct i915_vma *
1237intel_ring_create_vma(struct drm_i915_private *dev_priv, int size)
1238{
1239        struct i915_address_space *vm = &dev_priv->ggtt.vm;
1240        struct drm_i915_gem_object *obj;
1241        struct i915_vma *vma;
1242
1243        obj = i915_gem_object_create_stolen(dev_priv, size);
1244        if (!obj)
1245                obj = i915_gem_object_create_internal(dev_priv, size);
1246        if (IS_ERR(obj))
1247                return ERR_CAST(obj);
1248
1249        /*
1250         * Mark ring buffers as read-only from GPU side (so no stray overwrites)
1251         * if supported by the platform's GGTT.
1252         */
1253        if (vm->has_read_only)
1254                i915_gem_object_set_readonly(obj);
1255
1256        vma = i915_vma_instance(obj, vm, NULL);
1257        if (IS_ERR(vma))
1258                goto err;
1259
1260        return vma;
1261
1262err:
1263        i915_gem_object_put(obj);
1264        return vma;
1265}
1266
1267struct intel_ring *
1268intel_engine_create_ring(struct intel_engine_cs *engine,
1269                         struct i915_timeline *timeline,
1270                         int size)
1271{
1272        struct intel_ring *ring;
1273        struct i915_vma *vma;
1274
1275        GEM_BUG_ON(!is_power_of_2(size));
1276        GEM_BUG_ON(RING_CTL_SIZE(size) & ~RING_NR_PAGES);
1277
1278        ring = kzalloc(sizeof(*ring), GFP_KERNEL);
1279        if (!ring)
1280                return ERR_PTR(-ENOMEM);
1281
1282        kref_init(&ring->ref);
1283        INIT_LIST_HEAD(&ring->request_list);
1284        ring->timeline = i915_timeline_get(timeline);
1285
1286        ring->size = size;
1287        /* Workaround an erratum on the i830 which causes a hang if
1288         * the TAIL pointer points to within the last 2 cachelines
1289         * of the buffer.
1290         */
1291        ring->effective_size = size;
1292        if (IS_I830(engine->i915) || IS_I845G(engine->i915))
1293                ring->effective_size -= 2 * CACHELINE_BYTES;
1294
1295        intel_ring_update_space(ring);
1296
1297        vma = intel_ring_create_vma(engine->i915, size);
1298        if (IS_ERR(vma)) {
1299                kfree(ring);
1300                return ERR_CAST(vma);
1301        }
1302        ring->vma = vma;
1303
1304        return ring;
1305}
1306
1307void intel_ring_free(struct kref *ref)
1308{
1309        struct intel_ring *ring = container_of(ref, typeof(*ring), ref);
1310
1311        i915_vma_close(ring->vma);
1312        i915_vma_put(ring->vma);
1313
1314        i915_timeline_put(ring->timeline);
1315        kfree(ring);
1316}
1317
1318static void __ring_context_fini(struct intel_context *ce)
1319{
1320        GEM_BUG_ON(i915_gem_object_is_active(ce->state->obj));
1321        i915_gem_object_put(ce->state->obj);
1322}
1323
1324static void ring_context_destroy(struct kref *ref)
1325{
1326        struct intel_context *ce = container_of(ref, typeof(*ce), ref);
1327
1328        GEM_BUG_ON(intel_context_is_pinned(ce));
1329
1330        if (ce->state)
1331                __ring_context_fini(ce);
1332
1333        intel_context_free(ce);
1334}
1335
1336static int __context_pin_ppgtt(struct i915_gem_context *ctx)
1337{
1338        struct i915_address_space *vm;
1339        int err = 0;
1340
1341        vm = ctx->vm ?: &ctx->i915->mm.aliasing_ppgtt->vm;
1342        if (vm)
1343                err = gen6_ppgtt_pin(i915_vm_to_ppgtt((vm)));
1344
1345        return err;
1346}
1347
1348static void __context_unpin_ppgtt(struct i915_gem_context *ctx)
1349{
1350        struct i915_address_space *vm;
1351
1352        vm = ctx->vm ?: &ctx->i915->mm.aliasing_ppgtt->vm;
1353        if (vm)
1354                gen6_ppgtt_unpin(i915_vm_to_ppgtt(vm));
1355}
1356
1357static void ring_context_unpin(struct intel_context *ce)
1358{
1359        __context_unpin_ppgtt(ce->gem_context);
1360}
1361
1362static struct i915_vma *
1363alloc_context_vma(struct intel_engine_cs *engine)
1364{
1365        struct drm_i915_private *i915 = engine->i915;
1366        struct drm_i915_gem_object *obj;
1367        struct i915_vma *vma;
1368        int err;
1369
1370        obj = i915_gem_object_create_shmem(i915, engine->context_size);
1371        if (IS_ERR(obj))
1372                return ERR_CAST(obj);
1373
1374        /*
1375         * Try to make the context utilize L3 as well as LLC.
1376         *
1377         * On VLV we don't have L3 controls in the PTEs so we
1378         * shouldn't touch the cache level, especially as that
1379         * would make the object snooped which might have a
1380         * negative performance impact.
1381         *
1382         * Snooping is required on non-llc platforms in execlist
1383         * mode, but since all GGTT accesses use PAT entry 0 we
1384         * get snooping anyway regardless of cache_level.
1385         *
1386         * This is only applicable for Ivy Bridge devices since
1387         * later platforms don't have L3 control bits in the PTE.
1388         */
1389        if (IS_IVYBRIDGE(i915))
1390                i915_gem_object_set_cache_coherency(obj, I915_CACHE_L3_LLC);
1391
1392        if (engine->default_state) {
1393                void *defaults, *vaddr;
1394
1395                vaddr = i915_gem_object_pin_map(obj, I915_MAP_WB);
1396                if (IS_ERR(vaddr)) {
1397                        err = PTR_ERR(vaddr);
1398                        goto err_obj;
1399                }
1400
1401                defaults = i915_gem_object_pin_map(engine->default_state,
1402                                                   I915_MAP_WB);
1403                if (IS_ERR(defaults)) {
1404                        err = PTR_ERR(defaults);
1405                        goto err_map;
1406                }
1407
1408                memcpy(vaddr, defaults, engine->context_size);
1409                i915_gem_object_unpin_map(engine->default_state);
1410
1411                i915_gem_object_flush_map(obj);
1412                i915_gem_object_unpin_map(obj);
1413        }
1414
1415        vma = i915_vma_instance(obj, &i915->ggtt.vm, NULL);
1416        if (IS_ERR(vma)) {
1417                err = PTR_ERR(vma);
1418                goto err_obj;
1419        }
1420
1421        return vma;
1422
1423err_map:
1424        i915_gem_object_unpin_map(obj);
1425err_obj:
1426        i915_gem_object_put(obj);
1427        return ERR_PTR(err);
1428}
1429
1430static int ring_context_pin(struct intel_context *ce)
1431{
1432        struct intel_engine_cs *engine = ce->engine;
1433        int err;
1434
1435        /* One ringbuffer to rule them all */
1436        GEM_BUG_ON(!engine->buffer);
1437        ce->ring = engine->buffer;
1438
1439        if (!ce->state && engine->context_size) {
1440                struct i915_vma *vma;
1441
1442                vma = alloc_context_vma(engine);
1443                if (IS_ERR(vma))
1444                        return PTR_ERR(vma);
1445
1446                ce->state = vma;
1447        }
1448
1449        err = intel_context_active_acquire(ce, PIN_HIGH);
1450        if (err)
1451                return err;
1452
1453        err = __context_pin_ppgtt(ce->gem_context);
1454        if (err)
1455                goto err_active;
1456
1457        return 0;
1458
1459err_active:
1460        intel_context_active_release(ce);
1461        return err;
1462}
1463
1464static void ring_context_reset(struct intel_context *ce)
1465{
1466        intel_ring_reset(ce->ring, 0);
1467}
1468
1469static const struct intel_context_ops ring_context_ops = {
1470        .pin = ring_context_pin,
1471        .unpin = ring_context_unpin,
1472
1473        .enter = intel_context_enter_engine,
1474        .exit = intel_context_exit_engine,
1475
1476        .reset = ring_context_reset,
1477        .destroy = ring_context_destroy,
1478};
1479
1480static int load_pd_dir(struct i915_request *rq, const struct i915_ppgtt *ppgtt)
1481{
1482        const struct intel_engine_cs * const engine = rq->engine;
1483        u32 *cs;
1484
1485        cs = intel_ring_begin(rq, 6);
1486        if (IS_ERR(cs))
1487                return PTR_ERR(cs);
1488
1489        *cs++ = MI_LOAD_REGISTER_IMM(1);
1490        *cs++ = i915_mmio_reg_offset(RING_PP_DIR_DCLV(engine->mmio_base));
1491        *cs++ = PP_DIR_DCLV_2G;
1492
1493        *cs++ = MI_LOAD_REGISTER_IMM(1);
1494        *cs++ = i915_mmio_reg_offset(RING_PP_DIR_BASE(engine->mmio_base));
1495        *cs++ = ppgtt->pd->base.ggtt_offset << 10;
1496
1497        intel_ring_advance(rq, cs);
1498
1499        return 0;
1500}
1501
1502static int flush_pd_dir(struct i915_request *rq)
1503{
1504        const struct intel_engine_cs * const engine = rq->engine;
1505        u32 *cs;
1506
1507        cs = intel_ring_begin(rq, 4);
1508        if (IS_ERR(cs))
1509                return PTR_ERR(cs);
1510
1511        /* Stall until the page table load is complete */
1512        *cs++ = MI_STORE_REGISTER_MEM | MI_SRM_LRM_GLOBAL_GTT;
1513        *cs++ = i915_mmio_reg_offset(RING_PP_DIR_BASE(engine->mmio_base));
1514        *cs++ = i915_scratch_offset(rq->i915);
1515        *cs++ = MI_NOOP;
1516
1517        intel_ring_advance(rq, cs);
1518        return 0;
1519}
1520
1521static inline int mi_set_context(struct i915_request *rq, u32 flags)
1522{
1523        struct drm_i915_private *i915 = rq->i915;
1524        struct intel_engine_cs *engine = rq->engine;
1525        enum intel_engine_id id;
1526        const int num_engines =
1527                IS_HSW_GT1(i915) ? RUNTIME_INFO(i915)->num_engines - 1 : 0;
1528        bool force_restore = false;
1529        int len;
1530        u32 *cs;
1531
1532        flags |= MI_MM_SPACE_GTT;
1533        if (IS_HASWELL(i915))
1534                /* These flags are for resource streamer on HSW+ */
1535                flags |= HSW_MI_RS_SAVE_STATE_EN | HSW_MI_RS_RESTORE_STATE_EN;
1536        else
1537                /* We need to save the extended state for powersaving modes */
1538                flags |= MI_SAVE_EXT_STATE_EN | MI_RESTORE_EXT_STATE_EN;
1539
1540        len = 4;
1541        if (IS_GEN(i915, 7))
1542                len += 2 + (num_engines ? 4 * num_engines + 6 : 0);
1543        else if (IS_GEN(i915, 5))
1544                len += 2;
1545        if (flags & MI_FORCE_RESTORE) {
1546                GEM_BUG_ON(flags & MI_RESTORE_INHIBIT);
1547                flags &= ~MI_FORCE_RESTORE;
1548                force_restore = true;
1549                len += 2;
1550        }
1551
1552        cs = intel_ring_begin(rq, len);
1553        if (IS_ERR(cs))
1554                return PTR_ERR(cs);
1555
1556        /* WaProgramMiArbOnOffAroundMiSetContext:ivb,vlv,hsw,bdw,chv */
1557        if (IS_GEN(i915, 7)) {
1558                *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
1559                if (num_engines) {
1560                        struct intel_engine_cs *signaller;
1561
1562                        *cs++ = MI_LOAD_REGISTER_IMM(num_engines);
1563                        for_each_engine(signaller, i915, id) {
1564                                if (signaller == engine)
1565                                        continue;
1566
1567                                *cs++ = i915_mmio_reg_offset(
1568                                           RING_PSMI_CTL(signaller->mmio_base));
1569                                *cs++ = _MASKED_BIT_ENABLE(
1570                                                GEN6_PSMI_SLEEP_MSG_DISABLE);
1571                        }
1572                }
1573        } else if (IS_GEN(i915, 5)) {
1574                /*
1575                 * This w/a is only listed for pre-production ilk a/b steppings,
1576                 * but is also mentioned for programming the powerctx. To be
1577                 * safe, just apply the workaround; we do not use SyncFlush so
1578                 * this should never take effect and so be a no-op!
1579                 */
1580                *cs++ = MI_SUSPEND_FLUSH | MI_SUSPEND_FLUSH_EN;
1581        }
1582
1583        if (force_restore) {
1584                /*
1585                 * The HW doesn't handle being told to restore the current
1586                 * context very well. Quite often it likes goes to go off and
1587                 * sulk, especially when it is meant to be reloading PP_DIR.
1588                 * A very simple fix to force the reload is to simply switch
1589                 * away from the current context and back again.
1590                 *
1591                 * Note that the kernel_context will contain random state
1592                 * following the INHIBIT_RESTORE. We accept this since we
1593                 * never use the kernel_context state; it is merely a
1594                 * placeholder we use to flush other contexts.
1595                 */
1596                *cs++ = MI_SET_CONTEXT;
1597                *cs++ = i915_ggtt_offset(engine->kernel_context->state) |
1598                        MI_MM_SPACE_GTT |
1599                        MI_RESTORE_INHIBIT;
1600        }
1601
1602        *cs++ = MI_NOOP;
1603        *cs++ = MI_SET_CONTEXT;
1604        *cs++ = i915_ggtt_offset(rq->hw_context->state) | flags;
1605        /*
1606         * w/a: MI_SET_CONTEXT must always be followed by MI_NOOP
1607         * WaMiSetContext_Hang:snb,ivb,vlv
1608         */
1609        *cs++ = MI_NOOP;
1610
1611        if (IS_GEN(i915, 7)) {
1612                if (num_engines) {
1613                        struct intel_engine_cs *signaller;
1614                        i915_reg_t last_reg = {}; /* keep gcc quiet */
1615
1616                        *cs++ = MI_LOAD_REGISTER_IMM(num_engines);
1617                        for_each_engine(signaller, i915, id) {
1618                                if (signaller == engine)
1619                                        continue;
1620
1621                                last_reg = RING_PSMI_CTL(signaller->mmio_base);
1622                                *cs++ = i915_mmio_reg_offset(last_reg);
1623                                *cs++ = _MASKED_BIT_DISABLE(
1624                                                GEN6_PSMI_SLEEP_MSG_DISABLE);
1625                        }
1626
1627                        /* Insert a delay before the next switch! */
1628                        *cs++ = MI_STORE_REGISTER_MEM | MI_SRM_LRM_GLOBAL_GTT;
1629                        *cs++ = i915_mmio_reg_offset(last_reg);
1630                        *cs++ = i915_scratch_offset(rq->i915);
1631                        *cs++ = MI_NOOP;
1632                }
1633                *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
1634        } else if (IS_GEN(i915, 5)) {
1635                *cs++ = MI_SUSPEND_FLUSH;
1636        }
1637
1638        intel_ring_advance(rq, cs);
1639
1640        return 0;
1641}
1642
1643static int remap_l3(struct i915_request *rq, int slice)
1644{
1645        u32 *cs, *remap_info = rq->i915->l3_parity.remap_info[slice];
1646        int i;
1647
1648        if (!remap_info)
1649                return 0;
1650
1651        cs = intel_ring_begin(rq, GEN7_L3LOG_SIZE/4 * 2 + 2);
1652        if (IS_ERR(cs))
1653                return PTR_ERR(cs);
1654
1655        /*
1656         * Note: We do not worry about the concurrent register cacheline hang
1657         * here because no other code should access these registers other than
1658         * at initialization time.
1659         */
1660        *cs++ = MI_LOAD_REGISTER_IMM(GEN7_L3LOG_SIZE/4);
1661        for (i = 0; i < GEN7_L3LOG_SIZE/4; i++) {
1662                *cs++ = i915_mmio_reg_offset(GEN7_L3LOG(slice, i));
1663                *cs++ = remap_info[i];
1664        }
1665        *cs++ = MI_NOOP;
1666        intel_ring_advance(rq, cs);
1667
1668        return 0;
1669}
1670
1671static int switch_context(struct i915_request *rq)
1672{
1673        struct intel_engine_cs *engine = rq->engine;
1674        struct i915_gem_context *ctx = rq->gem_context;
1675        struct i915_address_space *vm =
1676                ctx->vm ?: &rq->i915->mm.aliasing_ppgtt->vm;
1677        unsigned int unwind_mm = 0;
1678        u32 hw_flags = 0;
1679        int ret, i;
1680
1681        GEM_BUG_ON(HAS_EXECLISTS(rq->i915));
1682
1683        if (vm) {
1684                struct i915_ppgtt *ppgtt = i915_vm_to_ppgtt(vm);
1685                int loops;
1686
1687                /*
1688                 * Baytail takes a little more convincing that it really needs
1689                 * to reload the PD between contexts. It is not just a little
1690                 * longer, as adding more stalls after the load_pd_dir (i.e.
1691                 * adding a long loop around flush_pd_dir) is not as effective
1692                 * as reloading the PD umpteen times. 32 is derived from
1693                 * experimentation (gem_exec_parallel/fds) and has no good
1694                 * explanation.
1695                 */
1696                loops = 1;
1697                if (engine->id == BCS0 && IS_VALLEYVIEW(engine->i915))
1698                        loops = 32;
1699
1700                do {
1701                        ret = load_pd_dir(rq, ppgtt);
1702                        if (ret)
1703                                goto err;
1704                } while (--loops);
1705
1706                if (ppgtt->pd_dirty_engines & engine->mask) {
1707                        unwind_mm = engine->mask;
1708                        ppgtt->pd_dirty_engines &= ~unwind_mm;
1709                        hw_flags = MI_FORCE_RESTORE;
1710                }
1711        }
1712
1713        if (rq->hw_context->state) {
1714                GEM_BUG_ON(engine->id != RCS0);
1715
1716                /*
1717                 * The kernel context(s) is treated as pure scratch and is not
1718                 * expected to retain any state (as we sacrifice it during
1719                 * suspend and on resume it may be corrupted). This is ok,
1720                 * as nothing actually executes using the kernel context; it
1721                 * is purely used for flushing user contexts.
1722                 */
1723                if (i915_gem_context_is_kernel(ctx))
1724                        hw_flags = MI_RESTORE_INHIBIT;
1725
1726                ret = mi_set_context(rq, hw_flags);
1727                if (ret)
1728                        goto err_mm;
1729        }
1730
1731        if (vm) {
1732                ret = engine->emit_flush(rq, EMIT_INVALIDATE);
1733                if (ret)
1734                        goto err_mm;
1735
1736                ret = flush_pd_dir(rq);
1737                if (ret)
1738                        goto err_mm;
1739
1740                /*
1741                 * Not only do we need a full barrier (post-sync write) after
1742                 * invalidating the TLBs, but we need to wait a little bit
1743                 * longer. Whether this is merely delaying us, or the
1744                 * subsequent flush is a key part of serialising with the
1745                 * post-sync op, this extra pass appears vital before a
1746                 * mm switch!
1747                 */
1748                ret = engine->emit_flush(rq, EMIT_INVALIDATE);
1749                if (ret)
1750                        goto err_mm;
1751
1752                ret = engine->emit_flush(rq, EMIT_FLUSH);
1753                if (ret)
1754                        goto err_mm;
1755        }
1756
1757        if (ctx->remap_slice) {
1758                for (i = 0; i < MAX_L3_SLICES; i++) {
1759                        if (!(ctx->remap_slice & BIT(i)))
1760                                continue;
1761
1762                        ret = remap_l3(rq, i);
1763                        if (ret)
1764                                goto err_mm;
1765                }
1766
1767                ctx->remap_slice = 0;
1768        }
1769
1770        return 0;
1771
1772err_mm:
1773        if (unwind_mm)
1774                i915_vm_to_ppgtt(vm)->pd_dirty_engines |= unwind_mm;
1775err:
1776        return ret;
1777}
1778
1779static int ring_request_alloc(struct i915_request *request)
1780{
1781        int ret;
1782
1783        GEM_BUG_ON(!intel_context_is_pinned(request->hw_context));
1784        GEM_BUG_ON(request->timeline->has_initial_breadcrumb);
1785
1786        /*
1787         * Flush enough space to reduce the likelihood of waiting after
1788         * we start building the request - in which case we will just
1789         * have to repeat work.
1790         */
1791        request->reserved_space += LEGACY_REQUEST_SIZE;
1792
1793        /* Unconditionally invalidate GPU caches and TLBs. */
1794        ret = request->engine->emit_flush(request, EMIT_INVALIDATE);
1795        if (ret)
1796                return ret;
1797
1798        ret = switch_context(request);
1799        if (ret)
1800                return ret;
1801
1802        request->reserved_space -= LEGACY_REQUEST_SIZE;
1803        return 0;
1804}
1805
1806static noinline int wait_for_space(struct intel_ring *ring, unsigned int bytes)
1807{
1808        struct i915_request *target;
1809        long timeout;
1810
1811        if (intel_ring_update_space(ring) >= bytes)
1812                return 0;
1813
1814        GEM_BUG_ON(list_empty(&ring->request_list));
1815        list_for_each_entry(target, &ring->request_list, ring_link) {
1816                /* Would completion of this request free enough space? */
1817                if (bytes <= __intel_ring_space(target->postfix,
1818                                                ring->emit, ring->size))
1819                        break;
1820        }
1821
1822        if (WARN_ON(&target->ring_link == &ring->request_list))
1823                return -ENOSPC;
1824
1825        timeout = i915_request_wait(target,
1826                                    I915_WAIT_INTERRUPTIBLE,
1827                                    MAX_SCHEDULE_TIMEOUT);
1828        if (timeout < 0)
1829                return timeout;
1830
1831        i915_request_retire_upto(target);
1832
1833        intel_ring_update_space(ring);
1834        GEM_BUG_ON(ring->space < bytes);
1835        return 0;
1836}
1837
1838u32 *intel_ring_begin(struct i915_request *rq, unsigned int num_dwords)
1839{
1840        struct intel_ring *ring = rq->ring;
1841        const unsigned int remain_usable = ring->effective_size - ring->emit;
1842        const unsigned int bytes = num_dwords * sizeof(u32);
1843        unsigned int need_wrap = 0;
1844        unsigned int total_bytes;
1845        u32 *cs;
1846
1847        /* Packets must be qword aligned. */
1848        GEM_BUG_ON(num_dwords & 1);
1849
1850        total_bytes = bytes + rq->reserved_space;
1851        GEM_BUG_ON(total_bytes > ring->effective_size);
1852
1853        if (unlikely(total_bytes > remain_usable)) {
1854                const int remain_actual = ring->size - ring->emit;
1855
1856                if (bytes > remain_usable) {
1857                        /*
1858                         * Not enough space for the basic request. So need to
1859                         * flush out the remainder and then wait for
1860                         * base + reserved.
1861                         */
1862                        total_bytes += remain_actual;
1863                        need_wrap = remain_actual | 1;
1864                } else  {
1865                        /*
1866                         * The base request will fit but the reserved space
1867                         * falls off the end. So we don't need an immediate
1868                         * wrap and only need to effectively wait for the
1869                         * reserved size from the start of ringbuffer.
1870                         */
1871                        total_bytes = rq->reserved_space + remain_actual;
1872                }
1873        }
1874
1875        if (unlikely(total_bytes > ring->space)) {
1876                int ret;
1877
1878                /*
1879                 * Space is reserved in the ringbuffer for finalising the
1880                 * request, as that cannot be allowed to fail. During request
1881                 * finalisation, reserved_space is set to 0 to stop the
1882                 * overallocation and the assumption is that then we never need
1883                 * to wait (which has the risk of failing with EINTR).
1884                 *
1885                 * See also i915_request_alloc() and i915_request_add().
1886                 */
1887                GEM_BUG_ON(!rq->reserved_space);
1888
1889                ret = wait_for_space(ring, total_bytes);
1890                if (unlikely(ret))
1891                        return ERR_PTR(ret);
1892        }
1893
1894        if (unlikely(need_wrap)) {
1895                need_wrap &= ~1;
1896                GEM_BUG_ON(need_wrap > ring->space);
1897                GEM_BUG_ON(ring->emit + need_wrap > ring->size);
1898                GEM_BUG_ON(!IS_ALIGNED(need_wrap, sizeof(u64)));
1899
1900                /* Fill the tail with MI_NOOP */
1901                memset64(ring->vaddr + ring->emit, 0, need_wrap / sizeof(u64));
1902                ring->space -= need_wrap;
1903                ring->emit = 0;
1904        }
1905
1906        GEM_BUG_ON(ring->emit > ring->size - bytes);
1907        GEM_BUG_ON(ring->space < bytes);
1908        cs = ring->vaddr + ring->emit;
1909        GEM_DEBUG_EXEC(memset32(cs, POISON_INUSE, bytes / sizeof(*cs)));
1910        ring->emit += bytes;
1911        ring->space -= bytes;
1912
1913        return cs;
1914}
1915
1916/* Align the ring tail to a cacheline boundary */
1917int intel_ring_cacheline_align(struct i915_request *rq)
1918{
1919        int num_dwords;
1920        void *cs;
1921
1922        num_dwords = (rq->ring->emit & (CACHELINE_BYTES - 1)) / sizeof(u32);
1923        if (num_dwords == 0)
1924                return 0;
1925
1926        num_dwords = CACHELINE_DWORDS - num_dwords;
1927        GEM_BUG_ON(num_dwords & 1);
1928
1929        cs = intel_ring_begin(rq, num_dwords);
1930        if (IS_ERR(cs))
1931                return PTR_ERR(cs);
1932
1933        memset64(cs, (u64)MI_NOOP << 32 | MI_NOOP, num_dwords / 2);
1934        intel_ring_advance(rq, cs);
1935
1936        GEM_BUG_ON(rq->ring->emit & (CACHELINE_BYTES - 1));
1937        return 0;
1938}
1939
1940static void gen6_bsd_submit_request(struct i915_request *request)
1941{
1942        struct intel_uncore *uncore = request->engine->uncore;
1943
1944        intel_uncore_forcewake_get(uncore, FORCEWAKE_ALL);
1945
1946       /* Every tail move must follow the sequence below */
1947
1948        /* Disable notification that the ring is IDLE. The GT
1949         * will then assume that it is busy and bring it out of rc6.
1950         */
1951        intel_uncore_write_fw(uncore, GEN6_BSD_SLEEP_PSMI_CONTROL,
1952                              _MASKED_BIT_ENABLE(GEN6_BSD_SLEEP_MSG_DISABLE));
1953
1954        /* Clear the context id. Here be magic! */
1955        intel_uncore_write64_fw(uncore, GEN6_BSD_RNCID, 0x0);
1956
1957        /* Wait for the ring not to be idle, i.e. for it to wake up. */
1958        if (__intel_wait_for_register_fw(uncore,
1959                                         GEN6_BSD_SLEEP_PSMI_CONTROL,
1960                                         GEN6_BSD_SLEEP_INDICATOR,
1961                                         0,
1962                                         1000, 0, NULL))
1963                DRM_ERROR("timed out waiting for the BSD ring to wake up\n");
1964
1965        /* Now that the ring is fully powered up, update the tail */
1966        i9xx_submit_request(request);
1967
1968        /* Let the ring send IDLE messages to the GT again,
1969         * and so let it sleep to conserve power when idle.
1970         */
1971        intel_uncore_write_fw(uncore, GEN6_BSD_SLEEP_PSMI_CONTROL,
1972                              _MASKED_BIT_DISABLE(GEN6_BSD_SLEEP_MSG_DISABLE));
1973
1974        intel_uncore_forcewake_put(uncore, FORCEWAKE_ALL);
1975}
1976
1977static int mi_flush_dw(struct i915_request *rq, u32 flags)
1978{
1979        u32 cmd, *cs;
1980
1981        cs = intel_ring_begin(rq, 4);
1982        if (IS_ERR(cs))
1983                return PTR_ERR(cs);
1984
1985        cmd = MI_FLUSH_DW;
1986
1987        /*
1988         * We always require a command barrier so that subsequent
1989         * commands, such as breadcrumb interrupts, are strictly ordered
1990         * wrt the contents of the write cache being flushed to memory
1991         * (and thus being coherent from the CPU).
1992         */
1993        cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
1994
1995        /*
1996         * Bspec vol 1c.3 - blitter engine command streamer:
1997         * "If ENABLED, all TLBs will be invalidated once the flush
1998         * operation is complete. This bit is only valid when the
1999         * Post-Sync Operation field is a value of 1h or 3h."
2000         */
2001        cmd |= flags;
2002
2003        *cs++ = cmd;
2004        *cs++ = I915_GEM_HWS_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT;
2005        *cs++ = 0;
2006        *cs++ = MI_NOOP;
2007
2008        intel_ring_advance(rq, cs);
2009
2010        return 0;
2011}
2012
2013static int gen6_flush_dw(struct i915_request *rq, u32 mode, u32 invflags)
2014{
2015        return mi_flush_dw(rq, mode & EMIT_INVALIDATE ? invflags : 0);
2016}
2017
2018static int gen6_bsd_ring_flush(struct i915_request *rq, u32 mode)
2019{
2020        return gen6_flush_dw(rq, mode, MI_INVALIDATE_TLB | MI_INVALIDATE_BSD);
2021}
2022
2023static int
2024hsw_emit_bb_start(struct i915_request *rq,
2025                  u64 offset, u32 len,
2026                  unsigned int dispatch_flags)
2027{
2028        u32 *cs;
2029
2030        cs = intel_ring_begin(rq, 2);
2031        if (IS_ERR(cs))
2032                return PTR_ERR(cs);
2033
2034        *cs++ = MI_BATCH_BUFFER_START | (dispatch_flags & I915_DISPATCH_SECURE ?
2035                0 : MI_BATCH_PPGTT_HSW | MI_BATCH_NON_SECURE_HSW);
2036        /* bit0-7 is the length on GEN6+ */
2037        *cs++ = offset;
2038        intel_ring_advance(rq, cs);
2039
2040        return 0;
2041}
2042
2043static int
2044gen6_emit_bb_start(struct i915_request *rq,
2045                   u64 offset, u32 len,
2046                   unsigned int dispatch_flags)
2047{
2048        u32 *cs;
2049
2050        cs = intel_ring_begin(rq, 2);
2051        if (IS_ERR(cs))
2052                return PTR_ERR(cs);
2053
2054        *cs++ = MI_BATCH_BUFFER_START | (dispatch_flags & I915_DISPATCH_SECURE ?
2055                0 : MI_BATCH_NON_SECURE_I965);
2056        /* bit0-7 is the length on GEN6+ */
2057        *cs++ = offset;
2058        intel_ring_advance(rq, cs);
2059
2060        return 0;
2061}
2062
2063/* Blitter support (SandyBridge+) */
2064
2065static int gen6_ring_flush(struct i915_request *rq, u32 mode)
2066{
2067        return gen6_flush_dw(rq, mode, MI_INVALIDATE_TLB);
2068}
2069
2070static void i9xx_set_default_submission(struct intel_engine_cs *engine)
2071{
2072        engine->submit_request = i9xx_submit_request;
2073        engine->cancel_requests = cancel_requests;
2074
2075        engine->park = NULL;
2076        engine->unpark = NULL;
2077}
2078
2079static void gen6_bsd_set_default_submission(struct intel_engine_cs *engine)
2080{
2081        i9xx_set_default_submission(engine);
2082        engine->submit_request = gen6_bsd_submit_request;
2083}
2084
2085static void ring_destroy(struct intel_engine_cs *engine)
2086{
2087        struct drm_i915_private *dev_priv = engine->i915;
2088
2089        WARN_ON(INTEL_GEN(dev_priv) > 2 &&
2090                (ENGINE_READ(engine, RING_MI_MODE) & MODE_IDLE) == 0);
2091
2092        intel_engine_cleanup_common(engine);
2093
2094        intel_ring_unpin(engine->buffer);
2095        intel_ring_put(engine->buffer);
2096
2097        kfree(engine);
2098}
2099
2100static void setup_irq(struct intel_engine_cs *engine)
2101{
2102        struct drm_i915_private *i915 = engine->i915;
2103
2104        if (INTEL_GEN(i915) >= 6) {
2105                engine->irq_enable = gen6_irq_enable;
2106                engine->irq_disable = gen6_irq_disable;
2107        } else if (INTEL_GEN(i915) >= 5) {
2108                engine->irq_enable = gen5_irq_enable;
2109                engine->irq_disable = gen5_irq_disable;
2110        } else if (INTEL_GEN(i915) >= 3) {
2111                engine->irq_enable = i9xx_irq_enable;
2112                engine->irq_disable = i9xx_irq_disable;
2113        } else {
2114                engine->irq_enable = i8xx_irq_enable;
2115                engine->irq_disable = i8xx_irq_disable;
2116        }
2117}
2118
2119static void setup_common(struct intel_engine_cs *engine)
2120{
2121        struct drm_i915_private *i915 = engine->i915;
2122
2123        /* gen8+ are only supported with execlists */
2124        GEM_BUG_ON(INTEL_GEN(i915) >= 8);
2125
2126        setup_irq(engine);
2127
2128        engine->destroy = ring_destroy;
2129
2130        engine->resume = xcs_resume;
2131        engine->reset.prepare = reset_prepare;
2132        engine->reset.reset = reset_ring;
2133        engine->reset.finish = reset_finish;
2134
2135        engine->cops = &ring_context_ops;
2136        engine->request_alloc = ring_request_alloc;
2137
2138        /*
2139         * Using a global execution timeline; the previous final breadcrumb is
2140         * equivalent to our next initial bread so we can elide
2141         * engine->emit_init_breadcrumb().
2142         */
2143        engine->emit_fini_breadcrumb = i9xx_emit_breadcrumb;
2144        if (IS_GEN(i915, 5))
2145                engine->emit_fini_breadcrumb = gen5_emit_breadcrumb;
2146
2147        engine->set_default_submission = i9xx_set_default_submission;
2148
2149        if (INTEL_GEN(i915) >= 6)
2150                engine->emit_bb_start = gen6_emit_bb_start;
2151        else if (INTEL_GEN(i915) >= 4)
2152                engine->emit_bb_start = i965_emit_bb_start;
2153        else if (IS_I830(i915) || IS_I845G(i915))
2154                engine->emit_bb_start = i830_emit_bb_start;
2155        else
2156                engine->emit_bb_start = i915_emit_bb_start;
2157}
2158
2159static void setup_rcs(struct intel_engine_cs *engine)
2160{
2161        struct drm_i915_private *i915 = engine->i915;
2162
2163        if (HAS_L3_DPF(i915))
2164                engine->irq_keep_mask = GT_RENDER_L3_PARITY_ERROR_INTERRUPT;
2165
2166        engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT;
2167
2168        if (INTEL_GEN(i915) >= 7) {
2169                engine->init_context = intel_rcs_ctx_init;
2170                engine->emit_flush = gen7_render_ring_flush;
2171                engine->emit_fini_breadcrumb = gen7_rcs_emit_breadcrumb;
2172        } else if (IS_GEN(i915, 6)) {
2173                engine->init_context = intel_rcs_ctx_init;
2174                engine->emit_flush = gen6_render_ring_flush;
2175                engine->emit_fini_breadcrumb = gen6_rcs_emit_breadcrumb;
2176        } else if (IS_GEN(i915, 5)) {
2177                engine->emit_flush = gen4_render_ring_flush;
2178        } else {
2179                if (INTEL_GEN(i915) < 4)
2180                        engine->emit_flush = gen2_render_ring_flush;
2181                else
2182                        engine->emit_flush = gen4_render_ring_flush;
2183                engine->irq_enable_mask = I915_USER_INTERRUPT;
2184        }
2185
2186        if (IS_HASWELL(i915))
2187                engine->emit_bb_start = hsw_emit_bb_start;
2188
2189        engine->resume = rcs_resume;
2190}
2191
2192static void setup_vcs(struct intel_engine_cs *engine)
2193{
2194        struct drm_i915_private *i915 = engine->i915;
2195
2196        if (INTEL_GEN(i915) >= 6) {
2197                /* gen6 bsd needs a special wa for tail updates */
2198                if (IS_GEN(i915, 6))
2199                        engine->set_default_submission = gen6_bsd_set_default_submission;
2200                engine->emit_flush = gen6_bsd_ring_flush;
2201                engine->irq_enable_mask = GT_BSD_USER_INTERRUPT;
2202
2203                if (IS_GEN(i915, 6))
2204                        engine->emit_fini_breadcrumb = gen6_xcs_emit_breadcrumb;
2205                else
2206                        engine->emit_fini_breadcrumb = gen7_xcs_emit_breadcrumb;
2207        } else {
2208                engine->emit_flush = bsd_ring_flush;
2209                if (IS_GEN(i915, 5))
2210                        engine->irq_enable_mask = ILK_BSD_USER_INTERRUPT;
2211                else
2212                        engine->irq_enable_mask = I915_BSD_USER_INTERRUPT;
2213        }
2214}
2215
2216static void setup_bcs(struct intel_engine_cs *engine)
2217{
2218        struct drm_i915_private *i915 = engine->i915;
2219
2220        engine->emit_flush = gen6_ring_flush;
2221        engine->irq_enable_mask = GT_BLT_USER_INTERRUPT;
2222
2223        if (IS_GEN(i915, 6))
2224                engine->emit_fini_breadcrumb = gen6_xcs_emit_breadcrumb;
2225        else
2226                engine->emit_fini_breadcrumb = gen7_xcs_emit_breadcrumb;
2227}
2228
2229static void setup_vecs(struct intel_engine_cs *engine)
2230{
2231        struct drm_i915_private *i915 = engine->i915;
2232
2233        GEM_BUG_ON(INTEL_GEN(i915) < 7);
2234
2235        engine->emit_flush = gen6_ring_flush;
2236        engine->irq_enable_mask = PM_VEBOX_USER_INTERRUPT;
2237        engine->irq_enable = hsw_vebox_irq_enable;
2238        engine->irq_disable = hsw_vebox_irq_disable;
2239
2240        engine->emit_fini_breadcrumb = gen7_xcs_emit_breadcrumb;
2241}
2242
2243int intel_ring_submission_setup(struct intel_engine_cs *engine)
2244{
2245        setup_common(engine);
2246
2247        switch (engine->class) {
2248        case RENDER_CLASS:
2249                setup_rcs(engine);
2250                break;
2251        case VIDEO_DECODE_CLASS:
2252                setup_vcs(engine);
2253                break;
2254        case COPY_ENGINE_CLASS:
2255                setup_bcs(engine);
2256                break;
2257        case VIDEO_ENHANCEMENT_CLASS:
2258                setup_vecs(engine);
2259                break;
2260        default:
2261                MISSING_CASE(engine->class);
2262                return -ENODEV;
2263        }
2264
2265        return 0;
2266}
2267
2268int intel_ring_submission_init(struct intel_engine_cs *engine)
2269{
2270        struct i915_timeline *timeline;
2271        struct intel_ring *ring;
2272        int err;
2273
2274        timeline = i915_timeline_create(engine->i915, engine->status_page.vma);
2275        if (IS_ERR(timeline)) {
2276                err = PTR_ERR(timeline);
2277                goto err;
2278        }
2279        GEM_BUG_ON(timeline->has_initial_breadcrumb);
2280
2281        ring = intel_engine_create_ring(engine, timeline, 32 * PAGE_SIZE);
2282        i915_timeline_put(timeline);
2283        if (IS_ERR(ring)) {
2284                err = PTR_ERR(ring);
2285                goto err;
2286        }
2287
2288        err = intel_ring_pin(ring);
2289        if (err)
2290                goto err_ring;
2291
2292        GEM_BUG_ON(engine->buffer);
2293        engine->buffer = ring;
2294
2295        err = intel_engine_init_common(engine);
2296        if (err)
2297                goto err_unpin;
2298
2299        GEM_BUG_ON(ring->timeline->hwsp_ggtt != engine->status_page.vma);
2300
2301        return 0;
2302
2303err_unpin:
2304        intel_ring_unpin(ring);
2305err_ring:
2306        intel_ring_put(ring);
2307err:
2308        intel_engine_cleanup_common(engine);
2309        return err;
2310}
2311