linux/drivers/gpu/drm/i915/gt/intel_reset.c
   1// SPDX-License-Identifier: MIT
   2/*
   3 * Copyright © 2008-2018 Intel Corporation
   4 */
   5
   6#include <linux/sched/mm.h>
   7#include <linux/stop_machine.h>
   8#include <linux/string_helpers.h>
   9
  10#include "display/intel_display.h"
  11#include "display/intel_overlay.h"
  12
  13#include "gem/i915_gem_context.h"
  14
  15#include "gt/intel_gt_regs.h"
  16
  17#include "i915_drv.h"
  18#include "i915_file_private.h"
  19#include "i915_gpu_error.h"
  20#include "i915_irq.h"
  21#include "intel_breadcrumbs.h"
  22#include "intel_engine_pm.h"
  23#include "intel_engine_regs.h"
  24#include "intel_gt.h"
  25#include "intel_gt_pm.h"
  26#include "intel_gt_requests.h"
  27#include "intel_mchbar_regs.h"
  28#include "intel_pci_config.h"
  29#include "intel_reset.h"
  30
  31#include "uc/intel_guc.h"
  32
  33#define RESET_MAX_RETRIES 3
  34
  35/* XXX How to handle concurrent GGTT updates using tiling registers? */
  36#define RESET_UNDER_STOP_MACHINE 0
  37
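/*
 * Small read-modify-write helpers for the "_fw" register accessors; the
 * caller is expected to already hold the required forcewake.
 */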
  38static void rmw_set_fw(struct intel_uncore *uncore, i915_reg_t reg, u32 set)
  39{
  40        intel_uncore_rmw_fw(uncore, reg, 0, set);
  41}
  42
  43static void rmw_clear_fw(struct intel_uncore *uncore, i915_reg_t reg, u32 clr)
  44{
  45        intel_uncore_rmw_fw(uncore, reg, clr, 0);
  46}
  47
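/*
 * Punish the client that owned the hanging context: add to its ban score for
 * a banned context, and add more if it hangs again in quick succession.
 */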
  48static void client_mark_guilty(struct i915_gem_context *ctx, bool banned)
  49{
  50        struct drm_i915_file_private *file_priv = ctx->file_priv;
  51        unsigned long prev_hang;
  52        unsigned int score;
  53
  54        if (IS_ERR_OR_NULL(file_priv))
  55                return;
  56
  57        score = 0;
  58        if (banned)
  59                score = I915_CLIENT_SCORE_CONTEXT_BAN;
  60
  61        prev_hang = xchg(&file_priv->hang_timestamp, jiffies);
  62        if (time_before(jiffies, prev_hang + I915_CLIENT_FAST_HANG_JIFFIES))
  63                score += I915_CLIENT_SCORE_HANG_FAST;
  64
  65        if (score) {
  66                atomic_add(score, &file_priv->ban_score);
  67
  68                drm_dbg(&ctx->i915->drm,
  69                        "client %s: gained %u ban score, now %u\n",
  70                        ctx->name, score,
  71                        atomic_read(&file_priv->ban_score));
  72        }
  73}
  74
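/*
 * Decide whether the context behind the hung request should be banned: bump
 * its guilty count, record the hang timestamp, and ban it if it is marked
 * non-recoverable or has hung again in rapid succession.
 */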
  75static bool mark_guilty(struct i915_request *rq)
  76{
  77        struct i915_gem_context *ctx;
  78        unsigned long prev_hang;
  79        bool banned;
  80        int i;
  81
  82        if (intel_context_is_closed(rq->context))
  83                return true;
  84
  85        rcu_read_lock();
  86        ctx = rcu_dereference(rq->context->gem_context);
  87        if (ctx && !kref_get_unless_zero(&ctx->ref))
  88                ctx = NULL;
  89        rcu_read_unlock();
  90        if (!ctx)
  91                return intel_context_is_banned(rq->context);
  92
  93        atomic_inc(&ctx->guilty_count);
  94
  95        /* Cool contexts are too cool to be banned! (Used for reset testing.) */
  96        if (!i915_gem_context_is_bannable(ctx)) {
  97                banned = false;
  98                goto out;
  99        }
 100
 101        drm_notice(&ctx->i915->drm,
 102                   "%s context reset due to GPU hang\n",
 103                   ctx->name);
 104
 105        /* Record the timestamp for the last N hangs */
 106        prev_hang = ctx->hang_timestamp[0];
 107        for (i = 0; i < ARRAY_SIZE(ctx->hang_timestamp) - 1; i++)
 108                ctx->hang_timestamp[i] = ctx->hang_timestamp[i + 1];
 109        ctx->hang_timestamp[i] = jiffies;
 110
 111        /* If we have hung N+1 times in rapid succession, we ban the context! */
 112        banned = !i915_gem_context_is_recoverable(ctx);
 113        if (time_before(jiffies, prev_hang + CONTEXT_FAST_HANG_JIFFIES))
 114                banned = true;
 115        if (banned)
 116                drm_dbg(&ctx->i915->drm, "context %s: guilty %d, banned\n",
 117                        ctx->name, atomic_read(&ctx->guilty_count));
 118
 119        client_mark_guilty(ctx, banned);
 120
 121out:
 122        i915_gem_context_put(ctx);
 123        return banned;
 124}
 125
 126static void mark_innocent(struct i915_request *rq)
 127{
 128        struct i915_gem_context *ctx;
 129
 130        rcu_read_lock();
 131        ctx = rcu_dereference(rq->context->gem_context);
 132        if (ctx)
 133                atomic_inc(&ctx->active_count);
 134        rcu_read_unlock();
 135}
 136
 137void __i915_request_reset(struct i915_request *rq, bool guilty)
 138{
 139        bool banned = false;
 140
 141        RQ_TRACE(rq, "guilty? %s\n", str_yes_no(guilty));
 142        GEM_BUG_ON(__i915_request_is_complete(rq));
 143
 144        rcu_read_lock(); /* protect the GEM context */
 145        if (guilty) {
 146                i915_request_set_error_once(rq, -EIO);
 147                __i915_request_skip(rq);
 148                banned = mark_guilty(rq);
 149        } else {
 150                i915_request_set_error_once(rq, -EAGAIN);
 151                mark_innocent(rq);
 152        }
 153        rcu_read_unlock();
 154
 155        if (banned)
 156                intel_context_ban(rq->context, rq);
 157}
 158
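/*
 * Old-style (gen3/g33/g4x) resets are driven through the GDRST register in
 * PCI config space rather than via MMIO.
 */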
 159static bool i915_in_reset(struct pci_dev *pdev)
 160{
 161        u8 gdrst;
 162
 163        pci_read_config_byte(pdev, I915_GDRST, &gdrst);
 164        return gdrst & GRDOM_RESET_STATUS;
 165}
 166
 167static int i915_do_reset(struct intel_gt *gt,
 168                         intel_engine_mask_t engine_mask,
 169                         unsigned int retry)
 170{
 171        struct pci_dev *pdev = to_pci_dev(gt->i915->drm.dev);
 172        int err;
 173
 174        /* Assert reset for at least 20 usec, and wait for acknowledgement. */
 175        pci_write_config_byte(pdev, I915_GDRST, GRDOM_RESET_ENABLE);
 176        udelay(50);
 177        err = wait_for_atomic(i915_in_reset(pdev), 50);
 178
 179        /* Clear the reset request. */
 180        pci_write_config_byte(pdev, I915_GDRST, 0);
 181        udelay(50);
 182        if (!err)
 183                err = wait_for_atomic(!i915_in_reset(pdev), 50);
 184
 185        return err;
 186}
 187
 188static bool g4x_reset_complete(struct pci_dev *pdev)
 189{
 190        u8 gdrst;
 191
 192        pci_read_config_byte(pdev, I915_GDRST, &gdrst);
 193        return (gdrst & GRDOM_RESET_ENABLE) == 0;
 194}
 195
 196static int g33_do_reset(struct intel_gt *gt,
 197                        intel_engine_mask_t engine_mask,
 198                        unsigned int retry)
 199{
 200        struct pci_dev *pdev = to_pci_dev(gt->i915->drm.dev);
 201
 202        pci_write_config_byte(pdev, I915_GDRST, GRDOM_RESET_ENABLE);
 203        return wait_for_atomic(g4x_reset_complete(pdev), 50);
 204}
 205
 206static int g4x_do_reset(struct intel_gt *gt,
 207                        intel_engine_mask_t engine_mask,
 208                        unsigned int retry)
 209{
 210        struct pci_dev *pdev = to_pci_dev(gt->i915->drm.dev);
 211        struct intel_uncore *uncore = gt->uncore;
 212        int ret;
 213
 214        /* WaVcpClkGateDisableForMediaReset:ctg,elk */
 215        rmw_set_fw(uncore, VDECCLK_GATE_D, VCP_UNIT_CLOCK_GATE_DISABLE);
 216        intel_uncore_posting_read_fw(uncore, VDECCLK_GATE_D);
 217
 218        pci_write_config_byte(pdev, I915_GDRST,
 219                              GRDOM_MEDIA | GRDOM_RESET_ENABLE);
  220        ret = wait_for_atomic(g4x_reset_complete(pdev), 50);
 221        if (ret) {
 222                GT_TRACE(gt, "Wait for media reset failed\n");
 223                goto out;
 224        }
 225
 226        pci_write_config_byte(pdev, I915_GDRST,
 227                              GRDOM_RENDER | GRDOM_RESET_ENABLE);
  228        ret = wait_for_atomic(g4x_reset_complete(pdev), 50);
 229        if (ret) {
 230                GT_TRACE(gt, "Wait for render reset failed\n");
 231                goto out;
 232        }
 233
 234out:
 235        pci_write_config_byte(pdev, I915_GDRST, 0);
 236
 237        rmw_clear_fw(uncore, VDECCLK_GATE_D, VCP_UNIT_CLOCK_GATE_DISABLE);
 238        intel_uncore_posting_read_fw(uncore, VDECCLK_GATE_D);
 239
 240        return ret;
 241}
 242
 243static int ilk_do_reset(struct intel_gt *gt, intel_engine_mask_t engine_mask,
 244                        unsigned int retry)
 245{
 246        struct intel_uncore *uncore = gt->uncore;
 247        int ret;
 248
 249        intel_uncore_write_fw(uncore, ILK_GDSR,
 250                              ILK_GRDOM_RENDER | ILK_GRDOM_RESET_ENABLE);
 251        ret = __intel_wait_for_register_fw(uncore, ILK_GDSR,
 252                                           ILK_GRDOM_RESET_ENABLE, 0,
 253                                           5000, 0,
 254                                           NULL);
 255        if (ret) {
 256                GT_TRACE(gt, "Wait for render reset failed\n");
 257                goto out;
 258        }
 259
 260        intel_uncore_write_fw(uncore, ILK_GDSR,
 261                              ILK_GRDOM_MEDIA | ILK_GRDOM_RESET_ENABLE);
 262        ret = __intel_wait_for_register_fw(uncore, ILK_GDSR,
 263                                           ILK_GRDOM_RESET_ENABLE, 0,
 264                                           5000, 0,
 265                                           NULL);
 266        if (ret) {
 267                GT_TRACE(gt, "Wait for media reset failed\n");
 268                goto out;
 269        }
 270
 271out:
 272        intel_uncore_write_fw(uncore, ILK_GDSR, 0);
 273        intel_uncore_posting_read_fw(uncore, ILK_GDSR);
 274        return ret;
 275}
 276
 277/* Reset the hardware domains (GENX_GRDOM_*) specified by mask */
 278static int gen6_hw_domain_reset(struct intel_gt *gt, u32 hw_domain_mask)
 279{
 280        struct intel_uncore *uncore = gt->uncore;
 281        int err;
 282
 283        /*
 284         * GEN6_GDRST is not in the gt power well, no need to check
 285         * for fifo space for the write or forcewake the chip for
 286         * the read
 287         */
 288        intel_uncore_write_fw(uncore, GEN6_GDRST, hw_domain_mask);
 289
 290        /* Wait for the device to ack the reset requests */
 291        err = __intel_wait_for_register_fw(uncore,
 292                                           GEN6_GDRST, hw_domain_mask, 0,
 293                                           500, 0,
 294                                           NULL);
 295        if (err)
 296                GT_TRACE(gt,
 297                         "Wait for 0x%08x engines reset failed\n",
 298                         hw_domain_mask);
 299
 300        return err;
 301}
 302
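/* Translate an engine mask into GEN6_GDRST reset domains and trigger the reset. */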
 303static int __gen6_reset_engines(struct intel_gt *gt,
 304                                intel_engine_mask_t engine_mask,
 305                                unsigned int retry)
 306{
 307        struct intel_engine_cs *engine;
 308        u32 hw_mask;
 309
 310        if (engine_mask == ALL_ENGINES) {
 311                hw_mask = GEN6_GRDOM_FULL;
 312        } else {
 313                intel_engine_mask_t tmp;
 314
 315                hw_mask = 0;
  316                for_each_engine_masked(engine, gt, engine_mask, tmp)
  317                        hw_mask |= engine->reset_domain;
 319        }
 320
 321        return gen6_hw_domain_reset(gt, hw_mask);
 322}
 323
 324static int gen6_reset_engines(struct intel_gt *gt,
 325                              intel_engine_mask_t engine_mask,
 326                              unsigned int retry)
 327{
 328        unsigned long flags;
 329        int ret;
 330
 331        spin_lock_irqsave(&gt->uncore->lock, flags);
 332        ret = __gen6_reset_engines(gt, engine_mask, retry);
 333        spin_unlock_irqrestore(&gt->uncore->lock, flags);
 334
 335        return ret;
 336}
 337
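/*
 * Each SFC is shared between a pair of VCS engines and their associated VECS
 * engine; return the VECS engine paired with this video decode engine.
 */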
 338static struct intel_engine_cs *find_sfc_paired_vecs_engine(struct intel_engine_cs *engine)
 339{
 340        int vecs_id;
 341
 342        GEM_BUG_ON(engine->class != VIDEO_DECODE_CLASS);
 343
 344        vecs_id = _VECS((engine->instance) / 2);
 345
 346        return engine->gt->engine[vecs_id];
 347}
 348
 349struct sfc_lock_data {
 350        i915_reg_t lock_reg;
 351        i915_reg_t ack_reg;
 352        i915_reg_t usage_reg;
 353        u32 lock_bit;
 354        u32 ack_bit;
 355        u32 usage_bit;
 356        u32 reset_bit;
 357};
 358
 359static void get_sfc_forced_lock_data(struct intel_engine_cs *engine,
 360                                     struct sfc_lock_data *sfc_lock)
 361{
 362        switch (engine->class) {
 363        default:
 364                MISSING_CASE(engine->class);
 365                fallthrough;
 366        case VIDEO_DECODE_CLASS:
 367                sfc_lock->lock_reg = GEN11_VCS_SFC_FORCED_LOCK(engine->mmio_base);
 368                sfc_lock->lock_bit = GEN11_VCS_SFC_FORCED_LOCK_BIT;
 369
 370                sfc_lock->ack_reg = GEN11_VCS_SFC_LOCK_STATUS(engine->mmio_base);
 371                sfc_lock->ack_bit  = GEN11_VCS_SFC_LOCK_ACK_BIT;
 372
 373                sfc_lock->usage_reg = GEN11_VCS_SFC_LOCK_STATUS(engine->mmio_base);
 374                sfc_lock->usage_bit = GEN11_VCS_SFC_USAGE_BIT;
 375                sfc_lock->reset_bit = GEN11_VCS_SFC_RESET_BIT(engine->instance);
 376
 377                break;
 378        case VIDEO_ENHANCEMENT_CLASS:
 379                sfc_lock->lock_reg = GEN11_VECS_SFC_FORCED_LOCK(engine->mmio_base);
 380                sfc_lock->lock_bit = GEN11_VECS_SFC_FORCED_LOCK_BIT;
 381
 382                sfc_lock->ack_reg = GEN11_VECS_SFC_LOCK_ACK(engine->mmio_base);
 383                sfc_lock->ack_bit  = GEN11_VECS_SFC_LOCK_ACK_BIT;
 384
 385                sfc_lock->usage_reg = GEN11_VECS_SFC_USAGE(engine->mmio_base);
 386                sfc_lock->usage_bit = GEN11_VECS_SFC_USAGE_BIT;
 387                sfc_lock->reset_bit = GEN11_VECS_SFC_RESET_BIT(engine->instance);
 388
 389                break;
 390        }
 391}
 392
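/*
 * Force-lock the SFC used by a VCS/VECS engine before resetting it. On
 * success the SFC reset bit is added to *reset_mask, and the engine that
 * received the lock request is added to *unlock_mask so the lock can be
 * released once the reset has completed.
 */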
 393static int gen11_lock_sfc(struct intel_engine_cs *engine,
 394                          u32 *reset_mask,
 395                          u32 *unlock_mask)
 396{
 397        struct intel_uncore *uncore = engine->uncore;
 398        u8 vdbox_sfc_access = engine->gt->info.vdbox_sfc_access;
 399        struct sfc_lock_data sfc_lock;
 400        bool lock_obtained, lock_to_other = false;
 401        int ret;
 402
 403        switch (engine->class) {
 404        case VIDEO_DECODE_CLASS:
 405                if ((BIT(engine->instance) & vdbox_sfc_access) == 0)
 406                        return 0;
 407
 408                fallthrough;
 409        case VIDEO_ENHANCEMENT_CLASS:
 410                get_sfc_forced_lock_data(engine, &sfc_lock);
 411
 412                break;
 413        default:
 414                return 0;
 415        }
 416
 417        if (!(intel_uncore_read_fw(uncore, sfc_lock.usage_reg) & sfc_lock.usage_bit)) {
 418                struct intel_engine_cs *paired_vecs;
 419
 420                if (engine->class != VIDEO_DECODE_CLASS ||
 421                    GRAPHICS_VER(engine->i915) != 12)
 422                        return 0;
 423
 424                /*
 425                 * Wa_14010733141
 426                 *
 427                 * If the VCS-MFX isn't using the SFC, we also need to check
 428                 * whether VCS-HCP is using it.  If so, we need to issue a *VE*
 429                 * forced lock on the VE engine that shares the same SFC.
 430                 */
 431                if (!(intel_uncore_read_fw(uncore,
 432                                           GEN12_HCP_SFC_LOCK_STATUS(engine->mmio_base)) &
 433                      GEN12_HCP_SFC_USAGE_BIT))
 434                        return 0;
 435
 436                paired_vecs = find_sfc_paired_vecs_engine(engine);
 437                get_sfc_forced_lock_data(paired_vecs, &sfc_lock);
 438                lock_to_other = true;
 439                *unlock_mask |= paired_vecs->mask;
 440        } else {
 441                *unlock_mask |= engine->mask;
 442        }
 443
 444        /*
 445         * If the engine is using an SFC, tell the engine that a software reset
 446         * is going to happen. The engine will then try to force lock the SFC.
 447         * If SFC ends up being locked to the engine we want to reset, we have
 448         * to reset it as well (we will unlock it once the reset sequence is
 449         * completed).
 450         */
 451        rmw_set_fw(uncore, sfc_lock.lock_reg, sfc_lock.lock_bit);
 452
 453        ret = __intel_wait_for_register_fw(uncore,
 454                                           sfc_lock.ack_reg,
 455                                           sfc_lock.ack_bit,
 456                                           sfc_lock.ack_bit,
 457                                           1000, 0, NULL);
 458
 459        /*
 460         * Was the SFC released while we were trying to lock it?
 461         *
 462         * We should reset both the engine and the SFC if:
 463         *  - We were locking the SFC to this engine and the lock succeeded
 464         *       OR
 465         *  - We were locking the SFC to a different engine (Wa_14010733141)
 466         *    but the SFC was released before the lock was obtained.
 467         *
 468         * Otherwise we need only reset the engine by itself and we can
 469         * leave the SFC alone.
 470         */
 471        lock_obtained = (intel_uncore_read_fw(uncore, sfc_lock.usage_reg) &
 472                        sfc_lock.usage_bit) != 0;
 473        if (lock_obtained == lock_to_other)
 474                return 0;
 475
 476        if (ret) {
 477                ENGINE_TRACE(engine, "Wait for SFC forced lock ack failed\n");
 478                return ret;
 479        }
 480
 481        *reset_mask |= sfc_lock.reset_bit;
 482        return 0;
 483}
 484
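/* Release any SFC force-lock requested by gen11_lock_sfc(). */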
 485static void gen11_unlock_sfc(struct intel_engine_cs *engine)
 486{
 487        struct intel_uncore *uncore = engine->uncore;
 488        u8 vdbox_sfc_access = engine->gt->info.vdbox_sfc_access;
 489        struct sfc_lock_data sfc_lock = {};
 490
 491        if (engine->class != VIDEO_DECODE_CLASS &&
 492            engine->class != VIDEO_ENHANCEMENT_CLASS)
 493                return;
 494
 495        if (engine->class == VIDEO_DECODE_CLASS &&
 496            (BIT(engine->instance) & vdbox_sfc_access) == 0)
 497                return;
 498
 499        get_sfc_forced_lock_data(engine, &sfc_lock);
 500
 501        rmw_clear_fw(uncore, sfc_lock.lock_reg, sfc_lock.lock_bit);
 502}
 503
 504static int __gen11_reset_engines(struct intel_gt *gt,
 505                                 intel_engine_mask_t engine_mask,
 506                                 unsigned int retry)
 507{
 508        struct intel_engine_cs *engine;
 509        intel_engine_mask_t tmp;
 510        u32 reset_mask, unlock_mask = 0;
 511        int ret;
 512
 513        if (engine_mask == ALL_ENGINES) {
 514                reset_mask = GEN11_GRDOM_FULL;
 515        } else {
 516                reset_mask = 0;
 517                for_each_engine_masked(engine, gt, engine_mask, tmp) {
 518                        reset_mask |= engine->reset_domain;
 519                        ret = gen11_lock_sfc(engine, &reset_mask, &unlock_mask);
 520                        if (ret)
 521                                goto sfc_unlock;
 522                }
 523        }
 524
 525        ret = gen6_hw_domain_reset(gt, reset_mask);
 526
 527sfc_unlock:
 528        /*
 529         * We unlock the SFC based on the lock status and not the result of
  530         * gen11_lock_sfc to make sure that we clean up properly if something
  531         * went wrong during the lock (e.g. lock acquired after timeout
 532         * expiration).
 533         *
 534         * Due to Wa_14010733141, we may have locked an SFC to an engine that
 535         * wasn't being reset.  So instead of calling gen11_unlock_sfc()
 536         * on engine_mask, we instead call it on the mask of engines that our
 537         * gen11_lock_sfc() calls told us actually had locks attempted.
 538         */
 539        for_each_engine_masked(engine, gt, unlock_mask, tmp)
 540                gen11_unlock_sfc(engine);
 541
 542        return ret;
 543}
 544
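/*
 * Ask the engine to quiesce and wait for its ready-for-reset ack in
 * RING_RESET_CTL; catastrophic errors bypass the handshake.
 */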
 545static int gen8_engine_reset_prepare(struct intel_engine_cs *engine)
 546{
 547        struct intel_uncore *uncore = engine->uncore;
 548        const i915_reg_t reg = RING_RESET_CTL(engine->mmio_base);
 549        u32 request, mask, ack;
 550        int ret;
 551
 552        if (I915_SELFTEST_ONLY(should_fail(&engine->reset_timeout, 1)))
 553                return -ETIMEDOUT;
 554
 555        ack = intel_uncore_read_fw(uncore, reg);
 556        if (ack & RESET_CTL_CAT_ERROR) {
 557                /*
 558                 * For catastrophic errors, ready-for-reset sequence
 559                 * needs to be bypassed: HAS#396813
 560                 */
 561                request = RESET_CTL_CAT_ERROR;
 562                mask = RESET_CTL_CAT_ERROR;
 563
 564                /* Catastrophic errors need to be cleared by HW */
 565                ack = 0;
 566        } else if (!(ack & RESET_CTL_READY_TO_RESET)) {
 567                request = RESET_CTL_REQUEST_RESET;
 568                mask = RESET_CTL_READY_TO_RESET;
 569                ack = RESET_CTL_READY_TO_RESET;
 570        } else {
 571                return 0;
 572        }
 573
 574        intel_uncore_write_fw(uncore, reg, _MASKED_BIT_ENABLE(request));
 575        ret = __intel_wait_for_register_fw(uncore, reg, mask, ack,
 576                                           700, 0, NULL);
 577        if (ret)
 578                drm_err(&engine->i915->drm,
 579                        "%s reset request timed out: {request: %08x, RESET_CTL: %08x}\n",
 580                        engine->name, request,
 581                        intel_uncore_read_fw(uncore, reg));
 582
 583        return ret;
 584}
 585
 586static void gen8_engine_reset_cancel(struct intel_engine_cs *engine)
 587{
 588        intel_uncore_write_fw(engine->uncore,
 589                              RING_RESET_CTL(engine->mmio_base),
 590                              _MASKED_BIT_DISABLE(RESET_CTL_REQUEST_RESET));
 591}
 592
 593static int gen8_reset_engines(struct intel_gt *gt,
 594                              intel_engine_mask_t engine_mask,
 595                              unsigned int retry)
 596{
 597        struct intel_engine_cs *engine;
 598        const bool reset_non_ready = retry >= 1;
 599        intel_engine_mask_t tmp;
 600        unsigned long flags;
 601        int ret;
 602
 603        spin_lock_irqsave(&gt->uncore->lock, flags);
 604
 605        for_each_engine_masked(engine, gt, engine_mask, tmp) {
 606                ret = gen8_engine_reset_prepare(engine);
 607                if (ret && !reset_non_ready)
 608                        goto skip_reset;
 609
 610                /*
  611                 * If this is not the first failed attempt to prepare,
  612                 * we choose to proceed anyway.
  613                 *
  614                 * By doing so we risk context corruption and, on
  615                 * some gens (kbl), a possible system hang if the reset
  616                 * happens during active batch buffer (bb) execution.
  617                 *
  618                 * We would rather accept context corruption than a
  619                 * failed reset with a wedged driver/gpu. The active
  620                 * bb execution case should be covered by the
  621                 * stop_engines() performed before the reset.
 622                 */
 623        }
 624
 625        /*
  626         * Wa_22011100796:dg2, whenever a full soft reset is required,
  627         * reset all individual engines first, and then do a full soft reset.
 628         *
 629         * This is best effort, so ignore any error from the initial reset.
 630         */
 631        if (IS_DG2(gt->i915) && engine_mask == ALL_ENGINES)
 632                __gen11_reset_engines(gt, gt->info.engine_mask, 0);
 633
 634        if (GRAPHICS_VER(gt->i915) >= 11)
 635                ret = __gen11_reset_engines(gt, engine_mask, retry);
 636        else
 637                ret = __gen6_reset_engines(gt, engine_mask, retry);
 638
 639skip_reset:
 640        for_each_engine_masked(engine, gt, engine_mask, tmp)
 641                gen8_engine_reset_cancel(engine);
 642
 643        spin_unlock_irqrestore(&gt->uncore->lock, flags);
 644
 645        return ret;
 646}
 647
 648static int mock_reset(struct intel_gt *gt,
 649                      intel_engine_mask_t mask,
 650                      unsigned int retry)
 651{
 652        return 0;
 653}
 654
 655typedef int (*reset_func)(struct intel_gt *,
 656                          intel_engine_mask_t engine_mask,
 657                          unsigned int retry);
 658
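/* Pick the reset routine for this platform, from newest to oldest. */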
 659static reset_func intel_get_gpu_reset(const struct intel_gt *gt)
 660{
 661        struct drm_i915_private *i915 = gt->i915;
 662
 663        if (is_mock_gt(gt))
 664                return mock_reset;
 665        else if (GRAPHICS_VER(i915) >= 8)
 666                return gen8_reset_engines;
 667        else if (GRAPHICS_VER(i915) >= 6)
 668                return gen6_reset_engines;
 669        else if (GRAPHICS_VER(i915) >= 5)
 670                return ilk_do_reset;
 671        else if (IS_G4X(i915))
 672                return g4x_do_reset;
 673        else if (IS_G33(i915) || IS_PINEVIEW(i915))
 674                return g33_do_reset;
 675        else if (GRAPHICS_VER(i915) >= 3)
 676                return i915_do_reset;
 677        else
 678                return NULL;
 679}
 680
 681int __intel_gt_reset(struct intel_gt *gt, intel_engine_mask_t engine_mask)
 682{
 683        const int retries = engine_mask == ALL_ENGINES ? RESET_MAX_RETRIES : 1;
 684        reset_func reset;
 685        int ret = -ETIMEDOUT;
 686        int retry;
 687
 688        reset = intel_get_gpu_reset(gt);
 689        if (!reset)
 690                return -ENODEV;
 691
 692        /*
 693         * If the power well sleeps during the reset, the reset
  694         * request may be dropped and never complete (causing -EIO).
 695         */
 696        intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL);
 697        for (retry = 0; ret == -ETIMEDOUT && retry < retries; retry++) {
 698                GT_TRACE(gt, "engine_mask=%x\n", engine_mask);
 699                preempt_disable();
 700                ret = reset(gt, engine_mask, retry);
 701                preempt_enable();
 702        }
 703        intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL);
 704
 705        return ret;
 706}
 707
 708bool intel_has_gpu_reset(const struct intel_gt *gt)
 709{
 710        if (!gt->i915->params.reset)
  711                return false;
 712
 713        return intel_get_gpu_reset(gt);
 714}
 715
 716bool intel_has_reset_engine(const struct intel_gt *gt)
 717{
 718        if (gt->i915->params.reset < 2)
 719                return false;
 720
 721        return INTEL_INFO(gt->i915)->has_reset_engine;
 722}
 723
 724int intel_reset_guc(struct intel_gt *gt)
 725{
 726        u32 guc_domain =
 727                GRAPHICS_VER(gt->i915) >= 11 ? GEN11_GRDOM_GUC : GEN9_GRDOM_GUC;
 728        int ret;
 729
 730        GEM_BUG_ON(!HAS_GT_UC(gt->i915));
 731
 732        intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL);
 733        ret = gen6_hw_domain_reset(gt, guc_domain);
 734        intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL);
 735
 736        return ret;
 737}
 738
 739/*
  740 * Ensure the irq handler finishes and is not run again, and give the engine
  741 * backend a chance to quiesce via its reset.prepare() hook.
 742 */
 743static void reset_prepare_engine(struct intel_engine_cs *engine)
 744{
 745        /*
 746         * During the reset sequence, we must prevent the engine from
 747         * entering RC6. As the context state is undefined until we restart
 748         * the engine, if it does enter RC6 during the reset, the state
 749         * written to the powercontext is undefined and so we may lose
 750         * GPU state upon resume, i.e. fail to restart after a reset.
 751         */
 752        intel_uncore_forcewake_get(engine->uncore, FORCEWAKE_ALL);
 753        if (engine->reset.prepare)
 754                engine->reset.prepare(engine);
 755}
 756
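/*
 * Invalidate userspace mmaps of all fenced GGTT vmas so that any access
 * after the reset takes a fresh fault.
 */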
 757static void revoke_mmaps(struct intel_gt *gt)
 758{
 759        int i;
 760
 761        for (i = 0; i < gt->ggtt->num_fences; i++) {
 762                struct drm_vma_offset_node *node;
 763                struct i915_vma *vma;
 764                u64 vma_offset;
 765
 766                vma = READ_ONCE(gt->ggtt->fence_regs[i].vma);
 767                if (!vma)
 768                        continue;
 769
 770                if (!i915_vma_has_userfault(vma))
 771                        continue;
 772
 773                GEM_BUG_ON(vma->fence != &gt->ggtt->fence_regs[i]);
 774
 775                if (!vma->mmo)
 776                        continue;
 777
 778                node = &vma->mmo->vma_node;
 779                vma_offset = vma->ggtt_view.partial.offset << PAGE_SHIFT;
 780
 781                unmap_mapping_range(gt->i915->drm.anon_inode->i_mapping,
 782                                    drm_vma_node_offset_addr(node) + vma_offset,
 783                                    vma->size,
 784                                    1);
 785        }
 786}
 787
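/*
 * Quiesce submission across the GT and note which engines were awake so the
 * matching power references can be dropped in reset_finish().
 */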
 788static intel_engine_mask_t reset_prepare(struct intel_gt *gt)
 789{
 790        struct intel_engine_cs *engine;
 791        intel_engine_mask_t awake = 0;
 792        enum intel_engine_id id;
 793
 794        /* For GuC mode, ensure submission is disabled before stopping ring */
 795        intel_uc_reset_prepare(&gt->uc);
 796
 797        for_each_engine(engine, gt, id) {
 798                if (intel_engine_pm_get_if_awake(engine))
 799                        awake |= engine->mask;
 800                reset_prepare_engine(engine);
 801        }
 802
 803        return awake;
 804}
 805
 806static void gt_revoke(struct intel_gt *gt)
 807{
 808        revoke_mmaps(gt);
 809}
 810
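/*
 * After the hardware reset: re-enable the GGTT, let each engine backend
 * recover its state (handling requests on the stalled engines), reset the
 * GuC state and restore the fence registers.
 */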
 811static int gt_reset(struct intel_gt *gt, intel_engine_mask_t stalled_mask)
 812{
 813        struct intel_engine_cs *engine;
 814        enum intel_engine_id id;
 815        int err;
 816
 817        /*
 818         * Everything depends on having the GTT running, so we need to start
 819         * there.
 820         */
 821        err = i915_ggtt_enable_hw(gt->i915);
 822        if (err)
 823                return err;
 824
 825        local_bh_disable();
 826        for_each_engine(engine, gt, id)
 827                __intel_engine_reset(engine, stalled_mask & engine->mask);
 828        local_bh_enable();
 829
 830        intel_uc_reset(&gt->uc, ALL_ENGINES);
 831
 832        intel_ggtt_restore_fences(gt->ggtt);
 833
 834        return err;
 835}
 836
 837static void reset_finish_engine(struct intel_engine_cs *engine)
 838{
 839        if (engine->reset.finish)
 840                engine->reset.finish(engine);
 841        intel_uncore_forcewake_put(engine->uncore, FORCEWAKE_ALL);
 842
 843        intel_engine_signal_breadcrumbs(engine);
 844}
 845
 846static void reset_finish(struct intel_gt *gt, intel_engine_mask_t awake)
 847{
 848        struct intel_engine_cs *engine;
 849        enum intel_engine_id id;
 850
 851        for_each_engine(engine, gt, id) {
 852                reset_finish_engine(engine);
 853                if (awake & engine->mask)
 854                        intel_engine_pm_put(engine);
 855        }
 856
 857        intel_uc_reset_finish(&gt->uc);
 858}
 859
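/*
 * Once wedged, incoming requests are not sent to the hardware; they are
 * immediately completed with -EIO instead.
 */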
 860static void nop_submit_request(struct i915_request *request)
 861{
 862        RQ_TRACE(request, "-EIO\n");
 863
 864        request = i915_request_mark_eio(request);
 865        if (request) {
 866                i915_request_submit(request);
 867                intel_engine_signal_breadcrumbs(request->engine);
 868
 869                i915_request_put(request);
 870        }
 871}
 872
 873static void __intel_gt_set_wedged(struct intel_gt *gt)
 874{
 875        struct intel_engine_cs *engine;
 876        intel_engine_mask_t awake;
 877        enum intel_engine_id id;
 878
 879        if (test_bit(I915_WEDGED, &gt->reset.flags))
 880                return;
 881
 882        GT_TRACE(gt, "start\n");
 883
 884        /*
 885         * First, stop submission to hw, but do not yet complete requests by
 886         * rolling the global seqno forward (since this would complete requests
 887         * for which we haven't set the fence error to EIO yet).
 888         */
 889        awake = reset_prepare(gt);
 890
 891        /* Even if the GPU reset fails, it should still stop the engines */
 892        if (!INTEL_INFO(gt->i915)->gpu_reset_clobbers_display)
 893                __intel_gt_reset(gt, ALL_ENGINES);
 894
 895        for_each_engine(engine, gt, id)
 896                engine->submit_request = nop_submit_request;
 897
 898        /*
  899         * Make sure no request can slip through without getting completed by
  900         * either the nop_submit_request installed above or the engine
  901         * cancellation below.
 902         */
 903        synchronize_rcu_expedited();
 904        set_bit(I915_WEDGED, &gt->reset.flags);
 905
 906        /* Mark all executing requests as skipped */
 907        local_bh_disable();
 908        for_each_engine(engine, gt, id)
 909                if (engine->reset.cancel)
 910                        engine->reset.cancel(engine);
 911        intel_uc_cancel_requests(&gt->uc);
 912        local_bh_enable();
 913
 914        reset_finish(gt, awake);
 915
 916        GT_TRACE(gt, "end\n");
 917}
 918
 919void intel_gt_set_wedged(struct intel_gt *gt)
 920{
 921        intel_wakeref_t wakeref;
 922
 923        if (test_bit(I915_WEDGED, &gt->reset.flags))
 924                return;
 925
 926        wakeref = intel_runtime_pm_get(gt->uncore->rpm);
 927        mutex_lock(&gt->reset.mutex);
 928
 929        if (GEM_SHOW_DEBUG()) {
 930                struct drm_printer p = drm_debug_printer(__func__);
 931                struct intel_engine_cs *engine;
 932                enum intel_engine_id id;
 933
 934                drm_printf(&p, "called from %pS\n", (void *)_RET_IP_);
 935                for_each_engine(engine, gt, id) {
 936                        if (intel_engine_is_idle(engine))
 937                                continue;
 938
 939                        intel_engine_dump(engine, &p, "%s\n", engine->name);
 940                }
 941        }
 942
 943        __intel_gt_set_wedged(gt);
 944
 945        mutex_unlock(&gt->reset.mutex);
 946        intel_runtime_pm_put(gt->uncore->rpm, wakeref);
 947}
 948
 949static bool __intel_gt_unset_wedged(struct intel_gt *gt)
 950{
 951        struct intel_gt_timelines *timelines = &gt->timelines;
 952        struct intel_timeline *tl;
 953        bool ok;
 954
 955        if (!test_bit(I915_WEDGED, &gt->reset.flags))
 956                return true;
 957
 958        /* Never fully initialised, recovery impossible */
 959        if (intel_gt_has_unrecoverable_error(gt))
 960                return false;
 961
 962        GT_TRACE(gt, "start\n");
 963
 964        /*
 965         * Before unwedging, make sure that all pending operations
 966         * are flushed and errored out - we may have requests waiting upon
 967         * third party fences. We marked all inflight requests as EIO, and
  968         * every execbuf since has returned EIO; for consistency we want all
 969         * the currently pending requests to also be marked as EIO, which
 970         * is done inside our nop_submit_request - and so we must wait.
 971         *
 972         * No more can be submitted until we reset the wedged bit.
 973         */
 974        spin_lock(&timelines->lock);
 975        list_for_each_entry(tl, &timelines->active_list, link) {
 976                struct dma_fence *fence;
 977
 978                fence = i915_active_fence_get(&tl->last_request);
 979                if (!fence)
 980                        continue;
 981
 982                spin_unlock(&timelines->lock);
 983
 984                /*
 985                 * All internal dependencies (i915_requests) will have
 986                 * been flushed by the set-wedge, but we may be stuck waiting
 987                 * for external fences. These should all be capped to 10s
 988                 * (I915_FENCE_TIMEOUT) so this wait should not be unbounded
 989                 * in the worst case.
 990                 */
 991                dma_fence_default_wait(fence, false, MAX_SCHEDULE_TIMEOUT);
 992                dma_fence_put(fence);
 993
  994        /* Restart iteration after dropping the lock */
 995                spin_lock(&timelines->lock);
 996                tl = list_entry(&timelines->active_list, typeof(*tl), link);
 997        }
 998        spin_unlock(&timelines->lock);
 999
1000        /* We must reset pending GPU events before restoring our submission */
1001        ok = !HAS_EXECLISTS(gt->i915); /* XXX better agnosticism desired */
1002        if (!INTEL_INFO(gt->i915)->gpu_reset_clobbers_display)
1003                ok = __intel_gt_reset(gt, ALL_ENGINES) == 0;
1004        if (!ok) {
1005                /*
1006                 * Warn CI about the unrecoverable wedged condition.
1007                 * Time for a reboot.
1008                 */
1009                add_taint_for_CI(gt->i915, TAINT_WARN);
1010                return false;
1011        }
1012
1013        /*
1014         * Undo nop_submit_request. We prevent all new i915 requests from
1015         * being queued (by disallowing execbuf whilst wedged) so having
1016         * waited for all active requests above, we know the system is idle
1017         * and do not have to worry about a thread being inside
1018         * engine->submit_request() as we swap over. So unlike installing
1019         * the nop_submit_request on reset, we can do this from normal
1020         * context and do not require stop_machine().
1021         */
1022        intel_engines_reset_default_submission(gt);
1023
1024        GT_TRACE(gt, "end\n");
1025
1026        smp_mb__before_atomic(); /* complete takeover before enabling execbuf */
1027        clear_bit(I915_WEDGED, &gt->reset.flags);
1028
1029        return true;
1030}
1031
1032bool intel_gt_unset_wedged(struct intel_gt *gt)
1033{
1034        bool result;
1035
1036        mutex_lock(&gt->reset.mutex);
1037        result = __intel_gt_unset_wedged(gt);
1038        mutex_unlock(&gt->reset.mutex);
1039
1040        return result;
1041}
1042
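/* Reset the whole chip, retrying a few times, then recover per-engine state. */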
1043static int do_reset(struct intel_gt *gt, intel_engine_mask_t stalled_mask)
1044{
1045        int err, i;
1046
1047        err = __intel_gt_reset(gt, ALL_ENGINES);
1048        for (i = 0; err && i < RESET_MAX_RETRIES; i++) {
1049                msleep(10 * (i + 1));
1050                err = __intel_gt_reset(gt, ALL_ENGINES);
1051        }
1052        if (err)
1053                return err;
1054
1055        return gt_reset(gt, stalled_mask);
1056}
1057
1058static int resume(struct intel_gt *gt)
1059{
1060        struct intel_engine_cs *engine;
1061        enum intel_engine_id id;
1062        int ret;
1063
1064        for_each_engine(engine, gt, id) {
1065                ret = intel_engine_resume(engine);
1066                if (ret)
1067                        return ret;
1068        }
1069
1070        return 0;
1071}
1072
1073/**
1074 * intel_gt_reset - reset chip after a hang
1075 * @gt: #intel_gt to reset
1076 * @stalled_mask: mask of the stalled engines with the guilty requests
1077 * @reason: user error message for why we are resetting
1078 *
1079 * Reset the chip.  Useful if a hang is detected. Marks the device as wedged
1080 * on failure.
1081 *
1082 * Procedure is fairly simple:
1083 *   - reset the chip using the reset reg
1084 *   - re-init context state
1085 *   - re-init hardware status page
1086 *   - re-init ring buffer
1087 *   - re-init interrupt state
1088 *   - re-init display
1089 */
1090void intel_gt_reset(struct intel_gt *gt,
1091                    intel_engine_mask_t stalled_mask,
1092                    const char *reason)
1093{
1094        intel_engine_mask_t awake;
1095        int ret;
1096
1097        GT_TRACE(gt, "flags=%lx\n", gt->reset.flags);
1098
1099        might_sleep();
1100        GEM_BUG_ON(!test_bit(I915_RESET_BACKOFF, &gt->reset.flags));
1101
1102        /*
1103         * FIXME: Revoking cpu mmap ptes cannot be done from a dma_fence
1104         * critical section like gpu reset.
1105         */
1106        gt_revoke(gt);
1107
1108        mutex_lock(&gt->reset.mutex);
1109
1110        /* Clear any previous failed attempts at recovery. Time to try again. */
1111        if (!__intel_gt_unset_wedged(gt))
1112                goto unlock;
1113
1114        if (reason)
1115                drm_notice(&gt->i915->drm,
1116                           "Resetting chip for %s\n", reason);
1117        atomic_inc(&gt->i915->gpu_error.reset_count);
1118
1119        awake = reset_prepare(gt);
1120
1121        if (!intel_has_gpu_reset(gt)) {
1122                if (gt->i915->params.reset)
1123                        drm_err(&gt->i915->drm, "GPU reset not supported\n");
1124                else
1125                        drm_dbg(&gt->i915->drm, "GPU reset disabled\n");
1126                goto error;
1127        }
1128
1129        if (INTEL_INFO(gt->i915)->gpu_reset_clobbers_display)
1130                intel_runtime_pm_disable_interrupts(gt->i915);
1131
1132        if (do_reset(gt, stalled_mask)) {
1133                drm_err(&gt->i915->drm, "Failed to reset chip\n");
1134                goto taint;
1135        }
1136
1137        if (INTEL_INFO(gt->i915)->gpu_reset_clobbers_display)
1138                intel_runtime_pm_enable_interrupts(gt->i915);
1139
1140        intel_overlay_reset(gt->i915);
1141
1142        /*
1143         * Next we need to restore the context, but we don't use those
1144         * yet either...
1145         *
1146         * Ring buffer needs to be re-initialized in the KMS case, or if X
1147         * was running at the time of the reset (i.e. we weren't VT
1148         * switched away).
1149         */
1150        ret = intel_gt_init_hw(gt);
1151        if (ret) {
1152                drm_err(&gt->i915->drm,
1153                        "Failed to initialise HW following reset (%d)\n",
1154                        ret);
1155                goto taint;
1156        }
1157
1158        ret = resume(gt);
1159        if (ret)
1160                goto taint;
1161
1162finish:
1163        reset_finish(gt, awake);
1164unlock:
1165        mutex_unlock(&gt->reset.mutex);
1166        return;
1167
1168taint:
1169        /*
1170         * History tells us that if we cannot reset the GPU now, we
1171         * never will. This then impacts everything that is run
1172         * subsequently. On failing the reset, we mark the driver
1173         * as wedged, preventing further execution on the GPU.
1174         * We also want to go one step further and add a taint to the
1175         * kernel so that any subsequent faults can be traced back to
1176         * this failure. This is important for CI, where if the
1177         * GPU/driver fails we would like to reboot and restart testing
1178         * rather than continue on into oblivion. For everyone else,
1179         * the system should still plod along, but they have been warned!
1180         */
1181        add_taint_for_CI(gt->i915, TAINT_WARN);
1182error:
1183        __intel_gt_set_wedged(gt);
1184        goto finish;
1185}
1186
1187static int intel_gt_reset_engine(struct intel_engine_cs *engine)
1188{
1189        return __intel_gt_reset(engine->gt, engine->mask);
1190}
1191
1192int __intel_engine_reset_bh(struct intel_engine_cs *engine, const char *msg)
1193{
1194        struct intel_gt *gt = engine->gt;
1195        int ret;
1196
1197        ENGINE_TRACE(engine, "flags=%lx\n", gt->reset.flags);
1198        GEM_BUG_ON(!test_bit(I915_RESET_ENGINE + engine->id, &gt->reset.flags));
1199
1200        if (intel_engine_uses_guc(engine))
1201                return -ENODEV;
1202
1203        if (!intel_engine_pm_get_if_awake(engine))
1204                return 0;
1205
1206        reset_prepare_engine(engine);
1207
1208        if (msg)
1209                drm_notice(&engine->i915->drm,
1210                           "Resetting %s for %s\n", engine->name, msg);
1211        atomic_inc(&engine->i915->gpu_error.reset_engine_count[engine->uabi_class]);
1212
1213        ret = intel_gt_reset_engine(engine);
1214        if (ret) {
1215                /* If we fail here, we expect to fallback to a global reset */
1216                ENGINE_TRACE(engine, "Failed to reset %s, err: %d\n", engine->name, ret);
1217                goto out;
1218        }
1219
1220        /*
 1221         * The request that caused the hang is stuck on elsp; we know the
 1222         * active request and can drop it, adjusting HEAD to skip the
 1223         * offending request and resume the remaining requests in the queue.
1224         */
1225        __intel_engine_reset(engine, true);
1226
1227        /*
1228         * The engine and its registers (and workarounds in case of render)
1229         * have been reset to their default values. Follow the init_ring
1230         * process to program RING_MODE, HWSP and re-enable submission.
1231         */
1232        ret = intel_engine_resume(engine);
1233
1234out:
1235        intel_engine_cancel_stop_cs(engine);
1236        reset_finish_engine(engine);
1237        intel_engine_pm_put_async(engine);
1238        return ret;
1239}
1240
1241/**
1242 * intel_engine_reset - reset GPU engine to recover from a hang
1243 * @engine: engine to reset
1244 * @msg: reason for GPU reset; or NULL for no drm_notice()
1245 *
1246 * Reset a specific GPU engine. Useful if a hang is detected.
1247 * Returns zero on successful reset or otherwise an error code.
1248 *
1249 * Procedure is:
 1250 *  - identify the request that caused the hang and drop it
1251 *  - reset engine (which will force the engine to idle)
1252 *  - re-init/configure engine
1253 */
1254int intel_engine_reset(struct intel_engine_cs *engine, const char *msg)
1255{
1256        int err;
1257
1258        local_bh_disable();
1259        err = __intel_engine_reset_bh(engine, msg);
1260        local_bh_enable();
1261
1262        return err;
1263}
1264
1265static void intel_gt_reset_global(struct intel_gt *gt,
1266                                  u32 engine_mask,
1267                                  const char *reason)
1268{
1269        struct kobject *kobj = &gt->i915->drm.primary->kdev->kobj;
1270        char *error_event[] = { I915_ERROR_UEVENT "=1", NULL };
1271        char *reset_event[] = { I915_RESET_UEVENT "=1", NULL };
1272        char *reset_done_event[] = { I915_ERROR_UEVENT "=0", NULL };
1273        struct intel_wedge_me w;
1274
1275        kobject_uevent_env(kobj, KOBJ_CHANGE, error_event);
1276
1277        GT_TRACE(gt, "resetting chip, engines=%x\n", engine_mask);
1278        kobject_uevent_env(kobj, KOBJ_CHANGE, reset_event);
1279
1280        /* Use a watchdog to ensure that our reset completes */
1281        intel_wedge_on_timeout(&w, gt, 5 * HZ) {
1282                intel_display_prepare_reset(gt->i915);
1283
1284                /* Flush everyone using a resource about to be clobbered */
1285                synchronize_srcu_expedited(&gt->reset.backoff_srcu);
1286
1287                intel_gt_reset(gt, engine_mask, reason);
1288
1289                intel_display_finish_reset(gt->i915);
1290        }
1291
1292        if (!test_bit(I915_WEDGED, &gt->reset.flags))
1293                kobject_uevent_env(kobj, KOBJ_CHANGE, reset_done_event);
1294}
1295
1296/**
1297 * intel_gt_handle_error - handle a gpu error
1298 * @gt: the intel_gt
1299 * @engine_mask: mask representing engines that are hung
1300 * @flags: control flags
1301 * @fmt: Error message format string
1302 *
1303 * Do some basic checking of register state at error time and
1304 * dump it to the syslog.  Also call i915_capture_error_state() to make
1305 * sure we get a record and make it available in debugfs.  Fire a uevent
1306 * so userspace knows something bad happened (should trigger collection
1307 * of a ring dump etc.).
1308 */
1309void intel_gt_handle_error(struct intel_gt *gt,
1310                           intel_engine_mask_t engine_mask,
1311                           unsigned long flags,
1312                           const char *fmt, ...)
1313{
1314        struct intel_engine_cs *engine;
1315        intel_wakeref_t wakeref;
1316        intel_engine_mask_t tmp;
1317        char error_msg[80];
1318        char *msg = NULL;
1319
1320        if (fmt) {
1321                va_list args;
1322
1323                va_start(args, fmt);
1324                vscnprintf(error_msg, sizeof(error_msg), fmt, args);
1325                va_end(args);
1326
1327                msg = error_msg;
1328        }
1329
1330        /*
1331         * In most cases it's guaranteed that we get here with an RPM
1332         * reference held, for example because there is a pending GPU
1333         * request that won't finish until the reset is done. This
1334         * isn't the case at least when we get here by doing a
1335         * simulated reset via debugfs, so get an RPM reference.
1336         */
1337        wakeref = intel_runtime_pm_get(gt->uncore->rpm);
1338
1339        engine_mask &= gt->info.engine_mask;
1340
1341        if (flags & I915_ERROR_CAPTURE) {
1342                i915_capture_error_state(gt, engine_mask, CORE_DUMP_FLAG_NONE);
1343                intel_gt_clear_error_registers(gt, engine_mask);
1344        }
1345
1346        /*
1347         * Try engine reset when available. We fall back to full reset if
1348         * single reset fails.
1349         */
1350        if (!intel_uc_uses_guc_submission(&gt->uc) &&
1351            intel_has_reset_engine(gt) && !intel_gt_is_wedged(gt)) {
1352                local_bh_disable();
1353                for_each_engine_masked(engine, gt, engine_mask, tmp) {
1354                        BUILD_BUG_ON(I915_RESET_MODESET >= I915_RESET_ENGINE);
1355                        if (test_and_set_bit(I915_RESET_ENGINE + engine->id,
1356                                             &gt->reset.flags))
1357                                continue;
1358
1359                        if (__intel_engine_reset_bh(engine, msg) == 0)
1360                                engine_mask &= ~engine->mask;
1361
1362                        clear_and_wake_up_bit(I915_RESET_ENGINE + engine->id,
1363                                              &gt->reset.flags);
1364                }
1365                local_bh_enable();
1366        }
1367
1368        if (!engine_mask)
1369                goto out;
1370
1371        /* Full reset needs the mutex, stop any other user trying to do so. */
1372        if (test_and_set_bit(I915_RESET_BACKOFF, &gt->reset.flags)) {
1373                wait_event(gt->reset.queue,
1374                           !test_bit(I915_RESET_BACKOFF, &gt->reset.flags));
1375                goto out; /* piggy-back on the other reset */
1376        }
1377
 1378        /* Make sure intel_gt_reset_trylock() sees the I915_RESET_BACKOFF */
1379        synchronize_rcu_expedited();
1380
1381        /*
 1382         * Prevent any other reset-engine attempt. We don't do this for GuC
 1383         * submission, as the GuC owns the per-engine reset, not the i915.
1384         */
1385        if (!intel_uc_uses_guc_submission(&gt->uc)) {
1386                for_each_engine(engine, gt, tmp) {
1387                        while (test_and_set_bit(I915_RESET_ENGINE + engine->id,
1388                                                &gt->reset.flags))
1389                                wait_on_bit(&gt->reset.flags,
1390                                            I915_RESET_ENGINE + engine->id,
1391                                            TASK_UNINTERRUPTIBLE);
1392                }
1393        }
1394
1395        intel_gt_reset_global(gt, engine_mask, msg);
1396
1397        if (!intel_uc_uses_guc_submission(&gt->uc)) {
1398                for_each_engine(engine, gt, tmp)
1399                        clear_bit_unlock(I915_RESET_ENGINE + engine->id,
1400                                         &gt->reset.flags);
1401        }
1402        clear_bit_unlock(I915_RESET_BACKOFF, &gt->reset.flags);
1403        smp_mb__after_atomic();
1404        wake_up_all(&gt->reset.queue);
1405
1406out:
1407        intel_runtime_pm_put(gt->uncore->rpm, wakeref);
1408}
1409
1410int intel_gt_reset_trylock(struct intel_gt *gt, int *srcu)
1411{
1412        might_lock(&gt->reset.backoff_srcu);
1413        might_sleep();
1414
1415        rcu_read_lock();
1416        while (test_bit(I915_RESET_BACKOFF, &gt->reset.flags)) {
1417                rcu_read_unlock();
1418
1419                if (wait_event_interruptible(gt->reset.queue,
1420                                             !test_bit(I915_RESET_BACKOFF,
1421                                                       &gt->reset.flags)))
1422                        return -EINTR;
1423
1424                rcu_read_lock();
1425        }
1426        *srcu = srcu_read_lock(&gt->reset.backoff_srcu);
1427        rcu_read_unlock();
1428
1429        return 0;
1430}
1431
1432void intel_gt_reset_unlock(struct intel_gt *gt, int tag)
1433__releases(&gt->reset.backoff_srcu)
1434{
1435        srcu_read_unlock(&gt->reset.backoff_srcu, tag);
1436}
1437
1438int intel_gt_terminally_wedged(struct intel_gt *gt)
1439{
1440        might_sleep();
1441
1442        if (!intel_gt_is_wedged(gt))
1443                return 0;
1444
1445        if (intel_gt_has_unrecoverable_error(gt))
1446                return -EIO;
1447
1448        /* Reset still in progress? Maybe we will recover? */
1449        if (wait_event_interruptible(gt->reset.queue,
1450                                     !test_bit(I915_RESET_BACKOFF,
1451                                               &gt->reset.flags)))
1452                return -EINTR;
1453
1454        return intel_gt_is_wedged(gt) ? -EIO : 0;
1455}
1456
1457void intel_gt_set_wedged_on_init(struct intel_gt *gt)
1458{
1459        BUILD_BUG_ON(I915_RESET_ENGINE + I915_NUM_ENGINES >
1460                     I915_WEDGED_ON_INIT);
1461        intel_gt_set_wedged(gt);
1462        i915_disable_error_state(gt->i915, -ENODEV);
1463        set_bit(I915_WEDGED_ON_INIT, &gt->reset.flags);
1464
1465        /* Wedged on init is non-recoverable */
1466        add_taint_for_CI(gt->i915, TAINT_WARN);
1467}
1468
1469void intel_gt_set_wedged_on_fini(struct intel_gt *gt)
1470{
1471        intel_gt_set_wedged(gt);
1472        i915_disable_error_state(gt->i915, -ENODEV);
1473        set_bit(I915_WEDGED_ON_FINI, &gt->reset.flags);
1474        intel_gt_retire_requests(gt); /* cleanup any wedged requests */
1475}
1476
1477void intel_gt_init_reset(struct intel_gt *gt)
1478{
1479        init_waitqueue_head(&gt->reset.queue);
1480        mutex_init(&gt->reset.mutex);
1481        init_srcu_struct(&gt->reset.backoff_srcu);
1482
1483        /*
1484         * While undesirable to wait inside the shrinker, complain anyway.
1485         *
1486         * If we have to wait during shrinking, we guarantee forward progress
1487         * by forcing the reset. Therefore during the reset we must not
1488         * re-enter the shrinker. By declaring that we take the reset mutex
1489         * within the shrinker, we forbid ourselves from performing any
1490         * fs-reclaim or taking related locks during reset.
1491         */
1492        i915_gem_shrinker_taints_mutex(gt->i915, &gt->reset.mutex);
1493
1494        /* no GPU until we are ready! */
1495        __set_bit(I915_WEDGED, &gt->reset.flags);
1496}
1497
1498void intel_gt_fini_reset(struct intel_gt *gt)
1499{
1500        cleanup_srcu_struct(&gt->reset.backoff_srcu);
1501}
1502
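/*
 * Watchdog for intel_wedge_on_timeout(): if the guarded reset section does
 * not complete in time, give up and declare the GPU wedged.
 */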
1503static void intel_wedge_me(struct work_struct *work)
1504{
1505        struct intel_wedge_me *w = container_of(work, typeof(*w), work.work);
1506
1507        drm_err(&w->gt->i915->drm,
1508                "%s timed out, cancelling all in-flight rendering.\n",
1509                w->name);
1510        intel_gt_set_wedged(w->gt);
1511}
1512
1513void __intel_init_wedge(struct intel_wedge_me *w,
1514                        struct intel_gt *gt,
1515                        long timeout,
1516                        const char *name)
1517{
1518        w->gt = gt;
1519        w->name = name;
1520
1521        INIT_DELAYED_WORK_ONSTACK(&w->work, intel_wedge_me);
1522        schedule_delayed_work(&w->work, timeout);
1523}
1524
1525void __intel_fini_wedge(struct intel_wedge_me *w)
1526{
1527        cancel_delayed_work_sync(&w->work);
1528        destroy_delayed_work_on_stack(&w->work);
1529        w->gt = NULL;
1530}
1531
1532#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
1533#include "selftest_reset.c"
1534#include "selftest_hangcheck.c"
1535#endif
1536