linux/drivers/gpu/drm/i915/gt/intel_reset.c
   1// SPDX-License-Identifier: MIT
   2/*
   3 * Copyright © 2008-2018 Intel Corporation
   4 */
   5
   6#include <linux/sched/mm.h>
   7#include <linux/stop_machine.h>
   8
   9#include "display/intel_display.h"
  10#include "display/intel_overlay.h"
  11
  12#include "gem/i915_gem_context.h"
  13
  14#include "i915_drv.h"
  15#include "i915_gpu_error.h"
  16#include "i915_irq.h"
  17#include "intel_breadcrumbs.h"
  18#include "intel_engine_pm.h"
  19#include "intel_gt.h"
  20#include "intel_gt_pm.h"
  21#include "intel_gt_requests.h"
  22#include "intel_reset.h"
  23
  24#include "uc/intel_guc.h"
  25
  26#define RESET_MAX_RETRIES 3
  27
  28/* XXX How to handle concurrent GGTT updates using tiling registers? */
  29#define RESET_UNDER_STOP_MACHINE 0
  30
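/*
 * Small helpers around intel_uncore_rmw_fw(). The _fw suffix selects the
 * raw register accessors that do no forcewake handling of their own; the
 * reset paths below take care of that themselves (see __intel_gt_reset()).
 */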
  31static void rmw_set_fw(struct intel_uncore *uncore, i915_reg_t reg, u32 set)
  32{
  33        intel_uncore_rmw_fw(uncore, reg, 0, set);
  34}
  35
  36static void rmw_clear_fw(struct intel_uncore *uncore, i915_reg_t reg, u32 clr)
  37{
  38        intel_uncore_rmw_fw(uncore, reg, clr, 0);
  39}
  40
  41static void client_mark_guilty(struct i915_gem_context *ctx, bool banned)
  42{
  43        struct drm_i915_file_private *file_priv = ctx->file_priv;
  44        unsigned long prev_hang;
  45        unsigned int score;
  46
  47        if (IS_ERR_OR_NULL(file_priv))
  48                return;
  49
  50        score = 0;
  51        if (banned)
  52                score = I915_CLIENT_SCORE_CONTEXT_BAN;
  53
  54        prev_hang = xchg(&file_priv->hang_timestamp, jiffies);
  55        if (time_before(jiffies, prev_hang + I915_CLIENT_FAST_HANG_JIFFIES))
  56                score += I915_CLIENT_SCORE_HANG_FAST;
  57
  58        if (score) {
  59                atomic_add(score, &file_priv->ban_score);
  60
  61                drm_dbg(&ctx->i915->drm,
  62                        "client %s: gained %u ban score, now %u\n",
  63                        ctx->name, score,
  64                        atomic_read(&file_priv->ban_score));
  65        }
  66}
  67
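/*
 * Ban policy in brief: each hang bumps the context's guilty count; a context
 * is banned if it is marked non-recoverable, or if it hangs again within
 * CONTEXT_FAST_HANG_JIFFIES of its previous hang. Contexts flagged as
 * non-bannable (e.g. by the reset selftests) are never banned.
 */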
  68static bool mark_guilty(struct i915_request *rq)
  69{
  70        struct i915_gem_context *ctx;
  71        unsigned long prev_hang;
  72        bool banned;
  73        int i;
  74
  75        if (intel_context_is_closed(rq->context))
  76                return true;
  77
  78        rcu_read_lock();
  79        ctx = rcu_dereference(rq->context->gem_context);
  80        if (ctx && !kref_get_unless_zero(&ctx->ref))
  81                ctx = NULL;
  82        rcu_read_unlock();
  83        if (!ctx)
  84                return intel_context_is_banned(rq->context);
  85
  86        atomic_inc(&ctx->guilty_count);
  87
  88        /* Cool contexts are too cool to be banned! (Used for reset testing.) */
  89        if (!i915_gem_context_is_bannable(ctx)) {
  90                banned = false;
  91                goto out;
  92        }
  93
  94        drm_notice(&ctx->i915->drm,
  95                   "%s context reset due to GPU hang\n",
  96                   ctx->name);
  97
  98        /* Record the timestamp for the last N hangs */
  99        prev_hang = ctx->hang_timestamp[0];
 100        for (i = 0; i < ARRAY_SIZE(ctx->hang_timestamp) - 1; i++)
 101                ctx->hang_timestamp[i] = ctx->hang_timestamp[i + 1];
 102        ctx->hang_timestamp[i] = jiffies;
 103
 104        /* If we have hung N+1 times in rapid succession, we ban the context! */
 105        banned = !i915_gem_context_is_recoverable(ctx);
 106        if (time_before(jiffies, prev_hang + CONTEXT_FAST_HANG_JIFFIES))
 107                banned = true;
 108        if (banned)
 109                drm_dbg(&ctx->i915->drm, "context %s: guilty %d, banned\n",
 110                        ctx->name, atomic_read(&ctx->guilty_count));
 111
 112        client_mark_guilty(ctx, banned);
 113
 114out:
 115        i915_gem_context_put(ctx);
 116        return banned;
 117}
 118
 119static void mark_innocent(struct i915_request *rq)
 120{
 121        struct i915_gem_context *ctx;
 122
 123        rcu_read_lock();
 124        ctx = rcu_dereference(rq->context->gem_context);
 125        if (ctx)
 126                atomic_inc(&ctx->active_count);
 127        rcu_read_unlock();
 128}
 129
 130void __i915_request_reset(struct i915_request *rq, bool guilty)
 131{
 132        bool banned = false;
 133
 134        RQ_TRACE(rq, "guilty? %s\n", yesno(guilty));
 135        GEM_BUG_ON(__i915_request_is_complete(rq));
 136
 137        rcu_read_lock(); /* protect the GEM context */
 138        if (guilty) {
 139                i915_request_set_error_once(rq, -EIO);
 140                __i915_request_skip(rq);
 141                banned = mark_guilty(rq);
 142        } else {
 143                i915_request_set_error_once(rq, -EAGAIN);
 144                mark_innocent(rq);
 145        }
 146        rcu_read_unlock();
 147
 148        if (banned)
 149                intel_context_ban(rq->context, rq);
 150}
 151
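/*
 * What follows are the platform specific reset backends, oldest first:
 * PCI config space (I915_GDRST) resets for the pre-gen5 platforms, the
 * ILK_GDSR register for gen5, and GEN6_GDRST domain resets from gen6
 * onwards, with additional per-engine and SFC handshaking on gen8+/gen11+.
 * intel_get_gpu_reset() picks the appropriate backend for the device.
 */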
 152static bool i915_in_reset(struct pci_dev *pdev)
 153{
 154        u8 gdrst;
 155
 156        pci_read_config_byte(pdev, I915_GDRST, &gdrst);
 157        return gdrst & GRDOM_RESET_STATUS;
 158}
 159
 160static int i915_do_reset(struct intel_gt *gt,
 161                         intel_engine_mask_t engine_mask,
 162                         unsigned int retry)
 163{
 164        struct pci_dev *pdev = to_pci_dev(gt->i915->drm.dev);
 165        int err;
 166
 167        /* Assert reset for at least 20 usec, and wait for acknowledgement. */
 168        pci_write_config_byte(pdev, I915_GDRST, GRDOM_RESET_ENABLE);
 169        udelay(50);
 170        err = wait_for_atomic(i915_in_reset(pdev), 50);
 171
 172        /* Clear the reset request. */
 173        pci_write_config_byte(pdev, I915_GDRST, 0);
 174        udelay(50);
 175        if (!err)
 176                err = wait_for_atomic(!i915_in_reset(pdev), 50);
 177
 178        return err;
 179}
 180
 181static bool g4x_reset_complete(struct pci_dev *pdev)
 182{
 183        u8 gdrst;
 184
 185        pci_read_config_byte(pdev, I915_GDRST, &gdrst);
 186        return (gdrst & GRDOM_RESET_ENABLE) == 0;
 187}
 188
 189static int g33_do_reset(struct intel_gt *gt,
 190                        intel_engine_mask_t engine_mask,
 191                        unsigned int retry)
 192{
 193        struct pci_dev *pdev = to_pci_dev(gt->i915->drm.dev);
 194
 195        pci_write_config_byte(pdev, I915_GDRST, GRDOM_RESET_ENABLE);
 196        return wait_for_atomic(g4x_reset_complete(pdev), 50);
 197}
 198
 199static int g4x_do_reset(struct intel_gt *gt,
 200                        intel_engine_mask_t engine_mask,
 201                        unsigned int retry)
 202{
 203        struct pci_dev *pdev = to_pci_dev(gt->i915->drm.dev);
 204        struct intel_uncore *uncore = gt->uncore;
 205        int ret;
 206
 207        /* WaVcpClkGateDisableForMediaReset:ctg,elk */
 208        rmw_set_fw(uncore, VDECCLK_GATE_D, VCP_UNIT_CLOCK_GATE_DISABLE);
 209        intel_uncore_posting_read_fw(uncore, VDECCLK_GATE_D);
 210
 211        pci_write_config_byte(pdev, I915_GDRST,
 212                              GRDOM_MEDIA | GRDOM_RESET_ENABLE);
        ret = wait_for_atomic(g4x_reset_complete(pdev), 50);
 214        if (ret) {
 215                GT_TRACE(gt, "Wait for media reset failed\n");
 216                goto out;
 217        }
 218
 219        pci_write_config_byte(pdev, I915_GDRST,
 220                              GRDOM_RENDER | GRDOM_RESET_ENABLE);
        ret = wait_for_atomic(g4x_reset_complete(pdev), 50);
 222        if (ret) {
 223                GT_TRACE(gt, "Wait for render reset failed\n");
 224                goto out;
 225        }
 226
 227out:
 228        pci_write_config_byte(pdev, I915_GDRST, 0);
 229
 230        rmw_clear_fw(uncore, VDECCLK_GATE_D, VCP_UNIT_CLOCK_GATE_DISABLE);
 231        intel_uncore_posting_read_fw(uncore, VDECCLK_GATE_D);
 232
 233        return ret;
 234}
 235
 236static int ilk_do_reset(struct intel_gt *gt, intel_engine_mask_t engine_mask,
 237                        unsigned int retry)
 238{
 239        struct intel_uncore *uncore = gt->uncore;
 240        int ret;
 241
 242        intel_uncore_write_fw(uncore, ILK_GDSR,
 243                              ILK_GRDOM_RENDER | ILK_GRDOM_RESET_ENABLE);
 244        ret = __intel_wait_for_register_fw(uncore, ILK_GDSR,
 245                                           ILK_GRDOM_RESET_ENABLE, 0,
 246                                           5000, 0,
 247                                           NULL);
 248        if (ret) {
 249                GT_TRACE(gt, "Wait for render reset failed\n");
 250                goto out;
 251        }
 252
 253        intel_uncore_write_fw(uncore, ILK_GDSR,
 254                              ILK_GRDOM_MEDIA | ILK_GRDOM_RESET_ENABLE);
 255        ret = __intel_wait_for_register_fw(uncore, ILK_GDSR,
 256                                           ILK_GRDOM_RESET_ENABLE, 0,
 257                                           5000, 0,
 258                                           NULL);
 259        if (ret) {
 260                GT_TRACE(gt, "Wait for media reset failed\n");
 261                goto out;
 262        }
 263
 264out:
 265        intel_uncore_write_fw(uncore, ILK_GDSR, 0);
 266        intel_uncore_posting_read_fw(uncore, ILK_GDSR);
 267        return ret;
 268}
 269
 270/* Reset the hardware domains (GENX_GRDOM_*) specified by mask */
 271static int gen6_hw_domain_reset(struct intel_gt *gt, u32 hw_domain_mask)
 272{
 273        struct intel_uncore *uncore = gt->uncore;
 274        int err;
 275
 276        /*
 277         * GEN6_GDRST is not in the gt power well, no need to check
 278         * for fifo space for the write or forcewake the chip for
 279         * the read
 280         */
 281        intel_uncore_write_fw(uncore, GEN6_GDRST, hw_domain_mask);
 282
 283        /* Wait for the device to ack the reset requests */
 284        err = __intel_wait_for_register_fw(uncore,
 285                                           GEN6_GDRST, hw_domain_mask, 0,
 286                                           500, 0,
 287                                           NULL);
 288        if (err)
 289                GT_TRACE(gt,
 290                         "Wait for 0x%08x engines reset failed\n",
 291                         hw_domain_mask);
 292
 293        return err;
 294}
 295
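/*
 * Illustrative sketch (not a literal call made here): resetting only the
 * render engine on gen6+ reduces to setting its reset domain bit and
 * waiting for the hardware to clear it again, roughly
 *
 *      err = gen6_hw_domain_reset(gt, GEN6_GRDOM_RENDER);
 *
 * gen6_reset_engines() below builds that mask from engine->reset_domain,
 * or uses GEN6_GRDOM_FULL when all engines are being reset.
 */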
 296static int gen6_reset_engines(struct intel_gt *gt,
 297                              intel_engine_mask_t engine_mask,
 298                              unsigned int retry)
 299{
 300        struct intel_engine_cs *engine;
 301        u32 hw_mask;
 302
 303        if (engine_mask == ALL_ENGINES) {
 304                hw_mask = GEN6_GRDOM_FULL;
 305        } else {
 306                intel_engine_mask_t tmp;
 307
 308                hw_mask = 0;
 309                for_each_engine_masked(engine, gt, engine_mask, tmp) {
 310                        hw_mask |= engine->reset_domain;
 311                }
 312        }
 313
 314        return gen6_hw_domain_reset(gt, hw_mask);
 315}
 316
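/*
 * On gen11+ a video decode (VCS) engine may share an SFC (scaler/format
 * converter) unit with its paired video enhancement (VECS) engine;
 * gt->info.vdbox_sfc_access records which VDBOXes have such access. Before
 * resetting one of these engines the shared SFC has to be force-locked, and
 * possibly reset along with the engine, which is what the helpers below do.
 */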
 317static struct intel_engine_cs *find_sfc_paired_vecs_engine(struct intel_engine_cs *engine)
 318{
 319        int vecs_id;
 320
 321        GEM_BUG_ON(engine->class != VIDEO_DECODE_CLASS);
 322
 323        vecs_id = _VECS((engine->instance) / 2);
 324
 325        return engine->gt->engine[vecs_id];
 326}
 327
 328struct sfc_lock_data {
 329        i915_reg_t lock_reg;
 330        i915_reg_t ack_reg;
 331        i915_reg_t usage_reg;
 332        u32 lock_bit;
 333        u32 ack_bit;
 334        u32 usage_bit;
 335        u32 reset_bit;
 336};
 337
 338static void get_sfc_forced_lock_data(struct intel_engine_cs *engine,
 339                                     struct sfc_lock_data *sfc_lock)
 340{
 341        switch (engine->class) {
 342        default:
 343                MISSING_CASE(engine->class);
 344                fallthrough;
 345        case VIDEO_DECODE_CLASS:
 346                sfc_lock->lock_reg = GEN11_VCS_SFC_FORCED_LOCK(engine);
 347                sfc_lock->lock_bit = GEN11_VCS_SFC_FORCED_LOCK_BIT;
 348
 349                sfc_lock->ack_reg = GEN11_VCS_SFC_LOCK_STATUS(engine);
 350                sfc_lock->ack_bit  = GEN11_VCS_SFC_LOCK_ACK_BIT;
 351
 352                sfc_lock->usage_reg = GEN11_VCS_SFC_LOCK_STATUS(engine);
 353                sfc_lock->usage_bit = GEN11_VCS_SFC_USAGE_BIT;
 354                sfc_lock->reset_bit = GEN11_VCS_SFC_RESET_BIT(engine->instance);
 355
 356                break;
 357        case VIDEO_ENHANCEMENT_CLASS:
 358                sfc_lock->lock_reg = GEN11_VECS_SFC_FORCED_LOCK(engine);
 359                sfc_lock->lock_bit = GEN11_VECS_SFC_FORCED_LOCK_BIT;
 360
 361                sfc_lock->ack_reg = GEN11_VECS_SFC_LOCK_ACK(engine);
 362                sfc_lock->ack_bit  = GEN11_VECS_SFC_LOCK_ACK_BIT;
 363
 364                sfc_lock->usage_reg = GEN11_VECS_SFC_USAGE(engine);
 365                sfc_lock->usage_bit = GEN11_VECS_SFC_USAGE_BIT;
 366                sfc_lock->reset_bit = GEN11_VECS_SFC_RESET_BIT(engine->instance);
 367
 368                break;
 369        }
 370}
 371
 372static int gen11_lock_sfc(struct intel_engine_cs *engine,
 373                          u32 *reset_mask,
 374                          u32 *unlock_mask)
 375{
 376        struct intel_uncore *uncore = engine->uncore;
 377        u8 vdbox_sfc_access = engine->gt->info.vdbox_sfc_access;
 378        struct sfc_lock_data sfc_lock;
 379        bool lock_obtained, lock_to_other = false;
 380        int ret;
 381
 382        switch (engine->class) {
 383        case VIDEO_DECODE_CLASS:
 384                if ((BIT(engine->instance) & vdbox_sfc_access) == 0)
 385                        return 0;
 386
 387                fallthrough;
 388        case VIDEO_ENHANCEMENT_CLASS:
 389                get_sfc_forced_lock_data(engine, &sfc_lock);
 390
 391                break;
 392        default:
 393                return 0;
 394        }
 395
 396        if (!(intel_uncore_read_fw(uncore, sfc_lock.usage_reg) & sfc_lock.usage_bit)) {
 397                struct intel_engine_cs *paired_vecs;
 398
 399                if (engine->class != VIDEO_DECODE_CLASS ||
 400                    GRAPHICS_VER(engine->i915) != 12)
 401                        return 0;
 402
 403                /*
 404                 * Wa_14010733141
 405                 *
 406                 * If the VCS-MFX isn't using the SFC, we also need to check
 407                 * whether VCS-HCP is using it.  If so, we need to issue a *VE*
 408                 * forced lock on the VE engine that shares the same SFC.
 409                 */
 410                if (!(intel_uncore_read_fw(uncore,
 411                                           GEN12_HCP_SFC_LOCK_STATUS(engine)) &
 412                      GEN12_HCP_SFC_USAGE_BIT))
 413                        return 0;
 414
 415                paired_vecs = find_sfc_paired_vecs_engine(engine);
 416                get_sfc_forced_lock_data(paired_vecs, &sfc_lock);
 417                lock_to_other = true;
 418                *unlock_mask |= paired_vecs->mask;
 419        } else {
 420                *unlock_mask |= engine->mask;
 421        }
 422
 423        /*
 424         * If the engine is using an SFC, tell the engine that a software reset
 425         * is going to happen. The engine will then try to force lock the SFC.
 426         * If SFC ends up being locked to the engine we want to reset, we have
 427         * to reset it as well (we will unlock it once the reset sequence is
 428         * completed).
 429         */
 430        rmw_set_fw(uncore, sfc_lock.lock_reg, sfc_lock.lock_bit);
 431
 432        ret = __intel_wait_for_register_fw(uncore,
 433                                           sfc_lock.ack_reg,
 434                                           sfc_lock.ack_bit,
 435                                           sfc_lock.ack_bit,
 436                                           1000, 0, NULL);
 437
 438        /*
 439         * Was the SFC released while we were trying to lock it?
 440         *
 441         * We should reset both the engine and the SFC if:
 442         *  - We were locking the SFC to this engine and the lock succeeded
 443         *       OR
 444         *  - We were locking the SFC to a different engine (Wa_14010733141)
 445         *    but the SFC was released before the lock was obtained.
 446         *
 447         * Otherwise we need only reset the engine by itself and we can
 448         * leave the SFC alone.
 449         */
 450        lock_obtained = (intel_uncore_read_fw(uncore, sfc_lock.usage_reg) &
 451                        sfc_lock.usage_bit) != 0;
 452        if (lock_obtained == lock_to_other)
 453                return 0;
 454
 455        if (ret) {
 456                ENGINE_TRACE(engine, "Wait for SFC forced lock ack failed\n");
 457                return ret;
 458        }
 459
 460        *reset_mask |= sfc_lock.reset_bit;
 461        return 0;
 462}
 463
 464static void gen11_unlock_sfc(struct intel_engine_cs *engine)
 465{
 466        struct intel_uncore *uncore = engine->uncore;
 467        u8 vdbox_sfc_access = engine->gt->info.vdbox_sfc_access;
 468        struct sfc_lock_data sfc_lock = {};
 469
 470        if (engine->class != VIDEO_DECODE_CLASS &&
 471            engine->class != VIDEO_ENHANCEMENT_CLASS)
 472                return;
 473
 474        if (engine->class == VIDEO_DECODE_CLASS &&
 475            (BIT(engine->instance) & vdbox_sfc_access) == 0)
 476                return;
 477
 478        get_sfc_forced_lock_data(engine, &sfc_lock);
 479
 480        rmw_clear_fw(uncore, sfc_lock.lock_reg, sfc_lock.lock_bit);
 481}
 482
 483static int gen11_reset_engines(struct intel_gt *gt,
 484                               intel_engine_mask_t engine_mask,
 485                               unsigned int retry)
 486{
 487        struct intel_engine_cs *engine;
 488        intel_engine_mask_t tmp;
 489        u32 reset_mask, unlock_mask = 0;
 490        int ret;
 491
 492        if (engine_mask == ALL_ENGINES) {
 493                reset_mask = GEN11_GRDOM_FULL;
 494        } else {
 495                reset_mask = 0;
 496                for_each_engine_masked(engine, gt, engine_mask, tmp) {
 497                        reset_mask |= engine->reset_domain;
 498                        ret = gen11_lock_sfc(engine, &reset_mask, &unlock_mask);
 499                        if (ret)
 500                                goto sfc_unlock;
 501                }
 502        }
 503
 504        ret = gen6_hw_domain_reset(gt, reset_mask);
 505
 506sfc_unlock:
 507        /*
 508         * We unlock the SFC based on the lock status and not the result of
         * gen11_lock_sfc to make sure that we clean up properly if something
         * went wrong during the lock (e.g. the lock was acquired after the
         * timeout expired).
 512         *
 513         * Due to Wa_14010733141, we may have locked an SFC to an engine that
 514         * wasn't being reset.  So instead of calling gen11_unlock_sfc()
 515         * on engine_mask, we instead call it on the mask of engines that our
 516         * gen11_lock_sfc() calls told us actually had locks attempted.
 517         */
 518        for_each_engine_masked(engine, gt, unlock_mask, tmp)
 519                gen11_unlock_sfc(engine);
 520
 521        return ret;
 522}
 523
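/*
 * Gen8+ adds a ready-for-reset handshake: before pulling the reset line we
 * ask the engine to quiesce via RING_RESET_CTL and wait for it to report
 * READY_TO_RESET. Catastrophic errors bypass the handshake (see below), and
 * on a retry we proceed even without the ack rather than give up entirely.
 */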
 524static int gen8_engine_reset_prepare(struct intel_engine_cs *engine)
 525{
 526        struct intel_uncore *uncore = engine->uncore;
 527        const i915_reg_t reg = RING_RESET_CTL(engine->mmio_base);
 528        u32 request, mask, ack;
 529        int ret;
 530
 531        if (I915_SELFTEST_ONLY(should_fail(&engine->reset_timeout, 1)))
 532                return -ETIMEDOUT;
 533
 534        ack = intel_uncore_read_fw(uncore, reg);
 535        if (ack & RESET_CTL_CAT_ERROR) {
 536                /*
 537                 * For catastrophic errors, ready-for-reset sequence
 538                 * needs to be bypassed: HAS#396813
 539                 */
 540                request = RESET_CTL_CAT_ERROR;
 541                mask = RESET_CTL_CAT_ERROR;
 542
 543                /* Catastrophic errors need to be cleared by HW */
 544                ack = 0;
 545        } else if (!(ack & RESET_CTL_READY_TO_RESET)) {
 546                request = RESET_CTL_REQUEST_RESET;
 547                mask = RESET_CTL_READY_TO_RESET;
 548                ack = RESET_CTL_READY_TO_RESET;
 549        } else {
 550                return 0;
 551        }
 552
 553        intel_uncore_write_fw(uncore, reg, _MASKED_BIT_ENABLE(request));
 554        ret = __intel_wait_for_register_fw(uncore, reg, mask, ack,
 555                                           700, 0, NULL);
 556        if (ret)
 557                drm_err(&engine->i915->drm,
 558                        "%s reset request timed out: {request: %08x, RESET_CTL: %08x}\n",
 559                        engine->name, request,
 560                        intel_uncore_read_fw(uncore, reg));
 561
 562        return ret;
 563}
 564
 565static void gen8_engine_reset_cancel(struct intel_engine_cs *engine)
 566{
 567        intel_uncore_write_fw(engine->uncore,
 568                              RING_RESET_CTL(engine->mmio_base),
 569                              _MASKED_BIT_DISABLE(RESET_CTL_REQUEST_RESET));
 570}
 571
 572static int gen8_reset_engines(struct intel_gt *gt,
 573                              intel_engine_mask_t engine_mask,
 574                              unsigned int retry)
 575{
 576        struct intel_engine_cs *engine;
 577        const bool reset_non_ready = retry >= 1;
 578        intel_engine_mask_t tmp;
 579        int ret;
 580
 581        for_each_engine_masked(engine, gt, engine_mask, tmp) {
 582                ret = gen8_engine_reset_prepare(engine);
 583                if (ret && !reset_non_ready)
 584                        goto skip_reset;
 585
 586                /*
 587                 * If this is not the first failed attempt to prepare,
 588                 * we decide to proceed anyway.
 589                 *
 590                 * By doing so we risk context corruption and with
 591                 * some gens (kbl), possible system hang if reset
 592                 * happens during active bb execution.
 593                 *
                 * We would rather take context corruption than a
                 * failed reset with a wedged driver/gpu. And the
                 * active bb execution case should be covered by
                 * the stop_engines() we perform before the reset.
 598                 */
 599        }
 600
 601        if (GRAPHICS_VER(gt->i915) >= 11)
 602                ret = gen11_reset_engines(gt, engine_mask, retry);
 603        else
 604                ret = gen6_reset_engines(gt, engine_mask, retry);
 605
 606skip_reset:
 607        for_each_engine_masked(engine, gt, engine_mask, tmp)
 608                gen8_engine_reset_cancel(engine);
 609
 610        return ret;
 611}
 612
 613static int mock_reset(struct intel_gt *gt,
 614                      intel_engine_mask_t mask,
 615                      unsigned int retry)
 616{
 617        return 0;
 618}
 619
 620typedef int (*reset_func)(struct intel_gt *,
 621                          intel_engine_mask_t engine_mask,
 622                          unsigned int retry);
 623
 624static reset_func intel_get_gpu_reset(const struct intel_gt *gt)
 625{
 626        struct drm_i915_private *i915 = gt->i915;
 627
 628        if (is_mock_gt(gt))
 629                return mock_reset;
 630        else if (GRAPHICS_VER(i915) >= 8)
 631                return gen8_reset_engines;
 632        else if (GRAPHICS_VER(i915) >= 6)
 633                return gen6_reset_engines;
 634        else if (GRAPHICS_VER(i915) >= 5)
 635                return ilk_do_reset;
 636        else if (IS_G4X(i915))
 637                return g4x_do_reset;
 638        else if (IS_G33(i915) || IS_PINEVIEW(i915))
 639                return g33_do_reset;
 640        else if (GRAPHICS_VER(i915) >= 3)
 641                return i915_do_reset;
 642        else
 643                return NULL;
 644}
 645
 646int __intel_gt_reset(struct intel_gt *gt, intel_engine_mask_t engine_mask)
 647{
 648        const int retries = engine_mask == ALL_ENGINES ? RESET_MAX_RETRIES : 1;
 649        reset_func reset;
 650        int ret = -ETIMEDOUT;
 651        int retry;
 652
 653        reset = intel_get_gpu_reset(gt);
 654        if (!reset)
 655                return -ENODEV;
 656
 657        /*
 658         * If the power well sleeps during the reset, the reset
 659         * request may be dropped and never completes (causing -EIO).
 660         */
 661        intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL);
 662        for (retry = 0; ret == -ETIMEDOUT && retry < retries; retry++) {
 663                GT_TRACE(gt, "engine_mask=%x\n", engine_mask);
 664                preempt_disable();
 665                ret = reset(gt, engine_mask, retry);
 666                preempt_enable();
 667        }
 668        intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL);
 669
 670        return ret;
 671}
 672
 673bool intel_has_gpu_reset(const struct intel_gt *gt)
 674{
        if (!gt->i915->params.reset)
                return false;

        return intel_get_gpu_reset(gt) != NULL;
 679}
 680
 681bool intel_has_reset_engine(const struct intel_gt *gt)
 682{
 683        if (gt->i915->params.reset < 2)
 684                return false;
 685
 686        return INTEL_INFO(gt->i915)->has_reset_engine;
 687}
 688
 689int intel_reset_guc(struct intel_gt *gt)
 690{
 691        u32 guc_domain =
 692                GRAPHICS_VER(gt->i915) >= 11 ? GEN11_GRDOM_GUC : GEN9_GRDOM_GUC;
 693        int ret;
 694
 695        GEM_BUG_ON(!HAS_GT_UC(gt->i915));
 696
 697        intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL);
 698        ret = gen6_hw_domain_reset(gt, guc_domain);
 699        intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL);
 700
 701        return ret;
 702}
 703
 704/*
 * Ensure the irq handler finishes and is not run again.
 706 * Also return the active request so that we only search for it once.
 707 */
 708static void reset_prepare_engine(struct intel_engine_cs *engine)
 709{
 710        /*
 711         * During the reset sequence, we must prevent the engine from
 712         * entering RC6. As the context state is undefined until we restart
 713         * the engine, if it does enter RC6 during the reset, the state
 714         * written to the powercontext is undefined and so we may lose
 715         * GPU state upon resume, i.e. fail to restart after a reset.
 716         */
 717        intel_uncore_forcewake_get(engine->uncore, FORCEWAKE_ALL);
 718        if (engine->reset.prepare)
 719                engine->reset.prepare(engine);
 720}
 721
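/*
 * Revoke any userspace GGTT mmaps that depend on a fence register: the
 * reset clobbers the fences (they are reprogrammed afterwards via
 * intel_ggtt_restore_fences()), so CPU access through the aperture is made
 * to refault once the reset has completed.
 */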
 722static void revoke_mmaps(struct intel_gt *gt)
 723{
 724        int i;
 725
 726        for (i = 0; i < gt->ggtt->num_fences; i++) {
 727                struct drm_vma_offset_node *node;
 728                struct i915_vma *vma;
 729                u64 vma_offset;
 730
 731                vma = READ_ONCE(gt->ggtt->fence_regs[i].vma);
 732                if (!vma)
 733                        continue;
 734
 735                if (!i915_vma_has_userfault(vma))
 736                        continue;
 737
 738                GEM_BUG_ON(vma->fence != &gt->ggtt->fence_regs[i]);
 739
 740                if (!vma->mmo)
 741                        continue;
 742
 743                node = &vma->mmo->vma_node;
 744                vma_offset = vma->ggtt_view.partial.offset << PAGE_SHIFT;
 745
 746                unmap_mapping_range(gt->i915->drm.anon_inode->i_mapping,
 747                                    drm_vma_node_offset_addr(node) + vma_offset,
 748                                    vma->size,
 749                                    1);
 750        }
 751}
 752
 753static intel_engine_mask_t reset_prepare(struct intel_gt *gt)
 754{
 755        struct intel_engine_cs *engine;
 756        intel_engine_mask_t awake = 0;
 757        enum intel_engine_id id;
 758
 759        for_each_engine(engine, gt, id) {
 760                if (intel_engine_pm_get_if_awake(engine))
 761                        awake |= engine->mask;
 762                reset_prepare_engine(engine);
 763        }
 764
 765        intel_uc_reset_prepare(&gt->uc);
 766
 767        return awake;
 768}
 769
 770static void gt_revoke(struct intel_gt *gt)
 771{
 772        revoke_mmaps(gt);
 773}
 774
 775static int gt_reset(struct intel_gt *gt, intel_engine_mask_t stalled_mask)
 776{
 777        struct intel_engine_cs *engine;
 778        enum intel_engine_id id;
 779        int err;
 780
 781        /*
 782         * Everything depends on having the GTT running, so we need to start
 783         * there.
 784         */
 785        err = i915_ggtt_enable_hw(gt->i915);
 786        if (err)
 787                return err;
 788
 789        local_bh_disable();
 790        for_each_engine(engine, gt, id)
 791                __intel_engine_reset(engine, stalled_mask & engine->mask);
 792        local_bh_enable();
 793
 794        intel_uc_reset(&gt->uc, true);
 795
 796        intel_ggtt_restore_fences(gt->ggtt);
 797
 798        return err;
 799}
 800
 801static void reset_finish_engine(struct intel_engine_cs *engine)
 802{
 803        if (engine->reset.finish)
 804                engine->reset.finish(engine);
 805        intel_uncore_forcewake_put(engine->uncore, FORCEWAKE_ALL);
 806
 807        intel_engine_signal_breadcrumbs(engine);
 808}
 809
 810static void reset_finish(struct intel_gt *gt, intel_engine_mask_t awake)
 811{
 812        struct intel_engine_cs *engine;
 813        enum intel_engine_id id;
 814
 815        for_each_engine(engine, gt, id) {
 816                reset_finish_engine(engine);
 817                if (awake & engine->mask)
 818                        intel_engine_pm_put(engine);
 819        }
 820
 821        intel_uc_reset_finish(&gt->uc);
 822}
 823
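/*
 * Once the GT is wedged, engine->submit_request is replaced with
 * nop_submit_request(): every request is immediately marked with -EIO and
 * signalled, so waiters never block on hardware that will not answer.
 */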
 824static void nop_submit_request(struct i915_request *request)
 825{
 826        RQ_TRACE(request, "-EIO\n");
 827
 828        request = i915_request_mark_eio(request);
 829        if (request) {
 830                i915_request_submit(request);
 831                intel_engine_signal_breadcrumbs(request->engine);
 832
 833                i915_request_put(request);
 834        }
 835}
 836
 837static void __intel_gt_set_wedged(struct intel_gt *gt)
 838{
 839        struct intel_engine_cs *engine;
 840        intel_engine_mask_t awake;
 841        enum intel_engine_id id;
 842
 843        if (test_bit(I915_WEDGED, &gt->reset.flags))
 844                return;
 845
 846        GT_TRACE(gt, "start\n");
 847
 848        /*
 849         * First, stop submission to hw, but do not yet complete requests by
 850         * rolling the global seqno forward (since this would complete requests
 851         * for which we haven't set the fence error to EIO yet).
 852         */
 853        awake = reset_prepare(gt);
 854
 855        /* Even if the GPU reset fails, it should still stop the engines */
 856        if (!INTEL_INFO(gt->i915)->gpu_reset_clobbers_display)
 857                __intel_gt_reset(gt, ALL_ENGINES);
 858
 859        for_each_engine(engine, gt, id)
 860                engine->submit_request = nop_submit_request;
 861
 862        /*
         * Make sure no request can slip through without getting completed,
         * either by the engine cancellation below or by the
         * nop_submit_request now installed on every engine.
 866         */
 867        synchronize_rcu_expedited();
 868        set_bit(I915_WEDGED, &gt->reset.flags);
 869
 870        /* Mark all executing requests as skipped */
 871        local_bh_disable();
 872        for_each_engine(engine, gt, id)
 873                if (engine->reset.cancel)
 874                        engine->reset.cancel(engine);
 875        intel_uc_cancel_requests(&gt->uc);
 876        local_bh_enable();
 877
 878        reset_finish(gt, awake);
 879
 880        GT_TRACE(gt, "end\n");
 881}
 882
 883void intel_gt_set_wedged(struct intel_gt *gt)
 884{
 885        intel_wakeref_t wakeref;
 886
 887        if (test_bit(I915_WEDGED, &gt->reset.flags))
 888                return;
 889
 890        wakeref = intel_runtime_pm_get(gt->uncore->rpm);
 891        mutex_lock(&gt->reset.mutex);
 892
 893        if (GEM_SHOW_DEBUG()) {
 894                struct drm_printer p = drm_debug_printer(__func__);
 895                struct intel_engine_cs *engine;
 896                enum intel_engine_id id;
 897
 898                drm_printf(&p, "called from %pS\n", (void *)_RET_IP_);
 899                for_each_engine(engine, gt, id) {
 900                        if (intel_engine_is_idle(engine))
 901                                continue;
 902
 903                        intel_engine_dump(engine, &p, "%s\n", engine->name);
 904                }
 905        }
 906
 907        __intel_gt_set_wedged(gt);
 908
 909        mutex_unlock(&gt->reset.mutex);
 910        intel_runtime_pm_put(gt->uncore->rpm, wakeref);
 911}
 912
 913static bool __intel_gt_unset_wedged(struct intel_gt *gt)
 914{
 915        struct intel_gt_timelines *timelines = &gt->timelines;
 916        struct intel_timeline *tl;
 917        bool ok;
 918
 919        if (!test_bit(I915_WEDGED, &gt->reset.flags))
 920                return true;
 921
 922        /* Never fully initialised, recovery impossible */
 923        if (intel_gt_has_unrecoverable_error(gt))
 924                return false;
 925
 926        GT_TRACE(gt, "start\n");
 927
 928        /*
 929         * Before unwedging, make sure that all pending operations
 930         * are flushed and errored out - we may have requests waiting upon
         * third party fences. We marked all inflight requests as EIO, and
         * every execbuf since then has returned EIO; for consistency we want
         * all currently pending requests to also be marked as EIO, which
         * is done inside our nop_submit_request - and so we must wait.
 935         *
 936         * No more can be submitted until we reset the wedged bit.
 937         */
 938        spin_lock(&timelines->lock);
 939        list_for_each_entry(tl, &timelines->active_list, link) {
 940                struct dma_fence *fence;
 941
 942                fence = i915_active_fence_get(&tl->last_request);
 943                if (!fence)
 944                        continue;
 945
 946                spin_unlock(&timelines->lock);
 947
 948                /*
 949                 * All internal dependencies (i915_requests) will have
 950                 * been flushed by the set-wedge, but we may be stuck waiting
 951                 * for external fences. These should all be capped to 10s
 952                 * (I915_FENCE_TIMEOUT) so this wait should not be unbounded
 953                 * in the worst case.
 954                 */
 955                dma_fence_default_wait(fence, false, MAX_SCHEDULE_TIMEOUT);
 956                dma_fence_put(fence);
 957
                /* Restart iteration after dropping the lock */
 959                spin_lock(&timelines->lock);
 960                tl = list_entry(&timelines->active_list, typeof(*tl), link);
 961        }
 962        spin_unlock(&timelines->lock);
 963
 964        /* We must reset pending GPU events before restoring our submission */
 965        ok = !HAS_EXECLISTS(gt->i915); /* XXX better agnosticism desired */
 966        if (!INTEL_INFO(gt->i915)->gpu_reset_clobbers_display)
 967                ok = __intel_gt_reset(gt, ALL_ENGINES) == 0;
 968        if (!ok) {
 969                /*
 970                 * Warn CI about the unrecoverable wedged condition.
 971                 * Time for a reboot.
 972                 */
 973                add_taint_for_CI(gt->i915, TAINT_WARN);
 974                return false;
 975        }
 976
 977        /*
 978         * Undo nop_submit_request. We prevent all new i915 requests from
 979         * being queued (by disallowing execbuf whilst wedged) so having
 980         * waited for all active requests above, we know the system is idle
 981         * and do not have to worry about a thread being inside
 982         * engine->submit_request() as we swap over. So unlike installing
 983         * the nop_submit_request on reset, we can do this from normal
 984         * context and do not require stop_machine().
 985         */
 986        intel_engines_reset_default_submission(gt);
 987
 988        GT_TRACE(gt, "end\n");
 989
 990        smp_mb__before_atomic(); /* complete takeover before enabling execbuf */
 991        clear_bit(I915_WEDGED, &gt->reset.flags);
 992
 993        return true;
 994}
 995
 996bool intel_gt_unset_wedged(struct intel_gt *gt)
 997{
 998        bool result;
 999
1000        mutex_lock(&gt->reset.mutex);
1001        result = __intel_gt_unset_wedged(gt);
1002        mutex_unlock(&gt->reset.mutex);
1003
1004        return result;
1005}
1006
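/*
 * do_reset() retries the raw hardware reset up to RESET_MAX_RETRIES times,
 * sleeping a little longer between each attempt, before handing over to
 * gt_reset() to restore the GGTT, fences and per-engine state.
 */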
1007static int do_reset(struct intel_gt *gt, intel_engine_mask_t stalled_mask)
1008{
1009        int err, i;
1010
1011        err = __intel_gt_reset(gt, ALL_ENGINES);
1012        for (i = 0; err && i < RESET_MAX_RETRIES; i++) {
1013                msleep(10 * (i + 1));
1014                err = __intel_gt_reset(gt, ALL_ENGINES);
1015        }
1016        if (err)
1017                return err;
1018
1019        return gt_reset(gt, stalled_mask);
1020}
1021
1022static int resume(struct intel_gt *gt)
1023{
1024        struct intel_engine_cs *engine;
1025        enum intel_engine_id id;
1026        int ret;
1027
1028        for_each_engine(engine, gt, id) {
1029                ret = intel_engine_resume(engine);
1030                if (ret)
1031                        return ret;
1032        }
1033
1034        return 0;
1035}
1036
1037/**
1038 * intel_gt_reset - reset chip after a hang
1039 * @gt: #intel_gt to reset
1040 * @stalled_mask: mask of the stalled engines with the guilty requests
1041 * @reason: user error message for why we are resetting
1042 *
1043 * Reset the chip.  Useful if a hang is detected. Marks the device as wedged
1044 * on failure.
1045 *
1046 * Procedure is fairly simple:
1047 *   - reset the chip using the reset reg
1048 *   - re-init context state
1049 *   - re-init hardware status page
1050 *   - re-init ring buffer
1051 *   - re-init interrupt state
1052 *   - re-init display
1053 */
1054void intel_gt_reset(struct intel_gt *gt,
1055                    intel_engine_mask_t stalled_mask,
1056                    const char *reason)
1057{
1058        intel_engine_mask_t awake;
1059        int ret;
1060
1061        GT_TRACE(gt, "flags=%lx\n", gt->reset.flags);
1062
1063        might_sleep();
1064        GEM_BUG_ON(!test_bit(I915_RESET_BACKOFF, &gt->reset.flags));
1065
1066        /*
1067         * FIXME: Revoking cpu mmap ptes cannot be done from a dma_fence
1068         * critical section like gpu reset.
1069         */
1070        gt_revoke(gt);
1071
1072        mutex_lock(&gt->reset.mutex);
1073
1074        /* Clear any previous failed attempts at recovery. Time to try again. */
1075        if (!__intel_gt_unset_wedged(gt))
1076                goto unlock;
1077
1078        if (reason)
1079                drm_notice(&gt->i915->drm,
1080                           "Resetting chip for %s\n", reason);
1081        atomic_inc(&gt->i915->gpu_error.reset_count);
1082
1083        awake = reset_prepare(gt);
1084
1085        if (!intel_has_gpu_reset(gt)) {
1086                if (gt->i915->params.reset)
1087                        drm_err(&gt->i915->drm, "GPU reset not supported\n");
1088                else
1089                        drm_dbg(&gt->i915->drm, "GPU reset disabled\n");
1090                goto error;
1091        }
1092
1093        if (INTEL_INFO(gt->i915)->gpu_reset_clobbers_display)
1094                intel_runtime_pm_disable_interrupts(gt->i915);
1095
1096        if (do_reset(gt, stalled_mask)) {
1097                drm_err(&gt->i915->drm, "Failed to reset chip\n");
1098                goto taint;
1099        }
1100
1101        if (INTEL_INFO(gt->i915)->gpu_reset_clobbers_display)
1102                intel_runtime_pm_enable_interrupts(gt->i915);
1103
1104        intel_overlay_reset(gt->i915);
1105
1106        /*
1107         * Next we need to restore the context, but we don't use those
1108         * yet either...
1109         *
1110         * Ring buffer needs to be re-initialized in the KMS case, or if X
1111         * was running at the time of the reset (i.e. we weren't VT
1112         * switched away).
1113         */
1114        ret = intel_gt_init_hw(gt);
1115        if (ret) {
1116                drm_err(&gt->i915->drm,
1117                        "Failed to initialise HW following reset (%d)\n",
1118                        ret);
1119                goto taint;
1120        }
1121
1122        ret = resume(gt);
1123        if (ret)
1124                goto taint;
1125
1126finish:
1127        reset_finish(gt, awake);
1128unlock:
1129        mutex_unlock(&gt->reset.mutex);
1130        return;
1131
1132taint:
1133        /*
1134         * History tells us that if we cannot reset the GPU now, we
1135         * never will. This then impacts everything that is run
1136         * subsequently. On failing the reset, we mark the driver
1137         * as wedged, preventing further execution on the GPU.
1138         * We also want to go one step further and add a taint to the
1139         * kernel so that any subsequent faults can be traced back to
1140         * this failure. This is important for CI, where if the
1141         * GPU/driver fails we would like to reboot and restart testing
1142         * rather than continue on into oblivion. For everyone else,
1143         * the system should still plod along, but they have been warned!
1144         */
1145        add_taint_for_CI(gt->i915, TAINT_WARN);
1146error:
1147        __intel_gt_set_wedged(gt);
1148        goto finish;
1149}
1150
1151static int intel_gt_reset_engine(struct intel_engine_cs *engine)
1152{
1153        return __intel_gt_reset(engine->gt, engine->mask);
1154}
1155
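/*
 * Low-level single engine reset. The caller must already own the
 * corresponding I915_RESET_ENGINE bit in gt->reset.flags and have bottom
 * halves disabled; intel_engine_reset() below is the convenience wrapper
 * that takes care of the latter.
 */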
1156int __intel_engine_reset_bh(struct intel_engine_cs *engine, const char *msg)
1157{
1158        struct intel_gt *gt = engine->gt;
1159        int ret;
1160
1161        ENGINE_TRACE(engine, "flags=%lx\n", gt->reset.flags);
1162        GEM_BUG_ON(!test_bit(I915_RESET_ENGINE + engine->id, &gt->reset.flags));
1163
1164        if (intel_engine_uses_guc(engine))
1165                return -ENODEV;
1166
1167        if (!intel_engine_pm_get_if_awake(engine))
1168                return 0;
1169
1170        reset_prepare_engine(engine);
1171
1172        if (msg)
1173                drm_notice(&engine->i915->drm,
1174                           "Resetting %s for %s\n", engine->name, msg);
1175        atomic_inc(&engine->i915->gpu_error.reset_engine_count[engine->uabi_class]);
1176
1177        ret = intel_gt_reset_engine(engine);
1178        if (ret) {
                /* If we fail here, we expect to fall back to a global reset */
1180                ENGINE_TRACE(engine, "Failed to reset %s, err: %d\n", engine->name, ret);
1181                goto out;
1182        }
1183
1184        /*
1185         * The request that caused the hang is stuck on elsp, we know the
1186         * active request and can drop it, adjust head to skip the offending
1187         * request to resume executing remaining requests in the queue.
1188         */
1189        __intel_engine_reset(engine, true);
1190
1191        /*
1192         * The engine and its registers (and workarounds in case of render)
1193         * have been reset to their default values. Follow the init_ring
1194         * process to program RING_MODE, HWSP and re-enable submission.
1195         */
1196        ret = intel_engine_resume(engine);
1197
1198out:
1199        intel_engine_cancel_stop_cs(engine);
1200        reset_finish_engine(engine);
1201        intel_engine_pm_put_async(engine);
1202        return ret;
1203}
1204
1205/**
1206 * intel_engine_reset - reset GPU engine to recover from a hang
1207 * @engine: engine to reset
1208 * @msg: reason for GPU reset; or NULL for no drm_notice()
1209 *
1210 * Reset a specific GPU engine. Useful if a hang is detected.
1211 * Returns zero on successful reset or otherwise an error code.
1212 *
1213 * Procedure is:
 *  - identify the request that caused the hang and drop it
1215 *  - reset engine (which will force the engine to idle)
1216 *  - re-init/configure engine
1217 */
1218int intel_engine_reset(struct intel_engine_cs *engine, const char *msg)
1219{
1220        int err;
1221
1222        local_bh_disable();
1223        err = __intel_engine_reset_bh(engine, msg);
1224        local_bh_enable();
1225
1226        return err;
1227}
1228
1229static void intel_gt_reset_global(struct intel_gt *gt,
1230                                  u32 engine_mask,
1231                                  const char *reason)
1232{
1233        struct kobject *kobj = &gt->i915->drm.primary->kdev->kobj;
1234        char *error_event[] = { I915_ERROR_UEVENT "=1", NULL };
1235        char *reset_event[] = { I915_RESET_UEVENT "=1", NULL };
1236        char *reset_done_event[] = { I915_ERROR_UEVENT "=0", NULL };
1237        struct intel_wedge_me w;
1238
1239        kobject_uevent_env(kobj, KOBJ_CHANGE, error_event);
1240
1241        GT_TRACE(gt, "resetting chip, engines=%x\n", engine_mask);
1242        kobject_uevent_env(kobj, KOBJ_CHANGE, reset_event);
1243
1244        /* Use a watchdog to ensure that our reset completes */
1245        intel_wedge_on_timeout(&w, gt, 5 * HZ) {
1246                intel_display_prepare_reset(gt->i915);
1247
1248                /* Flush everyone using a resource about to be clobbered */
1249                synchronize_srcu_expedited(&gt->reset.backoff_srcu);
1250
1251                intel_gt_reset(gt, engine_mask, reason);
1252
1253                intel_display_finish_reset(gt->i915);
1254        }
1255
1256        if (!test_bit(I915_WEDGED, &gt->reset.flags))
1257                kobject_uevent_env(kobj, KOBJ_CHANGE, reset_done_event);
1258}
1259
1260/**
1261 * intel_gt_handle_error - handle a gpu error
1262 * @gt: the intel_gt
1263 * @engine_mask: mask representing engines that are hung
1264 * @flags: control flags
1265 * @fmt: Error message format string
1266 *
1267 * Do some basic checking of register state at error time and
1268 * dump it to the syslog.  Also call i915_capture_error_state() to make
1269 * sure we get a record and make it available in debugfs.  Fire a uevent
1270 * so userspace knows something bad happened (should trigger collection
1271 * of a ring dump etc.).
1272 */
1273void intel_gt_handle_error(struct intel_gt *gt,
1274                           intel_engine_mask_t engine_mask,
1275                           unsigned long flags,
1276                           const char *fmt, ...)
1277{
1278        struct intel_engine_cs *engine;
1279        intel_wakeref_t wakeref;
1280        intel_engine_mask_t tmp;
1281        char error_msg[80];
1282        char *msg = NULL;
1283
1284        if (fmt) {
1285                va_list args;
1286
1287                va_start(args, fmt);
1288                vscnprintf(error_msg, sizeof(error_msg), fmt, args);
1289                va_end(args);
1290
1291                msg = error_msg;
1292        }
1293
1294        /*
1295         * In most cases it's guaranteed that we get here with an RPM
1296         * reference held, for example because there is a pending GPU
1297         * request that won't finish until the reset is done. This
1298         * isn't the case at least when we get here by doing a
1299         * simulated reset via debugfs, so get an RPM reference.
1300         */
1301        wakeref = intel_runtime_pm_get(gt->uncore->rpm);
1302
1303        engine_mask &= gt->info.engine_mask;
1304
1305        if (flags & I915_ERROR_CAPTURE) {
1306                i915_capture_error_state(gt, engine_mask);
1307                intel_gt_clear_error_registers(gt, engine_mask);
1308        }
1309
1310        /*
1311         * Try engine reset when available. We fall back to full reset if
1312         * single reset fails.
1313         */
1314        if (!intel_uc_uses_guc_submission(&gt->uc) &&
1315            intel_has_reset_engine(gt) && !intel_gt_is_wedged(gt)) {
1316                local_bh_disable();
1317                for_each_engine_masked(engine, gt, engine_mask, tmp) {
1318                        BUILD_BUG_ON(I915_RESET_MODESET >= I915_RESET_ENGINE);
1319                        if (test_and_set_bit(I915_RESET_ENGINE + engine->id,
1320                                             &gt->reset.flags))
1321                                continue;
1322
1323                        if (__intel_engine_reset_bh(engine, msg) == 0)
1324                                engine_mask &= ~engine->mask;
1325
1326                        clear_and_wake_up_bit(I915_RESET_ENGINE + engine->id,
1327                                              &gt->reset.flags);
1328                }
1329                local_bh_enable();
1330        }
1331
1332        if (!engine_mask)
1333                goto out;
1334
1335        /* Full reset needs the mutex, stop any other user trying to do so. */
1336        if (test_and_set_bit(I915_RESET_BACKOFF, &gt->reset.flags)) {
1337                wait_event(gt->reset.queue,
1338                           !test_bit(I915_RESET_BACKOFF, &gt->reset.flags));
1339                goto out; /* piggy-back on the other reset */
1340        }
1341
        /* Make sure intel_gt_reset_trylock() sees the I915_RESET_BACKOFF */
1343        synchronize_rcu_expedited();
1344
1345        /*
1346         * Prevent any other reset-engine attempt. We don't do this for GuC
         * submission, as the GuC owns the per-engine reset, not the i915.
1348         */
1349        if (!intel_uc_uses_guc_submission(&gt->uc)) {
1350                for_each_engine(engine, gt, tmp) {
1351                        while (test_and_set_bit(I915_RESET_ENGINE + engine->id,
1352                                                &gt->reset.flags))
1353                                wait_on_bit(&gt->reset.flags,
1354                                            I915_RESET_ENGINE + engine->id,
1355                                            TASK_UNINTERRUPTIBLE);
1356                }
1357        }
1358
1359        intel_gt_reset_global(gt, engine_mask, msg);
1360
1361        if (!intel_uc_uses_guc_submission(&gt->uc)) {
1362                for_each_engine(engine, gt, tmp)
1363                        clear_bit_unlock(I915_RESET_ENGINE + engine->id,
1364                                         &gt->reset.flags);
1365        }
1366        clear_bit_unlock(I915_RESET_BACKOFF, &gt->reset.flags);
1367        smp_mb__after_atomic();
1368        wake_up_all(&gt->reset.queue);
1369
1370out:
1371        intel_runtime_pm_put(gt->uncore->rpm, wakeref);
1372}
1373
1374int intel_gt_reset_trylock(struct intel_gt *gt, int *srcu)
1375{
1376        might_lock(&gt->reset.backoff_srcu);
1377        might_sleep();
1378
1379        rcu_read_lock();
1380        while (test_bit(I915_RESET_BACKOFF, &gt->reset.flags)) {
1381                rcu_read_unlock();
1382
1383                if (wait_event_interruptible(gt->reset.queue,
1384                                             !test_bit(I915_RESET_BACKOFF,
1385                                                       &gt->reset.flags)))
1386                        return -EINTR;
1387
1388                rcu_read_lock();
1389        }
1390        *srcu = srcu_read_lock(&gt->reset.backoff_srcu);
1391        rcu_read_unlock();
1392
1393        return 0;
1394}
1395
1396void intel_gt_reset_unlock(struct intel_gt *gt, int tag)
1397__releases(&gt->reset.backoff_srcu)
1398{
1399        srcu_read_unlock(&gt->reset.backoff_srcu, tag);
1400}
1401
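/*
 * Illustrative usage sketch for the reset backoff SRCU: code that must not
 * run concurrently with a full GPU reset brackets its hardware access with
 * the trylock/unlock pair, roughly
 *
 *      int srcu, err;
 *
 *      err = intel_gt_reset_trylock(gt, &srcu);
 *      if (err)
 *              return err;
 *      ... touch the hardware ...
 *      intel_gt_reset_unlock(gt, srcu);
 *
 * intel_gt_reset_global() flushes such readers with
 * synchronize_srcu_expedited() before clobbering the hardware.
 */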
1402int intel_gt_terminally_wedged(struct intel_gt *gt)
1403{
1404        might_sleep();
1405
1406        if (!intel_gt_is_wedged(gt))
1407                return 0;
1408
1409        if (intel_gt_has_unrecoverable_error(gt))
1410                return -EIO;
1411
1412        /* Reset still in progress? Maybe we will recover? */
1413        if (wait_event_interruptible(gt->reset.queue,
1414                                     !test_bit(I915_RESET_BACKOFF,
1415                                               &gt->reset.flags)))
1416                return -EINTR;
1417
1418        return intel_gt_is_wedged(gt) ? -EIO : 0;
1419}
1420
1421void intel_gt_set_wedged_on_init(struct intel_gt *gt)
1422{
1423        BUILD_BUG_ON(I915_RESET_ENGINE + I915_NUM_ENGINES >
1424                     I915_WEDGED_ON_INIT);
1425        intel_gt_set_wedged(gt);
1426        i915_disable_error_state(gt->i915, -ENODEV);
1427        set_bit(I915_WEDGED_ON_INIT, &gt->reset.flags);
1428
1429        /* Wedged on init is non-recoverable */
1430        add_taint_for_CI(gt->i915, TAINT_WARN);
1431}
1432
1433void intel_gt_set_wedged_on_fini(struct intel_gt *gt)
1434{
1435        intel_gt_set_wedged(gt);
1436        i915_disable_error_state(gt->i915, -ENODEV);
1437        set_bit(I915_WEDGED_ON_FINI, &gt->reset.flags);
1438        intel_gt_retire_requests(gt); /* cleanup any wedged requests */
1439}
1440
1441void intel_gt_init_reset(struct intel_gt *gt)
1442{
1443        init_waitqueue_head(&gt->reset.queue);
1444        mutex_init(&gt->reset.mutex);
1445        init_srcu_struct(&gt->reset.backoff_srcu);
1446
1447        /*
1448         * While undesirable to wait inside the shrinker, complain anyway.
1449         *
1450         * If we have to wait during shrinking, we guarantee forward progress
1451         * by forcing the reset. Therefore during the reset we must not
1452         * re-enter the shrinker. By declaring that we take the reset mutex
1453         * within the shrinker, we forbid ourselves from performing any
1454         * fs-reclaim or taking related locks during reset.
1455         */
1456        i915_gem_shrinker_taints_mutex(gt->i915, &gt->reset.mutex);
1457
1458        /* no GPU until we are ready! */
1459        __set_bit(I915_WEDGED, &gt->reset.flags);
1460}
1461
1462void intel_gt_fini_reset(struct intel_gt *gt)
1463{
1464        cleanup_srcu_struct(&gt->reset.backoff_srcu);
1465}
1466
1467static void intel_wedge_me(struct work_struct *work)
1468{
1469        struct intel_wedge_me *w = container_of(work, typeof(*w), work.work);
1470
1471        drm_err(&w->gt->i915->drm,
1472                "%s timed out, cancelling all in-flight rendering.\n",
1473                w->name);
1474        intel_gt_set_wedged(w->gt);
1475}
1476
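/*
 * intel_wedge_on_timeout() (defined in intel_reset.h) wraps the init/fini
 * pair below around a block of code, as used in intel_gt_reset_global():
 *
 *      struct intel_wedge_me w;
 *
 *      intel_wedge_on_timeout(&w, gt, 5 * HZ) {
 *              ... a reset step that must complete ...
 *      }
 *
 * If the block does not finish within the timeout, intel_wedge_me() fires
 * and declares the GPU wedged instead of letting the reset hang forever.
 */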
1477void __intel_init_wedge(struct intel_wedge_me *w,
1478                        struct intel_gt *gt,
1479                        long timeout,
1480                        const char *name)
1481{
1482        w->gt = gt;
1483        w->name = name;
1484
1485        INIT_DELAYED_WORK_ONSTACK(&w->work, intel_wedge_me);
1486        schedule_delayed_work(&w->work, timeout);
1487}
1488
1489void __intel_fini_wedge(struct intel_wedge_me *w)
1490{
1491        cancel_delayed_work_sync(&w->work);
1492        destroy_delayed_work_on_stack(&w->work);
1493        w->gt = NULL;
1494}
1495
1496#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
1497#include "selftest_reset.c"
1498#include "selftest_hangcheck.c"
1499#endif
1500