linux/drivers/gpu/drm/i915/gt/selftest_hangcheck.c
// SPDX-License-Identifier: MIT
/*
 * Copyright © 2016 Intel Corporation
 */

#include <linux/kthread.h>

#include "gem/i915_gem_context.h"
#include "gem/i915_gem_internal.h"

#include "i915_gem_evict.h"
#include "intel_gt.h"
#include "intel_engine_heartbeat.h"
#include "intel_engine_pm.h"
#include "selftest_engine_heartbeat.h"

#include "i915_selftest.h"
#include "selftests/i915_random.h"
#include "selftests/igt_flush_test.h"
#include "selftests/igt_reset.h"
#include "selftests/igt_atomic.h"
#include "selftests/igt_spinner.h"
#include "selftests/intel_scheduler_helpers.h"

#include "selftests/mock_drm.h"

#include "gem/selftests/mock_context.h"
#include "gem/selftests/igt_gem_utils.h"

#define IGT_IDLE_TIMEOUT 50 /* ms; time to wait after flushing between tests */

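/*
 * State for building hanging batches: a non-bannable kernel context plus
 * two internal objects -- a page of per-context seqnos (hws) and the batch
 * buffer itself -- both kept pinned and CPU-mapped for the test's lifetime.
 */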
struct hang {
        struct intel_gt *gt;
        struct drm_i915_gem_object *hws;
        struct drm_i915_gem_object *obj;
        struct i915_gem_context *ctx;
        u32 *seqno;
        u32 *batch;
};

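/* Allocate the context and backing objects used by every hanging batch. */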
static int hang_init(struct hang *h, struct intel_gt *gt)
{
        void *vaddr;
        int err;

        memset(h, 0, sizeof(*h));
        h->gt = gt;

        h->ctx = kernel_context(gt->i915, NULL);
        if (IS_ERR(h->ctx))
                return PTR_ERR(h->ctx);

        GEM_BUG_ON(i915_gem_context_is_bannable(h->ctx));

        h->hws = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
        if (IS_ERR(h->hws)) {
                err = PTR_ERR(h->hws);
                goto err_ctx;
        }

        h->obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
        if (IS_ERR(h->obj)) {
                err = PTR_ERR(h->obj);
                goto err_hws;
        }

        i915_gem_object_set_cache_coherency(h->hws, I915_CACHE_LLC);
        vaddr = i915_gem_object_pin_map_unlocked(h->hws, I915_MAP_WB);
        if (IS_ERR(vaddr)) {
                err = PTR_ERR(vaddr);
                goto err_obj;
        }
        h->seqno = memset(vaddr, 0xff, PAGE_SIZE);

        vaddr = i915_gem_object_pin_map_unlocked(h->obj,
                                                 i915_coherent_map_type(gt->i915, h->obj, false));
        if (IS_ERR(vaddr)) {
                err = PTR_ERR(vaddr);
                goto err_unpin_hws;
        }
        h->batch = vaddr;

        return 0;

err_unpin_hws:
        i915_gem_object_unpin_map(h->hws);
err_obj:
        i915_gem_object_put(h->obj);
err_hws:
        i915_gem_object_put(h->hws);
err_ctx:
        kernel_context_close(h->ctx);
        return err;
}

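/* Each request's context gets its own seqno dword within the HWS page. */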
static u64 hws_address(const struct i915_vma *hws,
                       const struct i915_request *rq)
{
        return hws->node.start + offset_in_page(sizeof(u32)*rq->fence.context);
}

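/* Order the request after the vma and track it as read (or written). */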
static int move_to_active(struct i915_vma *vma,
                          struct i915_request *rq,
                          unsigned int flags)
{
        int err;

        i915_vma_lock(vma);
        err = i915_request_await_object(rq, vma->obj,
                                        flags & EXEC_OBJECT_WRITE);
        if (err == 0)
                err = i915_vma_move_to_active(vma, rq, flags);
        i915_vma_unlock(vma);

        return err;
}

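/*
 * Build a request whose batch writes its seqno to the HWS page and then
 * spins forever by branching back to its own start with
 * MI_BATCH_BUFFER_START; the trailing MI_BATCH_BUFFER_END is only reached
 * once the CPU rewrites the batch (see hang_fini and igt_hang_sanitycheck).
 */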
static struct i915_request *
hang_create_request(struct hang *h, struct intel_engine_cs *engine)
{
        struct intel_gt *gt = h->gt;
        struct i915_address_space *vm = i915_gem_context_get_eb_vm(h->ctx);
        struct drm_i915_gem_object *obj;
        struct i915_request *rq = NULL;
        struct i915_vma *hws, *vma;
        unsigned int flags;
        void *vaddr;
        u32 *batch;
        int err;

        obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
        if (IS_ERR(obj)) {
                i915_vm_put(vm);
                return ERR_CAST(obj);
        }

        vaddr = i915_gem_object_pin_map_unlocked(obj, i915_coherent_map_type(gt->i915, obj, false));
        if (IS_ERR(vaddr)) {
                i915_gem_object_put(obj);
                i915_vm_put(vm);
                return ERR_CAST(vaddr);
        }

        i915_gem_object_unpin_map(h->obj);
        i915_gem_object_put(h->obj);

        h->obj = obj;
        h->batch = vaddr;

        vma = i915_vma_instance(h->obj, vm, NULL);
        if (IS_ERR(vma)) {
                i915_vm_put(vm);
                return ERR_CAST(vma);
        }

        hws = i915_vma_instance(h->hws, vm, NULL);
        if (IS_ERR(hws)) {
                i915_vm_put(vm);
                return ERR_CAST(hws);
        }

        err = i915_vma_pin(vma, 0, 0, PIN_USER);
        if (err) {
                i915_vm_put(vm);
                return ERR_PTR(err);
        }

        err = i915_vma_pin(hws, 0, 0, PIN_USER);
        if (err)
                goto unpin_vma;

        rq = igt_request_alloc(h->ctx, engine);
        if (IS_ERR(rq)) {
                err = PTR_ERR(rq);
                goto unpin_hws;
        }

        err = move_to_active(vma, rq, 0);
        if (err)
                goto cancel_rq;

        err = move_to_active(hws, rq, 0);
        if (err)
                goto cancel_rq;

        batch = h->batch;
        if (GRAPHICS_VER(gt->i915) >= 8) {
                *batch++ = MI_STORE_DWORD_IMM_GEN4;
                *batch++ = lower_32_bits(hws_address(hws, rq));
                *batch++ = upper_32_bits(hws_address(hws, rq));
                *batch++ = rq->fence.seqno;
                *batch++ = MI_NOOP;

                memset(batch, 0, 1024);
                batch += 1024 / sizeof(*batch);

                *batch++ = MI_NOOP;
                *batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
                *batch++ = lower_32_bits(vma->node.start);
                *batch++ = upper_32_bits(vma->node.start);
        } else if (GRAPHICS_VER(gt->i915) >= 6) {
                *batch++ = MI_STORE_DWORD_IMM_GEN4;
                *batch++ = 0;
                *batch++ = lower_32_bits(hws_address(hws, rq));
                *batch++ = rq->fence.seqno;
                *batch++ = MI_NOOP;

                memset(batch, 0, 1024);
                batch += 1024 / sizeof(*batch);

                *batch++ = MI_NOOP;
                *batch++ = MI_BATCH_BUFFER_START | 1 << 8;
                *batch++ = lower_32_bits(vma->node.start);
        } else if (GRAPHICS_VER(gt->i915) >= 4) {
                *batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
                *batch++ = 0;
                *batch++ = lower_32_bits(hws_address(hws, rq));
                *batch++ = rq->fence.seqno;
                *batch++ = MI_NOOP;

                memset(batch, 0, 1024);
                batch += 1024 / sizeof(*batch);

                *batch++ = MI_NOOP;
                *batch++ = MI_BATCH_BUFFER_START | 2 << 6;
                *batch++ = lower_32_bits(vma->node.start);
        } else {
                *batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
                *batch++ = lower_32_bits(hws_address(hws, rq));
                *batch++ = rq->fence.seqno;
                *batch++ = MI_NOOP;

                memset(batch, 0, 1024);
                batch += 1024 / sizeof(*batch);

                *batch++ = MI_NOOP;
                *batch++ = MI_BATCH_BUFFER_START | 2 << 6;
                *batch++ = lower_32_bits(vma->node.start);
        }
        *batch++ = MI_BATCH_BUFFER_END; /* not reached */
        intel_gt_chipset_flush(engine->gt);

        if (rq->engine->emit_init_breadcrumb) {
                err = rq->engine->emit_init_breadcrumb(rq);
                if (err)
                        goto cancel_rq;
        }

        flags = 0;
        if (GRAPHICS_VER(gt->i915) <= 5)
                flags |= I915_DISPATCH_SECURE;

        err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags);

cancel_rq:
        if (err) {
                i915_request_set_error_once(rq, err);
                i915_request_add(rq);
        }
unpin_hws:
        i915_vma_unpin(hws);
unpin_vma:
        i915_vma_unpin(vma);
        i915_vm_put(vm);
        return err ? ERR_PTR(err) : rq;
}

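/* Sample the seqno the batch last wrote for this request's context. */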
static u32 hws_seqno(const struct hang *h, const struct i915_request *rq)
{
        return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]);
}

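/* Terminate any still-spinning batch before releasing the objects. */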
static void hang_fini(struct hang *h)
{
        *h->batch = MI_BATCH_BUFFER_END;
        intel_gt_chipset_flush(h->gt);

        i915_gem_object_unpin_map(h->obj);
        i915_gem_object_put(h->obj);

        i915_gem_object_unpin_map(h->hws);
        i915_gem_object_put(h->hws);

        kernel_context_close(h->ctx);

        igt_flush_test(h->gt->i915);
}

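/*
 * The request is running once its seqno appears in the HWS page: first a
 * short 10us busy-wait for the common case, then a slow 1s sleeping wait.
 */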
static bool wait_until_running(struct hang *h, struct i915_request *rq)
{
        return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq),
                                               rq->fence.seqno),
                             10) &&
                 wait_for(i915_seqno_passed(hws_seqno(h, rq),
                                            rq->fence.seqno),
                          1000));
}

static int igt_hang_sanitycheck(void *arg)
{
        struct intel_gt *gt = arg;
        struct i915_request *rq;
        struct intel_engine_cs *engine;
        enum intel_engine_id id;
        struct hang h;
        int err;

        /* Basic check that we can execute our hanging batch */

        err = hang_init(&h, gt);
        if (err)
                return err;

        for_each_engine(engine, gt, id) {
                struct intel_wedge_me w;
                long timeout;

                if (!intel_engine_can_store_dword(engine))
                        continue;

                rq = hang_create_request(&h, engine);
                if (IS_ERR(rq)) {
                        err = PTR_ERR(rq);
                        pr_err("Failed to create request for %s, err=%d\n",
                               engine->name, err);
                        goto fini;
                }

                i915_request_get(rq);

                *h.batch = MI_BATCH_BUFFER_END;
                intel_gt_chipset_flush(engine->gt);

                i915_request_add(rq);

                timeout = 0;
                intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
                        timeout = i915_request_wait(rq, 0,
                                                    MAX_SCHEDULE_TIMEOUT);
                if (intel_gt_is_wedged(gt))
                        timeout = -EIO;

                i915_request_put(rq);

                if (timeout < 0) {
                        err = timeout;
                        pr_err("Wait for request failed on %s, err=%d\n",
                               engine->name, err);
                        goto fini;
                }
        }

fini:
        hang_fini(&h);
        return err;
}

static bool wait_for_idle(struct intel_engine_cs *engine)
{
        return wait_for(intel_engine_is_idle(engine), IGT_IDLE_TIMEOUT) == 0;
}

static int igt_reset_nop(void *arg)
{
        struct intel_gt *gt = arg;
        struct i915_gpu_error *global = &gt->i915->gpu_error;
        struct intel_engine_cs *engine;
        unsigned int reset_count, count;
        enum intel_engine_id id;
        IGT_TIMEOUT(end_time);
        int err = 0;

        /* Check that we can reset during non-user portions of requests */

        reset_count = i915_reset_count(global);
        count = 0;
        do {
                for_each_engine(engine, gt, id) {
                        struct intel_context *ce;
                        int i;

                        ce = intel_context_create(engine);
                        if (IS_ERR(ce)) {
                                err = PTR_ERR(ce);
                                pr_err("[%s] Create context failed: %d!\n", engine->name, err);
                                break;
                        }

                        for (i = 0; i < 16; i++) {
                                struct i915_request *rq;

                                rq = intel_context_create_request(ce);
                                if (IS_ERR(rq)) {
                                        err = PTR_ERR(rq);
                                        pr_err("[%s] Create request failed: %d!\n",
                                               engine->name, err);
                                        break;
                                }

                                i915_request_add(rq);
                        }

                        intel_context_put(ce);
                }

                igt_global_reset_lock(gt);
                intel_gt_reset(gt, ALL_ENGINES, NULL);
                igt_global_reset_unlock(gt);

                /*
                 * Note: engine is stale here, the for_each_engine() walk
                 * above has completed, so don't dereference it in messages.
                 */
                if (intel_gt_is_wedged(gt)) {
                        pr_err("GT is wedged!\n");
                        err = -EIO;
                        break;
                }

                if (i915_reset_count(global) != reset_count + ++count) {
                        pr_err("Reset not recorded: %d vs %d + %d!\n",
                               i915_reset_count(global), reset_count, count);
                        err = -EINVAL;
                        break;
                }

                err = igt_flush_test(gt->i915);
                if (err) {
                        pr_err("Flush failed: %d!\n", err);
                        break;
                }
        } while (time_before(jiffies, end_time));
        pr_info("%s: %d resets\n", __func__, count);

        if (igt_flush_test(gt->i915)) {
                pr_err("Post flush failed: %d!\n", err);
                err = -EIO;
        }

        return err;
}

static int igt_reset_nop_engine(void *arg)
{
        struct intel_gt *gt = arg;
        struct i915_gpu_error *global = &gt->i915->gpu_error;
        struct intel_engine_cs *engine;
        enum intel_engine_id id;

        /* Check that we can engine-reset during non-user portions */

        if (!intel_has_reset_engine(gt))
                return 0;

        for_each_engine(engine, gt, id) {
                unsigned int reset_count, reset_engine_count, count;
                struct intel_context *ce;
                IGT_TIMEOUT(end_time);
                int err;

                if (intel_engine_uses_guc(engine)) {
                        /* Engine level resets are triggered by GuC when a hang
                         * is detected. They can't be triggered by the KMD any
                         * more. Thus a nop batch cannot be used as a reset test
                         */
                        continue;
                }

                ce = intel_context_create(engine);
                if (IS_ERR(ce)) {
                        pr_err("[%s] Create context failed: %pe!\n", engine->name, ce);
                        return PTR_ERR(ce);
                }

                reset_count = i915_reset_count(global);
                reset_engine_count = i915_reset_engine_count(global, engine);
                count = 0;

                st_engine_heartbeat_disable(engine);
                GEM_BUG_ON(test_and_set_bit(I915_RESET_ENGINE + id,
                                            &gt->reset.flags));
                do {
                        int i;

                        if (!wait_for_idle(engine)) {
                                pr_err("%s failed to idle before reset\n",
                                       engine->name);
                                err = -EIO;
                                break;
                        }

                        for (i = 0; i < 16; i++) {
                                struct i915_request *rq;

                                rq = intel_context_create_request(ce);
                                if (IS_ERR(rq)) {
                                        struct drm_printer p =
                                                drm_info_printer(gt->i915->drm.dev);
                                        intel_engine_dump(engine, &p,
                                                          "%s(%s): failed to submit request\n",
                                                          __func__,
                                                          engine->name);

                                        GEM_TRACE("%s(%s): failed to submit request\n",
                                                  __func__,
                                                  engine->name);
                                        GEM_TRACE_DUMP();

                                        intel_gt_set_wedged(gt);

                                        err = PTR_ERR(rq);
                                        break;
                                }

                                i915_request_add(rq);
                        }
                        err = intel_engine_reset(engine, NULL);
                        if (err) {
                                pr_err("intel_engine_reset(%s) failed, err:%d\n",
                                       engine->name, err);
                                break;
                        }

                        if (i915_reset_count(global) != reset_count) {
                                pr_err("Full GPU reset recorded! (engine reset expected)\n");
                                err = -EINVAL;
                                break;
                        }

                        if (i915_reset_engine_count(global, engine) !=
                            reset_engine_count + ++count) {
                                pr_err("%s engine reset not recorded!\n",
                                       engine->name);
                                err = -EINVAL;
                                break;
                        }
                } while (time_before(jiffies, end_time));
                clear_and_wake_up_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
                st_engine_heartbeat_enable(engine);

                pr_info("%s(%s): %d resets\n", __func__, engine->name, count);

                intel_context_put(ce);
                if (igt_flush_test(gt->i915))
                        err = -EIO;
                if (err)
                        return err;
        }

        return 0;
}

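/*
 * Fault injection: bias the engine-reset path so every attempt on this
 * engine times out until cancel_reset_timeout() clears the injection.
 */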
static void force_reset_timeout(struct intel_engine_cs *engine)
{
        engine->reset_timeout.probability = 999;
        atomic_set(&engine->reset_timeout.times, -1);
}

static void cancel_reset_timeout(struct intel_engine_cs *engine)
{
        memset(&engine->reset_timeout, 0, sizeof(engine->reset_timeout));
}

static int igt_reset_fail_engine(void *arg)
{
        struct intel_gt *gt = arg;
        struct intel_engine_cs *engine;
        enum intel_engine_id id;

        /* Check that we can recover from engine-reset failures */

        if (!intel_has_reset_engine(gt))
                return 0;

        for_each_engine(engine, gt, id) {
                unsigned int count;
                struct intel_context *ce;
                IGT_TIMEOUT(end_time);
                int err;

                /* Can't manually break the reset if i915 doesn't perform it */
                if (intel_engine_uses_guc(engine))
                        continue;

                ce = intel_context_create(engine);
                if (IS_ERR(ce)) {
                        pr_err("[%s] Create context failed: %pe!\n", engine->name, ce);
                        return PTR_ERR(ce);
                }

                st_engine_heartbeat_disable(engine);
                GEM_BUG_ON(test_and_set_bit(I915_RESET_ENGINE + id,
                                            &gt->reset.flags));

                force_reset_timeout(engine);
                err = intel_engine_reset(engine, NULL);
                cancel_reset_timeout(engine);
                if (err == 0) /* timeouts only generated on gen8+ */
                        goto skip;

                count = 0;
                do {
                        struct i915_request *last = NULL;
                        int i;

                        if (!wait_for_idle(engine)) {
                                pr_err("%s failed to idle before reset\n",
                                       engine->name);
                                err = -EIO;
                                break;
                        }

                        for (i = 0; i < count % 15; i++) {
                                struct i915_request *rq;

                                rq = intel_context_create_request(ce);
                                if (IS_ERR(rq)) {
                                        struct drm_printer p =
                                                drm_info_printer(gt->i915->drm.dev);
                                        intel_engine_dump(engine, &p,
                                                          "%s(%s): failed to submit request\n",
                                                          __func__,
                                                          engine->name);

                                        GEM_TRACE("%s(%s): failed to submit request\n",
                                                  __func__,
                                                  engine->name);
                                        GEM_TRACE_DUMP();

                                        intel_gt_set_wedged(gt);
                                        if (last)
                                                i915_request_put(last);

                                        err = PTR_ERR(rq);
                                        goto out;
                                }

                                if (last)
                                        i915_request_put(last);
                                last = i915_request_get(rq);
                                i915_request_add(rq);
                        }

                        if (count & 1) {
                                err = intel_engine_reset(engine, NULL);
                                if (err) {
                                        GEM_TRACE_ERR("intel_engine_reset(%s) failed, err:%d\n",
                                                      engine->name, err);
                                        GEM_TRACE_DUMP();
                                        i915_request_put(last);
                                        break;
                                }
                        } else {
                                force_reset_timeout(engine);
                                err = intel_engine_reset(engine, NULL);
                                cancel_reset_timeout(engine);
                                if (err != -ETIMEDOUT) {
                                        pr_err("intel_engine_reset(%s) did not fail, err:%d\n",
                                               engine->name, err);
                                        i915_request_put(last);
                                        break;
                                }
                        }

                        err = 0;
                        if (last) {
                                if (i915_request_wait(last, 0, HZ / 2) < 0) {
                                        struct drm_printer p =
                                                drm_info_printer(gt->i915->drm.dev);

                                        intel_engine_dump(engine, &p,
                                                          "%s(%s): failed to complete request\n",
                                                          __func__,
                                                          engine->name);

                                        GEM_TRACE("%s(%s): failed to complete request\n",
                                                  __func__,
                                                  engine->name);
                                        GEM_TRACE_DUMP();

                                        err = -EIO;
                                }
                                i915_request_put(last);
                        }
                        count++;
                } while (err == 0 && time_before(jiffies, end_time));
out:
                pr_info("%s(%s): %d resets\n", __func__, engine->name, count);
skip:
                clear_and_wake_up_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
                st_engine_heartbeat_enable(engine);
                intel_context_put(ce);

                if (igt_flush_test(gt->i915))
                        err = -EIO;
                if (err)
                        return err;
        }

        return 0;
}

static int __igt_reset_engine(struct intel_gt *gt, bool active)
{
        struct i915_gpu_error *global = &gt->i915->gpu_error;
        struct intel_engine_cs *engine;
        enum intel_engine_id id;
        struct hang h;
        int err = 0;

        /* Check that we can issue an engine reset on an idle or active engine */

        if (!intel_has_reset_engine(gt))
                return 0;

        if (active) {
                err = hang_init(&h, gt);
                if (err)
                        return err;
        }

        for_each_engine(engine, gt, id) {
                unsigned int reset_count, reset_engine_count;
                unsigned long count;
                bool using_guc = intel_engine_uses_guc(engine);
                IGT_TIMEOUT(end_time);

                if (using_guc && !active)
                        continue;

                if (active && !intel_engine_can_store_dword(engine))
                        continue;

                if (!wait_for_idle(engine)) {
                        pr_err("%s failed to idle before reset\n",
                               engine->name);
                        err = -EIO;
                        break;
                }

                reset_count = i915_reset_count(global);
                reset_engine_count = i915_reset_engine_count(global, engine);

                st_engine_heartbeat_disable(engine);
                GEM_BUG_ON(test_and_set_bit(I915_RESET_ENGINE + id,
                                            &gt->reset.flags));
                count = 0;
                do {
                        struct i915_request *rq = NULL;
                        struct intel_selftest_saved_policy saved;
                        int err2;

                        err = intel_selftest_modify_policy(engine, &saved,
                                                           SELFTEST_SCHEDULER_MODIFY_FAST_RESET);
                        if (err) {
                                pr_err("[%s] Modify policy failed: %d!\n", engine->name, err);
                                break;
                        }

                        if (active) {
                                rq = hang_create_request(&h, engine);
                                if (IS_ERR(rq)) {
                                        err = PTR_ERR(rq);
                                        pr_err("[%s] Create hang request failed: %d!\n",
                                               engine->name, err);
                                        goto restore;
                                }

                                i915_request_get(rq);
                                i915_request_add(rq);

                                if (!wait_until_running(&h, rq)) {
                                        struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

                                        pr_err("%s: Failed to start request %llx, at %x\n",
                                               __func__, rq->fence.seqno, hws_seqno(&h, rq));
                                        intel_engine_dump(engine, &p,
                                                          "%s\n", engine->name);

                                        i915_request_put(rq);
                                        err = -EIO;
                                        goto restore;
                                }
                        }

                        if (!using_guc) {
                                err = intel_engine_reset(engine, NULL);
                                if (err) {
                                        pr_err("intel_engine_reset(%s) failed, err:%d\n",
                                               engine->name, err);
                                        goto skip;
                                }
                        }

                        if (rq) {
                                /* Ensure the reset happens and kills the engine */
                                err = intel_selftest_wait_for_rq(rq);
                                if (err)
                                        pr_err("[%s] Wait for request %lld:%lld [0x%04X] failed: %d!\n",
                                               engine->name, rq->fence.context,
                                               rq->fence.seqno, rq->context->guc_id.id, err);
                        }

skip:
                        if (rq)
                                i915_request_put(rq);

                        if (i915_reset_count(global) != reset_count) {
                                pr_err("Full GPU reset recorded! (engine reset expected)\n");
                                err = -EINVAL;
                                goto restore;
                        }

                        /* GuC based resets are not logged per engine */
                        if (!using_guc) {
                                if (i915_reset_engine_count(global, engine) !=
                                    ++reset_engine_count) {
                                        pr_err("%s engine reset not recorded!\n",
                                               engine->name);
                                        err = -EINVAL;
                                        goto restore;
                                }
                        }

                        count++;

restore:
                        err2 = intel_selftest_restore_policy(engine, &saved);
                        if (err2)
                                pr_err("[%s] Restore policy failed: %d!\n", engine->name, err2);
                        if (err == 0)
                                err = err2;
                        if (err)
                                break;
                } while (time_before(jiffies, end_time));
                clear_and_wake_up_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
                st_engine_heartbeat_enable(engine);
                pr_info("%s: Completed %lu %s resets\n",
                        engine->name, count, active ? "active" : "idle");

                if (err)
                        break;

                err = igt_flush_test(gt->i915);
                if (err) {
                        pr_err("[%s] Flush failed: %d!\n", engine->name, err);
                        break;
                }
        }

        if (intel_gt_is_wedged(gt)) {
                pr_err("GT is wedged!\n");
                err = -EIO;
        }

        if (active)
                hang_fini(&h);

        return err;
}

static int igt_reset_idle_engine(void *arg)
{
        return __igt_reset_engine(arg, false);
}

static int igt_reset_active_engine(void *arg)
{
        return __igt_reset_engine(arg, true);
}

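/*
 * Background load for the multi-engine reset tests: one kthread per engine,
 * each keeping a small ring of outstanding requests in flight while another
 * engine is being reset.
 */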
struct active_engine {
        struct task_struct *task;
        struct intel_engine_cs *engine;
        unsigned long resets;
        unsigned int flags;
};

#define TEST_ACTIVE     BIT(0)
#define TEST_OTHERS     BIT(1)
#define TEST_SELF       BIT(2)
#define TEST_PRIORITY   BIT(3)

static int active_request_put(struct i915_request *rq)
{
        int err = 0;

        if (!rq)
                return 0;

        if (i915_request_wait(rq, 0, 10 * HZ) < 0) {
                GEM_TRACE("%s timed out waiting for completion of fence %llx:%lld\n",
                          rq->engine->name,
                          rq->fence.context,
                          rq->fence.seqno);
                GEM_TRACE_DUMP();

                intel_gt_set_wedged(rq->engine->gt);
                err = -EIO;
        }

        i915_request_put(rq);

        return err;
}

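/* Thread body: recycle a ring of requests on one engine until stopped. */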
static int active_engine(void *data)
{
        I915_RND_STATE(prng);
        struct active_engine *arg = data;
        struct intel_engine_cs *engine = arg->engine;
        struct i915_request *rq[8] = {};
        struct intel_context *ce[ARRAY_SIZE(rq)];
        unsigned long count;
        int err = 0;

        for (count = 0; count < ARRAY_SIZE(ce); count++) {
                ce[count] = intel_context_create(engine);
                if (IS_ERR(ce[count])) {
                        err = PTR_ERR(ce[count]);
                        pr_err("[%s] Create context #%ld failed: %d!\n", engine->name, count, err);
                        while (count--)
                                intel_context_put(ce[count]);
                        return err;
                }
        }

        count = 0;
        while (!kthread_should_stop()) {
                unsigned int idx = count++ & (ARRAY_SIZE(rq) - 1);
                struct i915_request *old = rq[idx];
                struct i915_request *new;

                new = intel_context_create_request(ce[idx]);
                if (IS_ERR(new)) {
                        err = PTR_ERR(new);
                        pr_err("[%s] Create request #%d failed: %d!\n", engine->name, idx, err);
                        break;
                }

                rq[idx] = i915_request_get(new);
                i915_request_add(new);

                if (engine->sched_engine->schedule && arg->flags & TEST_PRIORITY) {
                        struct i915_sched_attr attr = {
                                .priority =
                                        i915_prandom_u32_max_state(512, &prng),
                        };
                        engine->sched_engine->schedule(rq[idx], &attr);
                }

                err = active_request_put(old);
                if (err) {
                        pr_err("[%s] Request put failed: %d!\n", engine->name, err);
                        break;
                }

                cond_resched();
        }

        for (count = 0; count < ARRAY_SIZE(rq); count++) {
                int err__ = active_request_put(rq[count]);

                if (err__)
                        pr_err("[%s] Request put #%ld failed: %d!\n", engine->name, count, err__);

                /* Keep the first error */
                if (!err)
                        err = err__;

                intel_context_put(ce[count]);
        }

        return err;
}

static int __igt_reset_engines(struct intel_gt *gt,
                               const char *test_name,
                               unsigned int flags)
{
        struct i915_gpu_error *global = &gt->i915->gpu_error;
        struct intel_engine_cs *engine, *other;
        enum intel_engine_id id, tmp;
        struct hang h;
        int err = 0;

        /* Check that issuing a reset on one engine does not interfere
         * with any other engine.
         */

        if (!intel_has_reset_engine(gt))
                return 0;

        if (flags & TEST_ACTIVE) {
                err = hang_init(&h, gt);
                if (err)
                        return err;

                if (flags & TEST_PRIORITY)
                        h.ctx->sched.priority = 1024;
        }

        for_each_engine(engine, gt, id) {
                struct active_engine threads[I915_NUM_ENGINES] = {};
                unsigned long device = i915_reset_count(global);
                unsigned long count = 0, reported;
                bool using_guc = intel_engine_uses_guc(engine);
                IGT_TIMEOUT(end_time);

                if (flags & TEST_ACTIVE) {
                        if (!intel_engine_can_store_dword(engine))
                                continue;
                } else if (using_guc)
                        continue;

                if (!wait_for_idle(engine)) {
                        pr_err("i915_reset_engine(%s:%s): failed to idle before reset\n",
                               engine->name, test_name);
                        err = -EIO;
                        break;
                }

                memset(threads, 0, sizeof(threads));
                for_each_engine(other, gt, tmp) {
                        struct task_struct *tsk;

                        threads[tmp].resets =
                                i915_reset_engine_count(global, other);

                        if (other == engine && !(flags & TEST_SELF))
                                continue;

                        if (other != engine && !(flags & TEST_OTHERS))
                                continue;

                        threads[tmp].engine = other;
                        threads[tmp].flags = flags;

                        tsk = kthread_run(active_engine, &threads[tmp],
                                          "igt/%s", other->name);
                        if (IS_ERR(tsk)) {
                                err = PTR_ERR(tsk);
                                pr_err("[%s] Thread spawn failed: %d!\n", engine->name, err);
                                goto unwind;
                        }

                        threads[tmp].task = tsk;
                        get_task_struct(tsk);
                }

                yield(); /* start all threads before we begin */

                st_engine_heartbeat_disable_no_pm(engine);
                GEM_BUG_ON(test_and_set_bit(I915_RESET_ENGINE + id,
                                            &gt->reset.flags));
                do {
                        struct i915_request *rq = NULL;
                        struct intel_selftest_saved_policy saved;
                        int err2;

                        err = intel_selftest_modify_policy(engine, &saved,
                                                           SELFTEST_SCHEDULER_MODIFY_FAST_RESET);
                        if (err) {
                                pr_err("[%s] Modify policy failed: %d!\n", engine->name, err);
                                break;
                        }

                        if (flags & TEST_ACTIVE) {
                                rq = hang_create_request(&h, engine);
                                if (IS_ERR(rq)) {
                                        err = PTR_ERR(rq);
                                        pr_err("[%s] Create hang request failed: %d!\n",
                                               engine->name, err);
                                        goto restore;
                                }

                                i915_request_get(rq);
                                i915_request_add(rq);

                                if (!wait_until_running(&h, rq)) {
                                        struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

                                        pr_err("%s: Failed to start request %llx, at %x\n",
                                               __func__, rq->fence.seqno, hws_seqno(&h, rq));
                                        intel_engine_dump(engine, &p,
                                                          "%s\n", engine->name);

                                        i915_request_put(rq);
                                        err = -EIO;
                                        goto restore;
                                }
                        } else {
                                intel_engine_pm_get(engine);
                        }

                        if (!using_guc) {
                                err = intel_engine_reset(engine, NULL);
                                if (err) {
                                        pr_err("i915_reset_engine(%s:%s): failed, err=%d\n",
                                               engine->name, test_name, err);
                                        goto restore;
                                }
                        }

                        if (rq) {
                                /* Ensure the reset happens and kills the engine */
                                err = intel_selftest_wait_for_rq(rq);
                                if (err)
                                        pr_err("[%s] Wait for request %lld:%lld [0x%04X] failed: %d!\n",
                                               engine->name, rq->fence.context,
                                               rq->fence.seqno, rq->context->guc_id.id, err);
                        }

                        count++;

                        if (rq) {
                                if (rq->fence.error != -EIO) {
                                        pr_err("i915_reset_engine(%s:%s): failed to reset request %lld:%lld [0x%04X]\n",
                                               engine->name, test_name,
                                               rq->fence.context,
                                               rq->fence.seqno, rq->context->guc_id.id);
                                        i915_request_put(rq);

                                        GEM_TRACE_DUMP();
                                        intel_gt_set_wedged(gt);
                                        err = -EIO;
                                        goto restore;
                                }

                                if (i915_request_wait(rq, 0, HZ / 5) < 0) {
                                        struct drm_printer p =
                                                drm_info_printer(gt->i915->drm.dev);

                                        pr_err("i915_reset_engine(%s:%s):"
                                               " failed to complete request %llx:%lld after reset\n",
                                               engine->name, test_name,
                                               rq->fence.context,
                                               rq->fence.seqno);
                                        intel_engine_dump(engine, &p,
                                                          "%s\n", engine->name);
                                        i915_request_put(rq);

                                        GEM_TRACE_DUMP();
                                        intel_gt_set_wedged(gt);
                                        err = -EIO;
                                        goto restore;
                                }

                                i915_request_put(rq);
                        }

                        if (!(flags & TEST_ACTIVE))
                                intel_engine_pm_put(engine);

                        if (!(flags & TEST_SELF) && !wait_for_idle(engine)) {
                                struct drm_printer p =
                                        drm_info_printer(gt->i915->drm.dev);

                                pr_err("i915_reset_engine(%s:%s):"
                                       " failed to idle after reset\n",
                                       engine->name, test_name);
                                intel_engine_dump(engine, &p,
                                                  "%s\n", engine->name);

                                err = -EIO;
                                goto restore;
                        }

restore:
                        err2 = intel_selftest_restore_policy(engine, &saved);
                        if (err2)
                                pr_err("[%s] Restore policy failed: %d!\n", engine->name, err2);
                        if (err == 0)
                                err = err2;
                        if (err)
                                break;
                } while (time_before(jiffies, end_time));
                clear_and_wake_up_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
                st_engine_heartbeat_enable_no_pm(engine);

                pr_info("i915_reset_engine(%s:%s): %lu resets\n",
                        engine->name, test_name, count);

                /* GuC based resets are not logged per engine */
                if (!using_guc) {
                        reported = i915_reset_engine_count(global, engine);
                        reported -= threads[engine->id].resets;
                        if (reported != count) {
                                pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu\n",
                                       engine->name, test_name, count, reported);
                                if (!err)
                                        err = -EINVAL;
                        }
                }

unwind:
                for_each_engine(other, gt, tmp) {
                        int ret;

                        if (!threads[tmp].task)
                                continue;

                        ret = kthread_stop(threads[tmp].task);
                        if (ret) {
                                pr_err("kthread for other engine %s failed, err=%d\n",
                                       other->name, ret);
                                if (!err)
                                        err = ret;
                        }
                        put_task_struct(threads[tmp].task);

                        /* GuC based resets are not logged per engine */
                        if (!using_guc) {
                                if (other->uabi_class != engine->uabi_class &&
                                    threads[tmp].resets !=
                                    i915_reset_engine_count(global, other)) {
                                        pr_err("Innocent engine %s was reset (count=%ld)\n",
                                               other->name,
                                               i915_reset_engine_count(global, other) -
                                               threads[tmp].resets);
                                        if (!err)
                                                err = -EINVAL;
                                }
                        }
                }

                if (device != i915_reset_count(global)) {
                        pr_err("Global reset (count=%ld)!\n",
                               i915_reset_count(global) - device);
                        if (!err)
                                err = -EINVAL;
                }

                if (err)
                        break;

                err = igt_flush_test(gt->i915);
                if (err) {
                        pr_err("[%s] Flush failed: %d!\n", engine->name, err);
                        break;
                }
        }

        if (intel_gt_is_wedged(gt))
                err = -EIO;

        if (flags & TEST_ACTIVE)
                hang_fini(&h);

        return err;
}

static int igt_reset_engines(void *arg)
{
        static const struct {
                const char *name;
                unsigned int flags;
        } phases[] = {
                { "idle", 0 },
                { "active", TEST_ACTIVE },
                { "others-idle", TEST_OTHERS },
                { "others-active", TEST_OTHERS | TEST_ACTIVE },
                {
                        "others-priority",
                        TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY
                },
                {
                        "self-priority",
                        TEST_ACTIVE | TEST_PRIORITY | TEST_SELF,
                },
                { }
        };
        struct intel_gt *gt = arg;
        typeof(*phases) *p;
        int err;

        for (p = phases; p->name; p++) {
                if (p->flags & TEST_PRIORITY) {
                        if (!(gt->i915->caps.scheduler & I915_SCHEDULER_CAP_PRIORITY))
                                continue;
                }

                err = __igt_reset_engines(arg, p->name, p->flags);
                if (err)
                        return err;
        }

        return 0;
}

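/*
 * Stand in for hangcheck: trigger the reset directly and return the global
 * reset count sampled beforehand so callers can verify that it advanced.
 */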
static u32 fake_hangcheck(struct intel_gt *gt, intel_engine_mask_t mask)
{
        u32 count = i915_reset_count(&gt->i915->gpu_error);

        intel_gt_reset(gt, mask, NULL);

        return count;
}

static int igt_reset_wait(void *arg)
{
        struct intel_gt *gt = arg;
        struct i915_gpu_error *global = &gt->i915->gpu_error;
        struct intel_engine_cs *engine = gt->engine[RCS0];
        struct i915_request *rq;
        unsigned int reset_count;
        struct hang h;
        long timeout;
        int err;

        if (!engine || !intel_engine_can_store_dword(engine))
                return 0;

        /* Check that we detect a stuck waiter and issue a reset */

        igt_global_reset_lock(gt);

        err = hang_init(&h, gt);
        if (err) {
                pr_err("[%s] Hang init failed: %d!\n", engine->name, err);
                goto unlock;
        }

        rq = hang_create_request(&h, engine);
        if (IS_ERR(rq)) {
                err = PTR_ERR(rq);
                pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
                goto fini;
        }

        i915_request_get(rq);
        i915_request_add(rq);

        if (!wait_until_running(&h, rq)) {
                struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

                pr_err("%s: Failed to start request %llx, at %x\n",
                       __func__, rq->fence.seqno, hws_seqno(&h, rq));
                intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

                intel_gt_set_wedged(gt);

                err = -EIO;
                goto out_rq;
        }

        reset_count = fake_hangcheck(gt, ALL_ENGINES);

        timeout = i915_request_wait(rq, 0, 10);
        if (timeout < 0) {
                pr_err("i915_request_wait failed on a stuck request: err=%ld\n",
                       timeout);
                err = timeout;
                goto out_rq;
        }

        if (i915_reset_count(global) == reset_count) {
                pr_err("No GPU reset recorded!\n");
                err = -EINVAL;
                goto out_rq;
        }

out_rq:
        i915_request_put(rq);
fini:
        hang_fini(&h);
unlock:
        igt_global_reset_unlock(gt);

        if (intel_gt_is_wedged(gt))
                return -EIO;

        return err;
}

struct evict_vma {
        struct completion completion;
        struct i915_vma *vma;
};

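/*
 * Runs in a kthread: signal readiness, then try to evict the target node;
 * with a hanging request holding the vma active, this must block until the
 * reset completes.
 */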
static int evict_vma(void *data)
{
        struct evict_vma *arg = data;
        struct i915_address_space *vm = arg->vma->vm;
        struct drm_mm_node evict = arg->vma->node;
        int err;

        complete(&arg->completion);

        mutex_lock(&vm->mutex);
        err = i915_gem_evict_for_node(vm, NULL, &evict, 0);
        mutex_unlock(&vm->mutex);

        return err;
}

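/* As evict_vma, but contend for the busy vma's fence register instead. */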
static int evict_fence(void *data)
{
        struct evict_vma *arg = data;
        int err;

        complete(&arg->completion);

        /* Mark the fence register as dirty to force the mmio update. */
        err = i915_gem_object_set_tiling(arg->vma->obj, I915_TILING_Y, 512);
        if (err) {
                pr_err("Invalid Y-tiling settings; err:%d\n", err);
                return err;
        }

        err = i915_vma_pin(arg->vma, 0, 0, PIN_GLOBAL | PIN_MAPPABLE);
        if (err) {
                pr_err("Unable to pin vma for Y-tiled fence; err:%d\n", err);
                return err;
        }

        err = i915_vma_pin_fence(arg->vma);
        i915_vma_unpin(arg->vma);
        if (err) {
                pr_err("Unable to pin Y-tiled fence; err:%d\n", err);
                return err;
        }

        i915_vma_unpin_fence(arg->vma);

        return 0;
}

1425static int __igt_reset_evict_vma(struct intel_gt *gt,
1426                                 struct i915_address_space *vm,
1427                                 int (*fn)(void *),
1428                                 unsigned int flags)
1429{
1430        struct intel_engine_cs *engine = gt->engine[RCS0];
1431        struct drm_i915_gem_object *obj;
1432        struct task_struct *tsk = NULL;
1433        struct i915_request *rq;
1434        struct evict_vma arg;
1435        struct hang h;
1436        unsigned int pin_flags;
1437        int err;
1438
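            /* Without fence registers, the NEEDS_FENCE flavour cannot run. */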
1439        if (!gt->ggtt->num_fences && flags & EXEC_OBJECT_NEEDS_FENCE)
1440                return 0;
1441
1442        if (!engine || !intel_engine_can_store_dword(engine))
1443                return 0;
1444
1445        /* Check that we can recover an unbind stuck on a hanging request */
1446
1447        err = hang_init(&h, gt);
1448        if (err) {
1449                pr_err("[%s] Hang init failed: %d!\n", engine->name, err);
1450                return err;
1451        }
1452
1453        obj = i915_gem_object_create_internal(gt->i915, SZ_1M);
1454        if (IS_ERR(obj)) {
1455                err = PTR_ERR(obj);
1456                pr_err("[%s] Create object failed: %d!\n", engine->name, err);
1457                goto fini;
1458        }
1459
1460        if (flags & EXEC_OBJECT_NEEDS_FENCE) {
1461                err = i915_gem_object_set_tiling(obj, I915_TILING_X, 512);
1462                if (err) {
1463                        pr_err("Invalid X-tiling settings; err:%d\n", err);
1464                        goto out_obj;
1465                }
1466        }
1467
1468        arg.vma = i915_vma_instance(obj, vm, NULL);
1469        if (IS_ERR(arg.vma)) {
1470                err = PTR_ERR(arg.vma);
1471                pr_err("[%s] VMA instance failed: %d!\n", engine->name, err);
1472                goto out_obj;
1473        }
1474
1475        rq = hang_create_request(&h, engine);
1476        if (IS_ERR(rq)) {
1477                err = PTR_ERR(rq);
1478                pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
1479                goto out_obj;
1480        }
1481
1482        pin_flags = i915_vma_is_ggtt(arg.vma) ? PIN_GLOBAL : PIN_USER;
1483
1484        if (flags & EXEC_OBJECT_NEEDS_FENCE)
1485                pin_flags |= PIN_MAPPABLE;
1486
1487        err = i915_vma_pin(arg.vma, 0, 0, pin_flags);
1488        if (err) {
1489                i915_request_add(rq);
1490                pr_err("[%s] VMA pin failed: %d!\n", engine->name, err);
1491                goto out_obj;
1492        }
1493
1494        if (flags & EXEC_OBJECT_NEEDS_FENCE) {
1495                err = i915_vma_pin_fence(arg.vma);
1496                if (err) {
1497                        pr_err("Unable to pin X-tiled fence; err:%d\n", err);
1498                        i915_vma_unpin(arg.vma);
1499                        i915_request_add(rq);
1500                        goto out_obj;
1501                }
1502        }
1503
1504        i915_vma_lock(arg.vma);
1505        err = i915_request_await_object(rq, arg.vma->obj,
1506                                        flags & EXEC_OBJECT_WRITE);
1507        if (err == 0) {
1508                err = i915_vma_move_to_active(arg.vma, rq, flags);
1509                if (err)
1510                        pr_err("[%s] Move to active failed: %d!\n", engine->name, err);
1511        } else {
1512                pr_err("[%s] Request await failed: %d!\n", engine->name, err);
1513        }
1514
1515        i915_vma_unlock(arg.vma);
1516
1517        if (flags & EXEC_OBJECT_NEEDS_FENCE)
1518                i915_vma_unpin_fence(arg.vma);
1519        i915_vma_unpin(arg.vma);
1520
1521        i915_request_get(rq);
1522        i915_request_add(rq);
1523        if (err)
1524                goto out_rq;
1525
1526        if (!wait_until_running(&h, rq)) {
1527                struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1528
1529                pr_err("%s: Failed to start request %llx, at %x\n",
1530                       __func__, rq->fence.seqno, hws_seqno(&h, rq));
1531                intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
1532
1533                intel_gt_set_wedged(gt);
1534                goto out_reset;
1535        }
1536
1537        init_completion(&arg.completion);
1538
1539        tsk = kthread_run(fn, &arg, "igt/evict_vma");
1540        if (IS_ERR(tsk)) {
1541                err = PTR_ERR(tsk);
1542                pr_err("[%s] Thread spawn failed: %d!\n", engine->name, err);
1543                tsk = NULL;
1544                goto out_reset;
1545        }
1546        get_task_struct(tsk);
1547
1548        wait_for_completion(&arg.completion);
1549
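            /*
             * Give the worker up to 10ms to block on the hanging request:
             * a waiter installs a callback on the fence's cb_list, so an
             * empty list means the eviction never contended with the hang.
             */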
1550        if (wait_for(!list_empty(&rq->fence.cb_list), 10)) {
1551                struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1552
1553                pr_err("igt/evict_vma kthread did not wait\n");
1554                intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
1555
1556                intel_gt_set_wedged(gt);
1557                goto out_reset;
1558        }
1559
1560out_reset:
1561        igt_global_reset_lock(gt);
1562        fake_hangcheck(gt, rq->engine->mask);
1563        igt_global_reset_unlock(gt);
1564
1565        if (tsk) {
1566                struct intel_wedge_me w;
1567
1568                /* The reset, even indirectly, should take less than 10ms; allow a generous 100ms before wedging. */
1569                intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
1570                        err = kthread_stop(tsk);
1571
1572                put_task_struct(tsk);
1573        }
1574
1575out_rq:
1576        i915_request_put(rq);
1577out_obj:
1578        i915_gem_object_put(obj);
1579fini:
1580        hang_fini(&h);
1581        if (intel_gt_is_wedged(gt))
1582                return -EIO;
1583
1584        return err;
1585}
1586
1587static int igt_reset_evict_ggtt(void *arg)
1588{
1589        struct intel_gt *gt = arg;
1590
1591        return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
1592                                     evict_vma, EXEC_OBJECT_WRITE);
1593}
1594
1595static int igt_reset_evict_ppgtt(void *arg)
1596{
1597        struct intel_gt *gt = arg;
1598        struct i915_ppgtt *ppgtt;
1599        int err;
1600
1601        /* aliasing ppgtt shares the global gtt locking, covered by the ggtt tests above */
1602        if (INTEL_PPGTT(gt->i915) < INTEL_PPGTT_FULL)
1603                return 0;
1604
1605        ppgtt = i915_ppgtt_create(gt, 0);
1606        if (IS_ERR(ppgtt))
1607                return PTR_ERR(ppgtt);
1608
1609        err = __igt_reset_evict_vma(gt, &ppgtt->vm,
1610                                    evict_vma, EXEC_OBJECT_WRITE);
1611        i915_vm_put(&ppgtt->vm);
1612
1613        return err;
1614}
1615
1616static int igt_reset_evict_fence(void *arg)
1617{
1618        struct intel_gt *gt = arg;
1619
1620        return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
1621                                     evict_fence, EXEC_OBJECT_NEEDS_FENCE);
1622}
1623
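    /*
     * Wait for every engine except @exclude to idle. igt_reset_queue()
     * uses this to avoid a second device reset catching the kernel
     * context mid-flight on another engine (see the XXX note below).
     */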
1624static int wait_for_others(struct intel_gt *gt,
1625                           struct intel_engine_cs *exclude)
1626{
1627        struct intel_engine_cs *engine;
1628        enum intel_engine_id id;
1629
1630        for_each_engine(engine, gt, id) {
1631                if (engine == exclude)
1632                        continue;
1633
1634                if (!wait_for_idle(engine))
1635                        return -EIO;
1636        }
1637
1638        return 0;
1639}
1640
1641static int igt_reset_queue(void *arg)
1642{
1643        struct intel_gt *gt = arg;
1644        struct i915_gpu_error *global = &gt->i915->gpu_error;
1645        struct intel_engine_cs *engine;
1646        enum intel_engine_id id;
1647        struct hang h;
1648        int err;
1649
1650        /* Check that we replay pending requests following a hang */
1651
1652        igt_global_reset_lock(gt);
1653
1654        err = hang_init(&h, gt);
1655        if (err)
1656                goto unlock;
1657
1658        for_each_engine(engine, gt, id) {
1659                struct intel_selftest_saved_policy saved;
1660                struct i915_request *prev;
1661                IGT_TIMEOUT(end_time);
1662                unsigned int count;
1663                bool using_guc = intel_engine_uses_guc(engine);
1664
1665                if (!intel_engine_can_store_dword(engine))
1666                        continue;
1667
1668                if (using_guc) {
1669                        err = intel_selftest_modify_policy(engine, &saved,
1670                                                           SELFTEST_SCHEDULER_MODIFY_NO_HANGCHECK);
1671                        if (err) {
1672                                pr_err("[%s] Modify policy failed: %d!\n", engine->name, err);
1673                                goto fini;
1674                        }
1675                }
1676
1677                prev = hang_create_request(&h, engine);
1678                if (IS_ERR(prev)) {
1679                        err = PTR_ERR(prev);
1680                        pr_err("[%s] Create 'prev' hang request failed: %d!\n", engine->name, err);
1681                        goto restore;
1682                }
1683
1684                i915_request_get(prev);
1685                i915_request_add(prev);
1686
1687                count = 0;
1688                do {
1689                        struct i915_request *rq;
1690                        unsigned int reset_count;
1691
1692                        rq = hang_create_request(&h, engine);
1693                        if (IS_ERR(rq)) {
1694                                err = PTR_ERR(rq);
1695                                pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
1696                                goto restore;
1697                        }
1698
1699                        i915_request_get(rq);
1700                        i915_request_add(rq);
1701
1702                        /*
1703                         * XXX We don't handle resetting the kernel context
1704                         * very well. If we trigger a device reset twice in
1705                         * quick succession while the kernel context is
1706                         * executing, we may end up skipping the breadcrumb.
1707                         * This is really only a problem for the selftest as
1708                         * normally there is a large interlude between resets
1709                         * (hangcheck), or we focus on resetting just one
1710                         * engine and so avoid repeatedly resetting innocents.
1711                         */
1712                        err = wait_for_others(gt, engine);
1713                        if (err) {
1714                                pr_err("%s(%s): Failed to idle other engines after device reset\n",
1715                                       __func__, engine->name);
1716                                i915_request_put(rq);
1717                                i915_request_put(prev);
1718
1719                                GEM_TRACE_DUMP();
1720                                intel_gt_set_wedged(gt);
1721                                goto restore;
1722                        }
1723
1724                        if (!wait_until_running(&h, prev)) {
1725                                struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1726
1727                                pr_err("%s(%s): Failed to start request %llx, at %x\n",
1728                                       __func__, engine->name,
1729                                       prev->fence.seqno, hws_seqno(&h, prev));
1730                                intel_engine_dump(engine, &p,
1731                                                  "%s\n", engine->name);
1732
1733                                i915_request_put(rq);
1734                                i915_request_put(prev);
1735
1736                                intel_gt_set_wedged(gt);
1737
1738                                err = -EIO;
1739                                goto restore;
1740                        }
1741
1742                        reset_count = fake_hangcheck(gt, BIT(id));
1743
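                            /*
                             * Three invariants must hold after the reset:
                             * the hanging request is marked guilty (-EIO),
                             * the innocent request queued behind it is
                             * untouched, and the global reset count has
                             * advanced.
                             */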
1744                        if (prev->fence.error != -EIO) {
1745                                pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
1746                                       prev->fence.error);
1747                                i915_request_put(rq);
1748                                i915_request_put(prev);
1749                                err = -EINVAL;
1750                                goto restore;
1751                        }
1752
1753                        if (rq->fence.error) {
1754                                pr_err("Fence error status not zero [%d] after unrelated reset\n",
1755                                       rq->fence.error);
1756                                i915_request_put(rq);
1757                                i915_request_put(prev);
1758                                err = -EINVAL;
1759                                goto restore;
1760                        }
1761
1762                        if (i915_reset_count(global) == reset_count) {
1763                                pr_err("No GPU reset recorded!\n");
1764                                i915_request_put(rq);
1765                                i915_request_put(prev);
1766                                err = -EINVAL;
1767                                goto restore;
1768                        }
1769
1770                        i915_request_put(prev);
1771                        prev = rq;
1772                        count++;
1773                } while (time_before(jiffies, end_time));
1774                pr_info("%s: Completed %d queued resets\n",
1775                        engine->name, count);
1776
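                /*
                 * Terminate the spinner: rewriting its first dword to
                 * MI_BATCH_BUFFER_END lets the final request retire
                 * normally once the write reaches the chipset.
                 */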
1777                *h.batch = MI_BATCH_BUFFER_END;
1778                intel_gt_chipset_flush(engine->gt);
1779
1780                i915_request_put(prev);
1781
1782restore:
1783                if (using_guc) {
1784                        int err2 = intel_selftest_restore_policy(engine, &saved);
1785
1786                        if (err2)
1787                                pr_err("%s:%d> [%s] Restore policy failed: %d!\n",
1788                                       __func__, __LINE__, engine->name, err2);
1789                        if (err == 0)
1790                                err = err2;
1791                }
1792                if (err)
1793                        goto fini;
1794
1795                err = igt_flush_test(gt->i915);
1796                if (err) {
1797                        pr_err("[%s] Flush failed: %d!\n", engine->name, err);
1798                        break;
1799                }
1800        }
1801
1802fini:
1803        hang_fini(&h);
1804unlock:
1805        igt_global_reset_unlock(gt);
1806
1807        if (intel_gt_is_wedged(gt))
1808                return -EIO;
1809
1810        return err;
1811}
1812
1813static int igt_handle_error(void *arg)
1814{
1815        struct intel_gt *gt = arg;
1816        struct i915_gpu_error *global = &gt->i915->gpu_error;
1817        struct intel_engine_cs *engine = gt->engine[RCS0];
1818        struct hang h;
1819        struct i915_request *rq;
1820        struct i915_gpu_coredump *error;
1821        int err;
1822
1823        /* Check that we can issue a global GPU and engine reset */
1824
1825        if (!intel_has_reset_engine(gt))
1826                return 0;
1827
1828        if (!engine || !intel_engine_can_store_dword(engine))
1829                return 0;
1830
1831        err = hang_init(&h, gt);
1832        if (err) {
1833                pr_err("[%s] Hang init failed: %d!\n", engine->name, err);
1834                return err;
1835        }
1836
1837        rq = hang_create_request(&h, engine);
1838        if (IS_ERR(rq)) {
1839                err = PTR_ERR(rq);
1840                pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
1841                goto err_fini;
1842        }
1843
1844        i915_request_get(rq);
1845        i915_request_add(rq);
1846
1847        if (!wait_until_running(&h, rq)) {
1848                struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1849
1850                pr_err("%s: Failed to start request %llx, at %x\n",
1851                       __func__, rq->fence.seqno, hws_seqno(&h, rq));
1852                intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
1853
1854                intel_gt_set_wedged(gt);
1855
1856                err = -EIO;
1857                goto err_request;
1858        }
1859
1860        /* Temporarily disable error capture */
1861        error = xchg(&global->first_error, (void *)-1);
1862
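            /*
             * Stuffing a sentinel into first_error presumably convinces
             * the capture code that a report is already pending, so the
             * handler below skips allocating a coredump for this
             * artificial hang; the old pointer is restored afterwards.
             */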
1863        intel_gt_handle_error(gt, engine->mask, 0, NULL);
1864
1865        xchg(&global->first_error, error);
1866
1867        if (rq->fence.error != -EIO) {
1868                pr_err("Guilty request not identified!\n");
1869                err = -EINVAL;
1870                goto err_request;
1871        }
1872
1873err_request:
1874        i915_request_put(rq);
1875err_fini:
1876        hang_fini(&h);
1877        return err;
1878}
1879
1880static int __igt_atomic_reset_engine(struct intel_engine_cs *engine,
1881                                     const struct igt_atomic_section *p,
1882                                     const char *mode)
1883{
1884        struct tasklet_struct * const t = &engine->sched_engine->tasklet;
1885        int err;
1886
1887        GEM_TRACE("i915_reset_engine(%s:%s) under %s\n",
1888                  engine->name, mode, p->name);
1889
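            /*
             * Quiesce the submission tasklet and, unless the phase itself
             * is the softirq section (which presumably disables bottom
             * halves on its own), local bottom halves, then perform the
             * engine reset inside the phase's critical section to show it
             * is safe from that atomic context. The tasklet is kicked
             * afterwards in case submissions were missed while disabled.
             */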
1890        if (t->func)
1891                tasklet_disable(t);
1892        if (strcmp(p->name, "softirq"))
1893                local_bh_disable();
1894        p->critical_section_begin();
1895
1896        err = __intel_engine_reset_bh(engine, NULL);
1897
1898        p->critical_section_end();
1899        if (strcmp(p->name, "softirq"))
1900                local_bh_enable();
1901        if (t->func) {
1902                tasklet_enable(t);
1903                tasklet_hi_schedule(t);
1904        }
1905
1906        if (err)
1907                pr_err("i915_reset_engine(%s:%s) failed under %s\n",
1908                       engine->name, mode, p->name);
1909
1910        return err;
1911}
1912
1913static int igt_atomic_reset_engine(struct intel_engine_cs *engine,
1914                                   const struct igt_atomic_section *p)
1915{
1916        struct i915_request *rq;
1917        struct hang h;
1918        int err;
1919
1920        err = __igt_atomic_reset_engine(engine, p, "idle");
1921        if (err)
1922                return err;
1923
1924        err = hang_init(&h, engine->gt);
1925        if (err) {
1926                pr_err("[%s] Hang init failed: %d!\n", engine->name, err);
1927                return err;
1928        }
1929
1930        rq = hang_create_request(&h, engine);
1931        if (IS_ERR(rq)) {
1932                err = PTR_ERR(rq);
1933                pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
1934                goto out;
1935        }
1936
1937        i915_request_get(rq);
1938        i915_request_add(rq);
1939
1940        if (wait_until_running(&h, rq)) {
1941                err = __igt_atomic_reset_engine(engine, p, "active");
1942        } else {
1943                pr_err("%s(%s): Failed to start request %llx, at %x\n",
1944                       __func__, engine->name,
1945                       rq->fence.seqno, hws_seqno(&h, rq));
1946                intel_gt_set_wedged(engine->gt);
1947                err = -EIO;
1948        }
1949
1950        if (err == 0) {
1951                struct intel_wedge_me w;
1952
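                /*
                 * The request was just reset, so it must complete quickly;
                 * intel_wedge_on_timeout() wedges the GT if the wait
                 * overruns the 50ms allowed.
                 */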
1953                intel_wedge_on_timeout(&w, engine->gt, HZ / 20 /* 50ms */)
1954                        i915_request_wait(rq, 0, MAX_SCHEDULE_TIMEOUT);
1955                if (intel_gt_is_wedged(engine->gt))
1956                        err = -EIO;
1957        }
1958
1959        i915_request_put(rq);
1960out:
1961        hang_fini(&h);
1962        return err;
1963}
1964
1965static int igt_reset_engines_atomic(void *arg)
1966{
1967        struct intel_gt *gt = arg;
1968        const typeof(*igt_atomic_phases) *p;
1969        int err = 0;
1970
1971        /* Check that engine resets are usable from atomic context */
1972
1973        if (!intel_has_reset_engine(gt))
1974                return 0;
1975
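            /*
             * With GuC submission, engine resets are requested from the
             * GuC over a sleeping message path (assumption), so this
             * CPU-driven atomic exercise does not apply.
             */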
1976        if (intel_uc_uses_guc_submission(&gt->uc))
1977                return 0;
1978
1979        igt_global_reset_lock(gt);
1980
1981        /* Flush any requests before we get started and check basics */
1982        if (!igt_force_reset(gt))
1983                goto unlock;
1984
1985        for (p = igt_atomic_phases; p->name; p++) {
1986                struct intel_engine_cs *engine;
1987                enum intel_engine_id id;
1988
1989                for_each_engine(engine, gt, id) {
1990                        err = igt_atomic_reset_engine(engine, p);
1991                        if (err)
1992                                goto out;
1993                }
1994        }
1995
1996out:
1997        /* As we poke around the guts, do a full reset before continuing. */
1998        igt_force_reset(gt);
1999unlock:
2000        igt_global_reset_unlock(gt);
2001
2002        return err;
2003}
2004
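    /*
     * Entry point for the live selftest harness. Skips cleanly when the
     * device cannot reset, and refuses to run on an already-wedged GT.
     */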
2005int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
2006{
2007        static const struct i915_subtest tests[] = {
2008                SUBTEST(igt_hang_sanitycheck),
2009                SUBTEST(igt_reset_nop),
2010                SUBTEST(igt_reset_nop_engine),
2011                SUBTEST(igt_reset_idle_engine),
2012                SUBTEST(igt_reset_active_engine),
2013                SUBTEST(igt_reset_fail_engine),
2014                SUBTEST(igt_reset_engines),
2015                SUBTEST(igt_reset_engines_atomic),
2016                SUBTEST(igt_reset_queue),
2017                SUBTEST(igt_reset_wait),
2018                SUBTEST(igt_reset_evict_ggtt),
2019                SUBTEST(igt_reset_evict_ppgtt),
2020                SUBTEST(igt_reset_evict_fence),
2021                SUBTEST(igt_handle_error),
2022        };
2023        struct intel_gt *gt = to_gt(i915);
2024        intel_wakeref_t wakeref;
2025        int err;
2026
2027        if (!intel_has_gpu_reset(gt))
2028                return 0;
2029
2030        if (intel_gt_is_wedged(gt))
2031                return -EIO; /* we're long past hope of a successful reset */
2032
2033        wakeref = intel_runtime_pm_get(gt->uncore->rpm);
2034
2035        err = intel_gt_live_subtests(tests, gt);
2036
2037        intel_runtime_pm_put(gt->uncore->rpm, wakeref);
2038
2039        return err;
2040}
2041