linux/drivers/gpu/drm/i915/gt/selftest_hangcheck.c
   1/*
   2 * Copyright © 2016 Intel Corporation
   3 *
   4 * Permission is hereby granted, free of charge, to any person obtaining a
   5 * copy of this software and associated documentation files (the "Software"),
   6 * to deal in the Software without restriction, including without limitation
   7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8 * and/or sell copies of the Software, and to permit persons to whom the
   9 * Software is furnished to do so, subject to the following conditions:
  10 *
  11 * The above copyright notice and this permission notice (including the next
  12 * paragraph) shall be included in all copies or substantial portions of the
  13 * Software.
  14 *
  15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21 * IN THE SOFTWARE.
  22 *
  23 */
  24
  25#include <linux/kthread.h>
  26
  27#include "gem/i915_gem_context.h"
  28#include "gt/intel_gt.h"
  29#include "intel_engine_pm.h"
  30
  31#include "i915_selftest.h"
  32#include "selftests/i915_random.h"
  33#include "selftests/igt_flush_test.h"
  34#include "selftests/igt_reset.h"
  35#include "selftests/igt_atomic.h"
  36
  37#include "selftests/mock_drm.h"
  38
  39#include "gem/selftests/mock_context.h"
  40#include "gem/selftests/igt_gem_utils.h"
  41
  42#define IGT_IDLE_TIMEOUT 50 /* ms; time to wait after flushing between tests */
  43
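/*
 * Fixture shared by the hang tests: a non-bannable kernel context plus two
 * internal objects. @hws is a CPU-visible status page into which each
 * hanging batch stores its request seqno (one u32 slot per fence context),
 * while @obj/@batch hold the spinning batch buffer itself.
 */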
  44struct hang {
  45        struct intel_gt *gt;
  46        struct drm_i915_gem_object *hws;
  47        struct drm_i915_gem_object *obj;
  48        struct i915_gem_context *ctx;
  49        u32 *seqno;
  50        u32 *batch;
  51};
  52
  53static int hang_init(struct hang *h, struct intel_gt *gt)
  54{
  55        void *vaddr;
  56        int err;
  57
  58        memset(h, 0, sizeof(*h));
  59        h->gt = gt;
  60
  61        h->ctx = kernel_context(gt->i915);
  62        if (IS_ERR(h->ctx))
  63                return PTR_ERR(h->ctx);
  64
  65        GEM_BUG_ON(i915_gem_context_is_bannable(h->ctx));
  66
  67        h->hws = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
  68        if (IS_ERR(h->hws)) {
  69                err = PTR_ERR(h->hws);
  70                goto err_ctx;
  71        }
  72
  73        h->obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
  74        if (IS_ERR(h->obj)) {
  75                err = PTR_ERR(h->obj);
  76                goto err_hws;
  77        }
  78
  79        i915_gem_object_set_cache_coherency(h->hws, I915_CACHE_LLC);
  80        vaddr = i915_gem_object_pin_map(h->hws, I915_MAP_WB);
  81        if (IS_ERR(vaddr)) {
  82                err = PTR_ERR(vaddr);
  83                goto err_obj;
  84        }
  85        h->seqno = memset(vaddr, 0xff, PAGE_SIZE);
  86
  87        vaddr = i915_gem_object_pin_map(h->obj,
  88                                        i915_coherent_map_type(gt->i915));
  89        if (IS_ERR(vaddr)) {
  90                err = PTR_ERR(vaddr);
  91                goto err_unpin_hws;
  92        }
  93        h->batch = vaddr;
  94
  95        return 0;
  96
  97err_unpin_hws:
  98        i915_gem_object_unpin_map(h->hws);
  99err_obj:
 100        i915_gem_object_put(h->obj);
 101err_hws:
 102        i915_gem_object_put(h->hws);
 103err_ctx:
 104        kernel_context_close(h->ctx);
 105        return err;
 106}
 107
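/*
 * Each hanging batch writes its breadcrumb into a per-context u32 slot of
 * the HWS object: the slot is fence.context scaled by sizeof(u32) and
 * wrapped within the page (read back on the CPU side by hws_seqno()).
 */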
 108static u64 hws_address(const struct i915_vma *hws,
 109                       const struct i915_request *rq)
 110{
 111        return hws->node.start + offset_in_page(sizeof(u32)*rq->fence.context);
 112}
 113
 114static int move_to_active(struct i915_vma *vma,
 115                          struct i915_request *rq,
 116                          unsigned int flags)
 117{
 118        int err;
 119
 120        i915_vma_lock(vma);
 121        err = i915_request_await_object(rq, vma->obj,
 122                                        flags & EXEC_OBJECT_WRITE);
 123        if (err == 0)
 124                err = i915_vma_move_to_active(vma, rq, flags);
 125        i915_vma_unlock(vma);
 126
 127        return err;
 128}
 129
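/*
 * Build a request whose batch stores rq->fence.seqno into this context's
 * HWS slot and then spins forever by branching back to its own start. The
 * MI_ARB_CHECK instructions act as arbitration points so the loop remains
 * preemptible; tests terminate it by rewriting the first dword of the
 * batch to MI_BATCH_BUFFER_END (see hang_fini()).
 */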
 130static struct i915_request *
 131hang_create_request(struct hang *h, struct intel_engine_cs *engine)
 132{
 133        struct intel_gt *gt = h->gt;
 134        struct i915_address_space *vm = h->ctx->vm ?: &engine->gt->ggtt->vm;
 135        struct drm_i915_gem_object *obj;
 136        struct i915_request *rq = NULL;
 137        struct i915_vma *hws, *vma;
 138        unsigned int flags;
 139        void *vaddr;
 140        u32 *batch;
 141        int err;
 142
 143        obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
 144        if (IS_ERR(obj))
 145                return ERR_CAST(obj);
 146
 147        vaddr = i915_gem_object_pin_map(obj, i915_coherent_map_type(gt->i915));
 148        if (IS_ERR(vaddr)) {
 149                i915_gem_object_put(obj);
 150                return ERR_CAST(vaddr);
 151        }
 152
 153        i915_gem_object_unpin_map(h->obj);
 154        i915_gem_object_put(h->obj);
 155
 156        h->obj = obj;
 157        h->batch = vaddr;
 158
 159        vma = i915_vma_instance(h->obj, vm, NULL);
 160        if (IS_ERR(vma))
 161                return ERR_CAST(vma);
 162
 163        hws = i915_vma_instance(h->hws, vm, NULL);
 164        if (IS_ERR(hws))
 165                return ERR_CAST(hws);
 166
 167        err = i915_vma_pin(vma, 0, 0, PIN_USER);
 168        if (err)
 169                return ERR_PTR(err);
 170
 171        err = i915_vma_pin(hws, 0, 0, PIN_USER);
 172        if (err)
 173                goto unpin_vma;
 174
 175        rq = igt_request_alloc(h->ctx, engine);
 176        if (IS_ERR(rq)) {
 177                err = PTR_ERR(rq);
 178                goto unpin_hws;
 179        }
 180
 181        err = move_to_active(vma, rq, 0);
 182        if (err)
 183                goto cancel_rq;
 184
 185        err = move_to_active(hws, rq, 0);
 186        if (err)
 187                goto cancel_rq;
 188
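        /* Emit the gen-specific seqno store followed by the self-branching spin */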
 189        batch = h->batch;
 190        if (INTEL_GEN(gt->i915) >= 8) {
 191                *batch++ = MI_STORE_DWORD_IMM_GEN4;
 192                *batch++ = lower_32_bits(hws_address(hws, rq));
 193                *batch++ = upper_32_bits(hws_address(hws, rq));
 194                *batch++ = rq->fence.seqno;
 195                *batch++ = MI_ARB_CHECK;
 196
 197                memset(batch, 0, 1024);
 198                batch += 1024 / sizeof(*batch);
 199
 200                *batch++ = MI_ARB_CHECK;
 201                *batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
 202                *batch++ = lower_32_bits(vma->node.start);
 203                *batch++ = upper_32_bits(vma->node.start);
 204        } else if (INTEL_GEN(gt->i915) >= 6) {
 205                *batch++ = MI_STORE_DWORD_IMM_GEN4;
 206                *batch++ = 0;
 207                *batch++ = lower_32_bits(hws_address(hws, rq));
 208                *batch++ = rq->fence.seqno;
 209                *batch++ = MI_ARB_CHECK;
 210
 211                memset(batch, 0, 1024);
 212                batch += 1024 / sizeof(*batch);
 213
 214                *batch++ = MI_ARB_CHECK;
 215                *batch++ = MI_BATCH_BUFFER_START | 1 << 8;
 216                *batch++ = lower_32_bits(vma->node.start);
 217        } else if (INTEL_GEN(gt->i915) >= 4) {
 218                *batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
 219                *batch++ = 0;
 220                *batch++ = lower_32_bits(hws_address(hws, rq));
 221                *batch++ = rq->fence.seqno;
 222                *batch++ = MI_ARB_CHECK;
 223
 224                memset(batch, 0, 1024);
 225                batch += 1024 / sizeof(*batch);
 226
 227                *batch++ = MI_ARB_CHECK;
 228                *batch++ = MI_BATCH_BUFFER_START | 2 << 6;
 229                *batch++ = lower_32_bits(vma->node.start);
 230        } else {
 231                *batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
 232                *batch++ = lower_32_bits(hws_address(hws, rq));
 233                *batch++ = rq->fence.seqno;
 234                *batch++ = MI_ARB_CHECK;
 235
 236                memset(batch, 0, 1024);
 237                batch += 1024 / sizeof(*batch);
 238
 239                *batch++ = MI_ARB_CHECK;
 240                *batch++ = MI_BATCH_BUFFER_START | 2 << 6;
 241                *batch++ = lower_32_bits(vma->node.start);
 242        }
 243        *batch++ = MI_BATCH_BUFFER_END; /* not reached */
 244        intel_gt_chipset_flush(engine->gt);
 245
 246        if (rq->engine->emit_init_breadcrumb) {
 247                err = rq->engine->emit_init_breadcrumb(rq);
 248                if (err)
 249                        goto cancel_rq;
 250        }
 251
 252        flags = 0;
 253        if (INTEL_GEN(gt->i915) <= 5)
 254                flags |= I915_DISPATCH_SECURE;
 255
 256        err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags);
 257
 258cancel_rq:
 259        if (err) {
 260                i915_request_skip(rq, err);
 261                i915_request_add(rq);
 262        }
 263unpin_hws:
 264        i915_vma_unpin(hws);
 265unpin_vma:
 266        i915_vma_unpin(vma);
 267        return err ? ERR_PTR(err) : rq;
 268}
 269
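/* Read back the seqno the spinning batch stored for this request's context */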
 270static u32 hws_seqno(const struct hang *h, const struct i915_request *rq)
 271{
 272        return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]);
 273}
 274
 275static void hang_fini(struct hang *h)
 276{
 277        *h->batch = MI_BATCH_BUFFER_END;
 278        intel_gt_chipset_flush(h->gt);
 279
 280        i915_gem_object_unpin_map(h->obj);
 281        i915_gem_object_put(h->obj);
 282
 283        i915_gem_object_unpin_map(h->hws);
 284        i915_gem_object_put(h->hws);
 285
 286        kernel_context_close(h->ctx);
 287
 288        igt_flush_test(h->gt->i915, I915_WAIT_LOCKED);
 289}
 290
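/*
 * A request counts as running once its batch has stored the expected seqno
 * to the HWS: poll for up to 10us in case it is already executing, then
 * fall back to a sleeping wait of up to 1s.
 */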
 291static bool wait_until_running(struct hang *h, struct i915_request *rq)
 292{
 293        return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq),
 294                                               rq->fence.seqno),
 295                             10) &&
 296                 wait_for(i915_seqno_passed(hws_seqno(h, rq),
 297                                            rq->fence.seqno),
 298                          1000));
 299}
 300
 301static int igt_hang_sanitycheck(void *arg)
 302{
 303        struct intel_gt *gt = arg;
 304        struct i915_request *rq;
 305        struct intel_engine_cs *engine;
 306        enum intel_engine_id id;
 307        struct hang h;
 308        int err;
 309
 310        /* Basic check that we can execute our hanging batch */
 311
 312        mutex_lock(&gt->i915->drm.struct_mutex);
 313        err = hang_init(&h, gt);
 314        if (err)
 315                goto unlock;
 316
 317        for_each_engine(engine, gt->i915, id) {
 318                struct intel_wedge_me w;
 319                long timeout;
 320
 321                if (!intel_engine_can_store_dword(engine))
 322                        continue;
 323
 324                rq = hang_create_request(&h, engine);
 325                if (IS_ERR(rq)) {
 326                        err = PTR_ERR(rq);
 327                        pr_err("Failed to create request for %s, err=%d\n",
 328                               engine->name, err);
 329                        goto fini;
 330                }
 331
 332                i915_request_get(rq);
 333
 334                *h.batch = MI_BATCH_BUFFER_END;
 335                intel_gt_chipset_flush(engine->gt);
 336
 337                i915_request_add(rq);
 338
 339                timeout = 0;
 340                intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
 341                        timeout = i915_request_wait(rq, 0,
 342                                                    MAX_SCHEDULE_TIMEOUT);
 343                if (intel_gt_is_wedged(gt))
 344                        timeout = -EIO;
 345
 346                i915_request_put(rq);
 347
 348                if (timeout < 0) {
 349                        err = timeout;
 350                        pr_err("Wait for request failed on %s, err=%d\n",
 351                               engine->name, err);
 352                        goto fini;
 353                }
 354        }
 355
 356fini:
 357        hang_fini(&h);
 358unlock:
 359        mutex_unlock(&gt->i915->drm.struct_mutex);
 360        return err;
 361}
 362
 363static bool wait_for_idle(struct intel_engine_cs *engine)
 364{
 365        return wait_for(intel_engine_is_idle(engine), IGT_IDLE_TIMEOUT) == 0;
 366}
 367
 368static int igt_reset_nop(void *arg)
 369{
 370        struct intel_gt *gt = arg;
 371        struct i915_gpu_error *global = &gt->i915->gpu_error;
 372        struct intel_engine_cs *engine;
 373        struct i915_gem_context *ctx;
 374        unsigned int reset_count, count;
 375        enum intel_engine_id id;
 376        struct drm_file *file;
 377        IGT_TIMEOUT(end_time);
 378        int err = 0;
 379
 380        /* Check that we can reset during non-user portions of requests */
 381
 382        file = mock_file(gt->i915);
 383        if (IS_ERR(file))
 384                return PTR_ERR(file);
 385
 386        mutex_lock(&gt->i915->drm.struct_mutex);
 387        ctx = live_context(gt->i915, file);
 388        mutex_unlock(&gt->i915->drm.struct_mutex);
 389        if (IS_ERR(ctx)) {
 390                err = PTR_ERR(ctx);
 391                goto out;
 392        }
 393
 394        i915_gem_context_clear_bannable(ctx);
 395        reset_count = i915_reset_count(global);
 396        count = 0;
 397        do {
 398                mutex_lock(&gt->i915->drm.struct_mutex);
 399
 400                for_each_engine(engine, gt->i915, id) {
 401                        int i;
 402
 403                        for (i = 0; i < 16; i++) {
 404                                struct i915_request *rq;
 405
 406                                rq = igt_request_alloc(ctx, engine);
 407                                if (IS_ERR(rq)) {
 408                                        err = PTR_ERR(rq);
 409                                        break;
 410                                }
 411
 412                                i915_request_add(rq);
 413                        }
 414                }
 415
 416                igt_global_reset_lock(gt);
 417                intel_gt_reset(gt, ALL_ENGINES, NULL);
 418                igt_global_reset_unlock(gt);
 419
 420                mutex_unlock(&gt->i915->drm.struct_mutex);
 421                if (intel_gt_is_wedged(gt)) {
 422                        err = -EIO;
 423                        break;
 424                }
 425
 426                if (i915_reset_count(global) != reset_count + ++count) {
 427                        pr_err("Full GPU reset not recorded!\n");
 428                        err = -EINVAL;
 429                        break;
 430                }
 431
 432                err = igt_flush_test(gt->i915, 0);
 433                if (err)
 434                        break;
 435        } while (time_before(jiffies, end_time));
 436        pr_info("%s: %d resets\n", __func__, count);
 437
 438        mutex_lock(&gt->i915->drm.struct_mutex);
 439        err = igt_flush_test(gt->i915, I915_WAIT_LOCKED);
 440        mutex_unlock(&gt->i915->drm.struct_mutex);
 441
 442out:
 443        mock_file_free(gt->i915, file);
 444        if (intel_gt_is_wedged(gt))
 445                err = -EIO;
 446        return err;
 447}
 448
 449static int igt_reset_nop_engine(void *arg)
 450{
 451        struct intel_gt *gt = arg;
 452        struct i915_gpu_error *global = &gt->i915->gpu_error;
 453        struct intel_engine_cs *engine;
 454        struct i915_gem_context *ctx;
 455        enum intel_engine_id id;
 456        struct drm_file *file;
 457        int err = 0;
 458
 459        /* Check that we can engine-reset during non-user portions */
 460
 461        if (!intel_has_reset_engine(gt->i915))
 462                return 0;
 463
 464        file = mock_file(gt->i915);
 465        if (IS_ERR(file))
 466                return PTR_ERR(file);
 467
 468        mutex_lock(&gt->i915->drm.struct_mutex);
 469        ctx = live_context(gt->i915, file);
 470        mutex_unlock(&gt->i915->drm.struct_mutex);
 471        if (IS_ERR(ctx)) {
 472                err = PTR_ERR(ctx);
 473                goto out;
 474        }
 475
 476        i915_gem_context_clear_bannable(ctx);
 477        for_each_engine(engine, gt->i915, id) {
 478                unsigned int reset_count, reset_engine_count;
 479                unsigned int count;
 480                IGT_TIMEOUT(end_time);
 481
 482                reset_count = i915_reset_count(global);
 483                reset_engine_count = i915_reset_engine_count(global, engine);
 484                count = 0;
 485
 486                set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
 487                do {
 488                        int i;
 489
 490                        if (!wait_for_idle(engine)) {
 491                                pr_err("%s failed to idle before reset\n",
 492                                       engine->name);
 493                                err = -EIO;
 494                                break;
 495                        }
 496
 497                        mutex_lock(&gt->i915->drm.struct_mutex);
 498                        for (i = 0; i < 16; i++) {
 499                                struct i915_request *rq;
 500
 501                                rq = igt_request_alloc(ctx, engine);
 502                                if (IS_ERR(rq)) {
 503                                        err = PTR_ERR(rq);
 504                                        break;
 505                                }
 506
 507                                i915_request_add(rq);
 508                        }
 509                        err = intel_engine_reset(engine, NULL);
 510                        mutex_unlock(&gt->i915->drm.struct_mutex);
 511                        if (err) {
 512                                pr_err("i915_reset_engine failed\n");
 513                                break;
 514                        }
 515
 516                        if (i915_reset_count(global) != reset_count) {
 517                                pr_err("Full GPU reset recorded! (engine reset expected)\n");
 518                                err = -EINVAL;
 519                                break;
 520                        }
 521
 522                        if (i915_reset_engine_count(global, engine) !=
 523                            reset_engine_count + ++count) {
 524                                pr_err("%s engine reset not recorded!\n",
 525                                       engine->name);
 526                                err = -EINVAL;
 527                                break;
 528                        }
 529                } while (time_before(jiffies, end_time));
 530                clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
 531                pr_info("%s(%s): %d resets\n", __func__, engine->name, count);
 532
 533                if (err)
 534                        break;
 535
 536                err = igt_flush_test(gt->i915, 0);
 537                if (err)
 538                        break;
 539        }
 540
 541        mutex_lock(&gt->i915->drm.struct_mutex);
 542        err = igt_flush_test(gt->i915, I915_WAIT_LOCKED);
 543        mutex_unlock(&gt->i915->drm.struct_mutex);
 544
 545out:
 546        mock_file_free(gt->i915, file);
 547        if (intel_gt_is_wedged(gt))
 548                err = -EIO;
 549        return err;
 550}
 551
 552static int __igt_reset_engine(struct intel_gt *gt, bool active)
 553{
 554        struct i915_gpu_error *global = &gt->i915->gpu_error;
 555        struct intel_engine_cs *engine;
 556        enum intel_engine_id id;
 557        struct hang h;
 558        int err = 0;
 559
  560        /* Check that we can issue an engine reset on an idle (no-op) or hanging engine */
 561
 562        if (!intel_has_reset_engine(gt->i915))
 563                return 0;
 564
 565        if (active) {
 566                mutex_lock(&gt->i915->drm.struct_mutex);
 567                err = hang_init(&h, gt);
 568                mutex_unlock(&gt->i915->drm.struct_mutex);
 569                if (err)
 570                        return err;
 571        }
 572
 573        for_each_engine(engine, gt->i915, id) {
 574                unsigned int reset_count, reset_engine_count;
 575                IGT_TIMEOUT(end_time);
 576
 577                if (active && !intel_engine_can_store_dword(engine))
 578                        continue;
 579
 580                if (!wait_for_idle(engine)) {
 581                        pr_err("%s failed to idle before reset\n",
 582                               engine->name);
 583                        err = -EIO;
 584                        break;
 585                }
 586
 587                reset_count = i915_reset_count(global);
 588                reset_engine_count = i915_reset_engine_count(global, engine);
 589
 590                intel_engine_pm_get(engine);
 591                set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
 592                do {
 593                        if (active) {
 594                                struct i915_request *rq;
 595
 596                                mutex_lock(&gt->i915->drm.struct_mutex);
 597                                rq = hang_create_request(&h, engine);
 598                                if (IS_ERR(rq)) {
 599                                        err = PTR_ERR(rq);
 600                                        mutex_unlock(&gt->i915->drm.struct_mutex);
 601                                        break;
 602                                }
 603
 604                                i915_request_get(rq);
 605                                i915_request_add(rq);
 606                                mutex_unlock(&gt->i915->drm.struct_mutex);
 607
 608                                if (!wait_until_running(&h, rq)) {
 609                                        struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
 610
 611                                        pr_err("%s: Failed to start request %llx, at %x\n",
 612                                               __func__, rq->fence.seqno, hws_seqno(&h, rq));
 613                                        intel_engine_dump(engine, &p,
 614                                                          "%s\n", engine->name);
 615
 616                                        i915_request_put(rq);
 617                                        err = -EIO;
 618                                        break;
 619                                }
 620
 621                                i915_request_put(rq);
 622                        }
 623
 624                        err = intel_engine_reset(engine, NULL);
 625                        if (err) {
 626                                pr_err("i915_reset_engine failed\n");
 627                                break;
 628                        }
 629
 630                        if (i915_reset_count(global) != reset_count) {
 631                                pr_err("Full GPU reset recorded! (engine reset expected)\n");
 632                                err = -EINVAL;
 633                                break;
 634                        }
 635
 636                        if (i915_reset_engine_count(global, engine) !=
 637                            ++reset_engine_count) {
 638                                pr_err("%s engine reset not recorded!\n",
 639                                       engine->name);
 640                                err = -EINVAL;
 641                                break;
 642                        }
 643                } while (time_before(jiffies, end_time));
 644                clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
 645                intel_engine_pm_put(engine);
 646
 647                if (err)
 648                        break;
 649
 650                err = igt_flush_test(gt->i915, 0);
 651                if (err)
 652                        break;
 653        }
 654
 655        if (intel_gt_is_wedged(gt))
 656                err = -EIO;
 657
 658        if (active) {
 659                mutex_lock(&gt->i915->drm.struct_mutex);
 660                hang_fini(&h);
 661                mutex_unlock(&gt->i915->drm.struct_mutex);
 662        }
 663
 664        return err;
 665}
 666
 667static int igt_reset_idle_engine(void *arg)
 668{
 669        return __igt_reset_engine(arg, false);
 670}
 671
 672static int igt_reset_active_engine(void *arg)
 673{
 674        return __igt_reset_engine(arg, true);
 675}
 676
 677struct active_engine {
 678        struct task_struct *task;
 679        struct intel_engine_cs *engine;
 680        unsigned long resets;
 681        unsigned int flags;
 682};
 683
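/*
 * Flags for __igt_reset_engines(): TEST_ACTIVE submits hanging requests to
 * the engine being reset, TEST_OTHERS keeps the remaining engines busy from
 * kthreads, TEST_SELF also runs such a kthread on the target engine, and
 * TEST_PRIORITY randomises the priority of the background requests.
 */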
 684#define TEST_ACTIVE     BIT(0)
 685#define TEST_OTHERS     BIT(1)
 686#define TEST_SELF       BIT(2)
 687#define TEST_PRIORITY   BIT(3)
 688
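/*
 * Drop a reference to a background request, first waiting up to 5s for it
 * to complete; if it does not, wedge the GPU and report -EIO so a stuck
 * engine fails the test instead of hanging the selftest runner.
 */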
 689static int active_request_put(struct i915_request *rq)
 690{
 691        int err = 0;
 692
 693        if (!rq)
 694                return 0;
 695
 696        if (i915_request_wait(rq, 0, 5 * HZ) < 0) {
 697                GEM_TRACE("%s timed out waiting for completion of fence %llx:%lld\n",
 698                          rq->engine->name,
 699                          rq->fence.context,
 700                          rq->fence.seqno);
 701                GEM_TRACE_DUMP();
 702
 703                intel_gt_set_wedged(rq->engine->gt);
 704                err = -EIO;
 705        }
 706
 707        i915_request_put(rq);
 708
 709        return err;
 710}
 711
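/*
 * kthread body used for TEST_OTHERS/TEST_SELF: keep the engine busy by
 * continually submitting requests across eight contexts, retiring the
 * oldest of the eight outstanding requests each time, until stopped.
 */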
 712static int active_engine(void *data)
 713{
 714        I915_RND_STATE(prng);
 715        struct active_engine *arg = data;
 716        struct intel_engine_cs *engine = arg->engine;
 717        struct i915_request *rq[8] = {};
 718        struct i915_gem_context *ctx[ARRAY_SIZE(rq)];
 719        struct drm_file *file;
 720        unsigned long count = 0;
 721        int err = 0;
 722
 723        file = mock_file(engine->i915);
 724        if (IS_ERR(file))
 725                return PTR_ERR(file);
 726
 727        for (count = 0; count < ARRAY_SIZE(ctx); count++) {
 728                mutex_lock(&engine->i915->drm.struct_mutex);
 729                ctx[count] = live_context(engine->i915, file);
 730                mutex_unlock(&engine->i915->drm.struct_mutex);
 731                if (IS_ERR(ctx[count])) {
 732                        err = PTR_ERR(ctx[count]);
 733                        while (--count)
 734                                i915_gem_context_put(ctx[count]);
 735                        goto err_file;
 736                }
 737        }
 738
 739        while (!kthread_should_stop()) {
 740                unsigned int idx = count++ & (ARRAY_SIZE(rq) - 1);
 741                struct i915_request *old = rq[idx];
 742                struct i915_request *new;
 743
 744                mutex_lock(&engine->i915->drm.struct_mutex);
 745                new = igt_request_alloc(ctx[idx], engine);
 746                if (IS_ERR(new)) {
 747                        mutex_unlock(&engine->i915->drm.struct_mutex);
 748                        err = PTR_ERR(new);
 749                        break;
 750                }
 751
 752                if (arg->flags & TEST_PRIORITY)
 753                        ctx[idx]->sched.priority =
 754                                i915_prandom_u32_max_state(512, &prng);
 755
 756                rq[idx] = i915_request_get(new);
 757                i915_request_add(new);
 758                mutex_unlock(&engine->i915->drm.struct_mutex);
 759
 760                err = active_request_put(old);
 761                if (err)
 762                        break;
 763
 764                cond_resched();
 765        }
 766
 767        for (count = 0; count < ARRAY_SIZE(rq); count++) {
 768                int err__ = active_request_put(rq[count]);
 769
 770                /* Keep the first error */
 771                if (!err)
 772                        err = err__;
 773        }
 774
 775err_file:
 776        mock_file_free(engine->i915, file);
 777        return err;
 778}
 779
 780static int __igt_reset_engines(struct intel_gt *gt,
 781                               const char *test_name,
 782                               unsigned int flags)
 783{
 784        struct i915_gpu_error *global = &gt->i915->gpu_error;
 785        struct intel_engine_cs *engine, *other;
 786        enum intel_engine_id id, tmp;
 787        struct hang h;
 788        int err = 0;
 789
 790        /* Check that issuing a reset on one engine does not interfere
 791         * with any other engine.
 792         */
 793
 794        if (!intel_has_reset_engine(gt->i915))
 795                return 0;
 796
 797        if (flags & TEST_ACTIVE) {
 798                mutex_lock(&gt->i915->drm.struct_mutex);
 799                err = hang_init(&h, gt);
 800                mutex_unlock(&gt->i915->drm.struct_mutex);
 801                if (err)
 802                        return err;
 803
 804                if (flags & TEST_PRIORITY)
 805                        h.ctx->sched.priority = 1024;
 806        }
 807
 808        for_each_engine(engine, gt->i915, id) {
 809                struct active_engine threads[I915_NUM_ENGINES] = {};
 810                unsigned long device = i915_reset_count(global);
 811                unsigned long count = 0, reported;
 812                IGT_TIMEOUT(end_time);
 813
 814                if (flags & TEST_ACTIVE &&
 815                    !intel_engine_can_store_dword(engine))
 816                        continue;
 817
 818                if (!wait_for_idle(engine)) {
 819                        pr_err("i915_reset_engine(%s:%s): failed to idle before reset\n",
 820                               engine->name, test_name);
 821                        err = -EIO;
 822                        break;
 823                }
 824
 825                memset(threads, 0, sizeof(threads));
 826                for_each_engine(other, gt->i915, tmp) {
 827                        struct task_struct *tsk;
 828
 829                        threads[tmp].resets =
 830                                i915_reset_engine_count(global, other);
 831
 832                        if (!(flags & TEST_OTHERS))
 833                                continue;
 834
 835                        if (other == engine && !(flags & TEST_SELF))
 836                                continue;
 837
 838                        threads[tmp].engine = other;
 839                        threads[tmp].flags = flags;
 840
 841                        tsk = kthread_run(active_engine, &threads[tmp],
 842                                          "igt/%s", other->name);
 843                        if (IS_ERR(tsk)) {
 844                                err = PTR_ERR(tsk);
 845                                goto unwind;
 846                        }
 847
 848                        threads[tmp].task = tsk;
 849                        get_task_struct(tsk);
 850                }
 851
 852                intel_engine_pm_get(engine);
 853                set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
 854                do {
 855                        struct i915_request *rq = NULL;
 856
 857                        if (flags & TEST_ACTIVE) {
 858                                mutex_lock(&gt->i915->drm.struct_mutex);
 859                                rq = hang_create_request(&h, engine);
 860                                if (IS_ERR(rq)) {
 861                                        err = PTR_ERR(rq);
 862                                        mutex_unlock(&gt->i915->drm.struct_mutex);
 863                                        break;
 864                                }
 865
 866                                i915_request_get(rq);
 867                                i915_request_add(rq);
 868                                mutex_unlock(&gt->i915->drm.struct_mutex);
 869
 870                                if (!wait_until_running(&h, rq)) {
 871                                        struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
 872
 873                                        pr_err("%s: Failed to start request %llx, at %x\n",
 874                                               __func__, rq->fence.seqno, hws_seqno(&h, rq));
 875                                        intel_engine_dump(engine, &p,
 876                                                          "%s\n", engine->name);
 877
 878                                        i915_request_put(rq);
 879                                        err = -EIO;
 880                                        break;
 881                                }
 882                        }
 883
 884                        err = intel_engine_reset(engine, NULL);
 885                        if (err) {
 886                                pr_err("i915_reset_engine(%s:%s): failed, err=%d\n",
 887                                       engine->name, test_name, err);
 888                                break;
 889                        }
 890
 891                        count++;
 892
 893                        if (rq) {
 894                                if (i915_request_wait(rq, 0, HZ / 5) < 0) {
 895                                        struct drm_printer p =
 896                                                drm_info_printer(gt->i915->drm.dev);
 897
 898                                        pr_err("i915_reset_engine(%s:%s):"
 899                                               " failed to complete request after reset\n",
 900                                               engine->name, test_name);
 901                                        intel_engine_dump(engine, &p,
 902                                                          "%s\n", engine->name);
 903                                        i915_request_put(rq);
 904
 905                                        GEM_TRACE_DUMP();
 906                                        intel_gt_set_wedged(gt);
 907                                        err = -EIO;
 908                                        break;
 909                                }
 910
 911                                i915_request_put(rq);
 912                        }
 913
 914                        if (!(flags & TEST_SELF) && !wait_for_idle(engine)) {
 915                                struct drm_printer p =
 916                                        drm_info_printer(gt->i915->drm.dev);
 917
 918                                pr_err("i915_reset_engine(%s:%s):"
 919                                       " failed to idle after reset\n",
 920                                       engine->name, test_name);
 921                                intel_engine_dump(engine, &p,
 922                                                  "%s\n", engine->name);
 923
 924                                err = -EIO;
 925                                break;
 926                        }
 927                } while (time_before(jiffies, end_time));
 928                clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
 929                intel_engine_pm_put(engine);
 930                pr_info("i915_reset_engine(%s:%s): %lu resets\n",
 931                        engine->name, test_name, count);
 932
 933                reported = i915_reset_engine_count(global, engine);
 934                reported -= threads[engine->id].resets;
 935                if (reported != count) {
 936                        pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu\n",
 937                               engine->name, test_name, count, reported);
 938                        if (!err)
 939                                err = -EINVAL;
 940                }
 941
 942unwind:
 943                for_each_engine(other, gt->i915, tmp) {
 944                        int ret;
 945
 946                        if (!threads[tmp].task)
 947                                continue;
 948
 949                        ret = kthread_stop(threads[tmp].task);
 950                        if (ret) {
 951                                pr_err("kthread for other engine %s failed, err=%d\n",
 952                                       other->name, ret);
 953                                if (!err)
 954                                        err = ret;
 955                        }
 956                        put_task_struct(threads[tmp].task);
 957
 958                        if (other->uabi_class != engine->uabi_class &&
 959                            threads[tmp].resets !=
 960                            i915_reset_engine_count(global, other)) {
 961                                pr_err("Innocent engine %s was reset (count=%ld)\n",
 962                                       other->name,
 963                                       i915_reset_engine_count(global, other) -
 964                                       threads[tmp].resets);
 965                                if (!err)
 966                                        err = -EINVAL;
 967                        }
 968                }
 969
 970                if (device != i915_reset_count(global)) {
 971                        pr_err("Global reset (count=%ld)!\n",
 972                               i915_reset_count(global) - device);
 973                        if (!err)
 974                                err = -EINVAL;
 975                }
 976
 977                if (err)
 978                        break;
 979
 980                mutex_lock(&gt->i915->drm.struct_mutex);
 981                err = igt_flush_test(gt->i915, I915_WAIT_LOCKED);
 982                mutex_unlock(&gt->i915->drm.struct_mutex);
 983                if (err)
 984                        break;
 985        }
 986
 987        if (intel_gt_is_wedged(gt))
 988                err = -EIO;
 989
 990        if (flags & TEST_ACTIVE) {
 991                mutex_lock(&gt->i915->drm.struct_mutex);
 992                hang_fini(&h);
 993                mutex_unlock(&gt->i915->drm.struct_mutex);
 994        }
 995
 996        return err;
 997}
 998
 999static int igt_reset_engines(void *arg)
1000{
1001        static const struct {
1002                const char *name;
1003                unsigned int flags;
1004        } phases[] = {
1005                { "idle", 0 },
1006                { "active", TEST_ACTIVE },
1007                { "others-idle", TEST_OTHERS },
1008                { "others-active", TEST_OTHERS | TEST_ACTIVE },
1009                {
1010                        "others-priority",
1011                        TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY
1012                },
1013                {
1014                        "self-priority",
1015                        TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY | TEST_SELF,
1016                },
1017                { }
1018        };
1019        struct intel_gt *gt = arg;
1020        typeof(*phases) *p;
1021        int err;
1022
1023        for (p = phases; p->name; p++) {
1024                if (p->flags & TEST_PRIORITY) {
1025                        if (!(gt->i915->caps.scheduler & I915_SCHEDULER_CAP_PRIORITY))
1026                                continue;
1027                }
1028
1029                err = __igt_reset_engines(arg, p->name, p->flags);
1030                if (err)
1031                        return err;
1032        }
1033
1034        return 0;
1035}
1036
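/*
 * Stand-in for the hangcheck worker: reset the selected engines directly,
 * as hangcheck would after declaring a hang, returning the global reset
 * count sampled beforehand so callers can verify it advanced.
 */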
1037static u32 fake_hangcheck(struct intel_gt *gt, intel_engine_mask_t mask)
1038{
1039        u32 count = i915_reset_count(&gt->i915->gpu_error);
1040
1041        intel_gt_reset(gt, mask, NULL);
1042
1043        return count;
1044}
1045
1046static int igt_reset_wait(void *arg)
1047{
1048        struct intel_gt *gt = arg;
1049        struct i915_gpu_error *global = &gt->i915->gpu_error;
1050        struct intel_engine_cs *engine = gt->i915->engine[RCS0];
1051        struct i915_request *rq;
1052        unsigned int reset_count;
1053        struct hang h;
1054        long timeout;
1055        int err;
1056
1057        if (!engine || !intel_engine_can_store_dword(engine))
1058                return 0;
1059
1060        /* Check that we detect a stuck waiter and issue a reset */
1061
1062        igt_global_reset_lock(gt);
1063
1064        mutex_lock(&gt->i915->drm.struct_mutex);
1065        err = hang_init(&h, gt);
1066        if (err)
1067                goto unlock;
1068
1069        rq = hang_create_request(&h, engine);
1070        if (IS_ERR(rq)) {
1071                err = PTR_ERR(rq);
1072                goto fini;
1073        }
1074
1075        i915_request_get(rq);
1076        i915_request_add(rq);
1077
1078        if (!wait_until_running(&h, rq)) {
1079                struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1080
1081                pr_err("%s: Failed to start request %llx, at %x\n",
1082                       __func__, rq->fence.seqno, hws_seqno(&h, rq));
1083                intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
1084
1085                intel_gt_set_wedged(gt);
1086
1087                err = -EIO;
1088                goto out_rq;
1089        }
1090
1091        reset_count = fake_hangcheck(gt, ALL_ENGINES);
1092
1093        timeout = i915_request_wait(rq, 0, 10);
1094        if (timeout < 0) {
1095                pr_err("i915_request_wait failed on a stuck request: err=%ld\n",
1096                       timeout);
1097                err = timeout;
1098                goto out_rq;
1099        }
1100
1101        if (i915_reset_count(global) == reset_count) {
1102                pr_err("No GPU reset recorded!\n");
1103                err = -EINVAL;
1104                goto out_rq;
1105        }
1106
1107out_rq:
1108        i915_request_put(rq);
1109fini:
1110        hang_fini(&h);
1111unlock:
1112        mutex_unlock(&gt->i915->drm.struct_mutex);
1113        igt_global_reset_unlock(gt);
1114
1115        if (intel_gt_is_wedged(gt))
1116                return -EIO;
1117
1118        return err;
1119}
1120
1121struct evict_vma {
1122        struct completion completion;
1123        struct i915_vma *vma;
1124};
1125
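/*
 * kthread body for igt_reset_evict_{ggtt,ppgtt}: signal that we have
 * started, then try to evict the node pinned by the hanging batch. The
 * eviction is expected to block on the hung request and only succeed once
 * the reset has completed it.
 */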
1126static int evict_vma(void *data)
1127{
1128        struct evict_vma *arg = data;
1129        struct i915_address_space *vm = arg->vma->vm;
1130        struct drm_i915_private *i915 = vm->i915;
1131        struct drm_mm_node evict = arg->vma->node;
1132        int err;
1133
1134        complete(&arg->completion);
1135
1136        mutex_lock(&i915->drm.struct_mutex);
1137        err = i915_gem_evict_for_node(vm, &evict, 0);
1138        mutex_unlock(&i915->drm.struct_mutex);
1139
1140        return err;
1141}
1142
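/*
 * Fence variant for igt_reset_evict_fence: switch the object to Y-tiling
 * and re-pin its fence register, forcing an mmio fence update that, like
 * the eviction above, should have to wait behind the hanging request.
 */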
1143static int evict_fence(void *data)
1144{
1145        struct evict_vma *arg = data;
1146        struct drm_i915_private *i915 = arg->vma->vm->i915;
1147        int err;
1148
1149        complete(&arg->completion);
1150
1151        mutex_lock(&i915->drm.struct_mutex);
1152
1153        /* Mark the fence register as dirty to force the mmio update. */
1154        err = i915_gem_object_set_tiling(arg->vma->obj, I915_TILING_Y, 512);
1155        if (err) {
1156                pr_err("Invalid Y-tiling settings; err:%d\n", err);
1157                goto out_unlock;
1158        }
1159
1160        err = i915_vma_pin(arg->vma, 0, 0, PIN_GLOBAL | PIN_MAPPABLE);
1161        if (err) {
1162                pr_err("Unable to pin vma for Y-tiled fence; err:%d\n", err);
1163                goto out_unlock;
1164        }
1165
1166        err = i915_vma_pin_fence(arg->vma);
1167        i915_vma_unpin(arg->vma);
1168        if (err) {
1169                pr_err("Unable to pin Y-tiled fence; err:%d\n", err);
1170                goto out_unlock;
1171        }
1172
1173        i915_vma_unpin_fence(arg->vma);
1174
1175out_unlock:
1176        mutex_unlock(&i915->drm.struct_mutex);
1177
1178        return err;
1179}
1180
1181static int __igt_reset_evict_vma(struct intel_gt *gt,
1182                                 struct i915_address_space *vm,
1183                                 int (*fn)(void *),
1184                                 unsigned int flags)
1185{
1186        struct intel_engine_cs *engine = gt->i915->engine[RCS0];
1187        struct drm_i915_gem_object *obj;
1188        struct task_struct *tsk = NULL;
1189        struct i915_request *rq;
1190        struct evict_vma arg;
1191        struct hang h;
1192        int err;
1193
1194        if (!engine || !intel_engine_can_store_dword(engine))
1195                return 0;
1196
1197        /* Check that we can recover an unbind stuck on a hanging request */
1198
1199        mutex_lock(&gt->i915->drm.struct_mutex);
1200        err = hang_init(&h, gt);
1201        if (err)
1202                goto unlock;
1203
1204        obj = i915_gem_object_create_internal(gt->i915, SZ_1M);
1205        if (IS_ERR(obj)) {
1206                err = PTR_ERR(obj);
1207                goto fini;
1208        }
1209
1210        if (flags & EXEC_OBJECT_NEEDS_FENCE) {
1211                err = i915_gem_object_set_tiling(obj, I915_TILING_X, 512);
1212                if (err) {
1213                        pr_err("Invalid X-tiling settings; err:%d\n", err);
1214                        goto out_obj;
1215                }
1216        }
1217
1218        arg.vma = i915_vma_instance(obj, vm, NULL);
1219        if (IS_ERR(arg.vma)) {
1220                err = PTR_ERR(arg.vma);
1221                goto out_obj;
1222        }
1223
1224        rq = hang_create_request(&h, engine);
1225        if (IS_ERR(rq)) {
1226                err = PTR_ERR(rq);
1227                goto out_obj;
1228        }
1229
1230        err = i915_vma_pin(arg.vma, 0, 0,
1231                           i915_vma_is_ggtt(arg.vma) ?
1232                           PIN_GLOBAL | PIN_MAPPABLE :
1233                           PIN_USER);
1234        if (err) {
1235                i915_request_add(rq);
1236                goto out_obj;
1237        }
1238
1239        if (flags & EXEC_OBJECT_NEEDS_FENCE) {
1240                err = i915_vma_pin_fence(arg.vma);
1241                if (err) {
1242                        pr_err("Unable to pin X-tiled fence; err:%d\n", err);
1243                        i915_vma_unpin(arg.vma);
1244                        i915_request_add(rq);
1245                        goto out_obj;
1246                }
1247        }
1248
1249        i915_vma_lock(arg.vma);
1250        err = i915_request_await_object(rq, arg.vma->obj,
1251                                        flags & EXEC_OBJECT_WRITE);
1252        if (err == 0)
1253                err = i915_vma_move_to_active(arg.vma, rq, flags);
1254        i915_vma_unlock(arg.vma);
1255
1256        if (flags & EXEC_OBJECT_NEEDS_FENCE)
1257                i915_vma_unpin_fence(arg.vma);
1258        i915_vma_unpin(arg.vma);
1259
1260        i915_request_get(rq);
1261        i915_request_add(rq);
1262        if (err)
1263                goto out_rq;
1264
1265        mutex_unlock(&gt->i915->drm.struct_mutex);
1266
1267        if (!wait_until_running(&h, rq)) {
1268                struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1269
1270                pr_err("%s: Failed to start request %llx, at %x\n",
1271                       __func__, rq->fence.seqno, hws_seqno(&h, rq));
1272                intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
1273
1274                intel_gt_set_wedged(gt);
1275                goto out_reset;
1276        }
1277
1278        init_completion(&arg.completion);
1279
1280        tsk = kthread_run(fn, &arg, "igt/evict_vma");
1281        if (IS_ERR(tsk)) {
1282                err = PTR_ERR(tsk);
1283                tsk = NULL;
1284                goto out_reset;
1285        }
1286        get_task_struct(tsk);
1287
1288        wait_for_completion(&arg.completion);
1289
1290        if (wait_for(!list_empty(&rq->fence.cb_list), 10)) {
1291                struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1292
1293                pr_err("igt/evict_vma kthread did not wait\n");
1294                intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
1295
1296                intel_gt_set_wedged(gt);
1297                goto out_reset;
1298        }
1299
1300out_reset:
1301        igt_global_reset_lock(gt);
1302        fake_hangcheck(gt, rq->engine->mask);
1303        igt_global_reset_unlock(gt);
1304
1305        if (tsk) {
1306                struct intel_wedge_me w;
1307
1308                /* The reset, even indirectly, should take less than 10ms. */
1309                intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
1310                        err = kthread_stop(tsk);
1311
1312                put_task_struct(tsk);
1313        }
1314
1315        mutex_lock(&gt->i915->drm.struct_mutex);
1316out_rq:
1317        i915_request_put(rq);
1318out_obj:
1319        i915_gem_object_put(obj);
1320fini:
1321        hang_fini(&h);
1322unlock:
1323        mutex_unlock(&gt->i915->drm.struct_mutex);
1324
1325        if (intel_gt_is_wedged(gt))
1326                return -EIO;
1327
1328        return err;
1329}
1330
1331static int igt_reset_evict_ggtt(void *arg)
1332{
1333        struct intel_gt *gt = arg;
1334
1335        return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
1336                                     evict_vma, EXEC_OBJECT_WRITE);
1337}
1338
1339static int igt_reset_evict_ppgtt(void *arg)
1340{
1341        struct intel_gt *gt = arg;
1342        struct i915_gem_context *ctx;
1343        struct drm_file *file;
1344        int err;
1345
1346        file = mock_file(gt->i915);
1347        if (IS_ERR(file))
1348                return PTR_ERR(file);
1349
1350        mutex_lock(&gt->i915->drm.struct_mutex);
1351        ctx = live_context(gt->i915, file);
1352        mutex_unlock(&gt->i915->drm.struct_mutex);
1353        if (IS_ERR(ctx)) {
1354                err = PTR_ERR(ctx);
1355                goto out;
1356        }
1357
1358        err = 0;
1359        if (ctx->vm) /* aliasing == global gtt locking, covered above */
1360                err = __igt_reset_evict_vma(gt, ctx->vm,
1361                                            evict_vma, EXEC_OBJECT_WRITE);
1362
1363out:
1364        mock_file_free(gt->i915, file);
1365        return err;
1366}
1367
1368static int igt_reset_evict_fence(void *arg)
1369{
1370        struct intel_gt *gt = arg;
1371
1372        return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
1373                                     evict_fence, EXEC_OBJECT_NEEDS_FENCE);
1374}
1375
1376static int wait_for_others(struct intel_gt *gt,
1377                           struct intel_engine_cs *exclude)
1378{
1379        struct intel_engine_cs *engine;
1380        enum intel_engine_id id;
1381
1382        for_each_engine(engine, gt->i915, id) {
1383                if (engine == exclude)
1384                        continue;
1385
1386                if (!wait_for_idle(engine))
1387                        return -EIO;
1388        }
1389
1390        return 0;
1391}
1392
1393static int igt_reset_queue(void *arg)
1394{
1395        struct intel_gt *gt = arg;
1396        struct i915_gpu_error *global = &gt->i915->gpu_error;
1397        struct intel_engine_cs *engine;
1398        enum intel_engine_id id;
1399        struct hang h;
1400        int err;
1401
1402        /* Check that we replay pending requests following a hang */
1403
1404        igt_global_reset_lock(gt);
1405
1406        mutex_lock(&gt->i915->drm.struct_mutex);
1407        err = hang_init(&h, gt);
1408        if (err)
1409                goto unlock;
1410
1411        for_each_engine(engine, gt->i915, id) {
1412                struct i915_request *prev;
1413                IGT_TIMEOUT(end_time);
1414                unsigned int count;
1415
1416                if (!intel_engine_can_store_dword(engine))
1417                        continue;
1418
1419                prev = hang_create_request(&h, engine);
1420                if (IS_ERR(prev)) {
1421                        err = PTR_ERR(prev);
1422                        goto fini;
1423                }
1424
1425                i915_request_get(prev);
1426                i915_request_add(prev);
1427
1428                count = 0;
1429                do {
1430                        struct i915_request *rq;
1431                        unsigned int reset_count;
1432
1433                        rq = hang_create_request(&h, engine);
1434                        if (IS_ERR(rq)) {
1435                                err = PTR_ERR(rq);
1436                                goto fini;
1437                        }
1438
1439                        i915_request_get(rq);
1440                        i915_request_add(rq);
1441
1442                        /*
1443                         * XXX We don't handle resetting the kernel context
1444                         * very well. If we trigger a device reset twice in
1445                         * quick succession while the kernel context is
1446                         * executing, we may end up skipping the breadcrumb.
1447                         * This is really only a problem for the selftest as
1448                         * normally there is a large interlude between resets
1449                         * (hangcheck), or we focus on resetting just one
1450                         * engine and so avoid repeatedly resetting innocents.
1451                         */
1452                        err = wait_for_others(gt, engine);
1453                        if (err) {
1454                                pr_err("%s(%s): Failed to idle other inactive engines after device reset\n",
1455                                       __func__, engine->name);
1456                                i915_request_put(rq);
1457                                i915_request_put(prev);
1458
1459                                GEM_TRACE_DUMP();
1460                                intel_gt_set_wedged(gt);
1461                                goto fini;
1462                        }
1463
1464                        if (!wait_until_running(&h, prev)) {
1465                                struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1466
1467                                pr_err("%s(%s): Failed to start request %llx, at %x\n",
1468                                       __func__, engine->name,
1469                                       prev->fence.seqno, hws_seqno(&h, prev));
1470                                intel_engine_dump(engine, &p,
1471                                                  "%s\n", engine->name);
1472
1473                                i915_request_put(rq);
1474                                i915_request_put(prev);
1475
1476                                intel_gt_set_wedged(gt);
1477
1478                                err = -EIO;
1479                                goto fini;
1480                        }
1481
1482                        reset_count = fake_hangcheck(gt, BIT(id));
1483
1484                        if (prev->fence.error != -EIO) {
1485                                pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
1486                                       prev->fence.error);
1487                                i915_request_put(rq);
1488                                i915_request_put(prev);
1489                                err = -EINVAL;
1490                                goto fini;
1491                        }
1492
1493                        if (rq->fence.error) {
1494                                pr_err("Fence error status not zero [%d] after unrelated reset\n",
1495                                       rq->fence.error);
1496                                i915_request_put(rq);
1497                                i915_request_put(prev);
1498                                err = -EINVAL;
1499                                goto fini;
1500                        }
1501
1502                        if (i915_reset_count(global) == reset_count) {
1503                                pr_err("No GPU reset recorded!\n");
1504                                i915_request_put(rq);
1505                                i915_request_put(prev);
1506                                err = -EINVAL;
1507                                goto fini;
1508                        }
1509
1510                        i915_request_put(prev);
1511                        prev = rq;
1512                        count++;
1513                } while (time_before(jiffies, end_time));
1514                pr_info("%s: Completed %d resets\n", engine->name, count);
1515
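                    /*
                     * Terminate the spinning batch so that the final request
                     * can retire, then flush before moving on to the next
                     * engine.
                     */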
1516                *h.batch = MI_BATCH_BUFFER_END;
1517                intel_gt_chipset_flush(engine->gt);
1518
1519                i915_request_put(prev);
1520
1521                err = igt_flush_test(gt->i915, I915_WAIT_LOCKED);
1522                if (err)
1523                        break;
1524        }
1525
1526fini:
1527        hang_fini(&h);
1528unlock:
1529        mutex_unlock(&gt->i915->drm.struct_mutex);
1530        igt_global_reset_unlock(gt);
1531
1532        if (intel_gt_is_wedged(gt))
1533                return -EIO;
1534
1535        return err;
1536}
1537
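    /*
     * igt_handle_error: submit a hanging request on the render engine, feed
     * the resulting hang to intel_gt_handle_error() with error capture
     * suppressed, and verify that the guilty request is marked with -EIO.
     */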
1538static int igt_handle_error(void *arg)
1539{
1540        struct intel_gt *gt = arg;
1541        struct i915_gpu_error *global = &gt->i915->gpu_error;
1542        struct intel_engine_cs *engine = gt->i915->engine[RCS0];
1543        struct hang h;
1544        struct i915_request *rq;
1545        struct i915_gpu_state *error;
1546        int err;
1547
1548        /* Check that intel_gt_handle_error() resets the engine and flags the guilty request */
1549
1550        if (!intel_has_reset_engine(gt->i915))
1551                return 0;
1552
1553        if (!engine || !intel_engine_can_store_dword(engine))
1554                return 0;
1555
1556        mutex_lock(&gt->i915->drm.struct_mutex);
1557
1558        err = hang_init(&h, gt);
1559        if (err)
1560                goto err_unlock;
1561
1562        rq = hang_create_request(&h, engine);
1563        if (IS_ERR(rq)) {
1564                err = PTR_ERR(rq);
1565                goto err_fini;
1566        }
1567
1568        i915_request_get(rq);
1569        i915_request_add(rq);
1570
1571        if (!wait_until_running(&h, rq)) {
1572                struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1573
1574                pr_err("%s: Failed to start request %llx, at %x\n",
1575                       __func__, rq->fence.seqno, hws_seqno(&h, rq));
1576                intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
1577
1578                intel_gt_set_wedged(gt);
1579
1580                err = -EIO;
1581                goto err_request;
1582        }
1583
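            /*
             * Drop struct_mutex around the error handler; intel_gt_handle_error()
             * is presumably exercised here as it would be by its usual callers
             * (such as the hangcheck worker), which do not hold this lock.
             */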
1584        mutex_unlock(&gt->i915->drm.struct_mutex);
1585
1586        /* Temporarily disable error capture by parking a sentinel in first_error */
1587        error = xchg(&global->first_error, (void *)-1);
1588
1589        intel_gt_handle_error(gt, engine->mask, 0, NULL);
1590
1591        xchg(&global->first_error, error);
1592
1593        mutex_lock(&gt->i915->drm.struct_mutex);
1594
1595        if (rq->fence.error != -EIO) {
1596                pr_err("Guilty request not identified!\n");
1597                err = -EINVAL;
1598                goto err_request;
1599        }
1600
1601err_request:
1602        i915_request_put(rq);
1603err_fini:
1604        hang_fini(&h);
1605err_unlock:
1606        mutex_unlock(&gt->i915->drm.struct_mutex);
1607        return err;
1608}
1609
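    /*
     * Reset the engine with its submission tasklet disabled and while inside
     * the given atomic section, checking that the reset still succeeds from
     * (pseudo-)atomic context. Each entry of igt_atomic_phases provides,
     * roughly (see selftests/igt_atomic.h):
     *
     *        struct igt_atomic_section {
     *                const char *name;
     *                void (*critical_section_begin)(void);
     *                void (*critical_section_end)(void);
     *        };
     *
     * e.g. a phase that runs with softirqs or interrupts disabled.
     */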
1610static int __igt_atomic_reset_engine(struct intel_engine_cs *engine,
1611                                     const struct igt_atomic_section *p,
1612                                     const char *mode)
1613{
1614        struct tasklet_struct * const t = &engine->execlists.tasklet;
1615        int err;
1616
1617        GEM_TRACE("i915_reset_engine(%s:%s) under %s\n",
1618                  engine->name, mode, p->name);
1619
1620        tasklet_disable_nosync(t);
1621        p->critical_section_begin();
1622
1623        err = intel_engine_reset(engine, NULL);
1624
1625        p->critical_section_end();
1626        tasklet_enable(t);
1627
1628        if (err)
1629                pr_err("i915_reset_engine(%s:%s) failed under %s\n",
1630                       engine->name, mode, p->name);
1631
1632        return err;
1633}
1634
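    /*
     * First reset the idle engine under the atomic section, then submit a
     * hanging request and reset again while the engine is active. Finally
     * wait, bounded by intel_wedge_on_timeout(), for the request to complete;
     * if the GT had to be wedged, the test reports -EIO.
     */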
1635static int igt_atomic_reset_engine(struct intel_engine_cs *engine,
1636                                   const struct igt_atomic_section *p)
1637{
1638        struct i915_request *rq;
1639        struct hang h;
1640        int err;
1641
1642        err = __igt_atomic_reset_engine(engine, p, "idle");
1643        if (err)
1644                return err;
1645
1646        err = hang_init(&h, engine->gt);
1647        if (err)
1648                return err;
1649
1650        rq = hang_create_request(&h, engine);
1651        if (IS_ERR(rq)) {
1652                err = PTR_ERR(rq);
1653                goto out;
1654        }
1655
1656        i915_request_get(rq);
1657        i915_request_add(rq);
1658
1659        if (wait_until_running(&h, rq)) {
1660                err = __igt_atomic_reset_engine(engine, p, "active");
1661        } else {
1662                pr_err("%s(%s): Failed to start request %llx, at %x\n",
1663                       __func__, engine->name,
1664                       rq->fence.seqno, hws_seqno(&h, rq));
1665                intel_gt_set_wedged(engine->gt);
1666                err = -EIO;
1667        }
1668
1669        if (err == 0) {
1670                struct intel_wedge_me w;
1671
1672                intel_wedge_on_timeout(&w, engine->gt, HZ / 20 /* 50ms */)
1673                        i915_request_wait(rq, 0, MAX_SCHEDULE_TIMEOUT);
1674                if (intel_gt_is_wedged(engine->gt))
1675                        err = -EIO;
1676        }
1677
1678        i915_request_put(rq);
1679out:
1680        hang_fini(&h);
1681        return err;
1682}
1683
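    /*
     * For every atomic phase, reset each engine in turn. A full GPU reset is
     * forced before the loop and again afterwards, to flush any damage left
     * behind by poking at the reset machinery.
     */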
1684static int igt_reset_engines_atomic(void *arg)
1685{
1686        struct intel_gt *gt = arg;
1687        const typeof(*igt_atomic_phases) *p;
1688        int err = 0;
1689
1690        /* Check that engine resets are usable from atomic context */
1691
1692        if (!intel_has_reset_engine(gt->i915))
1693                return 0;
1694
1695        if (USES_GUC_SUBMISSION(gt->i915))
1696                return 0;
1697
1698        igt_global_reset_lock(gt);
1699        mutex_lock(&gt->i915->drm.struct_mutex);
1700
1701        /* Flush any requests before we get started and check basics */
1702        if (!igt_force_reset(gt))
1703                goto unlock;
1704
1705        for (p = igt_atomic_phases; p->name; p++) {
1706                struct intel_engine_cs *engine;
1707                enum intel_engine_id id;
1708
1709                for_each_engine(engine, gt->i915, id) {
1710                        err = igt_atomic_reset_engine(engine, p);
1711                        if (err)
1712                                goto out;
1713                }
1714        }
1715
1716out:
1717        /* As we poke around the guts, do a full reset before continuing. */
1718        igt_force_reset(gt);
1719
1720unlock:
1721        mutex_unlock(&gt->i915->drm.struct_mutex);
1722        igt_global_reset_unlock(gt);
1723
1724        return err;
1725}
1726
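    /*
     * Entry point for the live hangcheck/reset selftests. These are typically
     * run by loading an i915 built with CONFIG_DRM_I915_SELFTEST and the
     * live_selftests module parameter set (e.g. i915.live_selftests=1, for
     * example through IGT's i915_selftest runner).
     */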
1727int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
1728{
1729        static const struct i915_subtest tests[] = {
1730                SUBTEST(igt_hang_sanitycheck),
1731                SUBTEST(igt_reset_nop),
1732                SUBTEST(igt_reset_nop_engine),
1733                SUBTEST(igt_reset_idle_engine),
1734                SUBTEST(igt_reset_active_engine),
1735                SUBTEST(igt_reset_engines),
1736                SUBTEST(igt_reset_engines_atomic),
1737                SUBTEST(igt_reset_queue),
1738                SUBTEST(igt_reset_wait),
1739                SUBTEST(igt_reset_evict_ggtt),
1740                SUBTEST(igt_reset_evict_ppgtt),
1741                SUBTEST(igt_reset_evict_fence),
1742                SUBTEST(igt_handle_error),
1743        };
1744        struct intel_gt *gt = &i915->gt;
1745        intel_wakeref_t wakeref;
1746        bool saved_hangcheck;
1747        int err;
1748
1749        if (!intel_has_gpu_reset(gt->i915))
1750                return 0;
1751
1752        if (intel_gt_is_wedged(gt))
1753                return -EIO; /* we're long past hope of a successful reset */
1754
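            /*
             * Hold the device awake and silence the periodic hangcheck worker
             * for the duration: the subtests inject their own hangs and
             * resets, and a background hangcheck firing mid-test would reset
             * engines behind the tests' backs.
             */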
1755        wakeref = intel_runtime_pm_get(&gt->i915->runtime_pm);
1756        saved_hangcheck = fetch_and_zero(&i915_modparams.enable_hangcheck);
1757        drain_delayed_work(&gt->hangcheck.work); /* flush param */
1758
1759        err = intel_gt_live_subtests(tests, gt);
1760
1761        mutex_lock(&gt->i915->drm.struct_mutex);
1762        igt_flush_test(gt->i915, I915_WAIT_LOCKED);
1763        mutex_unlock(&gt->i915->drm.struct_mutex);
1764
1765        i915_modparams.enable_hangcheck = saved_hangcheck;
1766        intel_runtime_pm_put(&gt->i915->runtime_pm, wakeref);
1767
1768        return err;
1769}
1770