linux/drivers/gpu/drm/i915/gt/selftest_hangcheck.c
   1/*
   2 * Copyright © 2016 Intel Corporation
   3 *
   4 * Permission is hereby granted, free of charge, to any person obtaining a
   5 * copy of this software and associated documentation files (the "Software"),
   6 * to deal in the Software without restriction, including without limitation
   7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8 * and/or sell copies of the Software, and to permit persons to whom the
   9 * Software is furnished to do so, subject to the following conditions:
  10 *
  11 * The above copyright notice and this permission notice (including the next
  12 * paragraph) shall be included in all copies or substantial portions of the
  13 * Software.
  14 *
  15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21 * IN THE SOFTWARE.
  22 *
  23 */
  24
  25#include <linux/kthread.h>
  26
  27#include "gem/i915_gem_context.h"
  28#include "intel_engine_pm.h"
  29
  30#include "i915_selftest.h"
  31#include "selftests/i915_random.h"
  32#include "selftests/igt_flush_test.h"
  33#include "selftests/igt_reset.h"
  34#include "selftests/igt_wedge_me.h"
  35#include "selftests/igt_atomic.h"
  36
  37#include "selftests/mock_drm.h"
  38
  39#include "gem/selftests/mock_context.h"
  40#include "gem/selftests/igt_gem_utils.h"
  41
  42#define IGT_IDLE_TIMEOUT 50 /* ms; time to wait after flushing between tests */
  43
  44struct hang {
  45        struct drm_i915_private *i915;
  46        struct drm_i915_gem_object *hws;
  47        struct drm_i915_gem_object *obj;
  48        struct i915_gem_context *ctx;
  49        u32 *seqno;
  50        u32 *batch;
  51};
  52
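/*
 * Set up the shared fixture for the hang tests: an unbannable kernel
 * context, a CPU-mapped page (h->hws) into which the spinning batches
 * report their seqno, and a CPU-mapped object (h->obj) holding the
 * batch buffer itself.
 */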
  53static int hang_init(struct hang *h, struct drm_i915_private *i915)
  54{
  55        void *vaddr;
  56        int err;
  57
  58        memset(h, 0, sizeof(*h));
  59        h->i915 = i915;
  60
  61        h->ctx = kernel_context(i915);
  62        if (IS_ERR(h->ctx))
  63                return PTR_ERR(h->ctx);
  64
  65        GEM_BUG_ON(i915_gem_context_is_bannable(h->ctx));
  66
  67        h->hws = i915_gem_object_create_internal(i915, PAGE_SIZE);
  68        if (IS_ERR(h->hws)) {
  69                err = PTR_ERR(h->hws);
  70                goto err_ctx;
  71        }
  72
  73        h->obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
  74        if (IS_ERR(h->obj)) {
  75                err = PTR_ERR(h->obj);
  76                goto err_hws;
  77        }
  78
  79        i915_gem_object_set_cache_coherency(h->hws, I915_CACHE_LLC);
  80        vaddr = i915_gem_object_pin_map(h->hws, I915_MAP_WB);
  81        if (IS_ERR(vaddr)) {
  82                err = PTR_ERR(vaddr);
  83                goto err_obj;
  84        }
  85        h->seqno = memset(vaddr, 0xff, PAGE_SIZE);
  86
  87        vaddr = i915_gem_object_pin_map(h->obj,
  88                                        i915_coherent_map_type(i915));
  89        if (IS_ERR(vaddr)) {
  90                err = PTR_ERR(vaddr);
  91                goto err_unpin_hws;
  92        }
  93        h->batch = vaddr;
  94
  95        return 0;
  96
  97err_unpin_hws:
  98        i915_gem_object_unpin_map(h->hws);
  99err_obj:
 100        i915_gem_object_put(h->obj);
 101err_hws:
 102        i915_gem_object_put(h->hws);
 103err_ctx:
 104        kernel_context_close(h->ctx);
 105        return err;
 106}
 107
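/* GPU address of the request's per-context seqno slot in the HWS page. */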
 108static u64 hws_address(const struct i915_vma *hws,
 109                       const struct i915_request *rq)
 110{
 111        return hws->node.start + offset_in_page(sizeof(u32)*rq->fence.context);
 112}
 113
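/* Attach @vma to @rq under the vma lock so it is tracked as busy until the request completes. */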
 114static int move_to_active(struct i915_vma *vma,
 115                          struct i915_request *rq,
 116                          unsigned int flags)
 117{
 118        int err;
 119
 120        i915_vma_lock(vma);
 121        err = i915_vma_move_to_active(vma, rq, flags);
 122        i915_vma_unlock(vma);
 123
 124        return err;
 125}
 126
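/*
 * Build a request whose batch writes the request's seqno into the HWS
 * page and then spins forever: the trailing MI_BATCH_BUFFER_START jumps
 * back to the start of the batch, and the MI_ARB_CHECK instructions
 * leave arbitration points inside the loop. The hang is broken by
 * rewriting the first dword of the batch to MI_BATCH_BUFFER_END (see
 * hang_fini()). If the previous batch object is still busy, a fresh
 * one is allocated first.
 */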
 127static struct i915_request *
 128hang_create_request(struct hang *h, struct intel_engine_cs *engine)
 129{
 130        struct drm_i915_private *i915 = h->i915;
 131        struct i915_address_space *vm = h->ctx->vm ?: &i915->ggtt.vm;
 132        struct i915_request *rq = NULL;
 133        struct i915_vma *hws, *vma;
 134        unsigned int flags;
 135        u32 *batch;
 136        int err;
 137
 138        if (i915_gem_object_is_active(h->obj)) {
 139                struct drm_i915_gem_object *obj;
 140                void *vaddr;
 141
 142                obj = i915_gem_object_create_internal(h->i915, PAGE_SIZE);
 143                if (IS_ERR(obj))
 144                        return ERR_CAST(obj);
 145
 146                vaddr = i915_gem_object_pin_map(obj,
 147                                                i915_coherent_map_type(h->i915));
 148                if (IS_ERR(vaddr)) {
 149                        i915_gem_object_put(obj);
 150                        return ERR_CAST(vaddr);
 151                }
 152
 153                i915_gem_object_unpin_map(h->obj);
 154                i915_gem_object_put(h->obj);
 155
 156                h->obj = obj;
 157                h->batch = vaddr;
 158        }
 159
 160        vma = i915_vma_instance(h->obj, vm, NULL);
 161        if (IS_ERR(vma))
 162                return ERR_CAST(vma);
 163
 164        hws = i915_vma_instance(h->hws, vm, NULL);
 165        if (IS_ERR(hws))
 166                return ERR_CAST(hws);
 167
 168        err = i915_vma_pin(vma, 0, 0, PIN_USER);
 169        if (err)
 170                return ERR_PTR(err);
 171
 172        err = i915_vma_pin(hws, 0, 0, PIN_USER);
 173        if (err)
 174                goto unpin_vma;
 175
 176        rq = igt_request_alloc(h->ctx, engine);
 177        if (IS_ERR(rq)) {
 178                err = PTR_ERR(rq);
 179                goto unpin_hws;
 180        }
 181
 182        err = move_to_active(vma, rq, 0);
 183        if (err)
 184                goto cancel_rq;
 185
 186        err = move_to_active(hws, rq, 0);
 187        if (err)
 188                goto cancel_rq;
 189
 190        batch = h->batch;
 191        if (INTEL_GEN(i915) >= 8) {
 192                *batch++ = MI_STORE_DWORD_IMM_GEN4;
 193                *batch++ = lower_32_bits(hws_address(hws, rq));
 194                *batch++ = upper_32_bits(hws_address(hws, rq));
 195                *batch++ = rq->fence.seqno;
 196                *batch++ = MI_ARB_CHECK;
 197
 198                memset(batch, 0, 1024);
 199                batch += 1024 / sizeof(*batch);
 200
 201                *batch++ = MI_ARB_CHECK;
 202                *batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
 203                *batch++ = lower_32_bits(vma->node.start);
 204                *batch++ = upper_32_bits(vma->node.start);
 205        } else if (INTEL_GEN(i915) >= 6) {
 206                *batch++ = MI_STORE_DWORD_IMM_GEN4;
 207                *batch++ = 0;
 208                *batch++ = lower_32_bits(hws_address(hws, rq));
 209                *batch++ = rq->fence.seqno;
 210                *batch++ = MI_ARB_CHECK;
 211
 212                memset(batch, 0, 1024);
 213                batch += 1024 / sizeof(*batch);
 214
 215                *batch++ = MI_ARB_CHECK;
 216                *batch++ = MI_BATCH_BUFFER_START | 1 << 8;
 217                *batch++ = lower_32_bits(vma->node.start);
 218        } else if (INTEL_GEN(i915) >= 4) {
 219                *batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
 220                *batch++ = 0;
 221                *batch++ = lower_32_bits(hws_address(hws, rq));
 222                *batch++ = rq->fence.seqno;
 223                *batch++ = MI_ARB_CHECK;
 224
 225                memset(batch, 0, 1024);
 226                batch += 1024 / sizeof(*batch);
 227
 228                *batch++ = MI_ARB_CHECK;
 229                *batch++ = MI_BATCH_BUFFER_START | 2 << 6;
 230                *batch++ = lower_32_bits(vma->node.start);
 231        } else {
 232                *batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
 233                *batch++ = lower_32_bits(hws_address(hws, rq));
 234                *batch++ = rq->fence.seqno;
 235                *batch++ = MI_ARB_CHECK;
 236
 237                memset(batch, 0, 1024);
 238                batch += 1024 / sizeof(*batch);
 239
 240                *batch++ = MI_ARB_CHECK;
 241                *batch++ = MI_BATCH_BUFFER_START | 2 << 6;
 242                *batch++ = lower_32_bits(vma->node.start);
 243        }
 244        *batch++ = MI_BATCH_BUFFER_END; /* not reached */
 245        i915_gem_chipset_flush(h->i915);
 246
 247        if (rq->engine->emit_init_breadcrumb) {
 248                err = rq->engine->emit_init_breadcrumb(rq);
 249                if (err)
 250                        goto cancel_rq;
 251        }
 252
 253        flags = 0;
 254        if (INTEL_GEN(vm->i915) <= 5)
 255                flags |= I915_DISPATCH_SECURE;
 256
 257        err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags);
 258
 259cancel_rq:
 260        if (err) {
 261                i915_request_skip(rq, err);
 262                i915_request_add(rq);
 263        }
 264unpin_hws:
 265        i915_vma_unpin(hws);
 266unpin_vma:
 267        i915_vma_unpin(vma);
 268        return err ? ERR_PTR(err) : rq;
 269}
 270
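/* Read back the seqno that the request's batch last wrote to the HWS page. */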
 271static u32 hws_seqno(const struct hang *h, const struct i915_request *rq)
 272{
 273        return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]);
 274}
 275
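/*
 * Stop any still-spinning batch by overwriting its first dword with
 * MI_BATCH_BUFFER_END, then release the fixture's objects and context.
 */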
 276static void hang_fini(struct hang *h)
 277{
 278        *h->batch = MI_BATCH_BUFFER_END;
 279        i915_gem_chipset_flush(h->i915);
 280
 281        i915_gem_object_unpin_map(h->obj);
 282        i915_gem_object_put(h->obj);
 283
 284        i915_gem_object_unpin_map(h->hws);
 285        i915_gem_object_put(h->hws);
 286
 287        kernel_context_close(h->ctx);
 288
 289        igt_flush_test(h->i915, I915_WAIT_LOCKED);
 290}
 291
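/*
 * Poll the HWS page (a short busy-wait, then up to a second of sleeping
 * waits) until the batch has started and reported the request's seqno.
 */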
 292static bool wait_until_running(struct hang *h, struct i915_request *rq)
 293{
 294        return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq),
 295                                               rq->fence.seqno),
 296                             10) &&
 297                 wait_for(i915_seqno_passed(hws_seqno(h, rq),
 298                                            rq->fence.seqno),
 299                          1000));
 300}
 301
 302static int igt_hang_sanitycheck(void *arg)
 303{
 304        struct drm_i915_private *i915 = arg;
 305        struct i915_request *rq;
 306        struct intel_engine_cs *engine;
 307        enum intel_engine_id id;
 308        struct hang h;
 309        int err;
 310
 311        /* Basic check that we can execute our hanging batch */
 312
 313        mutex_lock(&i915->drm.struct_mutex);
 314        err = hang_init(&h, i915);
 315        if (err)
 316                goto unlock;
 317
 318        for_each_engine(engine, i915, id) {
 319                struct igt_wedge_me w;
 320                long timeout;
 321
 322                if (!intel_engine_can_store_dword(engine))
 323                        continue;
 324
 325                rq = hang_create_request(&h, engine);
 326                if (IS_ERR(rq)) {
 327                        err = PTR_ERR(rq);
 328                        pr_err("Failed to create request for %s, err=%d\n",
 329                               engine->name, err);
 330                        goto fini;
 331                }
 332
 333                i915_request_get(rq);
 334
 335                *h.batch = MI_BATCH_BUFFER_END;
 336                i915_gem_chipset_flush(i915);
 337
 338                i915_request_add(rq);
 339
 340                timeout = 0;
 341                igt_wedge_on_timeout(&w, i915, HZ / 10 /* 100ms timeout*/)
 342                        timeout = i915_request_wait(rq, 0,
 343                                                    MAX_SCHEDULE_TIMEOUT);
 344                if (i915_reset_failed(i915))
 345                        timeout = -EIO;
 346
 347                i915_request_put(rq);
 348
 349                if (timeout < 0) {
 350                        err = timeout;
 351                        pr_err("Wait for request failed on %s, err=%d\n",
 352                               engine->name, err);
 353                        goto fini;
 354                }
 355        }
 356
 357fini:
 358        hang_fini(&h);
 359unlock:
 360        mutex_unlock(&i915->drm.struct_mutex);
 361        return err;
 362}
 363
 364static bool wait_for_idle(struct intel_engine_cs *engine)
 365{
 366        return wait_for(intel_engine_is_idle(engine), IGT_IDLE_TIMEOUT) == 0;
 367}
 368
 369static int igt_reset_nop(void *arg)
 370{
 371        struct drm_i915_private *i915 = arg;
 372        struct intel_engine_cs *engine;
 373        struct i915_gem_context *ctx;
 374        unsigned int reset_count, count;
 375        enum intel_engine_id id;
 376        intel_wakeref_t wakeref;
 377        struct drm_file *file;
 378        IGT_TIMEOUT(end_time);
 379        int err = 0;
 380
 381        /* Check that we can reset during non-user portions of requests */
 382
 383        file = mock_file(i915);
 384        if (IS_ERR(file))
 385                return PTR_ERR(file);
 386
 387        mutex_lock(&i915->drm.struct_mutex);
 388        ctx = live_context(i915, file);
 389        mutex_unlock(&i915->drm.struct_mutex);
 390        if (IS_ERR(ctx)) {
 391                err = PTR_ERR(ctx);
 392                goto out;
 393        }
 394
 395        i915_gem_context_clear_bannable(ctx);
 396        wakeref = intel_runtime_pm_get(&i915->runtime_pm);
 397        reset_count = i915_reset_count(&i915->gpu_error);
 398        count = 0;
 399        do {
 400                mutex_lock(&i915->drm.struct_mutex);
 401                for_each_engine(engine, i915, id) {
 402                        int i;
 403
 404                        for (i = 0; i < 16; i++) {
 405                                struct i915_request *rq;
 406
 407                                rq = igt_request_alloc(ctx, engine);
 408                                if (IS_ERR(rq)) {
 409                                        err = PTR_ERR(rq);
 410                                        break;
 411                                }
 412
 413                                i915_request_add(rq);
 414                        }
 415                }
 416                mutex_unlock(&i915->drm.struct_mutex);
 417
 418                igt_global_reset_lock(i915);
 419                i915_reset(i915, ALL_ENGINES, NULL);
 420                igt_global_reset_unlock(i915);
 421                if (i915_reset_failed(i915)) {
 422                        err = -EIO;
 423                        break;
 424                }
 425
 426                if (i915_reset_count(&i915->gpu_error) !=
 427                    reset_count + ++count) {
 428                        pr_err("Full GPU reset not recorded!\n");
 429                        err = -EINVAL;
 430                        break;
 431                }
 432
 433                err = igt_flush_test(i915, 0);
 434                if (err)
 435                        break;
 436        } while (time_before(jiffies, end_time));
 437        pr_info("%s: %d resets\n", __func__, count);
 438
 439        mutex_lock(&i915->drm.struct_mutex);
 440        err = igt_flush_test(i915, I915_WAIT_LOCKED);
 441        mutex_unlock(&i915->drm.struct_mutex);
 442
 443        intel_runtime_pm_put(&i915->runtime_pm, wakeref);
 444
 445out:
 446        mock_file_free(i915, file);
 447        if (i915_reset_failed(i915))
 448                err = -EIO;
 449        return err;
 450}
 451
 452static int igt_reset_nop_engine(void *arg)
 453{
 454        struct drm_i915_private *i915 = arg;
 455        struct intel_engine_cs *engine;
 456        struct i915_gem_context *ctx;
 457        enum intel_engine_id id;
 458        intel_wakeref_t wakeref;
 459        struct drm_file *file;
 460        int err = 0;
 461
 462        /* Check that we can engine-reset during non-user portions */
 463
 464        if (!intel_has_reset_engine(i915))
 465                return 0;
 466
 467        file = mock_file(i915);
 468        if (IS_ERR(file))
 469                return PTR_ERR(file);
 470
 471        mutex_lock(&i915->drm.struct_mutex);
 472        ctx = live_context(i915, file);
 473        mutex_unlock(&i915->drm.struct_mutex);
 474        if (IS_ERR(ctx)) {
 475                err = PTR_ERR(ctx);
 476                goto out;
 477        }
 478
 479        i915_gem_context_clear_bannable(ctx);
 480        wakeref = intel_runtime_pm_get(&i915->runtime_pm);
 481        for_each_engine(engine, i915, id) {
 482                unsigned int reset_count, reset_engine_count;
 483                unsigned int count;
 484                IGT_TIMEOUT(end_time);
 485
 486                reset_count = i915_reset_count(&i915->gpu_error);
 487                reset_engine_count = i915_reset_engine_count(&i915->gpu_error,
 488                                                             engine);
 489                count = 0;
 490
 491                set_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
 492                do {
 493                        int i;
 494
 495                        if (!wait_for_idle(engine)) {
 496                                pr_err("%s failed to idle before reset\n",
 497                                       engine->name);
 498                                err = -EIO;
 499                                break;
 500                        }
 501
 502                        mutex_lock(&i915->drm.struct_mutex);
 503                        for (i = 0; i < 16; i++) {
 504                                struct i915_request *rq;
 505
 506                                rq = igt_request_alloc(ctx, engine);
 507                                if (IS_ERR(rq)) {
 508                                        err = PTR_ERR(rq);
 509                                        break;
 510                                }
 511
 512                                i915_request_add(rq);
 513                        }
 514                        mutex_unlock(&i915->drm.struct_mutex);
 515
 516                        err = i915_reset_engine(engine, NULL);
 517                        if (err) {
 518                                pr_err("i915_reset_engine failed\n");
 519                                break;
 520                        }
 521
 522                        if (i915_reset_count(&i915->gpu_error) != reset_count) {
 523                                pr_err("Full GPU reset recorded! (engine reset expected)\n");
 524                                err = -EINVAL;
 525                                break;
 526                        }
 527
 528                        if (i915_reset_engine_count(&i915->gpu_error, engine) !=
 529                            reset_engine_count + ++count) {
 530                                pr_err("%s engine reset not recorded!\n",
 531                                       engine->name);
 532                                err = -EINVAL;
 533                                break;
 534                        }
 535                } while (time_before(jiffies, end_time));
 536                clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
 537                pr_info("%s(%s): %d resets\n", __func__, engine->name, count);
 538
 539                if (err)
 540                        break;
 541
 542                err = igt_flush_test(i915, 0);
 543                if (err)
 544                        break;
 545        }
 546
 547        mutex_lock(&i915->drm.struct_mutex);
 548        err = igt_flush_test(i915, I915_WAIT_LOCKED);
 549        mutex_unlock(&i915->drm.struct_mutex);
 550
 551        intel_runtime_pm_put(&i915->runtime_pm, wakeref);
 552out:
 553        mock_file_free(i915, file);
 554        if (i915_reset_failed(i915))
 555                err = -EIO;
 556        return err;
 557}
 558
 559static int __igt_reset_engine(struct drm_i915_private *i915, bool active)
 560{
 561        struct intel_engine_cs *engine;
 562        enum intel_engine_id id;
 563        struct hang h;
 564        int err = 0;
 565
 566        /* Check that we can issue an engine reset on an idle engine (no-op) */
 567
 568        if (!intel_has_reset_engine(i915))
 569                return 0;
 570
 571        if (active) {
 572                mutex_lock(&i915->drm.struct_mutex);
 573                err = hang_init(&h, i915);
 574                mutex_unlock(&i915->drm.struct_mutex);
 575                if (err)
 576                        return err;
 577        }
 578
 579        for_each_engine(engine, i915, id) {
 580                unsigned int reset_count, reset_engine_count;
 581                IGT_TIMEOUT(end_time);
 582
 583                if (active && !intel_engine_can_store_dword(engine))
 584                        continue;
 585
 586                if (!wait_for_idle(engine)) {
 587                        pr_err("%s failed to idle before reset\n",
 588                               engine->name);
 589                        err = -EIO;
 590                        break;
 591                }
 592
 593                reset_count = i915_reset_count(&i915->gpu_error);
 594                reset_engine_count = i915_reset_engine_count(&i915->gpu_error,
 595                                                             engine);
 596
 597                intel_engine_pm_get(engine);
 598                set_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
 599                do {
 600                        if (active) {
 601                                struct i915_request *rq;
 602
 603                                mutex_lock(&i915->drm.struct_mutex);
 604                                rq = hang_create_request(&h, engine);
 605                                if (IS_ERR(rq)) {
 606                                        err = PTR_ERR(rq);
 607                                        mutex_unlock(&i915->drm.struct_mutex);
 608                                        break;
 609                                }
 610
 611                                i915_request_get(rq);
 612                                i915_request_add(rq);
 613                                mutex_unlock(&i915->drm.struct_mutex);
 614
 615                                if (!wait_until_running(&h, rq)) {
 616                                        struct drm_printer p = drm_info_printer(i915->drm.dev);
 617
 618                                        pr_err("%s: Failed to start request %llx, at %x\n",
 619                                               __func__, rq->fence.seqno, hws_seqno(&h, rq));
 620                                        intel_engine_dump(engine, &p,
 621                                                          "%s\n", engine->name);
 622
 623                                        i915_request_put(rq);
 624                                        err = -EIO;
 625                                        break;
 626                                }
 627
 628                                i915_request_put(rq);
 629                        }
 630
 631                        err = i915_reset_engine(engine, NULL);
 632                        if (err) {
 633                                pr_err("i915_reset_engine failed\n");
 634                                break;
 635                        }
 636
 637                        if (i915_reset_count(&i915->gpu_error) != reset_count) {
 638                                pr_err("Full GPU reset recorded! (engine reset expected)\n");
 639                                err = -EINVAL;
 640                                break;
 641                        }
 642
 643                        if (i915_reset_engine_count(&i915->gpu_error, engine) !=
 644                            ++reset_engine_count) {
 645                                pr_err("%s engine reset not recorded!\n",
 646                                       engine->name);
 647                                err = -EINVAL;
 648                                break;
 649                        }
 650                } while (time_before(jiffies, end_time));
 651                clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
 652                intel_engine_pm_put(engine);
 653
 654                if (err)
 655                        break;
 656
 657                err = igt_flush_test(i915, 0);
 658                if (err)
 659                        break;
 660        }
 661
 662        if (i915_reset_failed(i915))
 663                err = -EIO;
 664
 665        if (active) {
 666                mutex_lock(&i915->drm.struct_mutex);
 667                hang_fini(&h);
 668                mutex_unlock(&i915->drm.struct_mutex);
 669        }
 670
 671        return err;
 672}
 673
 674static int igt_reset_idle_engine(void *arg)
 675{
 676        return __igt_reset_engine(arg, false);
 677}
 678
 679static int igt_reset_active_engine(void *arg)
 680{
 681        return __igt_reset_engine(arg, true);
 682}
 683
 684struct active_engine {
 685        struct task_struct *task;
 686        struct intel_engine_cs *engine;
 687        unsigned long resets;
 688        unsigned int flags;
 689};
 690
 691#define TEST_ACTIVE     BIT(0)
 692#define TEST_OTHERS     BIT(1)
 693#define TEST_SELF       BIT(2)
 694#define TEST_PRIORITY   BIT(3)
 695
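/*
 * Wait up to 5s for a previously submitted background request to
 * complete, wedging the GPU and returning -EIO if it does not; the
 * request reference is dropped either way.
 */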
 696static int active_request_put(struct i915_request *rq)
 697{
 698        int err = 0;
 699
 700        if (!rq)
 701                return 0;
 702
 703        if (i915_request_wait(rq, 0, 5 * HZ) < 0) {
 704                GEM_TRACE("%s timed out waiting for completion of fence %llx:%lld\n",
 705                          rq->engine->name,
 706                          rq->fence.context,
 707                          rq->fence.seqno);
 708                GEM_TRACE_DUMP();
 709
 710                i915_gem_set_wedged(rq->i915);
 711                err = -EIO;
 712        }
 713
 714        i915_request_put(rq);
 715
 716        return err;
 717}
 718
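/*
 * Kthread body used to keep an engine busy while resets are injected
 * elsewhere: it continuously submits requests across a small pool of
 * contexts (with randomised priorities when TEST_PRIORITY is set) and
 * checks that each one eventually completes.
 */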
 719static int active_engine(void *data)
 720{
 721        I915_RND_STATE(prng);
 722        struct active_engine *arg = data;
 723        struct intel_engine_cs *engine = arg->engine;
 724        struct i915_request *rq[8] = {};
 725        struct i915_gem_context *ctx[ARRAY_SIZE(rq)];
 726        struct drm_file *file;
 727        unsigned long count = 0;
 728        int err = 0;
 729
 730        file = mock_file(engine->i915);
 731        if (IS_ERR(file))
 732                return PTR_ERR(file);
 733
 734        for (count = 0; count < ARRAY_SIZE(ctx); count++) {
 735                mutex_lock(&engine->i915->drm.struct_mutex);
 736                ctx[count] = live_context(engine->i915, file);
 737                mutex_unlock(&engine->i915->drm.struct_mutex);
 738                if (IS_ERR(ctx[count])) {
 739                        err = PTR_ERR(ctx[count]);
 740                        while (--count)
 741                                i915_gem_context_put(ctx[count]);
 742                        goto err_file;
 743                }
 744        }
 745
 746        while (!kthread_should_stop()) {
 747                unsigned int idx = count++ & (ARRAY_SIZE(rq) - 1);
 748                struct i915_request *old = rq[idx];
 749                struct i915_request *new;
 750
 751                mutex_lock(&engine->i915->drm.struct_mutex);
 752                new = igt_request_alloc(ctx[idx], engine);
 753                if (IS_ERR(new)) {
 754                        mutex_unlock(&engine->i915->drm.struct_mutex);
 755                        err = PTR_ERR(new);
 756                        break;
 757                }
 758
 759                if (arg->flags & TEST_PRIORITY)
 760                        ctx[idx]->sched.priority =
 761                                i915_prandom_u32_max_state(512, &prng);
 762
 763                rq[idx] = i915_request_get(new);
 764                i915_request_add(new);
 765                mutex_unlock(&engine->i915->drm.struct_mutex);
 766
 767                err = active_request_put(old);
 768                if (err)
 769                        break;
 770
 771                cond_resched();
 772        }
 773
 774        for (count = 0; count < ARRAY_SIZE(rq); count++) {
 775                int err__ = active_request_put(rq[count]);
 776
 777                /* Keep the first error */
 778                if (!err)
 779                        err = err__;
 780        }
 781
 782err_file:
 783        mock_file_free(engine->i915, file);
 784        return err;
 785}
 786
 787static int __igt_reset_engines(struct drm_i915_private *i915,
 788                               const char *test_name,
 789                               unsigned int flags)
 790{
 791        struct intel_engine_cs *engine, *other;
 792        enum intel_engine_id id, tmp;
 793        struct hang h;
 794        int err = 0;
 795
 796        /* Check that issuing a reset on one engine does not interfere
 797         * with any other engine.
 798         */
 799
 800        if (!intel_has_reset_engine(i915))
 801                return 0;
 802
 803        if (flags & TEST_ACTIVE) {
 804                mutex_lock(&i915->drm.struct_mutex);
 805                err = hang_init(&h, i915);
 806                mutex_unlock(&i915->drm.struct_mutex);
 807                if (err)
 808                        return err;
 809
 810                if (flags & TEST_PRIORITY)
 811                        h.ctx->sched.priority = 1024;
 812        }
 813
 814        for_each_engine(engine, i915, id) {
 815                struct active_engine threads[I915_NUM_ENGINES] = {};
 816                unsigned long global = i915_reset_count(&i915->gpu_error);
 817                unsigned long count = 0, reported;
 818                IGT_TIMEOUT(end_time);
 819
 820                if (flags & TEST_ACTIVE &&
 821                    !intel_engine_can_store_dword(engine))
 822                        continue;
 823
 824                if (!wait_for_idle(engine)) {
 825                        pr_err("i915_reset_engine(%s:%s): failed to idle before reset\n",
 826                               engine->name, test_name);
 827                        err = -EIO;
 828                        break;
 829                }
 830
 831                memset(threads, 0, sizeof(threads));
 832                for_each_engine(other, i915, tmp) {
 833                        struct task_struct *tsk;
 834
 835                        threads[tmp].resets =
 836                                i915_reset_engine_count(&i915->gpu_error,
 837                                                        other);
 838
 839                        if (!(flags & TEST_OTHERS))
 840                                continue;
 841
 842                        if (other == engine && !(flags & TEST_SELF))
 843                                continue;
 844
 845                        threads[tmp].engine = other;
 846                        threads[tmp].flags = flags;
 847
 848                        tsk = kthread_run(active_engine, &threads[tmp],
 849                                          "igt/%s", other->name);
 850                        if (IS_ERR(tsk)) {
 851                                err = PTR_ERR(tsk);
 852                                goto unwind;
 853                        }
 854
 855                        threads[tmp].task = tsk;
 856                        get_task_struct(tsk);
 857                }
 858
 859                intel_engine_pm_get(engine);
 860                set_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
 861                do {
 862                        struct i915_request *rq = NULL;
 863
 864                        if (flags & TEST_ACTIVE) {
 865                                mutex_lock(&i915->drm.struct_mutex);
 866                                rq = hang_create_request(&h, engine);
 867                                if (IS_ERR(rq)) {
 868                                        err = PTR_ERR(rq);
 869                                        mutex_unlock(&i915->drm.struct_mutex);
 870                                        break;
 871                                }
 872
 873                                i915_request_get(rq);
 874                                i915_request_add(rq);
 875                                mutex_unlock(&i915->drm.struct_mutex);
 876
 877                                if (!wait_until_running(&h, rq)) {
 878                                        struct drm_printer p = drm_info_printer(i915->drm.dev);
 879
 880                                        pr_err("%s: Failed to start request %llx, at %x\n",
 881                                               __func__, rq->fence.seqno, hws_seqno(&h, rq));
 882                                        intel_engine_dump(engine, &p,
 883                                                          "%s\n", engine->name);
 884
 885                                        i915_request_put(rq);
 886                                        err = -EIO;
 887                                        break;
 888                                }
 889                        }
 890
 891                        err = i915_reset_engine(engine, NULL);
 892                        if (err) {
 893                                pr_err("i915_reset_engine(%s:%s): failed, err=%d\n",
 894                                       engine->name, test_name, err);
 895                                break;
 896                        }
 897
 898                        count++;
 899
 900                        if (rq) {
 901                                if (i915_request_wait(rq, 0, HZ / 5) < 0) {
 902                                        struct drm_printer p =
 903                                                drm_info_printer(i915->drm.dev);
 904
 905                                        pr_err("i915_reset_engine(%s:%s):"
 906                                               " failed to complete request after reset\n",
 907                                               engine->name, test_name);
 908                                        intel_engine_dump(engine, &p,
 909                                                          "%s\n", engine->name);
 910                                        i915_request_put(rq);
 911
 912                                        GEM_TRACE_DUMP();
 913                                        i915_gem_set_wedged(i915);
 914                                        err = -EIO;
 915                                        break;
 916                                }
 917
 918                                i915_request_put(rq);
 919                        }
 920
 921                        if (!(flags & TEST_SELF) && !wait_for_idle(engine)) {
 922                                struct drm_printer p =
 923                                        drm_info_printer(i915->drm.dev);
 924
 925                                pr_err("i915_reset_engine(%s:%s):"
 926                                       " failed to idle after reset\n",
 927                                       engine->name, test_name);
 928                                intel_engine_dump(engine, &p,
 929                                                  "%s\n", engine->name);
 930
 931                                err = -EIO;
 932                                break;
 933                        }
 934                } while (time_before(jiffies, end_time));
 935                clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
 936                intel_engine_pm_put(engine);
 937                pr_info("i915_reset_engine(%s:%s): %lu resets\n",
 938                        engine->name, test_name, count);
 939
 940                reported = i915_reset_engine_count(&i915->gpu_error, engine);
 941                reported -= threads[engine->id].resets;
 942                if (reported != count) {
 943                        pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu\n",
 944                               engine->name, test_name, count, reported);
 945                        if (!err)
 946                                err = -EINVAL;
 947                }
 948
 949unwind:
 950                for_each_engine(other, i915, tmp) {
 951                        int ret;
 952
 953                        if (!threads[tmp].task)
 954                                continue;
 955
 956                        ret = kthread_stop(threads[tmp].task);
 957                        if (ret) {
 958                                pr_err("kthread for other engine %s failed, err=%d\n",
 959                                       other->name, ret);
 960                                if (!err)
 961                                        err = ret;
 962                        }
 963                        put_task_struct(threads[tmp].task);
 964
 965                        if (other != engine &&
 966                            threads[tmp].resets !=
 967                            i915_reset_engine_count(&i915->gpu_error, other)) {
 968                                pr_err("Innocent engine %s was reset (count=%ld)\n",
 969                                       other->name,
 970                                       i915_reset_engine_count(&i915->gpu_error,
 971                                                               other) -
 972                                       threads[tmp].resets);
 973                                if (!err)
 974                                        err = -EINVAL;
 975                        }
 976                }
 977
 978                if (global != i915_reset_count(&i915->gpu_error)) {
 979                        pr_err("Global reset (count=%ld)!\n",
 980                               i915_reset_count(&i915->gpu_error) - global);
 981                        if (!err)
 982                                err = -EINVAL;
 983                }
 984
 985                if (err)
 986                        break;
 987
 988                mutex_lock(&i915->drm.struct_mutex);
 989                err = igt_flush_test(i915, I915_WAIT_LOCKED);
 990                mutex_unlock(&i915->drm.struct_mutex);
 991                if (err)
 992                        break;
 993        }
 994
 995        if (i915_reset_failed(i915))
 996                err = -EIO;
 997
 998        if (flags & TEST_ACTIVE) {
 999                mutex_lock(&i915->drm.struct_mutex);
1000                hang_fini(&h);
1001                mutex_unlock(&i915->drm.struct_mutex);
1002        }
1003
1004        return err;
1005}
1006
1007static int igt_reset_engines(void *arg)
1008{
1009        static const struct {
1010                const char *name;
1011                unsigned int flags;
1012        } phases[] = {
1013                { "idle", 0 },
1014                { "active", TEST_ACTIVE },
1015                { "others-idle", TEST_OTHERS },
1016                { "others-active", TEST_OTHERS | TEST_ACTIVE },
1017                {
1018                        "others-priority",
1019                        TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY
1020                },
1021                {
1022                        "self-priority",
1023                        TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY | TEST_SELF,
1024                },
1025                { }
1026        };
1027        struct drm_i915_private *i915 = arg;
1028        typeof(*phases) *p;
1029        int err;
1030
1031        for (p = phases; p->name; p++) {
1032                if (p->flags & TEST_PRIORITY) {
1033                        if (!(i915->caps.scheduler & I915_SCHEDULER_CAP_PRIORITY))
1034                                continue;
1035                }
1036
1037                err = __igt_reset_engines(arg, p->name, p->flags);
1038                if (err)
1039                        return err;
1040        }
1041
1042        return 0;
1043}
1044
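/*
 * Stand-in for hangcheck: trigger the reset directly for the given
 * engine mask and return the global reset count sampled beforehand.
 */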
1045static u32 fake_hangcheck(struct drm_i915_private *i915,
1046                          intel_engine_mask_t mask)
1047{
1048        u32 count = i915_reset_count(&i915->gpu_error);
1049
1050        i915_reset(i915, mask, NULL);
1051
1052        return count;
1053}
1054
1055static int igt_reset_wait(void *arg)
1056{
1057        struct drm_i915_private *i915 = arg;
1058        struct i915_request *rq;
1059        unsigned int reset_count;
1060        struct hang h;
1061        long timeout;
1062        int err;
1063
1064        if (!intel_engine_can_store_dword(i915->engine[RCS0]))
1065                return 0;
1066
1067        /* Check that we detect a stuck waiter and issue a reset */
1068
1069        igt_global_reset_lock(i915);
1070
1071        mutex_lock(&i915->drm.struct_mutex);
1072        err = hang_init(&h, i915);
1073        if (err)
1074                goto unlock;
1075
1076        rq = hang_create_request(&h, i915->engine[RCS0]);
1077        if (IS_ERR(rq)) {
1078                err = PTR_ERR(rq);
1079                goto fini;
1080        }
1081
1082        i915_request_get(rq);
1083        i915_request_add(rq);
1084
1085        if (!wait_until_running(&h, rq)) {
1086                struct drm_printer p = drm_info_printer(i915->drm.dev);
1087
1088                pr_err("%s: Failed to start request %llx, at %x\n",
1089                       __func__, rq->fence.seqno, hws_seqno(&h, rq));
1090                intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
1091
1092                i915_gem_set_wedged(i915);
1093
1094                err = -EIO;
1095                goto out_rq;
1096        }
1097
1098        reset_count = fake_hangcheck(i915, ALL_ENGINES);
1099
1100        timeout = i915_request_wait(rq, 0, 10);
1101        if (timeout < 0) {
1102                pr_err("i915_request_wait failed on a stuck request: err=%ld\n",
1103                       timeout);
1104                err = timeout;
1105                goto out_rq;
1106        }
1107
1108        if (i915_reset_count(&i915->gpu_error) == reset_count) {
1109                pr_err("No GPU reset recorded!\n");
1110                err = -EINVAL;
1111                goto out_rq;
1112        }
1113
1114out_rq:
1115        i915_request_put(rq);
1116fini:
1117        hang_fini(&h);
1118unlock:
1119        mutex_unlock(&i915->drm.struct_mutex);
1120        igt_global_reset_unlock(i915);
1121
1122        if (i915_reset_failed(i915))
1123                return -EIO;
1124
1125        return err;
1126}
1127
1128struct evict_vma {
1129        struct completion completion;
1130        struct i915_vma *vma;
1131};
1132
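/*
 * Kthread body: attempt to evict the test vma's node from its address
 * space. The eviction has to wait for the hanging request and so should
 * only complete once the reset has been performed.
 */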
1133static int evict_vma(void *data)
1134{
1135        struct evict_vma *arg = data;
1136        struct i915_address_space *vm = arg->vma->vm;
1137        struct drm_i915_private *i915 = vm->i915;
1138        struct drm_mm_node evict = arg->vma->node;
1139        int err;
1140
1141        complete(&arg->completion);
1142
1143        mutex_lock(&i915->drm.struct_mutex);
1144        err = i915_gem_evict_for_node(vm, &evict, 0);
1145        mutex_unlock(&i915->drm.struct_mutex);
1146
1147        return err;
1148}
1149
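/*
 * Kthread body: instead of evicting the node, take and release the vma's
 * fence register; rewriting the fence likewise has to wait behind the
 * hanging request.
 */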
1150static int evict_fence(void *data)
1151{
1152        struct evict_vma *arg = data;
1153        struct drm_i915_private *i915 = arg->vma->vm->i915;
1154        int err;
1155
1156        complete(&arg->completion);
1157
1158        mutex_lock(&i915->drm.struct_mutex);
1159
1160        /* Mark the fence register as dirty to force the mmio update. */
1161        err = i915_gem_object_set_tiling(arg->vma->obj, I915_TILING_Y, 512);
1162        if (err) {
1163                pr_err("Invalid Y-tiling settings; err:%d\n", err);
1164                goto out_unlock;
1165        }
1166
1167        err = i915_vma_pin_fence(arg->vma);
1168        if (err) {
1169                pr_err("Unable to pin Y-tiled fence; err:%d\n", err);
1170                goto out_unlock;
1171        }
1172
1173        i915_vma_unpin_fence(arg->vma);
1174
1175out_unlock:
1176        mutex_unlock(&i915->drm.struct_mutex);
1177
1178        return err;
1179}
1180
1181static int __igt_reset_evict_vma(struct drm_i915_private *i915,
1182                                 struct i915_address_space *vm,
1183                                 int (*fn)(void *),
1184                                 unsigned int flags)
1185{
1186        struct drm_i915_gem_object *obj;
1187        struct task_struct *tsk = NULL;
1188        struct i915_request *rq;
1189        struct evict_vma arg;
1190        struct hang h;
1191        int err;
1192
1193        if (!intel_engine_can_store_dword(i915->engine[RCS0]))
1194                return 0;
1195
1196        /* Check that we can recover an unbind stuck on a hanging request */
1197
1198        mutex_lock(&i915->drm.struct_mutex);
1199        err = hang_init(&h, i915);
1200        if (err)
1201                goto unlock;
1202
1203        obj = i915_gem_object_create_internal(i915, SZ_1M);
1204        if (IS_ERR(obj)) {
1205                err = PTR_ERR(obj);
1206                goto fini;
1207        }
1208
1209        if (flags & EXEC_OBJECT_NEEDS_FENCE) {
1210                err = i915_gem_object_set_tiling(obj, I915_TILING_X, 512);
1211                if (err) {
1212                        pr_err("Invalid X-tiling settings; err:%d\n", err);
1213                        goto out_obj;
1214                }
1215        }
1216
1217        arg.vma = i915_vma_instance(obj, vm, NULL);
1218        if (IS_ERR(arg.vma)) {
1219                err = PTR_ERR(arg.vma);
1220                goto out_obj;
1221        }
1222
1223        rq = hang_create_request(&h, i915->engine[RCS0]);
1224        if (IS_ERR(rq)) {
1225                err = PTR_ERR(rq);
1226                goto out_obj;
1227        }
1228
1229        err = i915_vma_pin(arg.vma, 0, 0,
1230                           i915_vma_is_ggtt(arg.vma) ?
1231                           PIN_GLOBAL | PIN_MAPPABLE :
1232                           PIN_USER);
1233        if (err) {
1234                i915_request_add(rq);
1235                goto out_obj;
1236        }
1237
1238        if (flags & EXEC_OBJECT_NEEDS_FENCE) {
1239                err = i915_vma_pin_fence(arg.vma);
1240                if (err) {
1241                        pr_err("Unable to pin X-tiled fence; err:%d\n", err);
1242                        i915_vma_unpin(arg.vma);
1243                        i915_request_add(rq);
1244                        goto out_obj;
1245                }
1246        }
1247
1248        i915_vma_lock(arg.vma);
1249        err = i915_vma_move_to_active(arg.vma, rq, flags);
1250        i915_vma_unlock(arg.vma);
1251
1252        if (flags & EXEC_OBJECT_NEEDS_FENCE)
1253                i915_vma_unpin_fence(arg.vma);
1254        i915_vma_unpin(arg.vma);
1255
1256        i915_request_get(rq);
1257        i915_request_add(rq);
1258        if (err)
1259                goto out_rq;
1260
1261        mutex_unlock(&i915->drm.struct_mutex);
1262
1263        if (!wait_until_running(&h, rq)) {
1264                struct drm_printer p = drm_info_printer(i915->drm.dev);
1265
1266                pr_err("%s: Failed to start request %llx, at %x\n",
1267                       __func__, rq->fence.seqno, hws_seqno(&h, rq));
1268                intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
1269
1270                i915_gem_set_wedged(i915);
1271                goto out_reset;
1272        }
1273
1274        init_completion(&arg.completion);
1275
1276        tsk = kthread_run(fn, &arg, "igt/evict_vma");
1277        if (IS_ERR(tsk)) {
1278                err = PTR_ERR(tsk);
1279                tsk = NULL;
1280                goto out_reset;
1281        }
1282        get_task_struct(tsk);
1283
1284        wait_for_completion(&arg.completion);
1285
1286        if (wait_for(!list_empty(&rq->fence.cb_list), 10)) {
1287                struct drm_printer p = drm_info_printer(i915->drm.dev);
1288
1289                pr_err("igt/evict_vma kthread did not wait\n");
1290                intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
1291
1292                i915_gem_set_wedged(i915);
1293                goto out_reset;
1294        }
1295
1296out_reset:
1297        igt_global_reset_lock(i915);
1298        fake_hangcheck(rq->i915, rq->engine->mask);
1299        igt_global_reset_unlock(i915);
1300
1301        if (tsk) {
1302                struct igt_wedge_me w;
1303
1304                /* The reset, even indirectly, should take less than 10ms. */
1305                igt_wedge_on_timeout(&w, i915, HZ / 10 /* 100ms timeout*/)
1306                        err = kthread_stop(tsk);
1307
1308                put_task_struct(tsk);
1309        }
1310
1311        mutex_lock(&i915->drm.struct_mutex);
1312out_rq:
1313        i915_request_put(rq);
1314out_obj:
1315        i915_gem_object_put(obj);
1316fini:
1317        hang_fini(&h);
1318unlock:
1319        mutex_unlock(&i915->drm.struct_mutex);
1320
1321        if (i915_reset_failed(i915))
1322                return -EIO;
1323
1324        return err;
1325}
1326
1327static int igt_reset_evict_ggtt(void *arg)
1328{
1329        struct drm_i915_private *i915 = arg;
1330
1331        return __igt_reset_evict_vma(i915, &i915->ggtt.vm,
1332                                     evict_vma, EXEC_OBJECT_WRITE);
1333}
1334
1335static int igt_reset_evict_ppgtt(void *arg)
1336{
1337        struct drm_i915_private *i915 = arg;
1338        struct i915_gem_context *ctx;
1339        struct drm_file *file;
1340        int err;
1341
1342        file = mock_file(i915);
1343        if (IS_ERR(file))
1344                return PTR_ERR(file);
1345
1346        mutex_lock(&i915->drm.struct_mutex);
1347        ctx = live_context(i915, file);
1348        mutex_unlock(&i915->drm.struct_mutex);
1349        if (IS_ERR(ctx)) {
1350                err = PTR_ERR(ctx);
1351                goto out;
1352        }
1353
1354        err = 0;
1355        if (ctx->vm) /* aliasing == global gtt locking, covered above */
1356                err = __igt_reset_evict_vma(i915, ctx->vm,
1357                                            evict_vma, EXEC_OBJECT_WRITE);
1358
1359out:
1360        mock_file_free(i915, file);
1361        return err;
1362}
1363
1364static int igt_reset_evict_fence(void *arg)
1365{
1366        struct drm_i915_private *i915 = arg;
1367
1368        return __igt_reset_evict_vma(i915, &i915->ggtt.vm,
1369                                     evict_fence, EXEC_OBJECT_NEEDS_FENCE);
1370}
1371
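/* Wait for every engine except @exclude to become idle. */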
1372static int wait_for_others(struct drm_i915_private *i915,
1373                           struct intel_engine_cs *exclude)
1374{
1375        struct intel_engine_cs *engine;
1376        enum intel_engine_id id;
1377
1378        for_each_engine(engine, i915, id) {
1379                if (engine == exclude)
1380                        continue;
1381
1382                if (!wait_for_idle(engine))
1383                        return -EIO;
1384        }
1385
1386        return 0;
1387}
1388
1389static int igt_reset_queue(void *arg)
1390{
1391        struct drm_i915_private *i915 = arg;
1392        struct intel_engine_cs *engine;
1393        enum intel_engine_id id;
1394        struct hang h;
1395        int err;
1396
1397        /* Check that we replay pending requests following a hang */
1398
1399        igt_global_reset_lock(i915);
1400
1401        mutex_lock(&i915->drm.struct_mutex);
1402        err = hang_init(&h, i915);
1403        if (err)
1404                goto unlock;
1405
1406        for_each_engine(engine, i915, id) {
1407                struct i915_request *prev;
1408                IGT_TIMEOUT(end_time);
1409                unsigned int count;
1410
1411                if (!intel_engine_can_store_dword(engine))
1412                        continue;
1413
1414                prev = hang_create_request(&h, engine);
1415                if (IS_ERR(prev)) {
1416                        err = PTR_ERR(prev);
1417                        goto fini;
1418                }
1419
1420                i915_request_get(prev);
1421                i915_request_add(prev);
1422
1423                count = 0;
1424                do {
1425                        struct i915_request *rq;
1426                        unsigned int reset_count;
1427
1428                        rq = hang_create_request(&h, engine);
1429                        if (IS_ERR(rq)) {
1430                                err = PTR_ERR(rq);
1431                                goto fini;
1432                        }
1433
1434                        i915_request_get(rq);
1435                        i915_request_add(rq);
1436
1437                        /*
1438                         * XXX We don't handle resetting the kernel context
1439                         * very well. If we trigger a device reset twice in
1440                         * quick succession while the kernel context is
1441                         * executing, we may end up skipping the breadcrumb.
1442                         * This is really only a problem for the selftest as
1443                         * normally there is a large interlude between resets
1444                         * (hangcheck), or we focus on resetting just one
1445                         * engine and so avoid repeatedly resetting innocents.
1446                         */
1447                        err = wait_for_others(i915, engine);
1448                        if (err) {
1449                                pr_err("%s(%s): Failed to idle other inactive engines after device reset\n",
1450                                       __func__, engine->name);
1451                                i915_request_put(rq);
1452                                i915_request_put(prev);
1453
1454                                GEM_TRACE_DUMP();
1455                                i915_gem_set_wedged(i915);
1456                                goto fini;
1457                        }
1458
1459                        if (!wait_until_running(&h, prev)) {
1460                                struct drm_printer p = drm_info_printer(i915->drm.dev);
1461
1462                                pr_err("%s(%s): Failed to start request %llx, at %x\n",
1463                                       __func__, engine->name,
1464                                       prev->fence.seqno, hws_seqno(&h, prev));
1465                                intel_engine_dump(engine, &p,
1466                                                  "%s\n", engine->name);
1467
1468                                i915_request_put(rq);
1469                                i915_request_put(prev);
1470
1471                                i915_gem_set_wedged(i915);
1472
1473                                err = -EIO;
1474                                goto fini;
1475                        }
1476
1477                        reset_count = fake_hangcheck(i915, BIT(id));
1478
1479                        if (prev->fence.error != -EIO) {
1480                                pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
1481                                       prev->fence.error);
1482                                i915_request_put(rq);
1483                                i915_request_put(prev);
1484                                err = -EINVAL;
1485                                goto fini;
1486                        }
1487
1488                        if (rq->fence.error) {
1489                                pr_err("Fence error status not zero [%d] after unrelated reset\n",
1490                                       rq->fence.error);
1491                                i915_request_put(rq);
1492                                i915_request_put(prev);
1493                                err = -EINVAL;
1494                                goto fini;
1495                        }
1496
1497                        if (i915_reset_count(&i915->gpu_error) == reset_count) {
1498                                pr_err("No GPU reset recorded!\n");
1499                                i915_request_put(rq);
1500                                i915_request_put(prev);
1501                                err = -EINVAL;
1502                                goto fini;
1503                        }
1504
1505                        i915_request_put(prev);
1506                        prev = rq;
1507                        count++;
1508                } while (time_before(jiffies, end_time));
1509                pr_info("%s: Completed %d resets\n", engine->name, count);
1510
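                    /* Terminate the spinning batch so the outstanding hanging requests can complete. */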
1511                *h.batch = MI_BATCH_BUFFER_END;
1512                i915_gem_chipset_flush(i915);
1513
1514                i915_request_put(prev);
1515
1516                err = igt_flush_test(i915, I915_WAIT_LOCKED);
1517                if (err)
1518                        break;
1519        }
1520
1521fini:
1522        hang_fini(&h);
1523unlock:
1524        mutex_unlock(&i915->drm.struct_mutex);
1525        igt_global_reset_unlock(i915);
1526
1527        if (i915_reset_failed(i915))
1528                return -EIO;
1529
1530        return err;
1531}
1532
1533static int igt_handle_error(void *arg)
1534{
1535        struct drm_i915_private *i915 = arg;
1536        struct intel_engine_cs *engine = i915->engine[RCS0];
1537        struct hang h;
1538        struct i915_request *rq;
1539        struct i915_gpu_state *error;
1540        int err;
1541
1542        /* Check that we can issue a global GPU and engine reset */
1543
1544        if (!intel_has_reset_engine(i915))
1545                return 0;
1546
1547        if (!engine || !intel_engine_can_store_dword(engine))
1548                return 0;
1549
1550        mutex_lock(&i915->drm.struct_mutex);
1551
1552        err = hang_init(&h, i915);
1553        if (err)
1554                goto err_unlock;
1555
1556        rq = hang_create_request(&h, engine);
1557        if (IS_ERR(rq)) {
1558                err = PTR_ERR(rq);
1559                goto err_fini;
1560        }
1561
1562        i915_request_get(rq);
1563        i915_request_add(rq);
1564
1565        if (!wait_until_running(&h, rq)) {
1566                struct drm_printer p = drm_info_printer(i915->drm.dev);
1567
1568                pr_err("%s: Failed to start request %llx, at %x\n",
1569                       __func__, rq->fence.seqno, hws_seqno(&h, rq));
1570                intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
1571
1572                i915_gem_set_wedged(i915);
1573
1574                err = -EIO;
1575                goto err_request;
1576        }
1577
1578        mutex_unlock(&i915->drm.struct_mutex);
1579
1580        /* Temporarily disable error capture */
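            /*
             * The non-NULL sentinel means any error state captured during
             * the reset below is discarded rather than stored.
             */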
1581        error = xchg(&i915->gpu_error.first_error, (void *)-1);
1582
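            /*
             * Declare the engine hung, as hangcheck would, and let the
             * error handler perform the engine reset.
             */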
1583        i915_handle_error(i915, engine->mask, 0, NULL);
1584
1585        xchg(&i915->gpu_error.first_error, error);
1586
1587        mutex_lock(&i915->drm.struct_mutex);
1588
1589        if (rq->fence.error != -EIO) {
1590                pr_err("Guilty request not identified!\n");
1591                err = -EINVAL;
1592                goto err_request;
1593        }
1594
1595err_request:
1596        i915_request_put(rq);
1597err_fini:
1598        hang_fini(&h);
1599err_unlock:
1600        mutex_unlock(&i915->drm.struct_mutex);
1601        return err;
1602}
1603
1604static int __igt_atomic_reset_engine(struct intel_engine_cs *engine,
1605                                     const struct igt_atomic_section *p,
1606                                     const char *mode)
1607{
1608        struct tasklet_struct * const t = &engine->execlists.tasklet;
1609        int err;
1610
1611        GEM_TRACE("i915_reset_engine(%s:%s) under %s\n",
1612                  engine->name, mode, p->name);
1613
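            /*
             * Perform the engine reset with the submission tasklet disabled
             * and the atomic section entered (e.g. irqs or preemption off),
             * checking that the reset path never sleeps.
             */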
1614        tasklet_disable_nosync(t);
1615        p->critical_section_begin();
1616
1617        err = i915_reset_engine(engine, NULL);
1618
1619        p->critical_section_end();
1620        tasklet_enable(t);
1621
1622        if (err)
1623                pr_err("i915_reset_engine(%s:%s) failed under %s\n",
1624                       engine->name, mode, p->name);
1625
1626        return err;
1627}
1628
1629static int igt_atomic_reset_engine(struct intel_engine_cs *engine,
1630                                   const struct igt_atomic_section *p)
1631{
1632        struct drm_i915_private *i915 = engine->i915;
1633        struct i915_request *rq;
1634        struct hang h;
1635        int err;
1636
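            /* First try resetting the engine while it is idle. */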
1637        err = __igt_atomic_reset_engine(engine, p, "idle");
1638        if (err)
1639                return err;
1640
1641        err = hang_init(&h, i915);
1642        if (err)
1643                return err;
1644
1645        rq = hang_create_request(&h, engine);
1646        if (IS_ERR(rq)) {
1647                err = PTR_ERR(rq);
1648                goto out;
1649        }
1650
1651        i915_request_get(rq);
1652        i915_request_add(rq);
1653
1654        if (wait_until_running(&h, rq)) {
1655                err = __igt_atomic_reset_engine(engine, p, "active");
1656        } else {
1657                pr_err("%s(%s): Failed to start request %llx, at %x\n",
1658                       __func__, engine->name,
1659                       rq->fence.seqno, hws_seqno(&h, rq));
1660                i915_gem_set_wedged(i915);
1661                err = -EIO;
1662        }
1663
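            /*
             * After a successful reset the cancelled request should complete
             * promptly; wedge the GPU if the wait times out so we cannot get
             * stuck here.
             */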
1664        if (err == 0) {
1665                struct igt_wedge_me w;
1666
1667                igt_wedge_on_timeout(&w, i915, HZ / 20 /* 50ms timeout */)
1668                        i915_request_wait(rq, 0, MAX_SCHEDULE_TIMEOUT);
1669                if (i915_reset_failed(i915))
1670                        err = -EIO;
1671        }
1672
1673        i915_request_put(rq);
1674out:
1675        hang_fini(&h);
1676        return err;
1677}
1678
1679static int igt_reset_engines_atomic(void *arg)
1680{
1681        struct drm_i915_private *i915 = arg;
1682        const typeof(*igt_atomic_phases) *p;
1683        int err = 0;
1684
1685        /* Check that the engine resets are usable from atomic context */
1686
1687        if (!intel_has_reset_engine(i915))
1688                return 0;
1689
1690        if (USES_GUC_SUBMISSION(i915))
1691                return 0;
1692
1693        igt_global_reset_lock(i915);
1694        mutex_lock(&i915->drm.struct_mutex);
1695
1696        /* Flush any requests before we get started and check basics */
1697        if (!igt_force_reset(i915))
1698                goto unlock;
1699
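            /* Exercise every engine under each atomic phase in turn. */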
1700        for (p = igt_atomic_phases; p->name; p++) {
1701                struct intel_engine_cs *engine;
1702                enum intel_engine_id id;
1703
1704                for_each_engine(engine, i915, id) {
1705                        err = igt_atomic_reset_engine(engine, p);
1706                        if (err)
1707                                goto out;
1708                }
1709        }
1710
1711out:
1712        /* As we poke around the guts, do a full reset before continuing. */
1713        igt_force_reset(i915);
1714
1715unlock:
1716        mutex_unlock(&i915->drm.struct_mutex);
1717        igt_global_reset_unlock(i915);
1718
1719        return err;
1720}
1721
1722int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
1723{
1724        static const struct i915_subtest tests[] = {
1725                SUBTEST(igt_hang_sanitycheck),
1726                SUBTEST(igt_reset_nop),
1727                SUBTEST(igt_reset_nop_engine),
1728                SUBTEST(igt_reset_idle_engine),
1729                SUBTEST(igt_reset_active_engine),
1730                SUBTEST(igt_reset_engines),
1731                SUBTEST(igt_reset_engines_atomic),
1732                SUBTEST(igt_reset_queue),
1733                SUBTEST(igt_reset_wait),
1734                SUBTEST(igt_reset_evict_ggtt),
1735                SUBTEST(igt_reset_evict_ppgtt),
1736                SUBTEST(igt_reset_evict_fence),
1737                SUBTEST(igt_handle_error),
1738        };
1739        intel_wakeref_t wakeref;
1740        bool saved_hangcheck;
1741        int err;
1742
1743        if (!intel_has_gpu_reset(i915))
1744                return 0;
1745
1746        if (i915_terminally_wedged(i915))
1747                return -EIO; /* we're long past hope of a successful reset */
1748
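            /*
             * Keep the device awake and disable the periodic hangcheck so
             * that the background worker does not reset the GPU behind our
             * back while we inject hangs.
             */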
1749        wakeref = intel_runtime_pm_get(&i915->runtime_pm);
1750        saved_hangcheck = fetch_and_zero(&i915_modparams.enable_hangcheck);
1751        drain_delayed_work(&i915->gpu_error.hangcheck_work); /* flush param */
1752
1753        err = i915_subtests(tests, i915);
1754
1755        mutex_lock(&i915->drm.struct_mutex);
1756        igt_flush_test(i915, I915_WAIT_LOCKED);
1757        mutex_unlock(&i915->drm.struct_mutex);
1758
1759        i915_modparams.enable_hangcheck = saved_hangcheck;
1760        intel_runtime_pm_put(&i915->runtime_pm, wakeref);
1761
1762        return err;
1763}
1764