linux/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
/*
 * Copyright © 2016 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include <linux/kthread.h>

#include "../i915_selftest.h"
#include "i915_random.h"
#include "igt_flush_test.h"
#include "igt_reset.h"
#include "igt_wedge_me.h"

#include "mock_context.h"
#include "mock_drm.h"

#define IGT_IDLE_TIMEOUT 50 /* ms; time to wait after flushing between tests */

struct hang {
        struct drm_i915_private *i915;
        struct drm_i915_gem_object *hws;
        struct drm_i915_gem_object *obj;
        struct i915_gem_context *ctx;
        u32 *seqno;
        u32 *batch;
};

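/*
 * Set up a non-bannable kernel context, a seqno scratch page (HWS) and a
 * batch object from which hanging requests can be constructed.
 */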
static int hang_init(struct hang *h, struct drm_i915_private *i915)
{
        void *vaddr;
        int err;

        memset(h, 0, sizeof(*h));
        h->i915 = i915;

        h->ctx = kernel_context(i915);
        if (IS_ERR(h->ctx))
                return PTR_ERR(h->ctx);

        GEM_BUG_ON(i915_gem_context_is_bannable(h->ctx));

        h->hws = i915_gem_object_create_internal(i915, PAGE_SIZE);
        if (IS_ERR(h->hws)) {
                err = PTR_ERR(h->hws);
                goto err_ctx;
        }

        h->obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
        if (IS_ERR(h->obj)) {
                err = PTR_ERR(h->obj);
                goto err_hws;
        }

        i915_gem_object_set_cache_coherency(h->hws, I915_CACHE_LLC);
        vaddr = i915_gem_object_pin_map(h->hws, I915_MAP_WB);
        if (IS_ERR(vaddr)) {
                err = PTR_ERR(vaddr);
                goto err_obj;
        }
        h->seqno = memset(vaddr, 0xff, PAGE_SIZE);

        vaddr = i915_gem_object_pin_map(h->obj,
                                        i915_coherent_map_type(i915));
        if (IS_ERR(vaddr)) {
                err = PTR_ERR(vaddr);
                goto err_unpin_hws;
        }
        h->batch = vaddr;

        return 0;

err_unpin_hws:
        i915_gem_object_unpin_map(h->hws);
err_obj:
        i915_gem_object_put(h->obj);
err_hws:
        i915_gem_object_put(h->hws);
err_ctx:
        kernel_context_close(h->ctx);
        return err;
}

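/*
 * Each request owns its own dword inside the HWS page, indexed by the
 * request's fence context, into which its batch writes the seqno.
 */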
static u64 hws_address(const struct i915_vma *hws,
                       const struct i915_request *rq)
{
        return hws->node.start + offset_in_page(sizeof(u32)*rq->fence.context);
}

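/*
 * Track the vma in the request and take an extra active reference on its
 * object so that it is not freed before the request is retired.
 */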
static int move_to_active(struct i915_vma *vma,
                          struct i915_request *rq,
                          unsigned int flags)
{
        int err;

        err = i915_vma_move_to_active(vma, rq, flags);
        if (err)
                return err;

        if (!i915_gem_object_has_active_reference(vma->obj)) {
                i915_gem_object_get(vma->obj);
                i915_gem_object_set_active_reference(vma->obj);
        }

        return 0;
}

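/*
 * Build a request whose batch first writes its seqno to the HWS page and
 * then spins forever by jumping back to its own start, until the batch is
 * rewritten with MI_BATCH_BUFFER_END.
 */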
static struct i915_request *
hang_create_request(struct hang *h, struct intel_engine_cs *engine)
{
        struct drm_i915_private *i915 = h->i915;
        struct i915_address_space *vm =
                h->ctx->ppgtt ? &h->ctx->ppgtt->vm : &i915->ggtt.vm;
        struct i915_request *rq = NULL;
        struct i915_vma *hws, *vma;
        unsigned int flags;
        u32 *batch;
        int err;

        if (i915_gem_object_is_active(h->obj)) {
                struct drm_i915_gem_object *obj;
                void *vaddr;

                obj = i915_gem_object_create_internal(h->i915, PAGE_SIZE);
                if (IS_ERR(obj))
                        return ERR_CAST(obj);

                vaddr = i915_gem_object_pin_map(obj,
                                                i915_coherent_map_type(h->i915));
                if (IS_ERR(vaddr)) {
                        i915_gem_object_put(obj);
                        return ERR_CAST(vaddr);
                }

                i915_gem_object_unpin_map(h->obj);
                i915_gem_object_put(h->obj);

                h->obj = obj;
                h->batch = vaddr;
        }

        vma = i915_vma_instance(h->obj, vm, NULL);
        if (IS_ERR(vma))
                return ERR_CAST(vma);

        hws = i915_vma_instance(h->hws, vm, NULL);
        if (IS_ERR(hws))
                return ERR_CAST(hws);

        err = i915_vma_pin(vma, 0, 0, PIN_USER);
        if (err)
                return ERR_PTR(err);

        err = i915_vma_pin(hws, 0, 0, PIN_USER);
        if (err)
                goto unpin_vma;

        rq = i915_request_alloc(engine, h->ctx);
        if (IS_ERR(rq)) {
                err = PTR_ERR(rq);
                goto unpin_hws;
        }

        err = move_to_active(vma, rq, 0);
        if (err)
                goto cancel_rq;

        err = move_to_active(hws, rq, 0);
        if (err)
                goto cancel_rq;

        batch = h->batch;
        if (INTEL_GEN(i915) >= 8) {
                *batch++ = MI_STORE_DWORD_IMM_GEN4;
                *batch++ = lower_32_bits(hws_address(hws, rq));
                *batch++ = upper_32_bits(hws_address(hws, rq));
                *batch++ = rq->fence.seqno;
                *batch++ = MI_ARB_CHECK;

                memset(batch, 0, 1024);
                batch += 1024 / sizeof(*batch);

                *batch++ = MI_ARB_CHECK;
                *batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
                *batch++ = lower_32_bits(vma->node.start);
                *batch++ = upper_32_bits(vma->node.start);
        } else if (INTEL_GEN(i915) >= 6) {
                *batch++ = MI_STORE_DWORD_IMM_GEN4;
                *batch++ = 0;
                *batch++ = lower_32_bits(hws_address(hws, rq));
                *batch++ = rq->fence.seqno;
                *batch++ = MI_ARB_CHECK;

                memset(batch, 0, 1024);
                batch += 1024 / sizeof(*batch);

                *batch++ = MI_ARB_CHECK;
                *batch++ = MI_BATCH_BUFFER_START | 1 << 8;
                *batch++ = lower_32_bits(vma->node.start);
        } else if (INTEL_GEN(i915) >= 4) {
                *batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
                *batch++ = 0;
                *batch++ = lower_32_bits(hws_address(hws, rq));
                *batch++ = rq->fence.seqno;
                *batch++ = MI_ARB_CHECK;

                memset(batch, 0, 1024);
                batch += 1024 / sizeof(*batch);

                *batch++ = MI_ARB_CHECK;
                *batch++ = MI_BATCH_BUFFER_START | 2 << 6;
                *batch++ = lower_32_bits(vma->node.start);
        } else {
                *batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
                *batch++ = lower_32_bits(hws_address(hws, rq));
                *batch++ = rq->fence.seqno;
                *batch++ = MI_ARB_CHECK;

                memset(batch, 0, 1024);
                batch += 1024 / sizeof(*batch);

                *batch++ = MI_ARB_CHECK;
                *batch++ = MI_BATCH_BUFFER_START | 2 << 6;
                *batch++ = lower_32_bits(vma->node.start);
        }
        *batch++ = MI_BATCH_BUFFER_END; /* not reached */
        i915_gem_chipset_flush(h->i915);

        if (rq->engine->emit_init_breadcrumb) {
                err = rq->engine->emit_init_breadcrumb(rq);
                if (err)
                        goto cancel_rq;
        }

        flags = 0;
        if (INTEL_GEN(vm->i915) <= 5)
                flags |= I915_DISPATCH_SECURE;

        err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags);

cancel_rq:
        if (err) {
                i915_request_skip(rq, err);
                i915_request_add(rq);
        }
unpin_hws:
        i915_vma_unpin(hws);
unpin_vma:
        i915_vma_unpin(vma);
        return err ? ERR_PTR(err) : rq;
}

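/* Read back the seqno that the request's batch wrote to the HWS page */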
static u32 hws_seqno(const struct hang *h, const struct i915_request *rq)
{
        return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]);
}

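/* Terminate any spinning batch, then release the objects from hang_init() */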
static void hang_fini(struct hang *h)
{
        *h->batch = MI_BATCH_BUFFER_END;
        i915_gem_chipset_flush(h->i915);

        i915_gem_object_unpin_map(h->obj);
        i915_gem_object_put(h->obj);

        i915_gem_object_unpin_map(h->hws);
        i915_gem_object_put(h->hws);

        kernel_context_close(h->ctx);

        igt_flush_test(h->i915, I915_WAIT_LOCKED);
}

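/* Wait for the batch to start executing, as signalled by its HWS seqno write */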
static bool wait_until_running(struct hang *h, struct i915_request *rq)
{
        return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq),
                                               rq->fence.seqno),
                             10) &&
                 wait_for(i915_seqno_passed(hws_seqno(h, rq),
                                            rq->fence.seqno),
                          1000));
}

static int igt_hang_sanitycheck(void *arg)
{
        struct drm_i915_private *i915 = arg;
        struct i915_request *rq;
        struct intel_engine_cs *engine;
        enum intel_engine_id id;
        struct hang h;
        int err;

        /* Basic check that we can execute our hanging batch */

        mutex_lock(&i915->drm.struct_mutex);
        err = hang_init(&h, i915);
        if (err)
                goto unlock;

        for_each_engine(engine, i915, id) {
                struct igt_wedge_me w;
                long timeout;

                if (!intel_engine_can_store_dword(engine))
                        continue;

                rq = hang_create_request(&h, engine);
                if (IS_ERR(rq)) {
                        err = PTR_ERR(rq);
                        pr_err("Failed to create request for %s, err=%d\n",
                               engine->name, err);
                        goto fini;
                }

                i915_request_get(rq);

                *h.batch = MI_BATCH_BUFFER_END;
                i915_gem_chipset_flush(i915);

                i915_request_add(rq);

                timeout = 0;
                igt_wedge_on_timeout(&w, i915, HZ / 10 /* 100ms timeout*/)
                        timeout = i915_request_wait(rq,
                                                    I915_WAIT_LOCKED,
                                                    MAX_SCHEDULE_TIMEOUT);
                if (i915_reset_failed(i915))
                        timeout = -EIO;

                i915_request_put(rq);

                if (timeout < 0) {
                        err = timeout;
                        pr_err("Wait for request failed on %s, err=%d\n",
                               engine->name, err);
                        goto fini;
                }
        }

fini:
        hang_fini(&h);
unlock:
        mutex_unlock(&i915->drm.struct_mutex);
        return err;
}

static int igt_global_reset(void *arg)
{
        struct drm_i915_private *i915 = arg;
        unsigned int reset_count;
        int err = 0;

        /* Check that we can issue a global GPU reset */

        igt_global_reset_lock(i915);

        reset_count = i915_reset_count(&i915->gpu_error);

        i915_reset(i915, ALL_ENGINES, NULL);

        if (i915_reset_count(&i915->gpu_error) == reset_count) {
                pr_err("No GPU reset recorded!\n");
                err = -EINVAL;
        }

        igt_global_reset_unlock(i915);

        if (i915_reset_failed(i915))
                err = -EIO;

        return err;
}

static int igt_wedged_reset(void *arg)
{
        struct drm_i915_private *i915 = arg;
        intel_wakeref_t wakeref;

        /* Check that we can recover a wedged device with a GPU reset */

        igt_global_reset_lock(i915);
        wakeref = intel_runtime_pm_get(i915);

        i915_gem_set_wedged(i915);

        GEM_BUG_ON(!i915_reset_failed(i915));
        i915_reset(i915, ALL_ENGINES, NULL);

        intel_runtime_pm_put(i915, wakeref);
        igt_global_reset_unlock(i915);

        return i915_reset_failed(i915) ? -EIO : 0;
}

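/* Allow the engine up to IGT_IDLE_TIMEOUT ms to settle back to idle */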
static bool wait_for_idle(struct intel_engine_cs *engine)
{
        return wait_for(intel_engine_is_idle(engine), IGT_IDLE_TIMEOUT) == 0;
}

static int igt_reset_nop(void *arg)
{
        struct drm_i915_private *i915 = arg;
        struct intel_engine_cs *engine;
        struct i915_gem_context *ctx;
        unsigned int reset_count, count;
        enum intel_engine_id id;
        intel_wakeref_t wakeref;
        struct drm_file *file;
        IGT_TIMEOUT(end_time);
        int err = 0;

        /* Check that we can reset during non-user portions of requests */

        file = mock_file(i915);
        if (IS_ERR(file))
                return PTR_ERR(file);

        mutex_lock(&i915->drm.struct_mutex);
        ctx = live_context(i915, file);
        mutex_unlock(&i915->drm.struct_mutex);
        if (IS_ERR(ctx)) {
                err = PTR_ERR(ctx);
                goto out;
        }

        i915_gem_context_clear_bannable(ctx);
        wakeref = intel_runtime_pm_get(i915);
        reset_count = i915_reset_count(&i915->gpu_error);
        count = 0;
        do {
                mutex_lock(&i915->drm.struct_mutex);
                for_each_engine(engine, i915, id) {
                        int i;

                        for (i = 0; i < 16; i++) {
                                struct i915_request *rq;

                                rq = i915_request_alloc(engine, ctx);
                                if (IS_ERR(rq)) {
                                        err = PTR_ERR(rq);
                                        break;
                                }

                                i915_request_add(rq);
                        }
                }
                mutex_unlock(&i915->drm.struct_mutex);

                igt_global_reset_lock(i915);
                i915_reset(i915, ALL_ENGINES, NULL);
                igt_global_reset_unlock(i915);
                if (i915_reset_failed(i915)) {
                        err = -EIO;
                        break;
                }

                if (i915_reset_count(&i915->gpu_error) !=
                    reset_count + ++count) {
                        pr_err("Full GPU reset not recorded!\n");
                        err = -EINVAL;
                        break;
                }

                if (!i915_reset_flush(i915)) {
                        struct drm_printer p =
                                drm_info_printer(i915->drm.dev);

                        pr_err("%s failed to idle after reset\n",
                               engine->name);
                        intel_engine_dump(engine, &p,
                                          "%s\n", engine->name);

                        err = -EIO;
                        break;
                }

                err = igt_flush_test(i915, 0);
                if (err)
                        break;
        } while (time_before(jiffies, end_time));
        pr_info("%s: %d resets\n", __func__, count);

        mutex_lock(&i915->drm.struct_mutex);
        err = igt_flush_test(i915, I915_WAIT_LOCKED);
        mutex_unlock(&i915->drm.struct_mutex);

        intel_runtime_pm_put(i915, wakeref);

out:
        mock_file_free(i915, file);
        if (i915_reset_failed(i915))
                err = -EIO;
        return err;
}

static int igt_reset_nop_engine(void *arg)
{
        struct drm_i915_private *i915 = arg;
        struct intel_engine_cs *engine;
        struct i915_gem_context *ctx;
        enum intel_engine_id id;
        intel_wakeref_t wakeref;
        struct drm_file *file;
        int err = 0;

        /* Check that we can engine-reset during non-user portions */

        if (!intel_has_reset_engine(i915))
                return 0;

        file = mock_file(i915);
        if (IS_ERR(file))
                return PTR_ERR(file);

        mutex_lock(&i915->drm.struct_mutex);
        ctx = live_context(i915, file);
        mutex_unlock(&i915->drm.struct_mutex);
        if (IS_ERR(ctx)) {
                err = PTR_ERR(ctx);
                goto out;
        }

        i915_gem_context_clear_bannable(ctx);
        wakeref = intel_runtime_pm_get(i915);
        for_each_engine(engine, i915, id) {
                unsigned int reset_count, reset_engine_count;
                unsigned int count;
                IGT_TIMEOUT(end_time);

                reset_count = i915_reset_count(&i915->gpu_error);
                reset_engine_count = i915_reset_engine_count(&i915->gpu_error,
                                                             engine);
                count = 0;

                set_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
                do {
                        int i;

                        if (!wait_for_idle(engine)) {
                                pr_err("%s failed to idle before reset\n",
                                       engine->name);
                                err = -EIO;
                                break;
                        }

                        mutex_lock(&i915->drm.struct_mutex);
                        for (i = 0; i < 16; i++) {
                                struct i915_request *rq;

                                rq = i915_request_alloc(engine, ctx);
                                if (IS_ERR(rq)) {
                                        err = PTR_ERR(rq);
                                        break;
                                }

                                i915_request_add(rq);
                        }
                        mutex_unlock(&i915->drm.struct_mutex);

                        err = i915_reset_engine(engine, NULL);
                        if (err) {
                                pr_err("i915_reset_engine failed\n");
                                break;
                        }

                        if (i915_reset_count(&i915->gpu_error) != reset_count) {
                                pr_err("Full GPU reset recorded! (engine reset expected)\n");
                                err = -EINVAL;
                                break;
                        }

                        if (i915_reset_engine_count(&i915->gpu_error, engine) !=
                            reset_engine_count + ++count) {
                                pr_err("%s engine reset not recorded!\n",
                                       engine->name);
                                err = -EINVAL;
                                break;
                        }

                        if (!i915_reset_flush(i915)) {
                                struct drm_printer p =
                                        drm_info_printer(i915->drm.dev);

                                pr_err("%s failed to idle after reset\n",
                                       engine->name);
                                intel_engine_dump(engine, &p,
                                                  "%s\n", engine->name);

                                err = -EIO;
                                break;
                        }
                } while (time_before(jiffies, end_time));
                clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
                pr_info("%s(%s): %d resets\n", __func__, engine->name, count);

                if (err)
                        break;

                err = igt_flush_test(i915, 0);
                if (err)
                        break;
        }

        mutex_lock(&i915->drm.struct_mutex);
        err = igt_flush_test(i915, I915_WAIT_LOCKED);
        mutex_unlock(&i915->drm.struct_mutex);

        intel_runtime_pm_put(i915, wakeref);
out:
        mock_file_free(i915, file);
        if (i915_reset_failed(i915))
                err = -EIO;
        return err;
}

static int __igt_reset_engine(struct drm_i915_private *i915, bool active)
{
        struct intel_engine_cs *engine;
        enum intel_engine_id id;
        struct hang h;
        int err = 0;

        /* Check that we can issue an engine reset on an idle engine (no-op) */

        if (!intel_has_reset_engine(i915))
                return 0;

        if (active) {
                mutex_lock(&i915->drm.struct_mutex);
                err = hang_init(&h, i915);
                mutex_unlock(&i915->drm.struct_mutex);
                if (err)
                        return err;
        }

        for_each_engine(engine, i915, id) {
                unsigned int reset_count, reset_engine_count;
                IGT_TIMEOUT(end_time);

                if (active && !intel_engine_can_store_dword(engine))
                        continue;

                if (!wait_for_idle(engine)) {
                        pr_err("%s failed to idle before reset\n",
                               engine->name);
                        err = -EIO;
                        break;
                }

                reset_count = i915_reset_count(&i915->gpu_error);
                reset_engine_count = i915_reset_engine_count(&i915->gpu_error,
                                                             engine);

                set_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
                do {
                        if (active) {
                                struct i915_request *rq;

                                mutex_lock(&i915->drm.struct_mutex);
                                rq = hang_create_request(&h, engine);
                                if (IS_ERR(rq)) {
                                        err = PTR_ERR(rq);
                                        mutex_unlock(&i915->drm.struct_mutex);
                                        break;
                                }

                                i915_request_get(rq);
                                i915_request_add(rq);
                                mutex_unlock(&i915->drm.struct_mutex);

                                if (!wait_until_running(&h, rq)) {
                                        struct drm_printer p = drm_info_printer(i915->drm.dev);

                                        pr_err("%s: Failed to start request %llx, at %x\n",
                                               __func__, rq->fence.seqno, hws_seqno(&h, rq));
                                        intel_engine_dump(engine, &p,
                                                          "%s\n", engine->name);

                                        i915_request_put(rq);
                                        err = -EIO;
                                        break;
                                }

                                i915_request_put(rq);
                        }

                        err = i915_reset_engine(engine, NULL);
                        if (err) {
                                pr_err("i915_reset_engine failed\n");
                                break;
                        }

                        if (i915_reset_count(&i915->gpu_error) != reset_count) {
                                pr_err("Full GPU reset recorded! (engine reset expected)\n");
                                err = -EINVAL;
                                break;
                        }

                        if (i915_reset_engine_count(&i915->gpu_error, engine) !=
                            ++reset_engine_count) {
                                pr_err("%s engine reset not recorded!\n",
                                       engine->name);
                                err = -EINVAL;
                                break;
                        }

                        if (!i915_reset_flush(i915)) {
                                struct drm_printer p =
                                        drm_info_printer(i915->drm.dev);

                                pr_err("%s failed to idle after reset\n",
                                       engine->name);
                                intel_engine_dump(engine, &p,
                                                  "%s\n", engine->name);

                                err = -EIO;
                                break;
                        }
                } while (time_before(jiffies, end_time));
                clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);

                if (err)
                        break;

                err = igt_flush_test(i915, 0);
                if (err)
                        break;
        }

        if (i915_reset_failed(i915))
                err = -EIO;

        if (active) {
                mutex_lock(&i915->drm.struct_mutex);
                hang_fini(&h);
                mutex_unlock(&i915->drm.struct_mutex);
        }

        return err;
}

static int igt_reset_idle_engine(void *arg)
{
        return __igt_reset_engine(arg, false);
}

static int igt_reset_active_engine(void *arg)
{
        return __igt_reset_engine(arg, true);
}

struct active_engine {
        struct task_struct *task;
        struct intel_engine_cs *engine;
        unsigned long resets;
        unsigned int flags;
};

#define TEST_ACTIVE     BIT(0)
#define TEST_OTHERS     BIT(1)
#define TEST_SELF       BIT(2)
#define TEST_PRIORITY   BIT(3)

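/* Wait for a background request to complete, wedging the GPU if it hangs */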
static int active_request_put(struct i915_request *rq)
{
        int err = 0;

        if (!rq)
                return 0;

        if (i915_request_wait(rq, 0, 5 * HZ) < 0) {
                GEM_TRACE("%s timed out waiting for completion of fence %llx:%lld\n",
                          rq->engine->name,
                          rq->fence.context,
                          rq->fence.seqno);
                GEM_TRACE_DUMP();

                i915_gem_set_wedged(rq->i915);
                err = -EIO;
        }

        i915_request_put(rq);

        return err;
}

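/*
 * Background thread that keeps its engine busy with a steady stream of
 * requests across several contexts while resets are injected elsewhere.
 */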
static int active_engine(void *data)
{
        I915_RND_STATE(prng);
        struct active_engine *arg = data;
        struct intel_engine_cs *engine = arg->engine;
        struct i915_request *rq[8] = {};
        struct i915_gem_context *ctx[ARRAY_SIZE(rq)];
        struct drm_file *file;
        unsigned long count = 0;
        int err = 0;

        file = mock_file(engine->i915);
        if (IS_ERR(file))
                return PTR_ERR(file);

        for (count = 0; count < ARRAY_SIZE(ctx); count++) {
                mutex_lock(&engine->i915->drm.struct_mutex);
                ctx[count] = live_context(engine->i915, file);
                mutex_unlock(&engine->i915->drm.struct_mutex);
                if (IS_ERR(ctx[count])) {
                        err = PTR_ERR(ctx[count]);
                        while (--count)
                                i915_gem_context_put(ctx[count]);
                        goto err_file;
                }
        }

        while (!kthread_should_stop()) {
                unsigned int idx = count++ & (ARRAY_SIZE(rq) - 1);
                struct i915_request *old = rq[idx];
                struct i915_request *new;

                mutex_lock(&engine->i915->drm.struct_mutex);
                new = i915_request_alloc(engine, ctx[idx]);
                if (IS_ERR(new)) {
                        mutex_unlock(&engine->i915->drm.struct_mutex);
                        err = PTR_ERR(new);
                        break;
                }

                if (arg->flags & TEST_PRIORITY)
                        ctx[idx]->sched.priority =
                                i915_prandom_u32_max_state(512, &prng);

                rq[idx] = i915_request_get(new);
                i915_request_add(new);
                mutex_unlock(&engine->i915->drm.struct_mutex);

                err = active_request_put(old);
                if (err)
                        break;

                cond_resched();
        }

        for (count = 0; count < ARRAY_SIZE(rq); count++) {
                int err__ = active_request_put(rq[count]);

                /* Keep the first error */
                if (!err)
                        err = err__;
        }

err_file:
        mock_file_free(engine->i915, file);
        return err;
}

static int __igt_reset_engines(struct drm_i915_private *i915,
                               const char *test_name,
                               unsigned int flags)
{
        struct intel_engine_cs *engine, *other;
        enum intel_engine_id id, tmp;
        struct hang h;
        int err = 0;

        /* Check that issuing a reset on one engine does not interfere
         * with any other engine.
         */

        if (!intel_has_reset_engine(i915))
                return 0;

        if (flags & TEST_ACTIVE) {
                mutex_lock(&i915->drm.struct_mutex);
                err = hang_init(&h, i915);
                mutex_unlock(&i915->drm.struct_mutex);
                if (err)
                        return err;

                if (flags & TEST_PRIORITY)
                        h.ctx->sched.priority = 1024;
        }

        for_each_engine(engine, i915, id) {
                struct active_engine threads[I915_NUM_ENGINES] = {};
                unsigned long global = i915_reset_count(&i915->gpu_error);
                unsigned long count = 0, reported;
                IGT_TIMEOUT(end_time);

                if (flags & TEST_ACTIVE &&
                    !intel_engine_can_store_dword(engine))
                        continue;

                if (!wait_for_idle(engine)) {
                        pr_err("i915_reset_engine(%s:%s): failed to idle before reset\n",
                               engine->name, test_name);
                        err = -EIO;
                        break;
                }

                memset(threads, 0, sizeof(threads));
                for_each_engine(other, i915, tmp) {
                        struct task_struct *tsk;

                        threads[tmp].resets =
                                i915_reset_engine_count(&i915->gpu_error,
                                                        other);

                        if (!(flags & TEST_OTHERS))
                                continue;

                        if (other == engine && !(flags & TEST_SELF))
                                continue;

                        threads[tmp].engine = other;
                        threads[tmp].flags = flags;

                        tsk = kthread_run(active_engine, &threads[tmp],
                                          "igt/%s", other->name);
                        if (IS_ERR(tsk)) {
                                err = PTR_ERR(tsk);
                                goto unwind;
                        }

                        threads[tmp].task = tsk;
                        get_task_struct(tsk);
                }

                set_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
                do {
                        struct i915_request *rq = NULL;

                        if (flags & TEST_ACTIVE) {
                                mutex_lock(&i915->drm.struct_mutex);
                                rq = hang_create_request(&h, engine);
                                if (IS_ERR(rq)) {
                                        err = PTR_ERR(rq);
                                        mutex_unlock(&i915->drm.struct_mutex);
                                        break;
                                }

                                i915_request_get(rq);
                                i915_request_add(rq);
                                mutex_unlock(&i915->drm.struct_mutex);

                                if (!wait_until_running(&h, rq)) {
                                        struct drm_printer p = drm_info_printer(i915->drm.dev);

                                        pr_err("%s: Failed to start request %llx, at %x\n",
                                               __func__, rq->fence.seqno, hws_seqno(&h, rq));
                                        intel_engine_dump(engine, &p,
                                                          "%s\n", engine->name);

                                        i915_request_put(rq);
                                        err = -EIO;
                                        break;
                                }
                        }

                        err = i915_reset_engine(engine, NULL);
                        if (err) {
                                pr_err("i915_reset_engine(%s:%s): failed, err=%d\n",
                                       engine->name, test_name, err);
                                break;
                        }

                        count++;

                        if (rq) {
                                if (i915_request_wait(rq, 0, HZ / 5) < 0) {
                                        struct drm_printer p =
                                                drm_info_printer(i915->drm.dev);

                                        pr_err("i915_reset_engine(%s:%s):"
                                               " failed to complete request after reset\n",
                                               engine->name, test_name);
                                        intel_engine_dump(engine, &p,
                                                          "%s\n", engine->name);
                                        i915_request_put(rq);

                                        GEM_TRACE_DUMP();
                                        i915_gem_set_wedged(i915);
                                        err = -EIO;
                                        break;
                                }

                                i915_request_put(rq);
                        }

                        if (!(flags & TEST_SELF) && !wait_for_idle(engine)) {
                                struct drm_printer p =
                                        drm_info_printer(i915->drm.dev);

                                pr_err("i915_reset_engine(%s:%s):"
                                       " failed to idle after reset\n",
                                       engine->name, test_name);
                                intel_engine_dump(engine, &p,
                                                  "%s\n", engine->name);

                                err = -EIO;
                                break;
                        }
                } while (time_before(jiffies, end_time));
                clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
                pr_info("i915_reset_engine(%s:%s): %lu resets\n",
                        engine->name, test_name, count);

                reported = i915_reset_engine_count(&i915->gpu_error, engine);
                reported -= threads[engine->id].resets;
                if (reported != count) {
                        pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu\n",
                               engine->name, test_name, count, reported);
                        if (!err)
                                err = -EINVAL;
                }

unwind:
                for_each_engine(other, i915, tmp) {
                        int ret;

                        if (!threads[tmp].task)
                                continue;

                        ret = kthread_stop(threads[tmp].task);
                        if (ret) {
                                pr_err("kthread for other engine %s failed, err=%d\n",
                                       other->name, ret);
                                if (!err)
                                        err = ret;
                        }
                        put_task_struct(threads[tmp].task);

                        if (other != engine &&
                            threads[tmp].resets !=
                            i915_reset_engine_count(&i915->gpu_error, other)) {
                                pr_err("Innocent engine %s was reset (count=%ld)\n",
                                       other->name,
                                       i915_reset_engine_count(&i915->gpu_error,
                                                               other) -
                                       threads[tmp].resets);
                                if (!err)
                                        err = -EINVAL;
                        }
                }

                if (global != i915_reset_count(&i915->gpu_error)) {
                        pr_err("Global reset (count=%ld)!\n",
                               i915_reset_count(&i915->gpu_error) - global);
                        if (!err)
                                err = -EINVAL;
                }

                if (err)
                        break;

                err = igt_flush_test(i915, 0);
                if (err)
                        break;
        }

        if (i915_reset_failed(i915))
                err = -EIO;

        if (flags & TEST_ACTIVE) {
                mutex_lock(&i915->drm.struct_mutex);
                hang_fini(&h);
                mutex_unlock(&i915->drm.struct_mutex);
        }

        return err;
}

static int igt_reset_engines(void *arg)
{
        static const struct {
                const char *name;
                unsigned int flags;
        } phases[] = {
                { "idle", 0 },
                { "active", TEST_ACTIVE },
                { "others-idle", TEST_OTHERS },
                { "others-active", TEST_OTHERS | TEST_ACTIVE },
                {
                        "others-priority",
                        TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY
                },
                {
                        "self-priority",
                        TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY | TEST_SELF,
                },
                { }
        };
        struct drm_i915_private *i915 = arg;
        typeof(*phases) *p;
        int err;

        for (p = phases; p->name; p++) {
                if (p->flags & TEST_PRIORITY) {
                        if (!(i915->caps.scheduler & I915_SCHEDULER_CAP_PRIORITY))
                                continue;
                }

                err = __igt_reset_engines(arg, p->name, p->flags);
                if (err)
                        return err;
        }

        return 0;
}

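/* Simulate hangcheck firing: reset now and return the prior reset count */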
static u32 fake_hangcheck(struct drm_i915_private *i915,
                          intel_engine_mask_t mask)
{
        u32 count = i915_reset_count(&i915->gpu_error);

        i915_reset(i915, mask, NULL);

        return count;
}

static int igt_reset_wait(void *arg)
{
        struct drm_i915_private *i915 = arg;
        struct i915_request *rq;
        unsigned int reset_count;
        struct hang h;
        long timeout;
        int err;

        if (!intel_engine_can_store_dword(i915->engine[RCS0]))
                return 0;

        /* Check that we detect a stuck waiter and issue a reset */

        igt_global_reset_lock(i915);

        mutex_lock(&i915->drm.struct_mutex);
        err = hang_init(&h, i915);
        if (err)
                goto unlock;

        rq = hang_create_request(&h, i915->engine[RCS0]);
        if (IS_ERR(rq)) {
                err = PTR_ERR(rq);
                goto fini;
        }

        i915_request_get(rq);
        i915_request_add(rq);

        if (!wait_until_running(&h, rq)) {
                struct drm_printer p = drm_info_printer(i915->drm.dev);

                pr_err("%s: Failed to start request %llx, at %x\n",
                       __func__, rq->fence.seqno, hws_seqno(&h, rq));
                intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

                i915_gem_set_wedged(i915);

                err = -EIO;
                goto out_rq;
        }

        reset_count = fake_hangcheck(i915, ALL_ENGINES);

        timeout = i915_request_wait(rq, I915_WAIT_LOCKED, 10);
        if (timeout < 0) {
                pr_err("i915_request_wait failed on a stuck request: err=%ld\n",
                       timeout);
                err = timeout;
                goto out_rq;
        }

        if (i915_reset_count(&i915->gpu_error) == reset_count) {
                pr_err("No GPU reset recorded!\n");
                err = -EINVAL;
                goto out_rq;
        }

out_rq:
        i915_request_put(rq);
fini:
        hang_fini(&h);
unlock:
        mutex_unlock(&i915->drm.struct_mutex);
        igt_global_reset_unlock(i915);

        if (i915_reset_failed(i915))
                return -EIO;

        return err;
}

struct evict_vma {
        struct completion completion;
        struct i915_vma *vma;
};

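/* Worker that evicts the target node, which must wait upon the hanging batch */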
static int evict_vma(void *data)
{
        struct evict_vma *arg = data;
        struct i915_address_space *vm = arg->vma->vm;
        struct drm_i915_private *i915 = vm->i915;
        struct drm_mm_node evict = arg->vma->node;
        int err;

        complete(&arg->completion);

        mutex_lock(&i915->drm.struct_mutex);
        err = i915_gem_evict_for_node(vm, &evict, 0);
        mutex_unlock(&i915->drm.struct_mutex);

        return err;
}

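/*
 * Worker that dirties and repins the fence on the active vma, an update
 * that is stuck behind the hanging request until that request is reset.
 */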
static int evict_fence(void *data)
{
        struct evict_vma *arg = data;
        struct drm_i915_private *i915 = arg->vma->vm->i915;
        int err;

        complete(&arg->completion);

        mutex_lock(&i915->drm.struct_mutex);

        /* Mark the fence register as dirty to force the mmio update. */
        err = i915_gem_object_set_tiling(arg->vma->obj, I915_TILING_Y, 512);
        if (err) {
                pr_err("Invalid Y-tiling settings; err:%d\n", err);
                goto out_unlock;
        }

        err = i915_vma_pin_fence(arg->vma);
        if (err) {
                pr_err("Unable to pin Y-tiled fence; err:%d\n", err);
                goto out_unlock;
        }

        i915_vma_unpin_fence(arg->vma);

out_unlock:
        mutex_unlock(&i915->drm.struct_mutex);

        return err;
}

static int __igt_reset_evict_vma(struct drm_i915_private *i915,
                                 struct i915_address_space *vm,
                                 int (*fn)(void *),
                                 unsigned int flags)
{
        struct drm_i915_gem_object *obj;
        struct task_struct *tsk = NULL;
        struct i915_request *rq;
        struct evict_vma arg;
        struct hang h;
        int err;

        if (!intel_engine_can_store_dword(i915->engine[RCS0]))
                return 0;

        /* Check that we can recover an unbind stuck on a hanging request */

        mutex_lock(&i915->drm.struct_mutex);
        err = hang_init(&h, i915);
        if (err)
                goto unlock;

        obj = i915_gem_object_create_internal(i915, SZ_1M);
        if (IS_ERR(obj)) {
                err = PTR_ERR(obj);
                goto fini;
        }

        if (flags & EXEC_OBJECT_NEEDS_FENCE) {
                err = i915_gem_object_set_tiling(obj, I915_TILING_X, 512);
                if (err) {
                        pr_err("Invalid X-tiling settings; err:%d\n", err);
                        goto out_obj;
                }
        }

        arg.vma = i915_vma_instance(obj, vm, NULL);
        if (IS_ERR(arg.vma)) {
                err = PTR_ERR(arg.vma);
                goto out_obj;
        }

        rq = hang_create_request(&h, i915->engine[RCS0]);
        if (IS_ERR(rq)) {
                err = PTR_ERR(rq);
                goto out_obj;
        }

        err = i915_vma_pin(arg.vma, 0, 0,
                           i915_vma_is_ggtt(arg.vma) ?
                           PIN_GLOBAL | PIN_MAPPABLE :
                           PIN_USER);
        if (err) {
                i915_request_add(rq);
                goto out_obj;
        }

        if (flags & EXEC_OBJECT_NEEDS_FENCE) {
                err = i915_vma_pin_fence(arg.vma);
                if (err) {
                        pr_err("Unable to pin X-tiled fence; err:%d\n", err);
                        i915_vma_unpin(arg.vma);
                        i915_request_add(rq);
                        goto out_obj;
                }
        }

        err = i915_vma_move_to_active(arg.vma, rq, flags);

        if (flags & EXEC_OBJECT_NEEDS_FENCE)
                i915_vma_unpin_fence(arg.vma);
        i915_vma_unpin(arg.vma);

        i915_request_get(rq);
        i915_request_add(rq);
        if (err)
                goto out_rq;

        mutex_unlock(&i915->drm.struct_mutex);

        if (!wait_until_running(&h, rq)) {
                struct drm_printer p = drm_info_printer(i915->drm.dev);

                pr_err("%s: Failed to start request %llx, at %x\n",
                       __func__, rq->fence.seqno, hws_seqno(&h, rq));
                intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

                i915_gem_set_wedged(i915);
                goto out_reset;
        }

        init_completion(&arg.completion);

        tsk = kthread_run(fn, &arg, "igt/evict_vma");
        if (IS_ERR(tsk)) {
                err = PTR_ERR(tsk);
                tsk = NULL;
                goto out_reset;
        }
        get_task_struct(tsk);

        wait_for_completion(&arg.completion);

        if (wait_for(!list_empty(&rq->fence.cb_list), 10)) {
                struct drm_printer p = drm_info_printer(i915->drm.dev);

                pr_err("igt/evict_vma kthread did not wait\n");
                intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

                i915_gem_set_wedged(i915);
                goto out_reset;
        }

out_reset:
        igt_global_reset_lock(i915);
        fake_hangcheck(rq->i915, rq->engine->mask);
        igt_global_reset_unlock(i915);

        if (tsk) {
                struct igt_wedge_me w;

                /* The reset, even indirectly, should take less than 10ms. */
                igt_wedge_on_timeout(&w, i915, HZ / 10 /* 100ms timeout*/)
                        err = kthread_stop(tsk);

                put_task_struct(tsk);
        }

        mutex_lock(&i915->drm.struct_mutex);
out_rq:
        i915_request_put(rq);
out_obj:
        i915_gem_object_put(obj);
fini:
        hang_fini(&h);
unlock:
        mutex_unlock(&i915->drm.struct_mutex);

        if (i915_reset_failed(i915))
                return -EIO;

        return err;
}

static int igt_reset_evict_ggtt(void *arg)
{
        struct drm_i915_private *i915 = arg;

        return __igt_reset_evict_vma(i915, &i915->ggtt.vm,
                                     evict_vma, EXEC_OBJECT_WRITE);
}

static int igt_reset_evict_ppgtt(void *arg)
{
        struct drm_i915_private *i915 = arg;
        struct i915_gem_context *ctx;
        struct drm_file *file;
        int err;

        file = mock_file(i915);
        if (IS_ERR(file))
                return PTR_ERR(file);

        mutex_lock(&i915->drm.struct_mutex);
        ctx = live_context(i915, file);
        mutex_unlock(&i915->drm.struct_mutex);
        if (IS_ERR(ctx)) {
                err = PTR_ERR(ctx);
                goto out;
        }

        err = 0;
        if (ctx->ppgtt) /* aliasing == global gtt locking, covered above */
                err = __igt_reset_evict_vma(i915, &ctx->ppgtt->vm,
                                            evict_vma, EXEC_OBJECT_WRITE);

out:
        mock_file_free(i915, file);
        return err;
}

static int igt_reset_evict_fence(void *arg)
{
        struct drm_i915_private *i915 = arg;

        return __igt_reset_evict_vma(i915, &i915->ggtt.vm,
                                     evict_fence, EXEC_OBJECT_NEEDS_FENCE);
}

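/* Wait for every engine other than @exclude to idle */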
1452static int wait_for_others(struct drm_i915_private *i915,
1453                           struct intel_engine_cs *exclude)
1454{
1455        struct intel_engine_cs *engine;
1456        enum intel_engine_id id;
1457
1458        for_each_engine(engine, i915, id) {
1459                if (engine == exclude)
1460                        continue;
1461
1462                if (!wait_for_idle(engine))
1463                        return -EIO;
1464        }
1465
1466        return 0;
1467}
1468
1469static int igt_reset_queue(void *arg)
1470{
1471        struct drm_i915_private *i915 = arg;
1472        struct intel_engine_cs *engine;
1473        enum intel_engine_id id;
1474        struct hang h;
1475        int err;
1476
1477        /* Check that we replay pending requests following a hang */
1478
1479        igt_global_reset_lock(i915);
1480
1481        mutex_lock(&i915->drm.struct_mutex);
1482        err = hang_init(&h, i915);
1483        if (err)
1484                goto unlock;
1485
1486        for_each_engine(engine, i915, id) {
1487                struct i915_request *prev;
1488                IGT_TIMEOUT(end_time);
1489                unsigned int count;
1490
1491                if (!intel_engine_can_store_dword(engine))
1492                        continue;
1493
1494                prev = hang_create_request(&h, engine);
1495                if (IS_ERR(prev)) {
1496                        err = PTR_ERR(prev);
1497                        goto fini;
1498                }
1499
1500                i915_request_get(prev);
1501                i915_request_add(prev);
1502
1503                count = 0;
1504                do {
1505                        struct i915_request *rq;
1506                        unsigned int reset_count;
1507
1508                        rq = hang_create_request(&h, engine);
1509                        if (IS_ERR(rq)) {
1510                                err = PTR_ERR(rq);
1511                                goto fini;
1512                        }
1513
1514                        i915_request_get(rq);
1515                        i915_request_add(rq);
1516
1517                        /*
1518                         * XXX We don't handle resetting the kernel context
1519                         * very well. If we trigger a device reset twice in
1520                         * quick succession while the kernel context is
1521                         * executing, we may end up skipping the breadcrumb.
1522                         * This is really only a problem for the selftest as
1523                         * normally there is a large interlude between resets
1524                         * (hangcheck), or we focus on resetting just one
1525                         * engine and so avoid repeatedly resetting innocents.
1526                         */
1527                        err = wait_for_others(i915, engine);
1528                        if (err) {
1529                                pr_err("%s(%s): Failed to idle other inactive engines after device reset\n",
1530                                       __func__, engine->name);
1531                                i915_request_put(rq);
1532                                i915_request_put(prev);
1533
1534                                GEM_TRACE_DUMP();
1535                                i915_gem_set_wedged(i915);
1536                                goto fini;
1537                        }
1538
1539                        if (!wait_until_running(&h, prev)) {
1540                                struct drm_printer p = drm_info_printer(i915->drm.dev);
1541
1542                                pr_err("%s(%s): Failed to start request %llx, at %x\n",
1543                                       __func__, engine->name,
1544                                       prev->fence.seqno, hws_seqno(&h, prev));
1545                                intel_engine_dump(engine, &p,
1546                                                  "%s\n", engine->name);
1547
1548                                i915_request_put(rq);
1549                                i915_request_put(prev);
1550
1551                                i915_gem_set_wedged(i915);
1552
1553                                err = -EIO;
1554                                goto fini;
1555                        }
1556
1557                        reset_count = fake_hangcheck(i915, BIT(id));
1558
1559                        if (prev->fence.error != -EIO) {
1560                                pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
1561                                       prev->fence.error);
1562                                i915_request_put(rq);
1563                                i915_request_put(prev);
1564                                err = -EINVAL;
1565                                goto fini;
1566                        }
1567
1568                        if (rq->fence.error) {
1569                                pr_err("Fence error status not zero [%d] after unrelated reset\n",
1570                                       rq->fence.error);
1571                                i915_request_put(rq);
1572                                i915_request_put(prev);
1573                                err = -EINVAL;
1574                                goto fini;
1575                        }
1576
1577                        if (i915_reset_count(&i915->gpu_error) == reset_count) {
1578                                pr_err("No GPU reset recorded!\n");
1579                                i915_request_put(rq);
1580                                i915_request_put(prev);
1581                                err = -EINVAL;
1582                                goto fini;
1583                        }
1584
1585                        i915_request_put(prev);
1586                        prev = rq;
1587                        count++;
1588                } while (time_before(jiffies, end_time));
1589                pr_info("%s: Completed %d resets\n", engine->name, count);
1590
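                    /*
                     * Terminate the hanging batch so the final request can
                     * complete before we flush and move to the next engine.
                     */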
1591                *h.batch = MI_BATCH_BUFFER_END;
1592                i915_gem_chipset_flush(i915);
1593
1594                i915_request_put(prev);
1595
1596                err = igt_flush_test(i915, I915_WAIT_LOCKED);
1597                if (err)
1598                        break;
1599        }
1600
1601fini:
1602        hang_fini(&h);
1603unlock:
1604        mutex_unlock(&i915->drm.struct_mutex);
1605        igt_global_reset_unlock(i915);
1606
1607        if (i915_reset_failed(i915))
1608                return -EIO;
1609
1610        return err;
1611}
1612
1613static int igt_handle_error(void *arg)
1614{
1615        struct drm_i915_private *i915 = arg;
1616        struct intel_engine_cs *engine = i915->engine[RCS0];
1617        struct hang h;
1618        struct i915_request *rq;
1619        struct i915_gpu_state *error;
1620        int err;
1621
1622        /* Check that i915_handle_error() flags the guilty request after a reset */
1623
1624        if (!intel_has_reset_engine(i915))
1625                return 0;
1626
1627        if (!engine || !intel_engine_can_store_dword(engine))
1628                return 0;
1629
1630        mutex_lock(&i915->drm.struct_mutex);
1631
1632        err = hang_init(&h, i915);
1633        if (err)
1634                goto err_unlock;
1635
1636        rq = hang_create_request(&h, engine);
1637        if (IS_ERR(rq)) {
1638                err = PTR_ERR(rq);
1639                goto err_fini;
1640        }
1641
1642        i915_request_get(rq);
1643        i915_request_add(rq);
1644
1645        if (!wait_until_running(&h, rq)) {
1646                struct drm_printer p = drm_info_printer(i915->drm.dev);
1647
1648                pr_err("%s: Failed to start request %llx, at %x\n",
1649                       __func__, rq->fence.seqno, hws_seqno(&h, rq));
1650                intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
1651
1652                i915_gem_set_wedged(i915);
1653
1654                err = -EIO;
1655                goto err_request;
1656        }
1657
1658        mutex_unlock(&i915->drm.struct_mutex);
1659
1660        /* Temporarily disable error capture */
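            /*
             * Stuffing a non-NULL placeholder into first_error should
             * convince the capture code that an error state already
             * exists, so no new state is recorded for this deliberate
             * hang; the original pointer is restored afterwards.
             */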
1661        error = xchg(&i915->gpu_error.first_error, (void *)-1);
1662
1663        i915_handle_error(i915, engine->mask, 0, NULL);
1664
1665        xchg(&i915->gpu_error.first_error, error);
1666
1667        mutex_lock(&i915->drm.struct_mutex);
1668
1669        if (rq->fence.error != -EIO) {
1670                pr_err("Guilty request not identified!\n");
1671                err = -EINVAL;
1672                goto err_request;
1673        }
1674
1675err_request:
1676        i915_request_put(rq);
1677err_fini:
1678        hang_fini(&h);
1679err_unlock:
1680        mutex_unlock(&i915->drm.struct_mutex);
1681        return err;
1682}
1683
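    /*
     * The begin/end pairs below enter and leave the atomic contexts
     * (preemption, softirq and hardirq disabled) from which
     * igt_atomic_reset() exercises the reset paths.
     */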
1684static void __preempt_begin(void)
1685{
1686        preempt_disable();
1687}
1688
1689static void __preempt_end(void)
1690{
1691        preempt_enable();
1692}
1693
1694static void __softirq_begin(void)
1695{
1696        local_bh_disable();
1697}
1698
1699static void __softirq_end(void)
1700{
1701        local_bh_enable();
1702}
1703
1704static void __hardirq_begin(void)
1705{
1706        local_irq_disable();
1707}
1708
1709static void __hardirq_end(void)
1710{
1711        local_irq_enable();
1712}
1713
1714struct atomic_section {
1715        const char *name;
1716        void (*critical_section_begin)(void);
1717        void (*critical_section_end)(void);
1718};
1719
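    /*
     * Perform an engine reset from within the given critical section,
     * with the engine's execlists tasklet disabled for the duration so
     * that the reset cannot rely on the submission tasklet running.
     */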
1720static int __igt_atomic_reset_engine(struct intel_engine_cs *engine,
1721                                     const struct atomic_section *p,
1722                                     const char *mode)
1723{
1724        struct tasklet_struct * const t = &engine->execlists.tasklet;
1725        int err;
1726
1727        GEM_TRACE("i915_reset_engine(%s:%s) under %s\n",
1728                  engine->name, mode, p->name);
1729
1730        tasklet_disable_nosync(t);
1731        p->critical_section_begin();
1732
1733        err = i915_reset_engine(engine, NULL);
1734
1735        p->critical_section_end();
1736        tasklet_enable(t);
1737
1738        if (err)
1739                pr_err("i915_reset_engine(%s:%s) failed under %s\n",
1740                       engine->name, mode, p->name);
1741
1742        return err;
1743}
1744
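    /*
     * Check i915_reset_engine() under this atomic phase: first against an
     * idle engine, then while a known hang is executing, and finally wait
     * for the hung request to confirm the engine recovered.
     */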
1745static int igt_atomic_reset_engine(struct intel_engine_cs *engine,
1746                                   const struct atomic_section *p)
1747{
1748        struct drm_i915_private *i915 = engine->i915;
1749        struct i915_request *rq;
1750        struct hang h;
1751        int err;
1752
1753        err = __igt_atomic_reset_engine(engine, p, "idle");
1754        if (err)
1755                return err;
1756
1757        err = hang_init(&h, i915);
1758        if (err)
1759                return err;
1760
1761        rq = hang_create_request(&h, engine);
1762        if (IS_ERR(rq)) {
1763                err = PTR_ERR(rq);
1764                goto out;
1765        }
1766
1767        i915_request_get(rq);
1768        i915_request_add(rq);
1769
1770        if (wait_until_running(&h, rq)) {
1771                err = __igt_atomic_reset_engine(engine, p, "active");
1772        } else {
1773                pr_err("%s(%s): Failed to start request %llx, at %x\n",
1774                       __func__, engine->name,
1775                       rq->fence.seqno, hws_seqno(&h, rq));
1776                i915_gem_set_wedged(i915);
1777                err = -EIO;
1778        }
1779
1780        if (err == 0) {
1781                struct igt_wedge_me w;
1782
1783                        igt_wedge_on_timeout(&w, i915, HZ / 20 /* 50ms timeout */)
1784                        i915_request_wait(rq,
1785                                          I915_WAIT_LOCKED,
1786                                          MAX_SCHEDULE_TIMEOUT);
1787                if (i915_reset_failed(i915))
1788                        err = -EIO;
1789        }
1790
1791        i915_request_put(rq);
1792out:
1793        hang_fini(&h);
1794        return err;
1795}
1796
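    /*
     * Declare the device wedged and perform a full reset to bring the GPU
     * back to a known-good state between phases.
     */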
1797static void force_reset(struct drm_i915_private *i915)
1798{
1799        i915_gem_set_wedged(i915);
1800        i915_reset(i915, 0, NULL);
1801}
1802
1803static int igt_atomic_reset(void *arg)
1804{
1805        static const struct atomic_section phases[] = {
1806                { "preempt", __preempt_begin, __preempt_end },
1807                { "softirq", __softirq_begin, __softirq_end },
1808                { "hardirq", __hardirq_begin, __hardirq_end },
1809                { }
1810        };
1811        struct drm_i915_private *i915 = arg;
1812        intel_wakeref_t wakeref;
1813        int err = 0;
1814
1815        /* Check that the resets are usable from atomic context */
1816
1817        if (USES_GUC_SUBMISSION(i915))
1818                return 0; /* guc is dead; long live the guc */
1819
1820        igt_global_reset_lock(i915);
1821        mutex_lock(&i915->drm.struct_mutex);
1822        wakeref = intel_runtime_pm_get(i915);
1823
1824        /* Flush any requests before we get started and check basics */
1825        force_reset(i915);
1826        if (i915_reset_failed(i915))
1827                goto unlock;
1828
1829        if (intel_has_gpu_reset(i915)) {
1830                const typeof(*phases) *p;
1831
1832                for (p = phases; p->name; p++) {
1833                        GEM_TRACE("intel_gpu_reset under %s\n", p->name);
1834
1835                        p->critical_section_begin();
1836                        err = intel_gpu_reset(i915, ALL_ENGINES);
1837                        p->critical_section_end();
1838
1839                        if (err) {
1840                                pr_err("intel_gpu_reset failed under %s\n",
1841                                       p->name);
1842                                goto out;
1843                        }
1844                }
1845
1846                force_reset(i915);
1847        }
1848
1849        if (intel_has_reset_engine(i915)) {
1850                struct intel_engine_cs *engine;
1851                enum intel_engine_id id;
1852
1853                for_each_engine(engine, i915, id) {
1854                        const typeof(*phases) *p;
1855
1856                        for (p = phases; p->name; p++) {
1857                                err = igt_atomic_reset_engine(engine, p);
1858                                if (err)
1859                                        goto out;
1860                        }
1861                }
1862        }
1863
1864out:
1865        /* As we poke around the guts, do a full reset before continuing. */
1866        force_reset(i915);
1867
1868unlock:
1869        intel_runtime_pm_put(i915, wakeref);
1870        mutex_unlock(&i915->drm.struct_mutex);
1871        igt_global_reset_unlock(i915);
1872
1873        return err;
1874}
1875
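    /*
     * Entry point for the live hangcheck/reset selftests. We require real
     * GPU reset support, hold a runtime-pm wakeref throughout and
     * temporarily disable periodic hangcheck so that only the tests
     * themselves trigger resets.
     */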
1876int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
1877{
1878        static const struct i915_subtest tests[] = {
1879                SUBTEST(igt_global_reset), /* attempt to recover GPU first */
1880                SUBTEST(igt_wedged_reset),
1881                SUBTEST(igt_hang_sanitycheck),
1882                SUBTEST(igt_reset_nop),
1883                SUBTEST(igt_reset_nop_engine),
1884                SUBTEST(igt_reset_idle_engine),
1885                SUBTEST(igt_reset_active_engine),
1886                SUBTEST(igt_reset_engines),
1887                SUBTEST(igt_reset_queue),
1888                SUBTEST(igt_reset_wait),
1889                SUBTEST(igt_reset_evict_ggtt),
1890                SUBTEST(igt_reset_evict_ppgtt),
1891                SUBTEST(igt_reset_evict_fence),
1892                SUBTEST(igt_handle_error),
1893                SUBTEST(igt_atomic_reset),
1894        };
1895        intel_wakeref_t wakeref;
1896        bool saved_hangcheck;
1897        int err;
1898
1899        if (!intel_has_gpu_reset(i915))
1900                return 0;
1901
1902        if (i915_terminally_wedged(i915))
1903                return -EIO; /* we're long past hope of a successful reset */
1904
1905        wakeref = intel_runtime_pm_get(i915);
1906        saved_hangcheck = fetch_and_zero(&i915_modparams.enable_hangcheck);
1907        drain_delayed_work(&i915->gpu_error.hangcheck_work); /* flush param */
1908
1909        err = i915_subtests(tests, i915);
1910
1911        mutex_lock(&i915->drm.struct_mutex);
1912        igt_flush_test(i915, I915_WAIT_LOCKED);
1913        mutex_unlock(&i915->drm.struct_mutex);
1914
1915        i915_modparams.enable_hangcheck = saved_hangcheck;
1916        intel_runtime_pm_put(i915, wakeref);
1917
1918        return err;
1919}
1920