// SPDX-License-Identifier: MIT
/*
 * Copyright © 2017-2018 Intel Corporation
 */

#include <linux/prime_numbers.h>

#include "intel_context.h"
#include "intel_engine_heartbeat.h"
#include "intel_engine_pm.h"
#include "intel_gpu_commands.h"
#include "intel_gt.h"
#include "intel_gt_requests.h"
#include "intel_ring.h"
#include "selftest_engine_heartbeat.h"

#include "../selftests/i915_random.h"
#include "../i915_selftest.h"

#include "selftests/igt_flush_test.h"
#include "selftests/lib_sw_fence.h"
#include "selftests/mock_gem_device.h"
#include "selftests/mock_timeline.h"

static struct page *hwsp_page(struct intel_timeline *tl)
{
	struct drm_i915_gem_object *obj = tl->hwsp_ggtt->obj;

	GEM_BUG_ON(!i915_gem_object_has_pinned_pages(obj));
	return sg_page(obj->mm.pages->sgl);
}

static unsigned long hwsp_cacheline(struct intel_timeline *tl)
{
	unsigned long address = (unsigned long)page_address(hwsp_page(tl));

	return (address + offset_in_page(tl->hwsp_offset)) / TIMELINE_SEQNO_BYTES;
}

static int selftest_tl_pin(struct intel_timeline *tl)
{
	struct i915_gem_ww_ctx ww;
	int err;

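	/* Lock the HWSP object and pin the timeline, replaying on ww deadlock */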
	i915_gem_ww_ctx_init(&ww, false);
retry:
	err = i915_gem_object_lock(tl->hwsp_ggtt->obj, &ww);
	if (!err)
		err = intel_timeline_pin(tl, &ww);

	if (err == -EDEADLK) {
		err = i915_gem_ww_ctx_backoff(&ww);
		if (!err)
			goto retry;
	}
	i915_gem_ww_ctx_fini(&ww);
	return err;
}

/* Only half of seqno's are usable, see __intel_timeline_get_seqno() */
#define CACHELINES_PER_PAGE (PAGE_SIZE / TIMELINE_SEQNO_BYTES / 2)

struct mock_hwsp_freelist {
	struct intel_gt *gt;
	struct radix_tree_root cachelines;
	struct intel_timeline **history;
	unsigned long count, max;
	struct rnd_state prng;
};

enum {
	SHUFFLE = BIT(0),
};

static void __mock_hwsp_record(struct mock_hwsp_freelist *state,
			       unsigned int idx,
			       struct intel_timeline *tl)
{
	tl = xchg(&state->history[idx], tl);
	if (tl) {
		radix_tree_delete(&state->cachelines, hwsp_cacheline(tl));
		intel_timeline_unpin(tl);
		intel_timeline_put(tl);
	}
}

static int __mock_hwsp_timeline(struct mock_hwsp_freelist *state,
				unsigned int count,
				unsigned int flags)
{
	struct intel_timeline *tl;
	unsigned int idx;

	while (count--) {
		unsigned long cacheline;
		int err;

		tl = intel_timeline_create(state->gt);
		if (IS_ERR(tl))
			return PTR_ERR(tl);

		err = selftest_tl_pin(tl);
		if (err) {
			intel_timeline_put(tl);
			return err;
		}

		cacheline = hwsp_cacheline(tl);
		err = radix_tree_insert(&state->cachelines, cacheline, tl);
		if (err) {
			if (err == -EEXIST) {
				pr_err("HWSP cacheline %lu already used; duplicate allocation!\n",
				       cacheline);
			}
			intel_timeline_unpin(tl);
			intel_timeline_put(tl);
			return err;
		}

		idx = state->count++ % state->max;
		__mock_hwsp_record(state, idx, tl);
	}

	if (flags & SHUFFLE)
		i915_prandom_shuffle(state->history,
				     sizeof(*state->history),
				     min(state->count, state->max),
				     &state->prng);

	count = i915_prandom_u32_max_state(min(state->count, state->max),
					   &state->prng);
	while (count--) {
		idx = --state->count % state->max;
		__mock_hwsp_record(state, idx, NULL);
	}

	return 0;
}

static int mock_hwsp_freelist(void *arg)
{
	struct mock_hwsp_freelist state;
	struct drm_i915_private *i915;
	const struct {
		const char *name;
		unsigned int flags;
	} phases[] = {
		{ "linear", 0 },
		{ "shuffled", SHUFFLE },
		{ },
	}, *p;
	unsigned int na;
	int err = 0;

	i915 = mock_gem_device();
	if (!i915)
		return -ENOMEM;

	INIT_RADIX_TREE(&state.cachelines, GFP_KERNEL);
	state.prng = I915_RND_STATE_INITIALIZER(i915_selftest.random_seed);

	state.gt = &i915->gt;

	/*
	 * Create a bunch of timelines and check that their HWSP do not overlap.
	 * Free some, and try again.
	 */

	state.max = PAGE_SIZE / sizeof(*state.history);
	state.count = 0;
	state.history = kcalloc(state.max, sizeof(*state.history), GFP_KERNEL);
	if (!state.history) {
		err = -ENOMEM;
		goto err_put;
	}

	for (p = phases; p->name; p++) {
		pr_debug("%s(%s)\n", __func__, p->name);
		for_each_prime_number_from(na, 1, 2 * CACHELINES_PER_PAGE) {
			err = __mock_hwsp_timeline(&state, na, p->flags);
			if (err)
				goto out;
		}
	}

out:
	for (na = 0; na < state.max; na++)
		__mock_hwsp_record(&state, na, NULL);
	kfree(state.history);
err_put:
	mock_destroy_device(i915);
	return err;
}

struct __igt_sync {
	const char *name;
	u32 seqno;
	bool expected;
	bool set;
};

static int __igt_sync(struct intel_timeline *tl,
		      u64 ctx,
		      const struct __igt_sync *p,
		      const char *name)
{
	int ret;

	if (__intel_timeline_sync_is_later(tl, ctx, p->seqno) != p->expected) {
		pr_err("%s: %s(ctx=%llu, seqno=%u) expected passed %s but failed\n",
		       name, p->name, ctx, p->seqno, yesno(p->expected));
		return -EINVAL;
	}

	if (p->set) {
		ret = __intel_timeline_sync_set(tl, ctx, p->seqno);
		if (ret)
			return ret;
	}

	return 0;
}

static int igt_sync(void *arg)
{
	const struct __igt_sync pass[] = {
		{ "unset", 0, false, false },
		{ "new", 0, false, true },
		{ "0a", 0, true, true },
		{ "1a", 1, false, true },
		{ "1b", 1, true, true },
		{ "0b", 0, true, false },
		{ "2a", 2, false, true },
		{ "4", 4, false, true },
		{ "INT_MAX", INT_MAX, false, true },
		{ "INT_MAX-1", INT_MAX-1, true, false },
		{ "INT_MAX+1", (u32)INT_MAX+1, false, true },
		{ "INT_MAX", INT_MAX, true, false },
		{ "UINT_MAX", UINT_MAX, false, true },
		{ "wrap", 0, false, true },
		{ "unwrap", UINT_MAX, true, false },
		{},
	}, *p;
	struct intel_timeline tl;
	int order, offset;
	int ret = -ENODEV;

	mock_timeline_init(&tl, 0);
	for (p = pass; p->name; p++) {
		for (order = 1; order < 64; order++) {
			for (offset = -1; offset <= (order > 1); offset++) {
				u64 ctx = BIT_ULL(order) + offset;

				ret = __igt_sync(&tl, ctx, p, "1");
				if (ret)
					goto out;
			}
		}
	}
	mock_timeline_fini(&tl);

	mock_timeline_init(&tl, 0);
	for (order = 1; order < 64; order++) {
		for (offset = -1; offset <= (order > 1); offset++) {
			u64 ctx = BIT_ULL(order) + offset;

			for (p = pass; p->name; p++) {
				ret = __igt_sync(&tl, ctx, p, "2");
				if (ret)
					goto out;
			}
		}
	}

out:
	mock_timeline_fini(&tl);
	return ret;
}

static unsigned int random_engine(struct rnd_state *rnd)
{
	return i915_prandom_u32_max_state(I915_NUM_ENGINES, rnd);
}

static int bench_sync(void *arg)
{
	struct rnd_state prng;
	struct intel_timeline tl;
	unsigned long end_time, count;
	u64 prng32_1M;
	ktime_t kt;
	int order, last_order;

	mock_timeline_init(&tl, 0);

	/* Lookups from cache are very fast and so the random number generation
	 * and the loop itself becomes a significant factor in the per-iteration
	 * timings. We try to compensate the results by measuring the overhead
	 * of the prng and subtract it from the reported results.
	 */
	prandom_seed_state(&prng, i915_selftest.random_seed);
	count = 0;
	kt = ktime_get();
	end_time = jiffies + HZ/10;
	do {
		u32 x;

		/* Make sure the compiler doesn't optimise away the prng call */
		WRITE_ONCE(x, prandom_u32_state(&prng));

		count++;
	} while (!time_after(jiffies, end_time));
	kt = ktime_sub(ktime_get(), kt);
	pr_debug("%s: %lu random evaluations, %lluns/prng\n",
		 __func__, count, (long long)div64_ul(ktime_to_ns(kt), count));
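	/* prng32_1M: cost of ~1M (2^20) prng calls in ns, kept scaled for precision */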
	prng32_1M = div64_ul(ktime_to_ns(kt) << 20, count);

	/* Benchmark (only) setting random context ids */
	prandom_seed_state(&prng, i915_selftest.random_seed);
	count = 0;
	kt = ktime_get();
	end_time = jiffies + HZ/10;
	do {
		u64 id = i915_prandom_u64_state(&prng);

		__intel_timeline_sync_set(&tl, id, 0);
		count++;
	} while (!time_after(jiffies, end_time));
	kt = ktime_sub(ktime_get(), kt);
	kt = ktime_sub_ns(kt, (count * prng32_1M * 2) >> 20);
	pr_info("%s: %lu random insertions, %lluns/insert\n",
		__func__, count, (long long)div64_ul(ktime_to_ns(kt), count));

	/* Benchmark looking up the exact same context ids as we just set */
	prandom_seed_state(&prng, i915_selftest.random_seed);
	end_time = count;
	kt = ktime_get();
	while (end_time--) {
		u64 id = i915_prandom_u64_state(&prng);

		if (!__intel_timeline_sync_is_later(&tl, id, 0)) {
			mock_timeline_fini(&tl);
			pr_err("Lookup of %llu failed\n", id);
			return -EINVAL;
		}
	}
	kt = ktime_sub(ktime_get(), kt);
	kt = ktime_sub_ns(kt, (count * prng32_1M * 2) >> 20);
	pr_info("%s: %lu random lookups, %lluns/lookup\n",
		__func__, count, (long long)div64_ul(ktime_to_ns(kt), count));

	mock_timeline_fini(&tl);
	cond_resched();

	mock_timeline_init(&tl, 0);

	/* Benchmark setting the first N (in order) contexts */
	count = 0;
	kt = ktime_get();
	end_time = jiffies + HZ/10;
	do {
		__intel_timeline_sync_set(&tl, count++, 0);
	} while (!time_after(jiffies, end_time));
	kt = ktime_sub(ktime_get(), kt);
	pr_info("%s: %lu in-order insertions, %lluns/insert\n",
		__func__, count, (long long)div64_ul(ktime_to_ns(kt), count));

	/* Benchmark looking up the exact same context ids as we just set */
	end_time = count;
	kt = ktime_get();
	while (end_time--) {
		if (!__intel_timeline_sync_is_later(&tl, end_time, 0)) {
			pr_err("Lookup of %lu failed\n", end_time);
			mock_timeline_fini(&tl);
			return -EINVAL;
		}
	}
	kt = ktime_sub(ktime_get(), kt);
	pr_info("%s: %lu in-order lookups, %lluns/lookup\n",
		__func__, count, (long long)div64_ul(ktime_to_ns(kt), count));

	mock_timeline_fini(&tl);
	cond_resched();

	mock_timeline_init(&tl, 0);

	/* Benchmark searching for a random context id and maybe changing it */
	prandom_seed_state(&prng, i915_selftest.random_seed);
	count = 0;
	kt = ktime_get();
	end_time = jiffies + HZ/10;
	do {
		u32 id = random_engine(&prng);
		u32 seqno = prandom_u32_state(&prng);

		if (!__intel_timeline_sync_is_later(&tl, id, seqno))
			__intel_timeline_sync_set(&tl, id, seqno);

		count++;
	} while (!time_after(jiffies, end_time));
	kt = ktime_sub(ktime_get(), kt);
	kt = ktime_sub_ns(kt, (count * prng32_1M * 2) >> 20);
	pr_info("%s: %lu repeated insert/lookups, %lluns/op\n",
		__func__, count, (long long)div64_ul(ktime_to_ns(kt), count));
	mock_timeline_fini(&tl);
	cond_resched();

	/* Benchmark searching for a known context id and changing the seqno */
	for (last_order = 1, order = 1; order < 32;
	     ({ int tmp = last_order; last_order = order; order += tmp; })) {
		unsigned int mask = BIT(order) - 1;

		mock_timeline_init(&tl, 0);

		count = 0;
		kt = ktime_get();
		end_time = jiffies + HZ/10;
		do {
			/* Without assuming too many details of the underlying
			 * implementation, try to identify its phase-changes
			 * (if any)!
			 */
			u64 id = (u64)(count & mask) << order;

			__intel_timeline_sync_is_later(&tl, id, 0);
			__intel_timeline_sync_set(&tl, id, 0);

			count++;
		} while (!time_after(jiffies, end_time));
		kt = ktime_sub(ktime_get(), kt);
		pr_info("%s: %lu cyclic/%d insert/lookups, %lluns/op\n",
			__func__, count, order,
			(long long)div64_ul(ktime_to_ns(kt), count));
		mock_timeline_fini(&tl);
		cond_resched();
	}

	return 0;
}

int intel_timeline_mock_selftests(void)
{
	static const struct i915_subtest tests[] = {
		SUBTEST(mock_hwsp_freelist),
		SUBTEST(igt_sync),
		SUBTEST(bench_sync),
	};

	return i915_subtests(tests, NULL);
}

static int emit_ggtt_store_dw(struct i915_request *rq, u32 addr, u32 value)
{
	u32 *cs;

	cs = intel_ring_begin(rq, 4);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

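	/* The MI_STORE_DWORD_IMM address layout varies across generations */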
	if (GRAPHICS_VER(rq->engine->i915) >= 8) {
		*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
		*cs++ = addr;
		*cs++ = 0;
		*cs++ = value;
	} else if (GRAPHICS_VER(rq->engine->i915) >= 4) {
		*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
		*cs++ = 0;
		*cs++ = addr;
		*cs++ = value;
	} else {
		*cs++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
		*cs++ = addr;
		*cs++ = value;
		*cs++ = MI_NOOP;
	}

	intel_ring_advance(rq, cs);

	return 0;
}

static struct i915_request *
checked_tl_write(struct intel_timeline *tl, struct intel_engine_cs *engine, u32 value)
{
	struct i915_request *rq;
	int err;

	err = selftest_tl_pin(tl);
	if (err) {
		rq = ERR_PTR(err);
		goto out;
	}

	if (READ_ONCE(*tl->hwsp_seqno) != tl->seqno) {
		pr_err("Timeline created with incorrect breadcrumb, found %x, expected %x\n",
		       *tl->hwsp_seqno, tl->seqno);
		intel_timeline_unpin(tl);
		return ERR_PTR(-EINVAL);
	}

	rq = intel_engine_create_kernel_request(engine);
	if (IS_ERR(rq))
		goto out_unpin;

	i915_request_get(rq);

	err = emit_ggtt_store_dw(rq, tl->hwsp_offset, value);
	i915_request_add(rq);
	if (err) {
		i915_request_put(rq);
		rq = ERR_PTR(err);
	}

out_unpin:
	intel_timeline_unpin(tl);
out:
	if (IS_ERR(rq))
		pr_err("Failed to write to timeline!\n");
	return rq;
}

static int live_hwsp_engine(void *arg)
{
#define NUM_TIMELINES 4096
	struct intel_gt *gt = arg;
	struct intel_timeline **timelines;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	unsigned long count, n;
	int err = 0;

	/*
	 * Create a bunch of timelines and check we can write
	 * independently to each of their breadcrumb slots.
	 */

	timelines = kvmalloc_array(NUM_TIMELINES * I915_NUM_ENGINES,
				   sizeof(*timelines),
				   GFP_KERNEL);
	if (!timelines)
		return -ENOMEM;

	count = 0;
	for_each_engine(engine, gt, id) {
		if (!intel_engine_can_store_dword(engine))
			continue;

		intel_engine_pm_get(engine);

		for (n = 0; n < NUM_TIMELINES; n++) {
			struct intel_timeline *tl;
			struct i915_request *rq;

			tl = intel_timeline_create(gt);
			if (IS_ERR(tl)) {
				err = PTR_ERR(tl);
				break;
			}

			rq = checked_tl_write(tl, engine, count);
			if (IS_ERR(rq)) {
				intel_timeline_put(tl);
				err = PTR_ERR(rq);
				break;
			}

			timelines[count++] = tl;
			i915_request_put(rq);
		}

		intel_engine_pm_put(engine);
		if (err)
			break;
	}

	if (igt_flush_test(gt->i915))
		err = -EIO;

	for (n = 0; n < count; n++) {
		struct intel_timeline *tl = timelines[n];

		if (!err && READ_ONCE(*tl->hwsp_seqno) != n) {
			GEM_TRACE_ERR("Invalid seqno:%lu stored in timeline %llu @ %x, found 0x%x\n",
				      n, tl->fence_context, tl->hwsp_offset, *tl->hwsp_seqno);
			GEM_TRACE_DUMP();
			err = -EINVAL;
		}
		intel_timeline_put(tl);
	}

	kvfree(timelines);
	return err;
#undef NUM_TIMELINES
}

static int live_hwsp_alternate(void *arg)
{
#define NUM_TIMELINES 4096
	struct intel_gt *gt = arg;
	struct intel_timeline **timelines;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	unsigned long count, n;
	int err = 0;

	/*
	 * Create a bunch of timelines and check we can write
	 * independently to each of their breadcrumb slots with adjacent
	 * engines.
	 */

	timelines = kvmalloc_array(NUM_TIMELINES * I915_NUM_ENGINES,
				   sizeof(*timelines),
				   GFP_KERNEL);
	if (!timelines)
		return -ENOMEM;

	count = 0;
	for (n = 0; n < NUM_TIMELINES; n++) {
		for_each_engine(engine, gt, id) {
			struct intel_timeline *tl;
			struct i915_request *rq;

			if (!intel_engine_can_store_dword(engine))
				continue;

			tl = intel_timeline_create(gt);
			if (IS_ERR(tl)) {
				err = PTR_ERR(tl);
				goto out;
			}

			intel_engine_pm_get(engine);
			rq = checked_tl_write(tl, engine, count);
			intel_engine_pm_put(engine);
			if (IS_ERR(rq)) {
				intel_timeline_put(tl);
				err = PTR_ERR(rq);
				goto out;
			}

			timelines[count++] = tl;
			i915_request_put(rq);
		}
	}

out:
	if (igt_flush_test(gt->i915))
		err = -EIO;

	for (n = 0; n < count; n++) {
		struct intel_timeline *tl = timelines[n];

		if (!err && READ_ONCE(*tl->hwsp_seqno) != n) {
			GEM_TRACE_ERR("Invalid seqno:%lu stored in timeline %llu @ %x, found 0x%x\n",
				      n, tl->fence_context, tl->hwsp_offset, *tl->hwsp_seqno);
			GEM_TRACE_DUMP();
			err = -EINVAL;
		}
		intel_timeline_put(tl);
	}

	kvfree(timelines);
	return err;
#undef NUM_TIMELINES
}

static int live_hwsp_wrap(void *arg)
{
	struct intel_gt *gt = arg;
	struct intel_engine_cs *engine;
	struct intel_timeline *tl;
	enum intel_engine_id id;
	int err = 0;

	/*
	 * Across a seqno wrap, we need to keep the old cacheline alive for
	 * foreign GPU references.
	 */

	tl = intel_timeline_create(gt);
	if (IS_ERR(tl))
		return PTR_ERR(tl);

	if (!tl->has_initial_breadcrumb)
		goto out_free;

	err = selftest_tl_pin(tl);
	if (err)
		goto out_free;

	for_each_engine(engine, gt, id) {
		const u32 *hwsp_seqno[2];
		struct i915_request *rq;
		u32 seqno[2];

		if (!intel_engine_can_store_dword(engine))
			continue;

		rq = intel_engine_create_kernel_request(engine);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			goto out;
		}

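		/* Choose a seqno so that the next two allocations straddle the wrap */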
		tl->seqno = -4u;

		mutex_lock_nested(&tl->mutex, SINGLE_DEPTH_NESTING);
		err = intel_timeline_get_seqno(tl, rq, &seqno[0]);
		mutex_unlock(&tl->mutex);
		if (err) {
			i915_request_add(rq);
			goto out;
		}
		pr_debug("seqno[0]:%08x, hwsp_offset:%08x\n",
			 seqno[0], tl->hwsp_offset);

		err = emit_ggtt_store_dw(rq, tl->hwsp_offset, seqno[0]);
		if (err) {
			i915_request_add(rq);
			goto out;
		}
		hwsp_seqno[0] = tl->hwsp_seqno;

		mutex_lock_nested(&tl->mutex, SINGLE_DEPTH_NESTING);
		err = intel_timeline_get_seqno(tl, rq, &seqno[1]);
		mutex_unlock(&tl->mutex);
		if (err) {
			i915_request_add(rq);
			goto out;
		}
		pr_debug("seqno[1]:%08x, hwsp_offset:%08x\n",
			 seqno[1], tl->hwsp_offset);

		err = emit_ggtt_store_dw(rq, tl->hwsp_offset, seqno[1]);
		if (err) {
			i915_request_add(rq);
			goto out;
		}
		hwsp_seqno[1] = tl->hwsp_seqno;

		/* With wrap should come a new hwsp */
		GEM_BUG_ON(seqno[1] >= seqno[0]);
		GEM_BUG_ON(hwsp_seqno[0] == hwsp_seqno[1]);

		i915_request_add(rq);

		if (i915_request_wait(rq, 0, HZ / 5) < 0) {
			pr_err("Wait for timeline writes timed out!\n");
			err = -EIO;
			goto out;
		}

		if (READ_ONCE(*hwsp_seqno[0]) != seqno[0] ||
		    READ_ONCE(*hwsp_seqno[1]) != seqno[1]) {
			pr_err("Bad timeline values: found (%x, %x), expected (%x, %x)\n",
			       *hwsp_seqno[0], *hwsp_seqno[1],
			       seqno[0], seqno[1]);
			err = -EINVAL;
			goto out;
		}

		intel_gt_retire_requests(gt); /* recycle HWSP */
	}

out:
	if (igt_flush_test(gt->i915))
		err = -EIO;

	intel_timeline_unpin(tl);
out_free:
	intel_timeline_put(tl);
	return err;
}

static int emit_read_hwsp(struct i915_request *rq,
			  u32 seqno, u32 hwsp,
			  u32 *addr)
{
	const u32 gpr = i915_mmio_reg_offset(GEN8_RING_CS_GPR(rq->engine->mmio_base, 0));
	u32 *cs;

	cs = intel_ring_begin(rq, 12);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
	*cs++ = *addr;
	*cs++ = 0;
	*cs++ = seqno;
	*addr += 4;

	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_USE_GGTT;
	*cs++ = gpr;
	*cs++ = hwsp;
	*cs++ = 0;

	*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
	*cs++ = gpr;
	*cs++ = *addr;
	*cs++ = 0;
	*addr += 4;

	intel_ring_advance(rq, cs);

	return 0;
}

struct hwsp_watcher {
	struct i915_vma *vma;
	struct i915_request *rq;
	u32 addr;
	u32 *map;
};

static bool cmp_lt(u32 a, u32 b)
{
	return a < b;
}

static bool cmp_gte(u32 a, u32 b)
{
	return a >= b;
}

static int setup_watcher(struct hwsp_watcher *w, struct intel_gt *gt)
{
	struct drm_i915_gem_object *obj;
	struct i915_vma *vma;

	obj = i915_gem_object_create_internal(gt->i915, SZ_2M);
	if (IS_ERR(obj))
		return PTR_ERR(obj);

	w->map = i915_gem_object_pin_map_unlocked(obj, I915_MAP_WB);
	if (IS_ERR(w->map)) {
		i915_gem_object_put(obj);
		return PTR_ERR(w->map);
	}

	vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0, 0);
	if (IS_ERR(vma)) {
		i915_gem_object_put(obj);
		return PTR_ERR(vma);
	}

	w->vma = vma;
	w->addr = i915_ggtt_offset(vma);
	return 0;
}

static void switch_tl_lock(struct i915_request *from, struct i915_request *to)
{
	/* some light mutex handling for the following ... */

	if (from) {
		lockdep_unpin_lock(&from->context->timeline->mutex, from->cookie);
		mutex_unlock(&from->context->timeline->mutex);
	}

	if (to) {
		mutex_lock(&to->context->timeline->mutex);
		to->cookie = lockdep_pin_lock(&to->context->timeline->mutex);
	}
}

static int create_watcher(struct hwsp_watcher *w,
			  struct intel_engine_cs *engine,
			  int ringsz)
{
	struct intel_context *ce;

	ce = intel_context_create(engine);
	if (IS_ERR(ce))
		return PTR_ERR(ce);

	ce->ring_size = ringsz;
	w->rq = intel_context_create_request(ce);
	intel_context_put(ce);
	if (IS_ERR(w->rq))
		return PTR_ERR(w->rq);

	w->addr = i915_ggtt_offset(w->vma);

	switch_tl_lock(w->rq, NULL);

	return 0;
}

static int check_watcher(struct hwsp_watcher *w, const char *name,
			 bool (*op)(u32 hwsp, u32 seqno))
{
	struct i915_request *rq = fetch_and_zero(&w->rq);
	u32 offset, end;
	int err;

	GEM_BUG_ON(w->addr - i915_ggtt_offset(w->vma) > w->vma->size);

	i915_request_get(rq);
	switch_tl_lock(NULL, rq);
	i915_request_add(rq);

	if (i915_request_wait(rq, 0, HZ) < 0) {
		err = -ETIME;
		goto out;
	}

	err = 0;
	offset = 0;
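	/* emit_read_hwsp() filled the map with (seqno, HWSP value) pairs */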
	end = (w->addr - i915_ggtt_offset(w->vma)) / sizeof(*w->map);
	while (offset < end) {
		if (!op(w->map[offset + 1], w->map[offset])) {
			pr_err("Watcher '%s' found HWSP value %x for seqno %x\n",
			       name, w->map[offset + 1], w->map[offset]);
			err = -EINVAL;
		}

		offset += 2;
	}

out:
	i915_request_put(rq);
	return err;
}

static void cleanup_watcher(struct hwsp_watcher *w)
{
	if (w->rq) {
		switch_tl_lock(NULL, w->rq);

		i915_request_add(w->rq);
	}

	i915_vma_unpin_and_release(&w->vma, I915_VMA_RELEASE_MAP);
}

static bool retire_requests(struct intel_timeline *tl)
{
	struct i915_request *rq, *rn;

	mutex_lock(&tl->mutex);
	list_for_each_entry_safe(rq, rn, &tl->requests, link)
		if (!i915_request_retire(rq))
			break;
	mutex_unlock(&tl->mutex);

	return !i915_active_fence_isset(&tl->last_request);
}

static struct i915_request *wrap_timeline(struct i915_request *rq)
{
	struct intel_context *ce = rq->context;
	struct intel_timeline *tl = ce->timeline;
	u32 seqno = rq->fence.seqno;

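	/* Keep submitting requests until the timeline seqno wraps past rq's */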
	while (tl->seqno >= seqno) {
		i915_request_put(rq);
		rq = intel_context_create_request(ce);
		if (IS_ERR(rq))
			return rq;

		i915_request_get(rq);
		i915_request_add(rq);
	}

	i915_request_put(rq);
	rq = i915_request_create(ce);
	if (IS_ERR(rq))
		return rq;

	i915_request_get(rq);
	i915_request_add(rq);

	return rq;
}

static int live_hwsp_read(void *arg)
{
	struct intel_gt *gt = arg;
	struct hwsp_watcher watcher[2] = {};
	struct intel_engine_cs *engine;
	struct intel_timeline *tl;
	enum intel_engine_id id;
	int err = 0;
	int i;

	/*
	 * If we take a reference to the HWSP for reading on the GPU, that
	 * read may be arbitrarily delayed (either by foreign fences or
	 * priority saturation) and a wrap can happen within 30 minutes.
	 * When the GPU read is finally submitted it should be correct,
	 * even across multiple wraps.
	 */

	if (GRAPHICS_VER(gt->i915) < 8) /* CS convenience [SRM/LRM] */
		return 0;

	tl = intel_timeline_create(gt);
	if (IS_ERR(tl))
		return PTR_ERR(tl);

	if (!tl->has_initial_breadcrumb)
		goto out_free;

	for (i = 0; i < ARRAY_SIZE(watcher); i++) {
		err = setup_watcher(&watcher[i], gt);
		if (err)
			goto out;
	}

	for_each_engine(engine, gt, id) {
		struct intel_context *ce;
		unsigned long count = 0;
		IGT_TIMEOUT(end_time);

		/* Create a request we can use for remote reading of the HWSP */
		err = create_watcher(&watcher[1], engine, SZ_512K);
		if (err)
			goto out;

		do {
			struct i915_sw_fence *submit;
			struct i915_request *rq;
			u32 hwsp, dummy;

			submit = heap_fence_create(GFP_KERNEL);
			if (!submit) {
				err = -ENOMEM;
				goto out;
			}

			err = create_watcher(&watcher[0], engine, SZ_4K);
			if (err)
				goto out;

			ce = intel_context_create(engine);
			if (IS_ERR(ce)) {
				err = PTR_ERR(ce);
				goto out;
			}

			ce->timeline = intel_timeline_get(tl);

			/* Ensure timeline is mapped, done during first pin */
			err = intel_context_pin(ce);
			if (err) {
				intel_context_put(ce);
				goto out;
			}

			/*
			 * Start at a new wrap, and set seqno right before
			 * another wrap, saving 30 minutes of nops
			 */
			tl->seqno = -12u + 2 * (count & 3);
			__intel_timeline_get_seqno(tl, &dummy);

			rq = i915_request_create(ce);
			if (IS_ERR(rq)) {
				err = PTR_ERR(rq);
				intel_context_unpin(ce);
				intel_context_put(ce);
				goto out;
			}

			err = i915_sw_fence_await_dma_fence(&rq->submit,
							    &watcher[0].rq->fence, 0,
							    GFP_KERNEL);
			if (err < 0) {
				i915_request_add(rq);
				intel_context_unpin(ce);
				intel_context_put(ce);
				goto out;
			}

			switch_tl_lock(rq, watcher[0].rq);
			err = intel_timeline_read_hwsp(rq, watcher[0].rq, &hwsp);
			if (err == 0)
				err = emit_read_hwsp(watcher[0].rq, /* before */
						     rq->fence.seqno, hwsp,
						     &watcher[0].addr);
			switch_tl_lock(watcher[0].rq, rq);
			if (err) {
				i915_request_add(rq);
				intel_context_unpin(ce);
				intel_context_put(ce);
				goto out;
			}

			switch_tl_lock(rq, watcher[1].rq);
			err = intel_timeline_read_hwsp(rq, watcher[1].rq, &hwsp);
			if (err == 0)
				err = emit_read_hwsp(watcher[1].rq, /* after */
						     rq->fence.seqno, hwsp,
						     &watcher[1].addr);
			switch_tl_lock(watcher[1].rq, rq);
			if (err) {
				i915_request_add(rq);
				intel_context_unpin(ce);
				intel_context_put(ce);
				goto out;
			}

			i915_request_get(rq);
			i915_request_add(rq);

			rq = wrap_timeline(rq);
			intel_context_unpin(ce);
			intel_context_put(ce);
			if (IS_ERR(rq)) {
				err = PTR_ERR(rq);
				goto out;
			}

			err = i915_sw_fence_await_dma_fence(&watcher[1].rq->submit,
							    &rq->fence, 0,
							    GFP_KERNEL);
			if (err < 0) {
				i915_request_put(rq);
				goto out;
			}

			err = check_watcher(&watcher[0], "before", cmp_lt);
			i915_sw_fence_commit(submit);
			heap_fence_put(submit);
			if (err) {
				i915_request_put(rq);
				goto out;
			}
			count++;

			/* Flush the timeline before manually wrapping again */
			if (i915_request_wait(rq,
					      I915_WAIT_INTERRUPTIBLE,
					      HZ) < 0) {
				err = -ETIME;
				i915_request_put(rq);
				goto out;
			}
			retire_requests(tl);
			i915_request_put(rq);

			/* Single requests are limited to half a ring at most */
			if (8 * watcher[1].rq->ring->emit >
			    3 * watcher[1].rq->ring->size)
				break;

		} while (!__igt_timeout(end_time, NULL) &&
			 count < (PAGE_SIZE / TIMELINE_SEQNO_BYTES - 1) / 2);

		pr_info("%s: simulated %lu wraps\n", engine->name, count);
		err = check_watcher(&watcher[1], "after", cmp_gte);
		if (err)
			goto out;
	}

out:
	for (i = 0; i < ARRAY_SIZE(watcher); i++)
		cleanup_watcher(&watcher[i]);

	if (igt_flush_test(gt->i915))
		err = -EIO;

out_free:
	intel_timeline_put(tl);
	return err;
}

static int live_hwsp_rollover_kernel(void *arg)
{
	struct intel_gt *gt = arg;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	int err = 0;

	/*
	 * Run the host for long enough, and even the kernel context will
	 * see a seqno rollover.
	 */

	for_each_engine(engine, gt, id) {
		struct intel_context *ce = engine->kernel_context;
		struct intel_timeline *tl = ce->timeline;
		struct i915_request *rq[3] = {};
		int i;

		st_engine_heartbeat_disable(engine);
		if (intel_gt_wait_for_idle(gt, HZ / 2)) {
			err = -EIO;
			goto out;
		}

		GEM_BUG_ON(i915_active_fence_isset(&tl->last_request));
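		/* Back the seqno up so the requests below wrap around zero */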
		tl->seqno = -2u;
		WRITE_ONCE(*(u32 *)tl->hwsp_seqno, tl->seqno);

		for (i = 0; i < ARRAY_SIZE(rq); i++) {
			struct i915_request *this;

			this = i915_request_create(ce);
			if (IS_ERR(this)) {
				err = PTR_ERR(this);
				goto out;
			}

			pr_debug("%s: create fence.seqno:%d\n",
				 engine->name,
				 lower_32_bits(this->fence.seqno));

			GEM_BUG_ON(rcu_access_pointer(this->timeline) != tl);

			rq[i] = i915_request_get(this);
			i915_request_add(this);
		}

		/* We expected a wrap! */
		GEM_BUG_ON(rq[2]->fence.seqno > rq[0]->fence.seqno);

		if (i915_request_wait(rq[2], 0, HZ / 5) < 0) {
			pr_err("Wait for timeline wrap timed out!\n");
			err = -EIO;
			goto out;
		}

		for (i = 0; i < ARRAY_SIZE(rq); i++) {
			if (!i915_request_completed(rq[i])) {
				pr_err("Pre-wrap request not completed!\n");
				err = -EINVAL;
				goto out;
			}
		}

out:
		for (i = 0; i < ARRAY_SIZE(rq); i++)
			i915_request_put(rq[i]);
		st_engine_heartbeat_enable(engine);
		if (err)
			break;
	}

	if (igt_flush_test(gt->i915))
		err = -EIO;

	return err;
}

static int live_hwsp_rollover_user(void *arg)
{
	struct intel_gt *gt = arg;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	int err = 0;

	/*
	 * Simulate a long running user context, and force the seqno wrap
	 * on the user's timeline.
	 */

	for_each_engine(engine, gt, id) {
		struct i915_request *rq[3] = {};
		struct intel_timeline *tl;
		struct intel_context *ce;
		int i;

		ce = intel_context_create(engine);
		if (IS_ERR(ce))
			return PTR_ERR(ce);

		err = intel_context_alloc_state(ce);
		if (err)
			goto out;

		tl = ce->timeline;
		if (!tl->has_initial_breadcrumb)
			goto out;

		err = intel_context_pin(ce);
		if (err)
			goto out;

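		/* Back the seqno up so the requests below wrap around zero */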
		tl->seqno = -4u;
		WRITE_ONCE(*(u32 *)tl->hwsp_seqno, tl->seqno);

		for (i = 0; i < ARRAY_SIZE(rq); i++) {
			struct i915_request *this;

			this = intel_context_create_request(ce);
			if (IS_ERR(this)) {
				err = PTR_ERR(this);
				goto out_unpin;
			}

			pr_debug("%s: create fence.seqno:%d\n",
				 engine->name,
				 lower_32_bits(this->fence.seqno));

			GEM_BUG_ON(rcu_access_pointer(this->timeline) != tl);

			rq[i] = i915_request_get(this);
			i915_request_add(this);
		}

		/* We expected a wrap! */
		GEM_BUG_ON(rq[2]->fence.seqno > rq[0]->fence.seqno);

		if (i915_request_wait(rq[2], 0, HZ / 5) < 0) {
			pr_err("Wait for timeline wrap timed out!\n");
			err = -EIO;
			goto out_unpin;
		}

		for (i = 0; i < ARRAY_SIZE(rq); i++) {
			if (!i915_request_completed(rq[i])) {
				pr_err("Pre-wrap request not completed!\n");
				err = -EINVAL;
				goto out_unpin;
			}
		}
out_unpin:
		intel_context_unpin(ce);
out:
		for (i = 0; i < ARRAY_SIZE(rq); i++)
			i915_request_put(rq[i]);
		intel_context_put(ce);
		if (err)
			break;
	}

	if (igt_flush_test(gt->i915))
		err = -EIO;

	return err;
}

static int live_hwsp_recycle(void *arg)
{
	struct intel_gt *gt = arg;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	unsigned long count;
	int err = 0;

	/*
	 * Check seqno writes into one timeline at a time. We expect to
	 * recycle the breadcrumb slot between iterations and neither see
	 * much contention nor corruption as the slots are reused.
	 */

	count = 0;
	for_each_engine(engine, gt, id) {
		IGT_TIMEOUT(end_time);

		if (!intel_engine_can_store_dword(engine))
			continue;

		intel_engine_pm_get(engine);

		do {
			struct intel_timeline *tl;
			struct i915_request *rq;

			tl = intel_timeline_create(gt);
			if (IS_ERR(tl)) {
				err = PTR_ERR(tl);
				break;
			}

			rq = checked_tl_write(tl, engine, count);
			if (IS_ERR(rq)) {
				intel_timeline_put(tl);
				err = PTR_ERR(rq);
				break;
			}

			if (i915_request_wait(rq, 0, HZ / 5) < 0) {
				pr_err("Wait for timeline writes timed out!\n");
				i915_request_put(rq);
				intel_timeline_put(tl);
				err = -EIO;
				break;
			}

			if (READ_ONCE(*tl->hwsp_seqno) != count) {
				GEM_TRACE_ERR("Invalid seqno:%lu stored in timeline %llu @ %x found 0x%x\n",
					      count, tl->fence_context,
					      tl->hwsp_offset, *tl->hwsp_seqno);
				GEM_TRACE_DUMP();
				err = -EINVAL;
			}

			i915_request_put(rq);
			intel_timeline_put(tl);
			count++;

			if (err)
				break;
		} while (!__igt_timeout(end_time, NULL));

		intel_engine_pm_put(engine);
		if (err)
			break;
	}

	return err;
}

int intel_timeline_live_selftests(struct drm_i915_private *i915)
{
	static const struct i915_subtest tests[] = {
		SUBTEST(live_hwsp_recycle),
		SUBTEST(live_hwsp_engine),
		SUBTEST(live_hwsp_alternate),
		SUBTEST(live_hwsp_wrap),
		SUBTEST(live_hwsp_read),
		SUBTEST(live_hwsp_rollover_kernel),
		SUBTEST(live_hwsp_rollover_user),
	};

	if (intel_gt_is_wedged(&i915->gt))
		return 0;

	return intel_gt_live_subtests(tests, &i915->gt);
}