linux/drivers/gpu/drm/i915/gt/intel_migrate.c
// SPDX-License-Identifier: MIT
/*
 * Copyright © 2020 Intel Corporation
 */

#include "i915_drv.h"
#include "intel_context.h"
#include "intel_gpu_commands.h"
#include "intel_gt.h"
#include "intel_gtt.h"
#include "intel_migrate.h"
#include "intel_ring.h"

struct insert_pte_data {
        u64 offset;
        bool is_lmem;
};

#define CHUNK_SZ SZ_8M /* ~1ms at 8GiB/s preemption delay */
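/*
 * Sizing note: 8MiB / 8GiB/s is roughly 1ms, which bounds how long the
 * non-preemptible PTE update + blit sequence below can delay a pending
 * preemption request.
 */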

static bool engine_supports_migration(struct intel_engine_cs *engine)
{
        if (!engine)
                return false;

        /*
         * We need the ability to prevent arbitration (MI_ARB_ON_OFF),
         * the ability to write PTEs using inline data (MI_STORE_DATA_IMM)
         * and of course the ability to do the block transfer (blits).
         */
        GEM_BUG_ON(engine->class != COPY_ENGINE_CLASS);

        return true;
}

static void insert_pte(struct i915_address_space *vm,
                       struct i915_page_table *pt,
                       void *data)
{
        struct insert_pte_data *d = data;

        vm->insert_page(vm, px_dma(pt), d->offset, I915_CACHE_NONE,
                        d->is_lmem ? PTE_LM : 0);
        d->offset += PAGE_SIZE;
}

static struct i915_address_space *migrate_vm(struct intel_gt *gt)
{
        struct i915_vm_pt_stash stash = {};
        struct i915_ppgtt *vm;
        int err;
        int i;

        /*
         * We construct a very special VM for use by all migration contexts;
         * it is kept pinned so that it can be used at any time. As we need
         * to pre-allocate the page directories for the migration VM, this
         * limits us to only using a small number of prepared vma.
         *
         * To be able to pipeline and reschedule migration operations while
         * avoiding unnecessary contention on the vm itself, the PTE updates
         * are inline with the blits. All the blits use the same fixed
         * addresses, with the backing store redirection being updated on the
         * fly. Only 2 implicit vma are used for all migration operations.
         *
         * We lay the ppGTT out as:
         *
         *      [0, CHUNK_SZ) -> first object
         *      [CHUNK_SZ, 2 * CHUNK_SZ) -> second object
         *      [2 * CHUNK_SZ, 2 * CHUNK_SZ + (2 * CHUNK_SZ >> 9)) -> PTE
         *
         * By exposing the dma addresses of the page directories themselves
         * within the ppGTT, we are then able to rewrite the PTE prior to use.
         * But the PTE update and subsequent migration operation must be atomic,
         * i.e. within the same non-preemptible window so that we do not switch
         * to another migration context that overwrites the PTE.
         *
         * TODO: Add support for huge LMEM PTEs
         */
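        /*
         * Concretely, with CHUNK_SZ = 8M each engine's slice looks like:
         * source pages bound at [0, 8M), destination pages at [8M, 16M), and
         * the page tables holding the 4096 qword PTEs for those 16M mapped
         * from 16M onwards (32K of PTEs). The loop below repeats this layout
         * once per copy-engine instance at base = i << 32.
         */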

        vm = i915_ppgtt_create(gt);
        if (IS_ERR(vm))
                return ERR_CAST(vm);

        if (!vm->vm.allocate_va_range || !vm->vm.foreach) {
                err = -ENODEV;
                goto err_vm;
        }

        /*
         * Each engine instance is assigned its own chunk in the VM, so
         * that we can run multiple instances concurrently
         */
        for (i = 0; i < ARRAY_SIZE(gt->engine_class[COPY_ENGINE_CLASS]); i++) {
                struct intel_engine_cs *engine;
                u64 base = (u64)i << 32;
                struct insert_pte_data d = {};
                struct i915_gem_ww_ctx ww;
                u64 sz;

                engine = gt->engine_class[COPY_ENGINE_CLASS][i];
                if (!engine_supports_migration(engine))
                        continue;

                /*
                 * We copy in 8MiB chunks. Each PDE covers 2MiB, so we need
                 * 4x2 page directories for source/destination.
                 */
                sz = 2 * CHUNK_SZ;
                d.offset = base + sz;

                /*
                 * We need another page directory setup so that we can write
                 * the 8x512 PTE in each chunk.
                 */
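                /*
                 * With sz = 2 * CHUNK_SZ = 16M this reserves a further
                 * (16M >> 12) * 8 = 32K of address space, through which the
                 * 4096 qword PTEs covering the two chunks can be rewritten.
                 */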
                sz += (sz >> 12) * sizeof(u64);

                err = i915_vm_alloc_pt_stash(&vm->vm, &stash, sz);
                if (err)
                        goto err_vm;

                for_i915_gem_ww(&ww, err, true) {
                        err = i915_vm_lock_objects(&vm->vm, &ww);
                        if (err)
                                continue;
                        err = i915_vm_map_pt_stash(&vm->vm, &stash);
                        if (err)
                                continue;

                        vm->vm.allocate_va_range(&vm->vm, &stash, base, sz);
                }
                i915_vm_free_pt_stash(&vm->vm, &stash);
                if (err)
                        goto err_vm;

                /* Now allow the GPU to rewrite the PTE via its own ppGTT */
                d.is_lmem = i915_gem_object_is_lmem(vm->vm.scratch[0]);
                vm->vm.foreach(&vm->vm, base, base + sz, insert_pte, &d);
        }

        return &vm->vm;

err_vm:
        i915_vm_put(&vm->vm);
        return ERR_PTR(err);
}

static struct intel_engine_cs *first_copy_engine(struct intel_gt *gt)
{
        struct intel_engine_cs *engine;
        int i;

        for (i = 0; i < ARRAY_SIZE(gt->engine_class[COPY_ENGINE_CLASS]); i++) {
                engine = gt->engine_class[COPY_ENGINE_CLASS][i];
                if (engine_supports_migration(engine))
                        return engine;
        }

        return NULL;
}

static struct intel_context *pinned_context(struct intel_gt *gt)
{
        static struct lock_class_key key;
        struct intel_engine_cs *engine;
        struct i915_address_space *vm;
        struct intel_context *ce;

        engine = first_copy_engine(gt);
        if (!engine)
                return ERR_PTR(-ENODEV);

        vm = migrate_vm(gt);
        if (IS_ERR(vm))
                return ERR_CAST(vm);

        ce = intel_engine_create_pinned_context(engine, vm, SZ_512K,
                                                I915_GEM_HWS_MIGRATE,
                                                &key, "migrate");
        i915_vm_put(vm);
        return ce;
}

int intel_migrate_init(struct intel_migrate *m, struct intel_gt *gt)
{
        struct intel_context *ce;

        memset(m, 0, sizeof(*m));

        ce = pinned_context(gt);
        if (IS_ERR(ce))
                return PTR_ERR(ce);

        m->context = ce;
        return 0;
}

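/*
 * Map a random u32 onto [0, max) using a 32x32->64 multiply and keeping the
 * upper half, avoiding a divide/modulo in the selection.
 */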
static int random_index(unsigned int max)
{
        return upper_32_bits(mul_u32_u32(get_random_u32(), max));
}

static struct intel_context *__migrate_engines(struct intel_gt *gt)
{
        struct intel_engine_cs *engines[MAX_ENGINE_INSTANCE];
        struct intel_engine_cs *engine;
        unsigned int count, i;

        count = 0;
        for (i = 0; i < ARRAY_SIZE(gt->engine_class[COPY_ENGINE_CLASS]); i++) {
                engine = gt->engine_class[COPY_ENGINE_CLASS][i];
                if (engine_supports_migration(engine))
                        engines[count++] = engine;
        }

        return intel_context_create(engines[random_index(count)]);
}

struct intel_context *intel_migrate_create_context(struct intel_migrate *m)
{
        struct intel_context *ce;

        /*
         * We randomly distribute contexts across the engines upon construction,
         * as they all share the same pinned vm, and so in order to allow
         * multiple blits to run in parallel, we must construct each blit
         * to use a different range of the vm for its GTT. This has to be
         * known at construction, so we cannot use the late greedy load
         * balancing of the virtual-engine.
         */
        ce = __migrate_engines(m->context->engine->gt);
        if (IS_ERR(ce))
                return ce;

        ce->ring = NULL;
        ce->ring_size = SZ_256K;

        i915_vm_put(ce->vm);
        ce->vm = i915_vm_get(m->context->vm);

        return ce;
}

static inline struct sgt_dma sg_sgt(struct scatterlist *sg)
{
        dma_addr_t addr = sg_dma_address(sg);

        return (struct sgt_dma){ sg, addr, addr + sg_dma_len(sg) };
}

static int emit_no_arbitration(struct i915_request *rq)
{
        u32 *cs;

        cs = intel_ring_begin(rq, 2);
        if (IS_ERR(cs))
                return PTR_ERR(cs);

        /* Explicitly disable preemption for this request. */
        *cs++ = MI_ARB_ON_OFF;
        *cs++ = MI_NOOP;
        intel_ring_advance(rq, cs);

        return 0;
}

static int emit_pte(struct i915_request *rq,
                    struct sgt_dma *it,
                    enum i915_cache_level cache_level,
                    bool is_lmem,
                    u64 offset,
                    int length)
{
        const u64 encode = rq->context->vm->pte_encode(0, cache_level,
                                                       is_lmem ? PTE_LM : 0);
        struct intel_ring *ring = rq->ring;
        int total = 0;
        u32 *hdr, *cs;
        int pkt;

        GEM_BUG_ON(GRAPHICS_VER(rq->engine->i915) < 8);

        /* Compute the page directory offset for the target address range */
        offset >>= 12;
        offset *= sizeof(u64);
        offset += 2 * CHUNK_SZ;
        offset += (u64)rq->engine->instance << 32;
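        /*
         * The engine offset is applied after converting to a byte offset
         * into the PTE window, so that it selects this engine's 4G slice of
         * the vm; e.g. for the destination window (offset CHUNK_SZ) the
         * first qword PTE lands at (instance << 32) + 2 * CHUNK_SZ + SZ_16K.
         */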

        cs = intel_ring_begin(rq, 6);
        if (IS_ERR(cs))
                return PTR_ERR(cs);

        /* Pack as many PTE updates as possible into a single MI command */
        pkt = min_t(int, 0x400, ring->space / sizeof(u32) + 5);
        pkt = min_t(int, pkt, (ring->size - ring->emit) / sizeof(u32) + 5);

        hdr = cs;
        *cs++ = MI_STORE_DATA_IMM | REG_BIT(21); /* as qword elements */
        *cs++ = lower_32_bits(offset);
        *cs++ = upper_32_bits(offset);
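        /*
         * The length in DW0 of each MI_STORE_DATA_IMM is patched afterwards
         * (*hdr += ...), once we know how many qword PTEs were packed into
         * the command.
         */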

        do {
                if (cs - hdr >= pkt) {
                        *hdr += cs - hdr - 2;
                        *cs++ = MI_NOOP;

                        ring->emit = (void *)cs - ring->vaddr;
                        intel_ring_advance(rq, cs);
                        intel_ring_update_space(ring);

                        cs = intel_ring_begin(rq, 6);
                        if (IS_ERR(cs))
                                return PTR_ERR(cs);

                        pkt = min_t(int, 0x400, ring->space / sizeof(u32) + 5);
                        pkt = min_t(int, pkt, (ring->size - ring->emit) / sizeof(u32) + 5);

                        hdr = cs;
                        *cs++ = MI_STORE_DATA_IMM | REG_BIT(21);
                        *cs++ = lower_32_bits(offset);
                        *cs++ = upper_32_bits(offset);
                }

                *cs++ = lower_32_bits(encode | it->dma);
                *cs++ = upper_32_bits(encode | it->dma);

                offset += 8;
                total += I915_GTT_PAGE_SIZE;

                it->dma += I915_GTT_PAGE_SIZE;
                if (it->dma >= it->max) {
                        it->sg = __sg_next(it->sg);
                        if (!it->sg || sg_dma_len(it->sg) == 0)
                                break;

                        it->dma = sg_dma_address(it->sg);
                        it->max = it->dma + sg_dma_len(it->sg);
                }
        } while (total < length);

        *hdr += cs - hdr - 2;
        *cs++ = MI_NOOP;

        ring->emit = (void *)cs - ring->vaddr;
        intel_ring_advance(rq, cs);
        intel_ring_update_space(ring);

        return total;
}

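/*
 * Wa_1209644611: for these sizes on graphics version 11, emit_copy() must
 * avoid the XY_FAST_COPY blit and fall back to the XY_SRC_COPY path.
 */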
static bool wa_1209644611_applies(int ver, u32 size)
{
        u32 height = size >> PAGE_SHIFT;

        if (ver != 11)
                return false;

        return height % 4 == 3 && height <= 8;
}

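/*
 * Emit one blit of @size bytes from the source window at offset 0 to the
 * destination window at CHUNK_SZ (within this engine's slice of the migrate
 * vm). The copy is expressed as a 2D blit of 32bpp pixels: one page per row
 * (PAGE_SIZE / 4 pixels wide), size >> PAGE_SHIFT rows in total.
 */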
static int emit_copy(struct i915_request *rq, int size)
{
        const int ver = GRAPHICS_VER(rq->engine->i915);
        u32 instance = rq->engine->instance;
        u32 *cs;

        cs = intel_ring_begin(rq, ver >= 8 ? 10 : 6);
        if (IS_ERR(cs))
                return PTR_ERR(cs);

        if (ver >= 9 && !wa_1209644611_applies(ver, size)) {
                *cs++ = GEN9_XY_FAST_COPY_BLT_CMD | (10 - 2);
                *cs++ = BLT_DEPTH_32 | PAGE_SIZE;
                *cs++ = 0;
                *cs++ = size >> PAGE_SHIFT << 16 | PAGE_SIZE / 4;
                *cs++ = CHUNK_SZ; /* dst offset */
                *cs++ = instance;
                *cs++ = 0;
                *cs++ = PAGE_SIZE;
                *cs++ = 0; /* src offset */
                *cs++ = instance;
        } else if (ver >= 8) {
                *cs++ = XY_SRC_COPY_BLT_CMD | BLT_WRITE_RGBA | (10 - 2);
                *cs++ = BLT_DEPTH_32 | BLT_ROP_SRC_COPY | PAGE_SIZE;
                *cs++ = 0;
                *cs++ = size >> PAGE_SHIFT << 16 | PAGE_SIZE / 4;
                *cs++ = CHUNK_SZ; /* dst offset */
                *cs++ = instance;
                *cs++ = 0;
                *cs++ = PAGE_SIZE;
                *cs++ = 0; /* src offset */
                *cs++ = instance;
        } else {
                GEM_BUG_ON(instance);
                *cs++ = SRC_COPY_BLT_CMD | BLT_WRITE_RGBA | (6 - 2);
                *cs++ = BLT_DEPTH_32 | BLT_ROP_SRC_COPY | PAGE_SIZE;
                *cs++ = size >> PAGE_SHIFT << 16 | PAGE_SIZE;
                *cs++ = CHUNK_SZ; /* dst offset */
                *cs++ = PAGE_SIZE;
                *cs++ = 0; /* src offset */
        }

        intel_ring_advance(rq, cs);
        return 0;
}

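/*
 * Copy the backing store described by @src into @dst, CHUNK_SZ at a time.
 * Each loop iteration emits one request which, with arbitration disabled,
 * rewrites the PTEs for the source and destination windows and then blits
 * between them. The last request is returned in *out for the caller to
 * wait upon.
 */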
int
intel_context_migrate_copy(struct intel_context *ce,
                           struct dma_fence *await,
                           struct scatterlist *src,
                           enum i915_cache_level src_cache_level,
                           bool src_is_lmem,
                           struct scatterlist *dst,
                           enum i915_cache_level dst_cache_level,
                           bool dst_is_lmem,
                           struct i915_request **out)
{
        struct sgt_dma it_src = sg_sgt(src), it_dst = sg_sgt(dst);
        struct i915_request *rq;
        int err;

        GEM_BUG_ON(ce->vm != ce->engine->gt->migrate.context->vm);
        *out = NULL;

        GEM_BUG_ON(ce->ring->size < SZ_64K);

        do {
                int len;

                rq = i915_request_create(ce);
                if (IS_ERR(rq)) {
                        err = PTR_ERR(rq);
                        goto out_ce;
                }

                if (await) {
                        err = i915_request_await_dma_fence(rq, await);
                        if (err)
                                goto out_rq;

                        if (rq->engine->emit_init_breadcrumb) {
                                err = rq->engine->emit_init_breadcrumb(rq);
                                if (err)
                                        goto out_rq;
                        }

                        await = NULL;
                }

                /* The PTE updates + copy must not be interrupted. */
                err = emit_no_arbitration(rq);
                if (err)
                        goto out_rq;

                len = emit_pte(rq, &it_src, src_cache_level, src_is_lmem, 0,
                               CHUNK_SZ);
                if (len <= 0) {
                        err = len;
                        goto out_rq;
                }

                err = emit_pte(rq, &it_dst, dst_cache_level, dst_is_lmem,
                               CHUNK_SZ, len);
                if (err < 0)
                        goto out_rq;
                if (err < len) {
                        err = -EINVAL;
                        goto out_rq;
                }

                err = rq->engine->emit_flush(rq, EMIT_INVALIDATE);
                if (err)
                        goto out_rq;

                err = emit_copy(rq, len);

                /* Arbitration is re-enabled between requests. */
out_rq:
                if (*out)
                        i915_request_put(*out);
                *out = i915_request_get(rq);
                i915_request_add(rq);
                if (err || !it_src.sg || !sg_dma_len(it_src.sg))
                        break;

                cond_resched();
        } while (1);

out_ce:
        return err;
}

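/*
 * Emit a single colour blit that fills @size bytes of the window at GTT
 * offset 0 with @value. As with emit_copy(), the fill is expressed as
 * size >> PAGE_SHIFT rows of one page each, so the row count must stay
 * within the S16_MAX asserted below.
 */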
static int emit_clear(struct i915_request *rq, int size, u32 value)
{
        const int ver = GRAPHICS_VER(rq->engine->i915);
        u32 instance = rq->engine->instance;
        u32 *cs;

        GEM_BUG_ON(size >> PAGE_SHIFT > S16_MAX);

        cs = intel_ring_begin(rq, ver >= 8 ? 8 : 6);
        if (IS_ERR(cs))
                return PTR_ERR(cs);

        if (ver >= 8) {
                *cs++ = XY_COLOR_BLT_CMD | BLT_WRITE_RGBA | (7 - 2);
                *cs++ = BLT_DEPTH_32 | BLT_ROP_COLOR_COPY | PAGE_SIZE;
                *cs++ = 0;
                *cs++ = size >> PAGE_SHIFT << 16 | PAGE_SIZE / 4;
                *cs++ = 0; /* offset */
                *cs++ = instance;
                *cs++ = value;
                *cs++ = MI_NOOP;
        } else {
                GEM_BUG_ON(instance);
                *cs++ = XY_COLOR_BLT_CMD | BLT_WRITE_RGBA | (6 - 2);
                *cs++ = BLT_DEPTH_32 | BLT_ROP_COLOR_COPY | PAGE_SIZE;
                *cs++ = 0;
                *cs++ = size >> PAGE_SHIFT << 16 | PAGE_SIZE / 4;
                *cs++ = 0;
                *cs++ = value;
        }

        intel_ring_advance(rq, cs);
        return 0;
}

int
intel_context_migrate_clear(struct intel_context *ce,
                            struct dma_fence *await,
                            struct scatterlist *sg,
                            enum i915_cache_level cache_level,
                            bool is_lmem,
                            u32 value,
                            struct i915_request **out)
{
        struct sgt_dma it = sg_sgt(sg);
        struct i915_request *rq;
        int err;

        GEM_BUG_ON(ce->vm != ce->engine->gt->migrate.context->vm);
        *out = NULL;

        GEM_BUG_ON(ce->ring->size < SZ_64K);

        do {
                int len;

                rq = i915_request_create(ce);
                if (IS_ERR(rq)) {
                        err = PTR_ERR(rq);
                        goto out_ce;
                }

                if (await) {
                        err = i915_request_await_dma_fence(rq, await);
                        if (err)
                                goto out_rq;

                        if (rq->engine->emit_init_breadcrumb) {
                                err = rq->engine->emit_init_breadcrumb(rq);
                                if (err)
                                        goto out_rq;
                        }

                        await = NULL;
                }

                /* The PTE updates + clear must not be interrupted. */
                err = emit_no_arbitration(rq);
                if (err)
                        goto out_rq;

                len = emit_pte(rq, &it, cache_level, is_lmem, 0, CHUNK_SZ);
                if (len <= 0) {
                        err = len;
                        goto out_rq;
                }

                err = rq->engine->emit_flush(rq, EMIT_INVALIDATE);
                if (err)
                        goto out_rq;

                err = emit_clear(rq, len, value);

                /* Arbitration is re-enabled between requests. */
out_rq:
                if (*out)
                        i915_request_put(*out);
                *out = i915_request_get(rq);
                i915_request_add(rq);
                if (err || !it.sg || !sg_dma_len(it.sg))
                        break;

                cond_resched();
        } while (1);

out_ce:
        return err;
}

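/*
 * Convenience wrappers: create a fresh migration context (or fall back to
 * the pinned m->context), pin it under the caller's ww transaction, run the
 * migration, and drop the context again.
 */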
int intel_migrate_copy(struct intel_migrate *m,
                       struct i915_gem_ww_ctx *ww,
                       struct dma_fence *await,
                       struct scatterlist *src,
                       enum i915_cache_level src_cache_level,
                       bool src_is_lmem,
                       struct scatterlist *dst,
                       enum i915_cache_level dst_cache_level,
                       bool dst_is_lmem,
                       struct i915_request **out)
{
        struct intel_context *ce;
        int err;

        *out = NULL;
        if (!m->context)
                return -ENODEV;

        ce = intel_migrate_create_context(m);
        if (IS_ERR(ce))
                ce = intel_context_get(m->context);
        GEM_BUG_ON(IS_ERR(ce));

        err = intel_context_pin_ww(ce, ww);
        if (err)
                goto out;

        err = intel_context_migrate_copy(ce, await,
                                         src, src_cache_level, src_is_lmem,
                                         dst, dst_cache_level, dst_is_lmem,
                                         out);

        intel_context_unpin(ce);
out:
        intel_context_put(ce);
        return err;
}

int
intel_migrate_clear(struct intel_migrate *m,
                    struct i915_gem_ww_ctx *ww,
                    struct dma_fence *await,
                    struct scatterlist *sg,
                    enum i915_cache_level cache_level,
                    bool is_lmem,
                    u32 value,
                    struct i915_request **out)
{
        struct intel_context *ce;
        int err;

        *out = NULL;
        if (!m->context)
                return -ENODEV;

        ce = intel_migrate_create_context(m);
        if (IS_ERR(ce))
                ce = intel_context_get(m->context);
        GEM_BUG_ON(IS_ERR(ce));

        err = intel_context_pin_ww(ce, ww);
        if (err)
                goto out;

        err = intel_context_migrate_clear(ce, await, sg, cache_level,
                                          is_lmem, value, out);

        intel_context_unpin(ce);
out:
        intel_context_put(ce);
        return err;
}

void intel_migrate_fini(struct intel_migrate *m)
{
        struct intel_context *ce;

        ce = fetch_and_zero(&m->context);
        if (!ce)
                return;

        intel_engine_destroy_pinned_context(ce);
}

#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
#include "selftest_migrate.c"
#endif