linux/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0 OR MIT
   2/*
   3 * Copyright 2020-2021 Advanced Micro Devices, Inc.
   4 *
   5 * Permission is hereby granted, free of charge, to any person obtaining a
   6 * copy of this software and associated documentation files (the "Software"),
   7 * to deal in the Software without restriction, including without limitation
   8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   9 * and/or sell copies of the Software, and to permit persons to whom the
  10 * Software is furnished to do so, subject to the following conditions:
  11 *
  12 * The above copyright notice and this permission notice shall be included in
  13 * all copies or substantial portions of the Software.
  14 *
  15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
  19 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
  20 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
  21 * OTHER DEALINGS IN THE SOFTWARE.
  22 */
  23
  24#include <linux/types.h>
  25#include <linux/sched/task.h>
  26#include <linux/dynamic_debug.h>
  27#include <drm/ttm/ttm_tt.h>
  28#include <drm/drm_exec.h>
  29
  30#include "amdgpu_sync.h"
  31#include "amdgpu_object.h"
  32#include "amdgpu_vm.h"
  33#include "amdgpu_hmm.h"
  34#include "amdgpu.h"
  35#include "amdgpu_xgmi.h"
  36#include "kfd_priv.h"
  37#include "kfd_svm.h"
  38#include "kfd_migrate.h"
  39#include "kfd_smi_events.h"
  40
   /* Replace any earlier dev_fmt definition so that the format string of the
    * dev_*() messages in this file is prefixed with "kfd_svm: <function>: ".
    */
   41#ifdef dev_fmt
   42#undef dev_fmt
   43#endif
   44#define dev_fmt(fmt) "kfd_svm: %s: " fmt, __func__
   45
   /* NOTE(review): per the name, a delay in milliseconds related to restoring
    * svm ranges; no user is visible in this part of the file - confirm.
    */
   46#define AMDGPU_SVM_RANGE_RESTORE_DELAY_MS 1
   47
   48/* Long enough to ensure no retry fault comes after svm range is restored and
   49 * page table is updated.
   50 */
   /* Evaluates to two milliseconds expressed in nanoseconds. */
   51#define AMDGPU_SVM_RANGE_RETRY_FAULT_PENDING    (2UL * NSEC_PER_MSEC)
   /* dynamic_svm_range_dump(svms): with CONFIG_DYNAMIC_DEBUG enabled this is
    * routed through _dynamic_func_call_no_desc() to svm_range_debug_dump();
    * otherwise the call sits behind "if (0)", so it is still compiled but
    * never executed.
    */
   52#if IS_ENABLED(CONFIG_DYNAMIC_DEBUG)
   53#define dynamic_svm_range_dump(svms) \
   54        _dynamic_func_call_no_desc("svm_range_dump", svm_range_debug_dump, svms)
   55#else
   56#define dynamic_svm_range_dump(svms) \
   57        do { if (0) svm_range_debug_dump(svms); } while (0)
   58#endif
  59
   60/* Giant svm range split into smaller ranges based on this, it is decided using
   61 * minimum of all dGPU/APU 1/32 VRAM size, between 2MB to 1GB and alignment to
   62 * power of 2MB.
   63 */
   /* NOTE(review): per the name the unit is pages; the code that assigns this
    * value is not visible in this part of the file - confirm.
    */
   64static uint64_t max_svm_range_pages;
  65
   /* List-linkable wrapper around one kfd_criu_svm_range_priv_data record.
    * NOTE(review): presumably used by the CRIU checkpoint/restore path for SVM
    * ranges; its users are not visible in this part of the file - confirm.
    */
   66struct criu_svm_metadata {
   67        struct list_head list;        /* links this record into a list */
   68        struct kfd_criu_svm_range_priv_data data;        /* wrapped record */
   69};
  70
   /* Forward declarations of functions that are referenced before their
    * definitions (installed as a work handler, as the notifier callback below,
    * or defined further down in the file).
    */
   71static void svm_range_evict_svm_bo_worker(struct work_struct *work);
   72static bool
   73svm_range_cpu_invalidate_pagetables(struct mmu_interval_notifier *mni,
   74                                    const struct mmu_notifier_range *range,
   75                                    unsigned long cur_seq);
   76static int
   77svm_range_check_vm(struct kfd_process *p, uint64_t start, uint64_t last,
   78                   uint64_t *bo_s, uint64_t *bo_l);
   /* Interval notifier ops passed when an svm range's notifier is inserted;
    * only the invalidate callback is set.
    */
   79static const struct mmu_interval_notifier_ops svm_range_mn_ops = {
   80        .invalidate = svm_range_cpu_invalidate_pagetables,
   81};
  82
  83/**
  84 * svm_range_unlink - unlink svm_range from lists and interval tree
  85 * @prange: svm range structure to be removed
  86 *
  87 * Remove the svm_range from the svms and svm_bo lists and the svms
  88 * interval tree.
  89 *
  90 * Context: The caller must hold svms->lock
  91 */
  92static void svm_range_unlink(struct svm_range *prange)
  93{
  94        pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx]\n", prange->svms,
  95                 prange, prange->start, prange->last);
  96
  97        if (prange->svm_bo) {
  98                spin_lock(&prange->svm_bo->list_lock);
  99                list_del(&prange->svm_bo_list);
 100                spin_unlock(&prange->svm_bo->list_lock);
 101        }
 102
 103        list_del(&prange->list);
 104        if (prange->it_node.start != 0 && prange->it_node.last != 0)
 105                interval_tree_remove(&prange->it_node, &prange->svms->objects);
 106}
 107
 108static void
 109svm_range_add_notifier_locked(struct mm_struct *mm, struct svm_range *prange)
 110{
 111        pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx]\n", prange->svms,
 112                 prange, prange->start, prange->last);
 113
 114        mmu_interval_notifier_insert_locked(&prange->notifier, mm,
 115                                     prange->start << PAGE_SHIFT,
 116                                     prange->npages << PAGE_SHIFT,
 117                                     &svm_range_mn_ops);
 118}
 119
 120/**
 121 * svm_range_add_to_svms - add svm range to svms
 122 * @prange: svm range structure to be added
 123 *
 124 * Add the svm range to svms interval tree and link list
 125 *
 126 * Context: The caller must hold svms->lock
 127 */
 128static void svm_range_add_to_svms(struct svm_range *prange)
 129{
 130        pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx]\n", prange->svms,
 131                 prange, prange->start, prange->last);
 132
 133        list_move_tail(&prange->list, &prange->svms->list);
 134        prange->it_node.start = prange->start;
 135        prange->it_node.last = prange->last;
 136        interval_tree_insert(&prange->it_node, &prange->svms->objects);
 137}
 138
 139static void svm_range_remove_notifier(struct svm_range *prange)
 140{
 141        pr_debug("remove notifier svms 0x%p prange 0x%p [0x%lx 0x%lx]\n",
 142                 prange->svms, prange,
 143                 prange->notifier.interval_tree.start >> PAGE_SHIFT,
 144                 prange->notifier.interval_tree.last >> PAGE_SHIFT);
 145
 146        if (prange->notifier.interval_tree.start != 0 &&
 147            prange->notifier.interval_tree.last != 0)
 148                mmu_interval_notifier_remove(&prange->notifier);
 149}
 150
 151static bool
 152svm_is_valid_dma_mapping_addr(struct device *dev, dma_addr_t dma_addr)
 153{
 154        return dma_addr && !dma_mapping_error(dev, dma_addr) &&
 155               !(dma_addr & SVM_RANGE_VRAM_DOMAIN);
 156}
 157
 158static int
 159svm_range_dma_map_dev(struct amdgpu_device *adev, struct svm_range *prange,
 160                      unsigned long offset, unsigned long npages,
 161                      unsigned long *hmm_pfns, uint32_t gpuidx)
 162{
 163        enum dma_data_direction dir = DMA_BIDIRECTIONAL;
 164        dma_addr_t *addr = prange->dma_addr[gpuidx];
 165        struct device *dev = adev->dev;
 166        struct page *page;
 167        int i, r;
 168
 169        if (!addr) {
 170                addr = kvcalloc(prange->npages, sizeof(*addr), GFP_KERNEL);
 171                if (!addr)
 172                        return -ENOMEM;
 173                prange->dma_addr[gpuidx] = addr;
 174        }
 175
 176        addr += offset;
 177        for (i = 0; i < npages; i++) {
 178                if (svm_is_valid_dma_mapping_addr(dev, addr[i]))
 179                        dma_unmap_page(dev, addr[i], PAGE_SIZE, dir);
 180
 181                page = hmm_pfn_to_page(hmm_pfns[i]);
 182                if (is_zone_device_page(page)) {
 183                        struct amdgpu_device *bo_adev = prange->svm_bo->node->adev;
 184
 185                        addr[i] = (hmm_pfns[i] << PAGE_SHIFT) +
 186                                   bo_adev->vm_manager.vram_base_offset -
 187                                   bo_adev->kfd.pgmap.range.start;
 188                        addr[i] |= SVM_RANGE_VRAM_DOMAIN;
 189                        pr_debug_ratelimited("vram address: 0x%llx\n", addr[i]);
 190                        continue;
 191                }
 192                addr[i] = dma_map_page(dev, page, 0, PAGE_SIZE, dir);
 193                r = dma_mapping_error(dev, addr[i]);
 194                if (r) {
 195                        dev_err(dev, "failed %d dma_map_page\n", r);
 196                        return r;
 197                }
 198                pr_debug_ratelimited("dma mapping 0x%llx for page addr 0x%lx\n",
 199                                     addr[i] >> PAGE_SHIFT, page_to_pfn(page));
 200        }
 201
 202        return 0;
 203}
 204
 205static int
 206svm_range_dma_map(struct svm_range *prange, unsigned long *bitmap,
 207                  unsigned long offset, unsigned long npages,
 208                  unsigned long *hmm_pfns)
 209{
 210        struct kfd_process *p;
 211        uint32_t gpuidx;
 212        int r;
 213
 214        p = container_of(prange->svms, struct kfd_process, svms);
 215
 216        for_each_set_bit(gpuidx, bitmap, MAX_GPU_INSTANCE) {
 217                struct kfd_process_device *pdd;
 218
 219                pr_debug("mapping to gpu idx 0x%x\n", gpuidx);
 220                pdd = kfd_process_device_from_gpuidx(p, gpuidx);
 221                if (!pdd) {
 222                        pr_debug("failed to find device idx %d\n", gpuidx);
 223                        return -EINVAL;
 224                }
 225
 226                r = svm_range_dma_map_dev(pdd->dev->adev, prange, offset, npages,
 227                                          hmm_pfns, gpuidx);
 228                if (r)
 229                        break;
 230        }
 231
 232        return r;
 233}
 234
 235void svm_range_dma_unmap_dev(struct device *dev, dma_addr_t *dma_addr,
 236                         unsigned long offset, unsigned long npages)
 237{
 238        enum dma_data_direction dir = DMA_BIDIRECTIONAL;
 239        int i;
 240
 241        if (!dma_addr)
 242                return;
 243
 244        for (i = offset; i < offset + npages; i++) {
 245                if (!svm_is_valid_dma_mapping_addr(dev, dma_addr[i]))
 246                        continue;
 247                pr_debug_ratelimited("unmap 0x%llx\n", dma_addr[i] >> PAGE_SHIFT);
 248                dma_unmap_page(dev, dma_addr[i], PAGE_SIZE, dir);
 249                dma_addr[i] = 0;
 250        }
 251}
 252
 253void svm_range_dma_unmap(struct svm_range *prange)
 254{
 255        struct kfd_process_device *pdd;
 256        dma_addr_t *dma_addr;
 257        struct device *dev;
 258        struct kfd_process *p;
 259        uint32_t gpuidx;
 260
 261        p = container_of(prange->svms, struct kfd_process, svms);
 262
 263        for (gpuidx = 0; gpuidx < MAX_GPU_INSTANCE; gpuidx++) {
 264                dma_addr = prange->dma_addr[gpuidx];
 265                if (!dma_addr)
 266                        continue;
 267
 268                pdd = kfd_process_device_from_gpuidx(p, gpuidx);
 269                if (!pdd) {
 270                        pr_debug("failed to find device idx %d\n", gpuidx);
 271                        continue;
 272                }
 273                dev = &pdd->dev->adev->pdev->dev;
 274
 275                svm_range_dma_unmap_dev(dev, dma_addr, 0, prange->npages);
 276        }
 277}
 278
 279static void svm_range_free(struct svm_range *prange, bool do_unmap)
 280{
 281        uint64_t size = (prange->last - prange->start + 1) << PAGE_SHIFT;
 282        struct kfd_process *p = container_of(prange->svms, struct kfd_process, svms);
 283        uint32_t gpuidx;
 284
 285        pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx]\n", prange->svms, prange,
 286                 prange->start, prange->last);
 287
 288        svm_range_vram_node_free(prange);
 289        if (do_unmap)
 290                svm_range_dma_unmap(prange);
 291
 292        if (do_unmap && !p->xnack_enabled) {
 293                pr_debug("unreserve prange 0x%p size: 0x%llx\n", prange, size);
 294                amdgpu_amdkfd_unreserve_mem_limit(NULL, size,
 295                                        KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, 0);
 296        }
 297
 298        /* free dma_addr array for each gpu */
 299        for (gpuidx = 0; gpuidx < MAX_GPU_INSTANCE; gpuidx++) {
 300                if (prange->dma_addr[gpuidx]) {
 301                        kvfree(prange->dma_addr[gpuidx]);
 302                        prange->dma_addr[gpuidx] = NULL;
 303                }
 304        }
 305
 306        mutex_destroy(&prange->lock);
 307        mutex_destroy(&prange->migrate_mutex);
 308        kfree(prange);
 309}
 310
 311static void
 312svm_range_set_default_attributes(struct svm_range_list *svms, int32_t *location,
 313                                 int32_t *prefetch_loc, uint8_t *granularity,
 314                                 uint32_t *flags)
 315{
 316        *location = KFD_IOCTL_SVM_LOCATION_UNDEFINED;
 317        *prefetch_loc = KFD_IOCTL_SVM_LOCATION_UNDEFINED;
 318        *granularity = svms->default_granularity;
 319        *flags =
 320                KFD_IOCTL_SVM_FLAG_HOST_ACCESS | KFD_IOCTL_SVM_FLAG_COHERENT;
 321}
 322
 323static struct
 324svm_range *svm_range_new(struct svm_range_list *svms, uint64_t start,
 325                         uint64_t last, bool update_mem_usage)
 326{
 327        uint64_t size = last - start + 1;
 328        struct svm_range *prange;
 329        struct kfd_process *p;
 330
 331        prange = kzalloc(sizeof(*prange), GFP_KERNEL);
 332        if (!prange)
 333                return NULL;
 334
 335        p = container_of(svms, struct kfd_process, svms);
 336        if (!p->xnack_enabled && update_mem_usage &&
 337            amdgpu_amdkfd_reserve_mem_limit(NULL, size << PAGE_SHIFT,
 338                                    KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, 0)) {
 339                pr_info("SVM mapping failed, exceeds resident system memory limit\n");
 340                kfree(prange);
 341                return NULL;
 342        }
 343        prange->npages = size;
 344        prange->svms = svms;
 345        prange->start = start;
 346        prange->last = last;
 347        INIT_LIST_HEAD(&prange->list);
 348        INIT_LIST_HEAD(&prange->update_list);
 349        INIT_LIST_HEAD(&prange->svm_bo_list);
 350        INIT_LIST_HEAD(&prange->deferred_list);
 351        INIT_LIST_HEAD(&prange->child_list);
 352        atomic_set(&prange->invalid, 0);
 353        prange->validate_timestamp = 0;
 354        prange->vram_pages = 0;
 355        mutex_init(&prange->migrate_mutex);
 356        mutex_init(&prange->lock);
 357
 358        if (p->xnack_enabled)
 359                bitmap_copy(prange->bitmap_access, svms->bitmap_supported,
 360                            MAX_GPU_INSTANCE);
 361
 362        svm_range_set_default_attributes(svms, &prange->preferred_loc,
 363                                         &prange->prefetch_loc,
 364                                         &prange->granularity, &prange->flags);
 365
 366        pr_debug("svms 0x%p [0x%llx 0x%llx]\n", svms, start, last);
 367
 368        return prange;
 369}
 370
 371static bool svm_bo_ref_unless_zero(struct svm_range_bo *svm_bo)
 372{
 373        if (!svm_bo || !kref_get_unless_zero(&svm_bo->kref))
 374                return false;
 375
 376        return true;
 377}
 378
 379static void svm_range_bo_release(struct kref *kref)
 380{
 381        struct svm_range_bo *svm_bo;
 382
 383        svm_bo = container_of(kref, struct svm_range_bo, kref);
 384        pr_debug("svm_bo 0x%p\n", svm_bo);
 385
 386        spin_lock(&svm_bo->list_lock);
 387        while (!list_empty(&svm_bo->range_list)) {
 388                struct svm_range *prange =
 389                                list_first_entry(&svm_bo->range_list,
 390                                                struct svm_range, svm_bo_list);
 391                /* list_del_init tells a concurrent svm_range_vram_node_new when
 392                 * it's safe to reuse the svm_bo pointer and svm_bo_list head.
 393                 */
 394                list_del_init(&prange->svm_bo_list);
 395                spin_unlock(&svm_bo->list_lock);
 396
 397                pr_debug("svms 0x%p [0x%lx 0x%lx]\n", prange->svms,
 398                         prange->start, prange->last);
 399                mutex_lock(&prange->lock);
 400                prange->svm_bo = NULL;
 401                /* prange should not hold vram page now */
 402                WARN_ONCE(prange->actual_loc, "prange should not hold vram page");
 403                mutex_unlock(&prange->lock);
 404
 405                spin_lock(&svm_bo->list_lock);
 406        }
 407        spin_unlock(&svm_bo->list_lock);
 408
 409        if (mmget_not_zero(svm_bo->eviction_fence->mm)) {
 410                struct kfd_process_device *pdd;
 411                struct kfd_process *p;
 412                struct mm_struct *mm;
 413
 414                mm = svm_bo->eviction_fence->mm;
 415                /*
 416                 * The forked child process takes svm_bo device pages ref, svm_bo could be
 417                 * released after parent process is gone.
 418                 */
 419                p = kfd_lookup_process_by_mm(mm);
 420                if (p) {
 421                        pdd = kfd_get_process_device_data(svm_bo->node, p);
 422                        if (pdd)
 423                                atomic64_sub(amdgpu_bo_size(svm_bo->bo), &pdd->vram_usage);
 424                        kfd_unref_process(p);
 425                }
 426                mmput(mm);
 427        }
 428
 429        if (!dma_fence_is_signaled(&svm_bo->eviction_fence->base))
 430                /* We're not in the eviction worker. Signal the fence. */
 431                dma_fence_signal(&svm_bo->eviction_fence->base);
 432        dma_fence_put(&svm_bo->eviction_fence->base);
 433        amdgpu_bo_unref(&svm_bo->bo);
 434        kfree(svm_bo);
 435}
 436
 437static void svm_range_bo_wq_release(struct work_struct *work)
 438{
 439        struct svm_range_bo *svm_bo;
 440
 441        svm_bo = container_of(work, struct svm_range_bo, release_work);
 442        svm_range_bo_release(&svm_bo->kref);
 443}
 444
 445static void svm_range_bo_release_async(struct kref *kref)
 446{
 447        struct svm_range_bo *svm_bo;
 448
 449        svm_bo = container_of(kref, struct svm_range_bo, kref);
 450        pr_debug("svm_bo 0x%p\n", svm_bo);
 451        INIT_WORK(&svm_bo->release_work, svm_range_bo_wq_release);
 452        schedule_work(&svm_bo->release_work);
 453}
 454
 455void svm_range_bo_unref_async(struct svm_range_bo *svm_bo)
 456{
 457        kref_put(&svm_bo->kref, svm_range_bo_release_async);
 458}
 459
 460static void svm_range_bo_unref(struct svm_range_bo *svm_bo)
 461{
 462        if (svm_bo)
 463                kref_put(&svm_bo->kref, svm_range_bo_release);
 464}
 465
  /*
   * Decide whether @prange can keep using its current svm_bo for VRAM on @node.
   *
   * Returns true when the range already holds its BO reference (ttm_res is
   * set), or when a new reference could be taken on a BO that lives on @node
   * and is not being evicted; in the latter case ttm_res is refreshed from
   * the BO's resource.
   *
   * Returns false when the range has no svm_bo, when the BO belongs to a
   * different node (the range is then taken off that BO's range list and the
   * reference just taken is dropped), when the BO is being evicted (the
   * eviction fence is waited for), or when no reference could be taken. In
   * the last two cases the function waits until the range is off the old
   * BO's range list and prange->svm_bo has been cleared.
   */
  466static bool
  467svm_range_validate_svm_bo(struct kfd_node *node, struct svm_range *prange)
  468{
  469        mutex_lock(&prange->lock);
  470        if (!prange->svm_bo) {
  471                mutex_unlock(&prange->lock);
  472                return false;
  473        }
  474        if (prange->ttm_res) {
  475                /* We still have a reference, all is well */
  476                mutex_unlock(&prange->lock);
  477                return true;
  478        }
             /* No reference held by the range: try to take one on the old BO */
  479        if (svm_bo_ref_unless_zero(prange->svm_bo)) {
  480                /*
  481                 * Migrate from GPU to GPU, remove range from source svm_bo->node
  482                 * range list, and return false to allocate svm_bo from destination
  483                 * node.
  484                 */
  485                if (prange->svm_bo->node != node) {
  486                        mutex_unlock(&prange->lock);
  487
  488                        spin_lock(&prange->svm_bo->list_lock);
  489                        list_del_init(&prange->svm_bo_list);
  490                        spin_unlock(&prange->svm_bo->list_lock);
  491
  492                        svm_range_bo_unref(prange->svm_bo);
  493                        return false;
  494                }
  495                if (READ_ONCE(prange->svm_bo->evicting)) {
  496                        struct dma_fence *f;
  497                        struct svm_range_bo *svm_bo;
  498                        /* The BO is getting evicted,
  499                         * we need to get a new one
  500                         */
  501                        mutex_unlock(&prange->lock);
  502                        svm_bo = prange->svm_bo;
                             /* Hold the fence across the unref so it can be waited on */
  503                        f = dma_fence_get(&svm_bo->eviction_fence->base);
  504                        svm_range_bo_unref(prange->svm_bo);
  505                        /* wait for the fence to avoid long spin-loop
  506                         * at list_empty_careful
  507                         */
  508                        dma_fence_wait(f, false);
  509                        dma_fence_put(f);
  510                } else {
  511                        /* The BO was still around and we got
  512                         * a new reference to it
  513                         */
  514                        mutex_unlock(&prange->lock);
  515                        pr_debug("reuse old bo svms 0x%p [0x%lx 0x%lx]\n",
  516                                 prange->svms, prange->start, prange->last);
  517
  518                        prange->ttm_res = prange->svm_bo->bo->tbo.resource;
  519                        return true;
  520                }
  521
  522        } else {
  523                mutex_unlock(&prange->lock);
  524        }
  525
  526        /* We need a new svm_bo. Spin-loop to wait for concurrent
  527         * svm_range_bo_release to finish removing this range from
  528         * its range list and set prange->svm_bo to null. After this,
  529         * it is safe to reuse the svm_bo pointer and svm_bo_list head.
  530         */
  531        while (!list_empty_careful(&prange->svm_bo_list) || prange->svm_bo)
  532                cond_resched();
  533
  534        return false;
  535}
 536
 537static struct svm_range_bo *svm_range_bo_new(void)
 538{
 539        struct svm_range_bo *svm_bo;
 540
 541        svm_bo = kzalloc(sizeof(*svm_bo), GFP_KERNEL);
 542        if (!svm_bo)
 543                return NULL;
 544
 545        kref_init(&svm_bo->kref);
 546        INIT_LIST_HEAD(&svm_bo->range_list);
 547        spin_lock_init(&svm_bo->list_lock);
 548
 549        return svm_bo;
 550}
 551
 552int
 553svm_range_vram_node_new(struct kfd_node *node, struct svm_range *prange,
 554                        bool clear)
 555{
 556        struct kfd_process_device *pdd;
 557        struct amdgpu_bo_param bp;
 558        struct svm_range_bo *svm_bo;
 559        struct amdgpu_bo_user *ubo;
 560        struct amdgpu_bo *bo;
 561        struct kfd_process *p;
 562        struct mm_struct *mm;
 563        int r;
 564
 565        p = container_of(prange->svms, struct kfd_process, svms);
 566        pr_debug("process pid: %d svms 0x%p [0x%lx 0x%lx]\n",
 567                 p->lead_thread->pid, prange->svms,
 568                 prange->start, prange->last);
 569
 570        if (svm_range_validate_svm_bo(node, prange))
 571                return 0;
 572
 573        svm_bo = svm_range_bo_new();
 574        if (!svm_bo) {
 575                pr_debug("failed to alloc svm bo\n");
 576                return -ENOMEM;
 577        }
 578        mm = get_task_mm(p->lead_thread);
 579        if (!mm) {
 580                pr_debug("failed to get mm\n");
 581                kfree(svm_bo);
 582                return -ESRCH;
 583        }
 584        svm_bo->node = node;
 585        svm_bo->eviction_fence =
 586                amdgpu_amdkfd_fence_create(dma_fence_context_alloc(1),
 587                                           mm,
 588                                           svm_bo);
 589        mmput(mm);
 590        INIT_WORK(&svm_bo->eviction_work, svm_range_evict_svm_bo_worker);
 591        svm_bo->evicting = 0;
 592        memset(&bp, 0, sizeof(bp));
 593        bp.size = prange->npages * PAGE_SIZE;
 594        bp.byte_align = PAGE_SIZE;
 595        bp.domain = AMDGPU_GEM_DOMAIN_VRAM;
 596        bp.flags = AMDGPU_GEM_CREATE_NO_CPU_ACCESS;
 597        bp.flags |= clear ? AMDGPU_GEM_CREATE_VRAM_CLEARED : 0;
 598        bp.flags |= AMDGPU_GEM_CREATE_DISCARDABLE;
 599        bp.type = ttm_bo_type_device;
 600        bp.resv = NULL;
 601        if (node->xcp)
 602                bp.xcp_id_plus1 = node->xcp->id + 1;
 603
 604        r = amdgpu_bo_create_user(node->adev, &bp, &ubo);
 605        if (r) {
 606                pr_debug("failed %d to create bo\n", r);
 607                goto create_bo_failed;
 608        }
 609        bo = &ubo->bo;
 610
 611        pr_debug("alloc bo at offset 0x%lx size 0x%lx on partition %d\n",
 612                 bo->tbo.resource->start << PAGE_SHIFT, bp.size,
 613                 bp.xcp_id_plus1 - 1);
 614
 615        r = amdgpu_bo_reserve(bo, true);
 616        if (r) {
 617                pr_debug("failed %d to reserve bo\n", r);
 618                goto reserve_bo_failed;
 619        }
 620
 621        if (clear) {
 622                r = amdgpu_bo_sync_wait(bo, AMDGPU_FENCE_OWNER_KFD, false);
 623                if (r) {
 624                        pr_debug("failed %d to sync bo\n", r);
 625                        amdgpu_bo_unreserve(bo);
 626                        goto reserve_bo_failed;
 627                }
 628        }
 629
 630        r = dma_resv_reserve_fences(bo->tbo.base.resv, 1);
 631        if (r) {
 632                pr_debug("failed %d to reserve bo\n", r);
 633                amdgpu_bo_unreserve(bo);
 634                goto reserve_bo_failed;
 635        }
 636        amdgpu_bo_fence(bo, &svm_bo->eviction_fence->base, true);
 637
 638        amdgpu_bo_unreserve(bo);
 639
 640        svm_bo->bo = bo;
 641        prange->svm_bo = svm_bo;
 642        prange->ttm_res = bo->tbo.resource;
 643        prange->offset = 0;
 644
 645        spin_lock(&svm_bo->list_lock);
 646        list_add(&prange->svm_bo_list, &svm_bo->range_list);
 647        spin_unlock(&svm_bo->list_lock);
 648
 649        pdd = svm_range_get_pdd_by_node(prange, node);
 650        if (pdd)
 651                atomic64_add(amdgpu_bo_size(bo), &pdd->vram_usage);
 652
 653        return 0;
 654
 655reserve_bo_failed:
 656        amdgpu_bo_unref(&bo);
 657create_bo_failed:
 658        dma_fence_put(&svm_bo->eviction_fence->base);
 659        kfree(svm_bo);
 660        prange->ttm_res = NULL;
 661
 662        return r;
 663}
 664
 665void svm_range_vram_node_free(struct svm_range *prange)
 666{
 667        /* serialize prange->svm_bo unref */
 668        mutex_lock(&prange->lock);
 669        /* prange->svm_bo has not been unref */
 670        if (prange->ttm_res) {
 671                prange->ttm_res = NULL;
 672                mutex_unlock(&prange->lock);
 673                svm_range_bo_unref(prange->svm_bo);
 674        } else
 675                mutex_unlock(&prange->lock);
 676}
 677
 678struct kfd_node *
 679svm_range_get_node_by_id(struct svm_range *prange, uint32_t gpu_id)
 680{
 681        struct kfd_process *p;
 682        struct kfd_process_device *pdd;
 683
 684        p = container_of(prange->svms, struct kfd_process, svms);
 685        pdd = kfd_process_device_data_by_id(p, gpu_id);
 686        if (!pdd) {
 687                pr_debug("failed to get kfd process device by id 0x%x\n", gpu_id);
 688                return NULL;
 689        }
 690
 691        return pdd->dev;
 692}
 693
 694struct kfd_process_device *
 695svm_range_get_pdd_by_node(struct svm_range *prange, struct kfd_node *node)
 696{
 697        struct kfd_process *p;
 698
 699        p = container_of(prange->svms, struct kfd_process, svms);
 700
 701        return kfd_get_process_device_data(node, p);
 702}
 703
 704static int svm_range_bo_validate(void *param, struct amdgpu_bo *bo)
 705{
 706        struct ttm_operation_ctx ctx = { false, false };
 707
 708        amdgpu_bo_placement_from_domain(bo, AMDGPU_GEM_DOMAIN_VRAM);
 709
 710        return ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
 711}
 712
 713static int
 714svm_range_check_attr(struct kfd_process *p,
 715                     uint32_t nattr, struct kfd_ioctl_svm_attribute *attrs)
 716{
 717        uint32_t i;
 718
 719        for (i = 0; i < nattr; i++) {
 720                uint32_t val = attrs[i].value;
 721                int gpuidx = MAX_GPU_INSTANCE;
 722
 723                switch (attrs[i].type) {
 724                case KFD_IOCTL_SVM_ATTR_PREFERRED_LOC:
 725                        if (val != KFD_IOCTL_SVM_LOCATION_SYSMEM &&
 726                            val != KFD_IOCTL_SVM_LOCATION_UNDEFINED)
 727                                gpuidx = kfd_process_gpuidx_from_gpuid(p, val);
 728                        break;
 729                case KFD_IOCTL_SVM_ATTR_PREFETCH_LOC:
 730                        if (val != KFD_IOCTL_SVM_LOCATION_SYSMEM)
 731                                gpuidx = kfd_process_gpuidx_from_gpuid(p, val);
 732                        break;
 733                case KFD_IOCTL_SVM_ATTR_ACCESS:
 734                case KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE:
 735                case KFD_IOCTL_SVM_ATTR_NO_ACCESS:
 736                        gpuidx = kfd_process_gpuidx_from_gpuid(p, val);
 737                        break;
 738                case KFD_IOCTL_SVM_ATTR_SET_FLAGS:
 739                        break;
 740                case KFD_IOCTL_SVM_ATTR_CLR_FLAGS:
 741                        break;
 742                case KFD_IOCTL_SVM_ATTR_GRANULARITY:
 743                        break;
 744                default:
 745                        pr_debug("unknown attr type 0x%x\n", attrs[i].type);
 746                        return -EINVAL;
 747                }
 748
 749                if (gpuidx < 0) {
 750                        pr_debug("no GPU 0x%x found\n", val);
 751                        return -EINVAL;
 752                } else if (gpuidx < MAX_GPU_INSTANCE &&
 753                           !test_bit(gpuidx, p->svms.bitmap_supported)) {
 754                        pr_debug("GPU 0x%x not supported\n", val);
 755                        return -EINVAL;
 756                }
 757        }
 758
 759        return 0;
 760}
 761
 762static void
 763svm_range_apply_attrs(struct kfd_process *p, struct svm_range *prange,
 764                      uint32_t nattr, struct kfd_ioctl_svm_attribute *attrs,
 765                      bool *update_mapping)
 766{
 767        uint32_t i;
 768        int gpuidx;
 769
 770        for (i = 0; i < nattr; i++) {
 771                switch (attrs[i].type) {
 772                case KFD_IOCTL_SVM_ATTR_PREFERRED_LOC:
 773                        prange->preferred_loc = attrs[i].value;
 774                        break;
 775                case KFD_IOCTL_SVM_ATTR_PREFETCH_LOC:
 776                        prange->prefetch_loc = attrs[i].value;
 777                        break;
 778                case KFD_IOCTL_SVM_ATTR_ACCESS:
 779                case KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE:
 780                case KFD_IOCTL_SVM_ATTR_NO_ACCESS:
 781                        if (!p->xnack_enabled)
 782                                *update_mapping = true;
 783
 784                        gpuidx = kfd_process_gpuidx_from_gpuid(p,
 785                                                               attrs[i].value);
 786                        if (attrs[i].type == KFD_IOCTL_SVM_ATTR_NO_ACCESS) {
 787                                bitmap_clear(prange->bitmap_access, gpuidx, 1);
 788                                bitmap_clear(prange->bitmap_aip, gpuidx, 1);
 789                        } else if (attrs[i].type == KFD_IOCTL_SVM_ATTR_ACCESS) {
 790                                bitmap_set(prange->bitmap_access, gpuidx, 1);
 791                                bitmap_clear(prange->bitmap_aip, gpuidx, 1);
 792                        } else {
 793                                bitmap_clear(prange->bitmap_access, gpuidx, 1);
 794                                bitmap_set(prange->bitmap_aip, gpuidx, 1);
 795                        }
 796                        break;
 797                case KFD_IOCTL_SVM_ATTR_SET_FLAGS:
 798                        *update_mapping = true;
 799                        prange->flags |= attrs[i].value;
 800                        break;
 801                case KFD_IOCTL_SVM_ATTR_CLR_FLAGS:
 802                        *update_mapping = true;
 803                        prange->flags &= ~attrs[i].value;
 804                        break;
 805                case KFD_IOCTL_SVM_ATTR_GRANULARITY:
 806                        prange->granularity = min_t(uint32_t, attrs[i].value, 0x3F);
 807                        break;
 808                default:
 809                        WARN_ONCE(1, "svm_range_check_attrs wasn't called?");
 810                }
 811        }
 812}
 813
 814static bool
 815svm_range_is_same_attrs(struct kfd_process *p, struct svm_range *prange,
 816                        uint32_t nattr, struct kfd_ioctl_svm_attribute *attrs)
 817{
 818        uint32_t i;
 819        int gpuidx;
 820
 821        for (i = 0; i < nattr; i++) {
 822                switch (attrs[i].type) {
 823                case KFD_IOCTL_SVM_ATTR_PREFERRED_LOC:
 824                        if (prange->preferred_loc != attrs[i].value)
 825                                return false;
 826                        break;
 827                case KFD_IOCTL_SVM_ATTR_PREFETCH_LOC:
 828                        /* Prefetch should always trigger a migration even
 829                         * if the value of the attribute didn't change.
 830                         */
 831                        return false;
 832                case KFD_IOCTL_SVM_ATTR_ACCESS:
 833                case KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE:
 834                case KFD_IOCTL_SVM_ATTR_NO_ACCESS:
 835                        gpuidx = kfd_process_gpuidx_from_gpuid(p,
 836                                                               attrs[i].value);
 837                        if (attrs[i].type == KFD_IOCTL_SVM_ATTR_NO_ACCESS) {
 838                                if (test_bit(gpuidx, prange->bitmap_access) ||
 839                                    test_bit(gpuidx, prange->bitmap_aip))
 840                                        return false;
 841                        } else if (attrs[i].type == KFD_IOCTL_SVM_ATTR_ACCESS) {
 842                                if (!test_bit(gpuidx, prange->bitmap_access))
 843                                        return false;
 844                        } else {
 845                                if (!test_bit(gpuidx, prange->bitmap_aip))
 846                                        return false;
 847                        }
 848                        break;
 849                case KFD_IOCTL_SVM_ATTR_SET_FLAGS:
 850                        if ((prange->flags & attrs[i].value) != attrs[i].value)
 851                                return false;
 852                        break;
 853                case KFD_IOCTL_SVM_ATTR_CLR_FLAGS:
 854                        if ((prange->flags & attrs[i].value) != 0)
 855                                return false;
 856                        break;
 857                case KFD_IOCTL_SVM_ATTR_GRANULARITY:
 858                        if (prange->granularity != attrs[i].value)
 859                                return false;
 860                        break;
 861                default:
 862                        WARN_ONCE(1, "svm_range_check_attrs wasn't called?");
 863                }
 864        }
 865
 866        return true;
 867}
 868
 869/**
 870 * svm_range_debug_dump - print all range information from svms
 871 * @svms: svm range list header
 872 *
 873 * debug output svm range start, end, prefetch location from svms
 874 * interval tree and link list
 875 *
 876 * Context: The caller must hold svms->lock
 877 */
 878static void svm_range_debug_dump(struct svm_range_list *svms)
 879{
 880        struct interval_tree_node *node;
 881        struct svm_range *prange;
 882
 883        pr_debug("dump svms 0x%p list\n", svms);
 884        pr_debug("range\tstart\tpage\tend\t\tlocation\n");
 885
 886        list_for_each_entry(prange, &svms->list, list) {
 887                pr_debug("0x%p 0x%lx\t0x%llx\t0x%llx\t0x%x\n",
 888                         prange, prange->start, prange->npages,
 889                         prange->start + prange->npages - 1,
 890                         prange->actual_loc);
 891        }
 892
 893        pr_debug("dump svms 0x%p interval tree\n", svms);
 894        pr_debug("range\tstart\tpage\tend\t\tlocation\n");
 895        node = interval_tree_iter_first(&svms->objects, 0, ~0ULL);
 896        while (node) {
 897                prange = container_of(node, struct svm_range, it_node);
 898                pr_debug("0x%p 0x%lx\t0x%llx\t0x%llx\t0x%x\n",
 899                         prange, prange->start, prange->npages,
 900                         prange->start + prange->npages - 1,
 901                         prange->actual_loc);
 902                node = interval_tree_iter_next(node, 0, ~0ULL);
 903        }
 904}
 905
 906static void *
 907svm_range_copy_array(void *psrc, size_t size, uint64_t num_elements,
 908                     uint64_t offset, uint64_t *vram_pages)
 909{
 910        unsigned char *src = (unsigned char *)psrc + offset;
 911        unsigned char *dst;
 912        uint64_t i;
 913
 914        dst = kvmalloc_array(num_elements, size, GFP_KERNEL);
 915        if (!dst)
 916                return NULL;
 917
 918        if (!vram_pages) {
 919                memcpy(dst, src, num_elements * size);
 920                return (void *)dst;
 921        }
 922
 923        *vram_pages = 0;
 924        for (i = 0; i < num_elements; i++) {
 925                dma_addr_t *temp;
 926                temp = (dma_addr_t *)dst + i;
 927                *temp = *((dma_addr_t *)src + i);
 928                if (*temp&SVM_RANGE_VRAM_DOMAIN)
 929                        (*vram_pages)++;
 930        }
 931
 932        return (void *)dst;
 933}
 934
 935static int
 936svm_range_copy_dma_addrs(struct svm_range *dst, struct svm_range *src)
 937{
 938        int i;
 939
 940        for (i = 0; i < MAX_GPU_INSTANCE; i++) {
 941                if (!src->dma_addr[i])
 942                        continue;
 943                dst->dma_addr[i] = svm_range_copy_array(src->dma_addr[i],
 944                                        sizeof(*src->dma_addr[i]), src->npages, 0, NULL);
 945                if (!dst->dma_addr[i])
 946                        return -ENOMEM;
 947        }
 948
 949        return 0;
 950}
 951
 952static int
 953svm_range_split_array(void *ppnew, void *ppold, size_t size,
 954                      uint64_t old_start, uint64_t old_n,
 955                      uint64_t new_start, uint64_t new_n, uint64_t *new_vram_pages)
 956{
 957        unsigned char *new, *old, *pold;
 958        uint64_t d;
 959
 960        if (!ppold)
 961                return 0;
 962        pold = *(unsigned char **)ppold;
 963        if (!pold)
 964                return 0;
 965
 966        d = (new_start - old_start) * size;
 967        /* get dma addr array for new range and calculte its vram page number */
 968        new = svm_range_copy_array(pold, size, new_n, d, new_vram_pages);
 969        if (!new)
 970                return -ENOMEM;
 971        d = (new_start == old_start) ? new_n * size : 0;
 972        old = svm_range_copy_array(pold, size, old_n, d, NULL);
 973        if (!old) {
 974                kvfree(new);
 975                return -ENOMEM;
 976        }
 977        kvfree(pold);
 978        *(void **)ppold = old;
 979        *(void **)ppnew = new;
 980
 981        return 0;
 982}
 983
 984static int
 985svm_range_split_pages(struct svm_range *new, struct svm_range *old,
 986                      uint64_t start, uint64_t last)
 987{
 988        uint64_t npages = last - start + 1;
 989        int i, r;
 990
 991        for (i = 0; i < MAX_GPU_INSTANCE; i++) {
 992                r = svm_range_split_array(&new->dma_addr[i], &old->dma_addr[i],
 993                                          sizeof(*old->dma_addr[i]), old->start,
 994                                          npages, new->start, new->npages,
 995                                          old->actual_loc ? &new->vram_pages : NULL);
 996                if (r)
 997                        return r;
 998        }
 999        if (old->actual_loc)
1000                old->vram_pages -= new->vram_pages;
1001
1002        return 0;
1003}
1004
1005static int
1006svm_range_split_nodes(struct svm_range *new, struct svm_range *old,
1007                      uint64_t start, uint64_t last)
1008{
1009        uint64_t npages = last - start + 1;
1010
1011        pr_debug("svms 0x%p new prange 0x%p start 0x%lx [0x%llx 0x%llx]\n",
1012                 new->svms, new, new->start, start, last);
1013
1014        if (new->start == old->start) {
1015                new->offset = old->offset;
1016                old->offset += new->npages;
1017        } else {
1018                new->offset = old->offset + npages;
1019        }
1020
1021        new->svm_bo = svm_range_bo_ref(old->svm_bo);
1022        new->ttm_res = old->ttm_res;
1023
1024        spin_lock(&new->svm_bo->list_lock);
1025        list_add(&new->svm_bo_list, &new->svm_bo->range_list);
1026        spin_unlock(&new->svm_bo->list_lock);
1027
1028        return 0;
1029}
1030
1031/**
1032 * svm_range_split_adjust - split range and adjust
1033 *
1034 * @new: new range
1035 * @old: the old range
1036 * @start: the old range adjust to start address in pages
1037 * @last: the old range adjust to last address in pages
1038 *
1039 * Copy system memory dma_addr or vram ttm_res in old range to new
1040 * range from new_start up to size new->npages, the remaining old range is from
1041 * start to last
1042 *
1043 * Return:
1044 * 0 - OK, -ENOMEM - out of memory
1045 */
1046static int
1047svm_range_split_adjust(struct svm_range *new, struct svm_range *old,
1048                      uint64_t start, uint64_t last)
1049{
1050        int r;
1051
1052        pr_debug("svms 0x%p new 0x%lx old [0x%lx 0x%lx] => [0x%llx 0x%llx]\n",
1053                 new->svms, new->start, old->start, old->last, start, last);
1054
1055        if (new->start < old->start ||
1056            new->last > old->last) {
1057                WARN_ONCE(1, "invalid new range start or last\n");
1058                return -EINVAL;
1059        }
1060
1061        r = svm_range_split_pages(new, old, start, last);
1062        if (r)
1063                return r;
1064
1065        if (old->actual_loc && old->ttm_res) {
1066                r = svm_range_split_nodes(new, old, start, last);
1067                if (r)
1068                        return r;
1069        }
1070
1071        old->npages = last - start + 1;
1072        old->start = start;
1073        old->last = last;
1074        new->flags = old->flags;
1075        new->preferred_loc = old->preferred_loc;
1076        new->prefetch_loc = old->prefetch_loc;
1077        new->actual_loc = old->actual_loc;
1078        new->granularity = old->granularity;
1079        new->mapped_to_gpu = old->mapped_to_gpu;
1080        bitmap_copy(new->bitmap_access, old->bitmap_access, MAX_GPU_INSTANCE);
1081        bitmap_copy(new->bitmap_aip, old->bitmap_aip, MAX_GPU_INSTANCE);
1082        atomic_set(&new->queue_refcount, atomic_read(&old->queue_refcount));
1083
1084        return 0;
1085}
1086
1087/**
1088 * svm_range_split - split a range in 2 ranges
1089 *
1090 * @prange: the svm range to split
1091 * @start: the remaining range start address in pages
1092 * @last: the remaining range last address in pages
1093 * @new: the result new range generated
1094 *
1095 * Two cases only:
1096 * case 1: if start == prange->start
1097 *         prange ==> prange[start, last]
1098 *         new range [last + 1, prange->last]
1099 *
1100 * case 2: if last == prange->last
1101 *         prange ==> prange[start, last]
1102 *         new range [prange->start, start - 1]
1103 *
1104 * Return:
1105 * 0 - OK, -ENOMEM - out of memory, -EINVAL - invalid start, last
1106 */
1107static int
1108svm_range_split(struct svm_range *prange, uint64_t start, uint64_t last,
1109                struct svm_range **new)
1110{
1111        uint64_t old_start = prange->start;
1112        uint64_t old_last = prange->last;
1113        struct svm_range_list *svms;
1114        int r = 0;
1115
1116        pr_debug("svms 0x%p [0x%llx 0x%llx] to [0x%llx 0x%llx]\n", prange->svms,
1117                 old_start, old_last, start, last);
1118
1119        if (old_start != start && old_last != last)
1120                return -EINVAL;
1121        if (start < old_start || last > old_last)
1122                return -EINVAL;
1123
1124        svms = prange->svms;
1125        if (old_start == start)
1126                *new = svm_range_new(svms, last + 1, old_last, false);
1127        else
1128                *new = svm_range_new(svms, old_start, start - 1, false);
1129        if (!*new)
1130                return -ENOMEM;
1131
1132        r = svm_range_split_adjust(*new, prange, start, last);
1133        if (r) {
1134                pr_debug("failed %d split [0x%llx 0x%llx] to [0x%llx 0x%llx]\n",
1135                         r, old_start, old_last, start, last);
1136                svm_range_free(*new, false);
1137                *new = NULL;
1138        }
1139
1140        return r;
1141}
1142
1143static int
1144svm_range_split_tail(struct svm_range *prange, uint64_t new_last,
1145                     struct list_head *insert_list, struct list_head *remap_list)
1146{
1147        struct svm_range *tail = NULL;
1148        int r = svm_range_split(prange, prange->start, new_last, &tail);
1149
1150        if (!r) {
1151                list_add(&tail->list, insert_list);
1152                if (!IS_ALIGNED(new_last + 1, 1UL << prange->granularity))
1153                        list_add(&tail->update_list, remap_list);
1154        }
1155        return r;
1156}
1157
1158static int
1159svm_range_split_head(struct svm_range *prange, uint64_t new_start,
1160                     struct list_head *insert_list, struct list_head *remap_list)
1161{
1162        struct svm_range *head = NULL;
1163        int r = svm_range_split(prange, new_start, prange->last, &head);
1164
1165        if (!r) {
1166                list_add(&head->list, insert_list);
1167                if (!IS_ALIGNED(new_start, 1UL << prange->granularity))
1168                        list_add(&head->update_list, remap_list);
1169        }
1170        return r;
1171}
1172
1173static void
1174svm_range_add_child(struct svm_range *prange, struct svm_range *pchild, enum svm_work_list_ops op)
1175{
1176        pr_debug("add child 0x%p [0x%lx 0x%lx] to prange 0x%p child list %d\n",
1177                 pchild, pchild->start, pchild->last, prange, op);
1178
1179        pchild->work_item.mm = NULL;
1180        pchild->work_item.op = op;
1181        list_add_tail(&pchild->child_list, &prange->child_list);
1182}
1183
1184static bool
1185svm_nodes_in_same_hive(struct kfd_node *node_a, struct kfd_node *node_b)
1186{
1187        return (node_a->adev == node_b->adev ||
1188                amdgpu_xgmi_same_hive(node_a->adev, node_b->adev));
1189}
1190
1191static uint64_t
1192svm_range_get_pte_flags(struct kfd_node *node,
1193                        struct svm_range *prange, int domain)
1194{
1195        struct kfd_node *bo_node;
1196        uint32_t flags = prange->flags;
1197        uint32_t mapping_flags = 0;
1198        uint32_t gc_ip_version = KFD_GC_VERSION(node);
1199        uint64_t pte_flags;
1200        bool snoop = (domain != SVM_RANGE_VRAM_DOMAIN);
1201        bool coherent = flags & (KFD_IOCTL_SVM_FLAG_COHERENT | KFD_IOCTL_SVM_FLAG_EXT_COHERENT);
1202        bool ext_coherent = flags & KFD_IOCTL_SVM_FLAG_EXT_COHERENT;
1203        unsigned int mtype_local;
1204
1205        if (domain == SVM_RANGE_VRAM_DOMAIN)
1206                bo_node = prange->svm_bo->node;
1207
1208        switch (gc_ip_version) {
1209        case IP_VERSION(9, 4, 1):
1210                if (domain == SVM_RANGE_VRAM_DOMAIN) {
1211                        if (bo_node == node) {
1212                                mapping_flags |= coherent ?
1213                                        AMDGPU_VM_MTYPE_CC : AMDGPU_VM_MTYPE_RW;
1214                        } else {
1215                                mapping_flags |= coherent ?
1216                                        AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC;
1217                                if (svm_nodes_in_same_hive(node, bo_node))
1218                                        snoop = true;
1219                        }
1220                } else {
1221                        mapping_flags |= coherent ?
1222                                AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC;
1223                }
1224                break;
1225        case IP_VERSION(9, 4, 2):
1226                if (domain == SVM_RANGE_VRAM_DOMAIN) {
1227                        if (bo_node == node) {
1228                                mapping_flags |= coherent ?
1229                                        AMDGPU_VM_MTYPE_CC : AMDGPU_VM_MTYPE_RW;
1230                                if (node->adev->gmc.xgmi.connected_to_cpu)
1231                                        snoop = true;
1232                        } else {
1233                                mapping_flags |= coherent ?
1234                                        AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC;
1235                                if (svm_nodes_in_same_hive(node, bo_node))
1236                                        snoop = true;
1237                        }
1238                } else {
1239                        mapping_flags |= coherent ?
1240                                AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC;
1241                }
1242                break;
1243        case IP_VERSION(9, 4, 3):
1244        case IP_VERSION(9, 4, 4):
1245        case IP_VERSION(9, 5, 0):
1246                if (ext_coherent)
1247                        mtype_local = AMDGPU_VM_MTYPE_CC;
1248                else
1249                        mtype_local = amdgpu_mtype_local == 1 ? AMDGPU_VM_MTYPE_NC :
1250                                amdgpu_mtype_local == 2 ? AMDGPU_VM_MTYPE_CC : AMDGPU_VM_MTYPE_RW;
1251                snoop = true;
1252                if (domain == SVM_RANGE_VRAM_DOMAIN) {
1253                        /* local HBM region close to partition */
1254                        if (bo_node->adev == node->adev &&
1255                            (!bo_node->xcp || !node->xcp || bo_node->xcp->mem_id == node->xcp->mem_id))
1256                                mapping_flags |= mtype_local;
1257                        /* local HBM region far from partition or remote XGMI GPU
1258                         * with regular system scope coherence
1259                         */
1260                        else if (svm_nodes_in_same_hive(bo_node, node) && !ext_coherent)
1261                                mapping_flags |= AMDGPU_VM_MTYPE_NC;
1262                        /* PCIe P2P on GPUs pre-9.5.0 */
1263                        else if (gc_ip_version < IP_VERSION(9, 5, 0) &&
1264                                 !svm_nodes_in_same_hive(bo_node, node))
1265                                mapping_flags |= AMDGPU_VM_MTYPE_UC;
1266                        /* Other remote memory */
1267                        else
1268                                mapping_flags |= ext_coherent ? AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC;
1269                /* system memory accessed by the APU */
1270                } else if (node->adev->flags & AMD_IS_APU) {
1271                        /* On NUMA systems, locality is determined per-page
1272                         * in amdgpu_gmc_override_vm_pte_flags
1273                         */
1274                        if (num_possible_nodes() <= 1)
1275                                mapping_flags |= mtype_local;
1276                        else
1277                                mapping_flags |= ext_coherent ? AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC;
1278                /* system memory accessed by the dGPU */
1279                } else {
1280                        if (gc_ip_version < IP_VERSION(9, 5, 0) || ext_coherent)
1281                                mapping_flags |= AMDGPU_VM_MTYPE_UC;
1282                        else
1283                                mapping_flags |= AMDGPU_VM_MTYPE_NC;
1284                }
1285                break;
1286        case IP_VERSION(12, 0, 0):
1287        case IP_VERSION(12, 0, 1):
1288                mapping_flags |= AMDGPU_VM_MTYPE_NC;
1289                break;
1290        default:
1291                mapping_flags |= coherent ?
1292                        AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC;
1293        }
1294
1295        mapping_flags |= AMDGPU_VM_PAGE_READABLE | AMDGPU_VM_PAGE_WRITEABLE;
1296
1297        if (flags & KFD_IOCTL_SVM_FLAG_GPU_RO)
1298                mapping_flags &= ~AMDGPU_VM_PAGE_WRITEABLE;
1299        if (flags & KFD_IOCTL_SVM_FLAG_GPU_EXEC)
1300                mapping_flags |= AMDGPU_VM_PAGE_EXECUTABLE;
1301
1302        pte_flags = AMDGPU_PTE_VALID;
1303        pte_flags |= (domain == SVM_RANGE_VRAM_DOMAIN) ? 0 : AMDGPU_PTE_SYSTEM;
1304        pte_flags |= snoop ? AMDGPU_PTE_SNOOPED : 0;
1305        if (gc_ip_version >= IP_VERSION(12, 0, 0))
1306                pte_flags |= AMDGPU_PTE_IS_PTE;
1307
1308        pte_flags |= amdgpu_gem_va_map_flags(node->adev, mapping_flags);
1309        return pte_flags;
1310}
1311
1312static int
1313svm_range_unmap_from_gpu(struct amdgpu_device *adev, struct amdgpu_vm *vm,
1314                         uint64_t start, uint64_t last,
1315                         struct dma_fence **fence)
1316{
1317        uint64_t init_pte_value = 0;
1318
1319        pr_debug("[0x%llx 0x%llx]\n", start, last);
1320
1321        return amdgpu_vm_update_range(adev, vm, false, true, true, false, NULL, start,
1322                                      last, init_pte_value, 0, 0, NULL, NULL,
1323                                      fence);
1324}
1325
1326static int
1327svm_range_unmap_from_gpus(struct svm_range *prange, unsigned long start,
1328                          unsigned long last, uint32_t trigger)
1329{
1330        DECLARE_BITMAP(bitmap, MAX_GPU_INSTANCE);
1331        struct kfd_process_device *pdd;
1332        struct dma_fence *fence = NULL;
1333        struct kfd_process *p;
1334        uint32_t gpuidx;
1335        int r = 0;
1336
1337        if (!prange->mapped_to_gpu) {
1338                pr_debug("prange 0x%p [0x%lx 0x%lx] not mapped to GPU\n",
1339                         prange, prange->start, prange->last);
1340                return 0;
1341        }
1342
1343        if (prange->start == start && prange->last == last) {
1344                pr_debug("unmap svms 0x%p prange 0x%p\n", prange->svms, prange);
1345                prange->mapped_to_gpu = false;
1346        }
1347
1348        bitmap_or(bitmap, prange->bitmap_access, prange->bitmap_aip,
1349                  MAX_GPU_INSTANCE);
1350        p = container_of(prange->svms, struct kfd_process, svms);
1351
1352        for_each_set_bit(gpuidx, bitmap, MAX_GPU_INSTANCE) {
1353                pr_debug("unmap from gpu idx 0x%x\n", gpuidx);
1354                pdd = kfd_process_device_from_gpuidx(p, gpuidx);
1355                if (!pdd) {
1356                        pr_debug("failed to find device idx %d\n", gpuidx);
1357                        return -EINVAL;
1358                }
1359
1360                kfd_smi_event_unmap_from_gpu(pdd->dev, p->lead_thread->pid,
1361                                             start, last, trigger);
1362
1363                r = svm_range_unmap_from_gpu(pdd->dev->adev,
1364                                             drm_priv_to_vm(pdd->drm_priv),
1365                                             start, last, &fence);
1366                if (r)
1367                        break;
1368
1369                if (fence) {
1370                        r = dma_fence_wait(fence, false);
1371                        dma_fence_put(fence);
1372                        fence = NULL;
1373                        if (r)
1374                                break;
1375                }
1376                kfd_flush_tlb(pdd, TLB_FLUSH_HEAVYWEIGHT);
1377        }
1378
1379        return r;
1380}
1381
1382static int
1383svm_range_map_to_gpu(struct kfd_process_device *pdd, struct svm_range *prange,
1384                     unsigned long offset, unsigned long npages, bool readonly,
1385                     dma_addr_t *dma_addr, struct amdgpu_device *bo_adev,
1386                     struct dma_fence **fence, bool flush_tlb)
1387{
1388        struct amdgpu_device *adev = pdd->dev->adev;
1389        struct amdgpu_vm *vm = drm_priv_to_vm(pdd->drm_priv);
1390        uint64_t pte_flags;
1391        unsigned long last_start;
1392        int last_domain;
1393        int r = 0;
1394        int64_t i, j;
1395
1396        last_start = prange->start + offset;
1397
1398        pr_debug("svms 0x%p [0x%lx 0x%lx] readonly %d\n", prange->svms,
1399                 last_start, last_start + npages - 1, readonly);
1400
1401        for (i = offset; i < offset + npages; i++) {
1402                last_domain = dma_addr[i] & SVM_RANGE_VRAM_DOMAIN;
1403                dma_addr[i] &= ~SVM_RANGE_VRAM_DOMAIN;
1404
1405                /* Collect all pages in the same address range and memory domain
1406                 * that can be mapped with a single call to update mapping.
1407                 */
1408                if (i < offset + npages - 1 &&
1409                    last_domain == (dma_addr[i + 1] & SVM_RANGE_VRAM_DOMAIN))
1410                        continue;
1411
1412                pr_debug("Mapping range [0x%lx 0x%llx] on domain: %s\n",
1413                         last_start, prange->start + i, last_domain ? "GPU" : "CPU");
1414
1415                pte_flags = svm_range_get_pte_flags(pdd->dev, prange, last_domain);
1416                if (readonly)
1417                        pte_flags &= ~AMDGPU_PTE_WRITEABLE;
1418
1419                pr_debug("svms 0x%p map [0x%lx 0x%llx] vram %d PTE 0x%llx\n",
1420                         prange->svms, last_start, prange->start + i,
1421                         (last_domain == SVM_RANGE_VRAM_DOMAIN) ? 1 : 0,
1422                         pte_flags);
1423
1424                /* For dGPU mode, we use same vm_manager to allocate VRAM for
1425                 * different memory partition based on fpfn/lpfn, we should use
1426                 * same vm_manager.vram_base_offset regardless memory partition.
1427                 */
1428                r = amdgpu_vm_update_range(adev, vm, false, false, flush_tlb, true,
1429                                           NULL, last_start, prange->start + i,
1430                                           pte_flags,
1431                                           (last_start - prange->start) << PAGE_SHIFT,
1432                                           bo_adev ? bo_adev->vm_manager.vram_base_offset : 0,
1433                                           NULL, dma_addr, &vm->last_update);
1434
1435                for (j = last_start - prange->start; j <= i; j++)
1436                        dma_addr[j] |= last_domain;
1437
1438                if (r) {
1439                        pr_debug("failed %d to map to gpu 0x%lx\n", r, prange->start);
1440                        goto out;
1441                }
1442                last_start = prange->start + i + 1;
1443        }
1444
1445        r = amdgpu_vm_update_pdes(adev, vm, false);
1446        if (r) {
1447                pr_debug("failed %d to update directories 0x%lx\n", r,
1448                         prange->start);
1449                goto out;
1450        }
1451
1452        if (fence)
1453                *fence = dma_fence_get(vm->last_update);
1454
1455out:
1456        return r;
1457}
1458
1459static int
1460svm_range_map_to_gpus(struct svm_range *prange, unsigned long offset,
1461                      unsigned long npages, bool readonly,
1462                      unsigned long *bitmap, bool wait, bool flush_tlb)
1463{
1464        struct kfd_process_device *pdd;
1465        struct amdgpu_device *bo_adev = NULL;
1466        struct kfd_process *p;
1467        struct dma_fence *fence = NULL;
1468        uint32_t gpuidx;
1469        int r = 0;
1470
1471        if (prange->svm_bo && prange->ttm_res)
1472                bo_adev = prange->svm_bo->node->adev;
1473
1474        p = container_of(prange->svms, struct kfd_process, svms);
1475        for_each_set_bit(gpuidx, bitmap, MAX_GPU_INSTANCE) {
1476                pr_debug("mapping to gpu idx 0x%x\n", gpuidx);
1477                pdd = kfd_process_device_from_gpuidx(p, gpuidx);
1478                if (!pdd) {
1479                        pr_debug("failed to find device idx %d\n", gpuidx);
1480                        return -EINVAL;
1481                }
1482
1483                pdd = kfd_bind_process_to_device(pdd->dev, p);
1484                if (IS_ERR(pdd))
1485                        return -EINVAL;
1486
1487                if (bo_adev && pdd->dev->adev != bo_adev &&
1488                    !amdgpu_xgmi_same_hive(pdd->dev->adev, bo_adev)) {
1489                        pr_debug("cannot map to device idx %d\n", gpuidx);
1490                        continue;
1491                }
1492
1493                r = svm_range_map_to_gpu(pdd, prange, offset, npages, readonly,
1494                                         prange->dma_addr[gpuidx],
1495                                         bo_adev, wait ? &fence : NULL,
1496                                         flush_tlb);
1497                if (r)
1498                        break;
1499
1500                if (fence) {
1501                        r = dma_fence_wait(fence, false);
1502                        dma_fence_put(fence);
1503                        fence = NULL;
1504                        if (r) {
1505                                pr_debug("failed %d to dma fence wait\n", r);
1506                                break;
1507                        }
1508                }
1509
1510                kfd_flush_tlb(pdd, TLB_FLUSH_LEGACY);
1511        }
1512
1513        return r;
1514}
1515
1516struct svm_validate_context {
1517        struct kfd_process *process;
1518        struct svm_range *prange;
1519        bool intr;
1520        DECLARE_BITMAP(bitmap, MAX_GPU_INSTANCE);
1521        struct drm_exec exec;
1522};
1523
1524static int svm_range_reserve_bos(struct svm_validate_context *ctx, bool intr)
1525{
1526        struct kfd_process_device *pdd;
1527        struct amdgpu_vm *vm;
1528        uint32_t gpuidx;
1529        int r;
1530
1531        drm_exec_init(&ctx->exec, intr ? DRM_EXEC_INTERRUPTIBLE_WAIT: 0, 0);
1532        drm_exec_until_all_locked(&ctx->exec) {
1533                for_each_set_bit(gpuidx, ctx->bitmap, MAX_GPU_INSTANCE) {
1534                        pdd = kfd_process_device_from_gpuidx(ctx->process, gpuidx);
1535                        if (!pdd) {
1536                                pr_debug("failed to find device idx %d\n", gpuidx);
1537                                r = -EINVAL;
1538                                goto unreserve_out;
1539                        }
1540                        vm = drm_priv_to_vm(pdd->drm_priv);
1541
1542                        r = amdgpu_vm_lock_pd(vm, &ctx->exec, 2);
1543                        drm_exec_retry_on_contention(&ctx->exec);
1544                        if (unlikely(r)) {
1545                                pr_debug("failed %d to reserve bo\n", r);
1546                                goto unreserve_out;
1547                        }
1548                }
1549        }
1550
1551        for_each_set_bit(gpuidx, ctx->bitmap, MAX_GPU_INSTANCE) {
1552                pdd = kfd_process_device_from_gpuidx(ctx->process, gpuidx);
1553                if (!pdd) {
1554                        pr_debug("failed to find device idx %d\n", gpuidx);
1555                        r = -EINVAL;
1556                        goto unreserve_out;
1557                }
1558
1559                r = amdgpu_vm_validate(pdd->dev->adev,
1560                                       drm_priv_to_vm(pdd->drm_priv), NULL,
1561                                       svm_range_bo_validate, NULL);
1562                if (r) {
1563                        pr_debug("failed %d validate pt bos\n", r);
1564                        goto unreserve_out;
1565                }
1566        }
1567
1568        return 0;
1569
1570unreserve_out:
1571        drm_exec_fini(&ctx->exec);
1572        return r;
1573}
1574
/*
 * Counterpart of a successful svm_range_reserve_bos(): finalizes ctx->exec,
 * the drm_exec context that function initialized and locked.
 */
1575static void svm_range_unreserve_bos(struct svm_validate_context *ctx)
1576{
1577        drm_exec_fini(&ctx->exec);
1578}
1579
1580static void *kfd_svm_page_owner(struct kfd_process *p, int32_t gpuidx)
1581{
1582        struct kfd_process_device *pdd;
1583
1584        pdd = kfd_process_device_from_gpuidx(p, gpuidx);
1585        if (!pdd)
1586                return NULL;
1587
1588        return SVM_ADEV_PGMAP_OWNER(pdd->dev->adev);
1589}
1590
1591/*
1592 * Validation+GPU mapping with concurrent invalidation (MMU notifiers)
1593 *
1594 * To prevent concurrent destruction or change of range attributes, the
1595 * svm_read_lock must be held. The caller must not hold the svm_write_lock
1596 * because that would block concurrent evictions and lead to deadlocks. To
1597 * serialize concurrent migrations or validations of the same range, the
1598 * prange->migrate_mutex must be held.
1599 *
1600 * For VRAM ranges, the SVM BO must be allocated and valid (protected by its
1601 * eviction fence).
1602 *
1603 * The following sequence ensures race-free validation and GPU mapping:
1604 *
1605 * 1. Reserve page table (and SVM BO if range is in VRAM)
1606 * 2. hmm_range_fault to get page addresses (if system memory)
1607 * 3. DMA-map pages (if system memory)
1608 * 4-a. Take notifier lock
1609 * 4-b. Check that pages still valid (mmu_interval_read_retry)
1610 * 4-c. Check that the range was not split or otherwise invalidated
1611 * 4-d. Update GPU page table
1612 * 4-e. Release notifier lock
1613 * 5. Release page table (and SVM BO) reservation
1614 */
1615static int svm_range_validate_and_map(struct mm_struct *mm,
1616                                      unsigned long map_start, unsigned long map_last,
1617                                      struct svm_range *prange, int32_t gpuidx,
1618                                      bool intr, bool wait, bool flush_tlb)
1619{
1620        struct svm_validate_context *ctx;
1621        unsigned long start, end, addr;
1622        struct kfd_process *p;
1623        void *owner;
1624        int32_t idx;
1625        int r = 0;
1626
1627        ctx = kzalloc(sizeof(struct svm_validate_context), GFP_KERNEL);
1628        if (!ctx)
1629                return -ENOMEM;
1630        ctx->process = container_of(prange->svms, struct kfd_process, svms);
1631        ctx->prange = prange;
1632        ctx->intr = intr;
1633
1634        if (gpuidx < MAX_GPU_INSTANCE) {
1635                bitmap_zero(ctx->bitmap, MAX_GPU_INSTANCE);
1636                bitmap_set(ctx->bitmap, gpuidx, 1);
1637        } else if (ctx->process->xnack_enabled) {
1638                bitmap_copy(ctx->bitmap, prange->bitmap_aip, MAX_GPU_INSTANCE);
1639
1640                /* If prefetch range to GPU, or GPU retry fault migrate range to
1641                 * GPU, which has ACCESS attribute to the range, create mapping
1642                 * on that GPU.
1643                 */
1644                if (prange->actual_loc) {
1645                        gpuidx = kfd_process_gpuidx_from_gpuid(ctx->process,
1646                                                        prange->actual_loc);
1647                        if (gpuidx < 0) {
1648                                WARN_ONCE(1, "failed get device by id 0x%x\n",
1649                                         prange->actual_loc);
1650                                r = -EINVAL;
1651                                goto free_ctx;
1652                        }
1653                        if (test_bit(gpuidx, prange->bitmap_access))
1654                                bitmap_set(ctx->bitmap, gpuidx, 1);
1655                }
1656
1657                /*
1658                 * If prange is already mapped or with always mapped flag,
1659                 * update mapping on GPUs with ACCESS attribute
1660                 */
1661                if (bitmap_empty(ctx->bitmap, MAX_GPU_INSTANCE)) {
1662                        if (prange->mapped_to_gpu ||
1663                            prange->flags & KFD_IOCTL_SVM_FLAG_GPU_ALWAYS_MAPPED)
1664                                bitmap_copy(ctx->bitmap, prange->bitmap_access, MAX_GPU_INSTANCE);
1665                }
1666        } else {
1667                bitmap_or(ctx->bitmap, prange->bitmap_access,
1668                          prange->bitmap_aip, MAX_GPU_INSTANCE);
1669        }
1670
1671        if (bitmap_empty(ctx->bitmap, MAX_GPU_INSTANCE)) {
1672                r = 0;
1673                goto free_ctx;
1674        }
1675
1676        if (prange->actual_loc && !prange->ttm_res) {
1677                /* This should never happen. actual_loc gets set by
1678                 * svm_migrate_ram_to_vram after allocating a BO.
1679                 */
1680                WARN_ONCE(1, "VRAM BO missing during validation\n");
1681                r = -EINVAL;
1682                goto free_ctx;
1683        }
1684
1685        r = svm_range_reserve_bos(ctx, intr);
1686        if (r)
1687                goto free_ctx;
1688
1689        p = container_of(prange->svms, struct kfd_process, svms);
1690        owner = kfd_svm_page_owner(p, find_first_bit(ctx->bitmap,
1691                                                MAX_GPU_INSTANCE));
1692        for_each_set_bit(idx, ctx->bitmap, MAX_GPU_INSTANCE) {
1693                if (kfd_svm_page_owner(p, idx) != owner) {
1694                        owner = NULL;
1695                        break;
1696                }
1697        }
1698
1699        start = map_start << PAGE_SHIFT;
1700        end = (map_last + 1) << PAGE_SHIFT;
1701        for (addr = start; !r && addr < end; ) {
1702                struct hmm_range *hmm_range = NULL;
1703                unsigned long map_start_vma;
1704                unsigned long map_last_vma;
1705                struct vm_area_struct *vma;
1706                unsigned long next = 0;
1707                unsigned long offset;
1708                unsigned long npages;
1709                bool readonly;
1710
1711                vma = vma_lookup(mm, addr);
1712                if (vma) {
1713                        readonly = !(vma->vm_flags & VM_WRITE);
1714
1715                        next = min(vma->vm_end, end);
1716                        npages = (next - addr) >> PAGE_SHIFT;
1717                        WRITE_ONCE(p->svms.faulting_task, current);
1718                        r = amdgpu_hmm_range_get_pages(&prange->notifier, addr, npages,
1719                                                       readonly, owner, NULL,
1720                                                       &hmm_range);
1721                        WRITE_ONCE(p->svms.faulting_task, NULL);
1722                        if (r)
1723                                pr_debug("failed %d to get svm range pages\n", r);
1724                } else {
1725                        r = -EFAULT;
1726                }
1727
1728                if (!r) {
1729                        offset = (addr >> PAGE_SHIFT) - prange->start;
1730                        r = svm_range_dma_map(prange, ctx->bitmap, offset, npages,
1731                                              hmm_range->hmm_pfns);
1732                        if (r)
1733                                pr_debug("failed %d to dma map range\n", r);
1734                }
1735
1736                svm_range_lock(prange);
1737
1738                /* Free backing memory of hmm_range if it was initialized
1739                 * Overrride return value to TRY AGAIN only if prior returns
1740                 * were successful
1741                 */
1742                if (hmm_range && amdgpu_hmm_range_get_pages_done(hmm_range) && !r) {
1743                        pr_debug("hmm update the range, need validate again\n");
1744                        r = -EAGAIN;
1745                }
1746
1747                if (!r && !list_empty(&prange->child_list)) {
1748                        pr_debug("range split by unmap in parallel, validate again\n");
1749                        r = -EAGAIN;
1750                }
1751
1752                if (!r) {
1753                        map_start_vma = max(map_start, prange->start + offset);
1754                        map_last_vma = min(map_last, prange->start + offset + npages - 1);
1755                        if (map_start_vma <= map_last_vma) {
1756                                offset = map_start_vma - prange->start;
1757                                npages = map_last_vma - map_start_vma + 1;
1758                                r = svm_range_map_to_gpus(prange, offset, npages, readonly,
1759                                                          ctx->bitmap, wait, flush_tlb);
1760                        }
1761                }
1762
1763                if (!r && next == end)
1764                        prange->mapped_to_gpu = true;
1765
1766                svm_range_unlock(prange);
1767
1768                addr = next;
1769        }
1770
1771        svm_range_unreserve_bos(ctx);
1772        if (!r)
1773                prange->validate_timestamp = ktime_get_boottime();
1774
1775free_ctx:
1776        kfree(ctx);
1777
1778        return r;
1779}
1780
1781/**
1782 * svm_range_list_lock_and_flush_work - flush pending deferred work
1783 *
1784 * @svms: the svm range list
1785 * @mm: the mm structure
1786 *
1787 * Context: Returns with mmap write lock held, pending deferred work flushed
1788 *
1789 */
1790void
1791svm_range_list_lock_and_flush_work(struct svm_range_list *svms,
1792                                   struct mm_struct *mm)
1793{
1794retry_flush_work:
1795        flush_work(&svms->deferred_list_work);
1796        mmap_write_lock(mm);
1797
1798        if (list_empty(&svms->deferred_range_list))
1799                return;
1800        mmap_write_unlock(mm);
1801        pr_debug("retry flush\n");
1802        goto retry_flush_work;
1803}
1804
/*
 * Delayed-work handler for svms->restore_work.
 *
 * Walks svms->list and re-validates and re-maps every range whose ->invalid
 * counter is non-zero. If all of them succeed and neither a range's ->invalid
 * nor svms->evicted_ranges changed in the meantime, evicted_ranges is reset to
 * 0 and kgd2kfd_resume_mm() is called. Otherwise the work requeues itself
 * after AMDGPU_SVM_RANGE_RESTORE_DELAY_MS.
 */
1805static void svm_range_restore_work(struct work_struct *work)
1806{
1807        struct delayed_work *dwork = to_delayed_work(work);
1808        struct amdkfd_process_info *process_info;
1809        struct svm_range_list *svms;
1810        struct svm_range *prange;
1811        struct kfd_process *p;
1812        struct mm_struct *mm;
1813        int evicted_ranges;
1814        int invalid;
1815        int r;
1816
1817        svms = container_of(dwork, struct svm_range_list, restore_work);
            /* Nothing is evicted: nothing to restore */
1818        evicted_ranges = atomic_read(&svms->evicted_ranges);
1819        if (!evicted_ranges)
1820                return;
1821
1822        pr_debug("restore svm ranges\n");
1823
1824        p = container_of(svms, struct kfd_process, svms);
1825        process_info = p->kgd_process_info;
1826
1827        /* Keep mm reference when svm_range_validate_and_map ranges */
1828        mm = get_task_mm(p->lead_thread);
1829        if (!mm) {
1830                pr_debug("svms 0x%p process mm gone\n", svms);
1831                return;
1832        }
1833
            /*
             * Lock order: process_info->lock, then the mmap write lock (taken
             * by svm_range_list_lock_and_flush_work), then svms->lock. They
             * are released in reverse order at out_reschedule.
             */
1834        mutex_lock(&process_info->lock);
1835        svm_range_list_lock_and_flush_work(svms, mm);
1836        mutex_lock(&svms->lock);
1837
            /* Re-sample under the locks; the cmpxchg below compares against it */
1838        evicted_ranges = atomic_read(&svms->evicted_ranges);
1839
1840        list_for_each_entry(prange, &svms->list, list) {
1841                invalid = atomic_read(&prange->invalid);
1842                if (!invalid)
1843                        continue;
1844
1845                pr_debug("restoring svms 0x%p prange 0x%p [0x%lx %lx] inv %d\n",
1846                         prange->svms, prange, prange->start, prange->last,
1847                         invalid);
1848
1849                /*
1850                 * If range is migrating, wait for migration is done.
1851                 */
1852                mutex_lock(&prange->migrate_mutex);
1853
                    /*
                     * gpuidx == MAX_GPU_INSTANCE: the GPUs to map on are
                     * derived from the range's bitmaps.
                     */
1854                r = svm_range_validate_and_map(mm, prange->start, prange->last, prange,
1855                                               MAX_GPU_INSTANCE, false, true, false);
1856                if (r)
1857                        pr_debug("failed %d to map 0x%lx to gpus\n", r,
1858                                 prange->start);
1859
1860                mutex_unlock(&prange->migrate_mutex);
1861                if (r)
1862                        goto out_reschedule;
1863
                    /*
                     * ->invalid changed while validating: the counter was
                     * bumped again in the meantime, so retry later.
                     */
1864                if (atomic_cmpxchg(&prange->invalid, invalid, 0) != invalid)
1865                        goto out_reschedule;
1866        }
1867
            /* Same check for the global counter before resuming */
1868        if (atomic_cmpxchg(&svms->evicted_ranges, evicted_ranges, 0) !=
1869            evicted_ranges)
1870                goto out_reschedule;
1871
            /* Clear the local copy so the reschedule branch below is skipped */
1872        evicted_ranges = 0;
1873
1874        r = kgd2kfd_resume_mm(mm);
1875        if (r) {
1876                /* No recovery from this failure. Probably the CP is
1877                 * hanging. No point trying again.
1878                 */
1879                pr_debug("failed %d to resume KFD\n", r);
1880        }
1881
1882        pr_debug("restore svm ranges successfully\n");
1883
1884out_reschedule:
1885        mutex_unlock(&svms->lock);
1886        mmap_write_unlock(mm);
1887        mutex_unlock(&process_info->lock);
1888
1889        /* If validation failed, reschedule another attempt */
1890        if (evicted_ranges) {
1891                pr_debug("reschedule to restore svm range\n");
1892                queue_delayed_work(system_freezable_wq, &svms->restore_work,
1893                        msecs_to_jiffies(AMDGPU_SVM_RANGE_RESTORE_DELAY_MS));
1894
1895                kfd_smi_event_queue_restore_rescheduled(mm);
1896        }
1897        mmput(mm);
1898}
1899
1900/**
1901 * svm_range_evict - evict svm range
1902 * @prange: svm range structure
1903 * @mm: current process mm_struct
1904 * @start: first page of the invalidated range (compared with prange->start)
1905 * @last: last page of the invalidated range, inclusive
1906 * @event: mmu notifier event when range is evicted or migrated
1907 *
1908 * Stop all queues of the process to ensure GPU doesn't access the memory, then
1909 * return to let CPU evict the buffer and proceed CPU pagetable update.
1910 *
1911 * Don't need use lock to sync cpu pagetable invalidation with GPU execution.
1912 * If invalidation happens while restore work is running, restore work will
1913 * restart to ensure to get the latest CPU pages mapping to GPU, then start
1914 * the queues.
1915 */
1916static int
1917svm_range_evict(struct svm_range *prange, struct mm_struct *mm,
1918                unsigned long start, unsigned long last,
1919                enum mmu_notifier_event event)
1920{
1921        struct svm_range_list *svms = prange->svms;
1922        struct svm_range *pchild;
1923        struct kfd_process *p;
1924        int r = 0;
1925
1926        p = container_of(svms, struct kfd_process, svms);
1927
1928        pr_debug("invalidate svms 0x%p prange [0x%lx 0x%lx] [0x%lx 0x%lx]\n",
1929                 svms, prange->start, prange->last, start, last);
1930
1931        if (!p->xnack_enabled ||
1932            (prange->flags & KFD_IOCTL_SVM_FLAG_GPU_ALWAYS_MAPPED)) {
1933                int evicted_ranges;
1934                bool mapped = prange->mapped_to_gpu;
1935
1936                list_for_each_entry(pchild, &prange->child_list, child_list) {
1937                        if (!pchild->mapped_to_gpu)
1938                                continue;
1939                        mapped = true;
1940                        mutex_lock_nested(&pchild->lock, 1);
1941                        if (pchild->start <= last && pchild->last >= start) {
1942                                pr_debug("increment pchild invalid [0x%lx 0x%lx]\n",
1943                                         pchild->start, pchild->last);
1944                                atomic_inc(&pchild->invalid);
1945                        }
1946                        mutex_unlock(&pchild->lock);
1947                }
1948
1949                if (!mapped)
1950                        return r;
1951
1952                if (prange->start <= last && prange->last >= start)
1953                        atomic_inc(&prange->invalid);
1954
1955                evicted_ranges = atomic_inc_return(&svms->evicted_ranges);
1956                if (evicted_ranges != 1)
1957                        return r;
1958
1959                pr_debug("evicting svms 0x%p range [0x%lx 0x%lx]\n",
1960                         prange->svms, prange->start, prange->last);
1961
1962                /* First eviction, stop the queues */
1963                r = kgd2kfd_quiesce_mm(mm, KFD_QUEUE_EVICTION_TRIGGER_SVM);
1964                if (r)
1965                        pr_debug("failed to quiesce KFD\n");
1966
1967                pr_debug("schedule to restore svm %p ranges\n", svms);
1968                queue_delayed_work(system_freezable_wq, &svms->restore_work,
1969                        msecs_to_jiffies(AMDGPU_SVM_RANGE_RESTORE_DELAY_MS));
1970        } else {
1971                unsigned long s, l;
1972                uint32_t trigger;
1973
1974                if (event == MMU_NOTIFY_MIGRATE)
1975                        trigger = KFD_SVM_UNMAP_TRIGGER_MMU_NOTIFY_MIGRATE;
1976                else
1977                        trigger = KFD_SVM_UNMAP_TRIGGER_MMU_NOTIFY;
1978
1979                pr_debug("invalidate unmap svms 0x%p [0x%lx 0x%lx] from GPUs\n",
1980                         prange->svms, start, last);
1981                list_for_each_entry(pchild, &prange->child_list, child_list) {
1982                        mutex_lock_nested(&pchild->lock, 1);
1983                        s = max(start, pchild->start);
1984                        l = min(last, pchild->last);
1985                        if (l >= s)
1986                                svm_range_unmap_from_gpus(pchild, s, l, trigger);
1987                        mutex_unlock(&pchild->lock);
1988                }
1989                s = max(start, prange->start);
1990                l = min(last, prange->last);
1991                if (l >= s)
1992                        svm_range_unmap_from_gpus(prange, s, l, trigger);
1993        }
1994
1995        return r;
1996}
1997
1998static struct svm_range *svm_range_clone(struct svm_range *old)
1999{
2000        struct svm_range *new;
2001
2002        new = svm_range_new(old->svms, old->start, old->last, false);
2003        if (!new)
2004                return NULL;
2005        if (svm_range_copy_dma_addrs(new, old)) {
2006                svm_range_free(new, false);
2007                return NULL;
2008        }
2009        if (old->svm_bo) {
2010                new->ttm_res = old->ttm_res;
2011                new->offset = old->offset;
2012                new->svm_bo = svm_range_bo_ref(old->svm_bo);
2013                spin_lock(&new->svm_bo->list_lock);
2014                list_add(&new->svm_bo_list, &new->svm_bo->range_list);
2015                spin_unlock(&new->svm_bo->list_lock);
2016        }
2017        new->flags = old->flags;
2018        new->preferred_loc = old->preferred_loc;
2019        new->prefetch_loc = old->prefetch_loc;
2020        new->actual_loc = old->actual_loc;
2021        new->granularity = old->granularity;
2022        new->mapped_to_gpu = old->mapped_to_gpu;
2023        new->vram_pages = old->vram_pages;
2024        bitmap_copy(new->bitmap_access, old->bitmap_access, MAX_GPU_INSTANCE);
2025        bitmap_copy(new->bitmap_aip, old->bitmap_aip, MAX_GPU_INSTANCE);
2026        atomic_set(&new->queue_refcount, atomic_read(&old->queue_refcount));
2027
2028        return new;
2029}
2030
2031void svm_range_set_max_pages(struct amdgpu_device *adev)
2032{
2033        uint64_t max_pages;
2034        uint64_t pages, _pages;
2035        uint64_t min_pages = 0;
2036        int i, id;
2037
2038        for (i = 0; i < adev->kfd.dev->num_nodes; i++) {
2039                if (adev->kfd.dev->nodes[i]->xcp)
2040                        id = adev->kfd.dev->nodes[i]->xcp->id;
2041                else
2042                        id = -1;
2043                pages = KFD_XCP_MEMORY_SIZE(adev, id) >> 17;
2044                pages = clamp(pages, 1ULL << 9, 1ULL << 18);
2045                pages = rounddown_pow_of_two(pages);
2046                min_pages = min_not_zero(min_pages, pages);
2047        }
2048
2049        do {
2050                max_pages = READ_ONCE(max_svm_range_pages);
2051                _pages = min_not_zero(max_pages, min_pages);
2052        } while (cmpxchg(&max_svm_range_pages, max_pages, _pages) != max_pages);
2053}
2054
2055static int
2056svm_range_split_new(struct svm_range_list *svms, uint64_t start, uint64_t last,
2057                    uint64_t max_pages, struct list_head *insert_list,
2058                    struct list_head *update_list)
2059{
2060        struct svm_range *prange;
2061        uint64_t l;
2062
2063        pr_debug("max_svm_range_pages 0x%llx adding [0x%llx 0x%llx]\n",
2064                 max_pages, start, last);
2065
2066        while (last >= start) {
2067                l = min(last, ALIGN_DOWN(start + max_pages, max_pages) - 1);
2068
2069                prange = svm_range_new(svms, start, l, true);
2070                if (!prange)
2071                        return -ENOMEM;
2072                list_add(&prange->list, insert_list);
2073                list_add(&prange->update_list, update_list);
2074
2075                start = l + 1;
2076        }
2077        return 0;
2078}
2079
2080/**
2081 * svm_range_add - add svm range and handle overlap
2082 * @p: the range add to this process svms
2083 * @start: page size aligned
2084 * @size: page size aligned
2085 * @nattr: number of attributes
2086 * @attrs: array of attributes
2087 * @update_list: output, the ranges need validate and update GPU mapping
2088 * @insert_list: output, the ranges need insert to svms
2089 * @remove_list: output, the ranges are replaced and need remove from svms
2090 * @remap_list: output, remap unaligned svm ranges
2091 *
2092 * Check if the virtual address range has overlap with any existing ranges,
2093 * split partly overlapping ranges and add new ranges in the gaps. All changes
2094 * should be applied to the range_list and interval tree transactionally. If
2095 * any range split or allocation fails, the entire update fails. Therefore any
2096 * existing overlapping svm_ranges are cloned and the original svm_ranges left
2097 * unchanged.
2098 *
2099 * If the transaction succeeds, the caller can update and insert clones and
2100 * new ranges, then free the originals.
2101 *
2102 * Otherwise the caller can free the clones and new ranges, while the old
2103 * svm_ranges remain unchanged.
2104 *
2105 * Context: Process context, caller must hold svms->lock
2106 *
2107 * Return:
2108 * 0 - OK, otherwise error code
2109 */
2110static int
2111svm_range_add(struct kfd_process *p, uint64_t start, uint64_t size,
2112              uint32_t nattr, struct kfd_ioctl_svm_attribute *attrs,
2113              struct list_head *update_list, struct list_head *insert_list,
2114              struct list_head *remove_list, struct list_head *remap_list)
2115{
            /* last is inclusive */
2116        unsigned long last = start + size - 1UL;
2117        struct svm_range_list *svms = &p->svms;
2118        struct interval_tree_node *node;
2119        struct svm_range *prange;
2120        struct svm_range *tmp;
            /* brand-new ranges, kept apart from clones until the end */
2121        struct list_head new_list;
2122        int r = 0;
2123
2124        pr_debug("svms 0x%p [0x%llx 0x%lx]\n", &p->svms, start, last);
2125
2126        INIT_LIST_HEAD(update_list);
2127        INIT_LIST_HEAD(insert_list);
2128        INIT_LIST_HEAD(remove_list);
2129        INIT_LIST_HEAD(&new_list);
2130        INIT_LIST_HEAD(remap_list);
2131
            /*
             * Walk every existing range that overlaps [start, last]. "start"
             * doubles as a cursor: it is advanced past each handled node.
             */
2132        node = interval_tree_iter_first(&svms->objects, start, last);
2133        while (node) {
2134                struct interval_tree_node *next;
2135                unsigned long next_start;
2136
2137                pr_debug("found overlap node [0x%lx 0x%lx]\n", node->start,
2138                         node->last);
2139
2140                prange = container_of(node, struct svm_range, it_node);
                    /* Fetch the successor and next cursor before touching prange */
2141                next = interval_tree_iter_next(node, start, last);
2142                next_start = min(node->last, last) + 1;
2143
2144                if (svm_range_is_same_attrs(p, prange, nattr, attrs) &&
2145                    prange->mapped_to_gpu) {
2146                        /* nothing to do */
2147                } else if (node->start < start || node->last > last) {
2148                        /* node intersects the update range and its attributes
2149                         * will change. Clone and split it, apply updates only
2150                         * to the overlapping part
2151                         */
2152                        struct svm_range *old = prange;
2153
2154                        prange = svm_range_clone(old);
2155                        if (!prange) {
2156                                r = -ENOMEM;
2157                                goto out;
2158                        }
2159
                            /* The original is queued for removal; the clone replaces it */
2160                        list_add(&old->update_list, remove_list);
2161                        list_add(&prange->list, insert_list);
2162                        list_add(&prange->update_list, update_list);
2163
2164                        if (node->start < start) {
2165                                pr_debug("change old range start\n");
2166                                r = svm_range_split_head(prange, start,
2167                                                         insert_list, remap_list);
2168                                if (r)
2169                                        goto out;
2170                        }
2171                        if (node->last > last) {
2172                                pr_debug("change old range last\n");
2173                                r = svm_range_split_tail(prange, last,
2174                                                         insert_list, remap_list);
2175                                if (r)
2176                                        goto out;
2177                        }
2178                } else {
2179                        /* The node is contained within start..last,
2180                         * just update it
2181                         */
2182                        list_add(&prange->update_list, update_list);
2183                }
2184
2185                /* insert a new node if needed */
                    /* i.e. fill the gap between the cursor and this node */
2186                if (node->start > start) {
2187                        r = svm_range_split_new(svms, start, node->start - 1,
2188                                                READ_ONCE(max_svm_range_pages),
2189                                                &new_list, update_list);
2190                        if (r)
2191                                goto out;
2192                }
2193
2194                node = next;
2195                start = next_start;
2196        }
2197
2198        /* add a final range at the end if needed */
2199        if (start <= last)
2200                r = svm_range_split_new(svms, start, last,
2201                                        READ_ONCE(max_svm_range_pages),
2202                                        &new_list, update_list);
2203
2204out:
2205        if (r) {
                    /*
                     * Failure: free the clones on insert_list and the new
                     * ranges on new_list. Note the different second argument
                     * passed to svm_range_free() for the two lists.
                     */
2206                list_for_each_entry_safe(prange, tmp, insert_list, list)
2207                        svm_range_free(prange, false);
2208                list_for_each_entry_safe(prange, tmp, &new_list, list)
2209                        svm_range_free(prange, true);
2210        } else {
                    /* Success: hand the new ranges to the caller via insert_list */
2211                list_splice(&new_list, insert_list);
2212        }
2213
2214        return r;
2215}
2216
2217static void
2218svm_range_update_notifier_and_interval_tree(struct mm_struct *mm,
2219                                            struct svm_range *prange)
2220{
2221        unsigned long start;
2222        unsigned long last;
2223
2224        start = prange->notifier.interval_tree.start >> PAGE_SHIFT;
2225        last = prange->notifier.interval_tree.last >> PAGE_SHIFT;
2226
2227        if (prange->start == start && prange->last == last)
2228                return;
2229
2230        pr_debug("up notifier 0x%p prange 0x%p [0x%lx 0x%lx] [0x%lx 0x%lx]\n",
2231                  prange->svms, prange, start, last, prange->start,
2232                  prange->last);
2233
2234        if (start != 0 && last != 0) {
2235                interval_tree_remove(&prange->it_node, &prange->svms->objects);
2236                svm_range_remove_notifier(prange);
2237        }
2238        prange->it_node.start = prange->start;
2239        prange->it_node.last = prange->last;
2240
2241        interval_tree_insert(&prange->it_node, &prange->svms->objects);
2242        svm_range_add_notifier_locked(mm, prange);
2243}
2244
/*
 * Apply the deferred operation recorded in prange->work_item.op to @prange.
 * An unknown op only triggers a one-time warning.
 */
2245static void
2246svm_range_handle_list_op(struct svm_range_list *svms, struct svm_range *prange,
2247                         struct mm_struct *mm)
2248{
2249        switch (prange->work_item.op) {
2250        case SVM_OP_NULL:
                    /* Log only */
2251                pr_debug("NULL OP 0x%p prange 0x%p [0x%lx 0x%lx]\n",
2252                         svms, prange, prange->start, prange->last);
2253                break;
2254        case SVM_OP_UNMAP_RANGE:
                    /* Unlink the range, drop its notifier and free it */
2255                pr_debug("remove 0x%p prange 0x%p [0x%lx 0x%lx]\n",
2256                         svms, prange, prange->start, prange->last);
2257                svm_range_unlink(prange);
2258                svm_range_remove_notifier(prange);
2259                svm_range_free(prange, true);
2260                break;
2261        case SVM_OP_UPDATE_RANGE_NOTIFIER:
2262                pr_debug("update notifier 0x%p prange 0x%p [0x%lx 0x%lx]\n",
2263                         svms, prange, prange->start, prange->last);
2264                svm_range_update_notifier_and_interval_tree(mm, prange);
2265                break;
2266        case SVM_OP_UPDATE_RANGE_NOTIFIER_AND_MAP:
                    /* Currently identical to the case above; mapping is a TODO */
2267                pr_debug("update and map 0x%p prange 0x%p [0x%lx 0x%lx]\n",
2268                         svms, prange, prange->start, prange->last);
2269                svm_range_update_notifier_and_interval_tree(mm, prange);
2270                /* TODO: implement deferred validation and mapping */
2271                break;
2272        case SVM_OP_ADD_RANGE:
                    /* Add the range to svms and register its notifier */
2273                pr_debug("add 0x%p prange 0x%p [0x%lx 0x%lx]\n", svms, prange,
2274                         prange->start, prange->last);
2275                svm_range_add_to_svms(prange);
2276                svm_range_add_notifier_locked(mm, prange);
2277                break;
2278        case SVM_OP_ADD_RANGE_AND_MAP:
                    /* Currently identical to SVM_OP_ADD_RANGE; mapping is a TODO */
2279                pr_debug("add and map 0x%p prange 0x%p [0x%lx 0x%lx]\n", svms,
2280                         prange, prange->start, prange->last);
2281                svm_range_add_to_svms(prange);
2282                svm_range_add_notifier_locked(mm, prange);
2283                /* TODO: implement deferred validation and mapping */
2284                break;
2285        default:
2286                WARN_ONCE(1, "Unknown prange 0x%p work op %d\n", prange,
2287                         prange->work_item.op);
2288        }
2289}
2290
2291static void svm_range_drain_retry_fault(struct svm_range_list *svms)
2292{
2293        struct kfd_process_device *pdd;
2294        struct kfd_process *p;
2295        uint32_t i;
2296
2297        p = container_of(svms, struct kfd_process, svms);
2298
2299        for_each_set_bit(i, svms->bitmap_supported, p->n_pdds) {
2300                pdd = p->pdds[i];
2301                if (!pdd)
2302                        continue;
2303
2304                pr_debug("drain retry fault gpu %d svms %p\n", i, svms);
2305
2306                amdgpu_ih_wait_on_checkpoint_process_ts(pdd->dev->adev,
2307                                pdd->dev->adev->irq.retry_cam_enabled ?
2308                                &pdd->dev->adev->irq.ih :
2309                                &pdd->dev->adev->irq.ih1);
2310
2311                if (pdd->dev->adev->irq.retry_cam_enabled)
2312                        amdgpu_ih_wait_on_checkpoint_process_ts(pdd->dev->adev,
2313                                &pdd->dev->adev->irq.ih_soft);
2314
2315
2316                pr_debug("drain retry fault gpu %d svms 0x%p done\n", i, svms);
2317        }
2318}
2319
2320static void svm_range_deferred_list_work(struct work_struct *work)
2321{
2322        struct svm_range_list *svms;
2323        struct svm_range *prange;
2324        struct mm_struct *mm;
2325
2326        svms = container_of(work, struct svm_range_list, deferred_list_work);
2327        pr_debug("enter svms 0x%p\n", svms);
2328
2329        spin_lock(&svms->deferred_list_lock);
2330        while (!list_empty(&svms->deferred_range_list)) {
2331                prange = list_first_entry(&svms->deferred_range_list,
2332                                          struct svm_range, deferred_list);
2333                spin_unlock(&svms->deferred_list_lock);
2334
2335                pr_debug("prange 0x%p [0x%lx 0x%lx] op %d\n", prange,
2336                         prange->start, prange->last, prange->work_item.op);
2337
2338                mm = prange->work_item.mm;
2339
2340                mmap_write_lock(mm);
2341
2342                /* Remove from deferred_list must be inside mmap write lock, for
2343                 * two race cases:
2344                 * 1. unmap_from_cpu may change work_item.op and add the range
2345                 *    to deferred_list again, cause use after free bug.
2346                 * 2. svm_range_list_lock_and_flush_work may hold mmap write
2347                 *    lock and continue because deferred_list is empty, but
2348                 *    deferred_list work is actually waiting for mmap lock.
2349                 */
2350                spin_lock(&svms->deferred_list_lock);
2351                list_del_init(&prange->deferred_list);
2352                spin_unlock(&svms->deferred_list_lock);
2353
2354                mutex_lock(&svms->lock);
2355                mutex_lock(&prange->migrate_mutex);
2356                while (!list_empty(&prange->child_list)) {
2357                        struct svm_range *pchild;
2358
2359                        pchild = list_first_entry(&prange->child_list,
2360                                                struct svm_range, child_list);
2361                        pr_debug("child prange 0x%p op %d\n", pchild,
2362                                 pchild->work_item.op);
2363                        list_del_init(&pchild->child_list);
2364                        svm_range_handle_list_op(svms, pchild, mm);
2365                }
2366                mutex_unlock(&prange->migrate_mutex);
2367
2368                svm_range_handle_list_op(svms, prange, mm);
2369                mutex_unlock(&svms->lock);
2370                mmap_write_unlock(mm);
2371
2372                /* Pairs with mmget in svm_range_add_list_work. If dropping the
2373                 * last mm refcount, schedule release work to avoid circular locking
2374                 */
2375                mmput_async(mm);
2376
2377                spin_lock(&svms->deferred_list_lock);
2378        }
2379        spin_unlock(&svms->deferred_list_lock);
2380        pr_debug("exit svms 0x%p\n", svms);
2381}
2382
2383void
2384svm_range_add_list_work(struct svm_range_list *svms, struct svm_range *prange,
2385                        struct mm_struct *mm, enum svm_work_list_ops op)
2386{
2387        spin_lock(&svms->deferred_list_lock);
2388        /* if prange is on the deferred list */
2389        if (!list_empty(&prange->deferred_list)) {
2390                pr_debug("update exist prange 0x%p work op %d\n", prange, op);
2391                WARN_ONCE(prange->work_item.mm != mm, "unmatch mm\n");
2392                if (op != SVM_OP_NULL &&
2393                    prange->work_item.op != SVM_OP_UNMAP_RANGE)
2394                        prange->work_item.op = op;
2395        } else {
2396                /* Pairs with mmput in deferred_list_work.
2397                 * If process is exiting and mm is gone, don't update mmu notifier.
2398                 */
2399                if (mmget_not_zero(mm)) {
2400                        prange->work_item.mm = mm;
2401                        prange->work_item.op = op;
2402                        list_add_tail(&prange->deferred_list,
2403                                      &prange->svms->deferred_range_list);
2404                        pr_debug("add prange 0x%p [0x%lx 0x%lx] to work list op %d\n",
2405                                 prange, prange->start, prange->last, op);
2406                }
2407        }
2408        spin_unlock(&svms->deferred_list_lock);
2409}
2410
2411void schedule_deferred_list_work(struct svm_range_list *svms)
2412{
2413        spin_lock(&svms->deferred_list_lock);
2414        if (!list_empty(&svms->deferred_range_list))
2415                schedule_work(&svms->deferred_list_work);
2416        spin_unlock(&svms->deferred_list_lock);
2417}
2418
2419static void
2420svm_range_unmap_split(struct svm_range *parent, struct svm_range *prange, unsigned long start,
2421                      unsigned long last)
2422{
2423        struct svm_range *head;
2424        struct svm_range *tail;
2425
2426        if (prange->work_item.op == SVM_OP_UNMAP_RANGE) {
2427                pr_debug("prange 0x%p [0x%lx 0x%lx] is already freed\n", prange,
2428                         prange->start, prange->last);
2429                return;
2430        }
2431        if (start > prange->last || last < prange->start)
2432                return;
2433
2434        head = tail = prange;
2435        if (start > prange->start)
2436                svm_range_split(prange, prange->start, start - 1, &tail);
2437        if (last < tail->last)
2438                svm_range_split(tail, last + 1, tail->last, &head);
2439
2440        if (head != prange && tail != prange) {
2441                svm_range_add_child(parent, head, SVM_OP_UNMAP_RANGE);
2442                svm_range_add_child(parent, tail, SVM_OP_ADD_RANGE);
2443        } else if (tail != prange) {
2444                svm_range_add_child(parent, tail, SVM_OP_UNMAP_RANGE);
2445        } else if (head != prange) {
2446                svm_range_add_child(parent, head, SVM_OP_UNMAP_RANGE);
2447        } else if (parent != prange) {
2448                prange->work_item.op = SVM_OP_UNMAP_RANGE;
2449        }
2450}
2451
2452static void
2453svm_range_unmap_from_cpu(struct mm_struct *mm, struct svm_range *prange,
2454                         unsigned long start, unsigned long last)
2455{
2456        uint32_t trigger = KFD_SVM_UNMAP_TRIGGER_UNMAP_FROM_CPU;
2457        struct svm_range_list *svms;
2458        struct svm_range *pchild;
2459        struct kfd_process *p;
2460        unsigned long s, l;
2461        bool unmap_parent;
2462        uint32_t i;
2463
2464        if (atomic_read(&prange->queue_refcount)) {
2465                int r;
2466
2467                pr_warn("Freeing queue vital buffer 0x%lx, queue evicted\n",
2468                        prange->start << PAGE_SHIFT);
2469                r = kgd2kfd_quiesce_mm(mm, KFD_QUEUE_EVICTION_TRIGGER_SVM);
2470                if (r)
2471                        pr_debug("failed %d to quiesce KFD queues\n", r);
2472        }
2473
2474        p = kfd_lookup_process_by_mm(mm);
2475        if (!p)
2476                return;
2477        svms = &p->svms;
2478
2479        pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx] [0x%lx 0x%lx]\n", svms,
2480                 prange, prange->start, prange->last, start, last);
2481
2482        /* calculate time stamps that are used to decide which page faults need be
2483         * dropped or handled before unmap pages from gpu vm
2484         */
2485        for_each_set_bit(i, svms->bitmap_supported, p->n_pdds) {
2486                struct kfd_process_device *pdd;
2487                struct amdgpu_device *adev;
2488                struct amdgpu_ih_ring *ih;
2489                uint32_t checkpoint_wptr;
2490
2491                pdd = p->pdds[i];
2492                if (!pdd)
2493                        continue;
2494
2495                adev = pdd->dev->adev;
2496
2497                /* Check and drain ih1 ring if cam not available */
2498                if (adev->irq.ih1.ring_size) {
2499                        ih = &adev->irq.ih1;
2500                        checkpoint_wptr = amdgpu_ih_get_wptr(adev, ih);
2501                        if (ih->rptr != checkpoint_wptr) {
2502                                svms->checkpoint_ts[i] =
2503                                        amdgpu_ih_decode_iv_ts(adev, ih, checkpoint_wptr, -1);
2504                                continue;
2505                        }
2506                }
2507
2508                /* check if dev->irq.ih_soft is not empty */
2509                ih = &adev->irq.ih_soft;
2510                checkpoint_wptr = amdgpu_ih_get_wptr(adev, ih);
2511                if (ih->rptr != checkpoint_wptr)
2512                        svms->checkpoint_ts[i] = amdgpu_ih_decode_iv_ts(adev, ih, checkpoint_wptr, -1);
2513        }
2514
2515        unmap_parent = start <= prange->start && last >= prange->last;
2516
2517        list_for_each_entry(pchild, &prange->child_list, child_list) {
2518                mutex_lock_nested(&pchild->lock, 1);
2519                s = max(start, pchild->start);
2520                l = min(last, pchild->last);
2521                if (l >= s)
2522                        svm_range_unmap_from_gpus(pchild, s, l, trigger);
2523                svm_range_unmap_split(prange, pchild, start, last);
2524                mutex_unlock(&pchild->lock);
2525        }
2526        s = max(start, prange->start);
2527        l = min(last, prange->last);
2528        if (l >= s)
2529                svm_range_unmap_from_gpus(prange, s, l, trigger);
2530        svm_range_unmap_split(prange, prange, start, last);
2531
2532        if (unmap_parent)
2533                svm_range_add_list_work(svms, prange, mm, SVM_OP_UNMAP_RANGE);
2534        else
2535                svm_range_add_list_work(svms, prange, mm,
2536                                        SVM_OP_UPDATE_RANGE_NOTIFIER);
2537        schedule_deferred_list_work(svms);
2538
2539        kfd_unref_process(p);
2540}
2541
2542/**
2543 * svm_range_cpu_invalidate_pagetables - interval notifier callback
2544 * @mni: mmu_interval_notifier struct
2545 * @range: mmu_notifier_range struct
2546 * @cur_seq: value to pass to mmu_interval_set_seq()
2547 *
2548 * If event is MMU_NOTIFY_UNMAP, this is from CPU unmap range, otherwise, it
2549 * is from migration, or CPU page invalidation callback.
2550 *
2551 * For unmap event, unmap range from GPUs, remove prange from svms in a delayed
2552 * work thread, and split prange if only part of prange is unmapped.
2553 *
2554 * For invalidation event, if GPU retry fault is not enabled, evict the queues,
2555 * then schedule svm_range_restore_work to update GPU mapping and resume queues.
2556 * If GPU retry fault is enabled, unmap the svm range from GPU, retry fault will
2557 * update GPU mapping to recover.
2558 *
2559 * Context: mmap lock, notifier_invalidate_start lock are held
2560 *          for invalidate event, prange lock is held if this is from migration
2561 */
2562static bool
2563svm_range_cpu_invalidate_pagetables(struct mmu_interval_notifier *mni,
2564                                    const struct mmu_notifier_range *range,
2565                                    unsigned long cur_seq)
2566{
2567        struct svm_range *prange;
2568        unsigned long start;
2569        unsigned long last;
2570
2571        if (range->event == MMU_NOTIFY_RELEASE)
2572                return true;
2573
2574        start = mni->interval_tree.start;
2575        last = mni->interval_tree.last;
2576        start = max(start, range->start) >> PAGE_SHIFT;
2577        last = min(last, range->end - 1) >> PAGE_SHIFT;
2578        pr_debug("[0x%lx 0x%lx] range[0x%lx 0x%lx] notifier[0x%lx 0x%lx] %d\n",
2579                 start, last, range->start >> PAGE_SHIFT,
2580                 (range->end - 1) >> PAGE_SHIFT,
2581                 mni->interval_tree.start >> PAGE_SHIFT,
2582                 mni->interval_tree.last >> PAGE_SHIFT, range->event);
2583
2584        prange = container_of(mni, struct svm_range, notifier);
2585
2586        svm_range_lock(prange);
2587        mmu_interval_set_seq(mni, cur_seq);
2588
2589        switch (range->event) {
2590        case MMU_NOTIFY_UNMAP:
2591                svm_range_unmap_from_cpu(mni->mm, prange, start, last);
2592                break;
2593        default:
2594                svm_range_evict(prange, mni->mm, start, last, range->event);
2595                break;
2596        }
2597
2598        svm_range_unlock(prange);
2599
2600        return true;
2601}
2602
2603/**
2604 * svm_range_from_addr - find svm range from fault address
2605 * @svms: svm range list header
2606 * @addr: address to search range interval tree, in pages
2607 * @parent: parent range if range is on child list
2608 *
2609 * Context: The caller must hold svms->lock
2610 *
2611 * Return: the svm_range found or NULL
2612 */
2613struct svm_range *
2614svm_range_from_addr(struct svm_range_list *svms, unsigned long addr,
2615                    struct svm_range **parent)
2616{
2617        struct interval_tree_node *node;
2618        struct svm_range *prange;
2619        struct svm_range *pchild;
2620
2621        node = interval_tree_iter_first(&svms->objects, addr, addr);
2622        if (!node)
2623                return NULL;
2624
2625        prange = container_of(node, struct svm_range, it_node);
2626        pr_debug("address 0x%lx prange [0x%lx 0x%lx] node [0x%lx 0x%lx]\n",
2627                 addr, prange->start, prange->last, node->start, node->last);
2628
2629        if (addr >= prange->start && addr <= prange->last) {
2630                if (parent)
2631                        *parent = prange;
2632                return prange;
2633        }
2634        list_for_each_entry(pchild, &prange->child_list, child_list)
2635                if (addr >= pchild->start && addr <= pchild->last) {
2636                        pr_debug("found address 0x%lx pchild [0x%lx 0x%lx]\n",
2637                                 addr, pchild->start, pchild->last);
2638                        if (parent)
2639                                *parent = prange;
2640                        return pchild;
2641                }
2642
2643        return NULL;
2644}
2645
2646/* svm_range_best_restore_location - decide the best fault restore location
2647 * @prange: svm range structure
2648 * @adev: the GPU on which vm fault happened
2649 *
2650 * This is only called when xnack is on, to decide the best location to restore
2651 * the range mapping after GPU vm fault. Caller uses the best location to do
2652 * migration if actual loc is not best location, then update GPU page table
2653 * mapping to the best location.
2654 *
2655 * If the preferred loc is accessible by faulting GPU, use preferred loc.
2656 * If vm fault gpu idx is on range ACCESSIBLE bitmap, best_loc is vm fault gpu
2657 * If vm fault gpu idx is on range ACCESSIBLE_IN_PLACE bitmap, then
2658 *    if range actual loc is cpu, best_loc is cpu
2659 *    if vm fault gpu is on xgmi same hive of range actual loc gpu, best_loc is
2660 *    range actual loc.
2661 * Otherwise, GPU no access, best_loc is -1.
2662 *
2663 * Return:
2664 * -1 means vm fault GPU no access
2665 * 0 for CPU or GPU id
2666 */
2667static int32_t
2668svm_range_best_restore_location(struct svm_range *prange,
2669                                struct kfd_node *node,
2670                                int32_t *gpuidx)
2671{
2672        struct kfd_node *bo_node, *preferred_node;
2673        struct kfd_process *p;
2674        uint32_t gpuid;
2675        int r;
2676
2677        p = container_of(prange->svms, struct kfd_process, svms);
2678
2679        r = kfd_process_gpuid_from_node(p, node, &gpuid, gpuidx);
2680        if (r < 0) {
2681                pr_debug("failed to get gpuid from kgd\n");
2682                return -1;
2683        }
2684
2685        if (node->adev->apu_prefer_gtt)
2686                return 0;
2687
2688        if (prange->preferred_loc == gpuid ||
2689            prange->preferred_loc == KFD_IOCTL_SVM_LOCATION_SYSMEM) {
2690                return prange->preferred_loc;
2691        } else if (prange->preferred_loc != KFD_IOCTL_SVM_LOCATION_UNDEFINED) {
2692                preferred_node = svm_range_get_node_by_id(prange, prange->preferred_loc);
2693                if (preferred_node && svm_nodes_in_same_hive(node, preferred_node))
2694                        return prange->preferred_loc;
2695                /* fall through */
2696        }
2697
2698        if (test_bit(*gpuidx, prange->bitmap_access))
2699                return gpuid;
2700
2701        if (test_bit(*gpuidx, prange->bitmap_aip)) {
2702                if (!prange->actual_loc)
2703                        return 0;
2704
2705                bo_node = svm_range_get_node_by_id(prange, prange->actual_loc);
2706                if (bo_node && svm_nodes_in_same_hive(node, bo_node))
2707                        return prange->actual_loc;
2708                else
2709                        return 0;
2710        }
2711
2712        return -1;
2713}
2714
2715static int
2716svm_range_get_range_boundaries(struct kfd_process *p, int64_t addr,
2717                               unsigned long *start, unsigned long *last,
2718                               bool *is_heap_stack)
2719{
2720        struct vm_area_struct *vma;
2721        struct interval_tree_node *node;
2722        struct rb_node *rb_node;
2723        unsigned long start_limit, end_limit;
2724
2725        vma = vma_lookup(p->mm, addr << PAGE_SHIFT);
2726        if (!vma) {
2727                pr_debug("VMA does not exist in address [0x%llx]\n", addr);
2728                return -EFAULT;
2729        }
2730
2731        *is_heap_stack = vma_is_initial_heap(vma) || vma_is_initial_stack(vma);
2732
2733        start_limit = max(vma->vm_start >> PAGE_SHIFT,
2734                      (unsigned long)ALIGN_DOWN(addr, 1UL << p->svms.default_granularity));
2735        end_limit = min(vma->vm_end >> PAGE_SHIFT,
2736                    (unsigned long)ALIGN(addr + 1, 1UL << p->svms.default_granularity));
2737
2738        /* First range that starts after the fault address */
2739        node = interval_tree_iter_first(&p->svms.objects, addr + 1, ULONG_MAX);
2740        if (node) {
2741                end_limit = min(end_limit, node->start);
2742                /* Last range that ends before the fault address */
2743                rb_node = rb_prev(&node->rb);
2744        } else {
2745                /* Last range must end before addr because
2746                 * there was no range after addr
2747                 */
2748                rb_node = rb_last(&p->svms.objects.rb_root);
2749        }
2750        if (rb_node) {
2751                node = container_of(rb_node, struct interval_tree_node, rb);
2752                if (node->last >= addr) {
2753                        WARN(1, "Overlap with prev node and page fault addr\n");
2754                        return -EFAULT;
2755                }
2756                start_limit = max(start_limit, node->last + 1);
2757        }
2758
2759        *start = start_limit;
2760        *last = end_limit - 1;
2761
2762        pr_debug("vma [0x%lx 0x%lx] range [0x%lx 0x%lx] is_heap_stack %d\n",
2763                 vma->vm_start >> PAGE_SHIFT, vma->vm_end >> PAGE_SHIFT,
2764                 *start, *last, *is_heap_stack);
2765
2766        return 0;
2767}
2768
2769static int
2770svm_range_check_vm_userptr(struct kfd_process *p, uint64_t start, uint64_t last,
2771                           uint64_t *bo_s, uint64_t *bo_l)
2772{
2773        struct amdgpu_bo_va_mapping *mapping;
2774        struct interval_tree_node *node;
2775        struct amdgpu_bo *bo = NULL;
2776        unsigned long userptr;
2777        uint32_t i;
2778        int r;
2779
2780        for (i = 0; i < p->n_pdds; i++) {
2781                struct amdgpu_vm *vm;
2782
2783                if (!p->pdds[i]->drm_priv)
2784                        continue;
2785
2786                vm = drm_priv_to_vm(p->pdds[i]->drm_priv);
2787                r = amdgpu_bo_reserve(vm->root.bo, false);
2788                if (r)
2789                        return r;
2790
2791                /* Check userptr by searching entire vm->va interval tree */
2792                node = interval_tree_iter_first(&vm->va, 0, ~0ULL);
2793                while (node) {
2794                        mapping = container_of((struct rb_node *)node,
2795                                               struct amdgpu_bo_va_mapping, rb);
2796                        bo = mapping->bo_va->base.bo;
2797
2798                        if (!amdgpu_ttm_tt_affect_userptr(bo->tbo.ttm,
2799                                                         start << PAGE_SHIFT,
2800                                                         last << PAGE_SHIFT,
2801                                                         &userptr)) {
2802                                node = interval_tree_iter_next(node, 0, ~0ULL);
2803                                continue;
2804                        }
2805
2806                        pr_debug("[0x%llx 0x%llx] already userptr mapped\n",
2807                                 start, last);
2808                        if (bo_s && bo_l) {
2809                                *bo_s = userptr >> PAGE_SHIFT;
2810                                *bo_l = *bo_s + bo->tbo.ttm->num_pages - 1;
2811                        }
2812                        amdgpu_bo_unreserve(vm->root.bo);
2813                        return -EADDRINUSE;
2814                }
2815                amdgpu_bo_unreserve(vm->root.bo);
2816        }
2817        return 0;
2818}
2819
2820static struct
2821svm_range *svm_range_create_unregistered_range(struct kfd_node *node,
2822                                                struct kfd_process *p,
2823                                                struct mm_struct *mm,
2824                                                int64_t addr)
2825{
2826        struct svm_range *prange = NULL;
2827        unsigned long start, last;
2828        uint32_t gpuid, gpuidx;
2829        bool is_heap_stack;
2830        uint64_t bo_s = 0;
2831        uint64_t bo_l = 0;
2832        int r;
2833
2834        if (svm_range_get_range_boundaries(p, addr, &start, &last,
2835                                           &is_heap_stack))
2836                return NULL;
2837
2838        r = svm_range_check_vm(p, start, last, &bo_s, &bo_l);
2839        if (r != -EADDRINUSE)
2840                r = svm_range_check_vm_userptr(p, start, last, &bo_s, &bo_l);
2841
2842        if (r == -EADDRINUSE) {
2843                if (addr >= bo_s && addr <= bo_l)
2844                        return NULL;
2845
2846                /* Create one page svm range if 2MB range overlapping */
2847                start = addr;
2848                last = addr;
2849        }
2850
2851        prange = svm_range_new(&p->svms, start, last, true);
2852        if (!prange) {
2853                pr_debug("Failed to create prange in address [0x%llx]\n", addr);
2854                return NULL;
2855        }
2856        if (kfd_process_gpuid_from_node(p, node, &gpuid, &gpuidx)) {
2857                pr_debug("failed to get gpuid from kgd\n");
2858                svm_range_free(prange, true);
2859                return NULL;
2860        }
2861
2862        if (is_heap_stack)
2863                prange->preferred_loc = KFD_IOCTL_SVM_LOCATION_SYSMEM;
2864
2865        svm_range_add_to_svms(prange);
2866        svm_range_add_notifier_locked(mm, prange);
2867
2868        return prange;
2869}
2870
2871/* svm_range_skip_recover - decide if prange can be recovered
2872 * @prange: svm range structure
2873 *
2874 * GPU vm retry fault handle skip recover the range for cases:
2875 * 1. prange is on deferred list to be removed after unmap, it is stale fault,
2876 *    deferred list work will drain the stale fault before free the prange.
2877 * 2. prange is on deferred list to add interval notifier after split, or
2878 * 3. prange is child range, it is split from parent prange, recover later
2879 *    after interval notifier is added.
2880 *
2881 * Return: true to skip recover, false to recover
2882 */
2883static bool svm_range_skip_recover(struct svm_range *prange)
2884{
2885        struct svm_range_list *svms = prange->svms;
2886
2887        spin_lock(&svms->deferred_list_lock);
2888        if (list_empty(&prange->deferred_list) &&
2889            list_empty(&prange->child_list)) {
2890                spin_unlock(&svms->deferred_list_lock);
2891                return false;
2892        }
2893        spin_unlock(&svms->deferred_list_lock);
2894
2895        if (prange->work_item.op == SVM_OP_UNMAP_RANGE) {
2896                pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx] unmapped\n",
2897                         svms, prange, prange->start, prange->last);
2898                return true;
2899        }
2900        if (prange->work_item.op == SVM_OP_ADD_RANGE_AND_MAP ||
2901            prange->work_item.op == SVM_OP_ADD_RANGE) {
2902                pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx] not added yet\n",
2903                         svms, prange, prange->start, prange->last);
2904                return true;
2905        }
2906        return false;
2907}
2908
2909static void
2910svm_range_count_fault(struct kfd_node *node, struct kfd_process *p,
2911                      int32_t gpuidx)
2912{
2913        struct kfd_process_device *pdd;
2914
2915        /* fault is on different page of same range
2916         * or fault is skipped to recover later
2917         * or fault is on invalid virtual address
2918         */
2919        if (gpuidx == MAX_GPU_INSTANCE) {
2920                uint32_t gpuid;
2921                int r;
2922
2923                r = kfd_process_gpuid_from_node(p, node, &gpuid, &gpuidx);
2924                if (r < 0)
2925                        return;
2926        }
2927
2928        /* fault is recovered
2929         * or fault cannot recover because GPU no access on the range
2930         */
2931        pdd = kfd_process_device_from_gpuidx(p, gpuidx);
2932        if (pdd)
2933                WRITE_ONCE(pdd->faults, pdd->faults + 1);
2934}
2935
2936static bool
2937svm_fault_allowed(struct vm_area_struct *vma, bool write_fault)
2938{
2939        unsigned long requested = VM_READ;
2940
2941        if (write_fault)
2942                requested |= VM_WRITE;
2943
2944        pr_debug("requested 0x%lx, vma permission flags 0x%lx\n", requested,
2945                vma->vm_flags);
2946        return (vma->vm_flags & requested) == requested;
2947}
2948
2949int
2950svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
2951                        uint32_t vmid, uint32_t node_id,
2952                        uint64_t addr, uint64_t ts, bool write_fault)
2953{
2954        unsigned long start, last, size;
2955        struct mm_struct *mm = NULL;
2956        struct svm_range_list *svms;
2957        struct svm_range *prange;
2958        struct kfd_process *p;
2959        ktime_t timestamp = ktime_get_boottime();
2960        struct kfd_node *node;
2961        int32_t best_loc;
2962        int32_t gpuid, gpuidx = MAX_GPU_INSTANCE;
2963        bool write_locked = false;
2964        struct vm_area_struct *vma;
2965        bool migration = false;
2966        int r = 0;
2967
2968        if (!KFD_IS_SVM_API_SUPPORTED(adev)) {
2969                pr_debug("device does not support SVM\n");
2970                return -EFAULT;
2971        }
2972
2973        p = kfd_lookup_process_by_pasid(pasid, NULL);
2974        if (!p) {
2975                pr_debug("kfd process not founded pasid 0x%x\n", pasid);
2976                return 0;
2977        }
2978        svms = &p->svms;
2979
2980        pr_debug("restoring svms 0x%p fault address 0x%llx\n", svms, addr);
2981
2982        if (atomic_read(&svms->drain_pagefaults)) {
2983                pr_debug("page fault handling disabled, drop fault 0x%llx\n", addr);
2984                r = 0;
2985                goto out;
2986        }
2987
2988        node = kfd_node_by_irq_ids(adev, node_id, vmid);
2989        if (!node) {
2990                pr_debug("kfd node does not exist node_id: %d, vmid: %d\n", node_id,
2991                         vmid);
2992                r = -EFAULT;
2993                goto out;
2994        }
2995
2996        if (kfd_process_gpuid_from_node(p, node, &gpuid, &gpuidx)) {
2997                pr_debug("failed to get gpuid/gpuidex for node_id: %d\n", node_id);
2998                r = -EFAULT;
2999                goto out;
3000        }
3001
3002        if (!p->xnack_enabled) {
3003                pr_debug("XNACK not enabled for pasid 0x%x\n", pasid);
3004                r = -EFAULT;
3005                goto out;
3006        }
3007
3008        /* p->lead_thread is available as kfd_process_wq_release flush the work
3009         * before releasing task ref.
3010         */
3011        mm = get_task_mm(p->lead_thread);
3012        if (!mm) {
3013                pr_debug("svms 0x%p failed to get mm\n", svms);
3014                r = 0;
3015                goto out;
3016        }
3017
3018        mmap_read_lock(mm);
3019retry_write_locked:
3020        mutex_lock(&svms->lock);
3021
3022        /* check if this page fault time stamp is before svms->checkpoint_ts */
3023        if (svms->checkpoint_ts[gpuidx] != 0) {
3024                if (amdgpu_ih_ts_after_or_equal(ts,  svms->checkpoint_ts[gpuidx])) {
3025                        pr_debug("draining retry fault, drop fault 0x%llx\n", addr);
3026                        r = -EAGAIN;
3027                        goto out_unlock_svms;
3028                } else {
3029                        /* ts is after svms->checkpoint_ts now, reset svms->checkpoint_ts
3030                         * to zero to avoid following ts wrap around give wrong comparing
3031                         */
3032                        svms->checkpoint_ts[gpuidx] = 0;
3033                }
3034        }
3035
3036        prange = svm_range_from_addr(svms, addr, NULL);
3037        if (!prange) {
3038                pr_debug("failed to find prange svms 0x%p address [0x%llx]\n",
3039                         svms, addr);
3040                if (!write_locked) {
3041                        /* Need the write lock to create new range with MMU notifier.
3042                         * Also flush pending deferred work to make sure the interval
3043                         * tree is up to date before we add a new range
3044                         */
3045                        mutex_unlock(&svms->lock);
3046                        mmap_read_unlock(mm);
3047                        mmap_write_lock(mm);
3048                        write_locked = true;
3049                        goto retry_write_locked;
3050                }
3051                prange = svm_range_create_unregistered_range(node, p, mm, addr);
3052                if (!prange) {
3053                        pr_debug("failed to create unregistered range svms 0x%p address [0x%llx]\n",
3054                                 svms, addr);
3055                        mmap_write_downgrade(mm);
3056                        r = -EFAULT;
3057                        goto out_unlock_svms;
3058                }
3059        }
3060        if (write_locked)
3061                mmap_write_downgrade(mm);
3062
3063        mutex_lock(&prange->migrate_mutex);
3064
3065        if (svm_range_skip_recover(prange)) {
3066                amdgpu_gmc_filter_faults_remove(node->adev, addr, pasid);
3067                r = 0;
3068                goto out_unlock_range;
3069        }
3070
3071        /* skip duplicate vm fault on different pages of same range */
3072        if (ktime_before(timestamp, ktime_add_ns(prange->validate_timestamp,
3073                                AMDGPU_SVM_RANGE_RETRY_FAULT_PENDING))) {
3074                pr_debug("svms 0x%p [0x%lx %lx] already restored\n",
3075                         svms, prange->start, prange->last);
3076                r = 0;
3077                goto out_unlock_range;
3078        }
3079
3080        /* __do_munmap removed VMA, return success as we are handling stale
3081         * retry fault.
3082         */
3083        vma = vma_lookup(mm, addr << PAGE_SHIFT);
3084        if (!vma) {
3085                pr_debug("address 0x%llx VMA is removed\n", addr);
3086                r = 0;
3087                goto out_unlock_range;
3088        }
3089
3090        if (!svm_fault_allowed(vma, write_fault)) {
3091                pr_debug("fault addr 0x%llx no %s permission\n", addr,
3092                        write_fault ? "write" : "read");
3093                r = -EPERM;
3094                goto out_unlock_range;
3095        }
3096
3097        best_loc = svm_range_best_restore_location(prange, node, &gpuidx);
3098        if (best_loc == -1) {
3099                pr_debug("svms %p failed get best restore loc [0x%lx 0x%lx]\n",
3100                         svms, prange->start, prange->last);
3101                r = -EACCES;
3102                goto out_unlock_range;
3103        }
3104
3105        pr_debug("svms %p [0x%lx 0x%lx] best restore 0x%x, actual loc 0x%x\n",
3106                 svms, prange->start, prange->last, best_loc,
3107                 prange->actual_loc);
3108
3109        kfd_smi_event_page_fault_start(node, p->lead_thread->pid, addr,
3110                                       write_fault, timestamp);
3111
3112        /* Align migration range start and size to granularity size */
3113        size = 1UL << prange->granularity;
3114        start = max_t(unsigned long, ALIGN_DOWN(addr, size), prange->start);
3115        last = min_t(unsigned long, ALIGN(addr + 1, size) - 1, prange->last);
3116        if (prange->actual_loc != 0 || best_loc != 0) {
3117                if (best_loc) {
3118                        r = svm_migrate_to_vram(prange, best_loc, start, last,
3119                                        mm, KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU);
3120                        if (r) {
3121                                pr_debug("svm_migrate_to_vram failed (%d) at %llx, falling back to system memory\n",
3122                                         r, addr);
3123                                /* Fallback to system memory if migration to
3124                                 * VRAM failed
3125                                 */
3126                                if (prange->actual_loc && prange->actual_loc != best_loc)
3127                                        r = svm_migrate_vram_to_ram(prange, mm, start, last,
3128                                                KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU, NULL);
3129                                else
3130                                        r = 0;
3131                        }
3132                } else {
3133                        r = svm_migrate_vram_to_ram(prange, mm, start, last,
3134                                        KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU, NULL);
3135                }
3136                if (r) {
3137                        pr_debug("failed %d to migrate svms %p [0x%lx 0x%lx]\n",
3138                                 r, svms, start, last);
3139                        goto out_migrate_fail;
3140                } else {
3141                        migration = true;
3142                }
3143        }
3144
3145        r = svm_range_validate_and_map(mm, start, last, prange, gpuidx, false,
3146                                       false, false);
3147        if (r)
3148                pr_debug("failed %d to map svms 0x%p [0x%lx 0x%lx] to gpus\n",
3149                         r, svms, start, last);
3150
3151out_migrate_fail:
3152        kfd_smi_event_page_fault_end(node, p->lead_thread->pid, addr,
3153                                     migration);
3154
3155out_unlock_range:
3156        mutex_unlock(&prange->migrate_mutex);
3157out_unlock_svms:
3158        mutex_unlock(&svms->lock);
3159        mmap_read_unlock(mm);
3160
3161        if (r != -EAGAIN)
3162                svm_range_count_fault(node, p, gpuidx);
3163
3164        mmput(mm);
3165out:
3166        kfd_unref_process(p);
3167
3168        if (r == -EAGAIN) {
3169                pr_debug("recover vm fault later\n");
3170                amdgpu_gmc_filter_faults_remove(node->adev, addr, pasid);
3171                r = 0;
3172        }
3173        return r;
3174}
3175
3176int
3177svm_range_switch_xnack_reserve_mem(struct kfd_process *p, bool xnack_enabled)
3178{
3179        struct svm_range *prange, *pchild;
3180        uint64_t reserved_size = 0;
3181        uint64_t size;
3182        int r = 0;
3183
3184        pr_debug("switching xnack from %d to %d\n", p->xnack_enabled, xnack_enabled);
3185
3186        mutex_lock(&p->svms.lock);
3187
3188        list_for_each_entry(prange, &p->svms.list, list) {
3189                svm_range_lock(prange);
3190                list_for_each_entry(pchild, &prange->child_list, child_list) {
3191                        size = (pchild->last - pchild->start + 1) << PAGE_SHIFT;
3192                        if (xnack_enabled) {
3193                                amdgpu_amdkfd_unreserve_mem_limit(NULL, size,
3194                                        KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, 0);
3195                        } else {
3196                                r = amdgpu_amdkfd_reserve_mem_limit(NULL, size,
3197                                        KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, 0);
3198                                if (r)
3199                                        goto out_unlock;
3200                                reserved_size += size;
3201                        }
3202                }
3203
3204                size = (prange->last - prange->start + 1) << PAGE_SHIFT;
3205                if (xnack_enabled) {
3206                        amdgpu_amdkfd_unreserve_mem_limit(NULL, size,
3207                                        KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, 0);
3208                } else {
3209                        r = amdgpu_amdkfd_reserve_mem_limit(NULL, size,
3210                                        KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, 0);
3211                        if (r)
3212                                goto out_unlock;
3213                        reserved_size += size;
3214                }
3215out_unlock:
3216                svm_range_unlock(prange);
3217                if (r)
3218                        break;
3219        }
3220
3221        if (r)
3222                amdgpu_amdkfd_unreserve_mem_limit(NULL, reserved_size,
3223                                        KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, 0);
3224        else
3225                /* Change xnack mode must be inside svms lock, to avoid race with
3226                 * svm_range_deferred_list_work unreserve memory in parallel.
3227                 */
3228                p->xnack_enabled = xnack_enabled;
3229
3230        mutex_unlock(&p->svms.lock);
3231        return r;
3232}
3233
3234void svm_range_list_fini(struct kfd_process *p)
3235{
3236        struct svm_range *prange;
3237        struct svm_range *next;
3238
3239        pr_debug("process pid %d svms 0x%p\n", p->lead_thread->pid,
3240                 &p->svms);
3241
3242        cancel_delayed_work_sync(&p->svms.restore_work);
3243
3244        /* Ensure list work is finished before process is destroyed */
3245        flush_work(&p->svms.deferred_list_work);
3246
3247        /*
3248         * Ensure no retry fault comes in afterwards, as page fault handler will
3249         * not find kfd process and take mm lock to recover fault.
3250         * stop kfd page fault handing, then wait pending page faults got drained
3251         */
3252        atomic_set(&p->svms.drain_pagefaults, 1);
3253        svm_range_drain_retry_fault(&p->svms);
3254
3255        list_for_each_entry_safe(prange, next, &p->svms.list, list) {
3256                svm_range_unlink(prange);
3257                svm_range_remove_notifier(prange);
3258                svm_range_free(prange, true);
3259        }
3260
3261        mutex_destroy(&p->svms.lock);
3262
3263        pr_debug("process pid %d svms 0x%p done\n",
3264                p->lead_thread->pid, &p->svms);
3265}
3266
3267int svm_range_list_init(struct kfd_process *p)
3268{
3269        struct svm_range_list *svms = &p->svms;
3270        int i;
3271
3272        svms->objects = RB_ROOT_CACHED;
3273        mutex_init(&svms->lock);
3274        INIT_LIST_HEAD(&svms->list);
3275        atomic_set(&svms->evicted_ranges, 0);
3276        atomic_set(&svms->drain_pagefaults, 0);
3277        INIT_DELAYED_WORK(&svms->restore_work, svm_range_restore_work);
3278        INIT_WORK(&svms->deferred_list_work, svm_range_deferred_list_work);
3279        INIT_LIST_HEAD(&svms->deferred_range_list);
3280        INIT_LIST_HEAD(&svms->criu_svm_metadata_list);
3281        spin_lock_init(&svms->deferred_list_lock);
3282
3283        for (i = 0; i < p->n_pdds; i++)
3284                if (KFD_IS_SVM_API_SUPPORTED(p->pdds[i]->dev->adev))
3285                        bitmap_set(svms->bitmap_supported, i, 1);
3286
3287         /* Value of default granularity cannot exceed 0x1B, the
3288          * number of pages supported by a 4-level paging table
3289          */
3290        svms->default_granularity = min_t(u8, amdgpu_svm_default_granularity, 0x1B);
3291        pr_debug("Default SVM Granularity to use: %d\n", svms->default_granularity);
3292
3293        return 0;
3294}
3295
3296/**
3297 * svm_range_check_vm - check if virtual address range mapped already
3298 * @p: current kfd_process
3299 * @start: range start address, in pages
3300 * @last: range last address, in pages
3301 * @bo_s: mapping start address in pages if address range already mapped
3302 * @bo_l: mapping last address in pages if address range already mapped
3303 *
3304 * The purpose is to avoid virtual address ranges already allocated by
3305 * kfd_ioctl_alloc_memory_of_gpu ioctl.
3306 * It looks for each pdd in the kfd_process.
3307 *
3308 * Context: Process context
3309 *
3310 * Return 0 - OK, if the range is not mapped.
3311 * Otherwise error code:
3312 * -EADDRINUSE - if address is mapped already by kfd_ioctl_alloc_memory_of_gpu
3313 * -ERESTARTSYS - A wait for the buffer to become unreserved was interrupted by
3314 * a signal. Release all buffer reservations and return to user-space.
3315 */
3316static int
3317svm_range_check_vm(struct kfd_process *p, uint64_t start, uint64_t last,
3318                   uint64_t *bo_s, uint64_t *bo_l)
3319{
3320        struct amdgpu_bo_va_mapping *mapping;
3321        struct interval_tree_node *node;
3322        uint32_t i;
3323        int r;
3324
3325        for (i = 0; i < p->n_pdds; i++) {
3326                struct amdgpu_vm *vm;
3327
3328                if (!p->pdds[i]->drm_priv)
3329                        continue;
3330
3331                vm = drm_priv_to_vm(p->pdds[i]->drm_priv);
3332                r = amdgpu_bo_reserve(vm->root.bo, false);
3333                if (r)
3334                        return r;
3335
3336                node = interval_tree_iter_first(&vm->va, start, last);
3337                if (node) {
3338                        pr_debug("range [0x%llx 0x%llx] already TTM mapped\n",
3339                                 start, last);
3340                        mapping = container_of((struct rb_node *)node,
3341                                               struct amdgpu_bo_va_mapping, rb);
3342                        if (bo_s && bo_l) {
3343                                *bo_s = mapping->start;
3344                                *bo_l = mapping->last;
3345                        }
3346                        amdgpu_bo_unreserve(vm->root.bo);
3347                        return -EADDRINUSE;
3348                }
3349                amdgpu_bo_unreserve(vm->root.bo);
3350        }
3351
3352        return 0;
3353}
3354
3355/**
3356 * svm_range_is_valid - check if virtual address range is valid
3357 * @p: current kfd_process
3358 * @start: range start address, in pages
3359 * @size: range size, in pages
3360 *
3361 * Valid virtual address range means it belongs to one or more VMAs
3362 *
3363 * Context: Process context
3364 *
3365 * Return:
3366 *  0 - OK, otherwise error code
3367 */
3368static int
3369svm_range_is_valid(struct kfd_process *p, uint64_t start, uint64_t size)
3370{
3371        const unsigned long device_vma = VM_IO | VM_PFNMAP | VM_MIXEDMAP;
3372        struct vm_area_struct *vma;
3373        unsigned long end;
3374        unsigned long start_unchg = start;
3375
3376        start <<= PAGE_SHIFT;
3377        end = start + (size << PAGE_SHIFT);
3378        do {
3379                vma = vma_lookup(p->mm, start);
3380                if (!vma || (vma->vm_flags & device_vma))
3381                        return -EFAULT;
3382                start = min(end, vma->vm_end);
3383        } while (start < end);
3384
3385        return svm_range_check_vm(p, start_unchg, (end - 1) >> PAGE_SHIFT, NULL,
3386                                  NULL);
3387}
3388
3389/**
3390 * svm_range_best_prefetch_location - decide the best prefetch location
3391 * @prange: svm range structure
3392 *
3393 * For xnack off:
3394 * If range map to single GPU, the best prefetch location is prefetch_loc, which
3395 * can be CPU or GPU.
3396 *
3397 * If range is ACCESS or ACCESS_IN_PLACE by mGPUs, only if mGPU connection on
3398 * XGMI same hive, the best prefetch location is prefetch_loc GPU, othervise
3399 * the best prefetch location is always CPU, because GPU can not have coherent
3400 * mapping VRAM of other GPUs even with large-BAR PCIe connection.
3401 *
3402 * For xnack on:
3403 * If range is not ACCESS_IN_PLACE by mGPUs, the best prefetch location is
3404 * prefetch_loc, other GPU access will generate vm fault and trigger migration.
3405 *
3406 * If range is ACCESS_IN_PLACE by mGPUs, only if mGPU connection on XGMI same
3407 * hive, the best prefetch location is prefetch_loc GPU, otherwise the best
3408 * prefetch location is always CPU.
3409 *
3410 * Context: Process context
3411 *
3412 * Return:
3413 * 0 for CPU or GPU id
3414 */
3415static uint32_t
3416svm_range_best_prefetch_location(struct svm_range *prange)
3417{
3418        DECLARE_BITMAP(bitmap, MAX_GPU_INSTANCE);
3419        uint32_t best_loc = prange->prefetch_loc;
3420        struct kfd_process_device *pdd;
3421        struct kfd_node *bo_node;
3422        struct kfd_process *p;
3423        uint32_t gpuidx;
3424
3425        p = container_of(prange->svms, struct kfd_process, svms);
3426
3427        if (!best_loc || best_loc == KFD_IOCTL_SVM_LOCATION_UNDEFINED)
3428                goto out;
3429
3430        bo_node = svm_range_get_node_by_id(prange, best_loc);
3431        if (!bo_node) {
3432                WARN_ONCE(1, "failed to get valid kfd node at id%x\n", best_loc);
3433                best_loc = 0;
3434                goto out;
3435        }
3436
3437        if (bo_node->adev->apu_prefer_gtt) {
3438                best_loc = 0;
3439                goto out;
3440        }
3441
3442        if (p->xnack_enabled)
3443                bitmap_copy(bitmap, prange->bitmap_aip, MAX_GPU_INSTANCE);
3444        else
3445                bitmap_or(bitmap, prange->bitmap_access, prange->bitmap_aip,
3446                          MAX_GPU_INSTANCE);
3447
3448        for_each_set_bit(gpuidx, bitmap, MAX_GPU_INSTANCE) {
3449                pdd = kfd_process_device_from_gpuidx(p, gpuidx);
3450                if (!pdd) {
3451                        pr_debug("failed to get device by idx 0x%x\n", gpuidx);
3452                        continue;
3453                }
3454
3455                if (pdd->dev->adev == bo_node->adev)
3456                        continue;
3457
3458                if (!svm_nodes_in_same_hive(pdd->dev, bo_node)) {
3459                        best_loc = 0;
3460                        break;
3461                }
3462        }
3463
3464out:
3465        pr_debug("xnack %d svms 0x%p [0x%lx 0x%lx] best loc 0x%x\n",
3466                 p->xnack_enabled, &p->svms, prange->start, prange->last,
3467                 best_loc);
3468
3469        return best_loc;
3470}
3471
3472/* svm_range_trigger_migration - start page migration if prefetch loc changed
3473 * @mm: current process mm_struct
3474 * @prange: svm range structure
3475 * @migrated: output, true if migration is triggered
3476 *
3477 * If range perfetch_loc is GPU, actual loc is cpu 0, then migrate the range
3478 * from ram to vram.
3479 * If range prefetch_loc is cpu 0, actual loc is GPU, then migrate the range
3480 * from vram to ram.
3481 *
3482 * If GPU vm fault retry is not enabled, migration interact with MMU notifier
3483 * and restore work:
3484 * 1. migrate_vma_setup invalidate pages, MMU notifier callback svm_range_evict
3485 *    stops all queues, schedule restore work
3486 * 2. svm_range_restore_work wait for migration is done by
3487 *    a. svm_range_validate_vram takes prange->migrate_mutex
3488 *    b. svm_range_validate_ram HMM get pages wait for CPU fault handle returns
3489 * 3. restore work update mappings of GPU, resume all queues.
3490 *
3491 * Context: Process context
3492 *
3493 * Return:
3494 * 0 - OK, otherwise - error code of migration
3495 */
3496static int
3497svm_range_trigger_migration(struct mm_struct *mm, struct svm_range *prange,
3498                            bool *migrated)
3499{
3500        uint32_t best_loc;
3501        int r = 0;
3502
3503        *migrated = false;
3504        best_loc = svm_range_best_prefetch_location(prange);
3505
3506        /* when best_loc is a gpu node and same as prange->actual_loc
3507         * we still need do migration as prange->actual_loc !=0 does
3508         * not mean all pages in prange are vram. hmm migrate will pick
3509         * up right pages during migration.
3510         */
3511        if ((best_loc == KFD_IOCTL_SVM_LOCATION_UNDEFINED) ||
3512            (best_loc == 0 && prange->actual_loc == 0))
3513                return 0;
3514
3515        if (!best_loc) {
3516                r = svm_migrate_vram_to_ram(prange, mm, prange->start, prange->last,
3517                                        KFD_MIGRATE_TRIGGER_PREFETCH, NULL);
3518                *migrated = !r;
3519                return r;
3520        }
3521
3522        r = svm_migrate_to_vram(prange, best_loc, prange->start, prange->last,
3523                                mm, KFD_MIGRATE_TRIGGER_PREFETCH);
3524        *migrated = !r;
3525
3526        return 0;
3527}
3528
3529int svm_range_schedule_evict_svm_bo(struct amdgpu_amdkfd_fence *fence)
3530{
3531        /* Dereferencing fence->svm_bo is safe here because the fence hasn't
3532         * signaled yet and we're under the protection of the fence->lock.
3533         * After the fence is signaled in svm_range_bo_release, we cannot get
3534         * here any more.
3535         *
3536         * Reference is dropped in svm_range_evict_svm_bo_worker.
3537         */
3538        if (svm_bo_ref_unless_zero(fence->svm_bo)) {
3539                WRITE_ONCE(fence->svm_bo->evicting, 1);
3540                schedule_work(&fence->svm_bo->eviction_work);
3541        }
3542
3543        return 0;
3544}
3545
3546static void svm_range_evict_svm_bo_worker(struct work_struct *work)
3547{
3548        struct svm_range_bo *svm_bo;
3549        struct mm_struct *mm;
3550        int r = 0;
3551
3552        svm_bo = container_of(work, struct svm_range_bo, eviction_work);
3553
3554        if (mmget_not_zero(svm_bo->eviction_fence->mm)) {
3555                mm = svm_bo->eviction_fence->mm;
3556        } else {
3557                svm_range_bo_unref(svm_bo);
3558                return;
3559        }
3560
3561        mmap_read_lock(mm);
3562        spin_lock(&svm_bo->list_lock);
3563        while (!list_empty(&svm_bo->range_list) && !r) {
3564                struct svm_range *prange =
3565                                list_first_entry(&svm_bo->range_list,
3566                                                struct svm_range, svm_bo_list);
3567                int retries = 3;
3568
3569                list_del_init(&prange->svm_bo_list);
3570                spin_unlock(&svm_bo->list_lock);
3571
3572                pr_debug("svms 0x%p [0x%lx 0x%lx]\n", prange->svms,
3573                         prange->start, prange->last);
3574
3575                mutex_lock(&prange->migrate_mutex);
3576                do {
3577                        /* migrate all vram pages in this prange to sys ram
3578                         * after that prange->actual_loc should be zero
3579                         */
3580                        r = svm_migrate_vram_to_ram(prange, mm,
3581                                        prange->start, prange->last,
3582                                        KFD_MIGRATE_TRIGGER_TTM_EVICTION, NULL);
3583                } while (!r && prange->actual_loc && --retries);
3584
3585                if (!r && prange->actual_loc)
3586                        pr_info_once("Migration failed during eviction");
3587
3588                if (!prange->actual_loc) {
3589                        mutex_lock(&prange->lock);
3590                        prange->svm_bo = NULL;
3591                        mutex_unlock(&prange->lock);
3592                }
3593                mutex_unlock(&prange->migrate_mutex);
3594
3595                spin_lock(&svm_bo->list_lock);
3596        }
3597        spin_unlock(&svm_bo->list_lock);
3598        mmap_read_unlock(mm);
3599        mmput(mm);
3600
3601        dma_fence_signal(&svm_bo->eviction_fence->base);
3602
3603        /* This is the last reference to svm_bo, after svm_range_vram_node_free
3604         * has been called in svm_migrate_vram_to_ram
3605         */
3606        WARN_ONCE(!r && kref_read(&svm_bo->kref) != 1, "This was not the last reference\n");
3607        svm_range_bo_unref(svm_bo);
3608}
3609
3610static int
3611svm_range_set_attr(struct kfd_process *p, struct mm_struct *mm,
3612                   uint64_t start, uint64_t size, uint32_t nattr,
3613                   struct kfd_ioctl_svm_attribute *attrs)
3614{
3615        struct amdkfd_process_info *process_info = p->kgd_process_info;
3616        struct list_head update_list;
3617        struct list_head insert_list;
3618        struct list_head remove_list;
3619        struct list_head remap_list;
3620        struct svm_range_list *svms;
3621        struct svm_range *prange;
3622        struct svm_range *next;
3623        bool update_mapping = false;
3624        bool flush_tlb;
3625        int r, ret = 0;
3626
3627        pr_debug("process pid %d svms 0x%p [0x%llx 0x%llx] pages 0x%llx\n",
3628                 p->lead_thread->pid, &p->svms, start, start + size - 1, size);
3629
3630        r = svm_range_check_attr(p, nattr, attrs);
3631        if (r)
3632                return r;
3633
3634        svms = &p->svms;
3635
3636        mutex_lock(&process_info->lock);
3637
3638        svm_range_list_lock_and_flush_work(svms, mm);
3639
3640        r = svm_range_is_valid(p, start, size);
3641        if (r) {
3642                pr_debug("invalid range r=%d\n", r);
3643                mmap_write_unlock(mm);
3644                goto out;
3645        }
3646
3647        mutex_lock(&svms->lock);
3648
3649        /* Add new range and split existing ranges as needed */
3650        r = svm_range_add(p, start, size, nattr, attrs, &update_list,
3651                          &insert_list, &remove_list, &remap_list);
3652        if (r) {
3653                mutex_unlock(&svms->lock);
3654                mmap_write_unlock(mm);
3655                goto out;
3656        }
3657        /* Apply changes as a transaction */
3658        list_for_each_entry_safe(prange, next, &insert_list, list) {
3659                svm_range_add_to_svms(prange);
3660                svm_range_add_notifier_locked(mm, prange);
3661        }
3662        list_for_each_entry(prange, &update_list, update_list) {
3663                svm_range_apply_attrs(p, prange, nattr, attrs, &update_mapping);
3664                /* TODO: unmap ranges from GPU that lost access */
3665        }
3666        list_for_each_entry_safe(prange, next, &remove_list, update_list) {
3667                pr_debug("unlink old 0x%p prange 0x%p [0x%lx 0x%lx]\n",
3668                         prange->svms, prange, prange->start,
3669                         prange->last);
3670                svm_range_unlink(prange);
3671                svm_range_remove_notifier(prange);
3672                svm_range_free(prange, false);
3673        }
3674
3675        mmap_write_downgrade(mm);
3676        /* Trigger migrations and revalidate and map to GPUs as needed. If
3677         * this fails we may be left with partially completed actions. There
3678         * is no clean way of rolling back to the previous state in such a
3679         * case because the rollback wouldn't be guaranteed to work either.
3680         */
3681        list_for_each_entry(prange, &update_list, update_list) {
3682                bool migrated;
3683
3684                mutex_lock(&prange->migrate_mutex);
3685
3686                r = svm_range_trigger_migration(mm, prange, &migrated);
3687                if (r)
3688                        goto out_unlock_range;
3689
3690                if (migrated && (!p->xnack_enabled ||
3691                    (prange->flags & KFD_IOCTL_SVM_FLAG_GPU_ALWAYS_MAPPED)) &&
3692                    prange->mapped_to_gpu) {
3693                        pr_debug("restore_work will update mappings of GPUs\n");
3694                        mutex_unlock(&prange->migrate_mutex);
3695                        continue;
3696                }
3697
3698                if (!migrated && !update_mapping) {
3699                        mutex_unlock(&prange->migrate_mutex);
3700                        continue;
3701                }
3702
3703                flush_tlb = !migrated && update_mapping && prange->mapped_to_gpu;
3704
3705                r = svm_range_validate_and_map(mm, prange->start, prange->last, prange,
3706                                               MAX_GPU_INSTANCE, true, true, flush_tlb);
3707                if (r)
3708                        pr_debug("failed %d to map svm range\n", r);
3709
3710out_unlock_range:
3711                mutex_unlock(&prange->migrate_mutex);
3712                if (r)
3713                        ret = r;
3714        }
3715
3716        list_for_each_entry(prange, &remap_list, update_list) {
3717                pr_debug("Remapping prange 0x%p [0x%lx 0x%lx]\n",
3718                         prange, prange->start, prange->last);
3719                mutex_lock(&prange->migrate_mutex);
3720                r = svm_range_validate_and_map(mm,  prange->start, prange->last, prange,
3721                                               MAX_GPU_INSTANCE, true, true, prange->mapped_to_gpu);
3722                if (r)
3723                        pr_debug("failed %d on remap svm range\n", r);
3724                mutex_unlock(&prange->migrate_mutex);
3725                if (r)
3726                        ret = r;
3727        }
3728
3729        dynamic_svm_range_dump(svms);
3730
3731        mutex_unlock(&svms->lock);
3732        mmap_read_unlock(mm);
3733out:
3734        mutex_unlock(&process_info->lock);
3735
3736        pr_debug("process pid %d svms 0x%p [0x%llx 0x%llx] done, r=%d\n",
3737                 p->lead_thread->pid, &p->svms, start, start + size - 1, r);
3738
3739        return ret ? ret : r;
3740}
3741
3742static int
3743svm_range_get_attr(struct kfd_process *p, struct mm_struct *mm,
3744                   uint64_t start, uint64_t size, uint32_t nattr,
3745                   struct kfd_ioctl_svm_attribute *attrs)
3746{
3747        DECLARE_BITMAP(bitmap_access, MAX_GPU_INSTANCE);
3748        DECLARE_BITMAP(bitmap_aip, MAX_GPU_INSTANCE);
3749        bool get_preferred_loc = false;
3750        bool get_prefetch_loc = false;
3751        bool get_granularity = false;
3752        bool get_accessible = false;
3753        bool get_flags = false;
3754        uint64_t last = start + size - 1UL;
3755        uint8_t granularity = 0xff;
3756        struct interval_tree_node *node;
3757        struct svm_range_list *svms;
3758        struct svm_range *prange;
3759        uint32_t prefetch_loc = KFD_IOCTL_SVM_LOCATION_UNDEFINED;
3760        uint32_t location = KFD_IOCTL_SVM_LOCATION_UNDEFINED;
3761        uint32_t flags_and = 0xffffffff;
3762        uint32_t flags_or = 0;
3763        int gpuidx;
3764        uint32_t i;
3765        int r = 0;
3766
3767        pr_debug("svms 0x%p [0x%llx 0x%llx] nattr 0x%x\n", &p->svms, start,
3768                 start + size - 1, nattr);
3769
3770        /* Flush pending deferred work to avoid racing with deferred actions from
3771         * previous memory map changes (e.g. munmap). Concurrent memory map changes
3772         * can still race with get_attr because we don't hold the mmap lock. But that
3773         * would be a race condition in the application anyway, and undefined
3774         * behaviour is acceptable in that case.
3775         */
3776        flush_work(&p->svms.deferred_list_work);
3777
3778        mmap_read_lock(mm);
3779        r = svm_range_is_valid(p, start, size);
3780        mmap_read_unlock(mm);
3781        if (r) {
3782                pr_debug("invalid range r=%d\n", r);
3783                return r;
3784        }
3785
3786        for (i = 0; i < nattr; i++) {
3787                switch (attrs[i].type) {
3788                case KFD_IOCTL_SVM_ATTR_PREFERRED_LOC:
3789                        get_preferred_loc = true;
3790                        break;
3791                case KFD_IOCTL_SVM_ATTR_PREFETCH_LOC:
3792                        get_prefetch_loc = true;
3793                        break;
3794                case KFD_IOCTL_SVM_ATTR_ACCESS:
3795                        get_accessible = true;
3796                        break;
3797                case KFD_IOCTL_SVM_ATTR_SET_FLAGS:
3798                case KFD_IOCTL_SVM_ATTR_CLR_FLAGS:
3799                        get_flags = true;
3800                        break;
3801                case KFD_IOCTL_SVM_ATTR_GRANULARITY:
3802                        get_granularity = true;
3803                        break;
3804                case KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE:
3805                case KFD_IOCTL_SVM_ATTR_NO_ACCESS:
3806                        fallthrough;
3807                default:
3808                        pr_debug("get invalid attr type 0x%x\n", attrs[i].type);
3809                        return -EINVAL;
3810                }
3811        }
3812
3813        svms = &p->svms;
3814
3815        mutex_lock(&svms->lock);
3816
3817        node = interval_tree_iter_first(&svms->objects, start, last);
3818        if (!node) {
3819                pr_debug("range attrs not found return default values\n");
3820                svm_range_set_default_attributes(svms, &location, &prefetch_loc,
3821                                                 &granularity, &flags_and);
3822                flags_or = flags_and;
3823                if (p->xnack_enabled)
3824                        bitmap_copy(bitmap_access, svms->bitmap_supported,
3825                                    MAX_GPU_INSTANCE);
3826                else
3827                        bitmap_zero(bitmap_access, MAX_GPU_INSTANCE);
3828                bitmap_zero(bitmap_aip, MAX_GPU_INSTANCE);
3829                goto fill_values;
3830        }
3831        bitmap_copy(bitmap_access, svms->bitmap_supported, MAX_GPU_INSTANCE);
3832        bitmap_copy(bitmap_aip, svms->bitmap_supported, MAX_GPU_INSTANCE);
3833
3834        while (node) {
3835                struct interval_tree_node *next;
3836
3837                prange = container_of(node, struct svm_range, it_node);
3838                next = interval_tree_iter_next(node, start, last);
3839
3840                if (get_preferred_loc) {
3841                        if (prange->preferred_loc ==
3842                                        KFD_IOCTL_SVM_LOCATION_UNDEFINED ||
3843                            (location != KFD_IOCTL_SVM_LOCATION_UNDEFINED &&
3844                             location != prange->preferred_loc)) {
3845                                location = KFD_IOCTL_SVM_LOCATION_UNDEFINED;
3846                                get_preferred_loc = false;
3847                        } else {
3848                                location = prange->preferred_loc;
3849                        }
3850                }
3851                if (get_prefetch_loc) {
3852                        if (prange->prefetch_loc ==
3853                                        KFD_IOCTL_SVM_LOCATION_UNDEFINED ||
3854                            (prefetch_loc != KFD_IOCTL_SVM_LOCATION_UNDEFINED &&
3855                             prefetch_loc != prange->prefetch_loc)) {
3856                                prefetch_loc = KFD_IOCTL_SVM_LOCATION_UNDEFINED;
3857                                get_prefetch_loc = false;
3858                        } else {
3859                                prefetch_loc = prange->prefetch_loc;
3860                        }
3861                }
3862                if (get_accessible) {
3863                        bitmap_and(bitmap_access, bitmap_access,
3864                                   prange->bitmap_access, MAX_GPU_INSTANCE);
3865                        bitmap_and(bitmap_aip, bitmap_aip,
3866                                   prange->bitmap_aip, MAX_GPU_INSTANCE);
3867                }
3868                if (get_flags) {
3869                        flags_and &= prange->flags;
3870                        flags_or |= prange->flags;
3871                }
3872
3873                if (get_granularity && prange->granularity < granularity)
3874                        granularity = prange->granularity;
3875
3876                node = next;
3877        }
3878fill_values:
3879        mutex_unlock(&svms->lock);
3880
3881        for (i = 0; i < nattr; i++) {
3882                switch (attrs[i].type) {
3883                case KFD_IOCTL_SVM_ATTR_PREFERRED_LOC:
3884                        attrs[i].value = location;
3885                        break;
3886                case KFD_IOCTL_SVM_ATTR_PREFETCH_LOC:
3887                        attrs[i].value = prefetch_loc;
3888                        break;
3889                case KFD_IOCTL_SVM_ATTR_ACCESS:
3890                        gpuidx = kfd_process_gpuidx_from_gpuid(p,
3891                                                               attrs[i].value);
3892                        if (gpuidx < 0) {
3893                                pr_debug("invalid gpuid %x\n", attrs[i].value);
3894                                return -EINVAL;
3895                        }
3896                        if (test_bit(gpuidx, bitmap_access))
3897                                attrs[i].type = KFD_IOCTL_SVM_ATTR_ACCESS;
3898                        else if (test_bit(gpuidx, bitmap_aip))
3899                                attrs[i].type =
3900                                        KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE;
3901                        else
3902                                attrs[i].type = KFD_IOCTL_SVM_ATTR_NO_ACCESS;
3903                        break;
3904                case KFD_IOCTL_SVM_ATTR_SET_FLAGS:
3905                        attrs[i].value = flags_and;
3906                        break;
3907                case KFD_IOCTL_SVM_ATTR_CLR_FLAGS:
3908                        attrs[i].value = ~flags_or;
3909                        break;
3910                case KFD_IOCTL_SVM_ATTR_GRANULARITY:
3911                        attrs[i].value = (uint32_t)granularity;
3912                        break;
3913                }
3914        }
3915
3916        return 0;
3917}
3918
3919int kfd_criu_resume_svm(struct kfd_process *p)
3920{
3921        struct kfd_ioctl_svm_attribute *set_attr_new, *set_attr = NULL;
3922        int nattr_common = 4, nattr_accessibility = 1;
3923        struct criu_svm_metadata *criu_svm_md = NULL;
3924        struct svm_range_list *svms = &p->svms;
3925        struct criu_svm_metadata *next = NULL;
3926        uint32_t set_flags = 0xffffffff;
3927        int i, j, num_attrs, ret = 0;
3928        uint64_t set_attr_size;
3929        struct mm_struct *mm;
3930
3931        if (list_empty(&svms->criu_svm_metadata_list)) {
3932                pr_debug("No SVM data from CRIU restore stage 2\n");
3933                return ret;
3934        }
3935
3936        mm = get_task_mm(p->lead_thread);
3937        if (!mm) {
3938                pr_err("failed to get mm for the target process\n");
3939                return -ESRCH;
3940        }
3941
3942        num_attrs = nattr_common + (nattr_accessibility * p->n_pdds);
3943
3944        i = j = 0;
3945        list_for_each_entry(criu_svm_md, &svms->criu_svm_metadata_list, list) {
3946                pr_debug("criu_svm_md[%d]\n\tstart: 0x%llx size: 0x%llx (npages)\n",
3947                         i, criu_svm_md->data.start_addr, criu_svm_md->data.size);
3948
3949                for (j = 0; j < num_attrs; j++) {
3950                        pr_debug("\ncriu_svm_md[%d]->attrs[%d].type : 0x%x\ncriu_svm_md[%d]->attrs[%d].value : 0x%x\n",
3951                                 i, j, criu_svm_md->data.attrs[j].type,
3952                                 i, j, criu_svm_md->data.attrs[j].value);
3953                        switch (criu_svm_md->data.attrs[j].type) {
3954                        /* During Checkpoint operation, the query for
3955                         * KFD_IOCTL_SVM_ATTR_PREFETCH_LOC attribute might
3956                         * return KFD_IOCTL_SVM_LOCATION_UNDEFINED if they were
3957                         * not used by the range which was checkpointed. Care
3958                         * must be taken to not restore with an invalid value
3959                         * otherwise the gpuidx value will be invalid and
3960                         * set_attr would eventually fail so just replace those
3961                         * with another dummy attribute such as
3962                         * KFD_IOCTL_SVM_ATTR_SET_FLAGS.
3963                         */
3964                        case KFD_IOCTL_SVM_ATTR_PREFETCH_LOC:
3965                                if (criu_svm_md->data.attrs[j].value ==
3966                                    KFD_IOCTL_SVM_LOCATION_UNDEFINED) {
3967                                        criu_svm_md->data.attrs[j].type =
3968                                                KFD_IOCTL_SVM_ATTR_SET_FLAGS;
3969                                        criu_svm_md->data.attrs[j].value = 0;
3970                                }
3971                                break;
3972                        case KFD_IOCTL_SVM_ATTR_SET_FLAGS:
3973                                set_flags = criu_svm_md->data.attrs[j].value;
3974                                break;
3975                        default:
3976                                break;
3977                        }
3978                }
3979
3980                /* CLR_FLAGS is not available via get_attr during checkpoint but
3981                 * it needs to be inserted before restoring the ranges so
3982                 * allocate extra space for it before calling set_attr
3983                 */
3984                set_attr_size = sizeof(struct kfd_ioctl_svm_attribute) *
3985                                                (num_attrs + 1);
3986                set_attr_new = krealloc(set_attr, set_attr_size,
3987                                            GFP_KERNEL);
3988                if (!set_attr_new) {
3989                        ret = -ENOMEM;
3990                        goto exit;
3991                }
3992                set_attr = set_attr_new;
3993
3994                memcpy(set_attr, criu_svm_md->data.attrs, num_attrs *
3995                                        sizeof(struct kfd_ioctl_svm_attribute));
3996                set_attr[num_attrs].type = KFD_IOCTL_SVM_ATTR_CLR_FLAGS;
3997                set_attr[num_attrs].value = ~set_flags;
3998
3999                ret = svm_range_set_attr(p, mm, criu_svm_md->data.start_addr,
4000                                         criu_svm_md->data.size, num_attrs + 1,
4001                                         set_attr);
4002                if (ret) {
4003                        pr_err("CRIU: failed to set range attributes\n");
4004                        goto exit;
4005                }
4006
4007                i++;
4008        }
4009exit:
4010        kfree(set_attr);
4011        list_for_each_entry_safe(criu_svm_md, next, &svms->criu_svm_metadata_list, list) {
4012                pr_debug("freeing criu_svm_md[]\n\tstart: 0x%llx\n",
4013                                                criu_svm_md->data.start_addr);
4014                kfree(criu_svm_md);
4015        }
4016
4017        mmput(mm);
4018        return ret;
4019
4020}
4021
4022int kfd_criu_restore_svm(struct kfd_process *p,
4023                         uint8_t __user *user_priv_ptr,
4024                         uint64_t *priv_data_offset,
4025                         uint64_t max_priv_data_size)
4026{
4027        uint64_t svm_priv_data_size, svm_object_md_size, svm_attrs_size;
4028        int nattr_common = 4, nattr_accessibility = 1;
4029        struct criu_svm_metadata *criu_svm_md = NULL;
4030        struct svm_range_list *svms = &p->svms;
4031        uint32_t num_devices;
4032        int ret = 0;
4033
4034        num_devices = p->n_pdds;
4035        /* Handle one SVM range object at a time, also the number of gpus are
4036         * assumed to be same on the restore node, checking must be done while
4037         * evaluating the topology earlier
4038         */
4039
4040        svm_attrs_size = sizeof(struct kfd_ioctl_svm_attribute) *
4041                (nattr_common + nattr_accessibility * num_devices);
4042        svm_object_md_size = sizeof(struct criu_svm_metadata) + svm_attrs_size;
4043
4044        svm_priv_data_size = sizeof(struct kfd_criu_svm_range_priv_data) +
4045                                                                svm_attrs_size;
4046
4047        criu_svm_md = kzalloc(svm_object_md_size, GFP_KERNEL);
4048        if (!criu_svm_md) {
4049                pr_err("failed to allocate memory to store svm metadata\n");
4050                return -ENOMEM;
4051        }
4052        if (*priv_data_offset + svm_priv_data_size > max_priv_data_size) {
4053                ret = -EINVAL;
4054                goto exit;
4055        }
4056
4057        ret = copy_from_user(&criu_svm_md->data, user_priv_ptr + *priv_data_offset,
4058                             svm_priv_data_size);
4059        if (ret) {
4060                ret = -EFAULT;
4061                goto exit;
4062        }
4063        *priv_data_offset += svm_priv_data_size;
4064
4065        list_add_tail(&criu_svm_md->list, &svms->criu_svm_metadata_list);
4066
4067        return 0;
4068
4069
4070exit:
4071        kfree(criu_svm_md);
4072        return ret;
4073}
4074
4075void svm_range_get_info(struct kfd_process *p, uint32_t *num_svm_ranges,
4076                        uint64_t *svm_priv_data_size)
4077{
4078        uint64_t total_size, accessibility_size, common_attr_size;
4079        int nattr_common = 4, nattr_accessibility = 1;
4080        int num_devices = p->n_pdds;
4081        struct svm_range_list *svms;
4082        struct svm_range *prange;
4083        uint32_t count = 0;
4084
4085        *svm_priv_data_size = 0;
4086
4087        svms = &p->svms;
4088
4089        mutex_lock(&svms->lock);
4090        list_for_each_entry(prange, &svms->list, list) {
4091                pr_debug("prange: 0x%p start: 0x%lx\t npages: 0x%llx\t end: 0x%llx\n",
4092                         prange, prange->start, prange->npages,
4093                         prange->start + prange->npages - 1);
4094                count++;
4095        }
4096        mutex_unlock(&svms->lock);
4097
4098        *num_svm_ranges = count;
4099        /* Only the accessbility attributes need to be queried for all the gpus
4100         * individually, remaining ones are spanned across the entire process
4101         * regardless of the various gpu nodes. Of the remaining attributes,
4102         * KFD_IOCTL_SVM_ATTR_CLR_FLAGS need not be saved.
4103         *
4104         * KFD_IOCTL_SVM_ATTR_PREFERRED_LOC
4105         * KFD_IOCTL_SVM_ATTR_PREFETCH_LOC
4106         * KFD_IOCTL_SVM_ATTR_SET_FLAGS
4107         * KFD_IOCTL_SVM_ATTR_GRANULARITY
4108         *
4109         * ** ACCESSBILITY ATTRIBUTES **
4110         * (Considered as one, type is altered during query, value is gpuid)
4111         * KFD_IOCTL_SVM_ATTR_ACCESS
4112         * KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE
4113         * KFD_IOCTL_SVM_ATTR_NO_ACCESS
4114         */
4115        if (*num_svm_ranges > 0) {
4116                common_attr_size = sizeof(struct kfd_ioctl_svm_attribute) *
4117                        nattr_common;
4118                accessibility_size = sizeof(struct kfd_ioctl_svm_attribute) *
4119                        nattr_accessibility * num_devices;
4120
4121                total_size = sizeof(struct kfd_criu_svm_range_priv_data) +
4122                        common_attr_size + accessibility_size;
4123
4124                *svm_priv_data_size = *num_svm_ranges * total_size;
4125        }
4126
4127        pr_debug("num_svm_ranges %u total_priv_size %llu\n", *num_svm_ranges,
4128                 *svm_priv_data_size);
4129}
4130
4131int kfd_criu_checkpoint_svm(struct kfd_process *p,
4132                            uint8_t __user *user_priv_data,
4133                            uint64_t *priv_data_offset)
4134{
4135        struct kfd_criu_svm_range_priv_data *svm_priv = NULL;
4136        struct kfd_ioctl_svm_attribute *query_attr = NULL;
4137        uint64_t svm_priv_data_size, query_attr_size = 0;
4138        int index, nattr_common = 4, ret = 0;
4139        struct svm_range_list *svms;
4140        int num_devices = p->n_pdds;
4141        struct svm_range *prange;
4142        struct mm_struct *mm;
4143
4144        svms = &p->svms;
4145
4146        mm = get_task_mm(p->lead_thread);
4147        if (!mm) {
4148                pr_err("failed to get mm for the target process\n");
4149                return -ESRCH;
4150        }
4151
4152        query_attr_size = sizeof(struct kfd_ioctl_svm_attribute) *
4153                                (nattr_common + num_devices);
4154
4155        query_attr = kzalloc(query_attr_size, GFP_KERNEL);
4156        if (!query_attr) {
4157                ret = -ENOMEM;
4158                goto exit;
4159        }
4160
4161        query_attr[0].type = KFD_IOCTL_SVM_ATTR_PREFERRED_LOC;
4162        query_attr[1].type = KFD_IOCTL_SVM_ATTR_PREFETCH_LOC;
4163        query_attr[2].type = KFD_IOCTL_SVM_ATTR_SET_FLAGS;
4164        query_attr[3].type = KFD_IOCTL_SVM_ATTR_GRANULARITY;
4165
4166        for (index = 0; index < num_devices; index++) {
4167                struct kfd_process_device *pdd = p->pdds[index];
4168
4169                query_attr[index + nattr_common].type =
4170                        KFD_IOCTL_SVM_ATTR_ACCESS;
4171                query_attr[index + nattr_common].value = pdd->user_gpu_id;
4172        }
4173
4174        svm_priv_data_size = sizeof(*svm_priv) + query_attr_size;
4175
4176        svm_priv = kzalloc(svm_priv_data_size, GFP_KERNEL);
4177        if (!svm_priv) {
4178                ret = -ENOMEM;
4179                goto exit_query;
4180        }
4181
4182        index = 0;
4183        list_for_each_entry(prange, &svms->list, list) {
4184
4185                svm_priv->object_type = KFD_CRIU_OBJECT_TYPE_SVM_RANGE;
4186                svm_priv->start_addr = prange->start;
4187                svm_priv->size = prange->npages;
4188                memcpy(&svm_priv->attrs, query_attr, query_attr_size);
4189                pr_debug("CRIU: prange: 0x%p start: 0x%lx\t npages: 0x%llx end: 0x%llx\t size: 0x%llx\n",
4190                         prange, prange->start, prange->npages,
4191                         prange->start + prange->npages - 1,
4192                         prange->npages * PAGE_SIZE);
4193
4194                ret = svm_range_get_attr(p, mm, svm_priv->start_addr,
4195                                         svm_priv->size,
4196                                         (nattr_common + num_devices),
4197                                         svm_priv->attrs);
4198                if (ret) {
4199                        pr_err("CRIU: failed to obtain range attributes\n");
4200                        goto exit_priv;
4201                }
4202
4203                if (copy_to_user(user_priv_data + *priv_data_offset, svm_priv,
4204                                 svm_priv_data_size)) {
4205                        pr_err("Failed to copy svm priv to user\n");
4206                        ret = -EFAULT;
4207                        goto exit_priv;
4208                }
4209
4210                *priv_data_offset += svm_priv_data_size;
4211
4212        }
4213
4214
4215exit_priv:
4216        kfree(svm_priv);
4217exit_query:
4218        kfree(query_attr);
4219exit:
4220        mmput(mm);
4221        return ret;
4222}
4223
4224int
4225svm_ioctl(struct kfd_process *p, enum kfd_ioctl_svm_op op, uint64_t start,
4226          uint64_t size, uint32_t nattrs, struct kfd_ioctl_svm_attribute *attrs)
4227{
4228        struct mm_struct *mm = current->mm;
4229        int r;
4230
4231        start >>= PAGE_SHIFT;
4232        size >>= PAGE_SHIFT;
4233
4234        switch (op) {
4235        case KFD_IOCTL_SVM_OP_SET_ATTR:
4236                r = svm_range_set_attr(p, mm, start, size, nattrs, attrs);
4237                break;
4238        case KFD_IOCTL_SVM_OP_GET_ATTR:
4239                r = svm_range_get_attr(p, mm, start, size, nattrs, attrs);
4240                break;
4241        default:
4242                r = EINVAL;
4243                break;
4244        }
4245
4246        return r;
4247}
4248