LXR linux/drivers/gpu/drm/amd/amdkfd/kfd

   1// SPDX-License-Identifier: GPL-2.0 OR MIT
   2/*
   3 * Copyright 2020-2021 Advanced Micro Devices, Inc.
   4 *
   5 * Permission is hereby granted, free of charge, to any person obtaining a
   6 * copy of this software and associated documentation files (the "Software"),
   7 * to deal in the Software without restriction, including without limitation
   8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   9 * and/or sell copies of the Software, and to permit persons to whom the
  10 * Software is furnished to do so, subject to the following conditions:
  11 *
  12 * The above copyright notice and this permission notice shall be included in
  13 * all copies or substantial portions of the Software.
  14 *
  15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
  19 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
  20 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
  21 * OTHER DEALINGS IN THE SOFTWARE.
  22 */
  23
  24#include <linux/types.h>
  25#include <linux/sched/task.h>
  26#include "amdgpu_sync.h"
  27#include "amdgpu_object.h"
  28#include "amdgpu_vm.h"
  29#include "amdgpu_mn.h"
  30#include "amdgpu.h"
  31#include "amdgpu_xgmi.h"
  32#include "kfd_priv.h"
  33#include "kfd_svm.h"
  34#include "kfd_migrate.h"
  35
  36#define AMDGPU_SVM_RANGE_RESTORE_DELAY_MS 1
  37
  38/* Long enough to ensure no retry fault comes after svm range is restored and
  39 * page table is updated.
  40 */
  41#define AMDGPU_SVM_RANGE_RETRY_FAULT_PENDING    2000
  42
  43static void svm_range_evict_svm_bo_worker(struct work_struct *work);
  44static bool
  45svm_range_cpu_invalidate_pagetables(struct mmu_interval_notifier *mni,
  46                                    const struct mmu_notifier_range *range,
  47                                    unsigned long cur_seq);
  48
  49static const struct mmu_interval_notifier_ops svm_range_mn_ops = {
  50        .invalidate = svm_range_cpu_invalidate_pagetables,
  51};
  52
  53/**
  54 * svm_range_unlink - unlink svm_range from lists and interval tree
  55 * @prange: svm range structure to be removed
  56 *
  57 * Remove the svm_range from the svms and svm_bo lists and the svms
  58 * interval tree.
  59 *
  60 * Context: The caller must hold svms->lock
  61 */
  62static void svm_range_unlink(struct svm_range *prange)
  63{
  64        pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx]\n", prange->svms,
  65                 prange, prange->start, prange->last);
  66
  67        if (prange->svm_bo) {
  68                spin_lock(&prange->svm_bo->list_lock);
  69                list_del(&prange->svm_bo_list);
  70                spin_unlock(&prange->svm_bo->list_lock);
  71        }
  72
  73        list_del(&prange->list);
  74        if (prange->it_node.start != 0 && prange->it_node.last != 0)
  75                interval_tree_remove(&prange->it_node, &prange->svms->objects);
  76}
  77
  78static void
  79svm_range_add_notifier_locked(struct mm_struct *mm, struct svm_range *prange)
  80{
  81        pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx]\n", prange->svms,
  82                 prange, prange->start, prange->last);
  83
  84        mmu_interval_notifier_insert_locked(&prange->notifier, mm,
  85                                     prange->start << PAGE_SHIFT,
  86                                     prange->npages << PAGE_SHIFT,
  87                                     &svm_range_mn_ops);
  88}
  89
  90/**
  91 * svm_range_add_to_svms - add svm range to svms
  92 * @prange: svm range structure to be added
  93 *
  94 * Add the svm range to svms interval tree and link list
  95 *
  96 * Context: The caller must hold svms->lock
  97 */
  98static void svm_range_add_to_svms(struct svm_range *prange)
  99{
 100        pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx]\n", prange->svms,
 101                 prange, prange->start, prange->last);
 102
 103        list_add_tail(&prange->list, &prange->svms->list);
 104        prange->it_node.start = prange->start;
 105        prange->it_node.last = prange->last;
 106        interval_tree_insert(&prange->it_node, &prange->svms->objects);
 107}
 108
 109static void svm_range_remove_notifier(struct svm_range *prange)
 110{
 111        pr_debug("remove notifier svms 0x%p prange 0x%p [0x%lx 0x%lx]\n",
 112                 prange->svms, prange,
 113                 prange->notifier.interval_tree.start >> PAGE_SHIFT,
 114                 prange->notifier.interval_tree.last >> PAGE_SHIFT);
 115
 116        if (prange->notifier.interval_tree.start != 0 &&
 117            prange->notifier.interval_tree.last != 0)
 118                mmu_interval_notifier_remove(&prange->notifier);
 119}
 120
 121static bool
 122svm_is_valid_dma_mapping_addr(struct device *dev, dma_addr_t dma_addr)
 123{
 124        return dma_addr && !dma_mapping_error(dev, dma_addr) &&
 125               !(dma_addr & SVM_RANGE_VRAM_DOMAIN);
 126}
 127
 128static int
 129svm_range_dma_map_dev(struct amdgpu_device *adev, struct svm_range *prange,
 130                      unsigned long offset, unsigned long npages,
 131                      unsigned long *hmm_pfns, uint32_t gpuidx)
 132{
 133        enum dma_data_direction dir = DMA_BIDIRECTIONAL;
 134        dma_addr_t *addr = prange->dma_addr[gpuidx];
 135        struct device *dev = adev->dev;
 136        struct page *page;
 137        int i, r;
 138
 139        if (!addr) {
 140                addr = kvmalloc_array(prange->npages, sizeof(*addr),
 141                                      GFP_KERNEL | __GFP_ZERO);
 142                if (!addr)
 143                        return -ENOMEM;
 144                prange->dma_addr[gpuidx] = addr;
 145        }
 146
 147        addr += offset;
 148        for (i = 0; i < npages; i++) {
 149                if (svm_is_valid_dma_mapping_addr(dev, addr[i]))
 150                        dma_unmap_page(dev, addr[i], PAGE_SIZE, dir);
 151
 152                page = hmm_pfn_to_page(hmm_pfns[i]);
 153                if (is_zone_device_page(page)) {
 154                        struct amdgpu_device *bo_adev =
 155                                        amdgpu_ttm_adev(prange->svm_bo->bo->tbo.bdev);
 156
 157                        addr[i] = (hmm_pfns[i] << PAGE_SHIFT) +
 158                                   bo_adev->vm_manager.vram_base_offset -
 159                                   bo_adev->kfd.dev->pgmap.range.start;
 160                        addr[i] |= SVM_RANGE_VRAM_DOMAIN;
 161                        pr_debug("vram address detected: 0x%llx\n", addr[i]);
 162                        continue;
 163                }
 164                addr[i] = dma_map_page(dev, page, 0, PAGE_SIZE, dir);
 165                r = dma_mapping_error(dev, addr[i]);
 166                if (r) {
 167                        pr_debug("failed %d dma_map_page\n", r);
 168                        return r;
 169                }
 170                pr_debug("dma mapping 0x%llx for page addr 0x%lx\n",
 171                         addr[i] >> PAGE_SHIFT, page_to_pfn(page));
 172        }
 173        return 0;
 174}
 175
 176static int
 177svm_range_dma_map(struct svm_range *prange, unsigned long *bitmap,
 178                  unsigned long offset, unsigned long npages,
 179                  unsigned long *hmm_pfns)
 180{
 181        struct kfd_process *p;
 182        uint32_t gpuidx;
 183        int r;
 184
 185        p = container_of(prange->svms, struct kfd_process, svms);
 186
 187        for_each_set_bit(gpuidx, bitmap, MAX_GPU_INSTANCE) {
 188                struct kfd_process_device *pdd;
 189                struct amdgpu_device *adev;
 190
 191                pr_debug("mapping to gpu idx 0x%x\n", gpuidx);
 192                pdd = kfd_process_device_from_gpuidx(p, gpuidx);
 193                if (!pdd) {
 194                        pr_debug("failed to find device idx %d\n", gpuidx);
 195                        return -EINVAL;
 196                }
 197                adev = (struct amdgpu_device *)pdd->dev->kgd;
 198
 199                r = svm_range_dma_map_dev(adev, prange, offset, npages,
 200                                          hmm_pfns, gpuidx);
 201                if (r)
 202                        break;
 203        }
 204
 205        return r;
 206}
 207
 208void svm_range_dma_unmap(struct device *dev, dma_addr_t *dma_addr,
 209                         unsigned long offset, unsigned long npages)
 210{
 211        enum dma_data_direction dir = DMA_BIDIRECTIONAL;
 212        int i;
 213
 214        if (!dma_addr)
 215                return;
 216
 217        for (i = offset; i < offset + npages; i++) {
 218                if (!svm_is_valid_dma_mapping_addr(dev, dma_addr[i]))
 219                        continue;
 220                pr_debug("dma unmapping 0x%llx\n", dma_addr[i] >> PAGE_SHIFT);
 221                dma_unmap_page(dev, dma_addr[i], PAGE_SIZE, dir);
 222                dma_addr[i] = 0;
 223        }
 224}
 225
 226void svm_range_free_dma_mappings(struct svm_range *prange)
 227{
 228        struct kfd_process_device *pdd;
 229        dma_addr_t *dma_addr;
 230        struct device *dev;
 231        struct kfd_process *p;
 232        uint32_t gpuidx;
 233
 234        p = container_of(prange->svms, struct kfd_process, svms);
 235
 236        for (gpuidx = 0; gpuidx < MAX_GPU_INSTANCE; gpuidx++) {
 237                dma_addr = prange->dma_addr[gpuidx];
 238                if (!dma_addr)
 239                        continue;
 240
 241                pdd = kfd_process_device_from_gpuidx(p, gpuidx);
 242                if (!pdd) {
 243                        pr_debug("failed to find device idx %d\n", gpuidx);
 244                        continue;
 245                }
 246                dev = &pdd->dev->pdev->dev;
 247                svm_range_dma_unmap(dev, dma_addr, 0, prange->npages);
 248                kvfree(dma_addr);
 249                prange->dma_addr[gpuidx] = NULL;
 250        }
 251}
 252
 253static void svm_range_free(struct svm_range *prange)
 254{
 255        pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx]\n", prange->svms, prange,
 256                 prange->start, prange->last);
 257
 258        svm_range_vram_node_free(prange);
 259        svm_range_free_dma_mappings(prange);
 260        mutex_destroy(&prange->lock);
 261        mutex_destroy(&prange->migrate_mutex);
 262        kfree(prange);
 263}
 264
 265static void
 266svm_range_set_default_attributes(int32_t *location, int32_t *prefetch_loc,
 267                                 uint8_t *granularity, uint32_t *flags)
 268{
 269        *location = KFD_IOCTL_SVM_LOCATION_UNDEFINED;
 270        *prefetch_loc = KFD_IOCTL_SVM_LOCATION_UNDEFINED;
 271        *granularity = 9;
 272        *flags =
 273                KFD_IOCTL_SVM_FLAG_HOST_ACCESS | KFD_IOCTL_SVM_FLAG_COHERENT;
 274}
 275
 276static struct
 277svm_range *svm_range_new(struct svm_range_list *svms, uint64_t start,
 278                         uint64_t last)
 279{
 280        uint64_t size = last - start + 1;
 281        struct svm_range *prange;
 282        struct kfd_process *p;
 283
 284        prange = kzalloc(sizeof(*prange), GFP_KERNEL);
 285        if (!prange)
 286                return NULL;
 287        prange->npages = size;
 288        prange->svms = svms;
 289        prange->start = start;
 290        prange->last = last;
 291        INIT_LIST_HEAD(&prange->list);
 292        INIT_LIST_HEAD(&prange->update_list);
 293        INIT_LIST_HEAD(&prange->remove_list);
 294        INIT_LIST_HEAD(&prange->insert_list);
 295        INIT_LIST_HEAD(&prange->svm_bo_list);
 296        INIT_LIST_HEAD(&prange->deferred_list);
 297        INIT_LIST_HEAD(&prange->child_list);
 298        atomic_set(&prange->invalid, 0);
 299        prange->validate_timestamp = 0;
 300        mutex_init(&prange->migrate_mutex);
 301        mutex_init(&prange->lock);
 302
 303        p = container_of(svms, struct kfd_process, svms);
 304        if (p->xnack_enabled)
 305                bitmap_copy(prange->bitmap_access, svms->bitmap_supported,
 306                            MAX_GPU_INSTANCE);
 307
 308        svm_range_set_default_attributes(&prange->preferred_loc,
 309                                         &prange->prefetch_loc,
 310                                         &prange->granularity, &prange->flags);
 311
 312        pr_debug("svms 0x%p [0x%llx 0x%llx]\n", svms, start, last);
 313
 314        return prange;
 315}
 316
 317static bool svm_bo_ref_unless_zero(struct svm_range_bo *svm_bo)
 318{
 319        if (!svm_bo || !kref_get_unless_zero(&svm_bo->kref))
 320                return false;
 321
 322        return true;
 323}
 324
 325static void svm_range_bo_release(struct kref *kref)
 326{
 327        struct svm_range_bo *svm_bo;
 328
 329        svm_bo = container_of(kref, struct svm_range_bo, kref);
 330        spin_lock(&svm_bo->list_lock);
 331        while (!list_empty(&svm_bo->range_list)) {
 332                struct svm_range *prange =
 333                                list_first_entry(&svm_bo->range_list,
 334                                                struct svm_range, svm_bo_list);
 335                /* list_del_init tells a concurrent svm_range_vram_node_new when
 336                 * it's safe to reuse the svm_bo pointer and svm_bo_list head.
 337                 */
 338                list_del_init(&prange->svm_bo_list);
 339                spin_unlock(&svm_bo->list_lock);
 340
 341                pr_debug("svms 0x%p [0x%lx 0x%lx]\n", prange->svms,
 342                         prange->start, prange->last);
 343                mutex_lock(&prange->lock);
 344                prange->svm_bo = NULL;
 345                mutex_unlock(&prange->lock);
 346
 347                spin_lock(&svm_bo->list_lock);
 348        }
 349        spin_unlock(&svm_bo->list_lock);
 350        if (!dma_fence_is_signaled(&svm_bo->eviction_fence->base)) {
 351                /* We're not in the eviction worker.
 352                 * Signal the fence and synchronize with any
 353                 * pending eviction work.
 354                 */
 355                dma_fence_signal(&svm_bo->eviction_fence->base);
 356                cancel_work_sync(&svm_bo->eviction_work);
 357        }
 358        dma_fence_put(&svm_bo->eviction_fence->base);
 359        amdgpu_bo_unref(&svm_bo->bo);
 360        kfree(svm_bo);
 361}
 362
 363void svm_range_bo_unref(struct svm_range_bo *svm_bo)
 364{
 365        if (!svm_bo)
 366                return;
 367
 368        kref_put(&svm_bo->kref, svm_range_bo_release);
 369}
 370
 371static bool
 372svm_range_validate_svm_bo(struct amdgpu_device *adev, struct svm_range *prange)
 373{
 374        struct amdgpu_device *bo_adev;
 375
 376        mutex_lock(&prange->lock);
 377        if (!prange->svm_bo) {
 378                mutex_unlock(&prange->lock);
 379                return false;
 380        }
 381        if (prange->ttm_res) {
 382                /* We still have a reference, all is well */
 383                mutex_unlock(&prange->lock);
 384                return true;
 385        }
 386        if (svm_bo_ref_unless_zero(prange->svm_bo)) {
 387                /*
 388                 * Migrate from GPU to GPU, remove range from source bo_adev
 389                 * svm_bo range list, and return false to allocate svm_bo from
 390                 * destination adev.
 391                 */
 392                bo_adev = amdgpu_ttm_adev(prange->svm_bo->bo->tbo.bdev);
 393                if (bo_adev != adev) {
 394                        mutex_unlock(&prange->lock);
 395
 396                        spin_lock(&prange->svm_bo->list_lock);
 397                        list_del_init(&prange->svm_bo_list);
 398                        spin_unlock(&prange->svm_bo->list_lock);
 399
 400                        svm_range_bo_unref(prange->svm_bo);
 401                        return false;
 402                }
 403                if (READ_ONCE(prange->svm_bo->evicting)) {
 404                        struct dma_fence *f;
 405                        struct svm_range_bo *svm_bo;
 406                        /* The BO is getting evicted,
 407                         * we need to get a new one
 408                         */
 409                        mutex_unlock(&prange->lock);
 410                        svm_bo = prange->svm_bo;
 411                        f = dma_fence_get(&svm_bo->eviction_fence->base);
 412                        svm_range_bo_unref(prange->svm_bo);
 413                        /* wait for the fence to avoid long spin-loop
 414                         * at list_empty_careful
 415                         */
 416                        dma_fence_wait(f, false);
 417                        dma_fence_put(f);
 418                } else {
 419                        /* The BO was still around and we got
 420                         * a new reference to it
 421                         */
 422                        mutex_unlock(&prange->lock);
 423                        pr_debug("reuse old bo svms 0x%p [0x%lx 0x%lx]\n",
 424                                 prange->svms, prange->start, prange->last);
 425
 426                        prange->ttm_res = prange->svm_bo->bo->tbo.resource;
 427                        return true;
 428                }
 429
 430        } else {
 431                mutex_unlock(&prange->lock);
 432        }
 433
 434        /* We need a new svm_bo. Spin-loop to wait for concurrent
 435         * svm_range_bo_release to finish removing this range from
 436         * its range list. After this, it is safe to reuse the
 437         * svm_bo pointer and svm_bo_list head.
 438         */
 439        while (!list_empty_careful(&prange->svm_bo_list))
 440                ;
 441
 442        return false;
 443}
 444
 445static struct svm_range_bo *svm_range_bo_new(void)
 446{
 447        struct svm_range_bo *svm_bo;
 448
 449        svm_bo = kzalloc(sizeof(*svm_bo), GFP_KERNEL);
 450        if (!svm_bo)
 451                return NULL;
 452
 453        kref_init(&svm_bo->kref);
 454        INIT_LIST_HEAD(&svm_bo->range_list);
 455        spin_lock_init(&svm_bo->list_lock);
 456
 457        return svm_bo;
 458}
 459
 460int
 461svm_range_vram_node_new(struct amdgpu_device *adev, struct svm_range *prange,
 462                        bool clear)
 463{
 464        struct amdgpu_bo_param bp;
 465        struct svm_range_bo *svm_bo;
 466        struct amdgpu_bo_user *ubo;
 467        struct amdgpu_bo *bo;
 468        struct kfd_process *p;
 469        struct mm_struct *mm;
 470        int r;
 471
 472        p = container_of(prange->svms, struct kfd_process, svms);
 473        pr_debug("pasid: %x svms 0x%p [0x%lx 0x%lx]\n", p->pasid, prange->svms,
 474                 prange->start, prange->last);
 475
 476        if (svm_range_validate_svm_bo(adev, prange))
 477                return 0;
 478
 479        svm_bo = svm_range_bo_new();
 480        if (!svm_bo) {
 481                pr_debug("failed to alloc svm bo\n");
 482                return -ENOMEM;
 483        }
 484        mm = get_task_mm(p->lead_thread);
 485        if (!mm) {
 486                pr_debug("failed to get mm\n");
 487                kfree(svm_bo);
 488                return -ESRCH;
 489        }
 490        svm_bo->svms = prange->svms;
 491        svm_bo->eviction_fence =
 492                amdgpu_amdkfd_fence_create(dma_fence_context_alloc(1),
 493                                           mm,
 494                                           svm_bo);
 495        mmput(mm);
 496        INIT_WORK(&svm_bo->eviction_work, svm_range_evict_svm_bo_worker);
 497        svm_bo->evicting = 0;
 498        memset(&bp, 0, sizeof(bp));
 499        bp.size = prange->npages * PAGE_SIZE;
 500        bp.byte_align = PAGE_SIZE;
 501        bp.domain = AMDGPU_GEM_DOMAIN_VRAM;
 502        bp.flags = AMDGPU_GEM_CREATE_NO_CPU_ACCESS;
 503        bp.flags |= clear ? AMDGPU_GEM_CREATE_VRAM_CLEARED : 0;
 504        bp.flags |= AMDGPU_AMDKFD_CREATE_SVM_BO;
 505        bp.type = ttm_bo_type_device;
 506        bp.resv = NULL;
 507
 508        r = amdgpu_bo_create_user(adev, &bp, &ubo);
 509        if (r) {
 510                pr_debug("failed %d to create bo\n", r);
 511                goto create_bo_failed;
 512        }
 513        bo = &ubo->bo;
 514        r = amdgpu_bo_reserve(bo, true);
 515        if (r) {
 516                pr_debug("failed %d to reserve bo\n", r);
 517                goto reserve_bo_failed;
 518        }
 519
 520        r = dma_resv_reserve_shared(bo->tbo.base.resv, 1);
 521        if (r) {
 522                pr_debug("failed %d to reserve bo\n", r);
 523                amdgpu_bo_unreserve(bo);
 524                goto reserve_bo_failed;
 525        }
 526        amdgpu_bo_fence(bo, &svm_bo->eviction_fence->base, true);
 527
 528        amdgpu_bo_unreserve(bo);
 529
 530        svm_bo->bo = bo;
 531        prange->svm_bo = svm_bo;
 532        prange->ttm_res = bo->tbo.resource;
 533        prange->offset = 0;
 534
 535        spin_lock(&svm_bo->list_lock);
 536        list_add(&prange->svm_bo_list, &svm_bo->range_list);
 537        spin_unlock(&svm_bo->list_lock);
 538
 539        return 0;
 540
 541reserve_bo_failed:
 542        amdgpu_bo_unref(&bo);
 543create_bo_failed:
 544        dma_fence_put(&svm_bo->eviction_fence->base);
 545        kfree(svm_bo);
 546        prange->ttm_res = NULL;
 547
 548        return r;
 549}
 550
 551void svm_range_vram_node_free(struct svm_range *prange)
 552{
 553        svm_range_bo_unref(prange->svm_bo);
 554        prange->ttm_res = NULL;
 555}
 556
 557struct amdgpu_device *
 558svm_range_get_adev_by_id(struct svm_range *prange, uint32_t gpu_id)
 559{
 560        struct kfd_process_device *pdd;
 561        struct kfd_process *p;
 562        int32_t gpu_idx;
 563
 564        p = container_of(prange->svms, struct kfd_process, svms);
 565
 566        gpu_idx = kfd_process_gpuidx_from_gpuid(p, gpu_id);
 567        if (gpu_idx < 0) {
 568                pr_debug("failed to get device by id 0x%x\n", gpu_id);
 569                return NULL;
 570        }
 571        pdd = kfd_process_device_from_gpuidx(p, gpu_idx);
 572        if (!pdd) {
 573                pr_debug("failed to get device by idx 0x%x\n", gpu_idx);
 574                return NULL;
 575        }
 576
 577        return (struct amdgpu_device *)pdd->dev->kgd;
 578}
 579
 580struct kfd_process_device *
 581svm_range_get_pdd_by_adev(struct svm_range *prange, struct amdgpu_device *adev)
 582{
 583        struct kfd_process *p;
 584        int32_t gpu_idx, gpuid;
 585        int r;
 586
 587        p = container_of(prange->svms, struct kfd_process, svms);
 588
 589        r = kfd_process_gpuid_from_kgd(p, adev, &gpuid, &gpu_idx);
 590        if (r) {
 591                pr_debug("failed to get device id by adev %p\n", adev);
 592                return NULL;
 593        }
 594
 595        return kfd_process_device_from_gpuidx(p, gpu_idx);
 596}
 597
 598static int svm_range_bo_validate(void *param, struct amdgpu_bo *bo)
 599{
 600        struct ttm_operation_ctx ctx = { false, false };
 601
 602        amdgpu_bo_placement_from_domain(bo, AMDGPU_GEM_DOMAIN_VRAM);
 603
 604        return ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
 605}
 606
 607static int
 608svm_range_check_attr(struct kfd_process *p,
 609                     uint32_t nattr, struct kfd_ioctl_svm_attribute *attrs)
 610{
 611        uint32_t i;
 612
 613        for (i = 0; i < nattr; i++) {
 614                uint32_t val = attrs[i].value;
 615                int gpuidx = MAX_GPU_INSTANCE;
 616
 617                switch (attrs[i].type) {
 618                case KFD_IOCTL_SVM_ATTR_PREFERRED_LOC:
 619                        if (val != KFD_IOCTL_SVM_LOCATION_SYSMEM &&
 620                            val != KFD_IOCTL_SVM_LOCATION_UNDEFINED)
 621                                gpuidx = kfd_process_gpuidx_from_gpuid(p, val);
 622                        break;
 623                case KFD_IOCTL_SVM_ATTR_PREFETCH_LOC:
 624                        if (val != KFD_IOCTL_SVM_LOCATION_SYSMEM)
 625                                gpuidx = kfd_process_gpuidx_from_gpuid(p, val);
 626                        break;
 627                case KFD_IOCTL_SVM_ATTR_ACCESS:
 628                case KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE:
 629                case KFD_IOCTL_SVM_ATTR_NO_ACCESS:
 630                        gpuidx = kfd_process_gpuidx_from_gpuid(p, val);
 631                        break;
 632                case KFD_IOCTL_SVM_ATTR_SET_FLAGS:
 633                        break;
 634                case KFD_IOCTL_SVM_ATTR_CLR_FLAGS:
 635                        break;
 636                case KFD_IOCTL_SVM_ATTR_GRANULARITY:
 637                        break;
 638                default:
 639                        pr_debug("unknown attr type 0x%x\n", attrs[i].type);
 640                        return -EINVAL;
 641                }
 642
 643                if (gpuidx < 0) {
 644                        pr_debug("no GPU 0x%x found\n", val);
 645                        return -EINVAL;
 646                } else if (gpuidx < MAX_GPU_INSTANCE &&
 647                           !test_bit(gpuidx, p->svms.bitmap_supported)) {
 648                        pr_debug("GPU 0x%x not supported\n", val);
 649                        return -EINVAL;
 650                }
 651        }
 652
 653        return 0;
 654}
 655
 656static void
 657svm_range_apply_attrs(struct kfd_process *p, struct svm_range *prange,
 658                      uint32_t nattr, struct kfd_ioctl_svm_attribute *attrs)
 659{
 660        uint32_t i;
 661        int gpuidx;
 662
 663        for (i = 0; i < nattr; i++) {
 664                switch (attrs[i].type) {
 665                case KFD_IOCTL_SVM_ATTR_PREFERRED_LOC:
 666                        prange->preferred_loc = attrs[i].value;
 667                        break;
 668                case KFD_IOCTL_SVM_ATTR_PREFETCH_LOC:
 669                        prange->prefetch_loc = attrs[i].value;
 670                        break;
 671                case KFD_IOCTL_SVM_ATTR_ACCESS:
 672                case KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE:
 673                case KFD_IOCTL_SVM_ATTR_NO_ACCESS:
 674                        gpuidx = kfd_process_gpuidx_from_gpuid(p,
 675                                                               attrs[i].value);
 676                        if (attrs[i].type == KFD_IOCTL_SVM_ATTR_NO_ACCESS) {
 677                                bitmap_clear(prange->bitmap_access, gpuidx, 1);
 678                                bitmap_clear(prange->bitmap_aip, gpuidx, 1);
 679                        } else if (attrs[i].type == KFD_IOCTL_SVM_ATTR_ACCESS) {
 680                                bitmap_set(prange->bitmap_access, gpuidx, 1);
 681                                bitmap_clear(prange->bitmap_aip, gpuidx, 1);
 682                        } else {
 683                                bitmap_clear(prange->bitmap_access, gpuidx, 1);
 684                                bitmap_set(prange->bitmap_aip, gpuidx, 1);
 685                        }
 686                        break;
 687                case KFD_IOCTL_SVM_ATTR_SET_FLAGS:
 688                        prange->flags |= attrs[i].value;
 689                        break;
 690                case KFD_IOCTL_SVM_ATTR_CLR_FLAGS:
 691                        prange->flags &= ~attrs[i].value;
 692                        break;
 693                case KFD_IOCTL_SVM_ATTR_GRANULARITY:
 694                        prange->granularity = attrs[i].value;
 695                        break;
 696                default:
 697                        WARN_ONCE(1, "svm_range_check_attrs wasn't called?");
 698                }
 699        }
 700}
 701
 702/**
 703 * svm_range_debug_dump - print all range information from svms
 704 * @svms: svm range list header
 705 *
 706 * debug output svm range start, end, prefetch location from svms
 707 * interval tree and link list
 708 *
 709 * Context: The caller must hold svms->lock
 710 */
 711static void svm_range_debug_dump(struct svm_range_list *svms)
 712{
 713        struct interval_tree_node *node;
 714        struct svm_range *prange;
 715
 716        pr_debug("dump svms 0x%p list\n", svms);
 717        pr_debug("range\tstart\tpage\tend\t\tlocation\n");
 718
 719        list_for_each_entry(prange, &svms->list, list) {
 720                pr_debug("0x%p 0x%lx\t0x%llx\t0x%llx\t0x%x\n",
 721                         prange, prange->start, prange->npages,
 722                         prange->start + prange->npages - 1,
 723                         prange->actual_loc);
 724        }
 725
 726        pr_debug("dump svms 0x%p interval tree\n", svms);
 727        pr_debug("range\tstart\tpage\tend\t\tlocation\n");
 728        node = interval_tree_iter_first(&svms->objects, 0, ~0ULL);
 729        while (node) {
 730                prange = container_of(node, struct svm_range, it_node);
 731                pr_debug("0x%p 0x%lx\t0x%llx\t0x%llx\t0x%x\n",
 732                         prange, prange->start, prange->npages,
 733                         prange->start + prange->npages - 1,
 734                         prange->actual_loc);
 735                node = interval_tree_iter_next(node, 0, ~0ULL);
 736        }
 737}
 738
 739static bool
 740svm_range_is_same_attrs(struct svm_range *old, struct svm_range *new)
 741{
 742        return (old->prefetch_loc == new->prefetch_loc &&
 743                old->flags == new->flags &&
 744                old->granularity == new->granularity);
 745}
 746
 747static int
 748svm_range_split_array(void *ppnew, void *ppold, size_t size,
 749                      uint64_t old_start, uint64_t old_n,
 750                      uint64_t new_start, uint64_t new_n)
 751{
 752        unsigned char *new, *old, *pold;
 753        uint64_t d;
 754
 755        if (!ppold)
 756                return 0;
 757        pold = *(unsigned char **)ppold;
 758        if (!pold)
 759                return 0;
 760
 761        new = kvmalloc_array(new_n, size, GFP_KERNEL);
 762        if (!new)
 763                return -ENOMEM;
 764
 765        d = (new_start - old_start) * size;
 766        memcpy(new, pold + d, new_n * size);
 767
 768        old = kvmalloc_array(old_n, size, GFP_KERNEL);
 769        if (!old) {
 770                kvfree(new);
 771                return -ENOMEM;
 772        }
 773
 774        d = (new_start == old_start) ? new_n * size : 0;
 775        memcpy(old, pold + d, old_n * size);
 776
 777        kvfree(pold);
 778        *(void **)ppold = old;
 779        *(void **)ppnew = new;
 780
 781        return 0;
 782}
 783
 784static int
 785svm_range_split_pages(struct svm_range *new, struct svm_range *old,
 786                      uint64_t start, uint64_t last)
 787{
 788        uint64_t npages = last - start + 1;
 789        int i, r;
 790
 791        for (i = 0; i < MAX_GPU_INSTANCE; i++) {
 792                r = svm_range_split_array(&new->dma_addr[i], &old->dma_addr[i],
 793                                          sizeof(*old->dma_addr[i]), old->start,
 794                                          npages, new->start, new->npages);
 795                if (r)
 796                        return r;
 797        }
 798
 799        return 0;
 800}
 801
 802static int
 803svm_range_split_nodes(struct svm_range *new, struct svm_range *old,
 804                      uint64_t start, uint64_t last)
 805{
 806        uint64_t npages = last - start + 1;
 807
 808        pr_debug("svms 0x%p new prange 0x%p start 0x%lx [0x%llx 0x%llx]\n",
 809                 new->svms, new, new->start, start, last);
 810
 811        if (new->start == old->start) {
 812                new->offset = old->offset;
 813                old->offset += new->npages;
 814        } else {
 815                new->offset = old->offset + npages;
 816        }
 817
 818        new->svm_bo = svm_range_bo_ref(old->svm_bo);
 819        new->ttm_res = old->ttm_res;
 820
 821        spin_lock(&new->svm_bo->list_lock);
 822        list_add(&new->svm_bo_list, &new->svm_bo->range_list);
 823        spin_unlock(&new->svm_bo->list_lock);
 824
 825        return 0;
 826}
 827
 828/**
 829 * svm_range_split_adjust - split range and adjust
 830 *
 831 * @new: new range
 832 * @old: the old range
 833 * @start: the old range adjust to start address in pages
 834 * @last: the old range adjust to last address in pages
 835 *
 836 * Copy system memory dma_addr or vram ttm_res in old range to new
 837 * range from new_start up to size new->npages, the remaining old range is from
 838 * start to last
 839 *
 840 * Return:
 841 * 0 - OK, -ENOMEM - out of memory
 842 */
 843static int
 844svm_range_split_adjust(struct svm_range *new, struct svm_range *old,
 845                      uint64_t start, uint64_t last)
 846{
 847        int r;
 848
 849        pr_debug("svms 0x%p new 0x%lx old [0x%lx 0x%lx] => [0x%llx 0x%llx]\n",
 850                 new->svms, new->start, old->start, old->last, start, last);
 851
 852        if (new->start < old->start ||
 853            new->last > old->last) {
 854                WARN_ONCE(1, "invalid new range start or last\n");
 855                return -EINVAL;
 856        }
 857
 858        r = svm_range_split_pages(new, old, start, last);
 859        if (r)
 860                return r;
 861
 862        if (old->actual_loc && old->ttm_res) {
 863                r = svm_range_split_nodes(new, old, start, last);
 864                if (r)
 865                        return r;
 866        }
 867
 868        old->npages = last - start + 1;
 869        old->start = start;
 870        old->last = last;
 871        new->flags = old->flags;
 872        new->preferred_loc = old->preferred_loc;
 873        new->prefetch_loc = old->prefetch_loc;
 874        new->actual_loc = old->actual_loc;
 875        new->granularity = old->granularity;
 876        bitmap_copy(new->bitmap_access, old->bitmap_access, MAX_GPU_INSTANCE);
 877        bitmap_copy(new->bitmap_aip, old->bitmap_aip, MAX_GPU_INSTANCE);
 878
 879        return 0;
 880}
 881
 882/**
 883 * svm_range_split - split a range in 2 ranges
 884 *
 885 * @prange: the svm range to split
 886 * @start: the remaining range start address in pages
 887 * @last: the remaining range last address in pages
 888 * @new: the result new range generated
 889 *
 890 * Two cases only:
 891 * case 1: if start == prange->start
 892 *         prange ==> prange[start, last]
 893 *         new range [last + 1, prange->last]
 894 *
 895 * case 2: if last == prange->last
 896 *         prange ==> prange[start, last]
 897 *         new range [prange->start, start - 1]
 898 *
 899 * Return:
 900 * 0 - OK, -ENOMEM - out of memory, -EINVAL - invalid start, last
 901 */
 902static int
 903svm_range_split(struct svm_range *prange, uint64_t start, uint64_t last,
 904                struct svm_range **new)
 905{
 906        uint64_t old_start = prange->start;
 907        uint64_t old_last = prange->last;
 908        struct svm_range_list *svms;
 909        int r = 0;
 910
 911        pr_debug("svms 0x%p [0x%llx 0x%llx] to [0x%llx 0x%llx]\n", prange->svms,
 912                 old_start, old_last, start, last);
 913
 914        if (old_start != start && old_last != last)
 915                return -EINVAL;
 916        if (start < old_start || last > old_last)
 917                return -EINVAL;
 918
 919        svms = prange->svms;
 920        if (old_start == start)
 921                *new = svm_range_new(svms, last + 1, old_last);
 922        else
 923                *new = svm_range_new(svms, old_start, start - 1);
 924        if (!*new)
 925                return -ENOMEM;
 926
 927        r = svm_range_split_adjust(*new, prange, start, last);
 928        if (r) {
 929                pr_debug("failed %d split [0x%llx 0x%llx] to [0x%llx 0x%llx]\n",
 930                         r, old_start, old_last, start, last);
 931                svm_range_free(*new);
 932                *new = NULL;
 933        }
 934
 935        return r;
 936}
 937
 938static int
 939svm_range_split_tail(struct svm_range *prange, struct svm_range *new,
 940                     uint64_t new_last, struct list_head *insert_list)
 941{
 942        struct svm_range *tail;
 943        int r = svm_range_split(prange, prange->start, new_last, &tail);
 944
 945        if (!r)
 946                list_add(&tail->insert_list, insert_list);
 947        return r;
 948}
 949
 950static int
 951svm_range_split_head(struct svm_range *prange, struct svm_range *new,
 952                     uint64_t new_start, struct list_head *insert_list)
 953{
 954        struct svm_range *head;
 955        int r = svm_range_split(prange, new_start, prange->last, &head);
 956
 957        if (!r)
 958                list_add(&head->insert_list, insert_list);
 959        return r;
 960}
 961
 962static void
 963svm_range_add_child(struct svm_range *prange, struct mm_struct *mm,
 964                    struct svm_range *pchild, enum svm_work_list_ops op)
 965{
 966        pr_debug("add child 0x%p [0x%lx 0x%lx] to prange 0x%p child list %d\n",
 967                 pchild, pchild->start, pchild->last, prange, op);
 968
 969        pchild->work_item.mm = mm;
 970        pchild->work_item.op = op;
 971        list_add_tail(&pchild->child_list, &prange->child_list);
 972}
 973
 974/**
 975 * svm_range_split_by_granularity - collect ranges within granularity boundary
 976 *
 977 * @p: the process with svms list
 978 * @mm: mm structure
 979 * @addr: the vm fault address in pages, to split the prange
 980 * @parent: parent range if prange is from child list
 981 * @prange: prange to split
 982 *
 983 * Trims @prange to be a single aligned block of prange->granularity if
 984 * possible. The head and tail are added to the child_list in @parent.
 985 *
 986 * Context: caller must hold mmap_read_lock and prange->lock
 987 *
 988 * Return:
 989 * 0 - OK, otherwise error code
 990 */
 991int
 992svm_range_split_by_granularity(struct kfd_process *p, struct mm_struct *mm,
 993                               unsigned long addr, struct svm_range *parent,
 994                               struct svm_range *prange)
 995{
 996        struct svm_range *head, *tail;
 997        unsigned long start, last, size;
 998        int r;
 999
1000        /* Align splited range start and size to granularity size, then a single

1001         * PTE will be used for whole range, this reduces the number of PTE
1002         * updated and the L1 TLB space used for translation.
1003         */
1004        size = 1UL << prange->granularity;
1005        start = ALIGN_DOWN(addr, size);
1006        last = ALIGN(addr + 1, size) - 1;
1007
1008        pr_debug("svms 0x%p split [0x%lx 0x%lx] to [0x%lx 0x%lx] size 0x%lx\n",
1009                 prange->svms, prange->start, prange->last, start, last, size);
1010
1011        if (start > prange->start) {
1012                r = svm_range_split(prange, start, prange->last, &head);
1013                if (r)
1014                        return r;
1015                svm_range_add_child(parent, mm, head, SVM_OP_ADD_RANGE);
1016        }
1017
1018        if (last < prange->last) {
1019                r = svm_range_split(prange, prange->start, last, &tail);
1020                if (r)
1021                        return r;
1022                svm_range_add_child(parent, mm, tail, SVM_OP_ADD_RANGE);
1023        }
1024
1025        /* xnack on, update mapping on GPUs with ACCESS_IN_PLACE */
1026        if (p->xnack_enabled && prange->work_item.op == SVM_OP_ADD_RANGE) {
1027                prange->work_item.op = SVM_OP_ADD_RANGE_AND_MAP;
1028                pr_debug("change prange 0x%p [0x%lx 0x%lx] op %d\n",
1029                         prange, prange->start, prange->last,
1030                         SVM_OP_ADD_RANGE_AND_MAP);
1031        }
1032        return 0;
1033}
1034
1035static uint64_t
1036svm_range_get_pte_flags(struct amdgpu_device *adev, struct svm_range *prange,
1037                        int domain)
1038{
1039        struct amdgpu_device *bo_adev;
1040        uint32_t flags = prange->flags;
1041        uint32_t mapping_flags = 0;
1042        uint64_t pte_flags;
1043        bool snoop = (domain != SVM_RANGE_VRAM_DOMAIN);
1044        bool coherent = flags & KFD_IOCTL_SVM_FLAG_COHERENT;
1045
1046        if (domain == SVM_RANGE_VRAM_DOMAIN)
1047                bo_adev = amdgpu_ttm_adev(prange->svm_bo->bo->tbo.bdev);
1048
1049        switch (adev->asic_type) {
1050        case CHIP_ARCTURUS:
1051                if (domain == SVM_RANGE_VRAM_DOMAIN) {
1052                        if (bo_adev == adev) {
1053                                mapping_flags |= coherent ?
1054                                        AMDGPU_VM_MTYPE_CC : AMDGPU_VM_MTYPE_RW;
1055                        } else {
1056                                mapping_flags |= coherent ?
1057                                        AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC;
1058                                if (amdgpu_xgmi_same_hive(adev, bo_adev))
1059                                        snoop = true;
1060                        }
1061                } else {
1062                        mapping_flags |= coherent ?
1063                                AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC;
1064                }
1065                break;
1066        case CHIP_ALDEBARAN:
1067                if (domain == SVM_RANGE_VRAM_DOMAIN) {
1068                        if (bo_adev == adev) {
1069                                mapping_flags |= coherent ?
1070                                        AMDGPU_VM_MTYPE_CC : AMDGPU_VM_MTYPE_RW;
1071                                if (adev->gmc.xgmi.connected_to_cpu)
1072                                        snoop = true;
1073                        } else {
1074                                mapping_flags |= coherent ?
1075                                        AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC;
1076                                if (amdgpu_xgmi_same_hive(adev, bo_adev))
1077                                        snoop = true;
1078                        }
1079                } else {
1080                        mapping_flags |= coherent ?
1081                                AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC;
1082                }
1083                break;
1084        default:
1085                mapping_flags |= coherent ?
1086                        AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC;
1087        }
1088
1089        mapping_flags |= AMDGPU_VM_PAGE_READABLE | AMDGPU_VM_PAGE_WRITEABLE;
1090
1091        if (flags & KFD_IOCTL_SVM_FLAG_GPU_RO)
1092                mapping_flags &= ~AMDGPU_VM_PAGE_WRITEABLE;
1093        if (flags & KFD_IOCTL_SVM_FLAG_GPU_EXEC)
1094                mapping_flags |= AMDGPU_VM_PAGE_EXECUTABLE;
1095
1096        pte_flags = AMDGPU_PTE_VALID;
1097        pte_flags |= (domain == SVM_RANGE_VRAM_DOMAIN) ? 0 : AMDGPU_PTE_SYSTEM;
1098        pte_flags |= snoop ? AMDGPU_PTE_SNOOPED : 0;
1099
1100        pte_flags |= amdgpu_gem_va_map_flags(adev, mapping_flags);
1101        return pte_flags;
1102}
1103
1104static int
1105svm_range_unmap_from_gpu(struct amdgpu_device *adev, struct amdgpu_vm *vm,
1106                         uint64_t start, uint64_t last,
1107                         struct dma_fence **fence)
1108{
1109        uint64_t init_pte_value = 0;
1110
1111        pr_debug("[0x%llx 0x%llx]\n", start, last);
1112
1113        return amdgpu_vm_bo_update_mapping(adev, adev, vm, false, true, NULL,
1114                                           start, last, init_pte_value, 0,
1115                                           NULL, NULL, fence, NULL);
1116}
1117
1118static int
1119svm_range_unmap_from_gpus(struct svm_range *prange, unsigned long start,
1120                          unsigned long last)
1121{
1122        DECLARE_BITMAP(bitmap, MAX_GPU_INSTANCE);
1123        struct kfd_process_device *pdd;
1124        struct dma_fence *fence = NULL;
1125        struct amdgpu_device *adev;
1126        struct kfd_process *p;
1127        uint32_t gpuidx;
1128        int r = 0;
1129
1130        bitmap_or(bitmap, prange->bitmap_access, prange->bitmap_aip,
1131                  MAX_GPU_INSTANCE);
1132        p = container_of(prange->svms, struct kfd_process, svms);
1133
1134        for_each_set_bit(gpuidx, bitmap, MAX_GPU_INSTANCE) {
1135                pr_debug("unmap from gpu idx 0x%x\n", gpuidx);
1136                pdd = kfd_process_device_from_gpuidx(p, gpuidx);
1137                if (!pdd) {
1138                        pr_debug("failed to find device idx %d\n", gpuidx);
1139                        return -EINVAL;
1140                }
1141                adev = (struct amdgpu_device *)pdd->dev->kgd;
1142
1143                r = svm_range_unmap_from_gpu(adev, drm_priv_to_vm(pdd->drm_priv),
1144                                             start, last, &fence);
1145                if (r)
1146                        break;
1147
1148                if (fence) {
1149                        r = dma_fence_wait(fence, false);
1150                        dma_fence_put(fence);
1151                        fence = NULL;
1152                        if (r)
1153                                break;
1154                }
1155                amdgpu_amdkfd_flush_gpu_tlb_pasid((struct kgd_dev *)adev,
1156                                        p->pasid, TLB_FLUSH_HEAVYWEIGHT);
1157        }
1158
1159        return r;
1160}
1161
1162static int
1163svm_range_map_to_gpu(struct amdgpu_device *adev, struct amdgpu_vm *vm,
1164                     struct svm_range *prange, unsigned long offset,
1165                     unsigned long npages, bool readonly, dma_addr_t *dma_addr,
1166                     struct amdgpu_device *bo_adev, struct dma_fence **fence)
1167{
1168        struct amdgpu_bo_va bo_va;
1169        bool table_freed = false;
1170        uint64_t pte_flags;
1171        unsigned long last_start;
1172        int last_domain;
1173        int r = 0;
1174        int64_t i, j;
1175
1176        last_start = prange->start + offset;
1177
1178        pr_debug("svms 0x%p [0x%lx 0x%lx] readonly %d\n", prange->svms,
1179                 last_start, last_start + npages - 1, readonly);
1180
1181        if (prange->svm_bo && prange->ttm_res)
1182                bo_va.is_xgmi = amdgpu_xgmi_same_hive(adev, bo_adev);
1183
1184        for (i = offset; i < offset + npages; i++) {
1185                last_domain = dma_addr[i] & SVM_RANGE_VRAM_DOMAIN;
1186                dma_addr[i] &= ~SVM_RANGE_VRAM_DOMAIN;
1187
1188                /* Collect all pages in the same address range and memory domain
1189                 * that can be mapped with a single call to update mapping.
1190                 */
1191                if (i < offset + npages - 1 &&
1192                    last_domain == (dma_addr[i + 1] & SVM_RANGE_VRAM_DOMAIN))
1193                        continue;
1194
1195                pr_debug("Mapping range [0x%lx 0x%llx] on domain: %s\n",
1196                         last_start, prange->start + i, last_domain ? "GPU" : "CPU");
1197
1198                pte_flags = svm_range_get_pte_flags(adev, prange, last_domain);
1199                if (readonly)
1200                        pte_flags &= ~AMDGPU_PTE_WRITEABLE;
1201
1202                pr_debug("svms 0x%p map [0x%lx 0x%llx] vram %d PTE 0x%llx\n",
1203                         prange->svms, last_start, prange->start + i,
1204                         (last_domain == SVM_RANGE_VRAM_DOMAIN) ? 1 : 0,
1205                         pte_flags);
1206
1207                r = amdgpu_vm_bo_update_mapping(adev, bo_adev, vm, false, false,
1208                                                NULL, last_start,
1209                                                prange->start + i, pte_flags,
1210                                                last_start - prange->start,
1211                                                NULL, dma_addr,
1212                                                &vm->last_update,
1213                                                &table_freed);
1214
1215                for (j = last_start - prange->start; j <= i; j++)
1216                        dma_addr[j] |= last_domain;
1217
1218                if (r) {
1219                        pr_debug("failed %d to map to gpu 0x%lx\n", r, prange->start);
1220                        goto out;
1221                }
1222                last_start = prange->start + i + 1;
1223        }
1224
1225        r = amdgpu_vm_update_pdes(adev, vm, false);
1226        if (r) {
1227                pr_debug("failed %d to update directories 0x%lx\n", r,
1228                         prange->start);
1229                goto out;
1230        }
1231
1232        if (fence)
1233                *fence = dma_fence_get(vm->last_update);
1234
1235        if (table_freed) {
1236                struct kfd_process *p;
1237
1238                p = container_of(prange->svms, struct kfd_process, svms);
1239                amdgpu_amdkfd_flush_gpu_tlb_pasid((struct kgd_dev *)adev,
1240                                                p->pasid, TLB_FLUSH_LEGACY);
1241        }
1242out:
1243        return r;
1244}
1245
1246static int
1247svm_range_map_to_gpus(struct svm_range *prange, unsigned long offset,
1248                      unsigned long npages, bool readonly,
1249                      unsigned long *bitmap, bool wait)
1250{
1251        struct kfd_process_device *pdd;
1252        struct amdgpu_device *bo_adev;
1253        struct amdgpu_device *adev;
1254        struct kfd_process *p;
1255        struct dma_fence *fence = NULL;
1256        uint32_t gpuidx;
1257        int r = 0;
1258
1259        if (prange->svm_bo && prange->ttm_res)
1260                bo_adev = amdgpu_ttm_adev(prange->svm_bo->bo->tbo.bdev);
1261        else
1262                bo_adev = NULL;
1263
1264        p = container_of(prange->svms, struct kfd_process, svms);
1265        for_each_set_bit(gpuidx, bitmap, MAX_GPU_INSTANCE) {
1266                pr_debug("mapping to gpu idx 0x%x\n", gpuidx);
1267                pdd = kfd_process_device_from_gpuidx(p, gpuidx);
1268                if (!pdd) {
1269                        pr_debug("failed to find device idx %d\n", gpuidx);
1270                        return -EINVAL;
1271                }
1272                adev = (struct amdgpu_device *)pdd->dev->kgd;
1273
1274                pdd = kfd_bind_process_to_device(pdd->dev, p);
1275                if (IS_ERR(pdd))
1276                        return -EINVAL;
1277
1278                if (bo_adev && adev != bo_adev &&
1279                    !amdgpu_xgmi_same_hive(adev, bo_adev)) {
1280                        pr_debug("cannot map to device idx %d\n", gpuidx);
1281                        continue;
1282                }
1283
1284                r = svm_range_map_to_gpu(adev, drm_priv_to_vm(pdd->drm_priv),
1285                                         prange, offset, npages, readonly,
1286                                         prange->dma_addr[gpuidx],
1287                                         bo_adev, wait ? &fence : NULL);
1288                if (r)
1289                        break;
1290
1291                if (fence) {
1292                        r = dma_fence_wait(fence, false);
1293                        dma_fence_put(fence);
1294                        fence = NULL;
1295                        if (r) {
1296                                pr_debug("failed %d to dma fence wait\n", r);
1297                                break;
1298                        }
1299                }
1300        }
1301
1302        return r;
1303}
1304
1305struct svm_validate_context {
1306        struct kfd_process *process;
1307        struct svm_range *prange;
1308        bool intr;
1309        unsigned long bitmap[MAX_GPU_INSTANCE];
1310        struct ttm_validate_buffer tv[MAX_GPU_INSTANCE+1];
1311        struct list_head validate_list;
1312        struct ww_acquire_ctx ticket;
1313};
1314
1315static int svm_range_reserve_bos(struct svm_validate_context *ctx)
1316{
1317        struct kfd_process_device *pdd;
1318        struct amdgpu_device *adev;
1319        struct amdgpu_vm *vm;
1320        uint32_t gpuidx;
1321        int r;
1322
1323        INIT_LIST_HEAD(&ctx->validate_list);
1324        for_each_set_bit(gpuidx, ctx->bitmap, MAX_GPU_INSTANCE) {
1325                pdd = kfd_process_device_from_gpuidx(ctx->process, gpuidx);
1326                if (!pdd) {
1327                        pr_debug("failed to find device idx %d\n", gpuidx);
1328                        return -EINVAL;
1329                }
1330                adev = (struct amdgpu_device *)pdd->dev->kgd;
1331                vm = drm_priv_to_vm(pdd->drm_priv);
1332
1333                ctx->tv[gpuidx].bo = &vm->root.bo->tbo;
1334                ctx->tv[gpuidx].num_shared = 4;
1335                list_add(&ctx->tv[gpuidx].head, &ctx->validate_list);
1336        }
1337        if (ctx->prange->svm_bo && ctx->prange->ttm_res) {
1338                ctx->tv[MAX_GPU_INSTANCE].bo = &ctx->prange->svm_bo->bo->tbo;
1339                ctx->tv[MAX_GPU_INSTANCE].num_shared = 1;
1340                list_add(&ctx->tv[MAX_GPU_INSTANCE].head, &ctx->validate_list);
1341        }
1342
1343        r = ttm_eu_reserve_buffers(&ctx->ticket, &ctx->validate_list,
1344                                   ctx->intr, NULL);
1345        if (r) {
1346                pr_debug("failed %d to reserve bo\n", r);
1347                return r;
1348        }
1349
1350        for_each_set_bit(gpuidx, ctx->bitmap, MAX_GPU_INSTANCE) {
1351                pdd = kfd_process_device_from_gpuidx(ctx->process, gpuidx);
1352                if (!pdd) {
1353                        pr_debug("failed to find device idx %d\n", gpuidx);
1354                        r = -EINVAL;
1355                        goto unreserve_out;
1356                }
1357                adev = (struct amdgpu_device *)pdd->dev->kgd;
1358
1359                r = amdgpu_vm_validate_pt_bos(adev, drm_priv_to_vm(pdd->drm_priv),
1360                                              svm_range_bo_validate, NULL);
1361                if (r) {
1362                        pr_debug("failed %d validate pt bos\n", r);
1363                        goto unreserve_out;
1364                }
1365        }
1366
1367        return 0;
1368
1369unreserve_out:
1370        ttm_eu_backoff_reservation(&ctx->ticket, &ctx->validate_list);
1371        return r;
1372}
1373
1374static void svm_range_unreserve_bos(struct svm_validate_context *ctx)
1375{
1376        ttm_eu_backoff_reservation(&ctx->ticket, &ctx->validate_list);
1377}
1378
1379static void *kfd_svm_page_owner(struct kfd_process *p, int32_t gpuidx)
1380{
1381        struct kfd_process_device *pdd;
1382        struct amdgpu_device *adev;
1383
1384        pdd = kfd_process_device_from_gpuidx(p, gpuidx);
1385        adev = (struct amdgpu_device *)pdd->dev->kgd;
1386
1387        return SVM_ADEV_PGMAP_OWNER(adev);
1388}
1389
1390/*
1391 * Validation+GPU mapping with concurrent invalidation (MMU notifiers)
1392 *
1393 * To prevent concurrent destruction or change of range attributes, the
1394 * svm_read_lock must be held. The caller must not hold the svm_write_lock
1395 * because that would block concurrent evictions and lead to deadlocks. To
1396 * serialize concurrent migrations or validations of the same range, the
1397 * prange->migrate_mutex must be held.
1398 *
1399 * For VRAM ranges, the SVM BO must be allocated and valid (protected by its
1400 * eviction fence.
1401 *
1402 * The following sequence ensures race-free validation and GPU mapping:
1403 *
1404 * 1. Reserve page table (and SVM BO if range is in VRAM)
1405 * 2. hmm_range_fault to get page addresses (if system memory)
1406 * 3. DMA-map pages (if system memory)
1407 * 4-a. Take notifier lock
1408 * 4-b. Check that pages still valid (mmu_interval_read_retry)
1409 * 4-c. Check that the range was not split or otherwise invalidated
1410 * 4-d. Update GPU page table
1411 * 4.e. Release notifier lock
1412 * 5. Release page table (and SVM BO) reservation
1413 */
1414static int svm_range_validate_and_map(struct mm_struct *mm,
1415                                      struct svm_range *prange,
1416                                      int32_t gpuidx, bool intr, bool wait)
1417{
1418        struct svm_validate_context ctx;
1419        unsigned long start, end, addr;
1420        struct kfd_process *p;
1421        void *owner;
1422        int32_t idx;
1423        int r = 0;
1424
1425        ctx.process = container_of(prange->svms, struct kfd_process, svms);
1426        ctx.prange = prange;
1427        ctx.intr = intr;
1428
1429        if (gpuidx < MAX_GPU_INSTANCE) {
1430                bitmap_zero(ctx.bitmap, MAX_GPU_INSTANCE);
1431                bitmap_set(ctx.bitmap, gpuidx, 1);
1432        } else if (ctx.process->xnack_enabled) {
1433                bitmap_copy(ctx.bitmap, prange->bitmap_aip, MAX_GPU_INSTANCE);
1434
1435                /* If prefetch range to GPU, or GPU retry fault migrate range to
1436                 * GPU, which has ACCESS attribute to the range, create mapping
1437                 * on that GPU.
1438                 */
1439                if (prange->actual_loc) {
1440                        gpuidx = kfd_process_gpuidx_from_gpuid(ctx.process,
1441                                                        prange->actual_loc);
1442                        if (gpuidx < 0) {
1443                                WARN_ONCE(1, "failed get device by id 0x%x\n",
1444                                         prange->actual_loc);
1445                                return -EINVAL;
1446                        }
1447                        if (test_bit(gpuidx, prange->bitmap_access))
1448                                bitmap_set(ctx.bitmap, gpuidx, 1);
1449                }
1450        } else {
1451                bitmap_or(ctx.bitmap, prange->bitmap_access,
1452                          prange->bitmap_aip, MAX_GPU_INSTANCE);
1453        }
1454
1455        if (bitmap_empty(ctx.bitmap, MAX_GPU_INSTANCE))
1456                return 0;
1457
1458        if (prange->actual_loc && !prange->ttm_res) {
1459                /* This should never happen. actual_loc gets set by
1460                 * svm_migrate_ram_to_vram after allocating a BO.
1461                 */
1462                WARN(1, "VRAM BO missing during validation\n");
1463                return -EINVAL;
1464        }
1465
1466        svm_range_reserve_bos(&ctx);
1467
1468        p = container_of(prange->svms, struct kfd_process, svms);
1469        owner = kfd_svm_page_owner(p, find_first_bit(ctx.bitmap,
1470                                                MAX_GPU_INSTANCE));
1471        for_each_set_bit(idx, ctx.bitmap, MAX_GPU_INSTANCE) {
1472                if (kfd_svm_page_owner(p, idx) != owner) {
1473                        owner = NULL;
1474                        break;
1475                }
1476        }
1477
1478        start = prange->start << PAGE_SHIFT;
1479        end = (prange->last + 1) << PAGE_SHIFT;
1480        for (addr = start; addr < end && !r; ) {
1481                struct hmm_range *hmm_range;
1482                struct vm_area_struct *vma;
1483                unsigned long next;
1484                unsigned long offset;
1485                unsigned long npages;
1486                bool readonly;
1487
1488                vma = find_vma(mm, addr);
1489                if (!vma || addr < vma->vm_start) {
1490                        r = -EFAULT;
1491                        goto unreserve_out;
1492                }
1493                readonly = !(vma->vm_flags & VM_WRITE);
1494
1495                next = min(vma->vm_end, end);
1496                npages = (next - addr) >> PAGE_SHIFT;
1497                r = amdgpu_hmm_range_get_pages(&prange->notifier, mm, NULL,
1498                                               addr, npages, &hmm_range,
1499                                               readonly, true, owner);
1500                if (r) {
1501                        pr_debug("failed %d to get svm range pages\n", r);
1502                        goto unreserve_out;
1503                }
1504
1505                offset = (addr - start) >> PAGE_SHIFT;
1506                r = svm_range_dma_map(prange, ctx.bitmap, offset, npages,
1507                                      hmm_range->hmm_pfns);
1508                if (r) {
1509                        pr_debug("failed %d to dma map range\n", r);
1510                        goto unreserve_out;
1511                }
1512
1513                svm_range_lock(prange);
1514                if (amdgpu_hmm_range_get_pages_done(hmm_range)) {
1515                        pr_debug("hmm update the range, need validate again\n");
1516                        r = -EAGAIN;
1517                        goto unlock_out;
1518                }
1519                if (!list_empty(&prange->child_list)) {
1520                        pr_debug("range split by unmap in parallel, validate again\n");
1521                        r = -EAGAIN;
1522                        goto unlock_out;
1523                }
1524
1525                r = svm_range_map_to_gpus(prange, offset, npages, readonly,
1526                                          ctx.bitmap, wait);
1527
1528unlock_out:
1529                svm_range_unlock(prange);
1530
1531                addr = next;
1532        }
1533
1534        if (addr == end)
1535                prange->validated_once = true;
1536
1537unreserve_out:
1538        svm_range_unreserve_bos(&ctx);
1539
1540        if (!r)
1541                prange->validate_timestamp = ktime_to_us(ktime_get());
1542
1543        return r;
1544}
1545
1546/**
1547 * svm_range_list_lock_and_flush_work - flush pending deferred work
1548 *
1549 * @svms: the svm range list
1550 * @mm: the mm structure
1551 *
1552 * Context: Returns with mmap write lock held, pending deferred work flushed
1553 *
1554 */
1555static void
1556svm_range_list_lock_and_flush_work(struct svm_range_list *svms,
1557                                   struct mm_struct *mm)
1558{
1559retry_flush_work:
1560        flush_work(&svms->deferred_list_work);
1561        mmap_write_lock(mm);
1562
1563        if (list_empty(&svms->deferred_range_list))
1564                return;
1565        mmap_write_unlock(mm);
1566        pr_debug("retry flush\n");
1567        goto retry_flush_work;
1568}
1569
1570static void svm_range_restore_work(struct work_struct *work)
1571{
1572        struct delayed_work *dwork = to_delayed_work(work);
1573        struct amdkfd_process_info *process_info;
1574        struct svm_range_list *svms;
1575        struct svm_range *prange;
1576        struct kfd_process *p;
1577        struct mm_struct *mm;
1578        int evicted_ranges;
1579        int invalid;
1580        int r;
1581
1582        svms = container_of(dwork, struct svm_range_list, restore_work);
1583        evicted_ranges = atomic_read(&svms->evicted_ranges);
1584        if (!evicted_ranges)
1585                return;
1586
1587        pr_debug("restore svm ranges\n");
1588
1589        /* kfd_process_notifier_release destroys this worker thread. So during
1590         * the lifetime of this thread, kfd_process and mm will be valid.
1591         */
1592        p = container_of(svms, struct kfd_process, svms);
1593        process_info = p->kgd_process_info;
1594        mm = p->mm;
1595        if (!mm)
1596                return;
1597
1598        mutex_lock(&process_info->lock);
1599        svm_range_list_lock_and_flush_work(svms, mm);
1600        mutex_lock(&svms->lock);
1601
1602        evicted_ranges = atomic_read(&svms->evicted_ranges);
1603
1604        list_for_each_entry(prange, &svms->list, list) {
1605                invalid = atomic_read(&prange->invalid);
1606                if (!invalid)
1607                        continue;
1608
1609                pr_debug("restoring svms 0x%p prange 0x%p [0x%lx %lx] inv %d\n",
1610                         prange->svms, prange, prange->start, prange->last,
1611                         invalid);
1612
1613                /*
1614                 * If range is migrating, wait for migration is done.
1615                 */
1616                mutex_lock(&prange->migrate_mutex);
1617
1618                r = svm_range_validate_and_map(mm, prange, MAX_GPU_INSTANCE,
1619                                               false, true);
1620                if (r)
1621                        pr_debug("failed %d to map 0x%lx to gpus\n", r,
1622                                 prange->start);
1623
1624                mutex_unlock(&prange->migrate_mutex);
1625                if (r)
1626                        goto out_reschedule;
1627
1628                if (atomic_cmpxchg(&prange->invalid, invalid, 0) != invalid)
1629                        goto out_reschedule;
1630        }
1631
1632        if (atomic_cmpxchg(&svms->evicted_ranges, evicted_ranges, 0) !=
1633            evicted_ranges)
1634                goto out_reschedule;
1635
1636        evicted_ranges = 0;
1637
1638        r = kgd2kfd_resume_mm(mm);
1639        if (r) {
1640                /* No recovery from this failure. Probably the CP is
1641                 * hanging. No point trying again.
1642                 */
1643                pr_debug("failed %d to resume KFD\n", r);
1644        }
1645
1646        pr_debug("restore svm ranges successfully\n");
1647
1648out_reschedule:
1649        mutex_unlock(&svms->lock);
1650        mmap_write_unlock(mm);
1651        mutex_unlock(&process_info->lock);
1652
1653        /* If validation failed, reschedule another attempt */
1654        if (evicted_ranges) {
1655                pr_debug("reschedule to restore svm range\n");
1656                schedule_delayed_work(&svms->restore_work,
1657                        msecs_to_jiffies(AMDGPU_SVM_RANGE_RESTORE_DELAY_MS));
1658        }
1659}
1660
1661/**
1662 * svm_range_evict - evict svm range
1663 *
1664 * Stop all queues of the process to ensure GPU doesn't access the memory, then
1665 * return to let CPU evict the buffer and proceed CPU pagetable update.
1666 *
1667 * Don't need use lock to sync cpu pagetable invalidation with GPU execution.
1668 * If invalidation happens while restore work is running, restore work will
1669 * restart to ensure to get the latest CPU pages mapping to GPU, then start
1670 * the queues.
1671 */
1672static int
1673svm_range_evict(struct svm_range *prange, struct mm_struct *mm,
1674                unsigned long start, unsigned long last)
1675{
1676        struct svm_range_list *svms = prange->svms;
1677        struct svm_range *pchild;
1678        struct kfd_process *p;
1679        int r = 0;
1680
1681        p = container_of(svms, struct kfd_process, svms);
1682
1683        pr_debug("invalidate svms 0x%p prange [0x%lx 0x%lx] [0x%lx 0x%lx]\n",
1684                 svms, prange->start, prange->last, start, last);
1685
1686        if (!p->xnack_enabled) {
1687                int evicted_ranges;
1688
1689                list_for_each_entry(pchild, &prange->child_list, child_list) {
1690                        mutex_lock_nested(&pchild->lock, 1);
1691                        if (pchild->start <= last && pchild->last >= start) {
1692                                pr_debug("increment pchild invalid [0x%lx 0x%lx]\n",
1693                                         pchild->start, pchild->last);
1694                                atomic_inc(&pchild->invalid);
1695                        }
1696                        mutex_unlock(&pchild->lock);
1697                }
1698
1699                if (prange->start <= last && prange->last >= start)
1700                        atomic_inc(&prange->invalid);
1701
1702                evicted_ranges = atomic_inc_return(&svms->evicted_ranges);
1703                if (evicted_ranges != 1)
1704                        return r;
1705
1706                pr_debug("evicting svms 0x%p range [0x%lx 0x%lx]\n",
1707                         prange->svms, prange->start, prange->last);
1708
1709                /* First eviction, stop the queues */
1710                r = kgd2kfd_quiesce_mm(mm);
1711                if (r)
1712                        pr_debug("failed to quiesce KFD\n");
1713
1714                pr_debug("schedule to restore svm %p ranges\n", svms);
1715                schedule_delayed_work(&svms->restore_work,
1716                        msecs_to_jiffies(AMDGPU_SVM_RANGE_RESTORE_DELAY_MS));
1717        } else {
1718                unsigned long s, l;
1719
1720                pr_debug("invalidate unmap svms 0x%p [0x%lx 0x%lx] from GPUs\n",
1721                         prange->svms, start, last);
1722                list_for_each_entry(pchild, &prange->child_list, child_list) {
1723                        mutex_lock_nested(&pchild->lock, 1);
1724                        s = max(start, pchild->start);
1725                        l = min(last, pchild->last);
1726                        if (l >= s)
1727                                svm_range_unmap_from_gpus(pchild, s, l);
1728                        mutex_unlock(&pchild->lock);
1729                }
1730                s = max(start, prange->start);
1731                l = min(last, prange->last);
1732                if (l >= s)
1733                        svm_range_unmap_from_gpus(prange, s, l);
1734        }
1735
1736        return r;
1737}
1738
1739static struct svm_range *svm_range_clone(struct svm_range *old)
1740{
1741        struct svm_range *new;
1742
1743        new = svm_range_new(old->svms, old->start, old->last);
1744        if (!new)
1745                return NULL;
1746
1747        if (old->svm_bo) {
1748                new->ttm_res = old->ttm_res;
1749                new->offset = old->offset;
1750                new->svm_bo = svm_range_bo_ref(old->svm_bo);
1751                spin_lock(&new->svm_bo->list_lock);
1752                list_add(&new->svm_bo_list, &new->svm_bo->range_list);
1753                spin_unlock(&new->svm_bo->list_lock);
1754        }
1755        new->flags = old->flags;
1756        new->preferred_loc = old->preferred_loc;
1757        new->prefetch_loc = old->prefetch_loc;
1758        new->actual_loc = old->actual_loc;
1759        new->granularity = old->granularity;
1760        bitmap_copy(new->bitmap_access, old->bitmap_access, MAX_GPU_INSTANCE);
1761        bitmap_copy(new->bitmap_aip, old->bitmap_aip, MAX_GPU_INSTANCE);
1762
1763        return new;
1764}
1765
1766/**
1767 * svm_range_handle_overlap - split overlap ranges
1768 * @svms: svm range list header
1769 * @new: range added with this attributes
1770 * @start: range added start address, in pages
1771 * @last: range last address, in pages
1772 * @update_list: output, the ranges attributes are updated. For set_attr, this
1773 *               will do validation and map to GPUs. For unmap, this will be
1774 *               removed and unmap from GPUs
1775 * @insert_list: output, the ranges will be inserted into svms, attributes are
1776 *               not changes. For set_attr, this will add into svms.
1777 * @remove_list:output, the ranges will be removed from svms
1778 * @left: the remaining range after overlap, For set_attr, this will be added
1779 *        as new range.
1780 *
1781 * Total have 5 overlap cases.
1782 *
1783 * This function handles overlap of an address interval with existing
1784 * struct svm_ranges for applying new attributes. This may require
1785 * splitting existing struct svm_ranges. All changes should be applied to
1786 * the range_list and interval tree transactionally. If any split operation
1787 * fails, the entire update fails. Therefore the existing overlapping
1788 * svm_ranges are cloned and the original svm_ranges left unchanged. If the
1789 * transaction succeeds, the modified clones are added and the originals
1790 * freed. Otherwise the clones are removed and the old svm_ranges remain.
1791 *
1792 * Context: The caller must hold svms->lock
1793 */
1794static int
1795svm_range_handle_overlap(struct svm_range_list *svms, struct svm_range *new,
1796                         unsigned long start, unsigned long last,
1797                         struct list_head *update_list,
1798                         struct list_head *insert_list,
1799                         struct list_head *remove_list,
1800                         unsigned long *left)
1801{
1802        struct interval_tree_node *node;
1803        struct svm_range *prange;
1804        struct svm_range *tmp;
1805        int r = 0;
1806
1807        INIT_LIST_HEAD(update_list);
1808        INIT_LIST_HEAD(insert_list);
1809        INIT_LIST_HEAD(remove_list);
1810
1811        node = interval_tree_iter_first(&svms->objects, start, last);
1812        while (node) {
1813                struct interval_tree_node *next;
1814                struct svm_range *old;
1815                unsigned long next_start;
1816
1817                pr_debug("found overlap node [0x%lx 0x%lx]\n", node->start,
1818                         node->last);
1819
1820                old = container_of(node, struct svm_range, it_node);
1821                next = interval_tree_iter_next(node, start, last);
1822                next_start = min(node->last, last) + 1;
1823
1824                if (node->start < start || node->last > last) {
1825                        /* node intersects the updated range, clone+split it */
1826                        prange = svm_range_clone(old);
1827                        if (!prange) {
1828                                r = -ENOMEM;
1829                                goto out;
1830                        }
1831
1832                        list_add(&old->remove_list, remove_list);
1833                        list_add(&prange->insert_list, insert_list);
1834
1835                        if (node->start < start) {
1836                                pr_debug("change old range start\n");
1837                                r = svm_range_split_head(prange, new, start,
1838                                                         insert_list);
1839                                if (r)
1840                                        goto out;
1841                        }
1842                        if (node->last > last) {
1843                                pr_debug("change old range last\n");
1844                                r = svm_range_split_tail(prange, new, last,
1845                                                         insert_list);
1846                                if (r)
1847                                        goto out;
1848                        }
1849                } else {
1850                        /* The node is contained within start..last,
1851                         * just update it
1852                         */
1853                        prange = old;
1854                }
1855
1856                if (!svm_range_is_same_attrs(prange, new))
1857                        list_add(&prange->update_list, update_list);
1858
1859                /* insert a new node if needed */
1860                if (node->start > start) {
1861                        prange = svm_range_new(prange->svms, start,
1862                                               node->start - 1);
1863                        if (!prange) {
1864                                r = -ENOMEM;
1865                                goto out;
1866                        }
1867
1868                        list_add(&prange->insert_list, insert_list);
1869                        list_add(&prange->update_list, update_list);
1870                }
1871
1872                node = next;
1873                start = next_start;
1874        }
1875
1876        if (left && start <= last)
1877                *left = last - start + 1;
1878
1879out:
1880        if (r)
1881                list_for_each_entry_safe(prange, tmp, insert_list, insert_list)
1882                        svm_range_free(prange);
1883
1884        return r;
1885}
1886
1887static void
1888svm_range_update_notifier_and_interval_tree(struct mm_struct *mm,
1889                                            struct svm_range *prange)
1890{
1891        unsigned long start;
1892        unsigned long last;
1893
1894        start = prange->notifier.interval_tree.start >> PAGE_SHIFT;
1895        last = prange->notifier.interval_tree.last >> PAGE_SHIFT;
1896
1897        if (prange->start == start && prange->last == last)
1898                return;
1899
1900        pr_debug("up notifier 0x%p prange 0x%p [0x%lx 0x%lx] [0x%lx 0x%lx]\n",
1901                  prange->svms, prange, start, last, prange->start,
1902                  prange->last);
1903
1904        if (start != 0 && last != 0) {
1905                interval_tree_remove(&prange->it_node, &prange->svms->objects);
1906                svm_range_remove_notifier(prange);
1907        }
1908        prange->it_node.start = prange->start;
1909        prange->it_node.last = prange->last;
1910
1911        interval_tree_insert(&prange->it_node, &prange->svms->objects);
1912        svm_range_add_notifier_locked(mm, prange);
1913}
1914
1915static void
1916svm_range_handle_list_op(struct svm_range_list *svms, struct svm_range *prange)
1917{
1918        struct mm_struct *mm = prange->work_item.mm;
1919
1920        switch (prange->work_item.op) {
1921        case SVM_OP_NULL:
1922                pr_debug("NULL OP 0x%p prange 0x%p [0x%lx 0x%lx]\n",
1923                         svms, prange, prange->start, prange->last);
1924                break;
1925        case SVM_OP_UNMAP_RANGE:
1926                pr_debug("remove 0x%p prange 0x%p [0x%lx 0x%lx]\n",
1927                         svms, prange, prange->start, prange->last);
1928                svm_range_unlink(prange);
1929                svm_range_remove_notifier(prange);
1930                svm_range_free(prange);
1931                break;
1932        case SVM_OP_UPDATE_RANGE_NOTIFIER:
1933                pr_debug("update notifier 0x%p prange 0x%p [0x%lx 0x%lx]\n",
1934                         svms, prange, prange->start, prange->last);
1935                svm_range_update_notifier_and_interval_tree(mm, prange);
1936                break;
1937        case SVM_OP_UPDATE_RANGE_NOTIFIER_AND_MAP:
1938                pr_debug("update and map 0x%p prange 0x%p [0x%lx 0x%lx]\n",
1939                         svms, prange, prange->start, prange->last);
1940                svm_range_update_notifier_and_interval_tree(mm, prange);
1941                /* TODO: implement deferred validation and mapping */
1942                break;
1943        case SVM_OP_ADD_RANGE:
1944                pr_debug("add 0x%p prange 0x%p [0x%lx 0x%lx]\n", svms, prange,
1945                         prange->start, prange->last);
1946                svm_range_add_to_svms(prange);
1947                svm_range_add_notifier_locked(mm, prange);
1948                break;
1949        case SVM_OP_ADD_RANGE_AND_MAP:
1950                pr_debug("add and map 0x%p prange 0x%p [0x%lx 0x%lx]\n", svms,
1951                         prange, prange->start, prange->last);
1952                svm_range_add_to_svms(prange);
1953                svm_range_add_notifier_locked(mm, prange);
1954                /* TODO: implement deferred validation and mapping */
1955                break;
1956        default:
1957                WARN_ONCE(1, "Unknown prange 0x%p work op %d\n", prange,
1958                         prange->work_item.op);
1959        }
1960}
1961
1962static void svm_range_drain_retry_fault(struct svm_range_list *svms)
1963{
1964        struct kfd_process_device *pdd;
1965        struct amdgpu_device *adev;
1966        struct kfd_process *p;
1967        uint32_t i;
1968
1969        p = container_of(svms, struct kfd_process, svms);
1970
1971        for_each_set_bit(i, svms->bitmap_supported, p->n_pdds) {
1972                pdd = p->pdds[i];
1973                if (!pdd)
1974                        continue;
1975
1976                pr_debug("drain retry fault gpu %d svms %p\n", i, svms);
1977                adev = (struct amdgpu_device *)pdd->dev->kgd;
1978
1979                amdgpu_ih_wait_on_checkpoint_process(adev, &adev->irq.ih1);
1980                pr_debug("drain retry fault gpu %d svms 0x%p done\n", i, svms);
1981        }
1982}
1983
1984static void svm_range_deferred_list_work(struct work_struct *work)
1985{
1986        struct svm_range_list *svms;
1987        struct svm_range *prange;
1988        struct mm_struct *mm;
1989
1990        svms = container_of(work, struct svm_range_list, deferred_list_work);
1991        pr_debug("enter svms 0x%p\n", svms);
1992
1993        spin_lock(&svms->deferred_list_lock);
1994        while (!list_empty(&svms->deferred_range_list)) {
1995                prange = list_first_entry(&svms->deferred_range_list,
1996                                          struct svm_range, deferred_list);
1997                spin_unlock(&svms->deferred_list_lock);
1998                pr_debug("prange 0x%p [0x%lx 0x%lx] op %d\n", prange,
1999                         prange->start, prange->last, prange->work_item.op);
2000

2001                /* Make sure no stale retry fault coming after range is freed */
2002                if (prange->work_item.op == SVM_OP_UNMAP_RANGE)
2003                        svm_range_drain_retry_fault(prange->svms);
2004
2005                mm = prange->work_item.mm;
2006                mmap_write_lock(mm);
2007                mutex_lock(&svms->lock);
2008
2009                /* Remove from deferred_list must be inside mmap write lock,
2010                 * otherwise, svm_range_list_lock_and_flush_work may hold mmap
2011                 * write lock, and continue because deferred_list is empty, then
2012                 * deferred_list handle is blocked by mmap write lock.
2013                 */
2014                spin_lock(&svms->deferred_list_lock);
2015                list_del_init(&prange->deferred_list);
2016                spin_unlock(&svms->deferred_list_lock);
2017
2018                mutex_lock(&prange->migrate_mutex);
2019                while (!list_empty(&prange->child_list)) {
2020                        struct svm_range *pchild;
2021
2022                        pchild = list_first_entry(&prange->child_list,
2023                                                struct svm_range, child_list);
2024                        pr_debug("child prange 0x%p op %d\n", pchild,
2025                                 pchild->work_item.op);
2026                        list_del_init(&pchild->child_list);
2027                        svm_range_handle_list_op(svms, pchild);
2028                }
2029                mutex_unlock(&prange->migrate_mutex);
2030
2031                svm_range_handle_list_op(svms, prange);
2032                mutex_unlock(&svms->lock);
2033                mmap_write_unlock(mm);
2034
2035                spin_lock(&svms->deferred_list_lock);
2036        }
2037        spin_unlock(&svms->deferred_list_lock);
2038
2039        pr_debug("exit svms 0x%p\n", svms);
2040}
2041
2042void
2043svm_range_add_list_work(struct svm_range_list *svms, struct svm_range *prange,
2044                        struct mm_struct *mm, enum svm_work_list_ops op)
2045{
2046        spin_lock(&svms->deferred_list_lock);
2047        /* if prange is on the deferred list */
2048        if (!list_empty(&prange->deferred_list)) {
2049                pr_debug("update exist prange 0x%p work op %d\n", prange, op);
2050                WARN_ONCE(prange->work_item.mm != mm, "unmatch mm\n");
2051                if (op != SVM_OP_NULL &&
2052                    prange->work_item.op != SVM_OP_UNMAP_RANGE)
2053                        prange->work_item.op = op;
2054        } else {
2055                prange->work_item.op = op;
2056                prange->work_item.mm = mm;
2057                list_add_tail(&prange->deferred_list,
2058                              &prange->svms->deferred_range_list);
2059                pr_debug("add prange 0x%p [0x%lx 0x%lx] to work list op %d\n",
2060                         prange, prange->start, prange->last, op);
2061        }
2062        spin_unlock(&svms->deferred_list_lock);
2063}
2064
2065void schedule_deferred_list_work(struct svm_range_list *svms)
2066{
2067        spin_lock(&svms->deferred_list_lock);
2068        if (!list_empty(&svms->deferred_range_list))
2069                schedule_work(&svms->deferred_list_work);
2070        spin_unlock(&svms->deferred_list_lock);
2071}
2072
2073static void
2074svm_range_unmap_split(struct mm_struct *mm, struct svm_range *parent,
2075                      struct svm_range *prange, unsigned long start,
2076                      unsigned long last)
2077{
2078        struct svm_range *head;
2079        struct svm_range *tail;
2080
2081        if (prange->work_item.op == SVM_OP_UNMAP_RANGE) {
2082                pr_debug("prange 0x%p [0x%lx 0x%lx] is already freed\n", prange,
2083                         prange->start, prange->last);
2084                return;
2085        }
2086        if (start > prange->last || last < prange->start)
2087                return;
2088
2089        head = tail = prange;
2090        if (start > prange->start)
2091                svm_range_split(prange, prange->start, start - 1, &tail);
2092        if (last < tail->last)
2093                svm_range_split(tail, last + 1, tail->last, &head);
2094
2095        if (head != prange && tail != prange) {
2096                svm_range_add_child(parent, mm, head, SVM_OP_UNMAP_RANGE);
2097                svm_range_add_child(parent, mm, tail, SVM_OP_ADD_RANGE);
2098        } else if (tail != prange) {
2099                svm_range_add_child(parent, mm, tail, SVM_OP_UNMAP_RANGE);
2100        } else if (head != prange) {
2101                svm_range_add_child(parent, mm, head, SVM_OP_UNMAP_RANGE);
2102        } else if (parent != prange) {
2103                prange->work_item.op = SVM_OP_UNMAP_RANGE;
2104        }
2105}
2106
2107static void
2108svm_range_unmap_from_cpu(struct mm_struct *mm, struct svm_range *prange,
2109                         unsigned long start, unsigned long last)
2110{
2111        struct svm_range_list *svms;
2112        struct svm_range *pchild;
2113        struct kfd_process *p;
2114        unsigned long s, l;
2115        bool unmap_parent;
2116
2117        p = kfd_lookup_process_by_mm(mm);
2118        if (!p)
2119                return;
2120        svms = &p->svms;
2121
2122        pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx] [0x%lx 0x%lx]\n", svms,
2123                 prange, prange->start, prange->last, start, last);
2124
2125        unmap_parent = start <= prange->start && last >= prange->last;
2126
2127        list_for_each_entry(pchild, &prange->child_list, child_list) {
2128                mutex_lock_nested(&pchild->lock, 1);
2129                s = max(start, pchild->start);
2130                l = min(last, pchild->last);
2131                if (l >= s)
2132                        svm_range_unmap_from_gpus(pchild, s, l);
2133                svm_range_unmap_split(mm, prange, pchild, start, last);
2134                mutex_unlock(&pchild->lock);
2135        }
2136        s = max(start, prange->start);
2137        l = min(last, prange->last);
2138        if (l >= s)
2139                svm_range_unmap_from_gpus(prange, s, l);
2140        svm_range_unmap_split(mm, prange, prange, start, last);
2141
2142        if (unmap_parent)
2143                svm_range_add_list_work(svms, prange, mm, SVM_OP_UNMAP_RANGE);
2144        else
2145                svm_range_add_list_work(svms, prange, mm,
2146                                        SVM_OP_UPDATE_RANGE_NOTIFIER);
2147        schedule_deferred_list_work(svms);
2148
2149        kfd_unref_process(p);
2150}
2151
2152/**
2153 * svm_range_cpu_invalidate_pagetables - interval notifier callback
2154 *
2155 * If event is MMU_NOTIFY_UNMAP, this is from CPU unmap range, otherwise, it
2156 * is from migration, or CPU page invalidation callback.
2157 *
2158 * For unmap event, unmap range from GPUs, remove prange from svms in a delayed
2159 * work thread, and split prange if only part of prange is unmapped.
2160 *
2161 * For invalidation event, if GPU retry fault is not enabled, evict the queues,
2162 * then schedule svm_range_restore_work to update GPU mapping and resume queues.
2163 * If GPU retry fault is enabled, unmap the svm range from GPU, retry fault will
2164 * update GPU mapping to recover.
2165 *
2166 * Context: mmap lock, notifier_invalidate_start lock are held
2167 *          for invalidate event, prange lock is held if this is from migration
2168 */
2169static bool
2170svm_range_cpu_invalidate_pagetables(struct mmu_interval_notifier *mni,
2171                                    const struct mmu_notifier_range *range,
2172                                    unsigned long cur_seq)
2173{
2174        struct svm_range *prange;
2175        unsigned long start;
2176        unsigned long last;
2177
2178        if (range->event == MMU_NOTIFY_RELEASE)
2179                return true;
2180
2181        start = mni->interval_tree.start;
2182        last = mni->interval_tree.last;
2183        start = (start > range->start ? start : range->start) >> PAGE_SHIFT;
2184        last = (last < (range->end - 1) ? last : range->end - 1) >> PAGE_SHIFT;
2185        pr_debug("[0x%lx 0x%lx] range[0x%lx 0x%lx] notifier[0x%lx 0x%lx] %d\n",
2186                 start, last, range->start >> PAGE_SHIFT,
2187                 (range->end - 1) >> PAGE_SHIFT,
2188                 mni->interval_tree.start >> PAGE_SHIFT,
2189                 mni->interval_tree.last >> PAGE_SHIFT, range->event);
2190
2191        prange = container_of(mni, struct svm_range, notifier);
2192
2193        svm_range_lock(prange);
2194        mmu_interval_set_seq(mni, cur_seq);
2195
2196        switch (range->event) {
2197        case MMU_NOTIFY_UNMAP:
2198                svm_range_unmap_from_cpu(mni->mm, prange, start, last);
2199                break;
2200        default:
2201                svm_range_evict(prange, mni->mm, start, last);
2202                break;
2203        }
2204
2205        svm_range_unlock(prange);
2206
2207        return true;
2208}
2209
2210/**
2211 * svm_range_from_addr - find svm range from fault address
2212 * @svms: svm range list header
2213 * @addr: address to search range interval tree, in pages
2214 * @parent: parent range if range is on child list
2215 *
2216 * Context: The caller must hold svms->lock
2217 *
2218 * Return: the svm_range found or NULL
2219 */
2220struct svm_range *
2221svm_range_from_addr(struct svm_range_list *svms, unsigned long addr,
2222                    struct svm_range **parent)
2223{
2224        struct interval_tree_node *node;
2225        struct svm_range *prange;
2226        struct svm_range *pchild;
2227
2228        node = interval_tree_iter_first(&svms->objects, addr, addr);
2229        if (!node)
2230                return NULL;
2231
2232        prange = container_of(node, struct svm_range, it_node);
2233        pr_debug("address 0x%lx prange [0x%lx 0x%lx] node [0x%lx 0x%lx]\n",
2234                 addr, prange->start, prange->last, node->start, node->last);
2235
2236        if (addr >= prange->start && addr <= prange->last) {
2237                if (parent)
2238                        *parent = prange;
2239                return prange;
2240        }
2241        list_for_each_entry(pchild, &prange->child_list, child_list)
2242                if (addr >= pchild->start && addr <= pchild->last) {
2243                        pr_debug("found address 0x%lx pchild [0x%lx 0x%lx]\n",
2244                                 addr, pchild->start, pchild->last);
2245                        if (parent)
2246                                *parent = prange;
2247                        return pchild;
2248                }
2249
2250        return NULL;
2251}
2252
2253/* svm_range_best_restore_location - decide the best fault restore location
2254 * @prange: svm range structure
2255 * @adev: the GPU on which vm fault happened
2256 *
2257 * This is only called when xnack is on, to decide the best location to restore
2258 * the range mapping after GPU vm fault. Caller uses the best location to do
2259 * migration if actual loc is not best location, then update GPU page table
2260 * mapping to the best location.
2261 *
2262 * If vm fault gpu is range preferred loc, the best_loc is preferred loc.
2263 * If vm fault gpu idx is on range ACCESSIBLE bitmap, best_loc is vm fault gpu
2264 * If vm fault gpu idx is on range ACCESSIBLE_IN_PLACE bitmap, then
2265 *    if range actual loc is cpu, best_loc is cpu
2266 *    if vm fault gpu is on xgmi same hive of range actual loc gpu, best_loc is
2267 *    range actual loc.
2268 * Otherwise, GPU no access, best_loc is -1.
2269 *
2270 * Return:
2271 * -1 means vm fault GPU no access
2272 * 0 for CPU or GPU id
2273 */
2274static int32_t
2275svm_range_best_restore_location(struct svm_range *prange,
2276                                struct amdgpu_device *adev,
2277                                int32_t *gpuidx)
2278{
2279        struct amdgpu_device *bo_adev;
2280        struct kfd_process *p;
2281        uint32_t gpuid;
2282        int r;
2283
2284        p = container_of(prange->svms, struct kfd_process, svms);
2285
2286        r = kfd_process_gpuid_from_kgd(p, adev, &gpuid, gpuidx);
2287        if (r < 0) {
2288                pr_debug("failed to get gpuid from kgd\n");
2289                return -1;
2290        }
2291
2292        if (prange->preferred_loc == gpuid)
2293                return prange->preferred_loc;
2294
2295        if (test_bit(*gpuidx, prange->bitmap_access))
2296                return gpuid;
2297
2298        if (test_bit(*gpuidx, prange->bitmap_aip)) {
2299                if (!prange->actual_loc)
2300                        return 0;
2301
2302                bo_adev = svm_range_get_adev_by_id(prange, prange->actual_loc);
2303                if (amdgpu_xgmi_same_hive(adev, bo_adev))
2304                        return prange->actual_loc;
2305                else
2306                        return 0;
2307        }
2308
2309        return -1;
2310}
2311static int
2312svm_range_get_range_boundaries(struct kfd_process *p, int64_t addr,
2313                                unsigned long *start, unsigned long *last)
2314{
2315        struct vm_area_struct *vma;
2316        struct interval_tree_node *node;
2317        unsigned long start_limit, end_limit;
2318
2319        vma = find_vma(p->mm, addr << PAGE_SHIFT);
2320        if (!vma || (addr << PAGE_SHIFT) < vma->vm_start) {
2321                pr_debug("VMA does not exist in address [0x%llx]\n", addr);
2322                return -EFAULT;
2323        }
2324        start_limit = max(vma->vm_start >> PAGE_SHIFT,
2325                      (unsigned long)ALIGN_DOWN(addr, 2UL << 8));
2326        end_limit = min(vma->vm_end >> PAGE_SHIFT,
2327                    (unsigned long)ALIGN(addr + 1, 2UL << 8));
2328        /* First range that starts after the fault address */
2329        node = interval_tree_iter_first(&p->svms.objects, addr + 1, ULONG_MAX);
2330        if (node) {
2331                end_limit = min(end_limit, node->start);
2332                /* Last range that ends before the fault address */
2333                node = container_of(rb_prev(&node->rb),
2334                                    struct interval_tree_node, rb);
2335        } else {
2336                /* Last range must end before addr because
2337                 * there was no range after addr
2338                 */
2339                node = container_of(rb_last(&p->svms.objects.rb_root),
2340                                    struct interval_tree_node, rb);
2341        }
2342        if (node) {
2343                if (node->last >= addr) {
2344                        WARN(1, "Overlap with prev node and page fault addr\n");
2345                        return -EFAULT;
2346                }
2347                start_limit = max(start_limit, node->last + 1);
2348        }
2349
2350        *start = start_limit;
2351        *last = end_limit - 1;
2352
2353        pr_debug("vma start: 0x%lx start: 0x%lx vma end: 0x%lx last: 0x%lx\n",
2354                  vma->vm_start >> PAGE_SHIFT, *start,
2355                  vma->vm_end >> PAGE_SHIFT, *last);
2356
2357        return 0;
2358
2359}
2360static struct
2361svm_range *svm_range_create_unregistered_range(struct amdgpu_device *adev,
2362                                                struct kfd_process *p,
2363                                                struct mm_struct *mm,
2364                                                int64_t addr)
2365{
2366        struct svm_range *prange = NULL;
2367        unsigned long start, last;
2368        uint32_t gpuid, gpuidx;
2369
2370        if (svm_range_get_range_boundaries(p, addr, &start, &last))
2371                return NULL;
2372
2373        prange = svm_range_new(&p->svms, start, last);
2374        if (!prange) {
2375                pr_debug("Failed to create prange in address [0x%llx]\n", addr);
2376                return NULL;
2377        }
2378        if (kfd_process_gpuid_from_kgd(p, adev, &gpuid, &gpuidx)) {
2379                pr_debug("failed to get gpuid from kgd\n");
2380                svm_range_free(prange);
2381                return NULL;
2382        }
2383
2384        svm_range_add_to_svms(prange);
2385        svm_range_add_notifier_locked(mm, prange);
2386
2387        return prange;
2388}
2389
2390/* svm_range_skip_recover - decide if prange can be recovered
2391 * @prange: svm range structure
2392 *
2393 * GPU vm retry fault handle skip recover the range for cases:
2394 * 1. prange is on deferred list to be removed after unmap, it is stale fault,
2395 *    deferred list work will drain the stale fault before free the prange.
2396 * 2. prange is on deferred list to add interval notifier after split, or
2397 * 3. prange is child range, it is split from parent prange, recover later
2398 *    after interval notifier is added.
2399 *
2400 * Return: true to skip recover, false to recover
2401 */
2402static bool svm_range_skip_recover(struct svm_range *prange)
2403{
2404        struct svm_range_list *svms = prange->svms;
2405
2406        spin_lock(&svms->deferred_list_lock);
2407        if (list_empty(&prange->deferred_list) &&
2408            list_empty(&prange->child_list)) {
2409                spin_unlock(&svms->deferred_list_lock);
2410                return false;
2411        }
2412        spin_unlock(&svms->deferred_list_lock);
2413
2414        if (prange->work_item.op == SVM_OP_UNMAP_RANGE) {
2415                pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx] unmapped\n",
2416                         svms, prange, prange->start, prange->last);
2417                return true;
2418        }
2419        if (prange->work_item.op == SVM_OP_ADD_RANGE_AND_MAP ||
2420            prange->work_item.op == SVM_OP_ADD_RANGE) {
2421                pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx] not added yet\n",
2422                         svms, prange, prange->start, prange->last);
2423                return true;
2424        }
2425        return false;
2426}
2427
2428static void
2429svm_range_count_fault(struct amdgpu_device *adev, struct kfd_process *p,
2430                      int32_t gpuidx)
2431{
2432        struct kfd_process_device *pdd;
2433
2434        /* fault is on different page of same range
2435         * or fault is skipped to recover later
2436         * or fault is on invalid virtual address
2437         */
2438        if (gpuidx == MAX_GPU_INSTANCE) {
2439                uint32_t gpuid;
2440                int r;
2441
2442                r = kfd_process_gpuid_from_kgd(p, adev, &gpuid, &gpuidx);
2443                if (r < 0)
2444                        return;
2445        }
2446
2447        /* fault is recovered
2448         * or fault cannot recover because GPU no access on the range
2449         */
2450        pdd = kfd_process_device_from_gpuidx(p, gpuidx);
2451        if (pdd)
2452                WRITE_ONCE(pdd->faults, pdd->faults + 1);
2453}
2454
2455static bool
2456svm_fault_allowed(struct mm_struct *mm, uint64_t addr, bool write_fault)
2457{
2458        unsigned long requested = VM_READ;
2459        struct vm_area_struct *vma;
2460
2461        if (write_fault)
2462                requested |= VM_WRITE;
2463
2464        vma = find_vma(mm, addr << PAGE_SHIFT);
2465        if (!vma || (addr << PAGE_SHIFT) < vma->vm_start) {
2466                pr_debug("address 0x%llx VMA is removed\n", addr);
2467                return true;
2468        }
2469
2470        pr_debug("requested 0x%lx, vma permission flags 0x%lx\n", requested,
2471                vma->vm_flags);
2472        return (vma->vm_flags & requested) == requested;
2473}
2474
2475int
2476svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
2477                        uint64_t addr, bool write_fault)
2478{
2479        struct mm_struct *mm = NULL;
2480        struct svm_range_list *svms;
2481        struct svm_range *prange;
2482        struct kfd_process *p;
2483        uint64_t timestamp;
2484        int32_t best_loc;
2485        int32_t gpuidx = MAX_GPU_INSTANCE;
2486        bool write_locked = false;
2487        int r = 0;
2488
2489        if (!KFD_IS_SVM_API_SUPPORTED(adev->kfd.dev)) {
2490                pr_debug("device does not support SVM\n");
2491                return -EFAULT;
2492        }
2493
2494        p = kfd_lookup_process_by_pasid(pasid);
2495        if (!p) {
2496                pr_debug("kfd process not founded pasid 0x%x\n", pasid);
2497                return -ESRCH;
2498        }
2499        if (!p->xnack_enabled) {
2500                pr_debug("XNACK not enabled for pasid 0x%x\n", pasid);
2501                r = -EFAULT;
2502                goto out;
2503        }
2504        svms = &p->svms;
2505
2506        pr_debug("restoring svms 0x%p fault address 0x%llx\n", svms, addr);
2507
2508        mm = get_task_mm(p->lead_thread);
2509        if (!mm) {
2510                pr_debug("svms 0x%p failed to get mm\n", svms);
2511                r = -ESRCH;
2512                goto out;
2513        }
2514
2515        mmap_read_lock(mm);
2516retry_write_locked:
2517        mutex_lock(&svms->lock);
2518        prange = svm_range_from_addr(svms, addr, NULL);
2519        if (!prange) {
2520                pr_debug("failed to find prange svms 0x%p address [0x%llx]\n",
2521                         svms, addr);
2522                if (!write_locked) {
2523                        /* Need the write lock to create new range with MMU notifier.
2524                         * Also flush pending deferred work to make sure the interval
2525                         * tree is up to date before we add a new range
2526                         */
2527                        mutex_unlock(&svms->lock);
2528                        mmap_read_unlock(mm);
2529                        mmap_write_lock(mm);
2530                        write_locked = true;
2531                        goto retry_write_locked;
2532                }
2533                prange = svm_range_create_unregistered_range(adev, p, mm, addr);
2534                if (!prange) {
2535                        pr_debug("failed to create unregistered range svms 0x%p address [0x%llx]\n",
2536                                 svms, addr);
2537                        mmap_write_downgrade(mm);
2538                        r = -EFAULT;
2539                        goto out_unlock_svms;
2540                }
2541        }
2542        if (write_locked)
2543                mmap_write_downgrade(mm);
2544
2545        mutex_lock(&prange->migrate_mutex);
2546
2547        if (svm_range_skip_recover(prange)) {
2548                amdgpu_gmc_filter_faults_remove(adev, addr, pasid);
2549                goto out_unlock_range;
2550        }
2551
2552        timestamp = ktime_to_us(ktime_get()) - prange->validate_timestamp;
2553        /* skip duplicate vm fault on different pages of same range */
2554        if (timestamp < AMDGPU_SVM_RANGE_RETRY_FAULT_PENDING) {
2555                pr_debug("svms 0x%p [0x%lx %lx] already restored\n",
2556                         svms, prange->start, prange->last);
2557                goto out_unlock_range;
2558        }
2559
2560        if (!svm_fault_allowed(mm, addr, write_fault)) {
2561                pr_debug("fault addr 0x%llx no %s permission\n", addr,
2562                        write_fault ? "write" : "read");
2563                r = -EPERM;
2564                goto out_unlock_range;
2565        }
2566
2567        best_loc = svm_range_best_restore_location(prange, adev, &gpuidx);
2568        if (best_loc == -1) {
2569                pr_debug("svms %p failed get best restore loc [0x%lx 0x%lx]\n",
2570                         svms, prange->start, prange->last);
2571                r = -EACCES;
2572                goto out_unlock_range;
2573        }
2574
2575        pr_debug("svms %p [0x%lx 0x%lx] best restore 0x%x, actual loc 0x%x\n",
2576                 svms, prange->start, prange->last, best_loc,
2577                 prange->actual_loc);
2578
2579        if (prange->actual_loc != best_loc) {
2580                if (best_loc) {
2581                        r = svm_migrate_to_vram(prange, best_loc, mm);
2582                        if (r) {
2583                                pr_debug("svm_migrate_to_vram failed (%d) at %llx, falling back to system memory\n",
2584                                         r, addr);
2585                                /* Fallback to system memory if migration to
2586                                 * VRAM failed
2587                                 */
2588                                if (prange->actual_loc)
2589                                        r = svm_migrate_vram_to_ram(prange, mm);
2590                                else
2591                                        r = 0;
2592                        }
2593                } else {
2594                        r = svm_migrate_vram_to_ram(prange, mm);
2595                }
2596                if (r) {
2597                        pr_debug("failed %d to migrate svms %p [0x%lx 0x%lx]\n",
2598                                 r, svms, prange->start, prange->last);
2599                        goto out_unlock_range;
2600                }
2601        }
2602
2603        r = svm_range_validate_and_map(mm, prange, gpuidx, false, false);
2604        if (r)
2605                pr_debug("failed %d to map svms 0x%p [0x%lx 0x%lx] to gpus\n",
2606                         r, svms, prange->start, prange->last);
2607
2608out_unlock_range:
2609        mutex_unlock(&prange->migrate_mutex);
2610out_unlock_svms:
2611        mutex_unlock(&svms->lock);
2612        mmap_read_unlock(mm);
2613
2614        svm_range_count_fault(adev, p, gpuidx);
2615
2616        mmput(mm);
2617out:
2618        kfd_unref_process(p);
2619
2620        if (r == -EAGAIN) {
2621                pr_debug("recover vm fault later\n");
2622                amdgpu_gmc_filter_faults_remove(adev, addr, pasid);
2623                r = 0;
2624        }
2625        return r;
2626}
2627
2628void svm_range_list_fini(struct kfd_process *p)
2629{
2630        struct svm_range *prange;
2631        struct svm_range *next;
2632
2633        pr_debug("pasid 0x%x svms 0x%p\n", p->pasid, &p->svms);
2634
2635        /* Ensure list work is finished before process is destroyed */
2636        flush_work(&p->svms.deferred_list_work);
2637
2638        list_for_each_entry_safe(prange, next, &p->svms.list, list) {
2639                svm_range_unlink(prange);
2640                svm_range_remove_notifier(prange);
2641                svm_range_free(prange);
2642        }
2643
2644        mutex_destroy(&p->svms.lock);
2645
2646        pr_debug("pasid 0x%x svms 0x%p done\n", p->pasid, &p->svms);
2647}
2648
2649int svm_range_list_init(struct kfd_process *p)
2650{
2651        struct svm_range_list *svms = &p->svms;
2652        int i;
2653
2654        svms->objects = RB_ROOT_CACHED;
2655        mutex_init(&svms->lock);
2656        INIT_LIST_HEAD(&svms->list);
2657        atomic_set(&svms->evicted_ranges, 0);
2658        INIT_DELAYED_WORK(&svms->restore_work, svm_range_restore_work);
2659        INIT_WORK(&svms->deferred_list_work, svm_range_deferred_list_work);
2660        INIT_LIST_HEAD(&svms->deferred_range_list);
2661        spin_lock_init(&svms->deferred_list_lock);
2662
2663        for (i = 0; i < p->n_pdds; i++)
2664                if (KFD_IS_SVM_API_SUPPORTED(p->pdds[i]->dev))
2665                        bitmap_set(svms->bitmap_supported, i, 1);
2666
2667        return 0;
2668}
2669
2670/**
2671 * svm_range_is_valid - check if virtual address range is valid
2672 * @mm: current process mm_struct
2673 * @start: range start address, in pages
2674 * @size: range size, in pages
2675 *
2676 * Valid virtual address range means it belongs to one or more VMAs
2677 *
2678 * Context: Process context
2679 *
2680 * Return:
2681 *  true - valid svm range
2682 *  false - invalid svm range
2683 */
2684static bool
2685svm_range_is_valid(struct mm_struct *mm, uint64_t start, uint64_t size)
2686{
2687        const unsigned long device_vma = VM_IO | VM_PFNMAP | VM_MIXEDMAP;
2688        struct vm_area_struct *vma;
2689        unsigned long end;
2690
2691        start <<= PAGE_SHIFT;
2692        end = start + (size << PAGE_SHIFT);
2693
2694        do {
2695                vma = find_vma(mm, start);
2696                if (!vma || start < vma->vm_start ||
2697                    (vma->vm_flags & device_vma))
2698                        return false;
2699                start = min(end, vma->vm_end);
2700        } while (start < end);
2701
2702        return true;
2703}
2704
2705/**
2706 * svm_range_add - add svm range and handle overlap
2707 * @p: the range add to this process svms
2708 * @start: page size aligned
2709 * @size: page size aligned
2710 * @nattr: number of attributes
2711 * @attrs: array of attributes
2712 * @update_list: output, the ranges need validate and update GPU mapping
2713 * @insert_list: output, the ranges need insert to svms
2714 * @remove_list: output, the ranges are replaced and need remove from svms
2715 *
2716 * Check if the virtual address range has overlap with the registered ranges,
2717 * split the overlapped range, copy and adjust pages address and vram nodes in
2718 * old and new ranges.
2719 *
2720 * Context: Process context, caller must hold svms->lock
2721 *
2722 * Return:
2723 * 0 - OK, otherwise error code
2724 */
2725static int
2726svm_range_add(struct kfd_process *p, uint64_t start, uint64_t size,
2727              uint32_t nattr, struct kfd_ioctl_svm_attribute *attrs,
2728              struct list_head *update_list, struct list_head *insert_list,
2729              struct list_head *remove_list)
2730{
2731        uint64_t last = start + size - 1UL;
2732        struct svm_range_list *svms;
2733        struct svm_range new = {0};
2734        struct svm_range *prange;
2735        unsigned long left = 0;
2736        int r = 0;
2737
2738        pr_debug("svms 0x%p [0x%llx 0x%llx]\n", &p->svms, start, last);
2739
2740        svm_range_apply_attrs(p, &new, nattr, attrs);
2741
2742        svms = &p->svms;
2743
2744        r = svm_range_handle_overlap(svms, &new, start, last, update_list,
2745                                     insert_list, remove_list, &left);
2746        if (r)
2747                return r;
2748
2749        if (left) {
2750                prange = svm_range_new(svms, last - left + 1, last);
2751                list_add(&prange->insert_list, insert_list);
2752                list_add(&prange->update_list, update_list);
2753        }
2754
2755        return 0;
2756}
2757
2758/**
2759 * svm_range_best_prefetch_location - decide the best prefetch location
2760 * @prange: svm range structure
2761 *
2762 * For xnack off:
2763 * If range map to single GPU, the best prefetch location is prefetch_loc, which
2764 * can be CPU or GPU.
2765 *
2766 * If range is ACCESS or ACCESS_IN_PLACE by mGPUs, only if mGPU connection on
2767 * XGMI same hive, the best prefetch location is prefetch_loc GPU, othervise
2768 * the best prefetch location is always CPU, because GPU can not have coherent
2769 * mapping VRAM of other GPUs even with large-BAR PCIe connection.
2770 *
2771 * For xnack on:
2772 * If range is not ACCESS_IN_PLACE by mGPUs, the best prefetch location is
2773 * prefetch_loc, other GPU access will generate vm fault and trigger migration.
2774 *
2775 * If range is ACCESS_IN_PLACE by mGPUs, only if mGPU connection on XGMI same
2776 * hive, the best prefetch location is prefetch_loc GPU, otherwise the best
2777 * prefetch location is always CPU.
2778 *
2779 * Context: Process context
2780 *
2781 * Return:
2782 * 0 for CPU or GPU id
2783 */
2784static uint32_t
2785svm_range_best_prefetch_location(struct svm_range *prange)
2786{
2787        DECLARE_BITMAP(bitmap, MAX_GPU_INSTANCE);
2788        uint32_t best_loc = prange->prefetch_loc;
2789        struct kfd_process_device *pdd;
2790        struct amdgpu_device *bo_adev;
2791        struct amdgpu_device *adev;
2792        struct kfd_process *p;
2793        uint32_t gpuidx;
2794
2795        p = container_of(prange->svms, struct kfd_process, svms);
2796
2797        if (!best_loc || best_loc == KFD_IOCTL_SVM_LOCATION_UNDEFINED)
2798                goto out;
2799
2800        bo_adev = svm_range_get_adev_by_id(prange, best_loc);
2801        if (!bo_adev) {
2802                WARN_ONCE(1, "failed to get device by id 0x%x\n", best_loc);
2803                best_loc = 0;
2804                goto out;
2805        }
2806
2807        if (p->xnack_enabled)
2808                bitmap_copy(bitmap, prange->bitmap_aip, MAX_GPU_INSTANCE);
2809        else
2810                bitmap_or(bitmap, prange->bitmap_access, prange->bitmap_aip,
2811                          MAX_GPU_INSTANCE);
2812
2813        for_each_set_bit(gpuidx, bitmap, MAX_GPU_INSTANCE) {
2814                pdd = kfd_process_device_from_gpuidx(p, gpuidx);
2815                if (!pdd) {
2816                        pr_debug("failed to get device by idx 0x%x\n", gpuidx);
2817                        continue;
2818                }
2819                adev = (struct amdgpu_device *)pdd->dev->kgd;
2820
2821                if (adev == bo_adev)
2822                        continue;
2823
2824                if (!amdgpu_xgmi_same_hive(adev, bo_adev)) {
2825                        best_loc = 0;
2826                        break;
2827                }
2828        }
2829
2830out:
2831        pr_debug("xnack %d svms 0x%p [0x%lx 0x%lx] best loc 0x%x\n",
2832                 p->xnack_enabled, &p->svms, prange->start, prange->last,
2833                 best_loc);
2834
2835        return best_loc;
2836}
2837
2838/* FIXME: This is a workaround for page locking bug when some pages are
2839 * invalid during migration to VRAM
2840 */
2841void svm_range_prefault(struct svm_range *prange, struct mm_struct *mm,
2842                        void *owner)
2843{
2844        struct hmm_range *hmm_range;
2845        int r;
2846
2847        if (prange->validated_once)
2848                return;
2849
2850        r = amdgpu_hmm_range_get_pages(&prange->notifier, mm, NULL,
2851                                       prange->start << PAGE_SHIFT,
2852                                       prange->npages, &hmm_range,
2853                                       false, true, owner);
2854        if (!r) {
2855                amdgpu_hmm_range_get_pages_done(hmm_range);
2856                prange->validated_once = true;
2857        }
2858}
2859
2860/* svm_range_trigger_migration - start page migration if prefetch loc changed
2861 * @mm: current process mm_struct
2862 * @prange: svm range structure
2863 * @migrated: output, true if migration is triggered
2864 *
2865 * If range perfetch_loc is GPU, actual loc is cpu 0, then migrate the range
2866 * from ram to vram.
2867 * If range prefetch_loc is cpu 0, actual loc is GPU, then migrate the range
2868 * from vram to ram.
2869 *
2870 * If GPU vm fault retry is not enabled, migration interact with MMU notifier
2871 * and restore work:
2872 * 1. migrate_vma_setup invalidate pages, MMU notifier callback svm_range_evict
2873 *    stops all queues, schedule restore work
2874 * 2. svm_range_restore_work wait for migration is done by
2875 *    a. svm_range_validate_vram takes prange->migrate_mutex
2876 *    b. svm_range_validate_ram HMM get pages wait for CPU fault handle returns
2877 * 3. restore work update mappings of GPU, resume all queues.
2878 *
2879 * Context: Process context
2880 *
2881 * Return:
2882 * 0 - OK, otherwise - error code of migration
2883 */
2884static int
2885svm_range_trigger_migration(struct mm_struct *mm, struct svm_range *prange,
2886                            bool *migrated)
2887{
2888        uint32_t best_loc;
2889        int r = 0;
2890
2891        *migrated = false;
2892        best_loc = svm_range_best_prefetch_location(prange);
2893
2894        if (best_loc == KFD_IOCTL_SVM_LOCATION_UNDEFINED ||
2895            best_loc == prange->actual_loc)
2896                return 0;
2897
2898        if (!best_loc) {
2899                r = svm_migrate_vram_to_ram(prange, mm);
2900                *migrated = !r;
2901                return r;
2902        }
2903
2904        r = svm_migrate_to_vram(prange, best_loc, mm);
2905        *migrated = !r;
2906
2907        return r;
2908}
2909
2910int svm_range_schedule_evict_svm_bo(struct amdgpu_amdkfd_fence *fence)
2911{
2912        if (!fence)
2913                return -EINVAL;
2914
2915        if (dma_fence_is_signaled(&fence->base))
2916                return 0;
2917
2918        if (fence->svm_bo) {
2919                WRITE_ONCE(fence->svm_bo->evicting, 1);
2920                schedule_work(&fence->svm_bo->eviction_work);
2921        }
2922
2923        return 0;
2924}
2925
2926static void svm_range_evict_svm_bo_worker(struct work_struct *work)
2927{
2928        struct svm_range_bo *svm_bo;
2929        struct kfd_process *p;
2930        struct mm_struct *mm;
2931
2932        svm_bo = container_of(work, struct svm_range_bo, eviction_work);
2933        if (!svm_bo_ref_unless_zero(svm_bo))
2934                return; /* svm_bo was freed while eviction was pending */
2935
2936        /* svm_range_bo_release destroys this worker thread. So during
2937         * the lifetime of this thread, kfd_process and mm will be valid.
2938         */
2939        p = container_of(svm_bo->svms, struct kfd_process, svms);
2940        mm = p->mm;
2941        if (!mm)
2942                return;
2943
2944        mmap_read_lock(mm);
2945        spin_lock(&svm_bo->list_lock);
2946        while (!list_empty(&svm_bo->range_list)) {
2947                struct svm_range *prange =
2948                                list_first_entry(&svm_bo->range_list,
2949                                                struct svm_range, svm_bo_list);
2950                list_del_init(&prange->svm_bo_list);
2951                spin_unlock(&svm_bo->list_lock);
2952
2953                pr_debug("svms 0x%p [0x%lx 0x%lx]\n", prange->svms,
2954                         prange->start, prange->last);
2955
2956                mutex_lock(&prange->migrate_mutex);
2957                svm_migrate_vram_to_ram(prange, svm_bo->eviction_fence->mm);
2958
2959                mutex_lock(&prange->lock);
2960                prange->svm_bo = NULL;
2961                mutex_unlock(&prange->lock);
2962
2963                mutex_unlock(&prange->migrate_mutex);
2964
2965                spin_lock(&svm_bo->list_lock);
2966        }
2967        spin_unlock(&svm_bo->list_lock);
2968        mmap_read_unlock(mm);
2969
2970        dma_fence_signal(&svm_bo->eviction_fence->base);
2971        /* This is the last reference to svm_bo, after svm_range_vram_node_free
2972         * has been called in svm_migrate_vram_to_ram
2973         */
2974        WARN_ONCE(kref_read(&svm_bo->kref) != 1, "This was not the last reference\n");
2975        svm_range_bo_unref(svm_bo);
2976}
2977
2978static int
2979svm_range_set_attr(struct kfd_process *p, uint64_t start, uint64_t size,
2980                   uint32_t nattr, struct kfd_ioctl_svm_attribute *attrs)
2981{
2982        struct amdkfd_process_info *process_info = p->kgd_process_info;
2983        struct mm_struct *mm = current->mm;
2984        struct list_head update_list;
2985        struct list_head insert_list;
2986        struct list_head remove_list;
2987        struct svm_range_list *svms;
2988        struct svm_range *prange;
2989        struct svm_range *next;
2990        int r = 0;
2991
2992        pr_debug("pasid 0x%x svms 0x%p [0x%llx 0x%llx] pages 0x%llx\n",
2993                 p->pasid, &p->svms, start, start + size - 1, size);
2994
2995        r = svm_range_check_attr(p, nattr, attrs);
2996        if (r)
2997                return r;
2998
2999        svms = &p->svms;
3000

3001        mutex_lock(&process_info->lock);
3002
3003        svm_range_list_lock_and_flush_work(svms, mm);
3004
3005        if (!svm_range_is_valid(mm, start, size)) {
3006                pr_debug("invalid range\n");
3007                r = -EFAULT;
3008                mmap_write_unlock(mm);
3009                goto out;
3010        }
3011
3012        mutex_lock(&svms->lock);
3013
3014        /* Add new range and split existing ranges as needed */
3015        r = svm_range_add(p, start, size, nattr, attrs, &update_list,
3016                          &insert_list, &remove_list);
3017        if (r) {
3018                mutex_unlock(&svms->lock);
3019                mmap_write_unlock(mm);
3020                goto out;
3021        }
3022        /* Apply changes as a transaction */
3023        list_for_each_entry_safe(prange, next, &insert_list, insert_list) {
3024                svm_range_add_to_svms(prange);
3025                svm_range_add_notifier_locked(mm, prange);
3026        }
3027        list_for_each_entry(prange, &update_list, update_list) {
3028                svm_range_apply_attrs(p, prange, nattr, attrs);
3029                /* TODO: unmap ranges from GPU that lost access */
3030        }
3031        list_for_each_entry_safe(prange, next, &remove_list,
3032                                remove_list) {
3033                pr_debug("unlink old 0x%p prange 0x%p [0x%lx 0x%lx]\n",
3034                         prange->svms, prange, prange->start,
3035                         prange->last);
3036                svm_range_unlink(prange);
3037                svm_range_remove_notifier(prange);
3038                svm_range_free(prange);
3039        }
3040
3041        mmap_write_downgrade(mm);
3042        /* Trigger migrations and revalidate and map to GPUs as needed. If
3043         * this fails we may be left with partially completed actions. There
3044         * is no clean way of rolling back to the previous state in such a
3045         * case because the rollback wouldn't be guaranteed to work either.
3046         */
3047        list_for_each_entry(prange, &update_list, update_list) {
3048                bool migrated;
3049
3050                mutex_lock(&prange->migrate_mutex);
3051
3052                r = svm_range_trigger_migration(mm, prange, &migrated);
3053                if (r)
3054                        goto out_unlock_range;
3055
3056                if (migrated && !p->xnack_enabled) {
3057                        pr_debug("restore_work will update mappings of GPUs\n");
3058                        mutex_unlock(&prange->migrate_mutex);
3059                        continue;
3060                }
3061
3062                r = svm_range_validate_and_map(mm, prange, MAX_GPU_INSTANCE,
3063                                               true, true);
3064                if (r)
3065                        pr_debug("failed %d to map svm range\n", r);
3066
3067out_unlock_range:
3068                mutex_unlock(&prange->migrate_mutex);
3069                if (r)
3070                        break;
3071        }
3072
3073        svm_range_debug_dump(svms);
3074
3075        mutex_unlock(&svms->lock);
3076        mmap_read_unlock(mm);
3077out:
3078        mutex_unlock(&process_info->lock);
3079
3080        pr_debug("pasid 0x%x svms 0x%p [0x%llx 0x%llx] done, r=%d\n", p->pasid,
3081                 &p->svms, start, start + size - 1, r);
3082
3083        return r;
3084}
3085
3086static int
3087svm_range_get_attr(struct kfd_process *p, uint64_t start, uint64_t size,
3088                   uint32_t nattr, struct kfd_ioctl_svm_attribute *attrs)
3089{
3090        DECLARE_BITMAP(bitmap_access, MAX_GPU_INSTANCE);
3091        DECLARE_BITMAP(bitmap_aip, MAX_GPU_INSTANCE);
3092        bool get_preferred_loc = false;
3093        bool get_prefetch_loc = false;
3094        bool get_granularity = false;
3095        bool get_accessible = false;
3096        bool get_flags = false;
3097        uint64_t last = start + size - 1UL;
3098        struct mm_struct *mm = current->mm;
3099        uint8_t granularity = 0xff;
3100        struct interval_tree_node *node;
3101        struct svm_range_list *svms;
3102        struct svm_range *prange;
3103        uint32_t prefetch_loc = KFD_IOCTL_SVM_LOCATION_UNDEFINED;
3104        uint32_t location = KFD_IOCTL_SVM_LOCATION_UNDEFINED;
3105        uint32_t flags_and = 0xffffffff;
3106        uint32_t flags_or = 0;
3107        int gpuidx;
3108        uint32_t i;
3109
3110        pr_debug("svms 0x%p [0x%llx 0x%llx] nattr 0x%x\n", &p->svms, start,
3111                 start + size - 1, nattr);
3112
3113        /* Flush pending deferred work to avoid racing with deferred actions from
3114         * previous memory map changes (e.g. munmap). Concurrent memory map changes
3115         * can still race with get_attr because we don't hold the mmap lock. But that
3116         * would be a race condition in the application anyway, and undefined
3117         * behaviour is acceptable in that case.
3118         */
3119        flush_work(&p->svms.deferred_list_work);
3120
3121        mmap_read_lock(mm);
3122        if (!svm_range_is_valid(mm, start, size)) {
3123                pr_debug("invalid range\n");
3124                mmap_read_unlock(mm);
3125                return -EINVAL;
3126        }
3127        mmap_read_unlock(mm);
3128
3129        for (i = 0; i < nattr; i++) {
3130                switch (attrs[i].type) {
3131                case KFD_IOCTL_SVM_ATTR_PREFERRED_LOC:
3132                        get_preferred_loc = true;
3133                        break;
3134                case KFD_IOCTL_SVM_ATTR_PREFETCH_LOC:
3135                        get_prefetch_loc = true;
3136                        break;
3137                case KFD_IOCTL_SVM_ATTR_ACCESS:
3138                        get_accessible = true;
3139                        break;
3140                case KFD_IOCTL_SVM_ATTR_SET_FLAGS:
3141                case KFD_IOCTL_SVM_ATTR_CLR_FLAGS:
3142                        get_flags = true;
3143                        break;
3144                case KFD_IOCTL_SVM_ATTR_GRANULARITY:
3145                        get_granularity = true;
3146                        break;
3147                case KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE:
3148                case KFD_IOCTL_SVM_ATTR_NO_ACCESS:
3149                        fallthrough;
3150                default:
3151                        pr_debug("get invalid attr type 0x%x\n", attrs[i].type);
3152                        return -EINVAL;
3153                }
3154        }
3155
3156        svms = &p->svms;
3157
3158        mutex_lock(&svms->lock);
3159
3160        node = interval_tree_iter_first(&svms->objects, start, last);
3161        if (!node) {
3162                pr_debug("range attrs not found return default values\n");
3163                svm_range_set_default_attributes(&location, &prefetch_loc,
3164                                                 &granularity, &flags_and);
3165                flags_or = flags_and;
3166                if (p->xnack_enabled)
3167                        bitmap_copy(bitmap_access, svms->bitmap_supported,
3168                                    MAX_GPU_INSTANCE);
3169                else
3170                        bitmap_zero(bitmap_access, MAX_GPU_INSTANCE);
3171                bitmap_zero(bitmap_aip, MAX_GPU_INSTANCE);
3172                goto fill_values;
3173        }
3174        bitmap_copy(bitmap_access, svms->bitmap_supported, MAX_GPU_INSTANCE);
3175        bitmap_copy(bitmap_aip, svms->bitmap_supported, MAX_GPU_INSTANCE);
3176
3177        while (node) {
3178                struct interval_tree_node *next;
3179
3180                prange = container_of(node, struct svm_range, it_node);
3181                next = interval_tree_iter_next(node, start, last);
3182
3183                if (get_preferred_loc) {
3184                        if (prange->preferred_loc ==
3185                                        KFD_IOCTL_SVM_LOCATION_UNDEFINED ||
3186                            (location != KFD_IOCTL_SVM_LOCATION_UNDEFINED &&
3187                             location != prange->preferred_loc)) {
3188                                location = KFD_IOCTL_SVM_LOCATION_UNDEFINED;
3189                                get_preferred_loc = false;
3190                        } else {
3191                                location = prange->preferred_loc;
3192                        }
3193                }
3194                if (get_prefetch_loc) {
3195                        if (prange->prefetch_loc ==
3196                                        KFD_IOCTL_SVM_LOCATION_UNDEFINED ||
3197                            (prefetch_loc != KFD_IOCTL_SVM_LOCATION_UNDEFINED &&
3198                             prefetch_loc != prange->prefetch_loc)) {
3199                                prefetch_loc = KFD_IOCTL_SVM_LOCATION_UNDEFINED;
3200                                get_prefetch_loc = false;
3201                        } else {
3202                                prefetch_loc = prange->prefetch_loc;
3203                        }
3204                }
3205                if (get_accessible) {
3206                        bitmap_and(bitmap_access, bitmap_access,
3207                                   prange->bitmap_access, MAX_GPU_INSTANCE);
3208                        bitmap_and(bitmap_aip, bitmap_aip,
3209                                   prange->bitmap_aip, MAX_GPU_INSTANCE);
3210                }
3211                if (get_flags) {
3212                        flags_and &= prange->flags;
3213                        flags_or |= prange->flags;
3214                }
3215
3216                if (get_granularity && prange->granularity < granularity)
3217                        granularity = prange->granularity;
3218
3219                node = next;
3220        }
3221fill_values:
3222        mutex_unlock(&svms->lock);
3223
3224        for (i = 0; i < nattr; i++) {
3225                switch (attrs[i].type) {
3226                case KFD_IOCTL_SVM_ATTR_PREFERRED_LOC:
3227                        attrs[i].value = location;
3228                        break;
3229                case KFD_IOCTL_SVM_ATTR_PREFETCH_LOC:
3230                        attrs[i].value = prefetch_loc;
3231                        break;
3232                case KFD_IOCTL_SVM_ATTR_ACCESS:
3233                        gpuidx = kfd_process_gpuidx_from_gpuid(p,
3234                                                               attrs[i].value);
3235                        if (gpuidx < 0) {
3236                                pr_debug("invalid gpuid %x\n", attrs[i].value);
3237                                return -EINVAL;
3238                        }
3239                        if (test_bit(gpuidx, bitmap_access))
3240                                attrs[i].type = KFD_IOCTL_SVM_ATTR_ACCESS;
3241                        else if (test_bit(gpuidx, bitmap_aip))
3242                                attrs[i].type =
3243                                        KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE;
3244                        else
3245                                attrs[i].type = KFD_IOCTL_SVM_ATTR_NO_ACCESS;
3246                        break;
3247                case KFD_IOCTL_SVM_ATTR_SET_FLAGS:
3248                        attrs[i].value = flags_and;
3249                        break;
3250                case KFD_IOCTL_SVM_ATTR_CLR_FLAGS:
3251                        attrs[i].value = ~flags_or;
3252                        break;
3253                case KFD_IOCTL_SVM_ATTR_GRANULARITY:
3254                        attrs[i].value = (uint32_t)granularity;
3255                        break;
3256                }
3257        }
3258
3259        return 0;
3260}
3261
3262int
3263svm_ioctl(struct kfd_process *p, enum kfd_ioctl_svm_op op, uint64_t start,
3264          uint64_t size, uint32_t nattrs, struct kfd_ioctl_svm_attribute *attrs)
3265{
3266        int r;
3267
3268        start >>= PAGE_SHIFT;
3269        size >>= PAGE_SHIFT;
3270
3271        switch (op) {
3272        case KFD_IOCTL_SVM_OP_SET_ATTR:
3273                r = svm_range_set_attr(p, start, size, nattrs, attrs);
3274                break;
3275        case KFD_IOCTL_SVM_OP_GET_ATTR:
3276                r = svm_range_get_attr(p, start, size, nattrs, attrs);
3277                break;
3278        default:
3279                r = EINVAL;
3280                break;
3281        }
3282
3283        return r;
3284}
3285