linux/drivers/iommu/amd/io_pgtable.c
// SPDX-License-Identifier: GPL-2.0-only
/*
 * CPU-agnostic AMD IO page table allocator.
 *
 * Copyright (C) 2020 Advanced Micro Devices, Inc.
 * Author: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
 */

#define pr_fmt(fmt)     "AMD-Vi: " fmt
#define dev_fmt(fmt)    pr_fmt(fmt)

#include <linux/atomic.h>
#include <linux/bitops.h>
#include <linux/io-pgtable.h>
#include <linux/kernel.h>
#include <linux/sizes.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/dma-mapping.h>

#include <asm/barrier.h>

#include "amd_iommu_types.h"
#include "amd_iommu.h"

static void v1_tlb_flush_all(void *cookie)
{
}

static void v1_tlb_flush_walk(unsigned long iova, size_t size,
                                  size_t granule, void *cookie)
{
}

static void v1_tlb_add_page(struct iommu_iotlb_gather *gather,
                                         unsigned long iova, size_t granule,
                                         void *cookie)
{
}

static const struct iommu_flush_ops v1_flush_ops = {
        .tlb_flush_all  = v1_tlb_flush_all,
        .tlb_flush_walk = v1_tlb_flush_walk,
        .tlb_add_page   = v1_tlb_add_page,
};
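
/*
 * Note: the v1 page table does not rely on the io-pgtable TLB hooks above;
 * invalidation is driven from the AMD IOMMU driver itself (see for example
 * the domain flush at the end of iommu_v1_map_page() below), so these
 * callbacks can stay empty.
 */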

/*
 * Helper function to get the first pte of a large mapping
 */
static u64 *first_pte_l7(u64 *pte, unsigned long *page_size,
                         unsigned long *count)
{
        unsigned long pte_mask, pg_size, cnt;
        u64 *fpte;

        pg_size  = PTE_PAGE_SIZE(*pte);
        cnt      = PAGE_SIZE_PTE_COUNT(pg_size);
        pte_mask = ~((cnt << 3) - 1);
        fpte     = (u64 *)(((unsigned long)pte) & pte_mask);

        if (page_size)
                *page_size = pg_size;

        if (count)
                *count = cnt;

        return fpte;
}
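
/*
 * Example (illustrative): for a hypothetical 32 KiB large mapping,
 * PTE_PAGE_SIZE() yields 0x8000 and PAGE_SIZE_PTE_COUNT() yields 8, i.e. the
 * mapping is replicated across 8 consecutive 8-byte PTEs. pte_mask then
 * becomes ~0x3f, which rounds the PTE pointer down to a 64-byte boundary,
 * the first entry of that replicated series.
 */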

/****************************************************************************
 *
 * The functions below are used to create the page table mappings for
 * unity mapped regions.
 *
 ****************************************************************************/

static void free_page_list(struct page *freelist)
{
        while (freelist != NULL) {
                unsigned long p = (unsigned long)page_address(freelist);

                freelist = freelist->freelist;
                free_page(p);
        }
}

static struct page *free_pt_page(unsigned long pt, struct page *freelist)
{
        struct page *p = virt_to_page((void *)pt);

        p->freelist = freelist;

        return p;
}

#define DEFINE_FREE_PT_FN(LVL, FN)                                              \
static struct page *free_pt_##LVL (unsigned long __pt, struct page *freelist)   \
{                                                                               \
        unsigned long p;                                                        \
        u64 *pt;                                                                \
        int i;                                                                  \
                                                                                \
        pt = (u64 *)__pt;                                                       \
                                                                                \
        for (i = 0; i < 512; ++i) {                                             \
                /* PTE present? */                                              \
                if (!IOMMU_PTE_PRESENT(pt[i]))                                  \
                        continue;                                               \
                                                                                \
                /* Large PTE or last-level PTE? Nothing lower to free */        \
                if (PM_PTE_LEVEL(pt[i]) == 0 ||                                 \
                    PM_PTE_LEVEL(pt[i]) == 7)                                   \
                        continue;                                               \
                                                                                \
                p = (unsigned long)IOMMU_PTE_PAGE(pt[i]);                       \
                freelist = FN(p, freelist);                                     \
        }                                                                       \
                                                                                \
        return free_pt_page((unsigned long)pt, freelist);                       \
}

DEFINE_FREE_PT_FN(l2, free_pt_page)
DEFINE_FREE_PT_FN(l3, free_pt_l2)
DEFINE_FREE_PT_FN(l4, free_pt_l3)
DEFINE_FREE_PT_FN(l5, free_pt_l4)
DEFINE_FREE_PT_FN(l6, free_pt_l5)
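
/*
 * The expansions above form a simple recursion: free_pt_l6() walks a level-6
 * table and hands every present lower-level table to free_pt_l5(), and so on
 * down to free_pt_l2(), which chains the lowest-level tables directly via
 * free_pt_page(). Each helper finally chains its own table page onto the
 * freelist through page->freelist, so nothing is actually freed until the
 * caller runs free_page_list() after the IOMMU TLBs have been flushed.
 */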
 127
 128static struct page *free_sub_pt(unsigned long root, int mode,
 129                                struct page *freelist)
 130{
 131        switch (mode) {
 132        case PAGE_MODE_NONE:
 133        case PAGE_MODE_7_LEVEL:
 134                break;
 135        case PAGE_MODE_1_LEVEL:
 136                freelist = free_pt_page(root, freelist);
 137                break;
 138        case PAGE_MODE_2_LEVEL:
 139                freelist = free_pt_l2(root, freelist);
 140                break;
 141        case PAGE_MODE_3_LEVEL:
 142                freelist = free_pt_l3(root, freelist);
 143                break;
 144        case PAGE_MODE_4_LEVEL:
 145                freelist = free_pt_l4(root, freelist);
 146                break;
 147        case PAGE_MODE_5_LEVEL:
 148                freelist = free_pt_l5(root, freelist);
 149                break;
 150        case PAGE_MODE_6_LEVEL:
 151                freelist = free_pt_l6(root, freelist);
 152                break;
 153        default:
 154                BUG();
 155        }
 156
 157        return freelist;
 158}
 159
 160void amd_iommu_domain_set_pgtable(struct protection_domain *domain,
 161                                  u64 *root, int mode)
 162{
 163        u64 pt_root;
 164
 165        /* lowest 3 bits encode pgtable mode */
 166        pt_root = mode & 7;
 167        pt_root |= (u64)root;
 168
 169        amd_iommu_domain_set_pt_root(domain, pt_root);
 170}
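
/*
 * Illustrative encoding: the root table is page aligned, so its low 12 bits
 * are zero and the lowest 3 bits are free to carry the mode. A 3-level table
 * with a (hypothetical) root pointer of 0xffff888012345000 is therefore
 * published as pt_root == 0xffff888012345003.
 */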

/*
 * This function is used to add another level to an IO page table. Adding
 * another level increases the size of the address space by 9 bits to a size up
 * to 64 bits.
 */
static bool increase_address_space(struct protection_domain *domain,
                                   unsigned long address,
                                   gfp_t gfp)
{
        unsigned long flags;
        bool ret = true;
        u64 *pte;

        pte = (void *)get_zeroed_page(gfp);
        if (!pte)
                return false;

        spin_lock_irqsave(&domain->lock, flags);

        if (address <= PM_LEVEL_SIZE(domain->iop.mode))
                goto out;

        ret = false;
        if (WARN_ON_ONCE(domain->iop.mode == PAGE_MODE_6_LEVEL))
                goto out;

        *pte = PM_LEVEL_PDE(domain->iop.mode, iommu_virt_to_phys(domain->iop.root));

        domain->iop.root  = pte;
        domain->iop.mode += 1;
        amd_iommu_update_and_flush_device_table(domain);
        amd_iommu_domain_flush_complete(domain);

        /*
         * Device Table needs to be updated and flushed before the new root can
         * be published.
         */
        amd_iommu_domain_set_pgtable(domain, pte, domain->iop.mode);

        pte = NULL;
        ret = true;

out:
        spin_unlock_irqrestore(&domain->lock, flags);
        free_page((unsigned long)pte);

        return ret;
}
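
/*
 * Illustrative sizes: a 3-level table covers a 39-bit (512 GiB) IOVA space, a
 * 4-level table covers 48 bits, and so on, 9 bits per added level. Growing
 * from e.g. 3 to 4 levels works by making the old root table the single
 * populated entry of the freshly zeroed new root, so existing mappings remain
 * valid while the larger address space is published.
 */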

static u64 *alloc_pte(struct protection_domain *domain,
                      unsigned long address,
                      unsigned long page_size,
                      u64 **pte_page,
                      gfp_t gfp,
                      bool *updated)
{
        int level, end_lvl;
        u64 *pte, *page;

        BUG_ON(!is_power_of_2(page_size));

        while (address > PM_LEVEL_SIZE(domain->iop.mode)) {
                /*
                 * Return an error if there is no memory to update the
                 * page-table.
                 */
                if (!increase_address_space(domain, address, gfp))
                        return NULL;
        }

        level   = domain->iop.mode - 1;
        pte     = &domain->iop.root[PM_LEVEL_INDEX(level, address)];
        address = PAGE_SIZE_ALIGN(address, page_size);
        end_lvl = PAGE_SIZE_LEVEL(page_size);

        while (level > end_lvl) {
                u64 __pte, __npte;
                int pte_level;

                __pte     = *pte;
                pte_level = PM_PTE_LEVEL(__pte);

                /*
                 * If we replace a series of large PTEs, we need
                 * to tear down all of them.
                 */
                if (IOMMU_PTE_PRESENT(__pte) &&
                    pte_level == PAGE_MODE_7_LEVEL) {
                        unsigned long count, i;
                        u64 *lpte;

                        lpte = first_pte_l7(pte, NULL, &count);

                        /*
                         * Unmap the replicated PTEs that still match the
                         * original large mapping
                         */
                        for (i = 0; i < count; ++i)
                                cmpxchg64(&lpte[i], __pte, 0ULL);

                        *updated = true;
                        continue;
                }

                if (!IOMMU_PTE_PRESENT(__pte) ||
                    pte_level == PAGE_MODE_NONE) {
                        page = (u64 *)get_zeroed_page(gfp);

                        if (!page)
                                return NULL;

                        __npte = PM_LEVEL_PDE(level, iommu_virt_to_phys(page));

                        /* pte could have been changed somewhere. */
                        if (cmpxchg64(pte, __pte, __npte) != __pte)
                                free_page((unsigned long)page);
                        else if (IOMMU_PTE_PRESENT(__pte))
                                *updated = true;

                        continue;
                }

                /* No level skipping support yet */
                if (pte_level != level)
                        return NULL;

                level -= 1;

                pte = IOMMU_PTE_PAGE(__pte);

                if (pte_page && level == end_lvl)
                        *pte_page = pte;

                pte = &pte[PM_LEVEL_INDEX(level, address)];
        }

        return pte;
}
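
/*
 * Walk sketch (illustrative): mapping a 4 KiB page into a 3-level table
 * starts at level 2 with end_lvl == 0, descends two levels and allocates any
 * missing intermediate tables on the way down. Newly allocated tables are
 * published with cmpxchg64() so a concurrent mapper racing on the same slot
 * simply frees its page and retries, and *updated is set whenever a present
 * PTE (including a replicated large one) had to be torn down first.
 */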

/*
 * This function checks if there is a PTE for a given dma address. If
 * there is one, it returns the pointer to it.
 */
static u64 *fetch_pte(struct amd_io_pgtable *pgtable,
                      unsigned long address,
                      unsigned long *page_size)
{
        int level;
        u64 *pte;

        *page_size = 0;

        if (address > PM_LEVEL_SIZE(pgtable->mode))
                return NULL;

        level      =  pgtable->mode - 1;
        pte        = &pgtable->root[PM_LEVEL_INDEX(level, address)];
        *page_size =  PTE_LEVEL_PAGE_SIZE(level);

        while (level > 0) {

                /* Not Present */
                if (!IOMMU_PTE_PRESENT(*pte))
                        return NULL;

                /* Large PTE or last-level PTE reached */
                if (PM_PTE_LEVEL(*pte) == 7 ||
                    PM_PTE_LEVEL(*pte) == 0)
                        break;

                /* No level skipping support yet */
                if (PM_PTE_LEVEL(*pte) != level)
                        return NULL;

                level -= 1;

                /* Walk to the next level */
                pte        = IOMMU_PTE_PAGE(*pte);
                pte        = &pte[PM_LEVEL_INDEX(level, address)];
                *page_size = PTE_LEVEL_PAGE_SIZE(level);
        }

        /*
         * If we have a series of large PTEs, make
         * sure to return a pointer to the first one.
         */
        if (PM_PTE_LEVEL(*pte) == PAGE_MODE_7_LEVEL)
                pte = first_pte_l7(pte, page_size, NULL);

        return pte;
}
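
/*
 * Example (illustrative): looking up an IOVA that lies inside a 2 MiB mapping
 * of a 3-level table stops at level 1 with *page_size == 2 MiB; callers such
 * as iommu_v1_unmap_page() then derive the number of affected entries via
 * PAGE_SIZE_PTE_COUNT(). For replicated (level 7) series, first_pte_l7()
 * additionally corrects both the returned pointer and *page_size to describe
 * the whole large mapping rather than the single entry the walk landed on.
 */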

static struct page *free_clear_pte(u64 *pte, u64 pteval, struct page *freelist)
{
        unsigned long pt;
        int mode;

        while (cmpxchg64(pte, pteval, 0) != pteval) {
                pr_warn("IOMMU pte changed since we read it\n");
                pteval = *pte;
        }

        if (!IOMMU_PTE_PRESENT(pteval))
                return freelist;

        pt   = (unsigned long)IOMMU_PTE_PAGE(pteval);
        mode = IOMMU_PTE_MODE(pteval);

        return free_sub_pt(pt, mode, freelist);
}

/*
 * Generic mapping function. It maps a physical address into a DMA
 * address space and allocates the page table pages if necessary.
 * In the future it can be extended to a generic mapping function
 * supporting all features of AMD IOMMU page tables like level skipping
 * and full 64 bit address spaces.
 */
static int iommu_v1_map_page(struct io_pgtable_ops *ops, unsigned long iova,
                          phys_addr_t paddr, size_t size, int prot, gfp_t gfp)
{
        struct protection_domain *dom = io_pgtable_ops_to_domain(ops);
        struct page *freelist = NULL;
        bool updated = false;
        u64 __pte, *pte;
        int ret, i, count;

        BUG_ON(!IS_ALIGNED(iova, size));
        BUG_ON(!IS_ALIGNED(paddr, size));

        ret = -EINVAL;
        if (!(prot & IOMMU_PROT_MASK))
                goto out;

        count = PAGE_SIZE_PTE_COUNT(size);
        pte   = alloc_pte(dom, iova, size, NULL, gfp, &updated);

        ret = -ENOMEM;
        if (!pte)
                goto out;

        for (i = 0; i < count; ++i)
                freelist = free_clear_pte(&pte[i], pte[i], freelist);

        if (freelist != NULL)
                updated = true;

        if (count > 1) {
                __pte = PAGE_SIZE_PTE(__sme_set(paddr), size);
                __pte |= PM_LEVEL_ENC(7) | IOMMU_PTE_PR | IOMMU_PTE_FC;
        } else
                __pte = __sme_set(paddr) | IOMMU_PTE_PR | IOMMU_PTE_FC;

        if (prot & IOMMU_PROT_IR)
                __pte |= IOMMU_PTE_IR;
        if (prot & IOMMU_PROT_IW)
                __pte |= IOMMU_PTE_IW;

        for (i = 0; i < count; ++i)
                pte[i] = __pte;

        ret = 0;

out:
        if (updated) {
                unsigned long flags;

                spin_lock_irqsave(&dom->lock, flags);
                /*
                 * Flush domain TLB(s) and wait for completion. Any Device-Table
                 * Updates and flushing already happened in
                 * increase_address_space().
                 */
                amd_iommu_domain_flush_tlb_pde(dom);
                amd_iommu_domain_flush_complete(dom);
                spin_unlock_irqrestore(&dom->lock, flags);
        }

        /* Everything flushed out, free pages now */
        free_page_list(freelist);

        return ret;
}
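
/*
 * Usage sketch (illustrative): callers reach this through the ops installed
 * in v1_alloc_pgtable(), roughly
 *
 *     ops->map(ops, iova, paddr, SZ_2M, IOMMU_PROT_IR | IOMMU_PROT_IW,
 *              GFP_KERNEL);
 *
 * which installs a single 2 MiB leaf PTE, while an odd power-of-two size such
 * as 32 KiB is encoded as a level 7 PTE replicated across 8 consecutive
 * entries. At least one of IOMMU_PROT_IR/IOMMU_PROT_IW must be set, otherwise
 * the call fails with -EINVAL.
 */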

static unsigned long iommu_v1_unmap_page(struct io_pgtable_ops *ops,
                                      unsigned long iova,
                                      size_t size,
                                      struct iommu_iotlb_gather *gather)
{
        struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
        unsigned long long unmapped;
        unsigned long unmap_size;
        u64 *pte;

        BUG_ON(!is_power_of_2(size));

        unmapped = 0;

        while (unmapped < size) {
                pte = fetch_pte(pgtable, iova, &unmap_size);
                if (pte) {
                        int i, count;

                        count = PAGE_SIZE_PTE_COUNT(unmap_size);
                        for (i = 0; i < count; i++)
                                pte[i] = 0ULL;
                }

                iova = (iova & ~(unmap_size - 1)) + unmap_size;
                unmapped += unmap_size;
        }

        BUG_ON(unmapped && !is_power_of_2(unmapped));

        return unmapped;
}

static phys_addr_t iommu_v1_iova_to_phys(struct io_pgtable_ops *ops, unsigned long iova)
{
        struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
        unsigned long offset_mask, pte_pgsize;
        u64 *pte, __pte;

        pte = fetch_pte(pgtable, iova, &pte_pgsize);

        if (!pte || !IOMMU_PTE_PRESENT(*pte))
                return 0;

        offset_mask = pte_pgsize - 1;
        __pte       = __sme_clr(*pte & PM_ADDR_MASK);

        return (__pte & ~offset_mask) | (iova & offset_mask);
}
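
/*
 * Example (illustrative): for an IOVA backed by a 2 MiB mapping, fetch_pte()
 * reports pte_pgsize == 2 MiB, so offset_mask is 0x1fffff and the result is
 * the 2 MiB-aligned physical address taken from the PTE plus the low 21 bits
 * of the IOVA.
 */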

/*
 * ----------------------------------------------------
 */
static void v1_free_pgtable(struct io_pgtable *iop)
{
        struct amd_io_pgtable *pgtable = container_of(iop, struct amd_io_pgtable, iop);
        struct protection_domain *dom;
        struct page *freelist = NULL;
        unsigned long root;

        if (pgtable->mode == PAGE_MODE_NONE)
                return;

        dom = container_of(pgtable, struct protection_domain, iop);

        /* Update data structure */
        amd_iommu_domain_clr_pt_root(dom);

        /* Make changes visible to IOMMUs */
        amd_iommu_domain_update(dom);

        /* Page-table is not visible to IOMMU anymore, so free it */
        BUG_ON(pgtable->mode < PAGE_MODE_NONE ||
               pgtable->mode > PAGE_MODE_6_LEVEL);

        root = (unsigned long)pgtable->root;
        freelist = free_sub_pt(root, pgtable->mode, freelist);

        free_page_list(freelist);
}

static struct io_pgtable *v1_alloc_pgtable(struct io_pgtable_cfg *cfg, void *cookie)
{
        struct amd_io_pgtable *pgtable = io_pgtable_cfg_to_data(cfg);

        cfg->pgsize_bitmap  = AMD_IOMMU_PGSIZES;
        cfg->ias            = IOMMU_IN_ADDR_BIT_SIZE;
        cfg->oas            = IOMMU_OUT_ADDR_BIT_SIZE;
        cfg->tlb            = &v1_flush_ops;

        pgtable->iop.ops.map          = iommu_v1_map_page;
        pgtable->iop.ops.unmap        = iommu_v1_unmap_page;
        pgtable->iop.ops.iova_to_phys = iommu_v1_iova_to_phys;

        return &pgtable->iop;
}

struct io_pgtable_init_fns io_pgtable_amd_iommu_v1_init_fns = {
        .alloc  = v1_alloc_pgtable,
        .free   = v1_free_pgtable,
};
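
/*
 * The io-pgtable core looks this structure up by format, so the AMD IOMMU
 * driver obtains the v1 page-table ops with something along the lines of
 * alloc_io_pgtable_ops(AMD_IOMMU_V1, &cfg, cookie) and later releases them
 * with free_io_pgtable_ops(), which ends up in v1_free_pgtable() above.
 */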