linux/drivers/misc/ocxl/link.c
// SPDX-License-Identifier: GPL-2.0+
// Copyright 2017 IBM Corp.
#include <linux/sched/mm.h>
#include <linux/mutex.h>
#include <linux/mm.h>
#include <linux/mm_types.h>
#include <linux/mmu_context.h>
#include <linux/mmu_notifier.h>
#include <asm/copro.h>
#include <asm/pnv-ocxl.h>
#include <asm/xive.h>
#include <misc/ocxl.h>
#include "ocxl_internal.h"
#include "trace.h"

#define SPA_PASID_BITS          15
#define SPA_PASID_MAX           ((1 << SPA_PASID_BITS) - 1)
#define SPA_PE_MASK             SPA_PASID_MAX
#define SPA_SPA_SIZE_LOG        22 /* Each SPA is 4 MB */

#define SPA_CFG_SF              (1ull << (63-0))
#define SPA_CFG_TA              (1ull << (63-1))
#define SPA_CFG_HV              (1ull << (63-3))
#define SPA_CFG_UV              (1ull << (63-4))
#define SPA_CFG_XLAT_hpt        (0ull << (63-6)) /* Hashed page table (HPT) mode */
#define SPA_CFG_XLAT_roh        (2ull << (63-6)) /* Radix on HPT mode */
#define SPA_CFG_XLAT_ror        (3ull << (63-6)) /* Radix on Radix mode */
#define SPA_CFG_PR              (1ull << (63-49))
#define SPA_CFG_TC              (1ull << (63-54))
#define SPA_CFG_DR              (1ull << (63-59))

#define SPA_XSL_TF              (1ull << (63-3))  /* Translation fault */
#define SPA_XSL_S               (1ull << (63-38)) /* Store operation */

#define SPA_PE_VALID            0x80000000

struct ocxl_link;

struct pe_data {
        struct mm_struct *mm;
        /* callback to trigger when a translation fault occurs */
        void (*xsl_err_cb)(void *data, u64 addr, u64 dsisr);
        /* opaque pointer to be passed to the above callback */
        void *xsl_err_data;
        struct rcu_head rcu;
        struct ocxl_link *link;
        struct mmu_notifier mmu_notifier;
};

struct spa {
        struct ocxl_process_element *spa_mem;
        int spa_order;
        struct mutex spa_lock;
        struct radix_tree_root pe_tree; /* Maps PE handles to pe_data */
        char *irq_name;
        int virq;
        void __iomem *reg_dsisr;
        void __iomem *reg_dar;
        void __iomem *reg_tfc;
        void __iomem *reg_pe_handle;
        /*
         * The following fields are used by the memory fault
         * interrupt handler. We can only have one interrupt at a
         * time: the NPU won't raise another interrupt until the
         * previous one has been acked by writing to the TFC register.
         */
        struct xsl_fault {
                struct work_struct fault_work;
                u64 pe;
                u64 dsisr;
                u64 dar;
                struct pe_data pe_data;
        } xsl_fault;
};

/*
 * An OpenCAPI link can be used by several PCI functions. We have
 * one link per device slot.
 *
 * A linked list of OpenCAPI links should suffice, as there's a
 * limited number of OpenCAPI slots on a system and lookup is only
 * done when a device is probed.
 */
struct ocxl_link {
        struct list_head list;
        struct kref ref;
        int domain;
        int bus;
        int dev;
        void __iomem *arva;     /* ATSD register virtual address */
        spinlock_t atsd_lock;   /* to serialize shootdowns */
        atomic_t irq_available;
        struct spa *spa;
        void *platform_data;
};

static struct list_head links_list = LIST_HEAD_INIT(links_list);
static DEFINE_MUTEX(links_list_lock);

enum xsl_response {
        CONTINUE,
        ADDRESS_ERROR,
        RESTART,
};

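/*
 * Read the fault information latched by the XSL when a translation
 * fault interrupt is raised: the DSISR and DAR registers describe the
 * fault, and the PE handle register identifies the faulting context.
 */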
static void read_irq(struct spa *spa, u64 *dsisr, u64 *dar, u64 *pe)
{
        u64 reg;

        *dsisr = in_be64(spa->reg_dsisr);
        *dar = in_be64(spa->reg_dar);
        reg = in_be64(spa->reg_pe_handle);
        *pe = reg & SPA_PE_MASK;
}

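/*
 * Acknowledge the translation fault by writing the response to the TFC
 * register, which allows the NPU to raise the next interrupt.
 */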
static void ack_irq(struct spa *spa, enum xsl_response r)
{
        u64 reg = 0;

        /* continue is not supported */
        if (r == RESTART)
                reg = PPC_BIT(31);
        else if (r == ADDRESS_ERROR)
                reg = PPC_BIT(30);
        else
                WARN(1, "Invalid irq response %d\n", r);

        if (reg) {
                trace_ocxl_fault_ack(spa->spa_mem, spa->xsl_fault.pe,
                                spa->xsl_fault.dsisr, spa->xsl_fault.dar, reg);
                out_be64(spa->reg_tfc, reg);
        }
}

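/*
 * Bottom half of the translation fault handler. It runs in process
 * context, resolves the fault with copro_handle_mm_fault(), preloads
 * the hash table when not in radix mode, then acknowledges the
 * interrupt with either RESTART or ADDRESS_ERROR.
 */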
static void xsl_fault_handler_bh(struct work_struct *fault_work)
{
        vm_fault_t flt = 0;
        unsigned long access, flags, inv_flags = 0;
        enum xsl_response r;
        struct xsl_fault *fault = container_of(fault_work, struct xsl_fault,
                                        fault_work);
        struct spa *spa = container_of(fault, struct spa, xsl_fault);
        int rc;

        /*
         * We must release the reference on mm_users (taken in the
         * memory fault interrupt handler) whenever we exit this
         * function.
         */
        rc = copro_handle_mm_fault(fault->pe_data.mm, fault->dar, fault->dsisr,
                                &flt);
        if (rc) {
                pr_debug("copro_handle_mm_fault failed: %d\n", rc);
                if (fault->pe_data.xsl_err_cb) {
                        fault->pe_data.xsl_err_cb(
                                fault->pe_data.xsl_err_data,
                                fault->dar, fault->dsisr);
                }
                r = ADDRESS_ERROR;
                goto ack;
        }

        if (!radix_enabled()) {
                /*
                 * update_mmu_cache() will not have loaded the hash
                 * since current->trap is not a 0x400 or 0x300, so
                 * just call hash_page_mm() here.
                 */
                access = _PAGE_PRESENT | _PAGE_READ;
                if (fault->dsisr & SPA_XSL_S)
                        access |= _PAGE_WRITE;

                if (get_region_id(fault->dar) != USER_REGION_ID)
                        access |= _PAGE_PRIVILEGED;

                local_irq_save(flags);
                hash_page_mm(fault->pe_data.mm, fault->dar, access, 0x300,
                        inv_flags);
                local_irq_restore(flags);
        }
        r = RESTART;
ack:
        mmput(fault->pe_data.mm);
        ack_irq(spa, r);
}

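/*
 * Top half of the translation fault handler. It reads the fault
 * registers, looks up the pe_data for the faulting PE handle and, if a
 * valid mm is found, takes a reference on it and schedules the bottom
 * half. Otherwise the fault is acked immediately with ADDRESS_ERROR.
 */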
static irqreturn_t xsl_fault_handler(int irq, void *data)
{
        struct ocxl_link *link = (struct ocxl_link *) data;
        struct spa *spa = link->spa;
        u64 dsisr, dar, pe_handle;
        struct pe_data *pe_data;
        struct ocxl_process_element *pe;
        int pid;
        bool schedule = false;

        read_irq(spa, &dsisr, &dar, &pe_handle);
        trace_ocxl_fault(spa->spa_mem, pe_handle, dsisr, dar, -1);

        WARN_ON(pe_handle > SPA_PE_MASK);
        pe = spa->spa_mem + pe_handle;
        pid = be32_to_cpu(pe->pid);
        /*
         * We could be reading all null values here if the PE is being
         * removed while an interrupt kicks in. It's not supposed to
         * happen if the driver notified the AFU to terminate the
         * PASID, and the AFU waited for pending operations before
         * acknowledging. But even if it happens, we won't find a
         * memory context below and will fail silently, so it should
         * be ok.
         */
        if (!(dsisr & SPA_XSL_TF)) {
                WARN(1, "Invalid xsl interrupt fault register %#llx\n", dsisr);
                ack_irq(spa, ADDRESS_ERROR);
                return IRQ_HANDLED;
        }

        rcu_read_lock();
        pe_data = radix_tree_lookup(&spa->pe_tree, pe_handle);
        if (!pe_data) {
                /*
                 * Could only happen if the driver didn't notify the
                 * AFU about PASID termination before removing the PE,
                 * or the AFU didn't wait for all memory accesses to
                 * have completed.
                 *
                 * Either way, we fail early, but we shouldn't log an
                 * error message, as it is a valid (if unexpected)
                 * scenario.
                 */
                rcu_read_unlock();
                pr_debug("Unknown mm context for xsl interrupt\n");
                ack_irq(spa, ADDRESS_ERROR);
                return IRQ_HANDLED;
        }

        if (!pe_data->mm) {
                /*
                 * Translation fault from a kernel context - an OpenCAPI
                 * device tried to access a bad kernel address.
                 */
                rcu_read_unlock();
                pr_warn("Unresolved OpenCAPI xsl fault in kernel context\n");
                ack_irq(spa, ADDRESS_ERROR);
                return IRQ_HANDLED;
        }
        WARN_ON(pe_data->mm->context.id != pid);

        if (mmget_not_zero(pe_data->mm)) {
                spa->xsl_fault.pe = pe_handle;
                spa->xsl_fault.dar = dar;
                spa->xsl_fault.dsisr = dsisr;
                spa->xsl_fault.pe_data = *pe_data;
                schedule = true;
                /* mm_users count released by bottom half */
        }
        rcu_read_unlock();
        if (schedule)
                schedule_work(&spa->xsl_fault.fault_work);
        else
                ack_irq(spa, ADDRESS_ERROR);
        return IRQ_HANDLED;
}

static void unmap_irq_registers(struct spa *spa)
{
        pnv_ocxl_unmap_xsl_regs(spa->reg_dsisr, spa->reg_dar, spa->reg_tfc,
                                spa->reg_pe_handle);
}

static int map_irq_registers(struct pci_dev *dev, struct spa *spa)
{
        return pnv_ocxl_map_xsl_regs(dev, &spa->reg_dsisr, &spa->reg_dar,
                                &spa->reg_tfc, &spa->reg_pe_handle);
}

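/*
 * Set up the translation fault interrupt for a link: map the XSL
 * registers, create a mapping for the hardware interrupt and register
 * xsl_fault_handler() on it.
 */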
static int setup_xsl_irq(struct pci_dev *dev, struct ocxl_link *link)
{
        struct spa *spa = link->spa;
        int rc;
        int hwirq;

        rc = pnv_ocxl_get_xsl_irq(dev, &hwirq);
        if (rc)
                return rc;

        rc = map_irq_registers(dev, spa);
        if (rc)
                return rc;

        spa->irq_name = kasprintf(GFP_KERNEL, "ocxl-xsl-%x-%x-%x",
                                link->domain, link->bus, link->dev);
        if (!spa->irq_name) {
                dev_err(&dev->dev, "Can't allocate name for xsl interrupt\n");
                rc = -ENOMEM;
                goto err_xsl;
        }
        /*
         * At some point, we'll need to look into allowing a higher
         * number of interrupts. Could we have an IRQ domain per link?
         */
        spa->virq = irq_create_mapping(NULL, hwirq);
        if (!spa->virq) {
                dev_err(&dev->dev,
                        "irq_create_mapping failed for translation interrupt\n");
                rc = -EINVAL;
                goto err_name;
        }

        dev_dbg(&dev->dev, "hwirq %d mapped to virq %d\n", hwirq, spa->virq);

        rc = request_irq(spa->virq, xsl_fault_handler, 0, spa->irq_name,
                        link);
        if (rc) {
                dev_err(&dev->dev,
                        "request_irq failed for translation interrupt: %d\n",
                        rc);
                rc = -EINVAL;
                goto err_mapping;
        }
        return 0;

err_mapping:
        irq_dispose_mapping(spa->virq);
err_name:
        kfree(spa->irq_name);
err_xsl:
        unmap_irq_registers(spa);
        return rc;
}

static void release_xsl_irq(struct ocxl_link *link)
{
        struct spa *spa = link->spa;

        if (spa->virq) {
                free_irq(spa->virq, link);
                irq_dispose_mapping(spa->virq);
        }
        kfree(spa->irq_name);
        unmap_irq_registers(spa);
}

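/*
 * Allocate the Shared Process Area for a link: a 4MB table of process
 * elements, indexed by PE handle, describing the contexts known to the
 * translation service (XSL).
 */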
static int alloc_spa(struct pci_dev *dev, struct ocxl_link *link)
{
        struct spa *spa;

        spa = kzalloc(sizeof(struct spa), GFP_KERNEL);
        if (!spa)
                return -ENOMEM;

        mutex_init(&spa->spa_lock);
        INIT_RADIX_TREE(&spa->pe_tree, GFP_KERNEL);
        INIT_WORK(&spa->xsl_fault.fault_work, xsl_fault_handler_bh);

        spa->spa_order = SPA_SPA_SIZE_LOG - PAGE_SHIFT;
        spa->spa_mem = (struct ocxl_process_element *)
                __get_free_pages(GFP_KERNEL | __GFP_ZERO, spa->spa_order);
        if (!spa->spa_mem) {
                dev_err(&dev->dev, "Can't allocate Shared Process Area\n");
                kfree(spa);
                return -ENOMEM;
        }
        pr_debug("Allocated SPA for %x:%x:%x at %p\n", link->domain, link->bus,
                link->dev, spa->spa_mem);

        link->spa = spa;
        return 0;
}

static void free_spa(struct ocxl_link *link)
{
        struct spa *spa = link->spa;

        pr_debug("Freeing SPA for %x:%x:%x\n", link->domain, link->bus,
                link->dev);

        if (spa && spa->spa_mem) {
                free_pages((unsigned long) spa->spa_mem, spa->spa_order);
                kfree(spa);
                link->spa = NULL;
        }
}

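/*
 * Allocate and initialize a link: SPA, translation fault interrupt,
 * platform data and (optionally) the mapping of the ATSD registers
 * used for MMIO-based TLB invalidations.
 */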
static int alloc_link(struct pci_dev *dev, int PE_mask, struct ocxl_link **out_link)
{
        struct ocxl_link *link;
        int rc;

        link = kzalloc(sizeof(struct ocxl_link), GFP_KERNEL);
        if (!link)
                return -ENOMEM;

        kref_init(&link->ref);
        link->domain = pci_domain_nr(dev->bus);
        link->bus = dev->bus->number;
        link->dev = PCI_SLOT(dev->devfn);
        atomic_set(&link->irq_available, MAX_IRQ_PER_LINK);
        spin_lock_init(&link->atsd_lock);

        rc = alloc_spa(dev, link);
        if (rc)
                goto err_free;

        rc = setup_xsl_irq(dev, link);
        if (rc)
                goto err_spa;

        /* platform-specific hook */
        rc = pnv_ocxl_spa_setup(dev, link->spa->spa_mem, PE_mask,
                                &link->platform_data);
        if (rc)
                goto err_xsl_irq;

        /*
         * If link->arva is not defined, MMIO registers are not used to
         * generate TLB invalidates and PowerBus snooping is enabled.
         * Otherwise, PowerBus snooping is disabled and TLB invalidates
         * are initiated using the MMIO registers.
         */
        pnv_ocxl_map_lpar(dev, mfspr(SPRN_LPID), 0, &link->arva);

        *out_link = link;
        return 0;

err_xsl_irq:
        release_xsl_irq(link);
err_spa:
        free_spa(link);
err_free:
        kfree(link);
        return rc;
}

static void free_link(struct ocxl_link *link)
{
        release_xsl_irq(link);
        free_spa(link);
        kfree(link);
}

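/*
 * Get the link for a device. Functions sharing the same OpenCAPI slot
 * share the same link, so either return an existing link with an extra
 * reference, or allocate a new one and add it to the list.
 */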
int ocxl_link_setup(struct pci_dev *dev, int PE_mask, void **link_handle)
{
        int rc = 0;
        struct ocxl_link *link;

        mutex_lock(&links_list_lock);
        list_for_each_entry(link, &links_list, list) {
                /* The functions of a device all share the same link */
                if (link->domain == pci_domain_nr(dev->bus) &&
                        link->bus == dev->bus->number &&
                        link->dev == PCI_SLOT(dev->devfn)) {
                        kref_get(&link->ref);
                        *link_handle = link;
                        goto unlock;
                }
        }
        rc = alloc_link(dev, PE_mask, &link);
        if (rc)
                goto unlock;

        list_add(&link->list, &links_list);
        *link_handle = link;
unlock:
        mutex_unlock(&links_list_lock);
        return rc;
}
EXPORT_SYMBOL_GPL(ocxl_link_setup);

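/*
 * kref release callback, called with links_list_lock held: unmap the
 * ATSD registers, remove the link from the list and free it.
 */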
static void release_xsl(struct kref *ref)
{
        struct ocxl_link *link = container_of(ref, struct ocxl_link, ref);

        if (link->arva) {
                pnv_ocxl_unmap_lpar(link->arva);
                link->arva = NULL;
        }

        list_del(&link->list);
        /* call platform code before releasing data */
        pnv_ocxl_spa_release(link->platform_data);
        free_link(link);
}

void ocxl_link_release(struct pci_dev *dev, void *link_handle)
{
        struct ocxl_link *link = (struct ocxl_link *) link_handle;

        mutex_lock(&links_list_lock);
        kref_put(&link->ref, release_xsl);
        mutex_unlock(&links_list_lock);
}
EXPORT_SYMBOL_GPL(ocxl_link_release);

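/*
 * MMU notifier callback: when translations are invalidated for a
 * registered mm, issue an ATSD TLB invalidate to the device for each
 * page of the range, so the device doesn't keep using stale
 * translations.
 */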
static void invalidate_range(struct mmu_notifier *mn,
                             struct mm_struct *mm,
                             unsigned long start, unsigned long end)
{
        struct pe_data *pe_data = container_of(mn, struct pe_data, mmu_notifier);
        struct ocxl_link *link = pe_data->link;
        unsigned long addr, pid, page_size = PAGE_SIZE;

        pid = mm->context.id;
        trace_ocxl_mmu_notifier_range(start, end, pid);

        spin_lock(&link->atsd_lock);
        for (addr = start; addr < end; addr += page_size)
                pnv_ocxl_tlb_invalidate(link->arva, pid, addr, page_size);
        spin_unlock(&link->atsd_lock);
}

static const struct mmu_notifier_ops ocxl_mmu_notifier_ops = {
        .invalidate_range = invalidate_range,
};

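/*
 * Compute the config_state word for a process element, based on the
 * current CPU setup and whether the context is a kernel or a user one
 * (translation mode, 64-bit mode, problem state, ...).
 */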
static u64 calculate_cfg_state(bool kernel)
{
        u64 state;

        state = SPA_CFG_DR;
        if (mfspr(SPRN_LPCR) & LPCR_TC)
                state |= SPA_CFG_TC;
        if (radix_enabled())
                state |= SPA_CFG_XLAT_ror;
        else
                state |= SPA_CFG_XLAT_hpt;
        state |= SPA_CFG_HV;
        if (kernel) {
                if (mfmsr() & MSR_SF)
                        state |= SPA_CFG_SF;
        } else {
                state |= SPA_CFG_PR;
                if (!test_tsk_thread_flag(current, TIF_32BIT))
                        state |= SPA_CFG_SF;
        }
        return state;
}

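/*
 * Add a process element to the SPA for the given PASID, so that the
 * device can trigger address translations for that context. For user
 * contexts, also register a copro and, if ATSD registers are mapped,
 * an MMU notifier for TLB invalidations.
 */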
int ocxl_link_add_pe(void *link_handle, int pasid, u32 pidr, u32 tidr,
                u64 amr, u16 bdf, struct mm_struct *mm,
                void (*xsl_err_cb)(void *data, u64 addr, u64 dsisr),
                void *xsl_err_data)
{
        struct ocxl_link *link = (struct ocxl_link *) link_handle;
        struct spa *spa = link->spa;
        struct ocxl_process_element *pe;
        int pe_handle, rc = 0;
        struct pe_data *pe_data;

        BUILD_BUG_ON(sizeof(struct ocxl_process_element) != 128);
        if (pasid > SPA_PASID_MAX)
                return -EINVAL;

        mutex_lock(&spa->spa_lock);
        pe_handle = pasid & SPA_PE_MASK;
        pe = spa->spa_mem + pe_handle;

        if (pe->software_state) {
                rc = -EBUSY;
                goto unlock;
        }

        pe_data = kmalloc(sizeof(*pe_data), GFP_KERNEL);
        if (!pe_data) {
                rc = -ENOMEM;
                goto unlock;
        }

        pe_data->mm = mm;
        pe_data->xsl_err_cb = xsl_err_cb;
        pe_data->xsl_err_data = xsl_err_data;
        pe_data->link = link;
        pe_data->mmu_notifier.ops = &ocxl_mmu_notifier_ops;

        memset(pe, 0, sizeof(struct ocxl_process_element));
        pe->config_state = cpu_to_be64(calculate_cfg_state(pidr == 0));
        pe->pasid = cpu_to_be32(pasid << (31 - 19));
        pe->bdf = cpu_to_be16(bdf);
        pe->lpid = cpu_to_be32(mfspr(SPRN_LPID));
        pe->pid = cpu_to_be32(pidr);
        pe->tid = cpu_to_be32(tidr);
        pe->amr = cpu_to_be64(amr);
        pe->software_state = cpu_to_be32(SPA_PE_VALID);

        /*
         * For user contexts, register a copro so that TLBIs are seen
         * by the nest MMU. If we have a kernel context, TLBIs are
         * already global.
         */
        if (mm) {
                mm_context_add_copro(mm);
                if (link->arva) {
                        /*
                         * Use MMIO registers for the TLB invalidate
                         * operations.
                         */
                        trace_ocxl_init_mmu_notifier(pasid, mm->context.id);
                        mmu_notifier_register(&pe_data->mmu_notifier, mm);
                }
        }

        /*
         * The barrier makes sure the PE is visible in the SPA before
         * it is used by the device. It also helps with the global TLB
         * invalidation.
         */
        mb();
        radix_tree_insert(&spa->pe_tree, pe_handle, pe_data);

        /*
         * The mm must stay valid for as long as the device uses it. We
         * lower the count when the context is removed from the SPA.
         *
         * We grab mm_count (and not mm_users), as we don't want to
         * end up in a circular dependency if a process mmaps its
         * mmio, therefore incrementing the file ref count when
         * calling mmap(), and forgets to unmap before exiting. In
         * that scenario, when the kernel handles the death of the
         * process, the file is not cleaned up because unmap was not
         * called, and the mm wouldn't be freed because we would still
         * have a reference on mm_users. Incrementing mm_count solves
         * the problem.
         */
        if (mm)
                mmgrab(mm);
        trace_ocxl_context_add(current->pid, spa->spa_mem, pasid, pidr, tidr);
unlock:
        mutex_unlock(&spa->spa_lock);
        return rc;
}
EXPORT_SYMBOL_GPL(ocxl_link_add_pe);

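/*
 * Update the thread ID of an existing process element, then clear the
 * NPU context cache so the old entry cannot be reused.
 */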
int ocxl_link_update_pe(void *link_handle, int pasid, __u16 tid)
{
        struct ocxl_link *link = (struct ocxl_link *) link_handle;
        struct spa *spa = link->spa;
        struct ocxl_process_element *pe;
        int pe_handle, rc;

        if (pasid > SPA_PASID_MAX)
                return -EINVAL;

        pe_handle = pasid & SPA_PE_MASK;
        pe = spa->spa_mem + pe_handle;

        mutex_lock(&spa->spa_lock);

        pe->tid = cpu_to_be32(tid);

        /*
         * The barrier makes sure the PE is updated before we clear
         * the NPU context cache below, so that the old PE cannot be
         * reloaded erroneously.
         */
        mb();

        /*
         * hook to platform code
         * On powerpc, the entry needs to be cleared from the context
         * cache of the NPU.
         */
        rc = pnv_ocxl_spa_remove_pe_from_cache(link->platform_data, pe_handle);
        WARN_ON(rc);

        mutex_unlock(&spa->spa_lock);
        return rc;
}

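/*
 * Remove the process element for a PASID: clear the PE in the SPA,
 * flush it from the NPU context cache and drop the references taken
 * when the PE was added (copro, MMU notifier, mm).
 */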
int ocxl_link_remove_pe(void *link_handle, int pasid)
{
        struct ocxl_link *link = (struct ocxl_link *) link_handle;
        struct spa *spa = link->spa;
        struct ocxl_process_element *pe;
        struct pe_data *pe_data;
        int pe_handle, rc;

        if (pasid > SPA_PASID_MAX)
                return -EINVAL;

        /*
         * About synchronization with our memory fault handler:
         *
         * Before removing the PE, the driver is supposed to have
         * notified the AFU, which should have cleaned up and made
         * sure the PASID is no longer in use, including pending
         * interrupts. However, there's no way to be sure...
         *
         * We clear the PE and remove the context from our radix
         * tree. From that point on, any new interrupt for that
         * context will fail silently, which is ok. As mentioned
         * above, that's not expected, but it could happen if the
         * driver or AFU didn't do the right thing.
         *
         * There could still be a bottom half running, but we don't
         * need to wait/flush, as it is managing a reference count on
         * the mm it reads from the radix tree.
         */
        pe_handle = pasid & SPA_PE_MASK;
        pe = spa->spa_mem + pe_handle;

        mutex_lock(&spa->spa_lock);

        if (!(be32_to_cpu(pe->software_state) & SPA_PE_VALID)) {
                rc = -EINVAL;
                goto unlock;
        }

        trace_ocxl_context_remove(current->pid, spa->spa_mem, pasid,
                                be32_to_cpu(pe->pid), be32_to_cpu(pe->tid));

        memset(pe, 0, sizeof(struct ocxl_process_element));
        /*
         * The barrier makes sure the PE is removed from the SPA
         * before we clear the NPU context cache below, so that the
         * old PE cannot be reloaded erroneously.
         */
        mb();

        /*
         * hook to platform code
         * On powerpc, the entry needs to be cleared from the context
         * cache of the NPU.
         */
        rc = pnv_ocxl_spa_remove_pe_from_cache(link->platform_data, pe_handle);
        WARN_ON(rc);

        pe_data = radix_tree_delete(&spa->pe_tree, pe_handle);
        if (!pe_data) {
                WARN(1, "Couldn't find pe data when removing PE\n");
        } else {
                if (pe_data->mm) {
                        if (link->arva) {
                                trace_ocxl_release_mmu_notifier(pasid,
                                                                pe_data->mm->context.id);
                                mmu_notifier_unregister(&pe_data->mmu_notifier,
                                                        pe_data->mm);
                                spin_lock(&link->atsd_lock);
                                pnv_ocxl_tlb_invalidate(link->arva,
                                                        pe_data->mm->context.id,
                                                        0ull,
                                                        PAGE_SIZE);
                                spin_unlock(&link->atsd_lock);
                        }
                        mm_context_remove_copro(pe_data->mm);
                        mmdrop(pe_data->mm);
                }
                kfree_rcu(pe_data, rcu);
        }
unlock:
        mutex_unlock(&spa->spa_lock);
        return rc;
}
EXPORT_SYMBOL_GPL(ocxl_link_remove_pe);

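/*
 * Allocate a hardware interrupt for an AFU on this link, within the
 * per-link budget of MAX_IRQ_PER_LINK.
 */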
int ocxl_link_irq_alloc(void *link_handle, int *hw_irq)
{
        struct ocxl_link *link = (struct ocxl_link *) link_handle;
        int irq;

        if (atomic_dec_if_positive(&link->irq_available) < 0)
                return -ENOSPC;

        irq = xive_native_alloc_irq();
        if (!irq) {
                atomic_inc(&link->irq_available);
                return -ENXIO;
        }

        *hw_irq = irq;
        return 0;
}
EXPORT_SYMBOL_GPL(ocxl_link_irq_alloc);

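/*
 * Free a hardware interrupt previously allocated with
 * ocxl_link_irq_alloc() and return it to the link's budget.
 */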
void ocxl_link_free_irq(void *link_handle, int hw_irq)
{
        struct ocxl_link *link = (struct ocxl_link *) link_handle;

        xive_native_free_irq(hw_irq);
        atomic_inc(&link->irq_available);
}
EXPORT_SYMBOL_GPL(ocxl_link_free_irq);