linux/arch/powerpc/platforms/powernv/pci-ioda.c
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Support PCI/PCIe on PowerNV platforms
 *
 * Copyright 2011 Benjamin Herrenschmidt, IBM Corp.
 */

#undef DEBUG

#include <linux/kernel.h>
#include <linux/pci.h>
#include <linux/crash_dump.h>
#include <linux/delay.h>
#include <linux/string.h>
#include <linux/init.h>
#include <linux/memblock.h>
#include <linux/irq.h>
#include <linux/io.h>
#include <linux/msi.h>
#include <linux/iommu.h>
#include <linux/rculist.h>
#include <linux/sizes.h>

#include <asm/sections.h>
#include <asm/io.h>
#include <asm/prom.h>
#include <asm/pci-bridge.h>
#include <asm/machdep.h>
#include <asm/msi_bitmap.h>
#include <asm/ppc-pci.h>
#include <asm/opal.h>
#include <asm/iommu.h>
#include <asm/tce.h>
#include <asm/xics.h>
#include <asm/debugfs.h>
#include <asm/firmware.h>
#include <asm/pnv-pci.h>
#include <asm/mmzone.h>

#include <misc/cxl-base.h>

#include "powernv.h"
#include "pci.h"
#include "../../../../drivers/pci/pci.h"

#define PNV_IODA1_M64_NUM       16      /* Number of M64 BARs   */
#define PNV_IODA1_M64_SEGS      8       /* Segments per M64 BAR */
#define PNV_IODA1_DMA32_SEGSIZE 0x10000000

static const char * const pnv_phb_names[] = { "IODA1", "IODA2", "NPU_NVLINK",
                                              "NPU_OCAPI" };

static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable);
static void pnv_pci_configure_bus(struct pci_bus *bus);

void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
                            const char *fmt, ...)
{
        struct va_format vaf;
        va_list args;
        char pfix[32];

        va_start(args, fmt);

        vaf.fmt = fmt;
        vaf.va = &args;

        if (pe->flags & PNV_IODA_PE_DEV)
                strlcpy(pfix, dev_name(&pe->pdev->dev), sizeof(pfix));
        else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))
                sprintf(pfix, "%04x:%02x     ",
                        pci_domain_nr(pe->pbus), pe->pbus->number);
#ifdef CONFIG_PCI_IOV
        else if (pe->flags & PNV_IODA_PE_VF)
                sprintf(pfix, "%04x:%02x:%2x.%d",
                        pci_domain_nr(pe->parent_dev->bus),
                        (pe->rid & 0xff00) >> 8,
                        PCI_SLOT(pe->rid), PCI_FUNC(pe->rid));
#endif /* CONFIG_PCI_IOV */

        printk("%spci %s: [PE# %.2x] %pV",
               level, pfix, pe->pe_number, &vaf);

        va_end(args);
}

static bool pnv_iommu_bypass_disabled __read_mostly;
static bool pci_reset_phbs __read_mostly;

static int __init iommu_setup(char *str)
{
        if (!str)
                return -EINVAL;

        while (*str) {
                if (!strncmp(str, "nobypass", 8)) {
                        pnv_iommu_bypass_disabled = true;
                        pr_info("PowerNV: IOMMU bypass window disabled.\n");
                        break;
                }
                str += strcspn(str, ",");
                if (*str == ',')
                        str++;
        }

        return 0;
}
early_param("iommu", iommu_setup);
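
/*
 * Usage: booting with "iommu=nobypass" on the kernel command line sets
 * pnv_iommu_bypass_disabled, keeping every device behind TCE translation
 * instead of the 64-bit bypass window.
 */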

static int __init pci_reset_phbs_setup(char *str)
{
        pci_reset_phbs = true;
        return 0;
}

early_param("ppc_pci_reset_phbs", pci_reset_phbs_setup);

static struct pnv_ioda_pe *pnv_ioda_init_pe(struct pnv_phb *phb, int pe_no)
{
        s64 rc;

        phb->ioda.pe_array[pe_no].phb = phb;
        phb->ioda.pe_array[pe_no].pe_number = pe_no;
        phb->ioda.pe_array[pe_no].dma_setup_done = false;
        /*
         * Clear the PE frozen state as it might have been put into frozen
         * state in the last PCI remove path. It's harmless to do so when
         * the PE is already unfrozen.
         */
        rc = opal_pci_eeh_freeze_clear(phb->opal_id, pe_no,
                                       OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
        if (rc != OPAL_SUCCESS && rc != OPAL_UNSUPPORTED)
                pr_warn("%s: Error %lld unfreezing PHB#%x-PE#%x\n",
                        __func__, rc, phb->hose->global_number, pe_no);

        return &phb->ioda.pe_array[pe_no];
}

static void pnv_ioda_reserve_pe(struct pnv_phb *phb, int pe_no)
{
        if (!(pe_no >= 0 && pe_no < phb->ioda.total_pe_num)) {
                pr_warn("%s: Invalid PE %x on PHB#%x\n",
                        __func__, pe_no, phb->hose->global_number);
                return;
        }

        mutex_lock(&phb->ioda.pe_alloc_mutex);
        if (test_and_set_bit(pe_no, phb->ioda.pe_alloc))
                pr_debug("%s: PE %x was reserved on PHB#%x\n",
                         __func__, pe_no, phb->hose->global_number);
        mutex_unlock(&phb->ioda.pe_alloc_mutex);

        pnv_ioda_init_pe(phb, pe_no);
}

struct pnv_ioda_pe *pnv_ioda_alloc_pe(struct pnv_phb *phb, int count)
{
        struct pnv_ioda_pe *ret = NULL;
        int run = 0, pe, i;

        mutex_lock(&phb->ioda.pe_alloc_mutex);

        /* scan backwards for a run of @count cleared bits */
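        /*
         * Scanning from the top of the map is deliberate: M64 segments map
         * 1:1 onto PE numbers from the bottom up, so handing out dynamic
         * PEs from the high end makes collisions with segment-pinned PEs
         * less likely.
         */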
        for (pe = phb->ioda.total_pe_num - 1; pe >= 0; pe--) {
                if (test_bit(pe, phb->ioda.pe_alloc)) {
                        run = 0;
                        continue;
                }

                run++;
                if (run == count)
                        break;
        }
        if (run != count)
                goto out;

        for (i = pe; i < pe + count; i++) {
                set_bit(i, phb->ioda.pe_alloc);
                pnv_ioda_init_pe(phb, i);
        }
        ret = &phb->ioda.pe_array[pe];

out:
        mutex_unlock(&phb->ioda.pe_alloc_mutex);
        return ret;
}

void pnv_ioda_free_pe(struct pnv_ioda_pe *pe)
{
        struct pnv_phb *phb = pe->phb;
        unsigned int pe_num = pe->pe_number;

        WARN_ON(pe->pdev);
        WARN_ON(pe->npucomp); /* NPUs for nvlink are not supposed to be freed */
        kfree(pe->npucomp);
        memset(pe, 0, sizeof(struct pnv_ioda_pe));

        mutex_lock(&phb->ioda.pe_alloc_mutex);
        clear_bit(pe_num, phb->ioda.pe_alloc);
        mutex_unlock(&phb->ioda.pe_alloc_mutex);
}

/* The default M64 BAR is shared by all PEs */
static int pnv_ioda2_init_m64(struct pnv_phb *phb)
{
        const char *desc;
        struct resource *r;
        s64 rc;

        /* Configure the default M64 BAR */
        rc = opal_pci_set_phb_mem_window(phb->opal_id,
                                         OPAL_M64_WINDOW_TYPE,
                                         phb->ioda.m64_bar_idx,
                                         phb->ioda.m64_base,
                                         0, /* unused */
                                         phb->ioda.m64_size);
        if (rc != OPAL_SUCCESS) {
                desc = "configuring";
                goto fail;
        }

        /* Enable the default M64 BAR */
        rc = opal_pci_phb_mmio_enable(phb->opal_id,
                                      OPAL_M64_WINDOW_TYPE,
                                      phb->ioda.m64_bar_idx,
                                      OPAL_ENABLE_M64_SPLIT);
        if (rc != OPAL_SUCCESS) {
                desc = "enabling";
                goto fail;
        }

        /*
         * Exclude the segments for the reserved PE and the root bus PE,
         * which are the first or the last two PEs.
         */
        r = &phb->hose->mem_resources[1];
        if (phb->ioda.reserved_pe_idx == 0)
                r->start += (2 * phb->ioda.m64_segsize);
        else if (phb->ioda.reserved_pe_idx == (phb->ioda.total_pe_num - 1))
                r->end -= (2 * phb->ioda.m64_segsize);
        else
                pr_warn("  Cannot strip M64 segment for reserved PE#%x\n",
                        phb->ioda.reserved_pe_idx);

        return 0;

fail:
        pr_warn("  Failure %lld %s M64 BAR#%d\n",
                rc, desc, phb->ioda.m64_bar_idx);
        opal_pci_phb_mmio_enable(phb->opal_id,
                                 OPAL_M64_WINDOW_TYPE,
                                 phb->ioda.m64_bar_idx,
                                 OPAL_DISABLE_M64);
        return -EIO;
}

static void pnv_ioda_reserve_dev_m64_pe(struct pci_dev *pdev,
                                         unsigned long *pe_bitmap)
{
        struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus);
        struct resource *r;
        resource_size_t base, sgsz, start, end;
        int segno, i;

        base = phb->ioda.m64_base;
        sgsz = phb->ioda.m64_segsize;
        for (i = 0; i <= PCI_ROM_RESOURCE; i++) {
                r = &pdev->resource[i];
                if (!r->parent || !pnv_pci_is_m64(phb, r))
                        continue;

                start = ALIGN_DOWN(r->start - base, sgsz);
                end = ALIGN(r->end - base, sgsz);
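                /*
                 * e.g. a BAR spanning [base + 2.5 * sgsz, base + 3.2 * sgsz]
                 * rounds out to start = 2 * sgsz and end = 4 * sgsz, so
                 * segments 2 and 3 get reserved below.
                 */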
                for (segno = start / sgsz; segno < end / sgsz; segno++) {
                        if (pe_bitmap)
                                set_bit(segno, pe_bitmap);
                        else
                                pnv_ioda_reserve_pe(phb, segno);
                }
        }
}

static int pnv_ioda1_init_m64(struct pnv_phb *phb)
{
        struct resource *r;
        int index;

        /*
         * There are 16 M64 BARs, each of which has 8 segments. So
         * there are as many M64 segments as the maximum number of
         * PEs, which is 128.
         */
        for (index = 0; index < PNV_IODA1_M64_NUM; index++) {
                unsigned long base, segsz = phb->ioda.m64_segsize;
                int64_t rc;

                base = phb->ioda.m64_base +
                       index * PNV_IODA1_M64_SEGS * segsz;
                rc = opal_pci_set_phb_mem_window(phb->opal_id,
                                OPAL_M64_WINDOW_TYPE, index, base, 0,
                                PNV_IODA1_M64_SEGS * segsz);
                if (rc != OPAL_SUCCESS) {
                        pr_warn("  Error %lld setting M64 PHB#%x-BAR#%d\n",
                                rc, phb->hose->global_number, index);
                        goto fail;
                }

                rc = opal_pci_phb_mmio_enable(phb->opal_id,
                                OPAL_M64_WINDOW_TYPE, index,
                                OPAL_ENABLE_M64_SPLIT);
                if (rc != OPAL_SUCCESS) {
                        pr_warn("  Error %lld enabling M64 PHB#%x-BAR#%d\n",
                                rc, phb->hose->global_number, index);
                        goto fail;
                }
        }

        for (index = 0; index < phb->ioda.total_pe_num; index++) {
                int64_t rc;

                /*
                 * P7IOC supports M64DT, which helps map an M64 segment
                 * to a particular PE#. However, PHB3 has a fixed mapping
                 * between M64 segment and PE#. In order to have the same
                 * logic for P7IOC and PHB3, we enforce the fixed mapping
                 * between M64 segment and PE# on P7IOC as well.
                 */
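                /* e.g. PE#42 lands in M64 BAR#5, sub-segment 2 (42 = 5 * 8 + 2) */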
                rc = opal_pci_map_pe_mmio_window(phb->opal_id,
                                index, OPAL_M64_WINDOW_TYPE,
                                index / PNV_IODA1_M64_SEGS,
                                index % PNV_IODA1_M64_SEGS);
                if (rc != OPAL_SUCCESS) {
                        pr_warn("%s: Error %lld mapping M64 for PHB#%x-PE#%x\n",
                                __func__, rc, phb->hose->global_number,
                                index);
                        goto fail;
                }
        }

        /*
         * Exclude the segments for the reserved PE and the root bus PE,
         * which are the first or the last two PEs.
         */
        r = &phb->hose->mem_resources[1];
        if (phb->ioda.reserved_pe_idx == 0)
                r->start += (2 * phb->ioda.m64_segsize);
        else if (phb->ioda.reserved_pe_idx == (phb->ioda.total_pe_num - 1))
                r->end -= (2 * phb->ioda.m64_segsize);
        else
                WARN(1, "Wrong reserved PE#%x on PHB#%x\n",
                     phb->ioda.reserved_pe_idx, phb->hose->global_number);

        return 0;

fail:
        for ( ; index >= 0; index--)
                opal_pci_phb_mmio_enable(phb->opal_id,
                        OPAL_M64_WINDOW_TYPE, index, OPAL_DISABLE_M64);

        return -EIO;
}

static void pnv_ioda_reserve_m64_pe(struct pci_bus *bus,
                                    unsigned long *pe_bitmap,
                                    bool all)
{
        struct pci_dev *pdev;

        list_for_each_entry(pdev, &bus->devices, bus_list) {
                pnv_ioda_reserve_dev_m64_pe(pdev, pe_bitmap);

                if (all && pdev->subordinate)
                        pnv_ioda_reserve_m64_pe(pdev->subordinate,
                                                pe_bitmap, all);
        }
}

static struct pnv_ioda_pe *pnv_ioda_pick_m64_pe(struct pci_bus *bus, bool all)
{
        struct pnv_phb *phb = pci_bus_to_pnvhb(bus);
        struct pnv_ioda_pe *master_pe, *pe;
        unsigned long size, *pe_alloc;
        int i;

        /* Root bus shouldn't use M64 */
        if (pci_is_root_bus(bus))
                return NULL;

        /* Allocate bitmap */
        size = ALIGN(phb->ioda.total_pe_num / 8, sizeof(unsigned long));
        pe_alloc = kzalloc(size, GFP_KERNEL);
        if (!pe_alloc) {
                pr_warn("%s: Out of memory!\n", __func__);
                return NULL;
        }

        /* Figure out the PE numbers reserved by this bus's M64 segments */
        pnv_ioda_reserve_m64_pe(bus, pe_alloc, all);

        /*
         * The current bus might not own any M64 window; it might all be
         * contributed by its child buses. In that case, we don't need to
         * pick an M64-dependent PE#.
         */
        if (bitmap_empty(pe_alloc, phb->ioda.total_pe_num)) {
                kfree(pe_alloc);
                return NULL;
        }

        /*
         * Figure out the master PE and put all slave PEs on the master
         * PE's list to form a compound PE.
         */
        master_pe = NULL;
        i = -1;
        while ((i = find_next_bit(pe_alloc, phb->ioda.total_pe_num, i + 1)) <
                phb->ioda.total_pe_num) {
                pe = &phb->ioda.pe_array[i];

                phb->ioda.m64_segmap[pe->pe_number] = pe->pe_number;
                if (!master_pe) {
                        pe->flags |= PNV_IODA_PE_MASTER;
                        INIT_LIST_HEAD(&pe->slaves);
                        master_pe = pe;
                } else {
                        pe->flags |= PNV_IODA_PE_SLAVE;
                        pe->master = master_pe;
                        list_add_tail(&pe->list, &master_pe->slaves);
                }
        }

        kfree(pe_alloc);
        return master_pe;
}

static void __init pnv_ioda_parse_m64_window(struct pnv_phb *phb)
{
        struct pci_controller *hose = phb->hose;
        struct device_node *dn = hose->dn;
        struct resource *res;
        u32 m64_range[2], i;
        const __be32 *r;
        u64 pci_addr;

        if (phb->type != PNV_PHB_IODA1 && phb->type != PNV_PHB_IODA2) {
                pr_info("  M64 window not supported\n");
                return;
        }

        if (!firmware_has_feature(FW_FEATURE_OPAL)) {
                pr_info("  Firmware too old to support M64 window\n");
                return;
        }

        r = of_get_property(dn, "ibm,opal-m64-window", NULL);
        if (!r) {
                pr_info("  No <ibm,opal-m64-window> on %pOF\n",
                        dn);
                return;
        }
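
        /*
         * The property holds three 64-bit values, each as a pair of cells:
         * the PCI address (read at r), the parent address translated below
         * (r + 2) and the window size (r + 4).
         */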

        /*
         * Find the available M64 BAR range and pick the last one to
         * cover the whole 64-bit space. We support only one range.
         */
        if (of_property_read_u32_array(dn, "ibm,opal-available-m64-ranges",
                                       m64_range, 2)) {
                /* In absence of the property, assume 0..15 */
                m64_range[0] = 0;
                m64_range[1] = 16;
        }
        /* We only support 64 bits in our allocator */
        if (m64_range[1] > 63) {
                pr_warn("%s: Limiting M64 range to 63 (from %d) on PHB#%x\n",
                        __func__, m64_range[1], phb->hose->global_number);
                m64_range[1] = 63;
        }
        /* Empty range, no m64 */
        if (m64_range[1] <= m64_range[0]) {
                pr_warn("%s: M64 empty, disabling M64 usage on PHB#%x\n",
                        __func__, phb->hose->global_number);
                return;
        }

        /* Configure M64 information */
        res = &hose->mem_resources[1];
        res->name = dn->full_name;
        res->start = of_translate_address(dn, r + 2);
        res->end = res->start + of_read_number(r + 4, 2) - 1;
        res->flags = (IORESOURCE_MEM | IORESOURCE_MEM_64 | IORESOURCE_PREFETCH);
        pci_addr = of_read_number(r, 2);
        hose->mem_offset[1] = res->start - pci_addr;

        phb->ioda.m64_size = resource_size(res);
        phb->ioda.m64_segsize = phb->ioda.m64_size / phb->ioda.total_pe_num;
        phb->ioda.m64_base = pci_addr;
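        /* e.g. a 64GB M64 window on a PHB with 256 PEs gives 256MB segments */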

        /* This lines up nicely with the display from processing OF ranges */
        pr_info(" MEM 0x%016llx..0x%016llx -> 0x%016llx (M64 #%d..%d)\n",
                res->start, res->end, pci_addr, m64_range[0],
                m64_range[0] + m64_range[1] - 1);

        /* Mark all M64 used up by default */
        phb->ioda.m64_bar_alloc = (unsigned long)-1;

        /* Use last M64 BAR to cover M64 window */
        m64_range[1]--;
        phb->ioda.m64_bar_idx = m64_range[0] + m64_range[1];

        pr_info(" Using M64 #%d as default window\n", phb->ioda.m64_bar_idx);

        /* Mark remaining ones free */
        for (i = m64_range[0]; i < m64_range[1]; i++)
                clear_bit(i, &phb->ioda.m64_bar_alloc);

        /*
         * Set up the M64 init function based on the IODA version; IODA3
         * uses the IODA2 code.
         */
        if (phb->type == PNV_PHB_IODA1)
                phb->init_m64 = pnv_ioda1_init_m64;
        else
                phb->init_m64 = pnv_ioda2_init_m64;
}

static void pnv_ioda_freeze_pe(struct pnv_phb *phb, int pe_no)
{
        struct pnv_ioda_pe *pe = &phb->ioda.pe_array[pe_no];
        struct pnv_ioda_pe *slave;
        s64 rc;

        /* Fetch master PE */
        if (pe->flags & PNV_IODA_PE_SLAVE) {
                pe = pe->master;
                if (WARN_ON(!pe || !(pe->flags & PNV_IODA_PE_MASTER)))
                        return;

                pe_no = pe->pe_number;
        }

        /* Freeze master PE */
        rc = opal_pci_eeh_freeze_set(phb->opal_id,
                                     pe_no,
                                     OPAL_EEH_ACTION_SET_FREEZE_ALL);
        if (rc != OPAL_SUCCESS) {
                pr_warn("%s: Failure %lld freezing PHB#%x-PE#%x\n",
                        __func__, rc, phb->hose->global_number, pe_no);
                return;
        }

        /* Freeze slave PEs */
        if (!(pe->flags & PNV_IODA_PE_MASTER))
                return;

        list_for_each_entry(slave, &pe->slaves, list) {
                rc = opal_pci_eeh_freeze_set(phb->opal_id,
                                             slave->pe_number,
                                             OPAL_EEH_ACTION_SET_FREEZE_ALL);
                if (rc != OPAL_SUCCESS)
                        pr_warn("%s: Failure %lld freezing PHB#%x-PE#%x\n",
                                __func__, rc, phb->hose->global_number,
                                slave->pe_number);
        }
}

static int pnv_ioda_unfreeze_pe(struct pnv_phb *phb, int pe_no, int opt)
{
        struct pnv_ioda_pe *pe, *slave;
        s64 rc;

        /* Find master PE */
        pe = &phb->ioda.pe_array[pe_no];
        if (pe->flags & PNV_IODA_PE_SLAVE) {
                pe = pe->master;
                WARN_ON(!pe || !(pe->flags & PNV_IODA_PE_MASTER));
                pe_no = pe->pe_number;
        }

        /* Clear frozen state for master PE */
        rc = opal_pci_eeh_freeze_clear(phb->opal_id, pe_no, opt);
        if (rc != OPAL_SUCCESS) {
                pr_warn("%s: Failure %lld clear %d on PHB#%x-PE#%x\n",
                        __func__, rc, opt, phb->hose->global_number, pe_no);
                return -EIO;
        }

        if (!(pe->flags & PNV_IODA_PE_MASTER))
                return 0;

        /* Clear frozen state for slave PEs */
        list_for_each_entry(slave, &pe->slaves, list) {
                rc = opal_pci_eeh_freeze_clear(phb->opal_id,
                                             slave->pe_number,
                                             opt);
                if (rc != OPAL_SUCCESS) {
                        pr_warn("%s: Failure %lld clear %d on PHB#%x-PE#%x\n",
                                __func__, rc, opt, phb->hose->global_number,
                                slave->pe_number);
                        return -EIO;
                }
        }

        return 0;
}

static int pnv_ioda_get_pe_state(struct pnv_phb *phb, int pe_no)
{
        struct pnv_ioda_pe *slave, *pe;
        u8 fstate = 0, state;
        __be16 pcierr = 0;
        s64 rc;

        /* Sanity check on PE number */
        if (pe_no < 0 || pe_no >= phb->ioda.total_pe_num)
                return OPAL_EEH_STOPPED_PERM_UNAVAIL;

        /*
         * Fetch the master PE; note that the PE instance might not be
         * initialized yet.
         */
        pe = &phb->ioda.pe_array[pe_no];
        if (pe->flags & PNV_IODA_PE_SLAVE) {
                pe = pe->master;
                WARN_ON(!pe || !(pe->flags & PNV_IODA_PE_MASTER));
                pe_no = pe->pe_number;
        }

        /* Check the master PE */
        rc = opal_pci_eeh_freeze_status(phb->opal_id, pe_no,
                                        &state, &pcierr, NULL);
        if (rc != OPAL_SUCCESS) {
                pr_warn("%s: Failure %lld getting PHB#%x-PE#%x state\n",
                        __func__, rc,
                        phb->hose->global_number, pe_no);
                return OPAL_EEH_STOPPED_TEMP_UNAVAIL;
        }

        /* Check the slave PE */
        if (!(pe->flags & PNV_IODA_PE_MASTER))
                return state;

        list_for_each_entry(slave, &pe->slaves, list) {
                rc = opal_pci_eeh_freeze_status(phb->opal_id,
                                                slave->pe_number,
                                                &fstate,
                                                &pcierr,
                                                NULL);
                if (rc != OPAL_SUCCESS) {
                        pr_warn("%s: Failure %lld getting PHB#%x-PE#%x state\n",
                                __func__, rc,
                                phb->hose->global_number, slave->pe_number);
                        return OPAL_EEH_STOPPED_TEMP_UNAVAIL;
                }

                /*
                 * Override the result based on the ascending
                 * priority.
                 */
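                /* i.e. report the most severe state seen across master and slaves */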
                if (fstate > state)
                        state = fstate;
        }

        return state;
}

struct pnv_ioda_pe *pnv_pci_bdfn_to_pe(struct pnv_phb *phb, u16 bdfn)
{
        int pe_number = phb->ioda.pe_rmap[bdfn];

        if (pe_number == IODA_INVALID_PE)
                return NULL;

        return &phb->ioda.pe_array[pe_number];
}

struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev)
{
        struct pnv_phb *phb = pci_bus_to_pnvhb(dev->bus);
        struct pci_dn *pdn = pci_get_pdn(dev);

        if (!pdn)
                return NULL;
        if (pdn->pe_number == IODA_INVALID_PE)
                return NULL;
        return &phb->ioda.pe_array[pdn->pe_number];
}

static int pnv_ioda_set_one_peltv(struct pnv_phb *phb,
                                  struct pnv_ioda_pe *parent,
                                  struct pnv_ioda_pe *child,
                                  bool is_add)
{
        const char *desc = is_add ? "adding" : "removing";
        uint8_t op = is_add ? OPAL_ADD_PE_TO_DOMAIN :
                              OPAL_REMOVE_PE_FROM_DOMAIN;
        struct pnv_ioda_pe *slave;
        long rc;

        /* Parent PE affects child PE */
        rc = opal_pci_set_peltv(phb->opal_id, parent->pe_number,
                                child->pe_number, op);
        if (rc != OPAL_SUCCESS) {
                pe_warn(child, "OPAL error %ld %s to parent PELTV\n",
                        rc, desc);
                return -ENXIO;
        }

        if (!(child->flags & PNV_IODA_PE_MASTER))
                return 0;

        /* Compound case: parent PE affects slave PEs */
        list_for_each_entry(slave, &child->slaves, list) {
                rc = opal_pci_set_peltv(phb->opal_id, parent->pe_number,
                                        slave->pe_number, op);
                if (rc != OPAL_SUCCESS) {
                        pe_warn(slave, "OPAL error %ld %s to parent PELTV\n",
                                rc, desc);
                        return -ENXIO;
                }
        }

        return 0;
}

static int pnv_ioda_set_peltv(struct pnv_phb *phb,
                              struct pnv_ioda_pe *pe,
                              bool is_add)
{
        struct pnv_ioda_pe *slave;
        struct pci_dev *pdev = NULL;
        int ret;

        /*
         * Clear the PE frozen state. If it's a master PE, we need to
         * clear the slave PEs' frozen state as well.
         */
        if (is_add) {
                opal_pci_eeh_freeze_clear(phb->opal_id, pe->pe_number,
                                          OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
                if (pe->flags & PNV_IODA_PE_MASTER) {
                        list_for_each_entry(slave, &pe->slaves, list)
                                opal_pci_eeh_freeze_clear(phb->opal_id,
                                                          slave->pe_number,
                                                          OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
                }
        }

        /*
         * Associate the PE in PELT. We need to add the PE to its
         * corresponding PELT-V as well; otherwise, an error originating
         * from the PE might spread to other PEs.
         */
        ret = pnv_ioda_set_one_peltv(phb, pe, pe, is_add);
        if (ret)
                return ret;

        /* For compound PEs, any one affects all of them */
        if (pe->flags & PNV_IODA_PE_MASTER) {
                list_for_each_entry(slave, &pe->slaves, list) {
                        ret = pnv_ioda_set_one_peltv(phb, slave, pe, is_add);
                        if (ret)
                                return ret;
                }
        }

        if (pe->flags & (PNV_IODA_PE_BUS_ALL | PNV_IODA_PE_BUS))
                pdev = pe->pbus->self;
        else if (pe->flags & PNV_IODA_PE_DEV)
                pdev = pe->pdev->bus->self;
#ifdef CONFIG_PCI_IOV
        else if (pe->flags & PNV_IODA_PE_VF)
                pdev = pe->parent_dev;
#endif /* CONFIG_PCI_IOV */
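        /*
         * Walk up the bridge chain: every upstream bridge PE gets this PE
         * added to its PELT-V, so an error on a bridge also freezes the
         * PEs below it.
         */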
        while (pdev) {
                struct pci_dn *pdn = pci_get_pdn(pdev);
                struct pnv_ioda_pe *parent;

                if (pdn && pdn->pe_number != IODA_INVALID_PE) {
                        parent = &phb->ioda.pe_array[pdn->pe_number];
                        ret = pnv_ioda_set_one_peltv(phb, parent, pe, is_add);
                        if (ret)
                                return ret;
                }

                pdev = pdev->bus->self;
        }

        return 0;
}

static void pnv_ioda_unset_peltv(struct pnv_phb *phb,
                                 struct pnv_ioda_pe *pe,
                                 struct pci_dev *parent)
{
        int64_t rc;

        while (parent) {
                struct pci_dn *pdn = pci_get_pdn(parent);

                if (pdn && pdn->pe_number != IODA_INVALID_PE) {
                        rc = opal_pci_set_peltv(phb->opal_id, pdn->pe_number,
                                                pe->pe_number,
                                                OPAL_REMOVE_PE_FROM_DOMAIN);
                        /* XXX What to do in case of error? */
                }
                parent = parent->bus->self;
        }

        opal_pci_eeh_freeze_clear(phb->opal_id, pe->pe_number,
                                  OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);

        /* Disassociate PE in PELT */
        rc = opal_pci_set_peltv(phb->opal_id, pe->pe_number,
                                pe->pe_number, OPAL_REMOVE_PE_FROM_DOMAIN);
        if (rc)
                pe_warn(pe, "OPAL error %lld remove self from PELTV\n", rc);
}

int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
{
        struct pci_dev *parent;
        uint8_t bcomp, dcomp, fcomp;
        int64_t rc;
        long rid_end, rid;

        /* Currently, we just deconfigure VF PEs. Bus PEs will always be there. */
        if (pe->pbus) {
                int count;

                dcomp = OPAL_IGNORE_RID_DEVICE_NUMBER;
                fcomp = OPAL_IGNORE_RID_FUNCTION_NUMBER;
                parent = pe->pbus->self;
                if (pe->flags & PNV_IODA_PE_BUS_ALL)
                        count = resource_size(&pe->pbus->busn_res);
                else
                        count = 1;

                switch(count) {
                case  1: bcomp = OpalPciBusAll;         break;
                case  2: bcomp = OpalPciBus7Bits;       break;
                case  4: bcomp = OpalPciBus6Bits;       break;
                case  8: bcomp = OpalPciBus5Bits;       break;
                case 16: bcomp = OpalPciBus4Bits;       break;
                case 32: bcomp = OpalPciBus3Bits;       break;
                default:
                        dev_err(&pe->pbus->dev, "Number of subordinate buses %d unsupported\n",
                                count);
                        /* Do an exact match only */
                        bcomp = OpalPciBusAll;
                }
                rid_end = pe->rid + (count << 8);
        } else {
#ifdef CONFIG_PCI_IOV
                if (pe->flags & PNV_IODA_PE_VF)
                        parent = pe->parent_dev;
                else
#endif
                        parent = pe->pdev->bus->self;
                bcomp = OpalPciBusAll;
                dcomp = OPAL_COMPARE_RID_DEVICE_NUMBER;
                fcomp = OPAL_COMPARE_RID_FUNCTION_NUMBER;
                rid_end = pe->rid + 1;
        }

        /* Clear the reverse map */
        for (rid = pe->rid; rid < rid_end; rid++)
                phb->ioda.pe_rmap[rid] = IODA_INVALID_PE;

        /*
         * Release the PE from all parents' PELT-V. NPUs don't have a
         * PELT-V table.
         */
        if (phb->type != PNV_PHB_NPU_NVLINK && phb->type != PNV_PHB_NPU_OCAPI)
                pnv_ioda_unset_peltv(phb, pe, parent);

        rc = opal_pci_set_pe(phb->opal_id, pe->pe_number, pe->rid,
                             bcomp, dcomp, fcomp, OPAL_UNMAP_PE);
        if (rc)
                pe_err(pe, "OPAL error %lld trying to setup PELT table\n", rc);

        pe->pbus = NULL;
        pe->pdev = NULL;
#ifdef CONFIG_PCI_IOV
        pe->parent_dev = NULL;
#endif

        return 0;
}

int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
{
        struct pci_dev *parent;
        uint8_t bcomp, dcomp, fcomp;
        long rc, rid_end, rid;

        /* Bus validation? */
        if (pe->pbus) {
                int count;

                dcomp = OPAL_IGNORE_RID_DEVICE_NUMBER;
                fcomp = OPAL_IGNORE_RID_FUNCTION_NUMBER;
                parent = pe->pbus->self;
                if (pe->flags & PNV_IODA_PE_BUS_ALL)
                        count = resource_size(&pe->pbus->busn_res);
                else
                        count = 1;

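                /*
                 * bcomp selects how much of the bus number OPAL compares:
                 * OpalPciBusAll matches the bus number exactly (one bus),
                 * while OpalPciBusNBits matches the top N bits, so 7 bits
                 * covers a 2-bus range and 3 bits a 32-bus range.
                 */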
                switch(count) {
                case  1: bcomp = OpalPciBusAll;         break;
                case  2: bcomp = OpalPciBus7Bits;       break;
                case  4: bcomp = OpalPciBus6Bits;       break;
                case  8: bcomp = OpalPciBus5Bits;       break;
                case 16: bcomp = OpalPciBus4Bits;       break;
                case 32: bcomp = OpalPciBus3Bits;       break;
                default:
                        dev_err(&pe->pbus->dev, "Number of subordinate buses %d unsupported\n",
                                count);
                        /* Do an exact match only */
                        bcomp = OpalPciBusAll;
                }
                rid_end = pe->rid + (count << 8);
        } else {
#ifdef CONFIG_PCI_IOV
                if (pe->flags & PNV_IODA_PE_VF)
                        parent = pe->parent_dev;
                else
#endif /* CONFIG_PCI_IOV */
                        parent = pe->pdev->bus->self;
                bcomp = OpalPciBusAll;
                dcomp = OPAL_COMPARE_RID_DEVICE_NUMBER;
                fcomp = OPAL_COMPARE_RID_FUNCTION_NUMBER;
                rid_end = pe->rid + 1;
        }

        /*
         * Associate the PE in PELT. We need to add the PE to its
         * corresponding PELT-V as well; otherwise, an error originating
         * from the PE might spread to other PEs.
         */
        rc = opal_pci_set_pe(phb->opal_id, pe->pe_number, pe->rid,
                             bcomp, dcomp, fcomp, OPAL_MAP_PE);
        if (rc) {
                pe_err(pe, "OPAL error %ld trying to setup PELT table\n", rc);
                return -ENXIO;
        }

        /*
         * Configure PELTV. NPUs don't have a PELTV table so skip
         * configuration on them.
         */
        if (phb->type != PNV_PHB_NPU_NVLINK && phb->type != PNV_PHB_NPU_OCAPI)
                pnv_ioda_set_peltv(phb, pe, true);

        /* Setup reverse map */
        for (rid = pe->rid; rid < rid_end; rid++)
                phb->ioda.pe_rmap[rid] = pe->pe_number;

        /* Set up one MVE on IODA1 */
        if (phb->type != PNV_PHB_IODA1) {
                pe->mve_number = 0;
                goto out;
        }

        pe->mve_number = pe->pe_number;
        rc = opal_pci_set_mve(phb->opal_id, pe->mve_number, pe->pe_number);
        if (rc != OPAL_SUCCESS) {
                pe_err(pe, "OPAL error %ld setting up MVE %x\n",
                       rc, pe->mve_number);
                pe->mve_number = -1;
        } else {
                rc = opal_pci_set_mve_enable(phb->opal_id,
                                             pe->mve_number, OPAL_ENABLE_MVE);
                if (rc) {
                        pe_err(pe, "OPAL error %ld enabling MVE %x\n",
                               rc, pe->mve_number);
                        pe->mve_number = -1;
                }
        }

out:
        return 0;
}

static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev)
{
        struct pnv_phb *phb = pci_bus_to_pnvhb(dev->bus);
        struct pci_dn *pdn = pci_get_pdn(dev);
        struct pnv_ioda_pe *pe;

        if (!pdn) {
                pr_err("%s: Device tree node not associated properly\n",
                           pci_name(dev));
                return NULL;
        }
        if (pdn->pe_number != IODA_INVALID_PE)
                return NULL;

        pe = pnv_ioda_alloc_pe(phb, 1);
        if (!pe) {
                pr_warn("%s: Not enough PE# available, disabling device\n",
                        pci_name(dev));
                return NULL;
        }

        /* NOTE: We don't get a reference for the pointer in the PE
         * data structure; both the device and PE structures should be
         * destroyed at the same time. However, removing nvlink
         * devices will need some work.
         *
         * At some point we want to remove the PDN completely anyway.
         */
        pdn->pe_number = pe->pe_number;
        pe->flags = PNV_IODA_PE_DEV;
        pe->pdev = dev;
        pe->pbus = NULL;
        pe->mve_number = -1;
        pe->rid = dev->bus->number << 8 | pdn->devfn;
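        /* the RID is the 16-bit requester ID: bus in the high byte, devfn below */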
        pe->device_count++;

        pe_info(pe, "Associated device to PE\n");

        if (pnv_ioda_configure_pe(phb, pe)) {
                /* XXX What do we do here? */
                pnv_ioda_free_pe(pe);
                pdn->pe_number = IODA_INVALID_PE;
                pe->pdev = NULL;
                return NULL;
        }

        /* Put PE to the list */
        mutex_lock(&phb->ioda.pe_list_mutex);
        list_add_tail(&pe->list, &phb->ioda.pe_list);
        mutex_unlock(&phb->ioda.pe_list_mutex);
        return pe;
}

/*
 * There are two types of PCI-bus-sensitive PEs: one comprises a single
 * PCI bus; the other contains the primary PCI bus and its subordinate
 * PCI devices and buses. The second type is normally created for
 * PCIe-to-PCI bridges or PLX switch downstream ports.
 */
static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, bool all)
{
        struct pnv_phb *phb = pci_bus_to_pnvhb(bus);
        struct pnv_ioda_pe *pe = NULL;
        unsigned int pe_num;

        /*
         * In the partial hotplug case, the PE instance might still be
         * alive. We should reuse it instead of allocating a new one.
         */
        pe_num = phb->ioda.pe_rmap[bus->number << 8];
        if (WARN_ON(pe_num != IODA_INVALID_PE)) {
                pe = &phb->ioda.pe_array[pe_num];
                return NULL;
        }

        /* PE number for root bus should have been reserved */
        if (pci_is_root_bus(bus))
                pe = &phb->ioda.pe_array[phb->ioda.root_pe_idx];

        /* Check if PE is determined by M64 */
        if (!pe)
                pe = pnv_ioda_pick_m64_pe(bus, all);

        /* The PE number isn't pinned by M64 */
        if (!pe)
                pe = pnv_ioda_alloc_pe(phb, 1);

        if (!pe) {
                pr_warn("%s: Not enough PE# available for PCI bus %04x:%02x\n",
                        __func__, pci_domain_nr(bus), bus->number);
                return NULL;
        }

        pe->flags |= (all ? PNV_IODA_PE_BUS_ALL : PNV_IODA_PE_BUS);
        pe->pbus = bus;
        pe->pdev = NULL;
        pe->mve_number = -1;
        pe->rid = bus->busn_res.start << 8;

        if (all)
                pe_info(pe, "Secondary bus %pad..%pad associated with PE#%x\n",
                        &bus->busn_res.start, &bus->busn_res.end,
                        pe->pe_number);
        else
                pe_info(pe, "Secondary bus %pad associated with PE#%x\n",
                        &bus->busn_res.start, pe->pe_number);

        if (pnv_ioda_configure_pe(phb, pe)) {
                /* XXX What do we do here? */
                pnv_ioda_free_pe(pe);
                pe->pbus = NULL;
                return NULL;
        }

        /* Put PE to the list */
        list_add_tail(&pe->list, &phb->ioda.pe_list);

        return pe;
}

static struct pnv_ioda_pe *pnv_ioda_setup_npu_PE(struct pci_dev *npu_pdev)
{
        int pe_num, found_pe = false, rc;
        long rid;
        struct pnv_ioda_pe *pe;
        struct pci_dev *gpu_pdev;
        struct pci_dn *npu_pdn;
        struct pnv_phb *phb = pci_bus_to_pnvhb(npu_pdev->bus);

        /*
         * Intentionally leak a reference on the npu device (for
         * nvlink only; this is not an opencapi path) to make sure it
         * never goes away, as it's been the case all along and some
         * work is needed otherwise.
         */
        pci_dev_get(npu_pdev);

        /*
         * Due to a hardware erratum, PE#0 on the NPU is reserved for
         * error handling. This means we only have three PEs remaining
         * which need to be assigned to four links, implying some
         * links must share PEs.
         *
         * To achieve this we assign PEs such that NPUs linking the
         * same GPU get assigned the same PE.
         */
        gpu_pdev = pnv_pci_get_gpu_dev(npu_pdev);
        for (pe_num = 0; pe_num < phb->ioda.total_pe_num; pe_num++) {
                pe = &phb->ioda.pe_array[pe_num];
                if (!pe->pdev)
                        continue;

                if (pnv_pci_get_gpu_dev(pe->pdev) == gpu_pdev) {
                        /*
                         * This device has the same peer GPU so should
                         * be assigned the same PE as the existing
                         * peer NPU.
                         */
                        dev_info(&npu_pdev->dev,
                                "Associating to existing PE %x\n", pe_num);
                        npu_pdn = pci_get_pdn(npu_pdev);
                        rid = npu_pdev->bus->number << 8 | npu_pdn->devfn;
                        npu_pdn->pe_number = pe_num;
                        phb->ioda.pe_rmap[rid] = pe->pe_number;
                        pe->device_count++;

                        /* Map the PE to this link */
                        rc = opal_pci_set_pe(phb->opal_id, pe_num, rid,
                                        OpalPciBusAll,
                                        OPAL_COMPARE_RID_DEVICE_NUMBER,
                                        OPAL_COMPARE_RID_FUNCTION_NUMBER,
                                        OPAL_MAP_PE);
                        WARN_ON(rc != OPAL_SUCCESS);
                        found_pe = true;
                        break;
                }
        }

        if (!found_pe)
                /*
                 * Could not find an existing PE so allocate a new
                 * one.
                 */
                return pnv_ioda_setup_dev_PE(npu_pdev);
        else
                return pe;
}

static void pnv_ioda_setup_npu_PEs(struct pci_bus *bus)
{
        struct pci_dev *pdev;

        list_for_each_entry(pdev, &bus->devices, bus_list)
                pnv_ioda_setup_npu_PE(pdev);
}

static void pnv_pci_ioda_setup_nvlink(void)
{
        struct pci_controller *hose;
        struct pnv_phb *phb;
        struct pnv_ioda_pe *pe;

        list_for_each_entry(hose, &hose_list, list_node) {
                phb = hose->private_data;
                if (phb->type == PNV_PHB_NPU_NVLINK) {
                        /* PE#0 is needed for error reporting */
                        pnv_ioda_reserve_pe(phb, 0);
                        pnv_ioda_setup_npu_PEs(hose->bus);
                        if (phb->model == PNV_PHB_MODEL_NPU2)
                                WARN_ON_ONCE(pnv_npu2_init(hose));
                }
        }
        list_for_each_entry(hose, &hose_list, list_node) {
                phb = hose->private_data;
                if (phb->type != PNV_PHB_IODA2)
                        continue;

                list_for_each_entry(pe, &phb->ioda.pe_list, list)
                        pnv_npu2_map_lpar(pe, MSR_DR | MSR_PR | MSR_HV);
        }

#ifdef CONFIG_IOMMU_API
        /* setup iommu groups so we can do nvlink pass-thru */
        pnv_pci_npu_setup_iommu_groups();
#endif
}

static void pnv_pci_ioda1_setup_dma_pe(struct pnv_phb *phb,
                                       struct pnv_ioda_pe *pe);

static void pnv_pci_ioda_dma_dev_setup(struct pci_dev *pdev)
{
        struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus);
        struct pci_dn *pdn = pci_get_pdn(pdev);
        struct pnv_ioda_pe *pe;

        /* Check if the BDFN for this device is associated with a PE yet */
        pe = pnv_pci_bdfn_to_pe(phb, pdev->devfn | (pdev->bus->number << 8));
        if (!pe) {
                /* VF PEs should be pre-configured in pnv_pci_sriov_enable() */
                if (WARN_ON(pdev->is_virtfn))
                        return;

                pnv_pci_configure_bus(pdev->bus);
                pe = pnv_pci_bdfn_to_pe(phb, pdev->devfn | (pdev->bus->number << 8));
                pci_info(pdev, "Configured PE#%x\n", pe ? pe->pe_number : 0xfffff);

                /*
                 * If we can't setup the IODA PE something has gone horribly
                 * wrong and we can't enable DMA for the device.
                 */
                if (WARN_ON(!pe))
                        return;
        } else {
                pci_info(pdev, "Added to existing PE#%x\n", pe->pe_number);
        }

        /*
         * We assume that bridges *probably* don't need to do any DMA so we can
         * skip allocating a TCE table, etc., unless we get a non-bridge device.
         */
        if (!pe->dma_setup_done && !pci_is_bridge(pdev)) {
                switch (phb->type) {
                case PNV_PHB_IODA1:
                        pnv_pci_ioda1_setup_dma_pe(phb, pe);
                        break;
                case PNV_PHB_IODA2:
                        pnv_pci_ioda2_setup_dma_pe(phb, pe);
                        break;
                default:
                        pr_warn("%s: No DMA for PHB#%x (type %d)\n",
                                __func__, phb->hose->global_number, phb->type);
                }
        }

        if (pdn)
                pdn->pe_number = pe->pe_number;
        pe->device_count++;

        WARN_ON(get_dma_ops(&pdev->dev) != &dma_iommu_ops);
        pdev->dev.archdata.dma_offset = pe->tce_bypass_base;
        set_iommu_table_base(&pdev->dev, pe->table_group.tables[0]);

        /* PEs with a DMA weight of zero won't have a group */
        if (pe->table_group.group)
                iommu_add_device(&pe->table_group, &pdev->dev);
}

/*
 * Reconfigure TVE#0 to be usable as 64-bit DMA space.
 *
 * The first 4GB of virtual memory for a PE is reserved for 32-bit accesses.
 * Devices can only access more than that if bit 59 of the PCI address is set
 * by hardware, which indicates TVE#1 should be used instead of TVE#0.
 * Many PCI devices are not capable of addressing that many bits, and as a
 * result are limited to the 4GB of virtual memory made available to 32-bit
 * devices in TVE#0.
 *
 * In order to work around this, reconfigure TVE#0 to be suitable for 64-bit
 * devices by leaving the first 4GB of the DMA address space unmapped and
 * mapping system memory at a 4GB offset. This should only be used by devices
 * that want more than 4GB, and only on PEs that have no 32-bit devices.
 *
 * Currently this will only work on PHB3 (POWER8).
 */
static int pnv_pci_ioda_dma_64bit_bypass(struct pnv_ioda_pe *pe)
{
        u64 window_size, table_size, tce_count, addr;
        struct page *table_pages;
        u64 tce_order = 28; /* 256MB TCEs */
        __be64 *tces;
        s64 rc;

        /*
         * Window size needs to be a power of two, but needs to account for
         * shifting memory by the 4GB offset required to skip 32bit space.
         */
        window_size = roundup_pow_of_two(memory_hotplug_max() + (1ULL << 32));
        tce_count = window_size >> tce_order;
        table_size = tce_count << 3;

        if (table_size < PAGE_SIZE)
                table_size = PAGE_SIZE;
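
        /*
         * Worked example: with 64GB of addressable RAM the window rounds up
         * to 128GB, i.e. 512 TCEs of 256MB each and a 4KB table, which the
         * check above then pads out to PAGE_SIZE.
         */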

        table_pages = alloc_pages_node(pe->phb->hose->node, GFP_KERNEL,
                                       get_order(table_size));
        if (!table_pages)
                goto err;

        tces = page_address(table_pages);
        if (!tces)
                goto err;

        memset(tces, 0, table_size);

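        /* Identity-map all of RAM at a 4GB offset: DMA (addr + 4GB) -> addr */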
        for (addr = 0; addr < memory_hotplug_max(); addr += (1 << tce_order)) {
                tces[(addr + (1ULL << 32)) >> tce_order] =
                        cpu_to_be64(addr | TCE_PCI_READ | TCE_PCI_WRITE);
        }

        rc = opal_pci_map_pe_dma_window(pe->phb->opal_id,
                                        pe->pe_number,
                                        /* reconfigure window 0 */
                                        (pe->pe_number << 1) + 0,
                                        1,
                                        __pa(tces),
                                        table_size,
                                        1 << tce_order);
        if (rc == OPAL_SUCCESS) {
                pe_info(pe, "Using 64-bit DMA iommu bypass (through TVE#0)\n");
                return 0;
        }
err:
        pe_err(pe, "Error configuring 64-bit DMA bypass\n");
        return -EIO;
}

static bool pnv_pci_ioda_iommu_bypass_supported(struct pci_dev *pdev,
                u64 dma_mask)
{
        struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus);
        struct pci_dn *pdn = pci_get_pdn(pdev);
        struct pnv_ioda_pe *pe;

        if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE))
                return false;

        pe = &phb->ioda.pe_array[pdn->pe_number];
        if (pe->tce_bypass_enabled) {
                u64 top = pe->tce_bypass_base + memblock_end_of_DRAM() - 1;
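                /*
                 * The bypass window maps all of system memory linearly at
                 * tce_bypass_base, so the mask must reach its top.
                 */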
                if (dma_mask >= top)
                        return true;
        }

        /*
         * If the device can't set the TCE bypass bit but still wants
         * to access 4GB or more, on PHB3 we can reconfigure TVE#0 to
         * bypass the 32-bit region and be usable for 64-bit DMAs.
         * The device needs to be able to address all of this space.
         */
        if (dma_mask >> 32 &&
            dma_mask > (memory_hotplug_max() + (1ULL << 32)) &&
            /* pe->pdev should be set if it's a single device, pe->pbus if not */
            (pe->device_count == 1 || !pe->pbus) &&
            phb->model == PNV_PHB_MODEL_PHB3) {
                /* Configure the bypass mode */
                s64 rc = pnv_pci_ioda_dma_64bit_bypass(pe);
                if (rc)
                        return false;
                /* 4GB offset bypasses 32-bit space */
                pdev->dev.archdata.dma_offset = (1ULL << 32);
                return true;
        }

        return false;
}

static inline __be64 __iomem *pnv_ioda_get_inval_reg(struct pnv_phb *phb,
                                                     bool real_mode)
{
        return real_mode ? (__be64 __iomem *)(phb->regs_phys + 0x210) :
                (phb->regs + 0x210);
}

static void pnv_pci_p7ioc_tce_invalidate(struct iommu_table *tbl,
                unsigned long index, unsigned long npages, bool rm)
{
        struct iommu_table_group_link *tgl = list_first_entry_or_null(
                        &tbl->it_group_list, struct iommu_table_group_link,
                        next);
        struct pnv_ioda_pe *pe = container_of(tgl->table_group,
                        struct pnv_ioda_pe, table_group);
        __be64 __iomem *invalidate = pnv_ioda_get_inval_reg(pe->phb, rm);
        unsigned long start, end, inc;

        start = __pa(((__be64 *)tbl->it_base) + index - tbl->it_offset);
        end = __pa(((__be64 *)tbl->it_base) + index - tbl->it_offset +
                        npages - 1);

        /* p7ioc-style invalidation, 2 TCEs per write */
        start |= (1ull << 63);
        end |= (1ull << 63);
        inc = 16;
        end |= inc - 1; /* round up end to be different than start */

        mb(); /* Ensure above stores are visible */
        while (start <= end) {
                if (rm)
                        __raw_rm_writeq_be(start, invalidate);
                else
                        __raw_writeq_be(start, invalidate);

                start += inc;
        }

        /*
         * The iommu layer will do another mb() for us on build(),
         * and we don't need one on free().
         */
1430}
1431
1432static int pnv_ioda1_tce_build(struct iommu_table *tbl, long index,
1433                long npages, unsigned long uaddr,
1434                enum dma_data_direction direction,
1435                unsigned long attrs)
1436{
1437        int ret = pnv_tce_build(tbl, index, npages, uaddr, direction,
1438                        attrs);
1439
1440        if (!ret)
1441                pnv_pci_p7ioc_tce_invalidate(tbl, index, npages, false);
1442
1443        return ret;
1444}
1445
1446#ifdef CONFIG_IOMMU_API
1447/* Common for IODA1 and IODA2 */
1448static int pnv_ioda_tce_xchg_no_kill(struct iommu_table *tbl, long index,
1449                unsigned long *hpa, enum dma_data_direction *direction,
1450                bool realmode)
1451{
1452        return pnv_tce_xchg(tbl, index, hpa, direction, !realmode);
1453}
1454#endif
1455
1456static void pnv_ioda1_tce_free(struct iommu_table *tbl, long index,
1457                long npages)
1458{
1459        pnv_tce_free(tbl, index, npages);
1460
1461        pnv_pci_p7ioc_tce_invalidate(tbl, index, npages, false);
1462}
1463
1464static struct iommu_table_ops pnv_ioda1_iommu_ops = {
1465        .set = pnv_ioda1_tce_build,
1466#ifdef CONFIG_IOMMU_API
1467        .xchg_no_kill = pnv_ioda_tce_xchg_no_kill,
1468        .tce_kill = pnv_pci_p7ioc_tce_invalidate,
1469        .useraddrptr = pnv_tce_useraddrptr,
1470#endif
1471        .clear = pnv_ioda1_tce_free,
1472        .get = pnv_tce_get,
1473};
1474
1475#define PHB3_TCE_KILL_INVAL_ALL         PPC_BIT(0)
1476#define PHB3_TCE_KILL_INVAL_PE          PPC_BIT(1)
1477#define PHB3_TCE_KILL_INVAL_ONE         PPC_BIT(2)
1478
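    /*
     * The flags above select the invalidation scope written to the
     * PHB3 kill register: INVAL_ALL flushes the whole TCE cache,
     * INVAL_PE flushes every entry for the PE encoded in the low
     * byte, and INVAL_ONE flushes individual entries.
     */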
1479static void pnv_pci_phb3_tce_invalidate_entire(struct pnv_phb *phb, bool rm)
1480{
1481        __be64 __iomem *invalidate = pnv_ioda_get_inval_reg(phb, rm);
1482        const unsigned long val = PHB3_TCE_KILL_INVAL_ALL;
1483
1484        mb(); /* Ensure previous TCE table stores are visible */
1485        if (rm)
1486                __raw_rm_writeq_be(val, invalidate);
1487        else
1488                __raw_writeq_be(val, invalidate);
1489}
1490
1491static inline void pnv_pci_phb3_tce_invalidate_pe(struct pnv_ioda_pe *pe)
1492{
1493        /* 01xb - invalidate TCEs that match the specified PE# */
1494        __be64 __iomem *invalidate = pnv_ioda_get_inval_reg(pe->phb, false);
1495        unsigned long val = PHB3_TCE_KILL_INVAL_PE | (pe->pe_number & 0xFF);
1496
1497        mb(); /* Ensure above stores are visible */
1498        __raw_writeq_be(val, invalidate);
1499}
1500
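    /*
     * Illustrative example (not in the original source): for PE# 5
     * with a 4K page shift and index 0x100, the first write below is
     * PHB3_TCE_KILL_INVAL_ONE | 5 | (0x100 << 12), with one further
     * write per page, stepping by 1 << 12.
     */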
1501static void pnv_pci_phb3_tce_invalidate(struct pnv_ioda_pe *pe, bool rm,
1502                                        unsigned shift, unsigned long index,
1503                                        unsigned long npages)
1504{
1505        __be64 __iomem *invalidate = pnv_ioda_get_inval_reg(pe->phb, rm);
1506        unsigned long start, end, inc;
1507
1508        /* We'll invalidate DMA addresses in PE scope */
1509        start = PHB3_TCE_KILL_INVAL_ONE;
1510        start |= (pe->pe_number & 0xFF);
1511        end = start;
1512
1513        /* Figure out the start, end and step */
1514        start |= (index << shift);
1515        end |= ((index + npages - 1) << shift);
1516        inc = (0x1ull << shift);
1517        mb();
1518
1519        while (start <= end) {
1520                if (rm)
1521                        __raw_rm_writeq_be(start, invalidate);
1522                else
1523                        __raw_writeq_be(start, invalidate);
1524                start += inc;
1525        }
1526}
1527
1528static inline void pnv_pci_ioda2_tce_invalidate_pe(struct pnv_ioda_pe *pe)
1529{
1530        struct pnv_phb *phb = pe->phb;
1531
1532        if (phb->model == PNV_PHB_MODEL_PHB3 && phb->regs)
1533                pnv_pci_phb3_tce_invalidate_pe(pe);
1534        else
1535                opal_pci_tce_kill(phb->opal_id, OPAL_PCI_TCE_KILL_PE,
1536                                  pe->pe_number, 0, 0, 0);
1537}
1538
1539static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl,
1540                unsigned long index, unsigned long npages, bool rm)
1541{
1542        struct iommu_table_group_link *tgl;
1543
1544        list_for_each_entry_lockless(tgl, &tbl->it_group_list, next) {
1545                struct pnv_ioda_pe *pe = container_of(tgl->table_group,
1546                                struct pnv_ioda_pe, table_group);
1547                struct pnv_phb *phb = pe->phb;
1548                unsigned int shift = tbl->it_page_shift;
1549
1550                /*
1551                 * NVLink1 can use the TCE kill register directly as
1552                 * it's the same as PHB3. NVLink2 is different and
1553                 * should go via the OPAL call.
1554                 */
1555                if (phb->model == PNV_PHB_MODEL_NPU) {
1556                        /*
1557                         * The NVLink hardware does not support TCE kill
1558                         * per TCE entry so we have to invalidate
1559                         * the entire cache for it.
1560                         */
1561                        pnv_pci_phb3_tce_invalidate_entire(phb, rm);
1562                        continue;
1563                }
1564                if (phb->model == PNV_PHB_MODEL_PHB3 && phb->regs)
1565                        pnv_pci_phb3_tce_invalidate(pe, rm, shift,
1566                                                    index, npages);
1567                else
1568                        opal_pci_tce_kill(phb->opal_id,
1569                                          OPAL_PCI_TCE_KILL_PAGES,
1570                                          pe->pe_number, 1u << shift,
1571                                          index << shift, npages);
1572        }
1573}
1574
1575void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_phb *phb, bool rm)
1576{
1577        if (phb->model == PNV_PHB_MODEL_NPU || phb->model == PNV_PHB_MODEL_PHB3)
1578                pnv_pci_phb3_tce_invalidate_entire(phb, rm);
1579        else
1580                opal_pci_tce_kill(phb->opal_id, OPAL_PCI_TCE_KILL, 0, 0, 0, 0);
1581}
1582
1583static int pnv_ioda2_tce_build(struct iommu_table *tbl, long index,
1584                long npages, unsigned long uaddr,
1585                enum dma_data_direction direction,
1586                unsigned long attrs)
1587{
1588        int ret = pnv_tce_build(tbl, index, npages, uaddr, direction,
1589                        attrs);
1590
1591        if (!ret)
1592                pnv_pci_ioda2_tce_invalidate(tbl, index, npages, false);
1593
1594        return ret;
1595}
1596
1597static void pnv_ioda2_tce_free(struct iommu_table *tbl, long index,
1598                long npages)
1599{
1600        pnv_tce_free(tbl, index, npages);
1601
1602        pnv_pci_ioda2_tce_invalidate(tbl, index, npages, false);
1603}
1604
1605static struct iommu_table_ops pnv_ioda2_iommu_ops = {
1606        .set = pnv_ioda2_tce_build,
1607#ifdef CONFIG_IOMMU_API
1608        .xchg_no_kill = pnv_ioda_tce_xchg_no_kill,
1609        .tce_kill = pnv_pci_ioda2_tce_invalidate,
1610        .useraddrptr = pnv_tce_useraddrptr,
1611#endif
1612        .clear = pnv_ioda2_tce_free,
1613        .get = pnv_tce_get,
1614        .free = pnv_pci_ioda2_table_free_pages,
1615};
1616
1617static int pnv_pci_ioda_dev_dma_weight(struct pci_dev *dev, void *data)
1618{
1619        unsigned int *weight = (unsigned int *)data;
1620
1621        /* This is quite simplistic. The "base" weight of a device
1622         * is 10; a weight of 0 means no DMA is to be accounted for it.
1623         */
1624        if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL)
1625                return 0;
1626
1627        if (dev->class == PCI_CLASS_SERIAL_USB_UHCI ||
1628            dev->class == PCI_CLASS_SERIAL_USB_OHCI ||
1629            dev->class == PCI_CLASS_SERIAL_USB_EHCI)
1630                *weight += 3;
1631        else if ((dev->class >> 8) == PCI_CLASS_STORAGE_RAID)
1632                *weight += 15;
1633        else
1634                *weight += 10;
1635
1636        return 0;
1637}
1638
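    /*
     * For example (illustrative): a bus PE containing one EHCI
     * controller (weight 3) and one ordinary network adapter
     * (weight 10) gets an aggregate DMA weight of 13.
     */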
1639static unsigned int pnv_pci_ioda_pe_dma_weight(struct pnv_ioda_pe *pe)
1640{
1641        unsigned int weight = 0;
1642
1643        /* SRIOV VF has same DMA32 weight as its PF */
1644#ifdef CONFIG_PCI_IOV
1645        if ((pe->flags & PNV_IODA_PE_VF) && pe->parent_dev) {
1646                pnv_pci_ioda_dev_dma_weight(pe->parent_dev, &weight);
1647                return weight;
1648        }
1649#endif
1650
1651        if ((pe->flags & PNV_IODA_PE_DEV) && pe->pdev) {
1652                pnv_pci_ioda_dev_dma_weight(pe->pdev, &weight);
1653        } else if ((pe->flags & PNV_IODA_PE_BUS) && pe->pbus) {
1654                struct pci_dev *pdev;
1655
1656                list_for_each_entry(pdev, &pe->pbus->devices, bus_list)
1657                        pnv_pci_ioda_dev_dma_weight(pdev, &weight);
1658        } else if ((pe->flags & PNV_IODA_PE_BUS_ALL) && pe->pbus) {
1659                pci_walk_bus(pe->pbus, pnv_pci_ioda_dev_dma_weight, &weight);
1660        }
1661
1662        return weight;
1663}
1664
1665static void pnv_pci_ioda1_setup_dma_pe(struct pnv_phb *phb,
1666                                       struct pnv_ioda_pe *pe)
1667{
1668
1669        struct page *tce_mem = NULL;
1670        struct iommu_table *tbl;
1671        unsigned int weight, total_weight = 0;
1672        unsigned int tce32_segsz, base, segs, avail, i;
1673        int64_t rc;
1674        void *addr;
1675
1676        /* XXX FIXME: Handle 64-bit only DMA devices */
1677        /* XXX FIXME: Provide 64-bit DMA facilities & non-4K TCE tables etc.. */
1678        /* XXX FIXME: Allocate multi-level tables on PHB3 */
1679        weight = pnv_pci_ioda_pe_dma_weight(pe);
1680        if (!weight)
1681                return;
1682
1683        pci_walk_bus(phb->hose->bus, pnv_pci_ioda_dev_dma_weight,
1684                     &total_weight);
1685        segs = (weight * phb->ioda.dma32_count) / total_weight;
1686        if (!segs)
1687                segs = 1;
1688
1689        /*
1690         * Allocate contiguous DMA32 segments. We begin with the expected
1691         * number of segments; on each failed pass, the number of DMA32
1692         * segments requested is decreased by one, until an allocation
1693         * succeeds or we run out entirely.
1694         */
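            /*
             * Worked example (illustrative): with a weight of 10 out
             * of a total of 40 and 16 DMA32 segments, we first look
             * for 4 contiguous free segments, then retry with 3, 2
             * and finally 1.
             */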
1695        do {
1696                for (base = 0; base <= phb->ioda.dma32_count - segs; base++) {
1697                        for (avail = 0, i = base; i < base + segs; i++) {
1698                                if (phb->ioda.dma32_segmap[i] ==
1699                                    IODA_INVALID_PE)
1700                                        avail++;
1701                        }
1702
1703                        if (avail == segs)
1704                                goto found;
1705                }
1706        } while (--segs);
1707
1708        if (!segs) {
1709                pe_warn(pe, "No available DMA32 segments\n");
1710                return;
1711        }
1712
1713found:
1714        tbl = pnv_pci_table_alloc(phb->hose->node);
1715        if (WARN_ON(!tbl))
1716                return;
1717
1718        iommu_register_group(&pe->table_group, phb->hose->global_number,
1719                        pe->pe_number);
1720        pnv_pci_link_table_and_group(phb->hose->node, 0, tbl, &pe->table_group);
1721
1722        /* Grab a 32-bit TCE table */
1723        pe_info(pe, "DMA weight %d (%d), assigned (%d) %d DMA32 segments\n",
1724                weight, total_weight, base, segs);
1725        pe_info(pe, " Setting up 32-bit TCE table at %08x..%08x\n",
1726                base * PNV_IODA1_DMA32_SEGSIZE,
1727                (base + segs) * PNV_IODA1_DMA32_SEGSIZE - 1);
1728
1729        /* XXX Currently, we allocate one big contiguous table for the
1730         * TCEs. We only really need one chunk per 256M of TCE space
1731         * (i.e. per segment) but that's an optimization for later; it
1732         * requires some added smarts in our get/put_tce implementation.
1733         *
1734         * Each TCE page is 4KB in size and each TCE entry occupies 8
1735         * bytes.
1736         */
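            /*
             * Worked numbers (illustrative): a 256MB (0x10000000)
             * segment holds 65536 4K IOMMU pages; at 8 bytes per TCE
             * that is a 512KB (0x80000) table per segment, hence the
             * shift by (IOMMU_PAGE_SHIFT_4K - 3) below.
             */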
1737        tce32_segsz = PNV_IODA1_DMA32_SEGSIZE >> (IOMMU_PAGE_SHIFT_4K - 3);
1738        tce_mem = alloc_pages_node(phb->hose->node, GFP_KERNEL,
1739                                   get_order(tce32_segsz * segs));
1740        if (!tce_mem) {
1741                pe_err(pe, " Failed to allocate 32-bit TCE memory\n");
1742                goto fail;
1743        }
1744        addr = page_address(tce_mem);
1745        memset(addr, 0, tce32_segsz * segs);
1746
1747        /* Configure HW */
1748        for (i = 0; i < segs; i++) {
1749                rc = opal_pci_map_pe_dma_window(phb->opal_id,
1750                                              pe->pe_number,
1751                                              base + i, 1,
1752                                              __pa(addr) + tce32_segsz * i,
1753                                              tce32_segsz, IOMMU_PAGE_SIZE_4K);
1754                if (rc) {
1755                        pe_err(pe, " Failed to configure 32-bit TCE table, err %lld\n",
1756                               rc);
1757                        goto fail;
1758                }
1759        }
1760
1761        /* Setup DMA32 segment mapping */
1762        for (i = base; i < base + segs; i++)
1763                phb->ioda.dma32_segmap[i] = pe->pe_number;
1764
1765        /* Setup linux iommu table */
1766        pnv_pci_setup_iommu_table(tbl, addr, tce32_segsz * segs,
1767                                  base * PNV_IODA1_DMA32_SEGSIZE,
1768                                  IOMMU_PAGE_SHIFT_4K);
1769
1770        tbl->it_ops = &pnv_ioda1_iommu_ops;
1771        pe->table_group.tce32_start = tbl->it_offset << tbl->it_page_shift;
1772        pe->table_group.tce32_size = tbl->it_size << tbl->it_page_shift;
1773        iommu_init_table(tbl, phb->hose->node, 0, 0);
1774
1775        pe->dma_setup_done = true;
1776        return;
1777 fail:
1778        /* XXX Failure: Try to fallback to 64-bit only ? */
1779        if (tce_mem)
1780                __free_pages(tce_mem, get_order(tce32_segsz * segs));
1781        if (tbl) {
1782                pnv_pci_unlink_table_and_group(tbl, &pe->table_group);
1783                iommu_tce_table_put(tbl);
1784        }
1785}
1786
1787static long pnv_pci_ioda2_set_window(struct iommu_table_group *table_group,
1788                int num, struct iommu_table *tbl)
1789{
1790        struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
1791                        table_group);
1792        struct pnv_phb *phb = pe->phb;
1793        int64_t rc;
1794        const unsigned long size = tbl->it_indirect_levels ?
1795                        tbl->it_level_size : tbl->it_size;
1796        const __u64 start_addr = tbl->it_offset << tbl->it_page_shift;
1797        const __u64 win_size = tbl->it_size << tbl->it_page_shift;
1798
1799        pe_info(pe, "Setting up window#%d %llx..%llx pg=%lx\n",
1800                num, start_addr, start_addr + win_size - 1,
1801                IOMMU_PAGE_SIZE(tbl));
1802
1803        /*
1804         * Map TCE table through TVT. The TVE index is the PE number
1805         * shifted left by one bit plus the window number.
1806         */
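            /*
             * For example (illustrative): PE# 3 with window 0 uses
             * TVE index 6, while window 1 (the 64-bit bypass window
             * set up in pnv_pci_ioda2_set_bypass()) uses TVE index 7.
             */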
1807        rc = opal_pci_map_pe_dma_window(phb->opal_id,
1808                        pe->pe_number,
1809                        (pe->pe_number << 1) + num,
1810                        tbl->it_indirect_levels + 1,
1811                        __pa(tbl->it_base),
1812                        size << 3,
1813                        IOMMU_PAGE_SIZE(tbl));
1814        if (rc) {
1815                pe_err(pe, "Failed to configure TCE table, err %lld\n", rc);
1816                return rc;
1817        }
1818
1819        pnv_pci_link_table_and_group(phb->hose->node, num,
1820                        tbl, &pe->table_group);
1821        pnv_pci_ioda2_tce_invalidate_pe(pe);
1822
1823        return 0;
1824}
1825
1826static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable)
1827{
1828        uint16_t window_id = (pe->pe_number << 1) + 1;
1829        int64_t rc;
1830
1831        pe_info(pe, "%sabling 64-bit DMA bypass\n", enable ? "En" : "Dis");
1832        if (enable) {
1833                phys_addr_t top = memblock_end_of_DRAM();
1834
1835                top = roundup_pow_of_two(top);
1836                rc = opal_pci_map_pe_dma_window_real(pe->phb->opal_id,
1837                                                     pe->pe_number,
1838                                                     window_id,
1839                                                     pe->tce_bypass_base,
1840                                                     top);
1841        } else {
1842                rc = opal_pci_map_pe_dma_window_real(pe->phb->opal_id,
1843                                                     pe->pe_number,
1844                                                     window_id,
1845                                                     pe->tce_bypass_base,
1846                                                     0);
1847        }
1848        if (rc)
1849                pe_err(pe, "OPAL error %lld configuring bypass window\n", rc);
1850        else
1851                pe->tce_bypass_enabled = enable;
1852}
1853
1854static long pnv_pci_ioda2_create_table(struct iommu_table_group *table_group,
1855                int num, __u32 page_shift, __u64 window_size, __u32 levels,
1856                bool alloc_userspace_copy, struct iommu_table **ptbl)
1857{
1858        struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
1859                        table_group);
1860        int nid = pe->phb->hose->node;
1861        __u64 bus_offset = num ? pe->tce_bypass_base : table_group->tce32_start;
1862        long ret;
1863        struct iommu_table *tbl;
1864
1865        tbl = pnv_pci_table_alloc(nid);
1866        if (!tbl)
1867                return -ENOMEM;
1868
1869        tbl->it_ops = &pnv_ioda2_iommu_ops;
1870
1871        ret = pnv_pci_ioda2_table_alloc_pages(nid,
1872                        bus_offset, page_shift, window_size,
1873                        levels, alloc_userspace_copy, tbl);
1874        if (ret) {
1875                iommu_tce_table_put(tbl);
1876                return ret;
1877        }
1878
1879        *ptbl = tbl;
1880
1881        return 0;
1882}
1883
1884static long pnv_pci_ioda2_setup_default_config(struct pnv_ioda_pe *pe)
1885{
1886        struct iommu_table *tbl = NULL;
1887        long rc;
1888        unsigned long res_start, res_end;
1889
1890        /*
1891         * crashkernel= specifies the kdump kernel's maximum memory at
1892         * some offset and there is no guarantee that the result is a
1893         * power of 2, which will cause errors later.
1894         */
1895        const u64 max_memory = __rounddown_pow_of_two(memory_hotplug_max());
1896
1897        /*
1898         * In memory constrained environments, e.g. kdump kernel, the
1899         * DMA window can be larger than available memory, which will
1900         * cause errors later.
1901         */
1902        const u64 maxblock = 1UL << (PAGE_SHIFT + MAX_ORDER - 1);
1903
1904        /*
1905         * We create the default window as big as we can. The constraint is
1906         * the max order of allocation possible. The TCE table is likely to
1907         * end up being multilevel and with on-demand allocation in place,
1908         * the initial use is not going to be huge as the default window aims
1909         * to support crippled devices (i.e. not fully 64bit DMAble) only.
1910         */
1911        /* iommu_table::it_map uses 1 bit per IOMMU page, hence 8 */
1912        const u64 window_size = min((maxblock * 8) << PAGE_SHIFT, max_memory);
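            /*
             * Worked example (illustrative, assuming 64K pages and
             * MAX_ORDER == 11): maxblock is 64MB, whose it_map bits
             * cover 2^29 pages, so the default window is capped at
             * min(32TB, max_memory).
             */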
1913        /* Each TCE level cannot exceed maxblock so go multilevel if needed */
1914        unsigned long tces_order = ilog2(window_size >> PAGE_SHIFT);
1915        unsigned long tcelevel_order = ilog2(maxblock >> 3);
1916        unsigned int levels = tces_order / tcelevel_order;
1917
1918        if (tces_order % tcelevel_order)
1919                levels += 1;
1920        /*
1921         * We try to stick to default levels (which is >1 at the moment) in
1922         * order to save memory by relying on on-demand TCE level allocation.
1923         */
1924        levels = max_t(unsigned int, levels, POWERNV_IOMMU_DEFAULT_LEVELS);
1925
1926        rc = pnv_pci_ioda2_create_table(&pe->table_group, 0, PAGE_SHIFT,
1927                        window_size, levels, false, &tbl);
1928        if (rc) {
1929                pe_err(pe, "Failed to create 32-bit TCE table, err %ld\n",
1930                                rc);
1931                return rc;
1932        }
1933
1934        /* We use top part of 32bit space for MMIO so exclude it from DMA */
1935        res_start = 0;
1936        res_end = 0;
1937        if (window_size > pe->phb->ioda.m32_pci_base) {
1938                res_start = pe->phb->ioda.m32_pci_base >> tbl->it_page_shift;
1939                res_end = min(window_size, SZ_4G) >> tbl->it_page_shift;
1940        }
1941        iommu_init_table(tbl, pe->phb->hose->node, res_start, res_end);
1942
1943        rc = pnv_pci_ioda2_set_window(&pe->table_group, 0, tbl);
1944        if (rc) {
1945                pe_err(pe, "Failed to configure 32-bit TCE table, err %ld\n",
1946                                rc);
1947                iommu_tce_table_put(tbl);
1948                return rc;
1949        }
1950
1951        if (!pnv_iommu_bypass_disabled)
1952                pnv_pci_ioda2_set_bypass(pe, true);
1953
1954        /*
1955         * Set table base for the case of IOMMU DMA use. Usually this is done
1956         * from dma_dev_setup() which is not called when a device is returned
1957         * from VFIO so do it here.
1958         */
1959        if (pe->pdev)
1960                set_iommu_table_base(&pe->pdev->dev, tbl);
1961
1962        return 0;
1963}
1964
1965static long pnv_pci_ioda2_unset_window(struct iommu_table_group *table_group,
1966                int num)
1967{
1968        struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
1969                        table_group);
1970        struct pnv_phb *phb = pe->phb;
1971        long ret;
1972
1973        pe_info(pe, "Removing DMA window #%d\n", num);
1974
1975        ret = opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number,
1976                        (pe->pe_number << 1) + num,
1977                        0/* levels */, 0/* table address */,
1978                        0/* table size */, 0/* page size */);
1979        if (ret)
1980                pe_warn(pe, "Unmapping failed, ret = %ld\n", ret);
1981        else
1982                pnv_pci_ioda2_tce_invalidate_pe(pe);
1983
1984        pnv_pci_unlink_table_and_group(table_group->tables[num], table_group);
1985
1986        return ret;
1987}
1988
1989#ifdef CONFIG_IOMMU_API
1990unsigned long pnv_pci_ioda2_get_table_size(__u32 page_shift,
1991                __u64 window_size, __u32 levels)
1992{
1993        unsigned long bytes = 0;
1994        const unsigned window_shift = ilog2(window_size);
1995        unsigned entries_shift = window_shift - page_shift;
1996        unsigned table_shift = entries_shift + 3;
1997        unsigned long tce_table_size = max(0x1000UL, 1UL << table_shift);
1998        unsigned long direct_table_size;
1999
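    /*
     * Worked example (illustrative): page_shift = 16 and a 4GB
     * single-level window give entries_shift = 16 and a 512KB table;
     * the value returned is doubled to 1MB to cover both the HW
     * table and its userspace copy.
     */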
2000        if (!levels || (levels > POWERNV_IOMMU_MAX_LEVELS) ||
2001                        !is_power_of_2(window_size))
2002                return 0;
2003
2004        /* Calculate a direct table size from window_size and levels */
2005        entries_shift = (entries_shift + levels - 1) / levels;
2006        table_shift = entries_shift + 3;
2007        table_shift = max_t(unsigned, table_shift, PAGE_SHIFT);
2008        direct_table_size =  1UL << table_shift;
2009
2010        for ( ; levels; --levels) {
2011                bytes += ALIGN(tce_table_size, direct_table_size);
2012
2013                tce_table_size /= direct_table_size;
2014                tce_table_size <<= 3;
2015                tce_table_size = max_t(unsigned long,
2016                                tce_table_size, direct_table_size);
2017        }
2018
2019        return bytes + bytes; /* one for HW table, one for userspace copy */
2020}
2021
2022static long pnv_pci_ioda2_create_table_userspace(
2023                struct iommu_table_group *table_group,
2024                int num, __u32 page_shift, __u64 window_size, __u32 levels,
2025                struct iommu_table **ptbl)
2026{
2027        long ret = pnv_pci_ioda2_create_table(table_group,
2028                        num, page_shift, window_size, levels, true, ptbl);
2029
2030        if (!ret)
2031                (*ptbl)->it_allocated_size = pnv_pci_ioda2_get_table_size(
2032                                page_shift, window_size, levels);
2033        return ret;
2034}
2035
2036static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe, struct pci_bus *bus)
2037{
2038        struct pci_dev *dev;
2039
2040        list_for_each_entry(dev, &bus->devices, bus_list) {
2041                set_iommu_table_base(&dev->dev, pe->table_group.tables[0]);
2042                dev->dev.archdata.dma_offset = pe->tce_bypass_base;
2043
2044                if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate)
2045                        pnv_ioda_setup_bus_dma(pe, dev->subordinate);
2046        }
2047}
2048
2049static void pnv_ioda2_take_ownership(struct iommu_table_group *table_group)
2050{
2051        struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
2052                                                table_group);
2053        /* Store @tbl as pnv_pci_ioda2_unset_window() resets it */
2054        struct iommu_table *tbl = pe->table_group.tables[0];
2055
2056        pnv_pci_ioda2_set_bypass(pe, false);
2057        pnv_pci_ioda2_unset_window(&pe->table_group, 0);
2058        if (pe->pbus)
2059                pnv_ioda_setup_bus_dma(pe, pe->pbus);
2060        else if (pe->pdev)
2061                set_iommu_table_base(&pe->pdev->dev, NULL);
2062        iommu_tce_table_put(tbl);
2063}
2064
2065static void pnv_ioda2_release_ownership(struct iommu_table_group *table_group)
2066{
2067        struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
2068                                                table_group);
2069
2070        pnv_pci_ioda2_setup_default_config(pe);
2071        if (pe->pbus)
2072                pnv_ioda_setup_bus_dma(pe, pe->pbus);
2073}
2074
2075static struct iommu_table_group_ops pnv_pci_ioda2_ops = {
2076        .get_table_size = pnv_pci_ioda2_get_table_size,
2077        .create_table = pnv_pci_ioda2_create_table_userspace,
2078        .set_window = pnv_pci_ioda2_set_window,
2079        .unset_window = pnv_pci_ioda2_unset_window,
2080        .take_ownership = pnv_ioda2_take_ownership,
2081        .release_ownership = pnv_ioda2_release_ownership,
2082};
2083#endif
2084
2085void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
2086                                struct pnv_ioda_pe *pe)
2087{
2088        int64_t rc;
2089
2090        /* TVE #1 is selected by PCI address bit 59 */
2091        pe->tce_bypass_base = 1ull << 59;
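            /*
             * Illustrative consequence: once bypass is enabled, bus
             * addresses take the form (1ULL << 59) + physical address
             * (see dma_offset in pnv_ioda_setup_bus_dma()), steering
             * DMA to TVE#1 rather than the 32-bit TCE window.
             */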
2092
2093        /* The PE will reserve all possible 32-bit space */
2094        pe_info(pe, "Setting up 32-bit TCE table at 0..%08x\n",
2095                phb->ioda.m32_pci_base);
2096
2097        /* Setup linux iommu table */
2098        pe->table_group.tce32_start = 0;
2099        pe->table_group.tce32_size = phb->ioda.m32_pci_base;
2100        pe->table_group.max_dynamic_windows_supported =
2101                        IOMMU_TABLE_GROUP_MAX_TABLES;
2102        pe->table_group.max_levels = POWERNV_IOMMU_MAX_LEVELS;
2103        pe->table_group.pgsizes = pnv_ioda_parse_tce_sizes(phb);
2104
2105        rc = pnv_pci_ioda2_setup_default_config(pe);
2106        if (rc)
2107                return;
2108
2109#ifdef CONFIG_IOMMU_API
2110        pe->table_group.ops = &pnv_pci_ioda2_ops;
2111        iommu_register_group(&pe->table_group, phb->hose->global_number,
2112                             pe->pe_number);
2113#endif
2114        pe->dma_setup_done = true;
2115}
2116
2117int64_t pnv_opal_pci_msi_eoi(struct irq_chip *chip, unsigned int hw_irq)
2118{
2119        struct pnv_phb *phb = container_of(chip, struct pnv_phb,
2120                                           ioda.irq_chip);
2121
2122        return opal_pci_msi_eoi(phb->opal_id, hw_irq);
2123}
2124
2125static void pnv_ioda2_msi_eoi(struct irq_data *d)
2126{
2127        int64_t rc;
2128        unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d);
2129        struct irq_chip *chip = irq_data_get_irq_chip(d);
2130
2131        rc = pnv_opal_pci_msi_eoi(chip, hw_irq);
2132        WARN_ON_ONCE(rc);
2133
2134        icp_native_eoi(d);
2135}
2136
2137
2138void pnv_set_msi_irq_chip(struct pnv_phb *phb, unsigned int virq)
2139{
2140        struct irq_data *idata;
2141        struct irq_chip *ichip;
2142
2143        /* The MSI EOI OPAL call is only needed on PHB3 */
2144        if (phb->model != PNV_PHB_MODEL_PHB3)
2145                return;
2146
2147        if (!phb->ioda.irq_chip_init) {
2148                /*
2149                 * The first time we set up an MSI IRQ, we need to set up
2150                 * the corresponding IRQ chip so that it routes correctly.
2151                 */
2152                idata = irq_get_irq_data(virq);
2153                ichip = irq_data_get_irq_chip(idata);
2154                phb->ioda.irq_chip_init = 1;
2155                phb->ioda.irq_chip = *ichip;
2156                phb->ioda.irq_chip.irq_eoi = pnv_ioda2_msi_eoi;
2157        }
2158        irq_set_chip(virq, &phb->ioda.irq_chip);
2159}
2160
2161/*
2162 * Returns true iff chip is something that we could call
2163 * pnv_opal_pci_msi_eoi for.
2164 */
2165bool is_pnv_opal_msi(struct irq_chip *chip)
2166{
2167        return chip->irq_eoi == pnv_ioda2_msi_eoi;
2168}
2169EXPORT_SYMBOL_GPL(is_pnv_opal_msi);
2170
2171static int pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev,
2172                                  unsigned int hwirq, unsigned int virq,
2173                                  unsigned int is_64, struct msi_msg *msg)
2174{
2175        struct pnv_ioda_pe *pe = pnv_ioda_get_pe(dev);
2176        unsigned int xive_num = hwirq - phb->msi_base;
2177        __be32 data;
2178        int rc;
2179
2180        /* No PE assigned ? bail out ... no MSI for you ! */
2181        if (pe == NULL)
2182                return -ENXIO;
2183
2184        /* Check if we have an MVE */
2185        if (pe->mve_number < 0)
2186                return -ENXIO;
2187
2188        /* Force 32-bit MSI on some broken devices */
2189        if (dev->no_64bit_msi)
2190                is_64 = 0;
2191
2192        /* Assign XIVE to PE */
2193        rc = opal_pci_set_xive_pe(phb->opal_id, pe->pe_number, xive_num);
2194        if (rc) {
2195                pr_warn("%s: OPAL error %d setting XIVE %d for PE\n",
2196                        pci_name(dev), rc, xive_num);
2197                return -EIO;
2198        }
2199
2200        if (is_64) {
2201                __be64 addr64;
2202
2203                rc = opal_get_msi_64(phb->opal_id, pe->mve_number, xive_num, 1,
2204                                     &addr64, &data);
2205                if (rc) {
2206                        pr_warn("%s: OPAL error %d getting 64-bit MSI data\n",
2207                                pci_name(dev), rc);
2208                        return -EIO;
2209                }
2210                msg->address_hi = be64_to_cpu(addr64) >> 32;
2211                msg->address_lo = be64_to_cpu(addr64) & 0xfffffffful;
2212        } else {
2213                __be32 addr32;
2214
2215                rc = opal_get_msi_32(phb->opal_id, pe->mve_number, xive_num, 1,
2216                                     &addr32, &data);
2217                if (rc) {
2218                        pr_warn("%s: OPAL error %d getting 32-bit MSI data\n",
2219                                pci_name(dev), rc);
2220                        return -EIO;
2221                }
2222                msg->address_hi = 0;
2223                msg->address_lo = be32_to_cpu(addr32);
2224        }
2225        msg->data = be32_to_cpu(data);
2226
2227        pnv_set_msi_irq_chip(phb, virq);
2228
2229        pr_devel("%s: %s-bit MSI on hwirq %x (xive #%d),"
2230                 " address=%x_%08x data=%x PE# %x\n",
2231                 pci_name(dev), is_64 ? "64" : "32", hwirq, xive_num,
2232                 msg->address_hi, msg->address_lo, data, pe->pe_number);
2233
2234        return 0;
2235}
2236
2237static void pnv_pci_init_ioda_msis(struct pnv_phb *phb)
2238{
2239        unsigned int count;
2240        const __be32 *prop = of_get_property(phb->hose->dn,
2241                                             "ibm,opal-msi-ranges", NULL);
2242        if (!prop) {
2243                /* BML Fallback */
2244                prop = of_get_property(phb->hose->dn, "msi-ranges", NULL);
2245        }
2246        if (!prop)
2247                return;
2248
2249        phb->msi_base = be32_to_cpup(prop);
2250        count = be32_to_cpup(prop + 1);
2251        if (msi_bitmap_alloc(&phb->msi_bmp, count, phb->hose->dn)) {
2252                pr_err("PCI %d: Failed to allocate MSI bitmap !\n",
2253                       phb->hose->global_number);
2254                return;
2255        }
2256
2257        phb->msi_setup = pnv_pci_ioda_msi_setup;
2258        phb->msi32_support = 1;
2259        pr_info("  Allocated bitmap for %d MSIs (base IRQ 0x%x)\n",
2260                count, phb->msi_base);
2261}
2262
2263static void pnv_ioda_setup_pe_res(struct pnv_ioda_pe *pe,
2264                                  struct resource *res)
2265{
2266        struct pnv_phb *phb = pe->phb;
2267        struct pci_bus_region region;
2268        int index;
2269        int64_t rc;
2270
2271        if (!res || !res->flags || res->start > res->end)
2272                return;
2273
2274        if (res->flags & IORESOURCE_IO) {
2275                region.start = res->start - phb->ioda.io_pci_base;
2276                region.end   = res->end - phb->ioda.io_pci_base;
2277                index = region.start / phb->ioda.io_segsize;
2278
2279                while (index < phb->ioda.total_pe_num &&
2280                       region.start <= region.end) {
2281                        phb->ioda.io_segmap[index] = pe->pe_number;
2282                        rc = opal_pci_map_pe_mmio_window(phb->opal_id,
2283                                pe->pe_number, OPAL_IO_WINDOW_TYPE, 0, index);
2284                        if (rc != OPAL_SUCCESS) {
2285                                pr_err("%s: Error %lld mapping IO segment#%d to PE#%x\n",
2286                                       __func__, rc, index, pe->pe_number);
2287                                break;
2288                        }
2289
2290                        region.start += phb->ioda.io_segsize;
2291                        index++;
2292                }
2293        } else if ((res->flags & IORESOURCE_MEM) &&
2294                   !pnv_pci_is_m64(phb, res)) {
2295                region.start = res->start -
2296                               phb->hose->mem_offset[0] -
2297                               phb->ioda.m32_pci_base;
2298                region.end   = res->end -
2299                               phb->hose->mem_offset[0] -
2300                               phb->ioda.m32_pci_base;
2301                index = region.start / phb->ioda.m32_segsize;
2302
2303                while (index < phb->ioda.total_pe_num &&
2304                       region.start <= region.end) {
2305                        phb->ioda.m32_segmap[index] = pe->pe_number;
2306                        rc = opal_pci_map_pe_mmio_window(phb->opal_id,
2307                                pe->pe_number, OPAL_M32_WINDOW_TYPE, 0, index);
2308                        if (rc != OPAL_SUCCESS) {
2309                                pr_err("%s: Error %lld mapping M32 segment#%d to PE#%x",
2310                                       __func__, rc, index, pe->pe_number);
2311                                break;
2312                        }
2313
2314                        region.start += phb->ioda.m32_segsize;
2315                        index++;
2316                }
2317        }
2318}
2319
2320/*
2321 * This function is supposed to be called on a per-PE basis, from
2322 * top to bottom, so that the I/O or MMIO segments assigned to a
2323 * parent PE can be overridden by its child PEs if necessary.
2324 */
2325static void pnv_ioda_setup_pe_seg(struct pnv_ioda_pe *pe)
2326{
2327        struct pci_dev *pdev;
2328        int i;
2329
2330        /*
2331         * NOTE: We only care about PCI-bus-based PEs for now.
2332         * PCI-device-based PEs, for example SR-IOV VFs, should
2333         * be figured out later.
2334         */
2335        BUG_ON(!(pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)));
2336
2337        list_for_each_entry(pdev, &pe->pbus->devices, bus_list) {
2338                for (i = 0; i <= PCI_ROM_RESOURCE; i++)
2339                        pnv_ioda_setup_pe_res(pe, &pdev->resource[i]);
2340
2341                /*
2342                 * If the PE contains all subordinate PCI buses, the
2343                 * windows of the child bridges should be mapped to
2344                 * the PE as well.
2345                 */
2346                if (!(pe->flags & PNV_IODA_PE_BUS_ALL) || !pci_is_bridge(pdev))
2347                        continue;
2348                for (i = 0; i < PCI_BRIDGE_RESOURCE_NUM; i++)
2349                        pnv_ioda_setup_pe_res(pe,
2350                                &pdev->resource[PCI_BRIDGE_RESOURCES + i]);
2351        }
2352}
2353
2354#ifdef CONFIG_DEBUG_FS
2355static int pnv_pci_diag_data_set(void *data, u64 val)
2356{
2357        struct pnv_phb *phb = data;
2358        s64 ret;
2359
2360        /* Retrieve the diag data from firmware */
2361        ret = opal_pci_get_phb_diag_data2(phb->opal_id, phb->diag_data,
2362                                          phb->diag_data_size);
2363        if (ret != OPAL_SUCCESS)
2364                return -EIO;
2365
2366        /* Print the diag data to the kernel log */
2367        pnv_pci_dump_phb_diag_data(phb->hose, phb->diag_data);
2368        return 0;
2369}
2370
2371DEFINE_DEBUGFS_ATTRIBUTE(pnv_pci_diag_data_fops, NULL, pnv_pci_diag_data_set,
2372                         "%llu\n");
2373
2374static int pnv_pci_ioda_pe_dump(void *data, u64 val)
2375{
2376        struct pnv_phb *phb = data;
2377        int pe_num;
2378
2379        for (pe_num = 0; pe_num < phb->ioda.total_pe_num; pe_num++) {
2380                struct pnv_ioda_pe *pe = &phb->ioda.pe_array[pe_num];
2381
2382                if (!test_bit(pe_num, phb->ioda.pe_alloc))
2383                        continue;
2384
2385                pe_warn(pe, "rid: %04x dev count: %2d flags: %s%s%s%s%s%s\n",
2386                        pe->rid, pe->device_count,
2387                        (pe->flags & PNV_IODA_PE_DEV) ? "dev " : "",
2388                        (pe->flags & PNV_IODA_PE_BUS) ? "bus " : "",
2389                        (pe->flags & PNV_IODA_PE_BUS_ALL) ? "all " : "",
2390                        (pe->flags & PNV_IODA_PE_MASTER) ? "master " : "",
2391                        (pe->flags & PNV_IODA_PE_SLAVE) ? "slave " : "",
2392                        (pe->flags & PNV_IODA_PE_VF) ? "vf " : "");
2393        }
2394
2395        return 0;
2396}
2397
2398DEFINE_DEBUGFS_ATTRIBUTE(pnv_pci_ioda_pe_dump_fops, NULL,
2399                         pnv_pci_ioda_pe_dump, "%llu\n");
2400
2401#endif /* CONFIG_DEBUG_FS */
2402
2403static void pnv_pci_ioda_create_dbgfs(void)
2404{
2405#ifdef CONFIG_DEBUG_FS
2406        struct pci_controller *hose, *tmp;
2407        struct pnv_phb *phb;
2408        char name[16];
2409
2410        list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
2411                phb = hose->private_data;
2412
2413                /* Mark initialization of this PHB as done */
2414                phb->initialized = 1;
2415
2416                sprintf(name, "PCI%04x", hose->global_number);
2417                phb->dbgfs = debugfs_create_dir(name, powerpc_debugfs_root);
2418
2419                debugfs_create_file_unsafe("dump_diag_regs", 0200, phb->dbgfs,
2420                                           phb, &pnv_pci_diag_data_fops);
2421                debugfs_create_file_unsafe("dump_ioda_pe_state", 0200, phb->dbgfs,
2422                                           phb, &pnv_pci_ioda_pe_dump_fops);
2423        }
2424#endif /* CONFIG_DEBUG_FS */
2425}
2426
2427static void pnv_pci_enable_bridge(struct pci_bus *bus)
2428{
2429        struct pci_dev *dev = bus->self;
2430        struct pci_bus *child;
2431
2432        /* Empty bus ? bail */
2433        if (list_empty(&bus->devices))
2434                return;
2435
2436        /*
2437         * If there's a bridge associated with that bus enable it. This works
2438         * around races in the generic code if the enabling is done during
2439         * parallel probing. This can be removed once those races have been
2440         * fixed.
2441         */
2442        if (dev) {
2443                int rc = pci_enable_device(dev);
2444                if (rc)
2445                        pci_err(dev, "Error enabling bridge (%d)\n", rc);
2446                pci_set_master(dev);
2447        }
2448
2449        /* Do the same for the child buses */
2450        list_for_each_entry(child, &bus->children, node)
2451                pnv_pci_enable_bridge(child);
2452}
2453
2454static void pnv_pci_enable_bridges(void)
2455{
2456        struct pci_controller *hose;
2457
2458        list_for_each_entry(hose, &hose_list, list_node)
2459                pnv_pci_enable_bridge(hose->bus);
2460}
2461
2462static void pnv_pci_ioda_fixup(void)
2463{
2464        pnv_pci_ioda_setup_nvlink();
2465        pnv_pci_ioda_create_dbgfs();
2466
2467        pnv_pci_enable_bridges();
2468
2469#ifdef CONFIG_EEH
2470        pnv_eeh_post_init();
2471#endif
2472}
2473
2474/*
2475 * Returns the alignment for I/O or memory windows for P2P
2476 * bridges. That actually depends on how PEs are segmented.
2477 * For now, we return the I/O or M32 segment size for PE-sensitive
2478 * P2P bridges. Otherwise, the default values (4KiB for I/O,
2479 * 1MiB for memory) will be returned.
2480 *
2481 * The current PCI bus might be put into one PE, which was
2482 * created against the parent PCI bridge. In that case, we
2483 * needn't enlarge the alignment, so that we can save some
2484 * resources.
2485 */
2486static resource_size_t pnv_pci_window_alignment(struct pci_bus *bus,
2487                                                unsigned long type)
2488{
2489        struct pnv_phb *phb = pci_bus_to_pnvhb(bus);
2490        int num_pci_bridges = 0;
2491        struct pci_dev *bridge;
2492
2493        bridge = bus->self;
2494        while (bridge) {
2495                if (pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE) {
2496                        num_pci_bridges++;
2497                        if (num_pci_bridges >= 2)
2498                                return 1;
2499                }
2500
2501                bridge = bridge->bus->self;
2502        }
2503
2504        /*
2505         * We fall back to M32 if M64 isn't supported. We enforce the M64
2506         * alignment for any 64-bit resource; PCIe doesn't care, and
2507         * bridges only do 64-bit prefetchable anyway.
2508         */
2509        if (phb->ioda.m64_segsize && pnv_pci_is_m64_flags(type))
2510                return phb->ioda.m64_segsize;
2511        if (type & IORESOURCE_MEM)
2512                return phb->ioda.m32_segsize;
2513
2514        return phb->ioda.io_segsize;
2515}
2516
2517/*
2518 * We update the root port, or the upstream port of the bridge
2519 * behind the root port, with the PHB's windows in order to
2520 * accommodate changes in the resources required during PCI
2521 * (slot) hotplug; the slot is connected to either the root
2522 * port or the downstream ports of a PCIe switch behind the
2523 * root port.
2524 */
2525static void pnv_pci_fixup_bridge_resources(struct pci_bus *bus,
2526                                           unsigned long type)
2527{
2528        struct pci_controller *hose = pci_bus_to_host(bus);
2529        struct pnv_phb *phb = hose->private_data;
2530        struct pci_dev *bridge = bus->self;
2531        struct resource *r, *w;
2532        bool msi_region = false;
2533        int i;
2534
2535        /* Check if we need to apply the fixup to the bridge's windows */
2536        if (!pci_is_root_bus(bridge->bus) &&
2537            !pci_is_root_bus(bridge->bus->self->bus))
2538                return;
2539
2540        /* Fixup the resources */
2541        for (i = 0; i < PCI_BRIDGE_RESOURCE_NUM; i++) {
2542                r = &bridge->resource[PCI_BRIDGE_RESOURCES + i];
2543                if (!r->flags || !r->parent)
2544                        continue;
2545
2546                w = NULL;
2547                if (r->flags & type & IORESOURCE_IO)
2548                        w = &hose->io_resource;
2549                else if (pnv_pci_is_m64(phb, r) &&
2550                         (type & IORESOURCE_PREFETCH) &&
2551                         phb->ioda.m64_segsize)
2552                        w = &hose->mem_resources[1];
2553                else if (r->flags & type & IORESOURCE_MEM) {
2554                        w = &hose->mem_resources[0];
2555                        msi_region = true;
2556                }
2557
2558                r->start = w->start;
2559                r->end = w->end;
2560
2561                /* The 64KB 32-bit MSI region shouldn't be included in
2562                 * the 32-bit bridge window. Otherwise, we can see strange
2563                 * issues, one of them being an EEH error observed on Garrison.
2564                 *
2565                 * Exclude the top 1MB region, which is the minimal alignment
2566                 * of the 32-bit bridge window.
2567                 */
2568                if (msi_region) {
2569                        r->end += 0x10000;
2570                        r->end -= 0x100000;
2571                }
2572        }
2573}
2574
2575static void pnv_pci_configure_bus(struct pci_bus *bus)
2576{
2577        struct pci_dev *bridge = bus->self;
2578        struct pnv_ioda_pe *pe;
2579        bool all = (bridge && pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE);
2580
2581        dev_info(&bus->dev, "Configuring PE for bus\n");
2582
2583        /* Don't assign a PE to a PCI bus that has no subordinate devices */
2584        if (WARN_ON(list_empty(&bus->devices)))
2585                return;
2586
2587        /* Reserve PEs according to used M64 resources */
2588        pnv_ioda_reserve_m64_pe(bus, NULL, all);
2589
2590        /*
2591         * Assign a PE. We might get here because of partial hotplug;
2592         * in that case, we just pick up the existing PE and must not
2593         * allocate resources again.
2594         */
2595        pe = pnv_ioda_setup_bus_PE(bus, all);
2596        if (!pe)
2597                return;
2598
2599        pnv_ioda_setup_pe_seg(pe);
2600}
2601
2602static resource_size_t pnv_pci_default_alignment(void)
2603{
2604        return PAGE_SIZE;
2605}
2606
2607/* Prevent enabling devices for which we couldn't properly
2608 * assign a PE
2609 */
2610static bool pnv_pci_enable_device_hook(struct pci_dev *dev)
2611{
2612        struct pnv_phb *phb = pci_bus_to_pnvhb(dev->bus);
2613        struct pci_dn *pdn;
2614
2615        /* This function may be called before the PEs have been
2616         * created, for example during resource reassignment in the
2617         * PCI probe period. We just skip the check if the PEs
2618         * aren't ready yet.
2619         */
2620        if (!phb->initialized)
2621                return true;
2622
2623        pdn = pci_get_pdn(dev);
2624        if (!pdn || pdn->pe_number == IODA_INVALID_PE)
2625                return false;
2626
2627        return true;
2628}
2629
2630static bool pnv_ocapi_enable_device_hook(struct pci_dev *dev)
2631{
2632        struct pci_controller *hose = pci_bus_to_host(dev->bus);
2633        struct pnv_phb *phb = hose->private_data;
2634        struct pci_dn *pdn;
2635        struct pnv_ioda_pe *pe;
2636
2637        if (!phb->initialized)
2638                return true;
2639
2640        pdn = pci_get_pdn(dev);
2641        if (!pdn)
2642                return false;
2643
2644        if (pdn->pe_number == IODA_INVALID_PE) {
2645                pe = pnv_ioda_setup_dev_PE(dev);
2646                if (!pe)
2647                        return false;
2648        }
2649        return true;
2650}
2651
2652static long pnv_pci_ioda1_unset_window(struct iommu_table_group *table_group,
2653                                       int num)
2654{
2655        struct pnv_ioda_pe *pe = container_of(table_group,
2656                                              struct pnv_ioda_pe, table_group);
2657        struct pnv_phb *phb = pe->phb;
2658        unsigned int idx;
2659        long rc;
2660
2661        pe_info(pe, "Removing DMA window #%d\n", num);
2662        for (idx = 0; idx < phb->ioda.dma32_count; idx++) {
2663                if (phb->ioda.dma32_segmap[idx] != pe->pe_number)
2664                        continue;
2665
2666                rc = opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number,
2667                                                idx, 0, 0ul, 0ul, 0ul);
2668                if (rc != OPAL_SUCCESS) {
2669                        pe_warn(pe, "Failure %ld unmapping DMA32 segment#%d\n",
2670                                rc, idx);
2671                        return rc;
2672                }
2673
2674                phb->ioda.dma32_segmap[idx] = IODA_INVALID_PE;
2675        }
2676
2677        pnv_pci_unlink_table_and_group(table_group->tables[num], table_group);
2678        return OPAL_SUCCESS;
2679}
2680
2681static void pnv_pci_ioda1_release_pe_dma(struct pnv_ioda_pe *pe)
2682{
2683        struct iommu_table *tbl = pe->table_group.tables[0];
2684        int64_t rc;
2685
2686        if (!pe->dma_setup_done)
2687                return;
2688
2689        rc = pnv_pci_ioda1_unset_window(&pe->table_group, 0);
2690        if (rc != OPAL_SUCCESS)
2691                return;
2692
2693        pnv_pci_p7ioc_tce_invalidate(tbl, tbl->it_offset, tbl->it_size, false);
2694        if (pe->table_group.group) {
2695                iommu_group_put(pe->table_group.group);
2696                WARN_ON(pe->table_group.group);
2697        }
2698
2699        free_pages(tbl->it_base, get_order(tbl->it_size << 3));
2700        iommu_tce_table_put(tbl);
2701}
2702
2703void pnv_pci_ioda2_release_pe_dma(struct pnv_ioda_pe *pe)
2704{
2705        struct iommu_table *tbl = pe->table_group.tables[0];
2706        int64_t rc;
2707
2708        if (!pe->dma_setup_done)
2709                return;
2710
2711        rc = pnv_pci_ioda2_unset_window(&pe->table_group, 0);
2712        if (rc)
2713                pe_warn(pe, "OPAL error %lld releasing DMA window\n", rc);
2714
2715        pnv_pci_ioda2_set_bypass(pe, false);
2716        if (pe->table_group.group) {
2717                iommu_group_put(pe->table_group.group);
2718                WARN_ON(pe->table_group.group);
2719        }
2720
2721        iommu_tce_table_put(tbl);
2722}
2723
2724static void pnv_ioda_free_pe_seg(struct pnv_ioda_pe *pe,
2725                                 unsigned short win,
2726                                 unsigned int *map)
2727{
2728        struct pnv_phb *phb = pe->phb;
2729        int idx;
2730        int64_t rc;
2731
2732        for (idx = 0; idx < phb->ioda.total_pe_num; idx++) {
2733                if (map[idx] != pe->pe_number)
2734                        continue;
2735
2736                rc = opal_pci_map_pe_mmio_window(phb->opal_id,
2737                                phb->ioda.reserved_pe_idx, win, 0, idx);
2738
2739                if (rc != OPAL_SUCCESS)
2740                        pe_warn(pe, "Error %lld unmapping (%d) segment#%d\n",
2741                                rc, win, idx);
2742
2743                map[idx] = IODA_INVALID_PE;
2744        }
2745}
2746
2747static void pnv_ioda_release_pe_seg(struct pnv_ioda_pe *pe)
2748{
2749        struct pnv_phb *phb = pe->phb;
2750
2751        if (phb->type == PNV_PHB_IODA1) {
2752                pnv_ioda_free_pe_seg(pe, OPAL_IO_WINDOW_TYPE,
2753                                     phb->ioda.io_segmap);
2754                pnv_ioda_free_pe_seg(pe, OPAL_M32_WINDOW_TYPE,
2755                                     phb->ioda.m32_segmap);
2756                /* M64 is pre-configured by pnv_ioda1_init_m64() */
2757        } else if (phb->type == PNV_PHB_IODA2) {
2758                pnv_ioda_free_pe_seg(pe, OPAL_M32_WINDOW_TYPE,
2759                                     phb->ioda.m32_segmap);
2760        }
2761}
2762
2763static void pnv_ioda_release_pe(struct pnv_ioda_pe *pe)
2764{
2765        struct pnv_phb *phb = pe->phb;
2766        struct pnv_ioda_pe *slave, *tmp;
2767
2768        pe_info(pe, "Releasing PE\n");
2769
2770        mutex_lock(&phb->ioda.pe_list_mutex);
2771        list_del(&pe->list);
2772        mutex_unlock(&phb->ioda.pe_list_mutex);
2773
2774        switch (phb->type) {
2775        case PNV_PHB_IODA1:
2776                pnv_pci_ioda1_release_pe_dma(pe);
2777                break;
2778        case PNV_PHB_IODA2:
2779                pnv_pci_ioda2_release_pe_dma(pe);
2780                break;
2781        case PNV_PHB_NPU_OCAPI:
2782                break;
2783        default:
2784                WARN_ON(1);
2785        }
2786
2787        pnv_ioda_release_pe_seg(pe);
2788        pnv_ioda_deconfigure_pe(pe->phb, pe);
2789
2790        /* Release slave PEs in the compound PE */
2791        if (pe->flags & PNV_IODA_PE_MASTER) {
2792                list_for_each_entry_safe(slave, tmp, &pe->slaves, list) {
2793                        list_del(&slave->list);
2794                        pnv_ioda_free_pe(slave);
2795                }
2796        }
2797
2798        /*
2799         * The PE for the root bus can be removed because of hotplug during
2800         * EEH recovery for a fenced PHB error. We need to mark the PE dead
2801         * so that it can be populated again in the PCI hot-add path. The
2802         * PE shouldn't be destroyed as it's a global reserved resource.
2803         */
2804        if (phb->ioda.root_pe_idx == pe->pe_number)
2805                return;
2806
2807        pnv_ioda_free_pe(pe);
2808}
2809
2810static void pnv_pci_release_device(struct pci_dev *pdev)
2811{
2812        struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus);
2813        struct pci_dn *pdn = pci_get_pdn(pdev);
2814        struct pnv_ioda_pe *pe;
2815
2816        /* The VF PE state is torn down when sriov_disable() is called */
2817        if (pdev->is_virtfn)
2818                return;
2819
2820        if (!pdn || pdn->pe_number == IODA_INVALID_PE)
2821                return;
2822
2823#ifdef CONFIG_PCI_IOV
2824        /*
2825         * FIXME: Try move this to sriov_disable(). It's here since we allocate
2826         * the iov state at probe time since we need to fiddle with the IOV
2827         * resources.
2828         */
2829        if (pdev->is_physfn)
2830                kfree(pdev->dev.archdata.iov_data);
2831#endif
2832
2833        /*
2834         * PCI hotplug can happen as part of EEH error recovery. The @pdn
2835         * isn't removed and re-added in this scenario, so we should set
2836         * the PE number in @pdn to an invalid one. Otherwise, the PE's
2837         * device count is decreased on removing devices but fails to be
2838         * increased on adding them back, leading to an unbalanced device
2839         * count that eventually breaks the normal PCI hotplug path.
2840         */
2841        pe = &phb->ioda.pe_array[pdn->pe_number];
2842        pdn->pe_number = IODA_INVALID_PE;
2843
2844        WARN_ON(--pe->device_count < 0);
2845        if (pe->device_count == 0)
2846                pnv_ioda_release_pe(pe);
2847}
2848
2849static void pnv_npu_disable_device(struct pci_dev *pdev)
2850{
2851        struct eeh_dev *edev = pci_dev_to_eeh_dev(pdev);
2852        struct eeh_pe *eehpe = edev ? edev->pe : NULL;
2853
2854        if (eehpe && eeh_ops && eeh_ops->reset)
2855                eeh_ops->reset(eehpe, EEH_RESET_HOT);
2856}
2857
2858static void pnv_pci_ioda_shutdown(struct pci_controller *hose)
2859{
2860        struct pnv_phb *phb = hose->private_data;
2861
2862        opal_pci_reset(phb->opal_id, OPAL_RESET_PCI_IODA_TABLE,
2863                       OPAL_ASSERT_RESET);
2864}
2865
2866static void pnv_pci_ioda_dma_bus_setup(struct pci_bus *bus)
2867{
2868        struct pnv_phb *phb = pci_bus_to_pnvhb(bus);
2869        struct pnv_ioda_pe *pe;
2870
        list_for_each_entry(pe, &phb->ioda.pe_list, list) {
                if (!(pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)))
                        continue;

                if (!pe->pbus)
                        continue;

                if (bus->number == ((pe->rid >> 8) & 0xFF)) {
                        pe->pbus = bus;
                        break;
                }
        }
}

static const struct pci_controller_ops pnv_pci_ioda_controller_ops = {
        .dma_dev_setup          = pnv_pci_ioda_dma_dev_setup,
        .dma_bus_setup          = pnv_pci_ioda_dma_bus_setup,
        .iommu_bypass_supported = pnv_pci_ioda_iommu_bypass_supported,
        .setup_msi_irqs         = pnv_setup_msi_irqs,
        .teardown_msi_irqs      = pnv_teardown_msi_irqs,
        .enable_device_hook     = pnv_pci_enable_device_hook,
        .release_device         = pnv_pci_release_device,
        .window_alignment       = pnv_pci_window_alignment,
        .setup_bridge           = pnv_pci_fixup_bridge_resources,
        .reset_secondary_bus    = pnv_pci_reset_secondary_bus,
        .shutdown               = pnv_pci_ioda_shutdown,
};

static const struct pci_controller_ops pnv_npu_ioda_controller_ops = {
        .setup_msi_irqs         = pnv_setup_msi_irqs,
        .teardown_msi_irqs      = pnv_teardown_msi_irqs,
        .enable_device_hook     = pnv_pci_enable_device_hook,
        .window_alignment       = pnv_pci_window_alignment,
        .reset_secondary_bus    = pnv_pci_reset_secondary_bus,
        .shutdown               = pnv_pci_ioda_shutdown,
        .disable_device         = pnv_npu_disable_device,
};

static const struct pci_controller_ops pnv_npu_ocapi_ioda_controller_ops = {
        .enable_device_hook     = pnv_ocapi_enable_device_hook,
        .release_device         = pnv_pci_release_device,
        .window_alignment       = pnv_pci_window_alignment,
        .reset_secondary_bus    = pnv_pci_reset_secondary_bus,
        .shutdown               = pnv_pci_ioda_shutdown,
};

static void __init pnv_pci_init_ioda_phb(struct device_node *np,
                                         u64 hub_id, int ioda_type)
{
        struct pci_controller *hose;
        struct pnv_phb *phb;
        unsigned long size, m64map_off, m32map_off, pemap_off;
        unsigned long iomap_off = 0, dma32map_off = 0;
        struct pnv_ioda_pe *root_pe;
        struct resource r;
        const __be64 *prop64;
        const __be32 *prop32;
        int len;
        unsigned int segno;
        u64 phb_id;
        void *aux;
        long rc;

        if (!of_device_is_available(np))
                return;

        pr_info("Initializing %s PHB (%pOF)\n", pnv_phb_names[ioda_type], np);

        prop64 = of_get_property(np, "ibm,opal-phbid", NULL);
        if (!prop64) {
                pr_err("  Missing \"ibm,opal-phbid\" property !\n");
                return;
        }
        phb_id = be64_to_cpup(prop64);
        pr_debug("  PHB-ID  : 0x%016llx\n", phb_id);

        phb = memblock_alloc(sizeof(*phb), SMP_CACHE_BYTES);
        if (!phb)
                panic("%s: Failed to allocate %zu bytes\n", __func__,
                      sizeof(*phb));

        /* Allocate PCI controller */
        phb->hose = hose = pcibios_alloc_controller(np);
        if (!phb->hose) {
                pr_err("  Can't allocate PCI controller for %pOF\n",
                       np);
                memblock_free(__pa(phb), sizeof(struct pnv_phb));
                return;
        }

        spin_lock_init(&phb->lock);
        prop32 = of_get_property(np, "bus-range", &len);
        if (prop32 && len == 8) {
                hose->first_busno = be32_to_cpu(prop32[0]);
                hose->last_busno = be32_to_cpu(prop32[1]);
        } else {
                pr_warn("  Broken <bus-range> on %pOF\n", np);
                hose->first_busno = 0;
                hose->last_busno = 0xff;
        }
        hose->private_data = phb;
        phb->hub_id = hub_id;
        phb->opal_id = phb_id;
        phb->type = ioda_type;
        mutex_init(&phb->ioda.pe_alloc_mutex);

        /* Detect specific models for error handling */
        if (of_device_is_compatible(np, "ibm,p7ioc-pciex"))
                phb->model = PNV_PHB_MODEL_P7IOC;
        else if (of_device_is_compatible(np, "ibm,power8-pciex"))
                phb->model = PNV_PHB_MODEL_PHB3;
        else if (of_device_is_compatible(np, "ibm,power8-npu-pciex"))
                phb->model = PNV_PHB_MODEL_NPU;
        else if (of_device_is_compatible(np, "ibm,power9-npu-pciex"))
                phb->model = PNV_PHB_MODEL_NPU2;
        else
                phb->model = PNV_PHB_MODEL_UNKNOWN;

        /* Initialize diagnostic data buffer */
        prop32 = of_get_property(np, "ibm,phb-diag-data-size", NULL);
        if (prop32)
                phb->diag_data_size = be32_to_cpup(prop32);
        else
                phb->diag_data_size = PNV_PCI_DIAG_BUF_SIZE;

        phb->diag_data = memblock_alloc(phb->diag_data_size, SMP_CACHE_BYTES);
        if (!phb->diag_data)
                panic("%s: Failed to allocate %u bytes\n", __func__,
                      phb->diag_data_size);

        /* Parse 32-bit and IO ranges (if any) */
        pci_process_bridge_OF_ranges(hose, np, !hose->global_number);

        /* Get registers */
        if (!of_address_to_resource(np, 0, &r)) {
                phb->regs_phys = r.start;
                phb->regs = ioremap(r.start, resource_size(&r));
                if (phb->regs == NULL)
                        pr_err("  Failed to map registers !\n");
        }

        /* Initialize more IODA stuff */
        phb->ioda.total_pe_num = 1;
        prop32 = of_get_property(np, "ibm,opal-num-pes", NULL);
        if (prop32)
                phb->ioda.total_pe_num = be32_to_cpup(prop32);
        prop32 = of_get_property(np, "ibm,opal-reserved-pe", NULL);
        if (prop32)
                phb->ioda.reserved_pe_idx = be32_to_cpup(prop32);

        /* Invalidate RID to PE# mapping */
        for (segno = 0; segno < ARRAY_SIZE(phb->ioda.pe_rmap); segno++)
                phb->ioda.pe_rmap[segno] = IODA_INVALID_PE;

        /* Parse 64-bit MMIO range */
        pnv_ioda_parse_m64_window(phb);

        phb->ioda.m32_size = resource_size(&hose->mem_resources[0]);
        /*
         * FW has already chopped the top 64K (MSI space) off the M32
         * window; add it back for segment-size accounting.
         */
        phb->ioda.m32_size += 0x10000;

        phb->ioda.m32_segsize = phb->ioda.m32_size / phb->ioda.total_pe_num;
        phb->ioda.m32_pci_base = hose->mem_resources[0].start - hose->mem_offset[0];
        phb->ioda.io_size = hose->pci_io_size;
        phb->ioda.io_segsize = phb->ioda.io_size / phb->ioda.total_pe_num;
        phb->ioda.io_pci_base = 0; /* XXX calculate this ? */

        /* Calculate how many 32-bit TCE segments we have */
        phb->ioda.dma32_count = phb->ioda.m32_pci_base /
                                PNV_IODA1_DMA32_SEGSIZE;

        /* Allocate aux data & arrays. We don't have IO ports on PHB3 */
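        /*
         * Everything below lives in a single memblock allocation, laid
         * out as: [PE allocation bitmap][m64_segmap][m32_segmap]
         * [io_segmap + dma32_segmap (IODA1 only)][pe_array].
         */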
        size = ALIGN(max_t(unsigned, phb->ioda.total_pe_num, 8) / 8,
                        sizeof(unsigned long));
        m64map_off = size;
        size += phb->ioda.total_pe_num * sizeof(phb->ioda.m64_segmap[0]);
        m32map_off = size;
        size += phb->ioda.total_pe_num * sizeof(phb->ioda.m32_segmap[0]);
        if (phb->type == PNV_PHB_IODA1) {
                iomap_off = size;
                size += phb->ioda.total_pe_num * sizeof(phb->ioda.io_segmap[0]);
                dma32map_off = size;
                size += phb->ioda.dma32_count *
                        sizeof(phb->ioda.dma32_segmap[0]);
        }
        pemap_off = size;
        size += phb->ioda.total_pe_num * sizeof(struct pnv_ioda_pe);
        aux = memblock_alloc(size, SMP_CACHE_BYTES);
        if (!aux)
                panic("%s: Failed to allocate %lu bytes\n", __func__, size);
        phb->ioda.pe_alloc = aux;
        phb->ioda.m64_segmap = aux + m64map_off;
        phb->ioda.m32_segmap = aux + m32map_off;
        for (segno = 0; segno < phb->ioda.total_pe_num; segno++) {
                phb->ioda.m64_segmap[segno] = IODA_INVALID_PE;
                phb->ioda.m32_segmap[segno] = IODA_INVALID_PE;
        }
        if (phb->type == PNV_PHB_IODA1) {
                phb->ioda.io_segmap = aux + iomap_off;
                for (segno = 0; segno < phb->ioda.total_pe_num; segno++)
                        phb->ioda.io_segmap[segno] = IODA_INVALID_PE;

                phb->ioda.dma32_segmap = aux + dma32map_off;
                for (segno = 0; segno < phb->ioda.dma32_count; segno++)
                        phb->ioda.dma32_segmap[segno] = IODA_INVALID_PE;
        }
        phb->ioda.pe_array = aux + pemap_off;

        /*
         * Choose the PE number for the root bus, which shouldn't have
         * any M64 resources consumed by its child devices. Try to pick
         * a PE number adjacent to the reserved one if possible.
         */
        pnv_ioda_reserve_pe(phb, phb->ioda.reserved_pe_idx);
        if (phb->ioda.reserved_pe_idx == 0) {
                phb->ioda.root_pe_idx = 1;
                pnv_ioda_reserve_pe(phb, phb->ioda.root_pe_idx);
        } else if (phb->ioda.reserved_pe_idx == (phb->ioda.total_pe_num - 1)) {
                phb->ioda.root_pe_idx = phb->ioda.reserved_pe_idx - 1;
                pnv_ioda_reserve_pe(phb, phb->ioda.root_pe_idx);
        } else {
                /* otherwise just allocate one */
                root_pe = pnv_ioda_alloc_pe(phb, 1);
                phb->ioda.root_pe_idx = root_pe->pe_number;
        }

        INIT_LIST_HEAD(&phb->ioda.pe_list);
        mutex_init(&phb->ioda.pe_list_mutex);
#if 0 /* We should really do that ... */
        rc = opal_pci_set_phb_mem_window(opal->phb_id,
                                         window_type,
                                         window_num,
                                         starting_real_address,
                                         starting_pci_address,
                                         segment_size);
#endif

        pr_info("  %03d (%03d) PE's M32: 0x%x [segment=0x%x]\n",
                phb->ioda.total_pe_num, phb->ioda.reserved_pe_idx,
                phb->ioda.m32_size, phb->ioda.m32_segsize);
        if (phb->ioda.m64_size)
                pr_info("                 M64: 0x%lx [segment=0x%lx]\n",
                        phb->ioda.m64_size, phb->ioda.m64_segsize);
        if (phb->ioda.io_size)
                pr_info("                  IO: 0x%x [segment=0x%x]\n",
                        phb->ioda.io_size, phb->ioda.io_segsize);

        phb->hose->ops = &pnv_pci_ops;
        phb->get_pe_state = pnv_ioda_get_pe_state;
        phb->freeze_pe = pnv_ioda_freeze_pe;
        phb->unfreeze_pe = pnv_ioda_unfreeze_pe;

        /* Setup MSI support */
        pnv_pci_init_ioda_msis(phb);

        /*
         * We set the PCI probe flag PCI_REASSIGN_ALL_RSRC (below) to
         * let the PCI core do the resource assignment. The core is
         * expected to apply correct I/O and MMIO alignment to the P2P
         * bridge BARs so that each PCI bus (excluding the child P2P
         * bridges) can form an individual PE.
         */
        ppc_md.pcibios_fixup = pnv_pci_ioda_fixup;

        switch (phb->type) {
        case PNV_PHB_NPU_NVLINK:
                hose->controller_ops = pnv_npu_ioda_controller_ops;
                break;
        case PNV_PHB_NPU_OCAPI:
                hose->controller_ops = pnv_npu_ocapi_ioda_controller_ops;
                break;
        default:
                hose->controller_ops = pnv_pci_ioda_controller_ops;
        }

        ppc_md.pcibios_default_alignment = pnv_pci_default_alignment;

#ifdef CONFIG_PCI_IOV
        ppc_md.pcibios_fixup_sriov = pnv_pci_ioda_fixup_iov;
        ppc_md.pcibios_iov_resource_alignment = pnv_pci_iov_resource_alignment;
        ppc_md.pcibios_sriov_enable = pnv_pcibios_sriov_enable;
        ppc_md.pcibios_sriov_disable = pnv_pcibios_sriov_disable;
#endif

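        /* Throw away the firmware resource assignments (see comment above) */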
        pci_add_flags(PCI_REASSIGN_ALL_RSRC);

        /* Reset IODA tables to a clean state */
        rc = opal_pci_reset(phb_id, OPAL_RESET_PCI_IODA_TABLE, OPAL_ASSERT_RESET);
        if (rc)
                pr_warn("  OPAL Error %ld performing IODA table reset !\n", rc);

        /*
         * If we're running in a kdump kernel, the previous kernel never
         * shut down PCI devices correctly. The IODA tables have already
         * been cleaned out above, but we still have to issue a PHB reset
         * to stop all PCI transactions left over from the previous
         * kernel. The ppc_pci_reset_phbs kernel parameter forces this
         * reset too. Additionally, if the IODA reset above failed, use
         * a bigger hammer; that can happen if we hit a PHB fatal error
         * very early in boot.
         */
        if (is_kdump_kernel() || pci_reset_phbs || rc) {
                pr_info("  Issue PHB reset ...\n");
                pnv_eeh_phb_reset(hose, EEH_RESET_FUNDAMENTAL);
                pnv_eeh_phb_reset(hose, EEH_RESET_DEACTIVATE);
        }

        /* Remove M64 resource if we can't configure it successfully */
        if (!phb->init_m64 || phb->init_m64(phb))
                hose->mem_resources[1].flags = 0;
}

void __init pnv_pci_init_ioda2_phb(struct device_node *np)
{
        pnv_pci_init_ioda_phb(np, 0, PNV_PHB_IODA2);
}

void __init pnv_pci_init_npu_phb(struct device_node *np)
{
        pnv_pci_init_ioda_phb(np, 0, PNV_PHB_NPU_NVLINK);
}

void __init pnv_pci_init_npu2_opencapi_phb(struct device_node *np)
{
        pnv_pci_init_ioda_phb(np, 0, PNV_PHB_NPU_OCAPI);
}

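/*
 * OpenCAPI devices expose the PCIe-sized (4K) config space but carry
 * no PCIe capability, so the PCI core would otherwise limit cfg_size
 * to 256 bytes.
 */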
static void pnv_npu2_opencapi_cfg_size_fixup(struct pci_dev *dev)
{
        struct pnv_phb *phb = pci_bus_to_pnvhb(dev->bus);

        if (!machine_is(powernv))
                return;

        if (phb->type == PNV_PHB_NPU_OCAPI)
                dev->cfg_size = PCI_CFG_SPACE_EXP_SIZE;
}
DECLARE_PCI_FIXUP_EARLY(PCI_ANY_ID, PCI_ANY_ID, pnv_npu2_opencapi_cfg_size_fixup);

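/*
 * Illustrative sketch of the device-tree shape consumed below. Node
 * names and values are made up; only the properties and compatible
 * string match what is actually parsed:
 *
 *      io-hub {
 *              ibm,opal-hubid = <0x0 0x1>;
 *              pciex {
 *                      compatible = "ibm,ioda-phb";
 *                      ibm,opal-phbid = <0x0 0x0>;
 *              };
 *      };
 */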
void __init pnv_pci_init_ioda_hub(struct device_node *np)
{
        struct device_node *phbn;
        const __be64 *prop64;
        u64 hub_id;

        pr_info("Probing IODA IO-Hub %pOF\n", np);

        prop64 = of_get_property(np, "ibm,opal-hubid", NULL);
        if (!prop64) {
                pr_err(" Missing \"ibm,opal-hubid\" property !\n");
                return;
        }
        hub_id = be64_to_cpup(prop64);
        pr_devel(" HUB-ID : 0x%016llx\n", hub_id);

        /* Walk the child nodes, initializing any IODA1 PHBs we find */
        for_each_child_of_node(np, phbn) {
                if (of_device_is_compatible(phbn, "ibm,ioda-phb"))
                        pnv_pci_init_ioda_phb(phbn, hub_id, PNV_PHB_IODA1);
        }
}