linux/arch/powerpc/platforms/powernv/pci-ioda.c
/*
 * Support PCI/PCIe on PowerNV platforms
 *
 * Copyright 2011 Benjamin Herrenschmidt, IBM Corp.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */

#undef DEBUG

#include <linux/kernel.h>
#include <linux/pci.h>
#include <linux/crash_dump.h>
#include <linux/delay.h>
#include <linux/string.h>
#include <linux/init.h>
#include <linux/memblock.h>
#include <linux/irq.h>
#include <linux/io.h>
#include <linux/msi.h>
#include <linux/iommu.h>
#include <linux/rculist.h>
#include <linux/sizes.h>

#include <asm/sections.h>
#include <asm/io.h>
#include <asm/prom.h>
#include <asm/pci-bridge.h>
#include <asm/machdep.h>
#include <asm/msi_bitmap.h>
#include <asm/ppc-pci.h>
#include <asm/opal.h>
#include <asm/iommu.h>
#include <asm/tce.h>
#include <asm/xics.h>
#include <asm/debugfs.h>
#include <asm/firmware.h>
#include <asm/pnv-pci.h>
#include <asm/mmzone.h>

#include <misc/cxl-base.h>

#include "powernv.h"
#include "pci.h"
#include "../../../../drivers/pci/pci.h"

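/*
 * IODA1 (P7IOC) M64 geometry: 16 M64 BARs, each split into 8 segments,
 * i.e. 16 * 8 = 128 M64 segments, matching the maximum number of PEs.
 */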
#define PNV_IODA1_M64_NUM       16      /* Number of M64 BARs   */
#define PNV_IODA1_M64_SEGS      8       /* Segments per M64 BAR */
#define PNV_IODA1_DMA32_SEGSIZE 0x10000000

static const char * const pnv_phb_names[] = { "IODA1", "IODA2", "NPU_NVLINK",
                                              "NPU_OCAPI" };

void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
                            const char *fmt, ...)
{
        struct va_format vaf;
        va_list args;
        char pfix[32];

        va_start(args, fmt);

        vaf.fmt = fmt;
        vaf.va = &args;

        if (pe->flags & PNV_IODA_PE_DEV)
                strlcpy(pfix, dev_name(&pe->pdev->dev), sizeof(pfix));
        else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))
                sprintf(pfix, "%04x:%02x     ",
                        pci_domain_nr(pe->pbus), pe->pbus->number);
#ifdef CONFIG_PCI_IOV
        else if (pe->flags & PNV_IODA_PE_VF)
                sprintf(pfix, "%04x:%02x:%2x.%d",
                        pci_domain_nr(pe->parent_dev->bus),
                        (pe->rid & 0xff00) >> 8,
                        PCI_SLOT(pe->rid), PCI_FUNC(pe->rid));
#endif /* CONFIG_PCI_IOV */

        printk("%spci %s: [PE# %.2x] %pV",
               level, pfix, pe->pe_number, &vaf);

        va_end(args);
}

static bool pnv_iommu_bypass_disabled __read_mostly;
static bool pci_reset_phbs __read_mostly;

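/*
 * Handle the "iommu=" kernel command line option. Only the "nobypass"
 * token matters here: it keeps the 64-bit DMA bypass window disabled,
 * forcing all DMA through the TCE tables.
 */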
static int __init iommu_setup(char *str)
{
        if (!str)
                return -EINVAL;

        while (*str) {
                if (!strncmp(str, "nobypass", 8)) {
                        pnv_iommu_bypass_disabled = true;
                        pr_info("PowerNV: IOMMU bypass window disabled.\n");
                        break;
                }
                str += strcspn(str, ",");
                if (*str == ',')
                        str++;
        }

        return 0;
}
early_param("iommu", iommu_setup);

static int __init pci_reset_phbs_setup(char *str)
{
        pci_reset_phbs = true;
        return 0;
}

early_param("ppc_pci_reset_phbs", pci_reset_phbs_setup);

static inline bool pnv_pci_is_m64(struct pnv_phb *phb, struct resource *r)
{
        /*
         * WARNING: We cannot rely on the resource flags. The Linux PCI
         * allocation code sometimes decides to put a 64-bit prefetchable
         * BAR in the 32-bit window, so we have to compare the addresses.
         *
         * For simplicity we only test resource start.
         */
        return (r->start >= phb->ioda.m64_base &&
                r->start < (phb->ioda.m64_base + phb->ioda.m64_size));
}

static inline bool pnv_pci_is_m64_flags(unsigned long resource_flags)
{
        unsigned long flags = (IORESOURCE_MEM_64 | IORESOURCE_PREFETCH);

        return (resource_flags & flags) == flags;
}

static struct pnv_ioda_pe *pnv_ioda_init_pe(struct pnv_phb *phb, int pe_no)
{
        s64 rc;

        phb->ioda.pe_array[pe_no].phb = phb;
        phb->ioda.pe_array[pe_no].pe_number = pe_no;

        /*
         * Clear the PE frozen state, as it might have been put into the
         * frozen state on the last PCI remove path. It's harmless to do
         * this even if the PE is already unfrozen.
         */
        rc = opal_pci_eeh_freeze_clear(phb->opal_id, pe_no,
                                       OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
        if (rc != OPAL_SUCCESS && rc != OPAL_UNSUPPORTED)
                pr_warn("%s: Error %lld unfreezing PHB#%x-PE#%x\n",
                        __func__, rc, phb->hose->global_number, pe_no);

        return &phb->ioda.pe_array[pe_no];
}

static void pnv_ioda_reserve_pe(struct pnv_phb *phb, int pe_no)
{
        if (!(pe_no >= 0 && pe_no < phb->ioda.total_pe_num)) {
                pr_warn("%s: Invalid PE %x on PHB#%x\n",
                        __func__, pe_no, phb->hose->global_number);
                return;
        }

        if (test_and_set_bit(pe_no, phb->ioda.pe_alloc))
                pr_debug("%s: PE %x was reserved on PHB#%x\n",
                         __func__, pe_no, phb->hose->global_number);

        pnv_ioda_init_pe(phb, pe_no);
}

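/*
 * Allocate a PE number, scanning from the top of the PE space downwards
 * so that the low PE numbers, which correspond 1:1 to M64 segment
 * indexes, are left available for segment-based PE assignment.
 */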
static struct pnv_ioda_pe *pnv_ioda_alloc_pe(struct pnv_phb *phb)
{
        long pe;

        for (pe = phb->ioda.total_pe_num - 1; pe >= 0; pe--) {
                if (!test_and_set_bit(pe, phb->ioda.pe_alloc))
                        return pnv_ioda_init_pe(phb, pe);
        }

        return NULL;
}

static void pnv_ioda_free_pe(struct pnv_ioda_pe *pe)
{
        struct pnv_phb *phb = pe->phb;
        unsigned int pe_num = pe->pe_number;

        WARN_ON(pe->pdev);
        WARN_ON(pe->npucomp); /* NPUs for nvlink are not supposed to be freed */
        kfree(pe->npucomp);
        memset(pe, 0, sizeof(struct pnv_ioda_pe));
        clear_bit(pe_num, phb->ioda.pe_alloc);
}

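/*
 * M64 windows provide 64-bit prefetchable MMIO space. On IODA2 a single
 * "default" M64 BAR covers the whole window in split mode: the window is
 * divided into total_pe_num segments and the hardware derives the PE
 * number from the segment an address falls into.
 */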
/* The default M64 BAR is shared by all PEs */
static int pnv_ioda2_init_m64(struct pnv_phb *phb)
{
        const char *desc;
        struct resource *r;
        s64 rc;

        /* Configure the default M64 BAR */
        rc = opal_pci_set_phb_mem_window(phb->opal_id,
                                         OPAL_M64_WINDOW_TYPE,
                                         phb->ioda.m64_bar_idx,
                                         phb->ioda.m64_base,
                                         0, /* unused */
                                         phb->ioda.m64_size);
        if (rc != OPAL_SUCCESS) {
                desc = "configuring";
                goto fail;
        }

        /* Enable the default M64 BAR */
        rc = opal_pci_phb_mmio_enable(phb->opal_id,
                                      OPAL_M64_WINDOW_TYPE,
                                      phb->ioda.m64_bar_idx,
                                      OPAL_ENABLE_M64_SPLIT);
        if (rc != OPAL_SUCCESS) {
                desc = "enabling";
                goto fail;
        }

        /*
         * Exclude the segments for the reserved PE and the root bus PE,
         * which are either the first two or the last two PEs.
         */
        r = &phb->hose->mem_resources[1];
        if (phb->ioda.reserved_pe_idx == 0)
                r->start += (2 * phb->ioda.m64_segsize);
        else if (phb->ioda.reserved_pe_idx == (phb->ioda.total_pe_num - 1))
                r->end -= (2 * phb->ioda.m64_segsize);
        else
                pr_warn("  Cannot strip M64 segment for reserved PE#%x\n",
                        phb->ioda.reserved_pe_idx);

        return 0;

fail:
        pr_warn("  Failure %lld %s M64 BAR#%d\n",
                rc, desc, phb->ioda.m64_bar_idx);
        opal_pci_phb_mmio_enable(phb->opal_id,
                                 OPAL_M64_WINDOW_TYPE,
                                 phb->ioda.m64_bar_idx,
                                 OPAL_DISABLE_M64);
        return -EIO;
}

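/*
 * Reserve the PE numbers backing every M64 segment overlapped by the
 * device's M64 BARs. Segment index equals PE number here. For example
 * (illustrative numbers), with a 256MB segment size a BAR spanning
 * [base + 0x30000000, base + 0x4fffffff] covers segments 3 and 4, so
 * PEs 3 and 4 get reserved.
 */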
static void pnv_ioda_reserve_dev_m64_pe(struct pci_dev *pdev,
                                         unsigned long *pe_bitmap)
{
        struct pci_controller *hose = pci_bus_to_host(pdev->bus);
        struct pnv_phb *phb = hose->private_data;
        struct resource *r;
        resource_size_t base, sgsz, start, end;
        int segno, i;

        base = phb->ioda.m64_base;
        sgsz = phb->ioda.m64_segsize;
        for (i = 0; i <= PCI_ROM_RESOURCE; i++) {
                r = &pdev->resource[i];
                if (!r->parent || !pnv_pci_is_m64(phb, r))
                        continue;

                start = ALIGN_DOWN(r->start - base, sgsz);
                end = ALIGN(r->end - base, sgsz);
                for (segno = start / sgsz; segno < end / sgsz; segno++) {
                        if (pe_bitmap)
                                set_bit(segno, pe_bitmap);
                        else
                                pnv_ioda_reserve_pe(phb, segno);
                }
        }
}

static int pnv_ioda1_init_m64(struct pnv_phb *phb)
{
        struct resource *r;
        int index;

        /*
         * There are 16 M64 BARs, each of which has 8 segments. So
         * there are as many M64 segments as the maximum number of
         * PEs, which is 128.
         */
        for (index = 0; index < PNV_IODA1_M64_NUM; index++) {
                unsigned long base, segsz = phb->ioda.m64_segsize;
                int64_t rc;

                base = phb->ioda.m64_base +
                       index * PNV_IODA1_M64_SEGS * segsz;
                rc = opal_pci_set_phb_mem_window(phb->opal_id,
                                OPAL_M64_WINDOW_TYPE, index, base, 0,
                                PNV_IODA1_M64_SEGS * segsz);
                if (rc != OPAL_SUCCESS) {
                        pr_warn("  Error %lld setting M64 PHB#%x-BAR#%d\n",
                                rc, phb->hose->global_number, index);
                        goto fail;
                }

                rc = opal_pci_phb_mmio_enable(phb->opal_id,
                                OPAL_M64_WINDOW_TYPE, index,
                                OPAL_ENABLE_M64_SPLIT);
                if (rc != OPAL_SUCCESS) {
                        pr_warn("  Error %lld enabling M64 PHB#%x-BAR#%d\n",
                                rc, phb->hose->global_number, index);
                        goto fail;
                }
        }

        /*
         * Exclude the segments for the reserved PE and the root bus PE,
         * which are either the first two or the last two PEs.
         */
        r = &phb->hose->mem_resources[1];
        if (phb->ioda.reserved_pe_idx == 0)
                r->start += (2 * phb->ioda.m64_segsize);
        else if (phb->ioda.reserved_pe_idx == (phb->ioda.total_pe_num - 1))
                r->end -= (2 * phb->ioda.m64_segsize);
        else
                WARN(1, "Wrong reserved PE#%x on PHB#%x\n",
                     phb->ioda.reserved_pe_idx, phb->hose->global_number);

        return 0;

fail:
        for ( ; index >= 0; index--)
                opal_pci_phb_mmio_enable(phb->opal_id,
                        OPAL_M64_WINDOW_TYPE, index, OPAL_DISABLE_M64);

        return -EIO;
}

static void pnv_ioda_reserve_m64_pe(struct pci_bus *bus,
                                    unsigned long *pe_bitmap,
                                    bool all)
{
        struct pci_dev *pdev;

        list_for_each_entry(pdev, &bus->devices, bus_list) {
                pnv_ioda_reserve_dev_m64_pe(pdev, pe_bitmap);

                if (all && pdev->subordinate)
                        pnv_ioda_reserve_m64_pe(pdev->subordinate,
                                                pe_bitmap, all);
        }
}

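/*
 * A bus whose M64 BARs span several segments consumes several PE
 * numbers. The first one becomes the "master" PE and the remainder are
 * chained to it as "slaves", forming a compound PE that is frozen,
 * unfrozen and mapped as one unit.
 */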
static struct pnv_ioda_pe *pnv_ioda_pick_m64_pe(struct pci_bus *bus, bool all)
{
        struct pci_controller *hose = pci_bus_to_host(bus);
        struct pnv_phb *phb = hose->private_data;
        struct pnv_ioda_pe *master_pe, *pe;
        unsigned long size, *pe_alloc;
        int i;

        /* Root bus shouldn't use M64 */
        if (pci_is_root_bus(bus))
                return NULL;

        /* Allocate bitmap */
        size = ALIGN(phb->ioda.total_pe_num / 8, sizeof(unsigned long));
        pe_alloc = kzalloc(size, GFP_KERNEL);
        if (!pe_alloc) {
                pr_warn("%s: Out of memory !\n",
                        __func__);
                return NULL;
        }

        /* Figure out the PE numbers reserved for the bus's M64 segments */
        pnv_ioda_reserve_m64_pe(bus, pe_alloc, all);

        /*
         * The current bus might not own the M64 window; it might all be
         * contributed by its child buses. In that case, we don't need to
         * pick an M64-dependent PE#.
         */
        if (bitmap_empty(pe_alloc, phb->ioda.total_pe_num)) {
                kfree(pe_alloc);
                return NULL;
        }

        /*
         * Figure out the master PE and put all slave PEs on the master
         * PE's list to form a compound PE.
         */
        master_pe = NULL;
        i = -1;
        while ((i = find_next_bit(pe_alloc, phb->ioda.total_pe_num, i + 1)) <
                phb->ioda.total_pe_num) {
                pe = &phb->ioda.pe_array[i];

                phb->ioda.m64_segmap[pe->pe_number] = pe->pe_number;
                if (!master_pe) {
                        pe->flags |= PNV_IODA_PE_MASTER;
                        INIT_LIST_HEAD(&pe->slaves);
                        master_pe = pe;
                } else {
                        pe->flags |= PNV_IODA_PE_SLAVE;
                        pe->master = master_pe;
                        list_add_tail(&pe->list, &master_pe->slaves);
                }

                /*
                 * P7IOC supports M64DT, which helps mapping an M64 segment
                 * to one particular PE#. However, PHB3 has a fixed mapping
                 * between M64 segment and PE#. In order to have the same
                 * logic for P7IOC and PHB3, we enforce the fixed mapping
                 * between M64 segment and PE# on P7IOC as well.
                 */
                if (phb->type == PNV_PHB_IODA1) {
                        int64_t rc;

                        rc = opal_pci_map_pe_mmio_window(phb->opal_id,
                                        pe->pe_number, OPAL_M64_WINDOW_TYPE,
                                        pe->pe_number / PNV_IODA1_M64_SEGS,
                                        pe->pe_number % PNV_IODA1_M64_SEGS);
                        if (rc != OPAL_SUCCESS)
                                pr_warn("%s: Error %lld mapping M64 for PHB#%x-PE#%x\n",
                                        __func__, rc, phb->hose->global_number,
                                        pe->pe_number);
                }
        }

        kfree(pe_alloc);
        return master_pe;
}

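/*
 * Discover the PHB's M64 window from the "ibm,opal-m64-window" device
 * tree property, expose it as mem_resources[1] and carve it into one
 * segment per PE.
 */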
static void __init pnv_ioda_parse_m64_window(struct pnv_phb *phb)
{
        struct pci_controller *hose = phb->hose;
        struct device_node *dn = hose->dn;
        struct resource *res;
        u32 m64_range[2], i;
        const __be32 *r;
        u64 pci_addr;

        if (phb->type != PNV_PHB_IODA1 && phb->type != PNV_PHB_IODA2) {
                pr_info("  M64 window not supported\n");
                return;
        }

        if (!firmware_has_feature(FW_FEATURE_OPAL)) {
                pr_info("  Firmware too old to support M64 window\n");
                return;
        }

        r = of_get_property(dn, "ibm,opal-m64-window", NULL);
        if (!r) {
                pr_info("  No <ibm,opal-m64-window> on %pOF\n",
                        dn);
                return;
        }

        /*
         * Find the available M64 BAR range and pick the last one to
         * cover the whole 64-bit space. We support only one range.
         */
        if (of_property_read_u32_array(dn, "ibm,opal-available-m64-ranges",
                                       m64_range, 2)) {
                /* In absence of the property, assume 0..15 */
                m64_range[0] = 0;
                m64_range[1] = 16;
        }
        /* We only support 64 bits in our allocator */
        if (m64_range[1] > 63) {
                pr_warn("%s: Limiting M64 range to 63 (from %d) on PHB#%x\n",
                        __func__, m64_range[1], phb->hose->global_number);
                m64_range[1] = 63;
        }
        /* Empty range, no m64 */
        if (m64_range[1] <= m64_range[0]) {
                pr_warn("%s: M64 empty, disabling M64 usage on PHB#%x\n",
                        __func__, phb->hose->global_number);
                return;
        }

        /* Configure M64 information */
        res = &hose->mem_resources[1];
        res->name = dn->full_name;
        res->start = of_translate_address(dn, r + 2);
        res->end = res->start + of_read_number(r + 4, 2) - 1;
        res->flags = (IORESOURCE_MEM | IORESOURCE_MEM_64 | IORESOURCE_PREFETCH);
        pci_addr = of_read_number(r, 2);
        hose->mem_offset[1] = res->start - pci_addr;

        phb->ioda.m64_size = resource_size(res);
        phb->ioda.m64_segsize = phb->ioda.m64_size / phb->ioda.total_pe_num;
        phb->ioda.m64_base = pci_addr;

        /* This lines up nicely with the display from processing OF ranges */
        pr_info(" MEM 0x%016llx..0x%016llx -> 0x%016llx (M64 #%d..%d)\n",
                res->start, res->end, pci_addr, m64_range[0],
                m64_range[0] + m64_range[1] - 1);

        /* Mark all M64 BARs used up by default */
        phb->ioda.m64_bar_alloc = (unsigned long)-1;

        /* Use the last M64 BAR to cover the M64 window */
        m64_range[1]--;
        phb->ioda.m64_bar_idx = m64_range[0] + m64_range[1];

        pr_info(" Using M64 #%d as default window\n", phb->ioda.m64_bar_idx);

        /* Mark the remaining ones free */
        for (i = m64_range[0]; i < m64_range[1]; i++)
                clear_bit(i, &phb->ioda.m64_bar_alloc);

        /*
         * Set up the init functions for M64 based on the IODA version;
         * IODA3 uses the IODA2 code.
         */
        if (phb->type == PNV_PHB_IODA1)
                phb->init_m64 = pnv_ioda1_init_m64;
        else
                phb->init_m64 = pnv_ioda2_init_m64;
        phb->reserve_m64_pe = pnv_ioda_reserve_m64_pe;
        phb->pick_m64_pe = pnv_ioda_pick_m64_pe;
}

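/*
 * EEH freeze handling: freezing stops a PE's MMIO and DMA traffic after
 * an error. For compound PEs the freeze is applied to the master first
 * and then propagated to every slave.
 */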
static void pnv_ioda_freeze_pe(struct pnv_phb *phb, int pe_no)
{
        struct pnv_ioda_pe *pe = &phb->ioda.pe_array[pe_no];
        struct pnv_ioda_pe *slave;
        s64 rc;

        /* Fetch master PE */
        if (pe->flags & PNV_IODA_PE_SLAVE) {
                pe = pe->master;
                if (WARN_ON(!pe || !(pe->flags & PNV_IODA_PE_MASTER)))
                        return;

                pe_no = pe->pe_number;
        }

        /* Freeze master PE */
        rc = opal_pci_eeh_freeze_set(phb->opal_id,
                                     pe_no,
                                     OPAL_EEH_ACTION_SET_FREEZE_ALL);
        if (rc != OPAL_SUCCESS) {
                pr_warn("%s: Failure %lld freezing PHB#%x-PE#%x\n",
                        __func__, rc, phb->hose->global_number, pe_no);
                return;
        }

        /* Freeze slave PEs */
        if (!(pe->flags & PNV_IODA_PE_MASTER))
                return;

        list_for_each_entry(slave, &pe->slaves, list) {
                rc = opal_pci_eeh_freeze_set(phb->opal_id,
                                             slave->pe_number,
                                             OPAL_EEH_ACTION_SET_FREEZE_ALL);
                if (rc != OPAL_SUCCESS)
                        pr_warn("%s: Failure %lld freezing PHB#%x-PE#%x\n",
                                __func__, rc, phb->hose->global_number,
                                slave->pe_number);
        }
}

static int pnv_ioda_unfreeze_pe(struct pnv_phb *phb, int pe_no, int opt)
{
        struct pnv_ioda_pe *pe, *slave;
        s64 rc;

        /* Find master PE */
        pe = &phb->ioda.pe_array[pe_no];
        if (pe->flags & PNV_IODA_PE_SLAVE) {
                pe = pe->master;
                WARN_ON(!pe || !(pe->flags & PNV_IODA_PE_MASTER));
                pe_no = pe->pe_number;
        }

        /* Clear frozen state for master PE */
        rc = opal_pci_eeh_freeze_clear(phb->opal_id, pe_no, opt);
        if (rc != OPAL_SUCCESS) {
                pr_warn("%s: Failure %lld clear %d on PHB#%x-PE#%x\n",
                        __func__, rc, opt, phb->hose->global_number, pe_no);
                return -EIO;
        }

        if (!(pe->flags & PNV_IODA_PE_MASTER))
                return 0;

        /* Clear frozen state for slave PEs */
        list_for_each_entry(slave, &pe->slaves, list) {
                rc = opal_pci_eeh_freeze_clear(phb->opal_id,
                                             slave->pe_number,
                                             opt);
                if (rc != OPAL_SUCCESS) {
                        pr_warn("%s: Failure %lld clear %d on PHB#%x-PE#%x\n",
                                __func__, rc, opt, phb->hose->global_number,
                                slave->pe_number);
                        return -EIO;
                }
        }

        return 0;
}

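/*
 * Return the EEH freeze state of a PE. For a compound PE the slaves are
 * folded in as well: the highest (most severe) state reported by any
 * member wins.
 */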
static int pnv_ioda_get_pe_state(struct pnv_phb *phb, int pe_no)
{
        struct pnv_ioda_pe *slave, *pe;
        u8 fstate = 0, state;
        __be16 pcierr = 0;
        s64 rc;

        /* Sanity check on PE number */
        if (pe_no < 0 || pe_no >= phb->ioda.total_pe_num)
                return OPAL_EEH_STOPPED_PERM_UNAVAIL;

        /*
         * Fetch the master PE; note that the PE instance might not be
         * initialized yet.
         */
        pe = &phb->ioda.pe_array[pe_no];
        if (pe->flags & PNV_IODA_PE_SLAVE) {
                pe = pe->master;
                WARN_ON(!pe || !(pe->flags & PNV_IODA_PE_MASTER));
                pe_no = pe->pe_number;
        }

        /* Check the master PE */
        rc = opal_pci_eeh_freeze_status(phb->opal_id, pe_no,
                                        &state, &pcierr, NULL);
        if (rc != OPAL_SUCCESS) {
                pr_warn("%s: Failure %lld getting PHB#%x-PE#%x state\n",
                        __func__, rc,
                        phb->hose->global_number, pe_no);
                return OPAL_EEH_STOPPED_TEMP_UNAVAIL;
        }

        /* Check the slave PEs */
        if (!(pe->flags & PNV_IODA_PE_MASTER))
                return state;

        list_for_each_entry(slave, &pe->slaves, list) {
                rc = opal_pci_eeh_freeze_status(phb->opal_id,
                                                slave->pe_number,
                                                &fstate,
                                                &pcierr,
                                                NULL);
                if (rc != OPAL_SUCCESS) {
                        pr_warn("%s: Failure %lld getting PHB#%x-PE#%x state\n",
                                __func__, rc,
                                phb->hose->global_number, slave->pe_number);
                        return OPAL_EEH_STOPPED_TEMP_UNAVAIL;
                }

                /*
                 * Override the result based on the ascending
                 * priority.
                 */
                if (fstate > state)
                        state = fstate;
        }

        return state;
}

struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev)
{
        struct pci_controller *hose = pci_bus_to_host(dev->bus);
        struct pnv_phb *phb = hose->private_data;
        struct pci_dn *pdn = pci_get_pdn(dev);

        if (!pdn)
                return NULL;
        if (pdn->pe_number == IODA_INVALID_PE)
                return NULL;
        return &phb->ioda.pe_array[pdn->pe_number];
}

static int pnv_ioda_set_one_peltv(struct pnv_phb *phb,
                                  struct pnv_ioda_pe *parent,
                                  struct pnv_ioda_pe *child,
                                  bool is_add)
{
        const char *desc = is_add ? "adding" : "removing";
        uint8_t op = is_add ? OPAL_ADD_PE_TO_DOMAIN :
                              OPAL_REMOVE_PE_FROM_DOMAIN;
        struct pnv_ioda_pe *slave;
        long rc;

        /* Parent PE affects child PE */
        rc = opal_pci_set_peltv(phb->opal_id, parent->pe_number,
                                child->pe_number, op);
        if (rc != OPAL_SUCCESS) {
                pe_warn(child, "OPAL error %ld %s to parent PELTV\n",
                        rc, desc);
                return -ENXIO;
        }

        if (!(child->flags & PNV_IODA_PE_MASTER))
                return 0;

        /* Compound case: parent PE affects slave PEs */
        list_for_each_entry(slave, &child->slaves, list) {
                rc = opal_pci_set_peltv(phb->opal_id, parent->pe_number,
                                        slave->pe_number, op);
                if (rc != OPAL_SUCCESS) {
                        pe_warn(slave, "OPAL error %ld %s to parent PELTV\n",
                                rc, desc);
                        return -ENXIO;
                }
        }

        return 0;
}

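/*
 * Populate the PELT-V (PE lookup table vector) for a PE: the PE is
 * added to its own PELT-V entry, to each slave's entry for a compound
 * PE, and to the PELT-V of every bridge PE on the path up to the root,
 * so that an error on an upstream bridge also freezes this PE.
 */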
static int pnv_ioda_set_peltv(struct pnv_phb *phb,
                              struct pnv_ioda_pe *pe,
                              bool is_add)
{
        struct pnv_ioda_pe *slave;
        struct pci_dev *pdev = NULL;
        int ret;

        /*
         * Clear the PE frozen state. If it's a master PE, we need to
         * clear the slave PEs' frozen state as well.
         */
        if (is_add) {
                opal_pci_eeh_freeze_clear(phb->opal_id, pe->pe_number,
                                          OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
                if (pe->flags & PNV_IODA_PE_MASTER) {
                        list_for_each_entry(slave, &pe->slaves, list)
                                opal_pci_eeh_freeze_clear(phb->opal_id,
                                                          slave->pe_number,
                                                          OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
                }
        }

        /*
         * Associate the PE in the PELT. We need to add the PE to the
         * corresponding PELT-V as well; otherwise, an error originating
         * from the PE might spread to other PEs.
         */
        ret = pnv_ioda_set_one_peltv(phb, pe, pe, is_add);
        if (ret)
                return ret;

        /* For compound PEs, any one affects all of them */
        if (pe->flags & PNV_IODA_PE_MASTER) {
                list_for_each_entry(slave, &pe->slaves, list) {
                        ret = pnv_ioda_set_one_peltv(phb, slave, pe, is_add);
                        if (ret)
                                return ret;
                }
        }

        if (pe->flags & (PNV_IODA_PE_BUS_ALL | PNV_IODA_PE_BUS))
                pdev = pe->pbus->self;
        else if (pe->flags & PNV_IODA_PE_DEV)
                pdev = pe->pdev->bus->self;
#ifdef CONFIG_PCI_IOV
        else if (pe->flags & PNV_IODA_PE_VF)
                pdev = pe->parent_dev;
#endif /* CONFIG_PCI_IOV */
        while (pdev) {
                struct pci_dn *pdn = pci_get_pdn(pdev);
                struct pnv_ioda_pe *parent;

                if (pdn && pdn->pe_number != IODA_INVALID_PE) {
                        parent = &phb->ioda.pe_array[pdn->pe_number];
                        ret = pnv_ioda_set_one_peltv(phb, parent, pe, is_add);
                        if (ret)
                                return ret;
                }

                pdev = pdev->bus->self;
        }

        return 0;
}

static void pnv_ioda_unset_peltv(struct pnv_phb *phb,
                                 struct pnv_ioda_pe *pe,
                                 struct pci_dev *parent)
{
        int64_t rc;

        while (parent) {
                struct pci_dn *pdn = pci_get_pdn(parent);

                if (pdn && pdn->pe_number != IODA_INVALID_PE) {
                        rc = opal_pci_set_peltv(phb->opal_id, pdn->pe_number,
                                                pe->pe_number,
                                                OPAL_REMOVE_PE_FROM_DOMAIN);
                        /* XXX What to do in case of error ? */
                }
                parent = parent->bus->self;
        }

        opal_pci_eeh_freeze_clear(phb->opal_id, pe->pe_number,
                                  OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);

        /* Disassociate PE in PELT */
        rc = opal_pci_set_peltv(phb->opal_id, pe->pe_number,
                                pe->pe_number, OPAL_REMOVE_PE_FROM_DOMAIN);
        if (rc)
                pe_warn(pe, "OPAL error %lld remove self from PELTV\n", rc);
}

static int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
{
        struct pci_dev *parent;
        uint8_t bcomp, dcomp, fcomp;
        int64_t rc;
        long rid_end, rid;

        /* Currently, we just deconfigure VF PEs. Bus PEs will always be there. */
        if (pe->pbus) {
                int count;

                dcomp = OPAL_IGNORE_RID_DEVICE_NUMBER;
                fcomp = OPAL_IGNORE_RID_FUNCTION_NUMBER;
                parent = pe->pbus->self;
                if (pe->flags & PNV_IODA_PE_BUS_ALL)
                        count = pe->pbus->busn_res.end - pe->pbus->busn_res.start + 1;
                else
                        count = 1;

                switch(count) {
                case  1: bcomp = OpalPciBusAll;         break;
                case  2: bcomp = OpalPciBus7Bits;       break;
                case  4: bcomp = OpalPciBus6Bits;       break;
                case  8: bcomp = OpalPciBus5Bits;       break;
                case 16: bcomp = OpalPciBus4Bits;       break;
                case 32: bcomp = OpalPciBus3Bits;       break;
                default:
                        dev_err(&pe->pbus->dev, "Number of subordinate buses %d unsupported\n",
                                count);
                        /* Do an exact match only */
                        bcomp = OpalPciBusAll;
                }
                rid_end = pe->rid + (count << 8);
        } else {
#ifdef CONFIG_PCI_IOV
                if (pe->flags & PNV_IODA_PE_VF)
                        parent = pe->parent_dev;
                else
#endif
                        parent = pe->pdev->bus->self;
                bcomp = OpalPciBusAll;
                dcomp = OPAL_COMPARE_RID_DEVICE_NUMBER;
                fcomp = OPAL_COMPARE_RID_FUNCTION_NUMBER;
                rid_end = pe->rid + 1;
        }

        /* Clear the reverse map */
        for (rid = pe->rid; rid < rid_end; rid++)
                phb->ioda.pe_rmap[rid] = IODA_INVALID_PE;

        /*
         * Release from all parents' PELT-V. NPUs don't have a PELTV
         * table.
         */
        if (phb->type != PNV_PHB_NPU_NVLINK && phb->type != PNV_PHB_NPU_OCAPI)
                pnv_ioda_unset_peltv(phb, pe, parent);

        rc = opal_pci_set_pe(phb->opal_id, pe->pe_number, pe->rid,
                             bcomp, dcomp, fcomp, OPAL_UNMAP_PE);
        if (rc)
                pe_err(pe, "OPAL error %ld trying to setup PELT table\n", rc);

        pe->pbus = NULL;
        pe->pdev = NULL;
#ifdef CONFIG_PCI_IOV
        pe->parent_dev = NULL;
#endif

        return 0;
}

static int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
{
        struct pci_dev *parent;
        uint8_t bcomp, dcomp, fcomp;
        long rc, rid_end, rid;

        /* Bus validation ? */
        if (pe->pbus) {
                int count;

                dcomp = OPAL_IGNORE_RID_DEVICE_NUMBER;
                fcomp = OPAL_IGNORE_RID_FUNCTION_NUMBER;
                parent = pe->pbus->self;
                if (pe->flags & PNV_IODA_PE_BUS_ALL)
                        count = pe->pbus->busn_res.end - pe->pbus->busn_res.start + 1;
                else
                        count = 1;

                switch(count) {
                case  1: bcomp = OpalPciBusAll;         break;
                case  2: bcomp = OpalPciBus7Bits;       break;
                case  4: bcomp = OpalPciBus6Bits;       break;
                case  8: bcomp = OpalPciBus5Bits;       break;
                case 16: bcomp = OpalPciBus4Bits;       break;
                case 32: bcomp = OpalPciBus3Bits;       break;
                default:
                        dev_err(&pe->pbus->dev, "Number of subordinate buses %d unsupported\n",
                                count);
                        /* Do an exact match only */
                        bcomp = OpalPciBusAll;
                }
                rid_end = pe->rid + (count << 8);
        } else {
#ifdef CONFIG_PCI_IOV
                if (pe->flags & PNV_IODA_PE_VF)
                        parent = pe->parent_dev;
                else
#endif /* CONFIG_PCI_IOV */
                        parent = pe->pdev->bus->self;
                bcomp = OpalPciBusAll;
                dcomp = OPAL_COMPARE_RID_DEVICE_NUMBER;
                fcomp = OPAL_COMPARE_RID_FUNCTION_NUMBER;
                rid_end = pe->rid + 1;
        }

        /*
         * Associate the PE in the PELT. We need to add the PE to the
         * corresponding PELT-V as well; otherwise, an error originating
         * from the PE might spread to other PEs.
         */
        rc = opal_pci_set_pe(phb->opal_id, pe->pe_number, pe->rid,
                             bcomp, dcomp, fcomp, OPAL_MAP_PE);
        if (rc) {
                pe_err(pe, "OPAL error %ld trying to setup PELT table\n", rc);
                return -ENXIO;
        }

        /*
         * Configure the PELTV. NPUs don't have a PELTV table so skip
         * configuration on them.
         */
        if (phb->type != PNV_PHB_NPU_NVLINK && phb->type != PNV_PHB_NPU_OCAPI)
                pnv_ioda_set_peltv(phb, pe, true);

        /* Setup reverse map */
        for (rid = pe->rid; rid < rid_end; rid++)
                phb->ioda.pe_rmap[rid] = pe->pe_number;

        /* Set up one MVE on IODA1 */
        if (phb->type != PNV_PHB_IODA1) {
                pe->mve_number = 0;
                goto out;
        }

        pe->mve_number = pe->pe_number;
        rc = opal_pci_set_mve(phb->opal_id, pe->mve_number, pe->pe_number);
        if (rc != OPAL_SUCCESS) {
                pe_err(pe, "OPAL error %ld setting up MVE %x\n",
                       rc, pe->mve_number);
                pe->mve_number = -1;
        } else {
                rc = opal_pci_set_mve_enable(phb->opal_id,
                                             pe->mve_number, OPAL_ENABLE_MVE);
                if (rc) {
                        pe_err(pe, "OPAL error %ld enabling MVE %x\n",
                               rc, pe->mve_number);
                        pe->mve_number = -1;
                }
        }

out:
        return 0;
}

#ifdef CONFIG_PCI_IOV
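/*
 * SR-IOV support. The M64 segmentation is reused to give each VF its
 * own PE: shifting the start of the PF's IOV BAR changes which M64
 * segments, and therefore which PE numbers, the individual VF BARs
 * land in.
 */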
static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset)
{
        struct pci_dn *pdn = pci_get_pdn(dev);
        int i;
        struct resource *res, res2;
        resource_size_t size;
        u16 num_vfs;

        if (!dev->is_physfn)
                return -EINVAL;

        /*
         * "offset" is in VFs.  The M64 windows are sized so that when they
         * are segmented, each segment is the same size as the IOV BAR.
         * Each segment is in a separate PE, and the high order bits of the
         * address are the PE number.  Therefore, each VF's BAR is in a
         * separate PE, and changing the IOV BAR start address changes the
         * range of PEs the VFs are in.
         */
        num_vfs = pdn->num_vfs;
        for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
                res = &dev->resource[i + PCI_IOV_RESOURCES];
                if (!res->flags || !res->parent)
                        continue;

                /*
                 * The actual IOV BAR range is determined by the start address
                 * and the actual size for num_vfs VFs BAR.  This check is to
                 * make sure that after shifting, the range will not overlap
                 * with another device.
                 */
                size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES);
                res2.flags = res->flags;
                res2.start = res->start + (size * offset);
                res2.end = res2.start + (size * num_vfs) - 1;

                if (res2.end > res->end) {
                        dev_err(&dev->dev, "VF BAR%d: %pR would extend past %pR (trying to enable %d VFs shifted by %d)\n",
                                i, &res2, res, num_vfs, offset);
                        return -EBUSY;
                }
        }

        /*
         * Since the M64 BAR shares segments among all possible 256 PEs,
         * we have to shift the beginning of the PF IOV BAR to make it start
         * from the segment which belongs to the PE number assigned to the
         * first VF.  This creates a "hole" in /proc/iomem which could be
         * used for allocating other resources, so we reserve this area
         * below and release it when IOV is released.
         */
        for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
                res = &dev->resource[i + PCI_IOV_RESOURCES];
                if (!res->flags || !res->parent)
                        continue;

                size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES);
                res2 = *res;
                res->start += size * offset;

                dev_info(&dev->dev, "VF BAR%d: %pR shifted to %pR (%sabling %d VFs shifted by %d)\n",
                         i, &res2, res, (offset > 0) ? "En" : "Dis",
                         num_vfs, offset);

                if (offset < 0) {
                        devm_release_resource(&dev->dev, &pdn->holes[i]);
                        memset(&pdn->holes[i], 0, sizeof(pdn->holes[i]));
                }

                pci_update_resource(dev, i + PCI_IOV_RESOURCES);

                if (offset > 0) {
                        pdn->holes[i].start = res2.start;
                        pdn->holes[i].end = res2.start + size * offset - 1;
                        pdn->holes[i].flags = IORESOURCE_BUS;
                        pdn->holes[i].name = "pnv_iov_reserved";
                        devm_request_resource(&dev->dev, res->parent,
                                        &pdn->holes[i]);
                }
        }
        return 0;
}
#endif /* CONFIG_PCI_IOV */

static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev)
{
        struct pci_controller *hose = pci_bus_to_host(dev->bus);
        struct pnv_phb *phb = hose->private_data;
        struct pci_dn *pdn = pci_get_pdn(dev);
        struct pnv_ioda_pe *pe;

        if (!pdn) {
                pr_err("%s: Device tree node not associated properly\n",
                           pci_name(dev));
                return NULL;
        }
        if (pdn->pe_number != IODA_INVALID_PE)
                return NULL;

        pe = pnv_ioda_alloc_pe(phb);
        if (!pe) {
                pr_warn("%s: Not enough PE# available, disabling device\n",
                        pci_name(dev));
                return NULL;
        }

        /* NOTE: We don't get a reference for the pointer in the PE
         * data structure; both the device and PE structures should be
         * destroyed at the same time. However, removing nvlink
         * devices will need some work.
         *
         * At some point we want to remove the PDN completely anyway.
         */
        pdn->pe_number = pe->pe_number;
        pe->flags = PNV_IODA_PE_DEV;
        pe->pdev = dev;
        pe->pbus = NULL;
        pe->mve_number = -1;
        pe->rid = dev->bus->number << 8 | pdn->devfn;
        pe->device_count++;

        pe_info(pe, "Associated device to PE\n");

        if (pnv_ioda_configure_pe(phb, pe)) {
                /* XXX What do we do here ? */
                pnv_ioda_free_pe(pe);
                pdn->pe_number = IODA_INVALID_PE;
                pe->pdev = NULL;
                return NULL;
        }

        /* Put the PE on the list */
        mutex_lock(&phb->ioda.pe_list_mutex);
        list_add_tail(&pe->list, &phb->ioda.pe_list);
        mutex_unlock(&phb->ioda.pe_list_mutex);
        return pe;
}

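/*
 * Attach every device on the bus (and, for PNV_IODA_PE_BUS_ALL PEs, on
 * all subordinate buses) to the given PE by recording the PE number in
 * each device's pci_dn.
 */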
static void pnv_ioda_setup_same_PE(struct pci_bus *bus, struct pnv_ioda_pe *pe)
{
        struct pci_dev *dev;

        list_for_each_entry(dev, &bus->devices, bus_list) {
                struct pci_dn *pdn = pci_get_pdn(dev);

                if (pdn == NULL) {
                        pr_warn("%s: No device node associated with device !\n",
                                pci_name(dev));
                        continue;
                }

                /*
                 * In the partial hotplug case, the PCI device might still
                 * be associated with the PE and needn't be attached to it
                 * again.
                 */
                if (pdn->pe_number != IODA_INVALID_PE)
                        continue;

                pe->device_count++;
                pdn->pe_number = pe->pe_number;
                if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate)
                        pnv_ioda_setup_same_PE(dev->subordinate, pe);
        }
}

/*
 * There are 2 types of PCI-bus-sensitive PEs: one comprises a single
 * PCI bus, the other contains a primary PCI bus and its subordinate
 * PCI devices and buses. The second type of PE is normally created
 * for a PCIe-to-PCI bridge or for PLX switch downstream ports.
 */
static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, bool all)
{
        struct pci_controller *hose = pci_bus_to_host(bus);
        struct pnv_phb *phb = hose->private_data;
        struct pnv_ioda_pe *pe = NULL;
        unsigned int pe_num;

        /*
         * In the partial hotplug case, the PE instance might still be
         * alive. We should reuse it instead of allocating a new one.
         */
        pe_num = phb->ioda.pe_rmap[bus->number << 8];
        if (pe_num != IODA_INVALID_PE) {
                pe = &phb->ioda.pe_array[pe_num];
                pnv_ioda_setup_same_PE(bus, pe);
                return NULL;
        }

        /* The PE number for the root bus should have been reserved */
        if (pci_is_root_bus(bus) &&
            phb->ioda.root_pe_idx != IODA_INVALID_PE)
                pe = &phb->ioda.pe_array[phb->ioda.root_pe_idx];

        /* Check if the PE is determined by M64 */
        if (!pe && phb->pick_m64_pe)
                pe = phb->pick_m64_pe(bus, all);

        /* The PE number isn't pinned by M64 */
        if (!pe)
                pe = pnv_ioda_alloc_pe(phb);

        if (!pe) {
                pr_warn("%s: Not enough PE# available for PCI bus %04x:%02x\n",
                        __func__, pci_domain_nr(bus), bus->number);
                return NULL;
        }

        pe->flags |= (all ? PNV_IODA_PE_BUS_ALL : PNV_IODA_PE_BUS);
        pe->pbus = bus;
        pe->pdev = NULL;
        pe->mve_number = -1;
        pe->rid = bus->busn_res.start << 8;

        if (all)
                pe_info(pe, "Secondary bus %d..%d associated with PE#%x\n",
                        bus->busn_res.start, bus->busn_res.end, pe->pe_number);
        else
                pe_info(pe, "Secondary bus %d associated with PE#%x\n",
                        bus->busn_res.start, pe->pe_number);

        if (pnv_ioda_configure_pe(phb, pe)) {
                /* XXX What do we do here ? */
                pnv_ioda_free_pe(pe);
                pe->pbus = NULL;
                return NULL;
        }

        /* Associate it with all child devices */
        pnv_ioda_setup_same_PE(bus, pe);

        /* Put the PE on the list */
        list_add_tail(&pe->list, &phb->ioda.pe_list);

        return pe;
}

static struct pnv_ioda_pe *pnv_ioda_setup_npu_PE(struct pci_dev *npu_pdev)
{
        int pe_num, found_pe = false, rc;
        long rid;
        struct pnv_ioda_pe *pe;
        struct pci_dev *gpu_pdev;
        struct pci_dn *npu_pdn;
        struct pci_controller *hose = pci_bus_to_host(npu_pdev->bus);
        struct pnv_phb *phb = hose->private_data;

        /*
         * Intentionally leak a reference on the npu device (for
         * nvlink only; this is not an opencapi path) to make sure it
         * never goes away, as it's been the case all along and some
         * work is needed otherwise.
         */
        pci_dev_get(npu_pdev);

        /*
         * Due to a hardware erratum PE#0 on the NPU is reserved for
         * error handling. This means we only have three PEs remaining
         * which need to be assigned to four links, implying some
         * links must share PEs.
         *
         * To achieve this we assign PEs such that NPUs linking the
         * same GPU get assigned the same PE.
         */
        gpu_pdev = pnv_pci_get_gpu_dev(npu_pdev);
        for (pe_num = 0; pe_num < phb->ioda.total_pe_num; pe_num++) {
                pe = &phb->ioda.pe_array[pe_num];
                if (!pe->pdev)
                        continue;

                if (pnv_pci_get_gpu_dev(pe->pdev) == gpu_pdev) {
                        /*
                         * This device has the same peer GPU so should
                         * be assigned the same PE as the existing
                         * peer NPU.
                         */
                        dev_info(&npu_pdev->dev,
                                "Associating to existing PE %x\n", pe_num);
                        npu_pdn = pci_get_pdn(npu_pdev);
                        rid = npu_pdev->bus->number << 8 | npu_pdn->devfn;
                        npu_pdn->pe_number = pe_num;
                        phb->ioda.pe_rmap[rid] = pe->pe_number;
                        pe->device_count++;

                        /* Map the PE to this link */
                        rc = opal_pci_set_pe(phb->opal_id, pe_num, rid,
                                        OpalPciBusAll,
                                        OPAL_COMPARE_RID_DEVICE_NUMBER,
                                        OPAL_COMPARE_RID_FUNCTION_NUMBER,
                                        OPAL_MAP_PE);
                        WARN_ON(rc != OPAL_SUCCESS);
                        found_pe = true;
                        break;
                }
        }

        if (!found_pe)
                /*
                 * Could not find an existing PE so allocate a new
                 * one.
                 */
                return pnv_ioda_setup_dev_PE(npu_pdev);
        else
                return pe;
}

static void pnv_ioda_setup_npu_PEs(struct pci_bus *bus)
{
        struct pci_dev *pdev;

        list_for_each_entry(pdev, &bus->devices, bus_list)
                pnv_ioda_setup_npu_PE(pdev);
}

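/*
 * Boot-time PE assignment pass. Only NPU (NVLink) PHBs are handled
 * here; bus PEs on regular PHBs are presumably created elsewhere as
 * bridges are configured.
 */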
static void pnv_pci_ioda_setup_PEs(void)
{
        struct pci_controller *hose;
        struct pnv_phb *phb;
        struct pnv_ioda_pe *pe;

        list_for_each_entry(hose, &hose_list, list_node) {
                phb = hose->private_data;
                if (phb->type == PNV_PHB_NPU_NVLINK) {
                        /* PE#0 is needed for error reporting */
                        pnv_ioda_reserve_pe(phb, 0);
                        pnv_ioda_setup_npu_PEs(hose->bus);
                        if (phb->model == PNV_PHB_MODEL_NPU2)
                                WARN_ON_ONCE(pnv_npu2_init(hose));
                }
        }
        list_for_each_entry(hose, &hose_list, list_node) {
                phb = hose->private_data;
                if (phb->type != PNV_PHB_IODA2)
                        continue;

                list_for_each_entry(pe, &phb->ioda.pe_list, list)
                        pnv_npu2_map_lpar(pe, MSR_DR | MSR_PR | MSR_HV);
        }
}

#ifdef CONFIG_PCI_IOV
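/*
 * Release the M64 BARs backing a PF's IOV BARs. In "single" mode each
 * VF has its own M64 BAR (num_vfs of them); otherwise one shared M64
 * BAR covers the whole IOV BAR.
 */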
static int pnv_pci_vf_release_m64(struct pci_dev *pdev, u16 num_vfs)
{
        struct pci_bus        *bus;
        struct pci_controller *hose;
        struct pnv_phb        *phb;
        struct pci_dn         *pdn;
        int                    i, j;
        int                    m64_bars;

        bus = pdev->bus;
        hose = pci_bus_to_host(bus);
        phb = hose->private_data;
        pdn = pci_get_pdn(pdev);

        if (pdn->m64_single_mode)
                m64_bars = num_vfs;
        else
                m64_bars = 1;

        for (i = 0; i < PCI_SRIOV_NUM_BARS; i++)
                for (j = 0; j < m64_bars; j++) {
                        if (pdn->m64_map[j][i] == IODA_INVALID_M64)
                                continue;
                        opal_pci_phb_mmio_enable(phb->opal_id,
                                OPAL_M64_WINDOW_TYPE, pdn->m64_map[j][i], 0);
                        clear_bit(pdn->m64_map[j][i], &phb->ioda.m64_bar_alloc);
                        pdn->m64_map[j][i] = IODA_INVALID_M64;
                }

        kfree(pdn->m64_map);
        return 0;
}

static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
{
        struct pci_bus        *bus;
        struct pci_controller *hose;
        struct pnv_phb        *phb;
        struct pci_dn         *pdn;
        unsigned int           win;
        struct resource       *res;
        int                    i, j;
        int64_t                rc;
        int                    total_vfs;
        resource_size_t        size, start;
        int                    pe_num;
        int                    m64_bars;

        bus = pdev->bus;
        hose = pci_bus_to_host(bus);
        phb = hose->private_data;
        pdn = pci_get_pdn(pdev);
        total_vfs = pci_sriov_get_totalvfs(pdev);

        if (pdn->m64_single_mode)
                m64_bars = num_vfs;
        else
                m64_bars = 1;

        pdn->m64_map = kmalloc_array(m64_bars,
                                     sizeof(*pdn->m64_map),
                                     GFP_KERNEL);
        if (!pdn->m64_map)
                return -ENOMEM;
        /* Initialize the m64_map to IODA_INVALID_M64 */
        for (i = 0; i < m64_bars; i++)
                for (j = 0; j < PCI_SRIOV_NUM_BARS; j++)
                        pdn->m64_map[i][j] = IODA_INVALID_M64;

        for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
                res = &pdev->resource[i + PCI_IOV_RESOURCES];
                if (!res->flags || !res->parent)
                        continue;

                for (j = 0; j < m64_bars; j++) {
                        do {
                                win = find_next_zero_bit(&phb->ioda.m64_bar_alloc,
                                                phb->ioda.m64_bar_idx + 1, 0);

                                if (win >= phb->ioda.m64_bar_idx + 1)
                                        goto m64_failed;
                        } while (test_and_set_bit(win, &phb->ioda.m64_bar_alloc));

                        pdn->m64_map[j][i] = win;

                        if (pdn->m64_single_mode) {
                                size = pci_iov_resource_size(pdev,
                                                        PCI_IOV_RESOURCES + i);
                                start = res->start + size * j;
                        } else {
                                size = resource_size(res);
                                start = res->start;
                        }

                        /* Map the M64 here */
                        if (pdn->m64_single_mode) {
                                pe_num = pdn->pe_num_map[j];
                                rc = opal_pci_map_pe_mmio_window(phb->opal_id,
                                                pe_num, OPAL_M64_WINDOW_TYPE,
                                                pdn->m64_map[j][i], 0);
                        }

                        rc = opal_pci_set_phb_mem_window(phb->opal_id,
                                                 OPAL_M64_WINDOW_TYPE,
                                                 pdn->m64_map[j][i],
                                                 start,
                                                 0, /* unused */
                                                 size);

                        if (rc != OPAL_SUCCESS) {
                                dev_err(&pdev->dev, "Failed to map M64 window #%d: %lld\n",
                                        win, rc);
                                goto m64_failed;
                        }

                        if (pdn->m64_single_mode)
                                rc = opal_pci_phb_mmio_enable(phb->opal_id,
                                     OPAL_M64_WINDOW_TYPE, pdn->m64_map[j][i], 2);
                        else
                                rc = opal_pci_phb_mmio_enable(phb->opal_id,
                                     OPAL_M64_WINDOW_TYPE, pdn->m64_map[j][i], 1);

                        if (rc != OPAL_SUCCESS) {
                                dev_err(&pdev->dev, "Failed to enable M64 window #%d: %llx\n",
                                        win, rc);
                                goto m64_failed;
                        }
                }
        }
        return 0;

m64_failed:
        pnv_pci_vf_release_m64(pdev, num_vfs);
        return -EBUSY;
}

1459static long pnv_pci_ioda2_unset_window(struct iommu_table_group *table_group,
1460                int num);
1461
1462static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe *pe)
1463{
1464        struct iommu_table    *tbl;
1465        int64_t               rc;
1466
1467        tbl = pe->table_group.tables[0];
1468        rc = pnv_pci_ioda2_unset_window(&pe->table_group, 0);
1469        if (rc)
1470                pe_warn(pe, "OPAL error %ld releasing DMA window\n", rc);
1471
1472        pnv_pci_ioda2_set_bypass(pe, false);
1473        if (pe->table_group.group) {
1474                iommu_group_put(pe->table_group.group);
1475                BUG_ON(pe->table_group.group);
1476        }
1477        iommu_tce_table_put(tbl);
1478}
1479
1480static void pnv_ioda_release_vf_PE(struct pci_dev *pdev)
1481{
1482        struct pci_bus        *bus;
1483        struct pci_controller *hose;
1484        struct pnv_phb        *phb;
1485        struct pnv_ioda_pe    *pe, *pe_n;
1486        struct pci_dn         *pdn;
1487
1488        bus = pdev->bus;
1489        hose = pci_bus_to_host(bus);
1490        phb = hose->private_data;
1491        pdn = pci_get_pdn(pdev);
1492
1493        if (!pdev->is_physfn)
1494                return;
1495
1496        list_for_each_entry_safe(pe, pe_n, &phb->ioda.pe_list, list) {
1497                if (pe->parent_dev != pdev)
1498                        continue;
1499
1500                pnv_pci_ioda2_release_dma_pe(pdev, pe);
1501
1502                /* Remove from list */
1503                mutex_lock(&phb->ioda.pe_list_mutex);
1504                list_del(&pe->list);
1505                mutex_unlock(&phb->ioda.pe_list_mutex);
1506
1507                pnv_ioda_deconfigure_pe(phb, pe);
1508
1509                pnv_ioda_free_pe(pe);
1510        }
1511}
1512
1513static void pnv_pci_sriov_disable(struct pci_dev *pdev)
1514{
1515        struct pci_bus        *bus;
1516        struct pci_controller *hose;
1517        struct pnv_phb        *phb;
1518        struct pnv_ioda_pe    *pe;
1519        struct pci_dn         *pdn;
1520        u16                    num_vfs, i;
1521
1522        bus = pdev->bus;
1523        hose = pci_bus_to_host(bus);
1524        phb = hose->private_data;
1525        pdn = pci_get_pdn(pdev);
1526        num_vfs = pdn->num_vfs;
1527
1528        /* Release VF PEs */
1529        pnv_ioda_release_vf_PE(pdev);
1530
1531        if (phb->type == PNV_PHB_IODA2) {
1532                if (!pdn->m64_single_mode)
1533                        pnv_pci_vf_resource_shift(pdev, -*pdn->pe_num_map);
1534
1535                /* Release M64 windows */
1536                pnv_pci_vf_release_m64(pdev, num_vfs);
1537
1538                /* Release PE numbers */
1539                if (pdn->m64_single_mode) {
1540                        for (i = 0; i < num_vfs; i++) {
1541                                if (pdn->pe_num_map[i] == IODA_INVALID_PE)
1542                                        continue;
1543
1544                                pe = &phb->ioda.pe_array[pdn->pe_num_map[i]];
1545                                pnv_ioda_free_pe(pe);
1546                        }
1547                } else {
1548                        bitmap_clear(phb->ioda.pe_alloc, *pdn->pe_num_map, num_vfs);
                    }
1549                /* Releasing pe_num_map */
1550                kfree(pdn->pe_num_map);
1551        }
1552}
1553
1554static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
1555                                       struct pnv_ioda_pe *pe);
1556#ifdef CONFIG_IOMMU_API
1557static void pnv_ioda_setup_bus_iommu_group(struct pnv_ioda_pe *pe,
1558                struct iommu_table_group *table_group, struct pci_bus *bus);
1559
1560#endif
1561static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
1562{
1563        struct pci_bus        *bus;
1564        struct pci_controller *hose;
1565        struct pnv_phb        *phb;
1566        struct pnv_ioda_pe    *pe;
1567        int                    pe_num;
1568        u16                    vf_index;
1569        struct pci_dn         *pdn;
1570
1571        bus = pdev->bus;
1572        hose = pci_bus_to_host(bus);
1573        phb = hose->private_data;
1574        pdn = pci_get_pdn(pdev);
1575
1576        if (!pdev->is_physfn)
1577                return;
1578
1579        /* Reserve PE for each VF */
1580        for (vf_index = 0; vf_index < num_vfs; vf_index++) {
1581                int vf_devfn = pci_iov_virtfn_devfn(pdev, vf_index);
1582                int vf_bus = pci_iov_virtfn_bus(pdev, vf_index);
1583                struct pci_dn *vf_pdn;
1584
1585                if (pdn->m64_single_mode)
1586                        pe_num = pdn->pe_num_map[vf_index];
1587                else
1588                        pe_num = *pdn->pe_num_map + vf_index;
1589
1590                pe = &phb->ioda.pe_array[pe_num];
1591                pe->pe_number = pe_num;
1592                pe->phb = phb;
1593                pe->flags = PNV_IODA_PE_VF;
1594                pe->pbus = NULL;
1595                pe->parent_dev = pdev;
1596                pe->mve_number = -1;
1597                pe->rid = (vf_bus << 8) | vf_devfn;
1598
1599                pe_info(pe, "VF %04d:%02d:%02d.%d associated with PE#%x\n",
1600                        hose->global_number, pdev->bus->number,
1601                        PCI_SLOT(vf_devfn), PCI_FUNC(vf_devfn), pe_num);
1602
1603                if (pnv_ioda_configure_pe(phb, pe)) {
1604                        /* XXX What do we do here ? */
1605                        pnv_ioda_free_pe(pe);
1606                        pe->pdev = NULL;
1607                        continue;
1608                }
1609
1610                /* Add the PE to the PHB's list */
1611                mutex_lock(&phb->ioda.pe_list_mutex);
1612                list_add_tail(&pe->list, &phb->ioda.pe_list);
1613                mutex_unlock(&phb->ioda.pe_list_mutex);
1614
1615                /* associate this PE with its pdn */
1616                list_for_each_entry(vf_pdn, &pdn->parent->child_list, list) {
1617                        if (vf_pdn->busno == vf_bus &&
1618                            vf_pdn->devfn == vf_devfn) {
1619                                vf_pdn->pe_number = pe_num;
1620                                break;
1621                        }
1622                }
1623
1624                pnv_pci_ioda2_setup_dma_pe(phb, pe);
1625#ifdef CONFIG_IOMMU_API
1626                iommu_register_group(&pe->table_group,
1627                                pe->phb->hose->global_number, pe->pe_number);
1628                pnv_ioda_setup_bus_iommu_group(pe, &pe->table_group, NULL);
1629#endif
1630        }
1631}
1632
1633static int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
1634{
1635        struct pci_bus        *bus;
1636        struct pci_controller *hose;
1637        struct pnv_phb        *phb;
1638        struct pnv_ioda_pe    *pe;
1639        struct pci_dn         *pdn;
1640        int                    ret;
1641        u16                    i;
1642
1643        bus = pdev->bus;
1644        hose = pci_bus_to_host(bus);
1645        phb = hose->private_data;
1646        pdn = pci_get_pdn(pdev);
1647
1648        if (phb->type == PNV_PHB_IODA2) {
1649                if (!pdn->vfs_expanded) {
1650                        dev_info(&pdev->dev, "SR-IOV is not supported on this device:"
1651                                " the IOV BAR is not 64bit-prefetchable\n");
1652                        return -ENOSPC;
1653                }
1654
1655                /*
1656                 * When the M64 BARs function in Single PE mode, the number of VFs
1657                 * that can be enabled must not exceed the number of M64 BARs.
1658                 */
1659                if (pdn->m64_single_mode && num_vfs > phb->ioda.m64_bar_idx) {
1660                        dev_info(&pdev->dev, "Not enough M64 BAR for VFs\n");
1661                        return -EBUSY;
1662                }
1663
1664                /* Allocating pe_num_map */
1665                if (pdn->m64_single_mode)
1666                        pdn->pe_num_map = kmalloc_array(num_vfs,
1667                                                        sizeof(*pdn->pe_num_map),
1668                                                        GFP_KERNEL);
1669                else
1670                        pdn->pe_num_map = kmalloc(sizeof(*pdn->pe_num_map), GFP_KERNEL);
1671
1672                if (!pdn->pe_num_map)
1673                        return -ENOMEM;
1674
1675                if (pdn->m64_single_mode)
1676                        for (i = 0; i < num_vfs; i++)
1677                                pdn->pe_num_map[i] = IODA_INVALID_PE;
1678
1679                /* Allocate PE numbers for the requested VFs */
1680                if (pdn->m64_single_mode) {
1681                        for (i = 0; i < num_vfs; i++) {
1682                                pe = pnv_ioda_alloc_pe(phb);
1683                                if (!pe) {
1684                                        ret = -EBUSY;
1685                                        goto m64_failed;
1686                                }
1687
1688                                pdn->pe_num_map[i] = pe->pe_number;
1689                        }
1690                } else {
1691                        mutex_lock(&phb->ioda.pe_alloc_mutex);
1692                        *pdn->pe_num_map = bitmap_find_next_zero_area(
1693                                phb->ioda.pe_alloc, phb->ioda.total_pe_num,
1694                                0, num_vfs, 0);
1695                        if (*pdn->pe_num_map >= phb->ioda.total_pe_num) {
1696                                mutex_unlock(&phb->ioda.pe_alloc_mutex);
1697                                dev_info(&pdev->dev, "Failed to allocate PE numbers for %d VFs\n", num_vfs);
1698                                kfree(pdn->pe_num_map);
1699                                return -EBUSY;
1700                        }
1701                        bitmap_set(phb->ioda.pe_alloc, *pdn->pe_num_map, num_vfs);
1702                        mutex_unlock(&phb->ioda.pe_alloc_mutex);
1703                }
1704                pdn->num_vfs = num_vfs;
1705
1706                /* Assign M64 window accordingly */
1707                ret = pnv_pci_vf_assign_m64(pdev, num_vfs);
1708                if (ret) {
1709                        dev_info(&pdev->dev, "Not enough M64 window resources\n");
1710                        goto m64_failed;
1711                }
1712
1713                /*
1714                 * When using one M64 BAR to map one IOV BAR, we need to shift
1715                 * the IOV BAR according to the PE# allocated to the VFs.
1716                 * Otherwise, the PE# for the VF will conflict with others.
1717                 */
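                    /*
                     * For example (with hypothetical PE numbers): if
                     * *pdn->pe_num_map is 8, the IOV BAR is shifted up by
                     * eight per-VF BAR sizes so that VF0's BAR lands in the
                     * M64 segment owned by PE#8, VF1's in PE#9, and so on.
                     */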
1718                if (!pdn->m64_single_mode) {
1719                        ret = pnv_pci_vf_resource_shift(pdev, *pdn->pe_num_map);
1720                        if (ret)
1721                                goto m64_failed;
1722                }
1723        }
1724
1725        /* Setup VF PEs */
1726        pnv_ioda_setup_vf_PE(pdev, num_vfs);
1727
1728        return 0;
1729
1730m64_failed:
1731        if (pdn->m64_single_mode) {
1732                for (i = 0; i < num_vfs; i++) {
1733                        if (pdn->pe_num_map[i] == IODA_INVALID_PE)
1734                                continue;
1735
1736                        pe = &phb->ioda.pe_array[pdn->pe_num_map[i]];
1737                        pnv_ioda_free_pe(pe);
1738                }
1739        } else {
1740                bitmap_clear(phb->ioda.pe_alloc, *pdn->pe_num_map, num_vfs);
            }
1741
1742        /* Releasing pe_num_map */
1743        kfree(pdn->pe_num_map);
1744
1745        return ret;
1746}
1747
1748static int pnv_pcibios_sriov_disable(struct pci_dev *pdev)
1749{
1750        pnv_pci_sriov_disable(pdev);
1751
1752        /* Release PCI data */
1753        remove_sriov_vf_pdns(pdev);
1754        return 0;
1755}
1756
1757static int pnv_pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
1758{
1759        /* Allocate PCI data */
1760        add_sriov_vf_pdns(pdev);
1761
1762        return pnv_pci_sriov_enable(pdev, num_vfs);
1763}
1764#endif /* CONFIG_PCI_IOV */
1765
1766static void pnv_pci_ioda_dma_dev_setup(struct pnv_phb *phb, struct pci_dev *pdev)
1767{
1768        struct pci_dn *pdn = pci_get_pdn(pdev);
1769        struct pnv_ioda_pe *pe;
1770
1771        /*
1772         * This function can be called before the PE#
1773         * has been assigned. Do nothing in that
1774         * case.
1775         */
1776        if (!pdn || pdn->pe_number == IODA_INVALID_PE)
1777                return;
1778
1779        pe = &phb->ioda.pe_array[pdn->pe_number];
1780        WARN_ON(get_dma_ops(&pdev->dev) != &dma_iommu_ops);
1781        pdev->dev.archdata.dma_offset = pe->tce_bypass_base;
1782        set_iommu_table_base(&pdev->dev, pe->table_group.tables[0]);
1783        /*
1784         * Note: iommu_add_device() will fail here as
1785         * for physical PE: the device is already added by now;
1786         * for virtual PE: sysfs entries are not ready yet and
1787         * tce_iommu_bus_notifier will add the device to a group later.
1788         */
1789}
1790
1791/*
1792 * Reconfigure TVE#0 to be usable as 64-bit DMA space.
1793 *
1794 * The first 4GB of virtual memory for a PE is reserved for 32-bit accesses.
1795 * Devices can only access more than that if bit 59 of the PCI address is set
1796 * by hardware, which indicates TVE#1 should be used instead of TVE#0.
1797 * Many PCI devices are not capable of addressing that many bits, and as a
1798 * result are limited to the 4GB of virtual memory made available to 32-bit
1799 * devices in TVE#0.
1800 *
1801 * In order to work around this, reconfigure TVE#0 to be suitable for 64-bit
1802 * devices: map all of memory at a 4GB offset, leaving the first 4GB of DMA
1803 * space unmapped.  This should only be used by devices that want more than
1804 * 4GB, and only on PEs that have no 32-bit devices.
1805 *
1806 * Currently this will only work on PHB3 (POWER8).
1807 */
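    /*
     * A worked example, assuming 64GB of addressable memory (a hypothetical
     * figure): window_size = roundup_pow_of_two(64GB + 4GB) = 128GB, so
     * tce_count = 128GB >> 28 = 512 TCEs and table_size = 512 * 8 = 4KB
     * (rounded up to PAGE_SIZE below if smaller).  Each 256MB TCE maps DMA
     * address (addr + 4GB) to physical address addr, leaving DMA 0..4GB
     * unmapped.
     */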
1808static int pnv_pci_ioda_dma_64bit_bypass(struct pnv_ioda_pe *pe)
1809{
1810        u64 window_size, table_size, tce_count, addr;
1811        struct page *table_pages;
1812        u64 tce_order = 28; /* 256MB TCEs */
1813        __be64 *tces;
1814        s64 rc;
1815
1816        /*
1817         * Window size needs to be a power of two, but needs to account for
1818         * shifting memory by the 4GB offset required to skip 32bit space.
1819         */
1820        window_size = roundup_pow_of_two(memory_hotplug_max() + (1ULL << 32));
1821        tce_count = window_size >> tce_order;
1822        table_size = tce_count << 3;
1823
1824        if (table_size < PAGE_SIZE)
1825                table_size = PAGE_SIZE;
1826
1827        table_pages = alloc_pages_node(pe->phb->hose->node, GFP_KERNEL,
1828                                       get_order(table_size));
1829        if (!table_pages)
1830                goto err;
1831
1832        tces = page_address(table_pages);
1833        if (!tces)
1834                goto err;
1835
1836        memset(tces, 0, table_size);
1837
1838        for (addr = 0; addr < memory_hotplug_max(); addr += (1 << tce_order)) {
1839                tces[(addr + (1ULL << 32)) >> tce_order] =
1840                        cpu_to_be64(addr | TCE_PCI_READ | TCE_PCI_WRITE);
1841        }
1842
1843        rc = opal_pci_map_pe_dma_window(pe->phb->opal_id,
1844                                        pe->pe_number,
1845                                        /* reconfigure window 0 */
1846                                        (pe->pe_number << 1) + 0,
1847                                        1,
1848                                        __pa(tces),
1849                                        table_size,
1850                                        1 << tce_order);
1851        if (rc == OPAL_SUCCESS) {
1852                pe_info(pe, "Using 64-bit DMA iommu bypass (through TVE#0)\n");
1853                return 0;
1854        }
1855err:
1856        pe_err(pe, "Error configuring 64-bit DMA bypass\n");
1857        return -EIO;
1858}
1859
1860static bool pnv_pci_ioda_iommu_bypass_supported(struct pci_dev *pdev,
1861                u64 dma_mask)
1862{
1863        struct pci_controller *hose = pci_bus_to_host(pdev->bus);
1864        struct pnv_phb *phb = hose->private_data;
1865        struct pci_dn *pdn = pci_get_pdn(pdev);
1866        struct pnv_ioda_pe *pe;
1867
1868        if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE))
1869                return false;
1870
1871        pe = &phb->ioda.pe_array[pdn->pe_number];
1872        if (pe->tce_bypass_enabled) {
1873                u64 top = pe->tce_bypass_base + memblock_end_of_DRAM() - 1;
1874                if (dma_mask >= top)
1875                        return true;
1876        }
1877
1878        /*
1879         * If the device can't set the TCE bypass bit but still wants
1880         * to access 4GB or more, on PHB3 we can reconfigure TVE#0 to
1881         * bypass the 32-bit region and be usable for 64-bit DMAs.
1882         * The device needs to be able to address all of this space.
1883         */
1884        if (dma_mask >> 32 &&
1885            dma_mask > (memory_hotplug_max() + (1ULL << 32)) &&
1886            /* pe->pdev should be set if it's a single device, pe->pbus if not */
1887            (pe->device_count == 1 || !pe->pbus) &&
1888            phb->model == PNV_PHB_MODEL_PHB3) {
1889                /* Configure the bypass mode */
1890                s64 rc = pnv_pci_ioda_dma_64bit_bypass(pe);
1891                if (rc)
1892                        return false;
1893                /* 4GB offset bypasses 32-bit space */
1894                pdev->dev.archdata.dma_offset = (1ULL << 32);
1895                return true;
1896        }
1897
1898        return false;
1899}
1900
1901static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe, struct pci_bus *bus)
1902{
1903        struct pci_dev *dev;
1904
1905        list_for_each_entry(dev, &bus->devices, bus_list) {
1906                set_iommu_table_base(&dev->dev, pe->table_group.tables[0]);
1907                dev->dev.archdata.dma_offset = pe->tce_bypass_base;
1908
1909                if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate)
1910                        pnv_ioda_setup_bus_dma(pe, dev->subordinate);
1911        }
1912}
1913
1914static inline __be64 __iomem *pnv_ioda_get_inval_reg(struct pnv_phb *phb,
1915                                                     bool real_mode)
1916{
1917        return real_mode ? (__be64 __iomem *)(phb->regs_phys + 0x210) :
1918                (phb->regs + 0x210);
1919}
1920
1921static void pnv_pci_p7ioc_tce_invalidate(struct iommu_table *tbl,
1922                unsigned long index, unsigned long npages, bool rm)
1923{
1924        struct iommu_table_group_link *tgl = list_first_entry_or_null(
1925                        &tbl->it_group_list, struct iommu_table_group_link,
1926                        next);
1927        struct pnv_ioda_pe *pe = container_of(tgl->table_group,
1928                        struct pnv_ioda_pe, table_group);
1929        __be64 __iomem *invalidate = pnv_ioda_get_inval_reg(pe->phb, rm);
1930        unsigned long start, end, inc;
1931
1932        start = __pa(((__be64 *)tbl->it_base) + index - tbl->it_offset);
1933        end = __pa(((__be64 *)tbl->it_base) + index - tbl->it_offset +
1934                        npages - 1);
1935
1936        /* p7ioc-style invalidation, 2 TCEs per write */
1937        start |= (1ull << 63);
1938        end |= (1ull << 63);
1939        inc = 16;
1940        end |= inc - 1; /* round end up so it differs from start */
1941
1942        mb(); /* Ensure above stores are visible */
1943        while (start <= end) {
1944                if (rm)
1945                        __raw_rm_writeq_be(start, invalidate);
1946                else
1947                        __raw_writeq_be(start, invalidate);
1948
1949                start += inc;
1950        }
1951
1952        /*
1953         * The iommu layer will do another mb() for us on build()
1954         * and we don't care on free()
1955         */
1956}
1957
1958static int pnv_ioda1_tce_build(struct iommu_table *tbl, long index,
1959                long npages, unsigned long uaddr,
1960                enum dma_data_direction direction,
1961                unsigned long attrs)
1962{
1963        int ret = pnv_tce_build(tbl, index, npages, uaddr, direction,
1964                        attrs);
1965
1966        if (!ret)
1967                pnv_pci_p7ioc_tce_invalidate(tbl, index, npages, false);
1968
1969        return ret;
1970}
1971
1972#ifdef CONFIG_IOMMU_API
1973/* Common for IODA1 and IODA2 */
1974static int pnv_ioda_tce_xchg_no_kill(struct iommu_table *tbl, long index,
1975                unsigned long *hpa, enum dma_data_direction *direction,
1976                bool realmode)
1977{
1978        return pnv_tce_xchg(tbl, index, hpa, direction, !realmode);
1979}
1980
1981static int pnv_ioda1_tce_xchg(struct iommu_table *tbl, long index,
1982                unsigned long *hpa, enum dma_data_direction *direction)
1983{
1984        long ret = pnv_tce_xchg(tbl, index, hpa, direction, true);
1985
1986        if (!ret)
1987                pnv_pci_p7ioc_tce_invalidate(tbl, index, 1, false);
1988
1989        return ret;
1990}
1991
1992static int pnv_ioda1_tce_xchg_rm(struct iommu_table *tbl, long index,
1993                unsigned long *hpa, enum dma_data_direction *direction)
1994{
1995        long ret = pnv_tce_xchg(tbl, index, hpa, direction, false);
1996
1997        if (!ret)
1998                pnv_pci_p7ioc_tce_invalidate(tbl, index, 1, true);
1999
2000        return ret;
2001}
2002#endif
2003
2004static void pnv_ioda1_tce_free(struct iommu_table *tbl, long index,
2005                long npages)
2006{
2007        pnv_tce_free(tbl, index, npages);
2008
2009        pnv_pci_p7ioc_tce_invalidate(tbl, index, npages, false);
2010}
2011
2012static struct iommu_table_ops pnv_ioda1_iommu_ops = {
2013        .set = pnv_ioda1_tce_build,
2014#ifdef CONFIG_IOMMU_API
2015        .exchange = pnv_ioda1_tce_xchg,
2016        .exchange_rm = pnv_ioda1_tce_xchg_rm,
2017        .xchg_no_kill = pnv_ioda_tce_xchg_no_kill,
2018        .tce_kill = pnv_pci_p7ioc_tce_invalidate,
2019        .useraddrptr = pnv_tce_useraddrptr,
2020#endif
2021        .clear = pnv_ioda1_tce_free,
2022        .get = pnv_tce_get,
2023};
2024
2025#define PHB3_TCE_KILL_INVAL_ALL         PPC_BIT(0)
2026#define PHB3_TCE_KILL_INVAL_PE          PPC_BIT(1)
2027#define PHB3_TCE_KILL_INVAL_ONE         PPC_BIT(2)
2028
2029static void pnv_pci_phb3_tce_invalidate_entire(struct pnv_phb *phb, bool rm)
2030{
2031        __be64 __iomem *invalidate = pnv_ioda_get_inval_reg(phb, rm);
2032        const unsigned long val = PHB3_TCE_KILL_INVAL_ALL;
2033
2034        mb(); /* Ensure previous TCE table stores are visible */
2035        if (rm)
2036                __raw_rm_writeq_be(val, invalidate);
2037        else
2038                __raw_writeq_be(val, invalidate);
2039}
2040
2041static inline void pnv_pci_phb3_tce_invalidate_pe(struct pnv_ioda_pe *pe)
2042{
2043        /* 01xb - invalidate TCEs that match the specified PE# */
2044        __be64 __iomem *invalidate = pnv_ioda_get_inval_reg(pe->phb, false);
2045        unsigned long val = PHB3_TCE_KILL_INVAL_PE | (pe->pe_number & 0xFF);
2046
2047        mb(); /* Ensure above stores are visible */
2048        __raw_writeq_be(val, invalidate);
2049}
2050
2051static void pnv_pci_phb3_tce_invalidate(struct pnv_ioda_pe *pe, bool rm,
2052                                        unsigned shift, unsigned long index,
2053                                        unsigned long npages)
2054{
2055        __be64 __iomem *invalidate = pnv_ioda_get_inval_reg(pe->phb, rm);
2056        unsigned long start, end, inc;
2057
2058        /* We'll invalidate DMA addresses in PE scope */
2059        start = PHB3_TCE_KILL_INVAL_ONE;
2060        start |= (pe->pe_number & 0xFF);
2061        end = start;
2062
2063        /* Figure out the start, end and step */
2064        start |= (index << shift);
2065        end |= ((index + npages - 1) << shift);
2066        inc = (0x1ull << shift);
2067        mb();
2068
2069        while (start <= end) {
2070                if (rm)
2071                        __raw_rm_writeq_be(start, invalidate);
2072                else
2073                        __raw_writeq_be(start, invalidate);
2074                start += inc;
2075        }
2076}
2077
2078static inline void pnv_pci_ioda2_tce_invalidate_pe(struct pnv_ioda_pe *pe)
2079{
2080        struct pnv_phb *phb = pe->phb;
2081
2082        if (phb->model == PNV_PHB_MODEL_PHB3 && phb->regs)
2083                pnv_pci_phb3_tce_invalidate_pe(pe);
2084        else
2085                opal_pci_tce_kill(phb->opal_id, OPAL_PCI_TCE_KILL_PE,
2086                                  pe->pe_number, 0, 0, 0);
2087}
2088
2089static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl,
2090                unsigned long index, unsigned long npages, bool rm)
2091{
2092        struct iommu_table_group_link *tgl;
2093
2094        list_for_each_entry_lockless(tgl, &tbl->it_group_list, next) {
2095                struct pnv_ioda_pe *pe = container_of(tgl->table_group,
2096                                struct pnv_ioda_pe, table_group);
2097                struct pnv_phb *phb = pe->phb;
2098                unsigned int shift = tbl->it_page_shift;
2099
2100                /*
2101                 * NVLink1 can use the TCE kill register directly as
2102                 * it's the same as PHB3. NVLink2 is different and
2103                 * should go via the OPAL call.
2104                 */
2105                if (phb->model == PNV_PHB_MODEL_NPU) {
2106                        /*
2107                         * The NVLink hardware does not support TCE kill
2108                         * per TCE entry so we have to invalidate
2109                         * the entire cache for it.
2110                         */
2111                        pnv_pci_phb3_tce_invalidate_entire(phb, rm);
2112                        continue;
2113                }
2114                if (phb->model == PNV_PHB_MODEL_PHB3 && phb->regs)
2115                        pnv_pci_phb3_tce_invalidate(pe, rm, shift,
2116                                                    index, npages);
2117                else
2118                        opal_pci_tce_kill(phb->opal_id,
2119                                          OPAL_PCI_TCE_KILL_PAGES,
2120                                          pe->pe_number, 1u << shift,
2121                                          index << shift, npages);
2122        }
2123}
2124
2125void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_phb *phb, bool rm)
2126{
2127        if (phb->model == PNV_PHB_MODEL_NPU || phb->model == PNV_PHB_MODEL_PHB3)
2128                pnv_pci_phb3_tce_invalidate_entire(phb, rm);
2129        else
2130                opal_pci_tce_kill(phb->opal_id, OPAL_PCI_TCE_KILL, 0, 0, 0, 0);
2131}
2132
2133static int pnv_ioda2_tce_build(struct iommu_table *tbl, long index,
2134                long npages, unsigned long uaddr,
2135                enum dma_data_direction direction,
2136                unsigned long attrs)
2137{
2138        int ret = pnv_tce_build(tbl, index, npages, uaddr, direction,
2139                        attrs);
2140
2141        if (!ret)
2142                pnv_pci_ioda2_tce_invalidate(tbl, index, npages, false);
2143
2144        return ret;
2145}
2146
2147#ifdef CONFIG_IOMMU_API
2148static int pnv_ioda2_tce_xchg(struct iommu_table *tbl, long index,
2149                unsigned long *hpa, enum dma_data_direction *direction)
2150{
2151        long ret = pnv_tce_xchg(tbl, index, hpa, direction, true);
2152
2153        if (!ret)
2154                pnv_pci_ioda2_tce_invalidate(tbl, index, 1, false);
2155
2156        return ret;
2157}
2158
2159static int pnv_ioda2_tce_xchg_rm(struct iommu_table *tbl, long index,
2160                unsigned long *hpa, enum dma_data_direction *direction)
2161{
2162        long ret = pnv_tce_xchg(tbl, index, hpa, direction, false);
2163
2164        if (!ret)
2165                pnv_pci_ioda2_tce_invalidate(tbl, index, 1, true);
2166
2167        return ret;
2168}
2169#endif
2170
2171static void pnv_ioda2_tce_free(struct iommu_table *tbl, long index,
2172                long npages)
2173{
2174        pnv_tce_free(tbl, index, npages);
2175
2176        pnv_pci_ioda2_tce_invalidate(tbl, index, npages, false);
2177}
2178
2179static void pnv_ioda2_table_free(struct iommu_table *tbl)
2180{
2181        pnv_pci_ioda2_table_free_pages(tbl);
2182}
2183
2184static struct iommu_table_ops pnv_ioda2_iommu_ops = {
2185        .set = pnv_ioda2_tce_build,
2186#ifdef CONFIG_IOMMU_API
2187        .exchange = pnv_ioda2_tce_xchg,
2188        .exchange_rm = pnv_ioda2_tce_xchg_rm,
2189        .xchg_no_kill = pnv_ioda_tce_xchg_no_kill,
2190        .tce_kill = pnv_pci_ioda2_tce_invalidate,
2191        .useraddrptr = pnv_tce_useraddrptr,
2192#endif
2193        .clear = pnv_ioda2_tce_free,
2194        .get = pnv_tce_get,
2195        .free = pnv_ioda2_table_free,
2196};
2197
2198static int pnv_pci_ioda_dev_dma_weight(struct pci_dev *dev, void *data)
2199{
2200        unsigned int *weight = (unsigned int *)data;
2201
2202        /* This is quite simplistic. The "base" weight of a device
2203         * is 10. 0 means no DMA is to be accounted for it.
2204         */
2205        if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL)
2206                return 0;
2207
2208        if (dev->class == PCI_CLASS_SERIAL_USB_UHCI ||
2209            dev->class == PCI_CLASS_SERIAL_USB_OHCI ||
2210            dev->class == PCI_CLASS_SERIAL_USB_EHCI)
2211                *weight += 3;
2212        else if ((dev->class >> 8) == PCI_CLASS_STORAGE_RAID)
2213                *weight += 15;
2214        else
2215                *weight += 10;
2216
2217        return 0;
2218}
2219
2220static unsigned int pnv_pci_ioda_pe_dma_weight(struct pnv_ioda_pe *pe)
2221{
2222        unsigned int weight = 0;
2223
2224        /* SRIOV VF has same DMA32 weight as its PF */
2225#ifdef CONFIG_PCI_IOV
2226        if ((pe->flags & PNV_IODA_PE_VF) && pe->parent_dev) {
2227                pnv_pci_ioda_dev_dma_weight(pe->parent_dev, &weight);
2228                return weight;
2229        }
2230#endif
2231
2232        if ((pe->flags & PNV_IODA_PE_DEV) && pe->pdev) {
2233                pnv_pci_ioda_dev_dma_weight(pe->pdev, &weight);
2234        } else if ((pe->flags & PNV_IODA_PE_BUS) && pe->pbus) {
2235                struct pci_dev *pdev;
2236
2237                list_for_each_entry(pdev, &pe->pbus->devices, bus_list)
2238                        pnv_pci_ioda_dev_dma_weight(pdev, &weight);
2239        } else if ((pe->flags & PNV_IODA_PE_BUS_ALL) && pe->pbus) {
2240                pci_walk_bus(pe->pbus, pnv_pci_ioda_dev_dma_weight, &weight);
2241        }
2242
2243        return weight;
2244}
2245
2246static void pnv_pci_ioda1_setup_dma_pe(struct pnv_phb *phb,
2247                                       struct pnv_ioda_pe *pe)
2248{
2249
2250        struct page *tce_mem = NULL;
2251        struct iommu_table *tbl;
2252        unsigned int weight, total_weight = 0;
2253        unsigned int tce32_segsz, base, segs, avail, i;
2254        int64_t rc;
2255        void *addr;
2256
2257        /* XXX FIXME: Handle 64-bit only DMA devices */
2258        /* XXX FIXME: Provide 64-bit DMA facilities & non-4K TCE tables etc.. */
2259        /* XXX FIXME: Allocate multi-level tables on PHB3 */
2260        weight = pnv_pci_ioda_pe_dma_weight(pe);
2261        if (!weight)
2262                return;
2263
2264        pci_walk_bus(phb->hose->bus, pnv_pci_ioda_dev_dma_weight,
2265                     &total_weight);
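            /*
             * The PE gets a share of the PHB's DMA32 segments proportional
             * to its weight. For example (hypothetical weights): a PE with
             * weight 10 on a PHB with total weight 60 and 16 DMA32 segments
             * is assigned (10 * 16) / 60 = 2 segments.
             */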
2266        segs = (weight * phb->ioda.dma32_count) / total_weight;
2267        if (!segs)
2268                segs = 1;
2269
2270        /*
2271         * Allocate contiguous DMA32 segments. We begin with the expected
2272         * number of segments. On each failed pass, the number of requested
2273         * DMA32 segments is decreased by one until the allocation
2274         * succeeds or no segments are left.
2275         */
2276        do {
2277                for (base = 0; base <= phb->ioda.dma32_count - segs; base++) {
2278                        for (avail = 0, i = base; i < base + segs; i++) {
2279                                if (phb->ioda.dma32_segmap[i] ==
2280                                    IODA_INVALID_PE)
2281                                        avail++;
2282                        }
2283
2284                        if (avail == segs)
2285                                goto found;
2286                }
2287        } while (--segs);
2288
2289        if (!segs) {
2290                pe_warn(pe, "No available DMA32 segments\n");
2291                return;
2292        }
2293
2294found:
2295        tbl = pnv_pci_table_alloc(phb->hose->node);
2296        if (WARN_ON(!tbl))
2297                return;
2298
2299        iommu_register_group(&pe->table_group, phb->hose->global_number,
2300                        pe->pe_number);
2301        pnv_pci_link_table_and_group(phb->hose->node, 0, tbl, &pe->table_group);
2302
2303        /* Grab a 32-bit TCE table */
2304        pe_info(pe, "DMA weight %d (%d), assigned (%d) %d DMA32 segments\n",
2305                weight, total_weight, base, segs);
2306        pe_info(pe, " Setting up 32-bit TCE table at %08x..%08x\n",
2307                base * PNV_IODA1_DMA32_SEGSIZE,
2308                (base + segs) * PNV_IODA1_DMA32_SEGSIZE - 1);
2309
2310        /* XXX Currently, we allocate one big contiguous table for the
2311         * TCEs. We only really need one chunk per 256M of TCE space
2312         * (i.e. per segment) but that's an optimization for later; it
2313         * requires some added smarts with our get/put_tce implementation.
2314         *
2315         * Each TCE page is 4KB in size and each TCE entry occupies 8
2316         * bytes.
2317         */
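            /*
             * Worked example of the computation below: one 256MB DMA32
             * segment of 4KB TCE pages needs 0x10000000 >> (12 - 3) =
             * 512KB of TCE table, i.e. 65536 entries of 8 bytes each.
             */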
2318        tce32_segsz = PNV_IODA1_DMA32_SEGSIZE >> (IOMMU_PAGE_SHIFT_4K - 3);
2319        tce_mem = alloc_pages_node(phb->hose->node, GFP_KERNEL,
2320                                   get_order(tce32_segsz * segs));
2321        if (!tce_mem) {
2322                pe_err(pe, " Failed to allocate 32-bit TCE table memory\n");
2323                goto fail;
2324        }
2325        addr = page_address(tce_mem);
2326        memset(addr, 0, tce32_segsz * segs);
2327
2328        /* Configure HW */
2329        for (i = 0; i < segs; i++) {
2330                rc = opal_pci_map_pe_dma_window(phb->opal_id,
2331                                              pe->pe_number,
2332                                              base + i, 1,
2333                                              __pa(addr) + tce32_segsz * i,
2334                                              tce32_segsz, IOMMU_PAGE_SIZE_4K);
2335                if (rc) {
2336                        pe_err(pe, " Failed to configure 32-bit TCE table, err %ld\n",
2337                               rc);
2338                        goto fail;
2339                }
2340        }
2341
2342        /* Setup DMA32 segment mapping */
2343        for (i = base; i < base + segs; i++)
2344                phb->ioda.dma32_segmap[i] = pe->pe_number;
2345
2346        /* Setup linux iommu table */
2347        pnv_pci_setup_iommu_table(tbl, addr, tce32_segsz * segs,
2348                                  base * PNV_IODA1_DMA32_SEGSIZE,
2349                                  IOMMU_PAGE_SHIFT_4K);
2350
2351        tbl->it_ops = &pnv_ioda1_iommu_ops;
2352        pe->table_group.tce32_start = tbl->it_offset << tbl->it_page_shift;
2353        pe->table_group.tce32_size = tbl->it_size << tbl->it_page_shift;
2354        iommu_init_table(tbl, phb->hose->node);
2355
2356        if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))
2357                pnv_ioda_setup_bus_dma(pe, pe->pbus);
2358
2359        return;
2360 fail:
2361        /* XXX Failure: Try to fallback to 64-bit only ? */
2362        if (tce_mem)
2363                __free_pages(tce_mem, get_order(tce32_segsz * segs));
2364        if (tbl) {
2365                pnv_pci_unlink_table_and_group(tbl, &pe->table_group);
2366                iommu_tce_table_put(tbl);
2367        }
2368}
2369
2370static long pnv_pci_ioda2_set_window(struct iommu_table_group *table_group,
2371                int num, struct iommu_table *tbl)
2372{
2373        struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
2374                        table_group);
2375        struct pnv_phb *phb = pe->phb;
2376        int64_t rc;
2377        const unsigned long size = tbl->it_indirect_levels ?
2378                        tbl->it_level_size : tbl->it_size;
2379        const __u64 start_addr = tbl->it_offset << tbl->it_page_shift;
2380        const __u64 win_size = tbl->it_size << tbl->it_page_shift;
2381
2382        pe_info(pe, "Setting up window#%d %llx..%llx pg=%x\n", num,
2383                        start_addr, start_addr + win_size - 1,
2384                        IOMMU_PAGE_SIZE(tbl));
2385
2386        /*
2387         * Map TCE table through TVT. The TVE index is the PE number
2388         * shifted by 1 bit for 32-bit DMA space.
2389         */
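            /*
             * E.g. PE#5 (a hypothetical PE number) gets TVE#10 for window 0
             * and TVE#11 for window 1, which pnv_pci_ioda2_set_bypass()
             * uses as the 64-bit bypass window.
             */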
2390        rc = opal_pci_map_pe_dma_window(phb->opal_id,
2391                        pe->pe_number,
2392                        (pe->pe_number << 1) + num,
2393                        tbl->it_indirect_levels + 1,
2394                        __pa(tbl->it_base),
2395                        size << 3,
2396                        IOMMU_PAGE_SIZE(tbl));
2397        if (rc) {
2398                pe_err(pe, "Failed to configure TCE table, err %ld\n", rc);
2399                return rc;
2400        }
2401
2402        pnv_pci_link_table_and_group(phb->hose->node, num,
2403                        tbl, &pe->table_group);
2404        pnv_pci_ioda2_tce_invalidate_pe(pe);
2405
2406        return 0;
2407}
2408
2409void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable)
2410{
2411        uint16_t window_id = (pe->pe_number << 1) + 1;
2412        int64_t rc;
2413
2414        pe_info(pe, "%sabling 64-bit DMA bypass\n", enable ? "En" : "Dis");
2415        if (enable) {
2416                phys_addr_t top = memblock_end_of_DRAM();
2417
2418                top = roundup_pow_of_two(top);
2419                rc = opal_pci_map_pe_dma_window_real(pe->phb->opal_id,
2420                                                     pe->pe_number,
2421                                                     window_id,
2422                                                     pe->tce_bypass_base,
2423                                                     top);
2424        } else {
2425                rc = opal_pci_map_pe_dma_window_real(pe->phb->opal_id,
2426                                                     pe->pe_number,
2427                                                     window_id,
2428                                                     pe->tce_bypass_base,
2429                                                     0);
2430        }
2431        if (rc)
2432                pe_err(pe, "OPAL error %lld configuring bypass window\n", rc);
2433        else
2434                pe->tce_bypass_enabled = enable;
2435}
2436
2437static long pnv_pci_ioda2_create_table(struct iommu_table_group *table_group,
2438                int num, __u32 page_shift, __u64 window_size, __u32 levels,
2439                bool alloc_userspace_copy, struct iommu_table **ptbl)
2440{
2441        struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
2442                        table_group);
2443        int nid = pe->phb->hose->node;
2444        __u64 bus_offset = num ? pe->tce_bypass_base : table_group->tce32_start;
2445        long ret;
2446        struct iommu_table *tbl;
2447
2448        tbl = pnv_pci_table_alloc(nid);
2449        if (!tbl)
2450                return -ENOMEM;
2451
2452        tbl->it_ops = &pnv_ioda2_iommu_ops;
2453
2454        ret = pnv_pci_ioda2_table_alloc_pages(nid,
2455                        bus_offset, page_shift, window_size,
2456                        levels, alloc_userspace_copy, tbl);
2457        if (ret) {
2458                iommu_tce_table_put(tbl);
2459                return ret;
2460        }
2461
2462        *ptbl = tbl;
2463
2464        return 0;
2465}
2466
2467static long pnv_pci_ioda2_setup_default_config(struct pnv_ioda_pe *pe)
2468{
2469        struct iommu_table *tbl = NULL;
2470        long rc;
2471
2472        /*
2473         * crashkernel= specifies the kdump kernel's maximum memory at
2474         * some offset and there is no guarantee the result is a power
2475         * of 2, which will cause errors later.
2476         */
2477        const u64 max_memory = __rounddown_pow_of_two(memory_hotplug_max());
2478
2479        /*
2480         * In memory constrained environments, e.g. kdump kernel, the
2481         * DMA window can be larger than available memory, which will
2482         * cause errors later.
2483         */
2484        const u64 window_size = min((u64)pe->table_group.tce32_size, max_memory);
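            /*
             * For example (hypothetical sizes): a kdump kernel given 512MB
             * of memory ends up with max_memory = 512MB, so a 2GB
             * tce32_size is clamped to a 512MB default window here.
             */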
2485
2486        rc = pnv_pci_ioda2_create_table(&pe->table_group, 0,
2487                        IOMMU_PAGE_SHIFT_4K,
2488                        window_size,
2489                        POWERNV_IOMMU_DEFAULT_LEVELS, false, &tbl);
2490        if (rc) {
2491                pe_err(pe, "Failed to create 32-bit TCE table, err %ld\n",
2492                                rc);
2493                return rc;
2494        }
2495
2496        iommu_init_table(tbl, pe->phb->hose->node);
2497
2498        rc = pnv_pci_ioda2_set_window(&pe->table_group, 0, tbl);
2499        if (rc) {
2500                pe_err(pe, "Failed to configure 32-bit TCE table, err %ld\n",
2501                                rc);
2502                iommu_tce_table_put(tbl);
2503                return rc;
2504        }
2505
2506        if (!pnv_iommu_bypass_disabled)
2507                pnv_pci_ioda2_set_bypass(pe, true);
2508
2509        return 0;
2510}
2511
2512#if defined(CONFIG_IOMMU_API) || defined(CONFIG_PCI_IOV)
2513static long pnv_pci_ioda2_unset_window(struct iommu_table_group *table_group,
2514                int num)
2515{
2516        struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
2517                        table_group);
2518        struct pnv_phb *phb = pe->phb;
2519        long ret;
2520
2521        pe_info(pe, "Removing DMA window #%d\n", num);
2522
2523        ret = opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number,
2524                        (pe->pe_number << 1) + num,
2525                        0/* levels */, 0/* table address */,
2526                        0/* table size */, 0/* page size */);
2527        if (ret)
2528                pe_warn(pe, "Unmapping failed, ret = %ld\n", ret);
2529        else
2530                pnv_pci_ioda2_tce_invalidate_pe(pe);
2531
2532        pnv_pci_unlink_table_and_group(table_group->tables[num], table_group);
2533
2534        return ret;
2535}
2536#endif
2537
2538#ifdef CONFIG_IOMMU_API
2539unsigned long pnv_pci_ioda2_get_table_size(__u32 page_shift,
2540                __u64 window_size, __u32 levels)
2541{
2542        unsigned long bytes = 0;
2543        const unsigned window_shift = ilog2(window_size);
2544        unsigned entries_shift = window_shift - page_shift;
2545        unsigned table_shift = entries_shift + 3;
2546        unsigned long tce_table_size = max(0x1000UL, 1UL << table_shift);
2547        unsigned long direct_table_size;
2548
2549        if (!levels || (levels > POWERNV_IOMMU_MAX_LEVELS) ||
2550                        !is_power_of_2(window_size))
2551                return 0;
2552
2553        /* Calculate a direct table size from window_size and levels */
2554        entries_shift = (entries_shift + levels - 1) / levels;
2555        table_shift = entries_shift + 3;
2556        table_shift = max_t(unsigned, table_shift, PAGE_SHIFT);
2557        direct_table_size = 1UL << table_shift;
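            /*
             * E.g. a 4GB window of 64KB pages at levels = 1 (assuming a
             * 64KB kernel PAGE_SIZE) gives tce_table_size =
             * direct_table_size = 512KB, so the loop below returns 1MB:
             * 512KB for the HW table plus 512KB for the userspace copy.
             */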
2558
2559        for ( ; levels; --levels) {
2560                bytes += ALIGN(tce_table_size, direct_table_size);
2561
2562                tce_table_size /= direct_table_size;
2563                tce_table_size <<= 3;
2564                tce_table_size = max_t(unsigned long,
2565                                tce_table_size, direct_table_size);
2566        }
2567
2568        return bytes + bytes; /* one for HW table, one for userspace copy */
2569}
2570
2571static long pnv_pci_ioda2_create_table_userspace(
2572                struct iommu_table_group *table_group,
2573                int num, __u32 page_shift, __u64 window_size, __u32 levels,
2574                struct iommu_table **ptbl)
2575{
2576        long ret = pnv_pci_ioda2_create_table(table_group,
2577                        num, page_shift, window_size, levels, true, ptbl);
2578
2579        if (!ret)
2580                (*ptbl)->it_allocated_size = pnv_pci_ioda2_get_table_size(
2581                                page_shift, window_size, levels);
2582        return ret;
2583}
2584
2585static void pnv_ioda2_take_ownership(struct iommu_table_group *table_group)
2586{
2587        struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
2588                                                table_group);
2589        /* Store @tbl as pnv_pci_ioda2_unset_window() resets it */
2590        struct iommu_table *tbl = pe->table_group.tables[0];
2591
2592        pnv_pci_ioda2_set_bypass(pe, false);
2593        pnv_pci_ioda2_unset_window(&pe->table_group, 0);
2594        if (pe->pbus)
2595                pnv_ioda_setup_bus_dma(pe, pe->pbus);
2596        iommu_tce_table_put(tbl);
2597}
2598
2599static void pnv_ioda2_release_ownership(struct iommu_table_group *table_group)
2600{
2601        struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
2602                                                table_group);
2603
2604        pnv_pci_ioda2_setup_default_config(pe);
2605        if (pe->pbus)
2606                pnv_ioda_setup_bus_dma(pe, pe->pbus);
2607}
2608
2609static struct iommu_table_group_ops pnv_pci_ioda2_ops = {
2610        .get_table_size = pnv_pci_ioda2_get_table_size,
2611        .create_table = pnv_pci_ioda2_create_table_userspace,
2612        .set_window = pnv_pci_ioda2_set_window,
2613        .unset_window = pnv_pci_ioda2_unset_window,
2614        .take_ownership = pnv_ioda2_take_ownership,
2615        .release_ownership = pnv_ioda2_release_ownership,
2616};
2617
2618static void pnv_ioda_setup_bus_iommu_group_add_devices(struct pnv_ioda_pe *pe,
2619                struct iommu_table_group *table_group,
2620                struct pci_bus *bus)
2621{
2622        struct pci_dev *dev;
2623
2624        list_for_each_entry(dev, &bus->devices, bus_list) {
2625                iommu_add_device(table_group, &dev->dev);
2626
2627                if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate)
2628                        pnv_ioda_setup_bus_iommu_group_add_devices(pe,
2629                                        table_group, dev->subordinate);
2630        }
2631}
2632
2633static void pnv_ioda_setup_bus_iommu_group(struct pnv_ioda_pe *pe,
2634                struct iommu_table_group *table_group, struct pci_bus *bus)
2635{
2636
2637        if (pe->flags & PNV_IODA_PE_DEV)
2638                iommu_add_device(table_group, &pe->pdev->dev);
2639
2640        if ((pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) || bus)
2641                pnv_ioda_setup_bus_iommu_group_add_devices(pe, table_group,
2642                                bus);
2643}
2644
2645static unsigned long pnv_ioda_parse_tce_sizes(struct pnv_phb *phb);
2646
2647static void pnv_pci_ioda_setup_iommu_api(void)
2648{
2649        struct pci_controller *hose;
2650        struct pnv_phb *phb;
2651        struct pnv_ioda_pe *pe;
2652
2653        /*
2654         * There are 4 types of PEs:
2655         * - PNV_IODA_PE_BUS: a downstream port with an adapter,
2656         *   created from pnv_pci_setup_bridge();
2657         * - PNV_IODA_PE_BUS_ALL: a PCI-PCIX bridge with devices behind it,
2658         *   created from pnv_pci_setup_bridge();
2659         * - PNV_IODA_PE_VF: a SRIOV virtual function,
2660         *   created from pnv_pcibios_sriov_enable();
2661         * - PNV_IODA_PE_DEV: an NPU or OCAPI device,
2662         *   created from pnv_pci_ioda_fixup().
2663         *
2664         * Normally a PE is represented by an IOMMU group, however for
2665         * devices with side channels the groups need to be more strict.
2666         */
2667        list_for_each_entry(hose, &hose_list, list_node) {
2668                phb = hose->private_data;
2669
2670                if (phb->type == PNV_PHB_NPU_NVLINK ||
2671                    phb->type == PNV_PHB_NPU_OCAPI)
2672                        continue;
2673
2674                list_for_each_entry(pe, &phb->ioda.pe_list, list) {
2675                        struct iommu_table_group *table_group;
2676
2677                        table_group = pnv_try_setup_npu_table_group(pe);
2678                        if (!table_group) {
2679                                if (!pnv_pci_ioda_pe_dma_weight(pe))
2680                                        continue;
2681
2682                                table_group = &pe->table_group;
2683                                iommu_register_group(&pe->table_group,
2684                                                pe->phb->hose->global_number,
2685                                                pe->pe_number);
2686                        }
2687                        pnv_ioda_setup_bus_iommu_group(pe, table_group,
2688                                        pe->pbus);
2689                }
2690        }
2691
2692        /*
2693         * Now we have all PHBs discovered, time to add NPU devices to
2694         * the corresponding IOMMU groups.
2695         */
2696        list_for_each_entry(hose, &hose_list, list_node) {
2697                unsigned long  pgsizes;
2698
2699                phb = hose->private_data;
2700
2701                if (phb->type != PNV_PHB_NPU_NVLINK)
2702                        continue;
2703
2704                pgsizes = pnv_ioda_parse_tce_sizes(phb);
2705                list_for_each_entry(pe, &phb->ioda.pe_list, list) {
2706                        /*
2707                         * IODA2 bridges get this set up from
2708                         * pci_controller_ops::setup_bridge but NPU bridges
2709                         * do not have this hook defined so we do it here.
2710                         */
2711                        pe->table_group.pgsizes = pgsizes;
2712                        pnv_npu_compound_attach(pe);
2713                }
2714        }
2715}
2716#else /* !CONFIG_IOMMU_API */
2717static void pnv_pci_ioda_setup_iommu_api(void) { }
2718#endif
2719
2720static unsigned long pnv_ioda_parse_tce_sizes(struct pnv_phb *phb)
2721{
2722        struct pci_controller *hose = phb->hose;
2723        struct device_node *dn = hose->dn;
2724        unsigned long mask = 0;
2725        int i, rc, count;
2726        u32 val;
2727
2728        count = of_property_count_u32_elems(dn, "ibm,supported-tce-sizes");
2729        if (count <= 0) {
2730                mask = SZ_4K | SZ_64K;
2731                /* Add 16M for POWER8 by default */
2732                if (cpu_has_feature(CPU_FTR_ARCH_207S) &&
2733                                !cpu_has_feature(CPU_FTR_ARCH_300))
2734                        mask |= SZ_16M;
2735                return mask;
2736        }
2737
2738        for (i = 0; i < count; i++) {
2739                rc = of_property_read_u32_index(dn, "ibm,supported-tce-sizes",
2740                                                i, &val);
2741                if (rc == 0)
2742                        mask |= 1ULL << val;
2743        }
2744
2745        return mask;
2746}
2747
2748static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
2749                                       struct pnv_ioda_pe *pe)
2750{
2751        int64_t rc;
2752
2753        if (!pnv_pci_ioda_pe_dma_weight(pe))
2754                return;
2755
2756        /* TVE #1 is selected by PCI address bit 59 */
2757        pe->tce_bypass_base = 1ull << 59;
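            /*
             * DMA addresses with bit 59 set are therefore routed to TVE#1,
             * which pnv_pci_ioda2_set_bypass() programs as an untranslated
             * window, so tce_bypass_base + addr maps 1:1 onto physical
             * memory.
             */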
2758
2759        /* The PE will reserve all possible 32-bit space */
2760        pe_info(pe, "Setting up 32-bit TCE table at 0..%08x\n",
2761                phb->ioda.m32_pci_base);
2762
2763        /* Setup linux iommu table */
2764        pe->table_group.tce32_start = 0;
2765        pe->table_group.tce32_size = phb->ioda.m32_pci_base;
2766        pe->table_group.max_dynamic_windows_supported =
2767                        IOMMU_TABLE_GROUP_MAX_TABLES;
2768        pe->table_group.max_levels = POWERNV_IOMMU_MAX_LEVELS;
2769        pe->table_group.pgsizes = pnv_ioda_parse_tce_sizes(phb);
2770#ifdef CONFIG_IOMMU_API
2771        pe->table_group.ops = &pnv_pci_ioda2_ops;
2772#endif
2773
2774        rc = pnv_pci_ioda2_setup_default_config(pe);
2775        if (rc)
2776                return;
2777
2778        if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))
2779                pnv_ioda_setup_bus_dma(pe, pe->pbus);
2780}
2781
2782int64_t pnv_opal_pci_msi_eoi(struct irq_chip *chip, unsigned int hw_irq)
2783{
2784        struct pnv_phb *phb = container_of(chip, struct pnv_phb,
2785                                           ioda.irq_chip);
2786
2787        return opal_pci_msi_eoi(phb->opal_id, hw_irq);
2788}
2789
2790static void pnv_ioda2_msi_eoi(struct irq_data *d)
2791{
2792        int64_t rc;
2793        unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d);
2794        struct irq_chip *chip = irq_data_get_irq_chip(d);
2795
2796        rc = pnv_opal_pci_msi_eoi(chip, hw_irq);
2797        WARN_ON_ONCE(rc);
2798
2799        icp_native_eoi(d);
2800}
2801
2802
2803void pnv_set_msi_irq_chip(struct pnv_phb *phb, unsigned int virq)
2804{
2805        struct irq_data *idata;
2806        struct irq_chip *ichip;
2807
2808        /* The MSI EOI OPAL call is only needed on PHB3 */
2809        if (phb->model != PNV_PHB_MODEL_PHB3)
2810                return;
2811
2812        if (!phb->ioda.irq_chip_init) {
2813                /*
2814                 * The first time we set up an MSI IRQ, we need to set up
2815                 * the corresponding IRQ chip so EOIs are routed correctly.
2816                 */
2817                idata = irq_get_irq_data(virq);
2818                ichip = irq_data_get_irq_chip(idata);
2819                phb->ioda.irq_chip_init = 1;
2820                phb->ioda.irq_chip = *ichip;
2821                phb->ioda.irq_chip.irq_eoi = pnv_ioda2_msi_eoi;
2822        }
2823        irq_set_chip(virq, &phb->ioda.irq_chip);
2824}
2825
2826/*
2827 * Returns true iff chip is something that we could call
2828 * pnv_opal_pci_msi_eoi for.
2829 */
2830bool is_pnv_opal_msi(struct irq_chip *chip)
2831{
2832        return chip->irq_eoi == pnv_ioda2_msi_eoi;
2833}
2834EXPORT_SYMBOL_GPL(is_pnv_opal_msi);
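
/*
 * A minimal sketch of how an external user of this export might check
 * the chip before issuing the OPAL EOI directly (hypothetical caller,
 * not part of this file):
 *
 *	struct irq_chip *chip = irq_data_get_irq_chip(d);
 *
 *	if (is_pnv_opal_msi(chip))
 *		pnv_opal_pci_msi_eoi(chip, irqd_to_hwirq(d));
 *	else
 *		chip->irq_eoi(d);
 */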
2835
2836static int pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev,
2837                                  unsigned int hwirq, unsigned int virq,
2838                                  unsigned int is_64, struct msi_msg *msg)
2839{
2840        struct pnv_ioda_pe *pe = pnv_ioda_get_pe(dev);
2841        unsigned int xive_num = hwirq - phb->msi_base;
2842        __be32 data;
2843        int rc;
2844
2845        /* No PE assigned ? bail out ... no MSI for you ! */
2846        if (pe == NULL)
2847                return -ENXIO;
2848
2849        /* Check if we have an MVE */
2850        if (pe->mve_number < 0)
2851                return -ENXIO;
2852
2853        /* Force 32-bit MSI on some broken devices */
2854        if (dev->no_64bit_msi)
2855                is_64 = 0;
2856
2857        /* Assign XIVE to PE */
2858        rc = opal_pci_set_xive_pe(phb->opal_id, pe->pe_number, xive_num);
2859        if (rc) {
2860                pr_warn("%s: OPAL error %d assigning XIVE %d to PE\n",
2861                        pci_name(dev), rc, xive_num);
2862                return -EIO;
2863        }
2864
2865        if (is_64) {
2866                __be64 addr64;
2867
2868                rc = opal_get_msi_64(phb->opal_id, pe->mve_number, xive_num, 1,
2869                                     &addr64, &data);
2870                if (rc) {
2871                        pr_warn("%s: OPAL error %d getting 64-bit MSI data\n",
2872                                pci_name(dev), rc);
2873                        return -EIO;
2874                }
2875                msg->address_hi = be64_to_cpu(addr64) >> 32;
2876                msg->address_lo = be64_to_cpu(addr64) & 0xfffffffful;
2877        } else {
2878                __be32 addr32;
2879
2880                rc = opal_get_msi_32(phb->opal_id, pe->mve_number, xive_num, 1,
2881                                     &addr32, &data);
2882                if (rc) {
2883                        pr_warn("%s: OPAL error %d getting 32-bit MSI data\n",
2884                                pci_name(dev), rc);
2885                        return -EIO;
2886                }
2887                msg->address_hi = 0;
2888                msg->address_lo = be32_to_cpu(addr32);
2889        }
2890        msg->data = be32_to_cpu(data);
2891
2892        pnv_set_msi_irq_chip(phb, virq);
2893
2894        pr_devel("%s: %s-bit MSI on hwirq %x (xive #%d),"
2895                 " address=%x_%08x data=%x PE# %x\n",
2896                 pci_name(dev), is_64 ? "64" : "32", hwirq, xive_num,
2897                 msg->address_hi, msg->address_lo, msg->data, pe->pe_number);
2898
2899        return 0;
2900}
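
/*
 * For illustration (hypothetical OPAL return values): a 64-bit capable
 * device for which OPAL returns addr64 = 0x0003fe0000000000 and
 * data = 0x42 ends up with msg->address_hi = 0x0003fe00,
 * msg->address_lo = 0x0 and msg->data = 0x42; writing that data word to
 * that address is what signals the MSI.
 */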
2901
2902static void pnv_pci_init_ioda_msis(struct pnv_phb *phb)
2903{
2904        unsigned int count;
2905        const __be32 *prop = of_get_property(phb->hose->dn,
2906                                             "ibm,opal-msi-ranges", NULL);
2907        if (!prop) {
2908                /* BML Fallback */
2909                prop = of_get_property(phb->hose->dn, "msi-ranges", NULL);
2910        }
2911        if (!prop)
2912                return;
2913
2914        phb->msi_base = be32_to_cpup(prop);
2915        count = be32_to_cpup(prop + 1);
2916        if (msi_bitmap_alloc(&phb->msi_bmp, count, phb->hose->dn)) {
2917                pr_err("PCI %d: Failed to allocate MSI bitmap !\n",
2918                       phb->hose->global_number);
2919                return;
2920        }
2921
2922        phb->msi_setup = pnv_pci_ioda_msi_setup;
2923        phb->msi32_support = 1;
2924        pr_info("  Allocated bitmap for %d MSIs (base IRQ 0x%x)\n",
2925                count, phb->msi_base);
2926}
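
/*
 * For illustration, a hypothetical "ibm,opal-msi-ranges" value of
 * <0x1f00 0x100> would give msi_base = 0x1f00 and count = 256, i.e. a
 * bitmap tracking hardware IRQs 0x1f00..0x1fff for MSI allocation.
 */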
2927
2928#ifdef CONFIG_PCI_IOV
2929static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
2930{
2931        struct pci_controller *hose = pci_bus_to_host(pdev->bus);
2932        struct pnv_phb *phb = hose->private_data;
2933        const resource_size_t gate = phb->ioda.m64_segsize >> 2;
2934        struct resource *res;
2935        int i;
2936        resource_size_t size, total_vf_bar_sz;
2937        struct pci_dn *pdn;
2938        int mul, total_vfs;
2939
2940        if (!pdev->is_physfn || pci_dev_is_added(pdev))
2941                return;
2942
2943        pdn = pci_get_pdn(pdev);
2944        pdn->vfs_expanded = 0;
2945        pdn->m64_single_mode = false;
2946
2947        total_vfs = pci_sriov_get_totalvfs(pdev);
2948        mul = phb->ioda.total_pe_num;
2949        total_vf_bar_sz = 0;
2950
2951        for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
2952                res = &pdev->resource[i + PCI_IOV_RESOURCES];
2953                if (!res->flags || res->parent)
2954                        continue;
2955                if (!pnv_pci_is_m64_flags(res->flags)) {
2956                        dev_warn(&pdev->dev,
2957                                 "SR-IOV is not supported with non-M64 VF BAR%d: %pR\n",
2958                                 i, res);
2959                        goto truncate_iov;
2960                }
2961
2962                total_vf_bar_sz += pci_iov_resource_size(pdev,
2963                                i + PCI_IOV_RESOURCES);
2964
2965                /*
2966                 * If the total is bigger than a quarter of the M64 segment
2967                 * size, just round up to a power of two.
2968                 *
2969                 * Generally, one M64 BAR maps one IOV BAR. To avoid conflicts
2970                 * with other devices, the IOV BAR size is expanded to
2971                 * (total_pe * VF_BAR_size). When VF_BAR_size is half of the
2972                 * M64 segment size, the expanded size would equal half of the
2973                 * whole M64 space, which would exhaust the M64 space and
2974                 * limit system flexibility. Hence the design decision to set
2975                 * the boundary at a quarter of the M64 segment size.
2976                 */
2977                if (total_vf_bar_sz > gate) {
2978                        mul = roundup_pow_of_two(total_vfs);
2979                        dev_info(&pdev->dev,
2980                                "Total VF BAR size %llx > gate %llx, rounding up to %d VFs\n",
2981                                total_vf_bar_sz, gate, mul);
2982                        pdn->m64_single_mode = true;
2983                        break;
2984                }
2985        }
2986
2987        for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
2988                res = &pdev->resource[i + PCI_IOV_RESOURCES];
2989                if (!res->flags || res->parent)
2990                        continue;
2991
2992                size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES);
2993                /*
2994                 * On PHB3, the minimum size alignment of M64 BAR in single
2995                 * mode is 32MB.
2996                 */
2997                if (pdn->m64_single_mode && (size < SZ_32M))
2998                        goto truncate_iov;
2999                dev_dbg(&pdev->dev, " Fixing VF BAR%d: %pR to\n", i, res);
3000                res->end = res->start + size * mul - 1;
3001                dev_dbg(&pdev->dev, "                       %pR\n", res);
3002                dev_info(&pdev->dev, "VF BAR%d: %pR (expanded to %d VFs for PE alignment)\n",
3003                         i, res, mul);
3004        }
3005        pdn->vfs_expanded = mul;
3006
3007        return;
3008
3009truncate_iov:
3010        /* To save MMIO space, the IOV BARs are truncated. */
3011        for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
3012                res = &pdev->resource[i + PCI_IOV_RESOURCES];
3013                res->flags = 0;
3014                res->end = res->start - 1;
3015        }
3016}
3017#endif /* CONFIG_PCI_IOV */
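
/*
 * A worked example of the expansion above (hypothetical numbers): with
 * total_pe_num = 256 and a 1MB per-VF BAR, the IOV BAR is expanded to
 * 256 * 1MB = 256MB, so the M64 BAR that maps it splits into 256
 * segments of 1MB, one VF (and hence one PE) per segment. If the summed
 * per-VF BAR sizes exceeded the gate (a quarter of m64_segsize), the
 * code would instead switch to single PE mode with the multiplier
 * rounded up to roundup_pow_of_two(total_vfs).
 */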
3018
3019static void pnv_ioda_setup_pe_res(struct pnv_ioda_pe *pe,
3020                                  struct resource *res)
3021{
3022        struct pnv_phb *phb = pe->phb;
3023        struct pci_bus_region region;
3024        int index;
3025        int64_t rc;
3026
3027        if (!res || !res->flags || res->start > res->end)
3028                return;
3029
3030        if (res->flags & IORESOURCE_IO) {
3031                region.start = res->start - phb->ioda.io_pci_base;
3032                region.end   = res->end - phb->ioda.io_pci_base;
3033                index = region.start / phb->ioda.io_segsize;
3034
3035                while (index < phb->ioda.total_pe_num &&
3036                       region.start <= region.end) {
3037                        phb->ioda.io_segmap[index] = pe->pe_number;
3038                        rc = opal_pci_map_pe_mmio_window(phb->opal_id,
3039                                pe->pe_number, OPAL_IO_WINDOW_TYPE, 0, index);
3040                        if (rc != OPAL_SUCCESS) {
3041                                pr_err("%s: Error %lld mapping IO segment#%d to PE#%x\n",
3042                                       __func__, rc, index, pe->pe_number);
3043                                break;
3044                        }
3045
3046                        region.start += phb->ioda.io_segsize;
3047                        index++;
3048                }
3049        } else if ((res->flags & IORESOURCE_MEM) &&
3050                   !pnv_pci_is_m64(phb, res)) {
3051                region.start = res->start -
3052                               phb->hose->mem_offset[0] -
3053                               phb->ioda.m32_pci_base;
3054                region.end   = res->end -
3055                               phb->hose->mem_offset[0] -
3056                               phb->ioda.m32_pci_base;
3057                index = region.start / phb->ioda.m32_segsize;
3058
3059                while (index < phb->ioda.total_pe_num &&
3060                       region.start <= region.end) {
3061                        phb->ioda.m32_segmap[index] = pe->pe_number;
3062                        rc = opal_pci_map_pe_mmio_window(phb->opal_id,
3063                                pe->pe_number, OPAL_M32_WINDOW_TYPE, 0, index);
3064                        if (rc != OPAL_SUCCESS) {
3065                                pr_err("%s: Error %lld mapping M32 segment#%d to PE#%x\n",
3066                                       __func__, rc, index, pe->pe_number);
3067                                break;
3068                        }
3069
3070                        region.start += phb->ioda.m32_segsize;
3071                        index++;
3072                }
3073        }
3074}
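
/*
 * A worked example of the segment mapping above (hypothetical numbers):
 * with m32_segsize = 32MB and a PE resource covering 128MB starting at
 * offset 0x40000000 from m32_pci_base, the loop starts at
 * index = 0x40000000 / 0x2000000 = 32 and maps M32 segments 32..35 to
 * the PE, one opal_pci_map_pe_mmio_window() call per segment.
 */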
3075
3076/*
3077 * This function is supposed to be called on the PEs from top
3078 * to bottom, so that the I/O or MMIO segments assigned to a
3079 * parent PE can be overridden by its child PEs if necessary.
3080 */
3081static void pnv_ioda_setup_pe_seg(struct pnv_ioda_pe *pe)
3082{
3083        struct pci_dev *pdev;
3084        int i;
3085
3086        /*
3087         * NOTE: We only handle PCI-bus-based PEs for now. PCI-
3088         * device-based PEs, for example for SR-IOV VFs, still
3089         * need to be figured out later.
3090         */
3091        BUG_ON(!(pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)));
3092
3093        list_for_each_entry(pdev, &pe->pbus->devices, bus_list) {
3094                for (i = 0; i <= PCI_ROM_RESOURCE; i++)
3095                        pnv_ioda_setup_pe_res(pe, &pdev->resource[i]);
3096
3097                /*
3098                 * If the PE contains all subordinate PCI buses, the
3099                 * windows of the child bridges should be mapped to
3100                 * the PE as well.
3101                 */
3102                if (!(pe->flags & PNV_IODA_PE_BUS_ALL) || !pci_is_bridge(pdev))
3103                        continue;
3104                for (i = 0; i < PCI_BRIDGE_RESOURCE_NUM; i++)
3105                        pnv_ioda_setup_pe_res(pe,
3106                                &pdev->resource[PCI_BRIDGE_RESOURCES + i]);
3107        }
3108}
3109
3110#ifdef CONFIG_DEBUG_FS
3111static int pnv_pci_diag_data_set(void *data, u64 val)
3112{
3113        struct pnv_phb *phb = data;
3114        s64 ret;
3115
3116        /* Retrieve the diag data from firmware */
3117        ret = opal_pci_get_phb_diag_data2(phb->opal_id, phb->diag_data,
3118                                          phb->diag_data_size);
3119        if (ret != OPAL_SUCCESS)
3120                return -EIO;
3121
3122        /* Print the diag data to the kernel log */
3123        pnv_pci_dump_phb_diag_data(phb->hose, phb->diag_data);
3124        return 0;
3125}
3126
3127DEFINE_DEBUGFS_ATTRIBUTE(pnv_pci_diag_data_fops, NULL, pnv_pci_diag_data_set,
3128                         "%llu\n");
3129
3130static int pnv_pci_ioda_pe_dump(void *data, u64 val)
3131{
3132        struct pnv_phb *phb = data;
3133        int pe_num;
3134
3135        for (pe_num = 0; pe_num < phb->ioda.total_pe_num; pe_num++) {
3136                struct pnv_ioda_pe *pe = &phb->ioda.pe_array[pe_num];
3137
3138                if (!test_bit(pe_num, phb->ioda.pe_alloc))
3139                        continue;
3140
3141                pe_warn(pe, "rid: %04x dev count: %2d flags: %s%s%s%s%s%s\n",
3142                        pe->rid, pe->device_count,
3143                        (pe->flags & PNV_IODA_PE_DEV) ? "dev " : "",
3144                        (pe->flags & PNV_IODA_PE_BUS) ? "bus " : "",
3145                        (pe->flags & PNV_IODA_PE_BUS_ALL) ? "all " : "",
3146                        (pe->flags & PNV_IODA_PE_MASTER) ? "master " : "",
3147                        (pe->flags & PNV_IODA_PE_SLAVE) ? "slave " : "",
3148                        (pe->flags & PNV_IODA_PE_VF) ? "vf " : "");
3149        }
3150
3151        return 0;
3152}
3153
3154DEFINE_DEBUGFS_ATTRIBUTE(pnv_pci_ioda_pe_dump_fops, NULL,
3155                         pnv_pci_ioda_pe_dump, "%llu\n");
3156
3157#endif /* CONFIG_DEBUG_FS */
3158
3159static void pnv_pci_ioda_create_dbgfs(void)
3160{
3161#ifdef CONFIG_DEBUG_FS
3162        struct pci_controller *hose, *tmp;
3163        struct pnv_phb *phb;
3164        char name[16];
3165
3166        list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
3167                phb = hose->private_data;
3168
3169                /* Mark PHB initialization as complete */
3170                phb->initialized = 1;
3171
3172                sprintf(name, "PCI%04x", hose->global_number);
3173                phb->dbgfs = debugfs_create_dir(name, powerpc_debugfs_root);
3174                if (!phb->dbgfs) {
3175                        pr_warn("%s: Error on creating debugfs on PHB#%x\n",
3176                                __func__, hose->global_number);
3177                        continue;
3178                }
3179
3180                debugfs_create_file_unsafe("dump_diag_regs", 0200, phb->dbgfs,
3181                                           phb, &pnv_pci_diag_data_fops);
3182                debugfs_create_file_unsafe("dump_ioda_pe_state", 0200, phb->dbgfs,
3183                                           phb, &pnv_pci_ioda_pe_dump_fops);
3184        }
3185#endif /* CONFIG_DEBUG_FS */
3186}
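
/*
 * With debugfs mounted, the write-only files created above can be poked
 * to dump state into the kernel log, e.g. for a hypothetical PHB 0:
 *
 *	echo 1 > /sys/kernel/debug/powerpc/PCI0000/dump_diag_regs
 *	echo 1 > /sys/kernel/debug/powerpc/PCI0000/dump_ioda_pe_state
 */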
3187
3188static void pnv_pci_enable_bridge(struct pci_bus *bus)
3189{
3190        struct pci_dev *dev = bus->self;
3191        struct pci_bus *child;
3192
3193        /* Empty bus ? bail */
3194        if (list_empty(&bus->devices))
3195                return;
3196
3197        /*
3198         * If there's a bridge associated with that bus enable it. This works
3199         * around races in the generic code if the enabling is done during
3200         * parallel probing. This can be removed once those races have been
3201         * fixed.
3202         */
3203        if (dev) {
3204                int rc = pci_enable_device(dev);
3205                if (rc)
3206                        pci_err(dev, "Error enabling bridge (%d)\n", rc);
3207                pci_set_master(dev);
3208        }
3209
3210        /* Do the same for the child buses */
3211        list_for_each_entry(child, &bus->children, node)
3212                pnv_pci_enable_bridge(child);
3213}
3214
3215static void pnv_pci_enable_bridges(void)
3216{
3217        struct pci_controller *hose;
3218
3219        list_for_each_entry(hose, &hose_list, list_node)
3220                pnv_pci_enable_bridge(hose->bus);
3221}
3222
3223static void pnv_pci_ioda_fixup(void)
3224{
3225        pnv_pci_ioda_setup_PEs();
3226        pnv_pci_ioda_setup_iommu_api();
3227        pnv_pci_ioda_create_dbgfs();
3228
3229        pnv_pci_enable_bridges();
3230
3231#ifdef CONFIG_EEH
3232        pnv_eeh_post_init();
3233#endif
3234}
3235
3236/*
3237 * Returns the alignment for I/O or memory windows for P2P
3238 * bridges. That actually depends on how PEs are segmented.
3239 * For now, we return I/O or M32 segment size for PE sensitive
3240 * P2P bridges. Otherwise, the default values (4KiB for I/O,
3241 * 1MiB for memory) will be returned.
3242 *
3243 * The current PCI bus might be put into one PE, which was
3244 * created against the parent PCI bridge. In that case, we
3245 * needn't enlarge the alignment, which saves some
3246 * resources.
3247 */
3248static resource_size_t pnv_pci_window_alignment(struct pci_bus *bus,
3249                                                unsigned long type)
3250{
3251        struct pci_dev *bridge;
3252        struct pci_controller *hose = pci_bus_to_host(bus);
3253        struct pnv_phb *phb = hose->private_data;
3254        int num_pci_bridges = 0;
3255
3256        bridge = bus->self;
3257        while (bridge) {
3258                if (pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE) {
3259                        num_pci_bridges++;
3260                        if (num_pci_bridges >= 2)
3261                                return 1;
3262                }
3263
3264                bridge = bridge->bus->self;
3265        }
3266
3267        /*
3268         * We fall back to M32 if M64 isn't supported. We enforce the M64
3269         * alignment for any 64-bit resource, PCIe doesn't care and
3270         * bridges only do 64-bit prefetchable anyway.
3271         */
3272        if (phb->ioda.m64_segsize && pnv_pci_is_m64_flags(type))
3273                return phb->ioda.m64_segsize;
3274        if (type & IORESOURCE_MEM)
3275                return phb->ioda.m32_segsize;
3276
3277        return phb->ioda.io_segsize;
3278}
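
/*
 * For illustration (hypothetical segment sizes): on a PHB with
 * m64_segsize = 256MB and m32_segsize = 32MB, a 64-bit prefetchable
 * bridge window is aligned to 256MB, a plain 32-bit MMIO window to
 * 32MB, and an I/O window to io_segsize, so each window starts on a PE
 * segment boundary.
 */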
3279
3280/*
3281 * We update the root port, or the upstream port of the
3282 * bridge behind the root port, with the PHB's windows in
3283 * order to accommodate resource changes during PCI (slot)
3284 * hotplug. The hotplug slot is connected to either the root
3285 * port or a downstream port of the PCIe switch behind the
3286 * root port.
3287 */
3288static void pnv_pci_fixup_bridge_resources(struct pci_bus *bus,
3289                                           unsigned long type)
3290{
3291        struct pci_controller *hose = pci_bus_to_host(bus);
3292        struct pnv_phb *phb = hose->private_data;
3293        struct pci_dev *bridge = bus->self;
3294        struct resource *r, *w;
3295        bool msi_region = false;
3296        int i;
3297
3298        /* Check if we need to apply the fixup to the bridge's windows */
3299        if (!pci_is_root_bus(bridge->bus) &&
3300            !pci_is_root_bus(bridge->bus->self->bus))
3301                return;
3302
3303        /* Fixup the resources */
3304        for (i = 0; i < PCI_BRIDGE_RESOURCE_NUM; i++) {
3305                r = &bridge->resource[PCI_BRIDGE_RESOURCES + i];
3306                if (!r->flags || !r->parent)
3307                        continue;
3308
3309                w = NULL;
3310                if (r->flags & type & IORESOURCE_IO)
3311                        w = &hose->io_resource;
3312                else if (pnv_pci_is_m64(phb, r) &&
3313                         (type & IORESOURCE_PREFETCH) &&
3314                         phb->ioda.m64_segsize)
3315                        w = &hose->mem_resources[1];
3316                else if (r->flags & type & IORESOURCE_MEM) {
3317                        w = &hose->mem_resources[0];
3318                        msi_region = true;
3319                }
3320
3321                r->start = w->start;
3322                r->end = w->end;
3323
3324                /* The 64KB 32-bit MSI region shouldn't be included in
3325                 * the 32-bit bridge window. Otherwise, we can see strange
3326                 * issues such as the EEH error observed on Garrison.
3327                 *
3328                 * Exclude the top 1MB region, which is the minimal
3329                 * alignment of the 32-bit bridge window.
3330                 */
3331                if (msi_region) {
3332                        r->end += 0x10000;
3333                        r->end -= 0x100000;
3334                }
3335        }
3336}
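
/*
 * A worked example of the MSI-region exclusion above (hypothetical
 * window): if the host M32 resource ends at 0xbffeffff (a 1GB window
 * whose top 64KB was already carved out by firmware), the fixup first
 * adds the 64KB back (r->end = 0xbfffffff) and then drops the top 1MB,
 * leaving the bridge window ending at 0xbfefffff, clear of the 32-bit
 * MSI region.
 */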
3337
3338static void pnv_pci_setup_bridge(struct pci_bus *bus, unsigned long type)
3339{
3340        struct pci_controller *hose = pci_bus_to_host(bus);
3341        struct pnv_phb *phb = hose->private_data;
3342        struct pci_dev *bridge = bus->self;
3343        struct pnv_ioda_pe *pe;
3344        bool all = (pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE);
3345
3346        /* Extend bridge's windows if necessary */
3347        pnv_pci_fixup_bridge_resources(bus, type);
3348
3349        /* The PE for the root bus should be realized before any others */
3350        if (!phb->ioda.root_pe_populated) {
3351                pe = pnv_ioda_setup_bus_PE(phb->hose->bus, false);
3352                if (pe) {
3353                        phb->ioda.root_pe_idx = pe->pe_number;
3354                        phb->ioda.root_pe_populated = true;
3355                }
3356        }
3357
3358        /* Don't assign a PE to a PCI bus that has no subordinate devices */
3359        if (list_empty(&bus->devices))
3360                return;
3361
3362        /* Reserve PEs according to used M64 resources */
3363        if (phb->reserve_m64_pe)
3364                phb->reserve_m64_pe(bus, NULL, all);
3365
3366        /*
3367         * Assign PE. We might run here because of partial hotplug.
3368         * For the case, we just pick up the existing PE and should
3369         * not allocate resources again.
3370         */
3371        pe = pnv_ioda_setup_bus_PE(bus, all);
3372        if (!pe)
3373                return;
3374
3375        pnv_ioda_setup_pe_seg(pe);
3376        switch (phb->type) {
3377        case PNV_PHB_IODA1:
3378                pnv_pci_ioda1_setup_dma_pe(phb, pe);
3379                break;
3380        case PNV_PHB_IODA2:
3381                pnv_pci_ioda2_setup_dma_pe(phb, pe);
3382                break;
3383        default:
3384                pr_warn("%s: No DMA for PHB#%x (type %d)\n",
3385                        __func__, phb->hose->global_number, phb->type);
3386        }
3387}
3388
3389static resource_size_t pnv_pci_default_alignment(void)
3390{
3391        return PAGE_SIZE;
3392}
3393
3394#ifdef CONFIG_PCI_IOV
3395static resource_size_t pnv_pci_iov_resource_alignment(struct pci_dev *pdev,
3396                                                      int resno)
3397{
3398        struct pci_controller *hose = pci_bus_to_host(pdev->bus);
3399        struct pnv_phb *phb = hose->private_data;
3400        struct pci_dn *pdn = pci_get_pdn(pdev);
3401        resource_size_t align;
3402
3403        /*
3404         * On the PowerNV platform, the IOV BAR is mapped by an M64 BAR to
3405         * enable SR-IOV, and from the hardware perspective the range mapped
3406         * by an M64 BAR must be size-aligned.
3407         *
3408         * When the IOV BAR is mapped with an M64 BAR in Single PE mode, that
3409         * extra powernv-specific hardware restriction is gone. But if we just
3410         * used the VF BAR size as the alignment, the PF BAR and a VF BAR could
3411         * be allocated within one segment of M64 #15, which would introduce a
3412         * PE conflict between the PF and a VF. Hence the minimum alignment of
3413         * an IOV BAR is m64_segsize.
3414         *
3415         * This function returns the total IOV BAR size if the M64 BAR is in
3416         * Shared PE mode, or just the VF BAR size if not.
3417         * If the M64 BAR is in Single PE mode, it returns the VF BAR size,
3418         * or the M64 segment size if the IOV BAR size is smaller.
3419         */
3420        align = pci_iov_resource_size(pdev, resno);
3421        if (!pdn->vfs_expanded)
3422                return align;
3423        if (pdn->m64_single_mode)
3424                return max(align, (resource_size_t)phb->ioda.m64_segsize);
3425
3426        return pdn->vfs_expanded * align;
3427}
3428#endif /* CONFIG_PCI_IOV */
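
/*
 * For illustration (hypothetical numbers): with a 1MB per-VF BAR and
 * vfs_expanded = 256, shared PE mode returns 256 * 1MB = 256MB (the
 * whole expanded IOV BAR). In single PE mode with m64_segsize = 256MB,
 * it returns max(1MB, 256MB) = 256MB, so the PF BAR and VF BARs can
 * never land in the same segment of M64 #15.
 */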
3429
3430/* Prevent enabling devices for which we couldn't properly
3431 * assign a PE
3432 */
3433bool pnv_pci_enable_device_hook(struct pci_dev *dev)
3434{
3435        struct pci_controller *hose = pci_bus_to_host(dev->bus);
3436        struct pnv_phb *phb = hose->private_data;
3437        struct pci_dn *pdn;
3438
3439        /* The function is probably called while the PEs have
3440         * not been created yet, for example during resource
3441         * reassignment in the PCI probe period. We just skip
3442         * the check if the PEs aren't ready.
3443         */
3444        if (!phb->initialized)
3445                return true;
3446
3447        pdn = pci_get_pdn(dev);
3448        if (!pdn || pdn->pe_number == IODA_INVALID_PE) {
3449                pci_err(dev, "pci_enable_device() blocked, no PE assigned.\n");
3450                return false;
3451        }
3452
3453        return true;
3454}
3455
3456static bool pnv_ocapi_enable_device_hook(struct pci_dev *dev)
3457{
3458        struct pci_controller *hose = pci_bus_to_host(dev->bus);
3459        struct pnv_phb *phb = hose->private_data;
3460        struct pci_dn *pdn;
3461        struct pnv_ioda_pe *pe;
3462
3463        if (!phb->initialized)
3464                return true;
3465
3466        pdn = pci_get_pdn(dev);
3467        if (!pdn)
3468                return false;
3469
3470        if (pdn->pe_number == IODA_INVALID_PE) {
3471                pe = pnv_ioda_setup_dev_PE(dev);
3472                if (!pe)
3473                        return false;
3474        }
3475        return true;
3476}
3477
3478static long pnv_pci_ioda1_unset_window(struct iommu_table_group *table_group,
3479                                       int num)
3480{
3481        struct pnv_ioda_pe *pe = container_of(table_group,
3482                                              struct pnv_ioda_pe, table_group);
3483        struct pnv_phb *phb = pe->phb;
3484        unsigned int idx;
3485        long rc;
3486
3487        pe_info(pe, "Removing DMA window #%d\n", num);
3488        for (idx = 0; idx < phb->ioda.dma32_count; idx++) {
3489                if (phb->ioda.dma32_segmap[idx] != pe->pe_number)
3490                        continue;
3491
3492                rc = opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number,
3493                                                idx, 0, 0ul, 0ul, 0ul);
3494                if (rc != OPAL_SUCCESS) {
3495                        pe_warn(pe, "Failure %ld unmapping DMA32 segment#%d\n",
3496                                rc, idx);
3497                        return rc;
3498                }
3499
3500                phb->ioda.dma32_segmap[idx] = IODA_INVALID_PE;
3501        }
3502
3503        pnv_pci_unlink_table_and_group(table_group->tables[num], table_group);
3504        return OPAL_SUCCESS;
3505}
3506
3507static void pnv_pci_ioda1_release_pe_dma(struct pnv_ioda_pe *pe)
3508{
3509        unsigned int weight = pnv_pci_ioda_pe_dma_weight(pe);
3510        struct iommu_table *tbl = pe->table_group.tables[0];
3511        int64_t rc;
3512
3513        if (!weight)
3514                return;
3515
3516        rc = pnv_pci_ioda1_unset_window(&pe->table_group, 0);
3517        if (rc != OPAL_SUCCESS)
3518                return;
3519
3520        pnv_pci_p7ioc_tce_invalidate(tbl, tbl->it_offset, tbl->it_size, false);
3521        if (pe->table_group.group) {
3522                iommu_group_put(pe->table_group.group);
3523                WARN_ON(pe->table_group.group);
3524        }
3525
3526        free_pages(tbl->it_base, get_order(tbl->it_size << 3));
3527        iommu_tce_table_put(tbl);
3528}
3529
3530static void pnv_pci_ioda2_release_pe_dma(struct pnv_ioda_pe *pe)
3531{
3532        struct iommu_table *tbl = pe->table_group.tables[0];
3533        unsigned int weight = pnv_pci_ioda_pe_dma_weight(pe);
3534#ifdef CONFIG_IOMMU_API
3535        int64_t rc;
3536#endif
3537
3538        if (!weight)
3539                return;
3540
3541#ifdef CONFIG_IOMMU_API
3542        rc = pnv_pci_ioda2_unset_window(&pe->table_group, 0);
3543        if (rc)
3544                pe_warn(pe, "OPAL error %ld releasing DMA window\n", rc);
3545#endif
3546
3547        pnv_pci_ioda2_set_bypass(pe, false);
3548        if (pe->table_group.group) {
3549                iommu_group_put(pe->table_group.group);
3550                WARN_ON(pe->table_group.group);
3551        }
3552
3553        iommu_tce_table_put(tbl);
3554}
3555
3556static void pnv_ioda_free_pe_seg(struct pnv_ioda_pe *pe,
3557                                 unsigned short win,
3558                                 unsigned int *map)
3559{
3560        struct pnv_phb *phb = pe->phb;
3561        int idx;
3562        int64_t rc;
3563
3564        for (idx = 0; idx < phb->ioda.total_pe_num; idx++) {
3565                if (map[idx] != pe->pe_number)
3566                        continue;
3567
3568                if (win == OPAL_M64_WINDOW_TYPE)
3569                        rc = opal_pci_map_pe_mmio_window(phb->opal_id,
3570                                        phb->ioda.reserved_pe_idx, win,
3571                                        idx / PNV_IODA1_M64_SEGS,
3572                                        idx % PNV_IODA1_M64_SEGS);
3573                else
3574                        rc = opal_pci_map_pe_mmio_window(phb->opal_id,
3575                                        phb->ioda.reserved_pe_idx, win, 0, idx);
3576
3577                if (rc != OPAL_SUCCESS)
3578                        pe_warn(pe, "Error %ld unmapping (%d) segment#%d\n",
3579                                rc, win, idx);
3580
3581                map[idx] = IODA_INVALID_PE;
3582        }
3583}
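
/*
 * A worked example of the M64 decomposition above: IODA1 has
 * PNV_IODA1_M64_NUM BARs of PNV_IODA1_M64_SEGS segments each, so a flat
 * segment index of, say, 13 is passed to opal_pci_map_pe_mmio_window()
 * as BAR 13 / 8 = 1, segment 13 % 8 = 5.
 */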
3584
3585static void pnv_ioda_release_pe_seg(struct pnv_ioda_pe *pe)
3586{
3587        struct pnv_phb *phb = pe->phb;
3588
3589        if (phb->type == PNV_PHB_IODA1) {
3590                pnv_ioda_free_pe_seg(pe, OPAL_IO_WINDOW_TYPE,
3591                                     phb->ioda.io_segmap);
3592                pnv_ioda_free_pe_seg(pe, OPAL_M32_WINDOW_TYPE,
3593                                     phb->ioda.m32_segmap);
3594                pnv_ioda_free_pe_seg(pe, OPAL_M64_WINDOW_TYPE,
3595                                     phb->ioda.m64_segmap);
3596        } else if (phb->type == PNV_PHB_IODA2) {
3597                pnv_ioda_free_pe_seg(pe, OPAL_M32_WINDOW_TYPE,
3598                                     phb->ioda.m32_segmap);
3599        }
3600}
3601
3602static void pnv_ioda_release_pe(struct pnv_ioda_pe *pe)
3603{
3604        struct pnv_phb *phb = pe->phb;
3605        struct pnv_ioda_pe *slave, *tmp;
3606
3607        mutex_lock(&phb->ioda.pe_list_mutex);
3608        list_del(&pe->list);
3609        mutex_unlock(&phb->ioda.pe_list_mutex);
3610
3611        switch (phb->type) {
3612        case PNV_PHB_IODA1:
3613                pnv_pci_ioda1_release_pe_dma(pe);
3614                break;
3615        case PNV_PHB_IODA2:
3616                pnv_pci_ioda2_release_pe_dma(pe);
3617                break;
3618        case PNV_PHB_NPU_OCAPI:
3619                break;
3620        default:
3621                WARN_ON(1);
3622        }
3623
3624        pnv_ioda_release_pe_seg(pe);
3625        pnv_ioda_deconfigure_pe(pe->phb, pe);
3626
3627        /* Release slave PEs in the compound PE */
3628        if (pe->flags & PNV_IODA_PE_MASTER) {
3629                list_for_each_entry_safe(slave, tmp, &pe->slaves, list) {
3630                        list_del(&slave->list);
3631                        pnv_ioda_free_pe(slave);
3632                }
3633        }
3634
3635        /*
3636         * The PE for the root bus can be removed because of hotplug during
3637         * EEH recovery from a fenced PHB error. We need to mark the PE dead
3638         * so that it can be populated again in the PCI hot add path. The PE
3639         * shouldn't be destroyed as it's a global reserved resource.
3640         */
3641        if (phb->ioda.root_pe_populated &&
3642            phb->ioda.root_pe_idx == pe->pe_number)
3643                phb->ioda.root_pe_populated = false;
3644        else
3645                pnv_ioda_free_pe(pe);
3646}
3647
3648static void pnv_pci_release_device(struct pci_dev *pdev)
3649{
3650        struct pci_controller *hose = pci_bus_to_host(pdev->bus);
3651        struct pnv_phb *phb = hose->private_data;
3652        struct pci_dn *pdn = pci_get_pdn(pdev);
3653        struct pnv_ioda_pe *pe;
3654
3655        if (pdev->is_virtfn)
3656                return;
3657
3658        if (!pdn || pdn->pe_number == IODA_INVALID_PE)
3659                return;
3660
3661        /*
3662         * PCI hotplug can happen as part of EEH error recovery. The @pdn
3663         * isn't removed and re-added in this scenario, so we should
3664         * set the PE number in @pdn to an invalid one. Otherwise, the PE's
3665         * device count is decreased on removing devices while failing to
3666         * be increased on adding devices. That leads to an unbalanced PE
3667         * device count and eventually breaks the normal PCI hotplug path.
3668         */
3669        pe = &phb->ioda.pe_array[pdn->pe_number];
3670        pdn->pe_number = IODA_INVALID_PE;
3671
3672        WARN_ON(--pe->device_count < 0);
3673        if (pe->device_count == 0)
3674                pnv_ioda_release_pe(pe);
3675}
3676
3677static void pnv_npu_disable_device(struct pci_dev *pdev)
3678{
3679        struct eeh_dev *edev = pci_dev_to_eeh_dev(pdev);
3680        struct eeh_pe *eehpe = edev ? edev->pe : NULL;
3681
3682        if (eehpe && eeh_ops && eeh_ops->reset)
3683                eeh_ops->reset(eehpe, EEH_RESET_HOT);
3684}
3685
3686static void pnv_pci_ioda_shutdown(struct pci_controller *hose)
3687{
3688        struct pnv_phb *phb = hose->private_data;
3689
3690        opal_pci_reset(phb->opal_id, OPAL_RESET_PCI_IODA_TABLE,
3691                       OPAL_ASSERT_RESET);
3692}
3693
3694static const struct pci_controller_ops pnv_pci_ioda_controller_ops = {
3695        .dma_dev_setup          = pnv_pci_dma_dev_setup,
3696        .dma_bus_setup          = pnv_pci_dma_bus_setup,
3697        .iommu_bypass_supported = pnv_pci_ioda_iommu_bypass_supported,
3698        .setup_msi_irqs         = pnv_setup_msi_irqs,
3699        .teardown_msi_irqs      = pnv_teardown_msi_irqs,
3700        .enable_device_hook     = pnv_pci_enable_device_hook,
3701        .release_device         = pnv_pci_release_device,
3702        .window_alignment       = pnv_pci_window_alignment,
3703        .setup_bridge           = pnv_pci_setup_bridge,
3704        .reset_secondary_bus    = pnv_pci_reset_secondary_bus,
3705        .shutdown               = pnv_pci_ioda_shutdown,
3706};
3707
3708static const struct pci_controller_ops pnv_npu_ioda_controller_ops = {
3709        .dma_dev_setup          = pnv_pci_dma_dev_setup,
3710        .setup_msi_irqs         = pnv_setup_msi_irqs,
3711        .teardown_msi_irqs      = pnv_teardown_msi_irqs,
3712        .enable_device_hook     = pnv_pci_enable_device_hook,
3713        .window_alignment       = pnv_pci_window_alignment,
3714        .reset_secondary_bus    = pnv_pci_reset_secondary_bus,
3715        .shutdown               = pnv_pci_ioda_shutdown,
3716        .disable_device         = pnv_npu_disable_device,
3717};
3718
3719static const struct pci_controller_ops pnv_npu_ocapi_ioda_controller_ops = {
3720        .enable_device_hook     = pnv_ocapi_enable_device_hook,
3721        .release_device         = pnv_pci_release_device,
3722        .window_alignment       = pnv_pci_window_alignment,
3723        .reset_secondary_bus    = pnv_pci_reset_secondary_bus,
3724        .shutdown               = pnv_pci_ioda_shutdown,
3725};
3726
3727#ifdef CONFIG_CXL_BASE
3728const struct pci_controller_ops pnv_cxl_cx4_ioda_controller_ops = {
3729        .dma_dev_setup          = pnv_pci_dma_dev_setup,
3730        .dma_bus_setup          = pnv_pci_dma_bus_setup,
3731        .iommu_bypass_supported = pnv_pci_ioda_iommu_bypass_supported,
3732#ifdef CONFIG_PCI_MSI
3733        .setup_msi_irqs         = pnv_cxl_cx4_setup_msi_irqs,
3734        .teardown_msi_irqs      = pnv_cxl_cx4_teardown_msi_irqs,
3735#endif
3736        .enable_device_hook     = pnv_cxl_enable_device_hook,
3737        .disable_device         = pnv_cxl_disable_device,
3738        .release_device         = pnv_pci_release_device,
3739        .window_alignment       = pnv_pci_window_alignment,
3740        .setup_bridge           = pnv_pci_setup_bridge,
3741        .reset_secondary_bus    = pnv_pci_reset_secondary_bus,
3742        .shutdown               = pnv_pci_ioda_shutdown,
3743};
3744#endif
3745
3746static void __init pnv_pci_init_ioda_phb(struct device_node *np,
3747                                         u64 hub_id, int ioda_type)
3748{
3749        struct pci_controller *hose;
3750        struct pnv_phb *phb;
3751        unsigned long size, m64map_off, m32map_off, pemap_off;
3752        unsigned long iomap_off = 0, dma32map_off = 0;
3753        struct resource r;
3754        const __be64 *prop64;
3755        const __be32 *prop32;
3756        int len;
3757        unsigned int segno;
3758        u64 phb_id;
3759        void *aux;
3760        long rc;
3761
3762        if (!of_device_is_available(np))
3763                return;
3764
3765        pr_info("Initializing %s PHB (%pOF)\n", pnv_phb_names[ioda_type], np);
3766
3767        prop64 = of_get_property(np, "ibm,opal-phbid", NULL);
3768        if (!prop64) {
3769                pr_err("  Missing \"ibm,opal-phbid\" property !\n");
3770                return;
3771        }
3772        phb_id = be64_to_cpup(prop64);
3773        pr_debug("  PHB-ID  : 0x%016llx\n", phb_id);
3774
3775        phb = kzalloc(sizeof(*phb), GFP_KERNEL);
3776        if (!phb)
3777                panic("%s: Failed to allocate %zu bytes\n", __func__,
3778                      sizeof(*phb));
3779
3780        /* Allocate PCI controller */
3781        phb->hose = hose = pcibios_alloc_controller(np);
3782        if (!phb->hose) {
3783                pr_err("  Can't allocate PCI controller for %pOF\n",
3784                       np);
3785                kfree(phb);
3786                return;
3787        }
3788
3789        spin_lock_init(&phb->lock);
3790        prop32 = of_get_property(np, "bus-range", &len);
3791        if (prop32 && len == 8) {
3792                hose->first_busno = be32_to_cpu(prop32[0]);
3793                hose->last_busno = be32_to_cpu(prop32[1]);
3794        } else {
3795                pr_warn("  Broken <bus-range> on %pOF\n", np);
3796                hose->first_busno = 0;
3797                hose->last_busno = 0xff;
3798        }
3799        hose->private_data = phb;
3800        phb->hub_id = hub_id;
3801        phb->opal_id = phb_id;
3802        phb->type = ioda_type;
3803        mutex_init(&phb->ioda.pe_alloc_mutex);
3804
3805        /* Detect specific models for error handling */
3806        if (of_device_is_compatible(np, "ibm,p7ioc-pciex"))
3807                phb->model = PNV_PHB_MODEL_P7IOC;
3808        else if (of_device_is_compatible(np, "ibm,power8-pciex"))
3809                phb->model = PNV_PHB_MODEL_PHB3;
3810        else if (of_device_is_compatible(np, "ibm,power8-npu-pciex"))
3811                phb->model = PNV_PHB_MODEL_NPU;
3812        else if (of_device_is_compatible(np, "ibm,power9-npu-pciex"))
3813                phb->model = PNV_PHB_MODEL_NPU2;
3814        else
3815                phb->model = PNV_PHB_MODEL_UNKNOWN;
3816
3817        /* Initialize diagnostic data buffer */
3818        prop32 = of_get_property(np, "ibm,phb-diag-data-size", NULL);
3819        if (prop32)
3820                phb->diag_data_size = be32_to_cpup(prop32);
3821        else
3822                phb->diag_data_size = PNV_PCI_DIAG_BUF_SIZE;
3823
3824        phb->diag_data = kzalloc(phb->diag_data_size, GFP_KERNEL);
3825        if (!phb->diag_data)
3826                panic("%s: Failed to allocate %u bytes\n", __func__,
3827                      phb->diag_data_size);
3828
3829        /* Parse 32-bit and IO ranges (if any) */
3830        pci_process_bridge_OF_ranges(hose, np, !hose->global_number);
3831
3832        /* Get registers */
3833        if (!of_address_to_resource(np, 0, &r)) {
3834                phb->regs_phys = r.start;
3835                phb->regs = ioremap(r.start, resource_size(&r));
3836                if (phb->regs == NULL)
3837                        pr_err("  Failed to map registers !\n");
3838        }
3839
3840        /* Initialize more IODA stuff */
3841        phb->ioda.total_pe_num = 1;
3842        prop32 = of_get_property(np, "ibm,opal-num-pes", NULL);
3843        if (prop32)
3844                phb->ioda.total_pe_num = be32_to_cpup(prop32);
3845        prop32 = of_get_property(np, "ibm,opal-reserved-pe", NULL);
3846        if (prop32)
3847                phb->ioda.reserved_pe_idx = be32_to_cpup(prop32);
3848
3849        /* Invalidate RID to PE# mapping */
3850        for (segno = 0; segno < ARRAY_SIZE(phb->ioda.pe_rmap); segno++)
3851                phb->ioda.pe_rmap[segno] = IODA_INVALID_PE;
3852
3853        /* Parse 64-bit MMIO range */
3854        pnv_ioda_parse_m64_window(phb);
3855
3856        phb->ioda.m32_size = resource_size(&hose->mem_resources[0]);
3857        /* FW has already carved the top 64k (MSI space) off the M32 window; add it back */
3858        phb->ioda.m32_size += 0x10000;
3859
3860        phb->ioda.m32_segsize = phb->ioda.m32_size / phb->ioda.total_pe_num;
3861        phb->ioda.m32_pci_base = hose->mem_resources[0].start - hose->mem_offset[0];
3862        phb->ioda.io_size = hose->pci_io_size;
3863        phb->ioda.io_segsize = phb->ioda.io_size / phb->ioda.total_pe_num;
3864        phb->ioda.io_pci_base = 0; /* XXX calculate this ? */
3865
3866        /* Calculate how many 32-bit TCE segments we have */
3867        phb->ioda.dma32_count = phb->ioda.m32_pci_base /
3868                                PNV_IODA1_DMA32_SEGSIZE;
3869
3870        /* Allocate aux data & arrays. We don't have IO ports on PHB3 */
3871        size = ALIGN(max_t(unsigned, phb->ioda.total_pe_num, 8) / 8,
3872                        sizeof(unsigned long));
3873        m64map_off = size;
3874        size += phb->ioda.total_pe_num * sizeof(phb->ioda.m64_segmap[0]);
3875        m32map_off = size;
3876        size += phb->ioda.total_pe_num * sizeof(phb->ioda.m32_segmap[0]);
3877        if (phb->type == PNV_PHB_IODA1) {
3878                iomap_off = size;
3879                size += phb->ioda.total_pe_num * sizeof(phb->ioda.io_segmap[0]);
3880                dma32map_off = size;
3881                size += phb->ioda.dma32_count *
3882                        sizeof(phb->ioda.dma32_segmap[0]);
3883        }
3884        pemap_off = size;
3885        size += phb->ioda.total_pe_num * sizeof(struct pnv_ioda_pe);
3886        aux = kzalloc(size, GFP_KERNEL);
3887        if (!aux)
3888                panic("%s: Failed to allocate %lu bytes\n", __func__, size);
3889
3890        phb->ioda.pe_alloc = aux;
3891        phb->ioda.m64_segmap = aux + m64map_off;
3892        phb->ioda.m32_segmap = aux + m32map_off;
3893        for (segno = 0; segno < phb->ioda.total_pe_num; segno++) {
3894                phb->ioda.m64_segmap[segno] = IODA_INVALID_PE;
3895                phb->ioda.m32_segmap[segno] = IODA_INVALID_PE;
3896        }
3897        if (phb->type == PNV_PHB_IODA1) {
3898                phb->ioda.io_segmap = aux + iomap_off;
3899                for (segno = 0; segno < phb->ioda.total_pe_num; segno++)
3900                        phb->ioda.io_segmap[segno] = IODA_INVALID_PE;
3901
3902                phb->ioda.dma32_segmap = aux + dma32map_off;
3903                for (segno = 0; segno < phb->ioda.dma32_count; segno++)
3904                        phb->ioda.dma32_segmap[segno] = IODA_INVALID_PE;
3905        }
3906        phb->ioda.pe_array = aux + pemap_off;
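
/*
 * The single allocation above is carved up as follows (the IODA1-only
 * pieces are absent on IODA2):
 *
 *	aux + 0            PE allocation bitmap (pe_alloc)
 *	aux + m64map_off   m64_segmap[total_pe_num]
 *	aux + m32map_off   m32_segmap[total_pe_num]
 *	aux + iomap_off    io_segmap[total_pe_num]      (IODA1 only)
 *	aux + dma32map_off dma32_segmap[dma32_count]    (IODA1 only)
 *	aux + pemap_off    pe_array[total_pe_num]
 */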
3907
3908        /*
3909         * Choose a PE number for the root bus, which shouldn't have
3910         * M64 resources consumed by its child devices. Pick
3911         * the PE number adjacent to the reserved one if possible.
3912         */
3913        pnv_ioda_reserve_pe(phb, phb->ioda.reserved_pe_idx);
3914        if (phb->ioda.reserved_pe_idx == 0) {
3915                phb->ioda.root_pe_idx = 1;
3916                pnv_ioda_reserve_pe(phb, phb->ioda.root_pe_idx);
3917        } else if (phb->ioda.reserved_pe_idx == (phb->ioda.total_pe_num - 1)) {
3918                phb->ioda.root_pe_idx = phb->ioda.reserved_pe_idx - 1;
3919                pnv_ioda_reserve_pe(phb, phb->ioda.root_pe_idx);
3920        } else {
3921                phb->ioda.root_pe_idx = IODA_INVALID_PE;
3922        }
3923
3924        INIT_LIST_HEAD(&phb->ioda.pe_list);
3925        mutex_init(&phb->ioda.pe_list_mutex);
3930
3931#if 0 /* We should really do that ... */
3932        rc = opal_pci_set_phb_mem_window(opal->phb_id,
3933                                         window_type,
3934                                         window_num,
3935                                         starting_real_address,
3936                                         starting_pci_address,
3937                                         segment_size);
3938#endif
3939
3940        pr_info("  %03d (%03d) PE's M32: 0x%x [segment=0x%x]\n",
3941                phb->ioda.total_pe_num, phb->ioda.reserved_pe_idx,
3942                phb->ioda.m32_size, phb->ioda.m32_segsize);
3943        if (phb->ioda.m64_size)
3944                pr_info("                 M64: 0x%lx [segment=0x%lx]\n",
3945                        phb->ioda.m64_size, phb->ioda.m64_segsize);
3946        if (phb->ioda.io_size)
3947                pr_info("                  IO: 0x%x [segment=0x%x]\n",
3948                        phb->ioda.io_size, phb->ioda.io_segsize);
3949
3950
3951        phb->hose->ops = &pnv_pci_ops;
3952        phb->get_pe_state = pnv_ioda_get_pe_state;
3953        phb->freeze_pe = pnv_ioda_freeze_pe;
3954        phb->unfreeze_pe = pnv_ioda_unfreeze_pe;
3955
3956        /* Setup MSI support */
3957        pnv_pci_init_ioda_msis(phb);
3958
3959        /*
3960         * We pass the PCI probe flag PCI_REASSIGN_ALL_RSRC here
3961         * to let the PCI core do resource assignment. The PCI core
3962         * is expected to apply the correct I/O and MMIO alignment
3963         * to the P2P bridge BARs so that each PCI bus (excluding
3964         * the child P2P bridges) can form an individual PE.
3965         */
3966        ppc_md.pcibios_fixup = pnv_pci_ioda_fixup;
3967
3968        switch (phb->type) {
3969        case PNV_PHB_NPU_NVLINK:
3970                hose->controller_ops = pnv_npu_ioda_controller_ops;
3971                break;
3972        case PNV_PHB_NPU_OCAPI:
3973                hose->controller_ops = pnv_npu_ocapi_ioda_controller_ops;
3974                break;
3975        default:
3976                phb->dma_dev_setup = pnv_pci_ioda_dma_dev_setup;
3977                hose->controller_ops = pnv_pci_ioda_controller_ops;
3978        }
3979
3980        ppc_md.pcibios_default_alignment = pnv_pci_default_alignment;
3981
3982#ifdef CONFIG_PCI_IOV
3983        ppc_md.pcibios_fixup_sriov = pnv_pci_ioda_fixup_iov_resources;
3984        ppc_md.pcibios_iov_resource_alignment = pnv_pci_iov_resource_alignment;
3985        ppc_md.pcibios_sriov_enable = pnv_pcibios_sriov_enable;
3986        ppc_md.pcibios_sriov_disable = pnv_pcibios_sriov_disable;
3987#endif
3988
3989        pci_add_flags(PCI_REASSIGN_ALL_RSRC);
3990
3991        /* Reset IODA tables to a clean state */
3992        rc = opal_pci_reset(phb_id, OPAL_RESET_PCI_IODA_TABLE, OPAL_ASSERT_RESET);
3993        if (rc)
3994                pr_warn("  OPAL Error %ld performing IODA table reset !\n", rc);
3995
3996        /*
3997         * If we're running in a kdump kernel, the previous kernel never
3998         * shut down PCI devices correctly, and the IODA table has just
3999         * been cleaned out. So we have to issue a PHB reset to stop all
4000         * PCI transactions from the previous kernel. The ppc_pci_reset_phbs
4001         * kernel parameter will force this reset too. Additionally,
4002         * if the IODA reset above failed, use a bigger hammer.
4003         * This can happen if we get a PHB fatal error in very early
4004         * boot.
4005         */
4006        if (is_kdump_kernel() || pci_reset_phbs || rc) {
4007                pr_info("  Issue PHB reset ...\n");
4008                pnv_eeh_phb_reset(hose, EEH_RESET_FUNDAMENTAL);
4009                pnv_eeh_phb_reset(hose, EEH_RESET_DEACTIVATE);
4010        }
4011
4012        /* Remove M64 resource if we can't configure it successfully */
4013        if (!phb->init_m64 || phb->init_m64(phb))
4014                hose->mem_resources[1].flags = 0;
4015
4016        /* create pci_dn's for DT nodes under this PHB */
4017        pci_devs_phb_init_dynamic(hose);
4018}
4019
4020void __init pnv_pci_init_ioda2_phb(struct device_node *np)
4021{
4022        pnv_pci_init_ioda_phb(np, 0, PNV_PHB_IODA2);
4023}
4024
4025void __init pnv_pci_init_npu_phb(struct device_node *np)
4026{
4027        pnv_pci_init_ioda_phb(np, 0, PNV_PHB_NPU_NVLINK);
4028}
4029
4030void __init pnv_pci_init_npu2_opencapi_phb(struct device_node *np)
4031{
4032        pnv_pci_init_ioda_phb(np, 0, PNV_PHB_NPU_OCAPI);
4033}
4034
4035static void pnv_npu2_opencapi_cfg_size_fixup(struct pci_dev *dev)
4036{
4037        struct pci_controller *hose = pci_bus_to_host(dev->bus);
4038        struct pnv_phb *phb = hose->private_data;
4039
4040        if (!machine_is(powernv))
4041                return;
4042
4043        if (phb->type == PNV_PHB_NPU_OCAPI)
4044                dev->cfg_size = PCI_CFG_SPACE_EXP_SIZE;
4045}
4046DECLARE_PCI_FIXUP_EARLY(PCI_ANY_ID, PCI_ANY_ID, pnv_npu2_opencapi_cfg_size_fixup);
4047
4048void __init pnv_pci_init_ioda_hub(struct device_node *np)
4049{
4050        struct device_node *phbn;
4051        const __be64 *prop64;
4052        u64 hub_id;
4053
4054        pr_info("Probing IODA IO-Hub %pOF\n", np);
4055
4056        prop64 = of_get_property(np, "ibm,opal-hubid", NULL);
4057        if (!prop64) {
4058                pr_err(" Missing \"ibm,opal-hubid\" property !\n");
4059                return;
4060        }
4061        hub_id = be64_to_cpup(prop64);
4062        pr_devel(" HUB-ID : 0x%016llx\n", hub_id);
4063
4064        /* Count child PHBs */
4065        for_each_child_of_node(np, phbn) {
4066                /* Look for IODA1 PHBs */
4067                if (of_device_is_compatible(phbn, "ibm,ioda-phb"))
4068                        pnv_pci_init_ioda_phb(phbn, hub_id, PNV_PHB_IODA1);
4069        }
4070}
4071