linux/arch/powerpc/platforms/powernv/pci-ioda.c
/*
 * Support PCI/PCIe on PowerNV platforms
 *
 * Copyright 2011 Benjamin Herrenschmidt, IBM Corp.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */

#undef DEBUG

#include <linux/kernel.h>
#include <linux/pci.h>
#include <linux/crash_dump.h>
#include <linux/debugfs.h>
#include <linux/delay.h>
#include <linux/string.h>
#include <linux/init.h>
#include <linux/bootmem.h>
#include <linux/irq.h>
#include <linux/io.h>
#include <linux/msi.h>
#include <linux/memblock.h>
#include <linux/iommu.h>
#include <linux/rculist.h>
#include <linux/sizes.h>

#include <asm/sections.h>
#include <asm/io.h>
#include <asm/prom.h>
#include <asm/pci-bridge.h>
#include <asm/machdep.h>
#include <asm/msi_bitmap.h>
#include <asm/ppc-pci.h>
#include <asm/opal.h>
#include <asm/iommu.h>
#include <asm/tce.h>
#include <asm/xics.h>
#include <asm/debug.h>
#include <asm/firmware.h>
#include <asm/pnv-pci.h>
#include <asm/mmzone.h>

#include <misc/cxl-base.h>

#include "powernv.h"
#include "pci.h"

/* 256M DMA window, 4K TCE pages, 8 bytes TCE */
#define TCE32_TABLE_SIZE        ((0x10000000 / 0x1000) * 8)
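
/*
 * Worked example: a 256MB (0x10000000) window with 4KB (0x1000) TCE pages
 * needs 0x10000 (65536) TCEs; at 8 bytes each, the table occupies 512KB.
 */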

#define POWERNV_IOMMU_DEFAULT_LEVELS    1
#define POWERNV_IOMMU_MAX_LEVELS        5

static void pnv_pci_ioda2_table_free_pages(struct iommu_table *tbl);

static void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
                            const char *fmt, ...)
{
        struct va_format vaf;
        va_list args;
        char pfix[32];

        va_start(args, fmt);

        vaf.fmt = fmt;
        vaf.va = &args;

        if (pe->flags & PNV_IODA_PE_DEV)
                strlcpy(pfix, dev_name(&pe->pdev->dev), sizeof(pfix));
        else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))
                sprintf(pfix, "%04x:%02x     ",
                        pci_domain_nr(pe->pbus), pe->pbus->number);
#ifdef CONFIG_PCI_IOV
        else if (pe->flags & PNV_IODA_PE_VF)
                sprintf(pfix, "%04x:%02x:%2x.%d",
                        pci_domain_nr(pe->parent_dev->bus),
                        (pe->rid & 0xff00) >> 8,
                        PCI_SLOT(pe->rid), PCI_FUNC(pe->rid));
#endif /* CONFIG_PCI_IOV */

        printk("%spci %s: [PE# %.3d] %pV",
               level, pfix, pe->pe_number, &vaf);

        va_end(args);
}

#define pe_err(pe, fmt, ...)                                    \
        pe_level_printk(pe, KERN_ERR, fmt, ##__VA_ARGS__)
#define pe_warn(pe, fmt, ...)                                   \
        pe_level_printk(pe, KERN_WARNING, fmt, ##__VA_ARGS__)
#define pe_info(pe, fmt, ...)                                   \
        pe_level_printk(pe, KERN_INFO, fmt, ##__VA_ARGS__)
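
/*
 * Usage sketch: these behave like dev_err()/dev_warn()/dev_info() but tag
 * the message with the PE number, e.g.:
 *
 *        pe_err(pe, "OPAL error %ld trying to setup PELT table\n", rc);
 */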

static bool pnv_iommu_bypass_disabled __read_mostly;

static int __init iommu_setup(char *str)
{
        if (!str)
                return -EINVAL;

        while (*str) {
                if (!strncmp(str, "nobypass", 8)) {
                        pnv_iommu_bypass_disabled = true;
                        pr_info("PowerNV: IOMMU bypass window disabled.\n");
                        break;
                }
                str += strcspn(str, ",");
                if (*str == ',')
                        str++;
        }

        return 0;
}
early_param("iommu", iommu_setup);
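
/*
 * Example (kernel command line): booting with "iommu=nobypass" sets
 * pnv_iommu_bypass_disabled and keeps the 64-bit DMA bypass window off;
 * the option may also appear within a comma-separated list after "iommu=".
 */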

/*
 * stdcix is only supposed to be used in hypervisor real mode as per
 * the architecture spec
 */
static inline void __raw_rm_writeq(u64 val, volatile void __iomem *paddr)
{
        __asm__ __volatile__("stdcix %0,0,%1"
                : : "r" (val), "r" (paddr) : "memory");
}

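/*
 * A resource is eligible for the M64 window only when it is both 64-bit
 * and prefetchable, i.e. IORESOURCE_MEM_64 and IORESOURCE_PREFETCH are
 * both set.
 */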
static inline bool pnv_pci_is_mem_pref_64(unsigned long flags)
{
        return ((flags & (IORESOURCE_MEM_64 | IORESOURCE_PREFETCH)) ==
                (IORESOURCE_MEM_64 | IORESOURCE_PREFETCH));
}

static void pnv_ioda_reserve_pe(struct pnv_phb *phb, int pe_no)
{
        if (!(pe_no >= 0 && pe_no < phb->ioda.total_pe)) {
                pr_warn("%s: Invalid PE %d on PHB#%x\n",
                        __func__, pe_no, phb->hose->global_number);
                return;
        }

        if (test_and_set_bit(pe_no, phb->ioda.pe_alloc))
                pr_debug("%s: PE %d was reserved on PHB#%x\n",
                         __func__, pe_no, phb->hose->global_number);

        phb->ioda.pe_array[pe_no].phb = phb;
        phb->ioda.pe_array[pe_no].pe_number = pe_no;
}

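/*
 * Allocate a PE number from the PHB's bitmap. The find/test_and_set loop
 * simply retries when a concurrent allocator grabs the same bit first, so
 * no lock is needed around the search itself.
 */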
static int pnv_ioda_alloc_pe(struct pnv_phb *phb)
{
        unsigned long pe;

        do {
                pe = find_next_zero_bit(phb->ioda.pe_alloc,
                                        phb->ioda.total_pe, 0);
                if (pe >= phb->ioda.total_pe)
                        return IODA_INVALID_PE;
        } while (test_and_set_bit(pe, phb->ioda.pe_alloc));

        phb->ioda.pe_array[pe].phb = phb;
        phb->ioda.pe_array[pe].pe_number = pe;
        return pe;
}

static void pnv_ioda_free_pe(struct pnv_phb *phb, int pe)
{
        WARN_ON(phb->ioda.pe_array[pe].pdev);

        memset(&phb->ioda.pe_array[pe], 0, sizeof(struct pnv_ioda_pe));
        clear_bit(pe, phb->ioda.pe_alloc);
}

/* The default M64 BAR is shared by all PEs */
static int pnv_ioda2_init_m64(struct pnv_phb *phb)
{
        const char *desc;
        struct resource *r;
        s64 rc;

        /* Configure the default M64 BAR */
        rc = opal_pci_set_phb_mem_window(phb->opal_id,
                                         OPAL_M64_WINDOW_TYPE,
                                         phb->ioda.m64_bar_idx,
                                         phb->ioda.m64_base,
                                         0, /* unused */
                                         phb->ioda.m64_size);
        if (rc != OPAL_SUCCESS) {
                desc = "configuring";
                goto fail;
        }

        /* Enable the default M64 BAR */
        rc = opal_pci_phb_mmio_enable(phb->opal_id,
                                      OPAL_M64_WINDOW_TYPE,
                                      phb->ioda.m64_bar_idx,
                                      OPAL_ENABLE_M64_SPLIT);
        if (rc != OPAL_SUCCESS) {
                desc = "enabling";
                goto fail;
        }

        /* Mark the M64 BAR assigned */
        set_bit(phb->ioda.m64_bar_idx, &phb->ioda.m64_bar_alloc);

        /*
         * Strip off the segment used by the reserved PE, which is
         * expected to be either 0 or the last PE of the PHB's capacity.
         */
        r = &phb->hose->mem_resources[1];
        if (phb->ioda.reserved_pe == 0)
                r->start += phb->ioda.m64_segsize;
        else if (phb->ioda.reserved_pe == (phb->ioda.total_pe - 1))
                r->end -= phb->ioda.m64_segsize;
        else
                pr_warn("  Cannot strip M64 segment for reserved PE#%d\n",
                        phb->ioda.reserved_pe);

        return 0;

fail:
        pr_warn("  Failure %lld %s M64 BAR#%d\n",
                rc, desc, phb->ioda.m64_bar_idx);
        opal_pci_phb_mmio_enable(phb->opal_id,
                                 OPAL_M64_WINDOW_TYPE,
                                 phb->ioda.m64_bar_idx,
                                 OPAL_DISABLE_M64);
        return -EIO;
}

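/*
 * Reserve the PE numbers backing a device's 64-bit prefetchable BARs.
 * The M64 window is split into total_pe equal segments, and segment N
 * belongs to PE# N, so the PE numbers follow from the BAR's offset in
 * the window. Illustrative numbers only: with a 256MB segment size, a
 * BAR spanning [base + 512MB, base + 1GB) covers segments (and hence
 * PE#s) 2 and 3.
 */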
static void pnv_ioda2_reserve_dev_m64_pe(struct pci_dev *pdev,
                                         unsigned long *pe_bitmap)
{
        struct pci_controller *hose = pci_bus_to_host(pdev->bus);
        struct pnv_phb *phb = hose->private_data;
        struct resource *r;
        resource_size_t base, sgsz, start, end;
        int segno, i;

        base = phb->ioda.m64_base;
        sgsz = phb->ioda.m64_segsize;
        for (i = 0; i <= PCI_ROM_RESOURCE; i++) {
                r = &pdev->resource[i];
                if (!r->parent || !pnv_pci_is_mem_pref_64(r->flags))
                        continue;

                start = _ALIGN_DOWN(r->start - base, sgsz);
                end = _ALIGN_UP(r->end - base, sgsz);
                for (segno = start / sgsz; segno < end / sgsz; segno++) {
                        if (pe_bitmap)
                                set_bit(segno, pe_bitmap);
                        else
                                pnv_ioda_reserve_pe(phb, segno);
                }
        }
}

static void pnv_ioda2_reserve_m64_pe(struct pci_bus *bus,
                                     unsigned long *pe_bitmap,
                                     bool all)
{
        struct pci_dev *pdev;

        list_for_each_entry(pdev, &bus->devices, bus_list) {
                pnv_ioda2_reserve_dev_m64_pe(pdev, pe_bitmap);

                if (all && pdev->subordinate)
                        pnv_ioda2_reserve_m64_pe(pdev->subordinate,
                                                 pe_bitmap, all);
        }
}

static int pnv_ioda2_pick_m64_pe(struct pci_bus *bus, bool all)
{
        struct pci_controller *hose = pci_bus_to_host(bus);
        struct pnv_phb *phb = hose->private_data;
        struct pnv_ioda_pe *master_pe, *pe;
        unsigned long size, *pe_alloc;
        int i;

        /* Root bus shouldn't use M64 */
        if (pci_is_root_bus(bus))
                return IODA_INVALID_PE;

        /* Allocate bitmap */
        size = _ALIGN_UP(phb->ioda.total_pe / 8, sizeof(unsigned long));
        pe_alloc = kzalloc(size, GFP_KERNEL);
        if (!pe_alloc) {
                pr_warn("%s: Out of memory !\n", __func__);
                return IODA_INVALID_PE;
        }

        /* Figure out the PE numbers reserved for the bus's M64 segments */
        pnv_ioda2_reserve_m64_pe(bus, pe_alloc, all);

        /*
         * The current bus might not own an M64 window; it may all be
         * contributed by its child buses. In that case, we don't need
         * to pick an M64 dependent PE#.
         */
        if (bitmap_empty(pe_alloc, phb->ioda.total_pe)) {
                kfree(pe_alloc);
                return IODA_INVALID_PE;
        }

        /*
         * Figure out the master PE and put all slave PEs on the
         * master PE's list to form a compound PE.
         */
        master_pe = NULL;
        i = -1;
        while ((i = find_next_bit(pe_alloc, phb->ioda.total_pe, i + 1)) <
                phb->ioda.total_pe) {
                pe = &phb->ioda.pe_array[i];

                if (!master_pe) {
                        pe->flags |= PNV_IODA_PE_MASTER;
                        INIT_LIST_HEAD(&pe->slaves);
                        master_pe = pe;
                } else {
                        pe->flags |= PNV_IODA_PE_SLAVE;
                        pe->master = master_pe;
                        list_add_tail(&pe->list, &master_pe->slaves);
                }
        }

        kfree(pe_alloc);
        return master_pe->pe_number;
}

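/*
 * Parse the "ibm,opal-m64-window" property. As consumed below, it holds
 * three 2-cell fields: the window's PCI address, its parent address
 * (translated into the CPU physical base) and the window size.
 */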
static void __init pnv_ioda_parse_m64_window(struct pnv_phb *phb)
{
        struct pci_controller *hose = phb->hose;
        struct device_node *dn = hose->dn;
        struct resource *res;
        const u32 *r;
        u64 pci_addr;

        /* FIXME: Support M64 for P7IOC */
        if (phb->type != PNV_PHB_IODA2) {
                pr_info("  M64 window not supported\n");
                return;
        }

        if (!firmware_has_feature(FW_FEATURE_OPALv3)) {
                pr_info("  Firmware too old to support M64 window\n");
                return;
        }

        r = of_get_property(dn, "ibm,opal-m64-window", NULL);
        if (!r) {
                pr_info("  No <ibm,opal-m64-window> on %s\n",
                        dn->full_name);
                return;
        }

        res = &hose->mem_resources[1];
        res->start = of_translate_address(dn, r + 2);
        res->end = res->start + of_read_number(r + 4, 2) - 1;
        res->flags = (IORESOURCE_MEM | IORESOURCE_MEM_64 | IORESOURCE_PREFETCH);
        pci_addr = of_read_number(r, 2);
        hose->mem_offset[1] = res->start - pci_addr;

        phb->ioda.m64_size = resource_size(res);
        phb->ioda.m64_segsize = phb->ioda.m64_size / phb->ioda.total_pe;
        phb->ioda.m64_base = pci_addr;

        pr_info(" MEM64 0x%016llx..0x%016llx -> 0x%016llx\n",
                res->start, res->end, pci_addr);

        /* Use the last M64 BAR to cover the M64 window */
        phb->ioda.m64_bar_idx = 15;
        phb->init_m64 = pnv_ioda2_init_m64;
        phb->reserve_m64_pe = pnv_ioda2_reserve_m64_pe;
        phb->pick_m64_pe = pnv_ioda2_pick_m64_pe;
}

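/*
 * Compound PE handling: when M64 segmentation forces several PE numbers
 * to behave as one, the first PE becomes the master and the others sit
 * on its slave list. The freeze/unfreeze/state helpers below resolve a
 * slave to its master first and then walk the slave list, so the whole
 * compound PE changes state together.
 */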
static void pnv_ioda_freeze_pe(struct pnv_phb *phb, int pe_no)
{
        struct pnv_ioda_pe *pe = &phb->ioda.pe_array[pe_no];
        struct pnv_ioda_pe *slave;
        s64 rc;

        /* Fetch master PE */
        if (pe->flags & PNV_IODA_PE_SLAVE) {
                pe = pe->master;
                if (WARN_ON(!pe || !(pe->flags & PNV_IODA_PE_MASTER)))
                        return;

                pe_no = pe->pe_number;
        }

        /* Freeze master PE */
        rc = opal_pci_eeh_freeze_set(phb->opal_id,
                                     pe_no,
                                     OPAL_EEH_ACTION_SET_FREEZE_ALL);
        if (rc != OPAL_SUCCESS) {
                pr_warn("%s: Failure %lld freezing PHB#%x-PE#%x\n",
                        __func__, rc, phb->hose->global_number, pe_no);
                return;
        }

        /* Freeze slave PEs */
        if (!(pe->flags & PNV_IODA_PE_MASTER))
                return;

        list_for_each_entry(slave, &pe->slaves, list) {
                rc = opal_pci_eeh_freeze_set(phb->opal_id,
                                             slave->pe_number,
                                             OPAL_EEH_ACTION_SET_FREEZE_ALL);
                if (rc != OPAL_SUCCESS)
                        pr_warn("%s: Failure %lld freezing PHB#%x-PE#%x\n",
                                __func__, rc, phb->hose->global_number,
                                slave->pe_number);
        }
}

static int pnv_ioda_unfreeze_pe(struct pnv_phb *phb, int pe_no, int opt)
{
        struct pnv_ioda_pe *pe, *slave;
        s64 rc;

        /* Find master PE */
        pe = &phb->ioda.pe_array[pe_no];
        if (pe->flags & PNV_IODA_PE_SLAVE) {
                pe = pe->master;
                WARN_ON(!pe || !(pe->flags & PNV_IODA_PE_MASTER));
                pe_no = pe->pe_number;
        }

        /* Clear frozen state for master PE */
        rc = opal_pci_eeh_freeze_clear(phb->opal_id, pe_no, opt);
        if (rc != OPAL_SUCCESS) {
                pr_warn("%s: Failure %lld clearing %d on PHB#%x-PE#%x\n",
                        __func__, rc, opt, phb->hose->global_number, pe_no);
                return -EIO;
        }

        if (!(pe->flags & PNV_IODA_PE_MASTER))
                return 0;

        /* Clear frozen state for slave PEs */
        list_for_each_entry(slave, &pe->slaves, list) {
                rc = opal_pci_eeh_freeze_clear(phb->opal_id,
                                               slave->pe_number,
                                               opt);
                if (rc != OPAL_SUCCESS) {
                        pr_warn("%s: Failure %lld clearing %d on PHB#%x-PE#%x\n",
                                __func__, rc, opt, phb->hose->global_number,
                                slave->pe_number);
                        return -EIO;
                }
        }

        return 0;
}

static int pnv_ioda_get_pe_state(struct pnv_phb *phb, int pe_no)
{
        struct pnv_ioda_pe *slave, *pe;
        u8 fstate, state;
        __be16 pcierr;
        s64 rc;

        /* Sanity check on PE number */
        if (pe_no < 0 || pe_no >= phb->ioda.total_pe)
                return OPAL_EEH_STOPPED_PERM_UNAVAIL;

        /*
         * Fetch the master PE; note that the PE instance might not
         * be initialized yet.
         */
        pe = &phb->ioda.pe_array[pe_no];
        if (pe->flags & PNV_IODA_PE_SLAVE) {
                pe = pe->master;
                WARN_ON(!pe || !(pe->flags & PNV_IODA_PE_MASTER));
                pe_no = pe->pe_number;
        }

        /* Check the master PE */
        rc = opal_pci_eeh_freeze_status(phb->opal_id, pe_no,
                                        &state, &pcierr, NULL);
        if (rc != OPAL_SUCCESS) {
                pr_warn("%s: Failure %lld getting PHB#%x-PE#%x state\n",
                        __func__, rc,
                        phb->hose->global_number, pe_no);
                return OPAL_EEH_STOPPED_TEMP_UNAVAIL;
        }

        /* Check the slave PEs */
        if (!(pe->flags & PNV_IODA_PE_MASTER))
                return state;

        list_for_each_entry(slave, &pe->slaves, list) {
                rc = opal_pci_eeh_freeze_status(phb->opal_id,
                                                slave->pe_number,
                                                &fstate,
                                                &pcierr,
                                                NULL);
                if (rc != OPAL_SUCCESS) {
                        pr_warn("%s: Failure %lld getting PHB#%x-PE#%x state\n",
                                __func__, rc,
                                phb->hose->global_number, slave->pe_number);
                        return OPAL_EEH_STOPPED_TEMP_UNAVAIL;
                }

                /*
                 * Override the result based on the ascending
                 * priority.
                 */
                if (fstate > state)
                        state = fstate;
        }

        return state;
}

/*
 * Currently those two are only used when MSIs are enabled; this will
 * change, but in the meantime we need to protect them to avoid warnings.
 */
#ifdef CONFIG_PCI_MSI
static struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev)
{
        struct pci_controller *hose = pci_bus_to_host(dev->bus);
        struct pnv_phb *phb = hose->private_data;
        struct pci_dn *pdn = pci_get_pdn(dev);

        if (!pdn)
                return NULL;
        if (pdn->pe_number == IODA_INVALID_PE)
                return NULL;
        return &phb->ioda.pe_array[pdn->pe_number];
}
#endif /* CONFIG_PCI_MSI */

static int pnv_ioda_set_one_peltv(struct pnv_phb *phb,
                                  struct pnv_ioda_pe *parent,
                                  struct pnv_ioda_pe *child,
                                  bool is_add)
{
        const char *desc = is_add ? "adding" : "removing";
        uint8_t op = is_add ? OPAL_ADD_PE_TO_DOMAIN :
                              OPAL_REMOVE_PE_FROM_DOMAIN;
        struct pnv_ioda_pe *slave;
        long rc;

        /* Parent PE affects child PE */
        rc = opal_pci_set_peltv(phb->opal_id, parent->pe_number,
                                child->pe_number, op);
        if (rc != OPAL_SUCCESS) {
                pe_warn(child, "OPAL error %ld %s to parent PELTV\n",
                        rc, desc);
                return -ENXIO;
        }

        if (!(child->flags & PNV_IODA_PE_MASTER))
                return 0;

        /* Compound case: parent PE affects slave PEs */
        list_for_each_entry(slave, &child->slaves, list) {
                rc = opal_pci_set_peltv(phb->opal_id, parent->pe_number,
                                        slave->pe_number, op);
                if (rc != OPAL_SUCCESS) {
                        pe_warn(slave, "OPAL error %ld %s to parent PELTV\n",
                                rc, desc);
                        return -ENXIO;
                }
        }

        return 0;
}

static int pnv_ioda_set_peltv(struct pnv_phb *phb,
                              struct pnv_ioda_pe *pe,
                              bool is_add)
{
        struct pnv_ioda_pe *slave;
        struct pci_dev *pdev = NULL;
        int ret;

        /*
         * Clear PE frozen state. If it's a master PE, we need to
         * clear the slave PEs' frozen state as well.
         */
        if (is_add) {
                opal_pci_eeh_freeze_clear(phb->opal_id, pe->pe_number,
                                          OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
                if (pe->flags & PNV_IODA_PE_MASTER) {
                        list_for_each_entry(slave, &pe->slaves, list)
                                opal_pci_eeh_freeze_clear(phb->opal_id,
                                                          slave->pe_number,
                                                          OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
                }
        }

        /*
         * Associate the PE in PELT. We need to add the PE to the
         * corresponding PELT-V as well. Otherwise, an error
         * originating from the PE might spread to other PEs.
         */
        ret = pnv_ioda_set_one_peltv(phb, pe, pe, is_add);
        if (ret)
                return ret;

        /* For compound PEs, any one affects all of them */
        if (pe->flags & PNV_IODA_PE_MASTER) {
                list_for_each_entry(slave, &pe->slaves, list) {
                        ret = pnv_ioda_set_one_peltv(phb, slave, pe, is_add);
                        if (ret)
                                return ret;
                }
        }

        if (pe->flags & (PNV_IODA_PE_BUS_ALL | PNV_IODA_PE_BUS))
                pdev = pe->pbus->self;
        else if (pe->flags & PNV_IODA_PE_DEV)
                pdev = pe->pdev->bus->self;
#ifdef CONFIG_PCI_IOV
        else if (pe->flags & PNV_IODA_PE_VF)
                pdev = pe->parent_dev;
#endif /* CONFIG_PCI_IOV */
        while (pdev) {
                struct pci_dn *pdn = pci_get_pdn(pdev);
                struct pnv_ioda_pe *parent;

                if (pdn && pdn->pe_number != IODA_INVALID_PE) {
                        parent = &phb->ioda.pe_array[pdn->pe_number];
                        ret = pnv_ioda_set_one_peltv(phb, parent, pe, is_add);
                        if (ret)
                                return ret;
                }

                pdev = pdev->bus->self;
        }

        return 0;
}

#ifdef CONFIG_PCI_IOV
static int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
{
        struct pci_dev *parent;
        uint8_t bcomp, dcomp, fcomp;
        int64_t rc;
        long rid_end, rid;

        /* Currently, we just deconfigure VF PEs. Bus PEs will always be there. */
        if (pe->pbus) {
                int count;

                dcomp = OPAL_IGNORE_RID_DEVICE_NUMBER;
                fcomp = OPAL_IGNORE_RID_FUNCTION_NUMBER;
                parent = pe->pbus->self;
                if (pe->flags & PNV_IODA_PE_BUS_ALL)
                        count = pe->pbus->busn_res.end - pe->pbus->busn_res.start + 1;
                else
                        count = 1;

                switch (count) {
                case  1: bcomp = OpalPciBusAll;         break;
                case  2: bcomp = OpalPciBus7Bits;       break;
                case  4: bcomp = OpalPciBus6Bits;       break;
                case  8: bcomp = OpalPciBus5Bits;       break;
                case 16: bcomp = OpalPciBus4Bits;       break;
                case 32: bcomp = OpalPciBus3Bits;       break;
                default:
                        dev_err(&pe->pbus->dev, "Number of subordinate buses %d unsupported\n",
                                count);
                        /* Do an exact match only */
                        bcomp = OpalPciBusAll;
                }
                rid_end = pe->rid + (count << 8);
        } else {
                if (pe->flags & PNV_IODA_PE_VF)
                        parent = pe->parent_dev;
                else
                        parent = pe->pdev->bus->self;
                bcomp = OpalPciBusAll;
                dcomp = OPAL_COMPARE_RID_DEVICE_NUMBER;
                fcomp = OPAL_COMPARE_RID_FUNCTION_NUMBER;
                rid_end = pe->rid + 1;
        }

        /* Clear the reverse map */
        for (rid = pe->rid; rid < rid_end; rid++)
                phb->ioda.pe_rmap[rid] = 0;

        /* Release from all parents' PELT-V */
        while (parent) {
                struct pci_dn *pdn = pci_get_pdn(parent);
                if (pdn && pdn->pe_number != IODA_INVALID_PE) {
                        rc = opal_pci_set_peltv(phb->opal_id, pdn->pe_number,
                                                pe->pe_number, OPAL_REMOVE_PE_FROM_DOMAIN);
                        /* XXX What to do in case of error ? */
                }
                parent = parent->bus->self;
        }

        opal_pci_eeh_freeze_clear(phb->opal_id, pe->pe_number,
                                  OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);

        /* Disassociate PE in PELT */
        rc = opal_pci_set_peltv(phb->opal_id, pe->pe_number,
                                pe->pe_number, OPAL_REMOVE_PE_FROM_DOMAIN);
        if (rc)
                pe_warn(pe, "OPAL error %ld removing self from PELTV\n", rc);
        rc = opal_pci_set_pe(phb->opal_id, pe->pe_number, pe->rid,
                             bcomp, dcomp, fcomp, OPAL_UNMAP_PE);
        if (rc)
                pe_err(pe, "OPAL error %ld trying to setup PELT table\n", rc);

        pe->pbus = NULL;
        pe->pdev = NULL;
        pe->parent_dev = NULL;

        return 0;
}
#endif /* CONFIG_PCI_IOV */

static int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
{
        struct pci_dev *parent;
        uint8_t bcomp, dcomp, fcomp;
        long rc, rid_end, rid;

        /* Bus validation ? */
        if (pe->pbus) {
                int count;

                dcomp = OPAL_IGNORE_RID_DEVICE_NUMBER;
                fcomp = OPAL_IGNORE_RID_FUNCTION_NUMBER;
                parent = pe->pbus->self;
                if (pe->flags & PNV_IODA_PE_BUS_ALL)
                        count = pe->pbus->busn_res.end - pe->pbus->busn_res.start + 1;
                else
                        count = 1;

                switch (count) {
                case  1: bcomp = OpalPciBusAll;         break;
                case  2: bcomp = OpalPciBus7Bits;       break;
                case  4: bcomp = OpalPciBus6Bits;       break;
                case  8: bcomp = OpalPciBus5Bits;       break;
                case 16: bcomp = OpalPciBus4Bits;       break;
                case 32: bcomp = OpalPciBus3Bits;       break;
                default:
                        dev_err(&pe->pbus->dev, "Number of subordinate buses %d unsupported\n",
                                count);
                        /* Do an exact match only */
                        bcomp = OpalPciBusAll;
                }
                rid_end = pe->rid + (count << 8);
        } else {
#ifdef CONFIG_PCI_IOV
                if (pe->flags & PNV_IODA_PE_VF)
                        parent = pe->parent_dev;
                else
#endif /* CONFIG_PCI_IOV */
                        parent = pe->pdev->bus->self;
                bcomp = OpalPciBusAll;
                dcomp = OPAL_COMPARE_RID_DEVICE_NUMBER;
                fcomp = OPAL_COMPARE_RID_FUNCTION_NUMBER;
                rid_end = pe->rid + 1;
        }

        /*
         * Associate the PE in PELT. We need to add the PE to the
         * corresponding PELT-V as well. Otherwise, an error
         * originating from the PE might spread to other PEs.
         */
        rc = opal_pci_set_pe(phb->opal_id, pe->pe_number, pe->rid,
                             bcomp, dcomp, fcomp, OPAL_MAP_PE);
        if (rc) {
                pe_err(pe, "OPAL error %ld trying to setup PELT table\n", rc);
                return -ENXIO;
        }

        /* Configure PELTV */
        pnv_ioda_set_peltv(phb, pe, true);

        /* Setup reverse map */
        for (rid = pe->rid; rid < rid_end; rid++)
                phb->ioda.pe_rmap[rid] = pe->pe_number;

        /* Set up one MVE on IODA1 */
        if (phb->type != PNV_PHB_IODA1) {
                pe->mve_number = 0;
                goto out;
        }

        pe->mve_number = pe->pe_number;
        rc = opal_pci_set_mve(phb->opal_id, pe->mve_number, pe->pe_number);
        if (rc != OPAL_SUCCESS) {
                pe_err(pe, "OPAL error %ld setting up MVE %d\n",
                       rc, pe->mve_number);
                pe->mve_number = -1;
        } else {
                rc = opal_pci_set_mve_enable(phb->opal_id,
                                             pe->mve_number, OPAL_ENABLE_MVE);
                if (rc) {
                        pe_err(pe, "OPAL error %ld enabling MVE %d\n",
                               rc, pe->mve_number);
                        pe->mve_number = -1;
                }
        }

out:
        return 0;
}

static void pnv_ioda_link_pe_by_weight(struct pnv_phb *phb,
                                       struct pnv_ioda_pe *pe)
{
        struct pnv_ioda_pe *lpe;

        list_for_each_entry(lpe, &phb->ioda.pe_dma_list, dma_link) {
                if (lpe->dma_weight < pe->dma_weight) {
                        list_add_tail(&pe->dma_link, &lpe->dma_link);
                        return;
                }
        }
        list_add_tail(&pe->dma_link, &phb->ioda.pe_dma_list);
}

static unsigned int pnv_ioda_dma_weight(struct pci_dev *dev)
{
        /* This is quite simplistic. The "base" weight of a device
         * is 10. A weight of 0 means no DMA is to be accounted for it.
         */

        /* If it's a bridge, no DMA */
        if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL)
                return 0;

        /* Reduce the weight of slow USB controllers */
        if (dev->class == PCI_CLASS_SERIAL_USB_UHCI ||
            dev->class == PCI_CLASS_SERIAL_USB_OHCI ||
            dev->class == PCI_CLASS_SERIAL_USB_EHCI)
                return 3;

        /* Increase the weight of RAID (includes Obsidian) */
        if ((dev->class >> 8) == PCI_CLASS_STORAGE_RAID)
                return 15;

        /* Default */
        return 10;
}

#ifdef CONFIG_PCI_IOV
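/*
 * Illustrative example (hypothetical sizes): if the per-VF BAR segment
 * is 64MB and offset is 4, each IOV BAR start below moves up by
 * 4 * 64MB = 256MB, so every VF lands in the M64 segment (and thus the
 * PE#) four slots higher than before.
 */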
static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset)
{
        struct pci_dn *pdn = pci_get_pdn(dev);
        int i;
        struct resource *res, res2;
        resource_size_t size;
        u16 num_vfs;

        if (!dev->is_physfn)
                return -EINVAL;

        /*
         * "offset" is in VFs.  The M64 windows are sized so that when they
         * are segmented, each segment is the same size as the IOV BAR.
         * Each segment is in a separate PE, and the high order bits of the
         * address are the PE number.  Therefore, each VF's BAR is in a
         * separate PE, and changing the IOV BAR start address changes the
         * range of PEs the VFs are in.
         */
        num_vfs = pdn->num_vfs;
        for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
                res = &dev->resource[i + PCI_IOV_RESOURCES];
                if (!res->flags || !res->parent)
                        continue;

                if (!pnv_pci_is_mem_pref_64(res->flags))
                        continue;

                /*
                 * The actual IOV BAR range is determined by the start address
                 * and the actual size for num_vfs VFs BAR.  This check is to
                 * make sure that after shifting, the range will not overlap
                 * with another device.
                 */
                size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES);
                res2.flags = res->flags;
                res2.start = res->start + (size * offset);
                res2.end = res2.start + (size * num_vfs) - 1;

                if (res2.end > res->end) {
                        dev_err(&dev->dev, "VF BAR%d: %pR would extend past %pR (trying to enable %d VFs shifted by %d)\n",
                                i, &res2, res, num_vfs, offset);
                        return -EBUSY;
                }
        }

        /*
         * After doing so, there will be a "hole" in /proc/iomem when
         * offset is a positive value. It looks like the device returned
         * some MMIO range back to the system, which nobody can actually
         * use.
         */
        for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
                res = &dev->resource[i + PCI_IOV_RESOURCES];
                if (!res->flags || !res->parent)
                        continue;

                if (!pnv_pci_is_mem_pref_64(res->flags))
                        continue;

                size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES);
                res2 = *res;
                res->start += size * offset;

                dev_info(&dev->dev, "VF BAR%d: %pR shifted to %pR (%sabling %d VFs shifted by %d)\n",
                         i, &res2, res, (offset > 0) ? "En" : "Dis",
                         num_vfs, offset);
                pci_update_resource(dev, i + PCI_IOV_RESOURCES);
        }
        return 0;
}
#endif /* CONFIG_PCI_IOV */

#if 0
static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev)
{
        struct pci_controller *hose = pci_bus_to_host(dev->bus);
        struct pnv_phb *phb = hose->private_data;
        struct pci_dn *pdn = pci_get_pdn(dev);
        struct pnv_ioda_pe *pe;
        int pe_num;

        if (!pdn) {
                pr_err("%s: Device tree node not associated properly\n",
                       pci_name(dev));
                return NULL;
        }
        if (pdn->pe_number != IODA_INVALID_PE)
                return NULL;

        /* PE#0 has been pre-set */
        if (dev->bus->number == 0)
                pe_num = 0;
        else
                pe_num = pnv_ioda_alloc_pe(phb);
        if (pe_num == IODA_INVALID_PE) {
                pr_warning("%s: Not enough PE# available, disabling device\n",
                           pci_name(dev));
                return NULL;
        }

        /* NOTE: We get only one ref to the pci_dev for the pdn, not for the
         * pointer in the PE data structure, both should be destroyed at the
         * same time. However, this needs to be looked at more closely again
         * once we actually start removing things (Hotplug, SR-IOV, ...)
         *
         * At some point we want to remove the PDN completely anyways
         */
        pe = &phb->ioda.pe_array[pe_num];
        pci_dev_get(dev);
        pdn->pcidev = dev;
        pdn->pe_number = pe_num;
        pe->pdev = dev;
        pe->pbus = NULL;
        pe->tce32_seg = -1;
        pe->mve_number = -1;
        pe->rid = dev->bus->number << 8 | pdn->devfn;

        pe_info(pe, "Associated device to PE\n");

        if (pnv_ioda_configure_pe(phb, pe)) {
                /* XXX What do we do here ? */
                if (pe_num)
                        pnv_ioda_free_pe(phb, pe_num);
                pdn->pe_number = IODA_INVALID_PE;
                pe->pdev = NULL;
                pci_dev_put(dev);
                return NULL;
        }

        /* Assign a DMA weight to the device */
        pe->dma_weight = pnv_ioda_dma_weight(dev);
        if (pe->dma_weight != 0) {
                phb->ioda.dma_weight += pe->dma_weight;
                phb->ioda.dma_pe_count++;
        }

        /* Link the PE */
        pnv_ioda_link_pe_by_weight(phb, pe);

        return pe;
}
#endif /* Useful for SRIOV case */

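/*
 * Tag every device on the bus (and, for "all" PEs, on the subordinate
 * buses too) with the PE number, folding each device's DMA weight into
 * the PE's total.
 */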
static void pnv_ioda_setup_same_PE(struct pci_bus *bus, struct pnv_ioda_pe *pe)
{
        struct pci_dev *dev;

        list_for_each_entry(dev, &bus->devices, bus_list) {
                struct pci_dn *pdn = pci_get_pdn(dev);

                if (pdn == NULL) {
                        pr_warn("%s: No device node associated with device !\n",
                                pci_name(dev));
                        continue;
                }
                pdn->pe_number = pe->pe_number;
                pe->dma_weight += pnv_ioda_dma_weight(dev);
                if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate)
                        pnv_ioda_setup_same_PE(dev->subordinate, pe);
        }
}

/*
 * There are two types of PCI-bus-sensitive PEs: one comprising a single
 * PCI bus, and another containing the primary PCI bus together with its
 * subordinate PCI devices and buses. The second type of PE typically
 * originates from a PCIe-to-PCI bridge or the downstream ports of a
 * PLX switch.
 */
static void pnv_ioda_setup_bus_PE(struct pci_bus *bus, bool all)
{
        struct pci_controller *hose = pci_bus_to_host(bus);
        struct pnv_phb *phb = hose->private_data;
        struct pnv_ioda_pe *pe;
        int pe_num = IODA_INVALID_PE;

        /* Check if PE is determined by M64 */
        if (phb->pick_m64_pe)
                pe_num = phb->pick_m64_pe(bus, all);

        /* The PE number isn't pinned by M64 */
        if (pe_num == IODA_INVALID_PE)
                pe_num = pnv_ioda_alloc_pe(phb);

        if (pe_num == IODA_INVALID_PE) {
                pr_warning("%s: Not enough PE# available for PCI bus %04x:%02x\n",
                           __func__, pci_domain_nr(bus), bus->number);
                return;
        }

        pe = &phb->ioda.pe_array[pe_num];
        pe->flags |= (all ? PNV_IODA_PE_BUS_ALL : PNV_IODA_PE_BUS);
        pe->pbus = bus;
        pe->pdev = NULL;
        pe->tce32_seg = -1;
        pe->mve_number = -1;
        pe->rid = bus->busn_res.start << 8;
        pe->dma_weight = 0;

        if (all)
                pe_info(pe, "Secondary bus %d..%d associated with PE#%d\n",
                        bus->busn_res.start, bus->busn_res.end, pe_num);
        else
                pe_info(pe, "Secondary bus %d associated with PE#%d\n",
                        bus->busn_res.start, pe_num);

        if (pnv_ioda_configure_pe(phb, pe)) {
                /* XXX What do we do here ? */
                if (pe_num)
                        pnv_ioda_free_pe(phb, pe_num);
                pe->pbus = NULL;
                return;
        }

        /* Associate it with all child devices */
        pnv_ioda_setup_same_PE(bus, pe);

        /* Put PE to the list */
        list_add_tail(&pe->list, &phb->ioda.pe_list);

        /* Account for one DMA PE if at least one DMA-capable device exists
         * below the bridge
         */
        if (pe->dma_weight != 0) {
                phb->ioda.dma_weight += pe->dma_weight;
                phb->ioda.dma_pe_count++;
        }

        /* Link the PE */
        pnv_ioda_link_pe_by_weight(phb, pe);
}

static void pnv_ioda_setup_PEs(struct pci_bus *bus)
{
        struct pci_dev *dev;

        pnv_ioda_setup_bus_PE(bus, false);

        list_for_each_entry(dev, &bus->devices, bus_list) {
                if (dev->subordinate) {
                        if (pci_pcie_type(dev) == PCI_EXP_TYPE_PCI_BRIDGE)
                                pnv_ioda_setup_bus_PE(dev->subordinate, true);
                        else
                                pnv_ioda_setup_PEs(dev->subordinate);
                }
        }
}

/*
 * Configure PEs so that the downstream PCI buses and devices
 * get their associated PE#. Unfortunately, we haven't figured
 * out a way to identify PLX bridges yet, so we simply put the
 * PCI bus and everything subordinate behind the root port into
 * one PE# here. This is expected to change as soon as we can
 * detect PLX bridges correctly.
 */
static void pnv_pci_ioda_setup_PEs(void)
{
        struct pci_controller *hose, *tmp;
        struct pnv_phb *phb;

        list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
                phb = hose->private_data;

                /* M64 layout might affect PE allocation */
                if (phb->reserve_m64_pe)
                        phb->reserve_m64_pe(hose->bus, NULL, true);

                pnv_ioda_setup_PEs(hose->bus);
        }
}

#ifdef CONFIG_PCI_IOV
static int pnv_pci_vf_release_m64(struct pci_dev *pdev)
{
        struct pci_bus        *bus;
        struct pci_controller *hose;
        struct pnv_phb        *phb;
        struct pci_dn         *pdn;
        int                    i, j;

        bus = pdev->bus;
        hose = pci_bus_to_host(bus);
        phb = hose->private_data;
        pdn = pci_get_pdn(pdev);

        for (i = 0; i < PCI_SRIOV_NUM_BARS; i++)
                for (j = 0; j < M64_PER_IOV; j++) {
                        if (pdn->m64_wins[i][j] == IODA_INVALID_M64)
                                continue;
                        opal_pci_phb_mmio_enable(phb->opal_id,
                                OPAL_M64_WINDOW_TYPE, pdn->m64_wins[i][j], 0);
                        clear_bit(pdn->m64_wins[i][j], &phb->ioda.m64_bar_alloc);
                        pdn->m64_wins[i][j] = IODA_INVALID_M64;
                }

        return 0;
}

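/*
 * Sketch of the grouping math below: with pdn->m64_per_iov == M64_PER_IOV
 * and num_vfs > M64_PER_IOV, the VFs are split into M64_PER_IOV groups of
 * roundup_pow_of_two(num_vfs) / M64_PER_IOV VFs each, and every group gets
 * its own M64 BAR sized to cover vf_per_group VF segments. With fewer VFs,
 * each VF forms its own group of one.
 */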
static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
{
        struct pci_bus        *bus;
        struct pci_controller *hose;
        struct pnv_phb        *phb;
        struct pci_dn         *pdn;
        unsigned int           win;
        struct resource       *res;
        int                    i, j;
        int64_t                rc;
        int                    total_vfs;
        resource_size_t        size, start;
        int                    pe_num;
        int                    vf_groups;
        int                    vf_per_group;

        bus = pdev->bus;
        hose = pci_bus_to_host(bus);
        phb = hose->private_data;
        pdn = pci_get_pdn(pdev);
        total_vfs = pci_sriov_get_totalvfs(pdev);

        /* Initialize the m64_wins to IODA_INVALID_M64 */
        for (i = 0; i < PCI_SRIOV_NUM_BARS; i++)
                for (j = 0; j < M64_PER_IOV; j++)
                        pdn->m64_wins[i][j] = IODA_INVALID_M64;

        if (pdn->m64_per_iov == M64_PER_IOV) {
                vf_groups = (num_vfs <= M64_PER_IOV) ? num_vfs : M64_PER_IOV;
                vf_per_group = (num_vfs <= M64_PER_IOV) ? 1 :
                        roundup_pow_of_two(num_vfs) / pdn->m64_per_iov;
        } else {
                vf_groups = 1;
                vf_per_group = 1;
        }

        for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
                res = &pdev->resource[i + PCI_IOV_RESOURCES];
                if (!res->flags || !res->parent)
                        continue;

                if (!pnv_pci_is_mem_pref_64(res->flags))
                        continue;

                for (j = 0; j < vf_groups; j++) {
                        do {
                                win = find_next_zero_bit(&phb->ioda.m64_bar_alloc,
                                                phb->ioda.m64_bar_idx + 1, 0);

                                if (win >= phb->ioda.m64_bar_idx + 1)
                                        goto m64_failed;
                        } while (test_and_set_bit(win, &phb->ioda.m64_bar_alloc));

                        pdn->m64_wins[i][j] = win;

                        if (pdn->m64_per_iov == M64_PER_IOV) {
                                size = pci_iov_resource_size(pdev,
                                                        PCI_IOV_RESOURCES + i);
                                size = size * vf_per_group;
                                start = res->start + size * j;
                        } else {
                                size = resource_size(res);
                                start = res->start;
                        }

                        /* Map the M64 here */
                        if (pdn->m64_per_iov == M64_PER_IOV) {
                                pe_num = pdn->offset + j;
                                rc = opal_pci_map_pe_mmio_window(phb->opal_id,
                                                pe_num, OPAL_M64_WINDOW_TYPE,
                                                pdn->m64_wins[i][j], 0);
                        }

                        rc = opal_pci_set_phb_mem_window(phb->opal_id,
                                                 OPAL_M64_WINDOW_TYPE,
                                                 pdn->m64_wins[i][j],
                                                 start,
                                                 0, /* unused */
                                                 size);

                        if (rc != OPAL_SUCCESS) {
                                dev_err(&pdev->dev, "Failed to map M64 window #%d: %lld\n",
                                        win, rc);
                                goto m64_failed;
                        }

                        if (pdn->m64_per_iov == M64_PER_IOV)
                                rc = opal_pci_phb_mmio_enable(phb->opal_id,
                                     OPAL_M64_WINDOW_TYPE, pdn->m64_wins[i][j], 2);
                        else
                                rc = opal_pci_phb_mmio_enable(phb->opal_id,
                                     OPAL_M64_WINDOW_TYPE, pdn->m64_wins[i][j], 1);

                        if (rc != OPAL_SUCCESS) {
                                dev_err(&pdev->dev, "Failed to enable M64 window #%d: %lld\n",
                                        win, rc);
                                goto m64_failed;
                        }
                }
        }
        return 0;

m64_failed:
        pnv_pci_vf_release_m64(pdev);
        return -EBUSY;
}

static long pnv_pci_ioda2_unset_window(struct iommu_table_group *table_group,
                int num);
static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable);

static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe *pe)
{
        struct iommu_table    *tbl;
        int64_t               rc;

        tbl = pe->table_group.tables[0];
        rc = pnv_pci_ioda2_unset_window(&pe->table_group, 0);
        if (rc)
                pe_warn(pe, "OPAL error %ld releasing DMA window\n", rc);

        pnv_pci_ioda2_set_bypass(pe, false);
        if (pe->table_group.group) {
                iommu_group_put(pe->table_group.group);
                BUG_ON(pe->table_group.group);
        }
        pnv_pci_ioda2_table_free_pages(tbl);
        iommu_free_table(tbl, of_node_full_name(dev->dev.of_node));
}

static void pnv_ioda_release_vf_PE(struct pci_dev *pdev, u16 num_vfs)
{
        struct pci_bus        *bus;
        struct pci_controller *hose;
        struct pnv_phb        *phb;
        struct pnv_ioda_pe    *pe, *pe_n;
        struct pci_dn         *pdn;
        u16                    vf_index;
        int64_t                rc;

        bus = pdev->bus;
        hose = pci_bus_to_host(bus);
        phb = hose->private_data;
        pdn = pci_get_pdn(pdev);

        if (!pdev->is_physfn)
                return;

        if (pdn->m64_per_iov == M64_PER_IOV && num_vfs > M64_PER_IOV) {
                int   vf_group;
                int   vf_per_group;
                int   vf_index1;

                vf_per_group = roundup_pow_of_two(num_vfs) / pdn->m64_per_iov;

                for (vf_group = 0; vf_group < M64_PER_IOV; vf_group++) {
                        for (vf_index = vf_group * vf_per_group;
                             vf_index < (vf_group + 1) * vf_per_group &&
                             vf_index < num_vfs;
                             vf_index++) {
                                for (vf_index1 = vf_group * vf_per_group;
                                     vf_index1 < (vf_group + 1) * vf_per_group &&
                                     vf_index1 < num_vfs;
                                     vf_index1++) {

                                        rc = opal_pci_set_peltv(phb->opal_id,
                                                pdn->offset + vf_index,
                                                pdn->offset + vf_index1,
                                                OPAL_REMOVE_PE_FROM_DOMAIN);

                                        if (rc)
                                                dev_warn(&pdev->dev, "%s: Failed to unlink same group PE#%d(%lld)\n",
                                                        __func__,
                                                        pdn->offset + vf_index1, rc);
                                }
                        }
                }
        }

        list_for_each_entry_safe(pe, pe_n, &phb->ioda.pe_list, list) {
                if (pe->parent_dev != pdev)
                        continue;

                pnv_pci_ioda2_release_dma_pe(pdev, pe);

                /* Remove from list */
                mutex_lock(&phb->ioda.pe_list_mutex);
                list_del(&pe->list);
                mutex_unlock(&phb->ioda.pe_list_mutex);

                pnv_ioda_deconfigure_pe(phb, pe);

                pnv_ioda_free_pe(phb, pe->pe_number);
        }
}

void pnv_pci_sriov_disable(struct pci_dev *pdev)
{
        struct pci_bus        *bus;
        struct pci_controller *hose;
        struct pnv_phb        *phb;
        struct pci_dn         *pdn;
        struct pci_sriov      *iov;
        u16 num_vfs;

        bus = pdev->bus;
        hose = pci_bus_to_host(bus);
        phb = hose->private_data;
        pdn = pci_get_pdn(pdev);
        iov = pdev->sriov;
        num_vfs = pdn->num_vfs;

        /* Release VF PEs */
        pnv_ioda_release_vf_PE(pdev, num_vfs);

        if (phb->type == PNV_PHB_IODA2) {
                if (pdn->m64_per_iov == 1)
                        pnv_pci_vf_resource_shift(pdev, -pdn->offset);

                /* Release M64 windows */
                pnv_pci_vf_release_m64(pdev);

                /* Release PE numbers */
                bitmap_clear(phb->ioda.pe_alloc, pdn->offset, num_vfs);
                pdn->offset = 0;
        }
}

1380static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
1381                                       struct pnv_ioda_pe *pe);
1382static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
1383{
1384        struct pci_bus        *bus;
1385        struct pci_controller *hose;
1386        struct pnv_phb        *phb;
1387        struct pnv_ioda_pe    *pe;
1388        int                    pe_num;
1389        u16                    vf_index;
1390        struct pci_dn         *pdn;
1391        int64_t                rc;
1392
1393        bus = pdev->bus;
1394        hose = pci_bus_to_host(bus);
1395        phb = hose->private_data;
1396        pdn = pci_get_pdn(pdev);
1397
1398        if (!pdev->is_physfn)
1399                return;
1400
1401        /* Reserve PE for each VF */
1402        for (vf_index = 0; vf_index < num_vfs; vf_index++) {
1403                pe_num = pdn->offset + vf_index;
1404
1405                pe = &phb->ioda.pe_array[pe_num];
1406                pe->pe_number = pe_num;
1407                pe->phb = phb;
1408                pe->flags = PNV_IODA_PE_VF;
1409                pe->pbus = NULL;
1410                pe->parent_dev = pdev;
1411                pe->tce32_seg = -1;
1412                pe->mve_number = -1;
1413                pe->rid = (pci_iov_virtfn_bus(pdev, vf_index) << 8) |
1414                           pci_iov_virtfn_devfn(pdev, vf_index);
1415
1416                pe_info(pe, "VF %04d:%02d:%02d.%d associated with PE#%d\n",
1417                        hose->global_number, pdev->bus->number,
1418                        PCI_SLOT(pci_iov_virtfn_devfn(pdev, vf_index)),
1419                        PCI_FUNC(pci_iov_virtfn_devfn(pdev, vf_index)), pe_num);
1420
1421                if (pnv_ioda_configure_pe(phb, pe)) {
1422                        /* XXX What do we do here ? */
1423                        if (pe_num)
1424                                pnv_ioda_free_pe(phb, pe_num);
1425                        pe->pdev = NULL;
1426                        continue;
1427                }
1428
1429                /* Put PE to the list */
1430                mutex_lock(&phb->ioda.pe_list_mutex);
1431                list_add_tail(&pe->list, &phb->ioda.pe_list);
1432                mutex_unlock(&phb->ioda.pe_list_mutex);
1433
1434                pnv_pci_ioda2_setup_dma_pe(phb, pe);
1435        }
1436
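        /*
         * With one shared M64 BAR per IOV BAR (m64_per_iov ==
         * M64_PER_IOV) and more than M64_PER_IOV VFs enabled, the VFs
         * are divided into M64_PER_IOV groups and every pair of PEs
         * within a group is linked into the same PELT-V domain,
         * presumably so that an error on any VF freezes all VFs
         * sharing that M64 segment.
         */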
1437        if (pdn->m64_per_iov == M64_PER_IOV && num_vfs > M64_PER_IOV) {
1438                int   vf_group;
1439                int   vf_per_group;
1440                int   vf_index1;
1441
1442                vf_per_group = roundup_pow_of_two(num_vfs) / pdn->m64_per_iov;
1443
1444                for (vf_group = 0; vf_group < M64_PER_IOV; vf_group++) {
1445                        for (vf_index = vf_group * vf_per_group;
1446                             vf_index < (vf_group + 1) * vf_per_group &&
1447                             vf_index < num_vfs;
1448                             vf_index++) {
1449                                for (vf_index1 = vf_group * vf_per_group;
1450                                     vf_index1 < (vf_group + 1) * vf_per_group &&
1451                                     vf_index1 < num_vfs;
1452                                     vf_index1++) {
1453
1454                                        rc = opal_pci_set_peltv(phb->opal_id,
1455                                                pdn->offset + vf_index,
1456                                                pdn->offset + vf_index1,
1457                                                OPAL_ADD_PE_TO_DOMAIN);
1458
1459                                        if (rc)
1460                                            dev_warn(&pdev->dev, "%s: Failed to link same group PE#%d(%lld)\n",
1461                                                __func__,
1462                                                pdn->offset + vf_index1, rc);
1463                                }
1464                        }
1465                }
1466        }
1467}
1468
1469int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
1470{
1471        struct pci_bus        *bus;
1472        struct pci_controller *hose;
1473        struct pnv_phb        *phb;
1474        struct pci_dn         *pdn;
1475        int                    ret;
1476
1477        bus = pdev->bus;
1478        hose = pci_bus_to_host(bus);
1479        phb = hose->private_data;
1480        pdn = pci_get_pdn(pdev);
1481
1482        if (phb->type == PNV_PHB_IODA2) {
1483                /* Calculate available PE for required VFs */
1484                mutex_lock(&phb->ioda.pe_alloc_mutex);
1485                pdn->offset = bitmap_find_next_zero_area(
1486                        phb->ioda.pe_alloc, phb->ioda.total_pe,
1487                        0, num_vfs, 0);
1488                if (pdn->offset >= phb->ioda.total_pe) {
1489                        mutex_unlock(&phb->ioda.pe_alloc_mutex);
1490                        dev_info(&pdev->dev, "Failed to enable %d VFs\n", num_vfs);
1491                        pdn->offset = 0;
1492                        return -EBUSY;
1493                }
1494                bitmap_set(phb->ioda.pe_alloc, pdn->offset, num_vfs);
1495                pdn->num_vfs = num_vfs;
1496                mutex_unlock(&phb->ioda.pe_alloc_mutex);
1497
1498                /* Assign M64 window accordingly */
1499                ret = pnv_pci_vf_assign_m64(pdev, num_vfs);
1500                if (ret) {
1501                        dev_info(&pdev->dev, "Not enough M64 window resources\n");
1502                        goto m64_failed;
1503                }
1504
1505                /*
1506                 * When using one M64 BAR to map one IOV BAR, we need to shift
1507                 * the IOV BAR according to the PE# allocated to the VFs.
1508                 * Otherwise, the PE# for the VF will conflict with others.
1509                 */
1510                if (pdn->m64_per_iov == 1) {
1511                        ret = pnv_pci_vf_resource_shift(pdev, pdn->offset);
1512                        if (ret)
1513                                goto m64_failed;
1514                }
1515        }
1516
1517        /* Setup VF PEs */
1518        pnv_ioda_setup_vf_PE(pdev, num_vfs);
1519
1520        return 0;
1521
1522m64_failed:
1523        bitmap_clear(phb->ioda.pe_alloc, pdn->offset, num_vfs);
1524        pdn->offset = 0;
1525
1526        return ret;
1527}
1528
1529int pcibios_sriov_disable(struct pci_dev *pdev)
1530{
1531        pnv_pci_sriov_disable(pdev);
1532
1533        /* Release PCI data */
1534        remove_dev_pci_data(pdev);
1535        return 0;
1536}
1537
1538int pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
1539{
1540        /* Allocate PCI data */
1541        add_dev_pci_data(pdev);
1542
1543        pnv_pci_sriov_enable(pdev, num_vfs);
1544        return 0;
1545}
1546#endif /* CONFIG_PCI_IOV */
1547
1548static void pnv_pci_ioda_dma_dev_setup(struct pnv_phb *phb, struct pci_dev *pdev)
1549{
1550        struct pci_dn *pdn = pci_get_pdn(pdev);
1551        struct pnv_ioda_pe *pe;
1552
1553        /*
1554         * The function can be called while the PE#
1555         * hasn't been assigned yet. Do nothing in
1556         * that case.
1557         */
1558        if (!pdn || pdn->pe_number == IODA_INVALID_PE)
1559                return;
1560
1561        pe = &phb->ioda.pe_array[pdn->pe_number];
1562        WARN_ON(get_dma_ops(&pdev->dev) != &dma_iommu_ops);
1563        set_dma_offset(&pdev->dev, pe->tce_bypass_base);
1564        set_iommu_table_base(&pdev->dev, pe->table_group.tables[0]);
1565        /*
1566         * Note: iommu_add_device() will fail here as
1567         * for physical PE: the device is already added by now;
1568         * for virtual PE: sysfs entries are not ready yet and
1569         * tce_iommu_bus_notifier will add the device to a group later.
1570         */
1571}
1572
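/*
 * Choose the DMA ops for a device: when the PE has a 64-bit bypass
 * window and the requested mask covers the bypass base plus the top
 * of RAM (a full 64-bit mask does, a 32-bit one does not), switch to
 * direct DMA; otherwise fall back to 32-bit DMA through the TCE
 * table.
 */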
1573static int pnv_pci_ioda_dma_set_mask(struct pci_dev *pdev, u64 dma_mask)
1574{
1575        struct pci_controller *hose = pci_bus_to_host(pdev->bus);
1576        struct pnv_phb *phb = hose->private_data;
1577        struct pci_dn *pdn = pci_get_pdn(pdev);
1578        struct pnv_ioda_pe *pe;
1579        uint64_t top;
1580        bool bypass = false;
1581
1582        if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE))
1583                return -ENODEV;
1584
1585        pe = &phb->ioda.pe_array[pdn->pe_number];
1586        if (pe->tce_bypass_enabled) {
1587                top = pe->tce_bypass_base + memblock_end_of_DRAM() - 1;
1588                bypass = (dma_mask >= top);
1589        }
1590
1591        if (bypass) {
1592                dev_info(&pdev->dev, "Using 64-bit DMA iommu bypass\n");
1593                set_dma_ops(&pdev->dev, &dma_direct_ops);
1594        } else {
1595                dev_info(&pdev->dev, "Using 32-bit DMA via iommu\n");
1596                set_dma_ops(&pdev->dev, &dma_iommu_ops);
1597        }
1598        *pdev->dev.dma_mask = dma_mask;
1599        return 0;
1600}
1601
1602static u64 pnv_pci_ioda_dma_get_required_mask(struct pci_dev *pdev)
1603{
1604        struct pci_controller *hose = pci_bus_to_host(pdev->bus);
1605        struct pnv_phb *phb = hose->private_data;
1606        struct pci_dn *pdn = pci_get_pdn(pdev);
1607        struct pnv_ioda_pe *pe;
1608        u64 end, mask;
1609
1610        if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE))
1611                return 0;
1612
1613        pe = &phb->ioda.pe_array[pdn->pe_number];
1614        if (!pe->tce_bypass_enabled)
1615                return __dma_get_required_mask(&pdev->dev);
1616
1618        end = pe->tce_bypass_base + memblock_end_of_DRAM();
1619        mask = 1ULL << (fls64(end) - 1);
1620        mask += mask - 1;
1621
1622        return mask;
1623}
1624
1625static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe,
1626                                   struct pci_bus *bus)
1627{
1628        struct pci_dev *dev;
1629
1630        list_for_each_entry(dev, &bus->devices, bus_list) {
1631                set_iommu_table_base(&dev->dev, pe->table_group.tables[0]);
1632                set_dma_offset(&dev->dev, pe->tce_bypass_base);
1633                iommu_add_device(&dev->dev);
1634
1635                if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate)
1636                        pnv_ioda_setup_bus_dma(pe, dev->subordinate);
1637        }
1638}
1639
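/*
 * IODA1 TCE invalidation: the physical addresses of the affected TCE
 * entries are written to the invalidation register. Three encodings
 * are handled below: the BML p6/p7/galaxy2 one (address shifted up
 * with the bus number in the low bits), the p7ioc pair one (bit 63
 * set, two TCEs invalidated per write) and a default for older
 * hardware (128 bytes of TCEs per write).
 */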
1640static void pnv_pci_ioda1_tce_invalidate(struct iommu_table *tbl,
1641                unsigned long index, unsigned long npages, bool rm)
1642{
1643        struct iommu_table_group_link *tgl = list_first_entry_or_null(
1644                        &tbl->it_group_list, struct iommu_table_group_link,
1645                        next);
1646        struct pnv_ioda_pe *pe = container_of(tgl->table_group,
1647                        struct pnv_ioda_pe, table_group);
1648        __be64 __iomem *invalidate = rm ?
1649                (__be64 __iomem *)pe->phb->ioda.tce_inval_reg_phys :
1650                pe->phb->ioda.tce_inval_reg;
1651        unsigned long start, end, inc;
1652        const unsigned shift = tbl->it_page_shift;
1653
1654        start = __pa(((__be64 *)tbl->it_base) + index - tbl->it_offset);
1655        end = __pa(((__be64 *)tbl->it_base) + index - tbl->it_offset +
1656                        npages - 1);
1657
1658        /* BML uses this case for p6/p7/galaxy2: Shift addr and put in node */
1659        if (tbl->it_busno) {
1660                start <<= shift;
1661                end <<= shift;
1662                inc = 128ull << shift;
1663                start |= tbl->it_busno;
1664                end |= tbl->it_busno;
1665        } else if (tbl->it_type & TCE_PCI_SWINV_PAIR) {
1666                /* p7ioc-style invalidation, 2 TCEs per write */
1667                start |= (1ull << 63);
1668                end |= (1ull << 63);
1669                inc = 16;
1670        } else {
1671                /* Default (older HW) */
1672                inc = 128;
1673        }
1674
1675        end |= inc - 1; /* round up end to be different than start */
1676
1677        mb(); /* Ensure above stores are visible */
1678        while (start <= end) {
1679                if (rm)
1680                        __raw_rm_writeq(cpu_to_be64(start), invalidate);
1681                else
1682                        __raw_writeq(cpu_to_be64(start), invalidate);
1683                start += inc;
1684        }
1685
1686        /*
1687         * The iommu layer will do another mb() for us on build()
1688         * and we don't care on free()
1689         */
1690}
1691
1692static int pnv_ioda1_tce_build(struct iommu_table *tbl, long index,
1693                long npages, unsigned long uaddr,
1694                enum dma_data_direction direction,
1695                struct dma_attrs *attrs)
1696{
1697        int ret = pnv_tce_build(tbl, index, npages, uaddr, direction,
1698                        attrs);
1699
1700        if (!ret && (tbl->it_type & TCE_PCI_SWINV_CREATE))
1701                pnv_pci_ioda1_tce_invalidate(tbl, index, npages, false);
1702
1703        return ret;
1704}
1705
1706#ifdef CONFIG_IOMMU_API
1707static int pnv_ioda1_tce_xchg(struct iommu_table *tbl, long index,
1708                unsigned long *hpa, enum dma_data_direction *direction)
1709{
1710        long ret = pnv_tce_xchg(tbl, index, hpa, direction);
1711
1712        if (!ret && (tbl->it_type &
1713                        (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE)))
1714                pnv_pci_ioda1_tce_invalidate(tbl, index, 1, false);
1715
1716        return ret;
1717}
1718#endif
1719
1720static void pnv_ioda1_tce_free(struct iommu_table *tbl, long index,
1721                long npages)
1722{
1723        pnv_tce_free(tbl, index, npages);
1724
1725        if (tbl->it_type & TCE_PCI_SWINV_FREE)
1726                pnv_pci_ioda1_tce_invalidate(tbl, index, npages, false);
1727}
1728
1729static struct iommu_table_ops pnv_ioda1_iommu_ops = {
1730        .set = pnv_ioda1_tce_build,
1731#ifdef CONFIG_IOMMU_API
1732        .exchange = pnv_ioda1_tce_xchg,
1733#endif
1734        .clear = pnv_ioda1_tce_free,
1735        .get = pnv_tce_get,
1736};
1737
1738static inline void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_ioda_pe *pe)
1739{
1740        /* 01xb - invalidate TCEs that match the specified PE# */
1741        unsigned long val = (0x4ull << 60) | (pe->pe_number & 0xFF);
1742        struct pnv_phb *phb = pe->phb;
1743
1744        if (!phb->ioda.tce_inval_reg)
1745                return;
1746
1747        mb(); /* Ensure above stores are visible */
1748        __raw_writeq(cpu_to_be64(val), phb->ioda.tce_inval_reg);
1749}
1750
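/*
 * Format of the IODA2 per-range invalidation: bits 63:60 hold 0x2
 * ("invalidate DMA addresses in PE scope"), the low byte holds the
 * PE#, and the DMA address of the page is inserted at the page shift;
 * one store is issued per invalidated page.
 */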
1751static void pnv_pci_ioda2_do_tce_invalidate(unsigned pe_number, bool rm,
1752                __be64 __iomem *invalidate, unsigned shift,
1753                unsigned long index, unsigned long npages)
1754{
1755        unsigned long start, end, inc;
1756
1757        /* We'll invalidate DMA addresses in PE scope */
1758        start = 0x2ull << 60;
1759        start |= (pe_number & 0xFF);
1760        end = start;
1761
1762        /* Figure out the start, end and step */
1763        start |= (index << shift);
1764        end |= ((index + npages - 1) << shift);
1765        inc = (0x1ull << shift);
1766        mb();
1767
1768        while (start <= end) {
1769                if (rm)
1770                        __raw_rm_writeq(cpu_to_be64(start), invalidate);
1771                else
1772                        __raw_writeq(cpu_to_be64(start), invalidate);
1773                start += inc;
1774        }
1775}
1776
1777static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl,
1778                unsigned long index, unsigned long npages, bool rm)
1779{
1780        struct iommu_table_group_link *tgl;
1781
1782        list_for_each_entry_rcu(tgl, &tbl->it_group_list, next) {
1783                struct pnv_ioda_pe *pe = container_of(tgl->table_group,
1784                                struct pnv_ioda_pe, table_group);
1785                __be64 __iomem *invalidate = rm ?
1786                        (__be64 __iomem *)pe->phb->ioda.tce_inval_reg_phys :
1787                        pe->phb->ioda.tce_inval_reg;
1788
1789                pnv_pci_ioda2_do_tce_invalidate(pe->pe_number, rm,
1790                        invalidate, tbl->it_page_shift,
1791                        index, npages);
1792        }
1793}
1794
1795static int pnv_ioda2_tce_build(struct iommu_table *tbl, long index,
1796                long npages, unsigned long uaddr,
1797                enum dma_data_direction direction,
1798                struct dma_attrs *attrs)
1799{
1800        int ret = pnv_tce_build(tbl, index, npages, uaddr, direction,
1801                        attrs);
1802
1803        if (!ret && (tbl->it_type & TCE_PCI_SWINV_CREATE))
1804                pnv_pci_ioda2_tce_invalidate(tbl, index, npages, false);
1805
1806        return ret;
1807}
1808
1809#ifdef CONFIG_IOMMU_API
1810static int pnv_ioda2_tce_xchg(struct iommu_table *tbl, long index,
1811                unsigned long *hpa, enum dma_data_direction *direction)
1812{
1813        long ret = pnv_tce_xchg(tbl, index, hpa, direction);
1814
1815        if (!ret && (tbl->it_type &
1816                        (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE)))
1817                pnv_pci_ioda2_tce_invalidate(tbl, index, 1, false);
1818
1819        return ret;
1820}
1821#endif
1822
1823static void pnv_ioda2_tce_free(struct iommu_table *tbl, long index,
1824                long npages)
1825{
1826        pnv_tce_free(tbl, index, npages);
1827
1828        if (tbl->it_type & TCE_PCI_SWINV_FREE)
1829                pnv_pci_ioda2_tce_invalidate(tbl, index, npages, false);
1830}
1831
1832static void pnv_ioda2_table_free(struct iommu_table *tbl)
1833{
1834        pnv_pci_ioda2_table_free_pages(tbl);
1835        iommu_free_table(tbl, "pnv");
1836}
1837
1838static struct iommu_table_ops pnv_ioda2_iommu_ops = {
1839        .set = pnv_ioda2_tce_build,
1840#ifdef CONFIG_IOMMU_API
1841        .exchange = pnv_ioda2_tce_xchg,
1842#endif
1843        .clear = pnv_ioda2_tce_free,
1844        .get = pnv_tce_get,
1845        .free = pnv_ioda2_table_free,
1846};
1847
1848static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
1849                                      struct pnv_ioda_pe *pe, unsigned int base,
1850                                      unsigned int segs)
1851{
1853        struct page *tce_mem = NULL;
1854        struct iommu_table *tbl;
1855        unsigned int i;
1856        int64_t rc;
1857        void *addr;
1858
1859        /* XXX FIXME: Handle 64-bit only DMA devices */
1860        /* XXX FIXME: Provide 64-bit DMA facilities & non-4K TCE tables etc.. */
1861        /* XXX FIXME: Allocate multi-level tables on PHB3 */
1862
1863        /* We shouldn't already have a 32-bit DMA associated */
1864        if (WARN_ON(pe->tce32_seg >= 0))
1865                return;
1866
1867        tbl = pnv_pci_table_alloc(phb->hose->node);
1868        iommu_register_group(&pe->table_group, phb->hose->global_number,
1869                        pe->pe_number);
1870        pnv_pci_link_table_and_group(phb->hose->node, 0, tbl, &pe->table_group);
1871
1872        /* Grab a 32-bit TCE table */
1873        pe->tce32_seg = base;
1874        pe_info(pe, " Setting up 32-bit TCE table at %08x..%08x\n",
1875                (base << 28), ((base + segs) << 28) - 1);
1876
1877        /* XXX Currently, we allocate one big contiguous table for the
1878         * TCEs. We only really need one chunk per 256M of TCE space
1879         * (ie per segment) but that's an optimization for later, it
1880         * requires some added smarts with our get/put_tce implementation
1881         */
1882        tce_mem = alloc_pages_node(phb->hose->node, GFP_KERNEL,
1883                                   get_order(TCE32_TABLE_SIZE * segs));
1884        if (!tce_mem) {
1885                pe_err(pe, " Failed to allocate 32-bit TCE table memory\n");
1886                goto fail;
1887        }
1888        addr = page_address(tce_mem);
1889        memset(addr, 0, TCE32_TABLE_SIZE * segs);
1890
1891        /* Configure HW */
1892        for (i = 0; i < segs; i++) {
1893                rc = opal_pci_map_pe_dma_window(phb->opal_id,
1894                                              pe->pe_number,
1895                                              base + i, 1,
1896                                              __pa(addr) + TCE32_TABLE_SIZE * i,
1897                                              TCE32_TABLE_SIZE, 0x1000);
1898                if (rc) {
1899                        pe_err(pe, " Failed to configure 32-bit TCE table, err %ld\n",
1900                               rc);
1901                        goto fail;
1902                }
1903        }
1904
1905        /* Setup linux iommu table */
1906        pnv_pci_setup_iommu_table(tbl, addr, TCE32_TABLE_SIZE * segs,
1907                                  base << 28, IOMMU_PAGE_SHIFT_4K);
1908
1909        /* OPAL variant of P7IOC SW invalidated TCEs */
1910        if (phb->ioda.tce_inval_reg)
1911                tbl->it_type |= (TCE_PCI_SWINV_CREATE |
1912                                 TCE_PCI_SWINV_FREE   |
1913                                 TCE_PCI_SWINV_PAIR);
1914
1915        tbl->it_ops = &pnv_ioda1_iommu_ops;
1916        pe->table_group.tce32_start = tbl->it_offset << tbl->it_page_shift;
1917        pe->table_group.tce32_size = tbl->it_size << tbl->it_page_shift;
1918        iommu_init_table(tbl, phb->hose->node);
1919
1920        if (pe->flags & PNV_IODA_PE_DEV) {
1921                /*
1922                 * Setting table base here only for carrying iommu_group
1923                 * further down to let iommu_add_device() do the job.
1924                 * pnv_pci_ioda_dma_dev_setup will override it later anyway.
1925                 */
1926                set_iommu_table_base(&pe->pdev->dev, tbl);
1927                iommu_add_device(&pe->pdev->dev);
1928        } else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))
1929                pnv_ioda_setup_bus_dma(pe, pe->pbus);
1930
1931        return;
1932 fail:
1933        /* XXX Failure: Try to fallback to 64-bit only ? */
1934        if (pe->tce32_seg >= 0)
1935                pe->tce32_seg = -1;
1936        if (tce_mem)
1937                __free_pages(tce_mem, get_order(TCE32_TABLE_SIZE * segs));
1938        if (tbl) {
1939                pnv_pci_unlink_table_and_group(tbl, &pe->table_group);
1940                iommu_free_table(tbl, "pnv");
1941        }
1942}
1943
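/*
 * Program one DMA window into the TVT. The TVE index is
 * (PE# << 1) + num, matching the bypass window at (PE# << 1) + 1.
 * OPAL is passed the number of TCE levels (it_indirect_levels + 1),
 * the physical address of the first-level table and that table's
 * size in bytes (entries << 3, each TCE being 8 bytes).
 */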
1944static long pnv_pci_ioda2_set_window(struct iommu_table_group *table_group,
1945                int num, struct iommu_table *tbl)
1946{
1947        struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
1948                        table_group);
1949        struct pnv_phb *phb = pe->phb;
1950        int64_t rc;
1951        const unsigned long size = tbl->it_indirect_levels ?
1952                        tbl->it_level_size : tbl->it_size;
1953        const __u64 start_addr = tbl->it_offset << tbl->it_page_shift;
1954        const __u64 win_size = tbl->it_size << tbl->it_page_shift;
1955
1956        pe_info(pe, "Setting up window#%d %llx..%llx pg=%x\n", num,
1957                        start_addr, start_addr + win_size - 1,
1958                        IOMMU_PAGE_SIZE(tbl));
1959
1960        /*
1961         * Map TCE table through TVT. The TVE index is the PE number
1962         * shifted by 1 bit for 32-bit DMA space.
1963         */
1964        rc = opal_pci_map_pe_dma_window(phb->opal_id,
1965                        pe->pe_number,
1966                        (pe->pe_number << 1) + num,
1967                        tbl->it_indirect_levels + 1,
1968                        __pa(tbl->it_base),
1969                        size << 3,
1970                        IOMMU_PAGE_SIZE(tbl));
1971        if (rc) {
1972                pe_err(pe, "Failed to configure TCE table, err %ld\n", rc);
1973                return rc;
1974        }
1975
1976        pnv_pci_link_table_and_group(phb->hose->node, num,
1977                        tbl, &pe->table_group);
1978        pnv_pci_ioda2_tce_invalidate_entire(pe);
1979
1980        return 0;
1981}
1982
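/*
 * Enable or disable the 64-bit bypass window (TVE #1, selected by PCI
 * address bit 59). When enabled, PCI addresses starting at
 * tce_bypass_base map 1:1 onto system memory up to the top of RAM
 * rounded up to a power of two; disabling reprograms the window with
 * a zero size.
 */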
1983static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable)
1984{
1985        uint16_t window_id = (pe->pe_number << 1) + 1;
1986        int64_t rc;
1987
1988        pe_info(pe, "%sabling 64-bit DMA bypass\n", enable ? "En" : "Dis");
1989        if (enable) {
1990                phys_addr_t top = memblock_end_of_DRAM();
1991
1992                top = roundup_pow_of_two(top);
1993                rc = opal_pci_map_pe_dma_window_real(pe->phb->opal_id,
1994                                                     pe->pe_number,
1995                                                     window_id,
1996                                                     pe->tce_bypass_base,
1997                                                     top);
1998        } else {
1999                rc = opal_pci_map_pe_dma_window_real(pe->phb->opal_id,
2000                                                     pe->pe_number,
2001                                                     window_id,
2002                                                     pe->tce_bypass_base,
2003                                                     0);
2004        }
2005        if (rc)
2006                pe_err(pe, "OPAL error %lld configuring bypass window\n", rc);
2007        else
2008                pe->tce_bypass_enabled = enable;
2009}
2010
2011static long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
2012                __u32 page_shift, __u64 window_size, __u32 levels,
2013                struct iommu_table *tbl);
2014
2015static long pnv_pci_ioda2_create_table(struct iommu_table_group *table_group,
2016                int num, __u32 page_shift, __u64 window_size, __u32 levels,
2017                struct iommu_table **ptbl)
2018{
2019        struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
2020                        table_group);
2021        int nid = pe->phb->hose->node;
2022        __u64 bus_offset = num ? pe->tce_bypass_base : table_group->tce32_start;
2023        long ret;
2024        struct iommu_table *tbl;
2025
2026        tbl = pnv_pci_table_alloc(nid);
2027        if (!tbl)
2028                return -ENOMEM;
2029
2030        ret = pnv_pci_ioda2_table_alloc_pages(nid,
2031                        bus_offset, page_shift, window_size,
2032                        levels, tbl);
2033        if (ret) {
2034                iommu_free_table(tbl, "pnv");
2035                return ret;
2036        }
2037
2038        tbl->it_ops = &pnv_ioda2_iommu_ops;
2039        if (pe->phb->ioda.tce_inval_reg)
2040                tbl->it_type |= (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE);
2041
2042        *ptbl = tbl;
2043
2044        return 0;
2045}
2046
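/*
 * Build the default DMA setup for a PE: a 4K-page TCE table with the
 * default number of levels, sized to the smaller of the 32-bit window
 * and the power-of-two-rounded maximum memory, programmed as window 0,
 * plus the 64-bit bypass window unless pnv_iommu_bypass_disabled is
 * set.
 */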
2047static long pnv_pci_ioda2_setup_default_config(struct pnv_ioda_pe *pe)
2048{
2049        struct iommu_table *tbl = NULL;
2050        long rc;
2051
2052        /*
2053         * crashkernel= specifies the kdump kernel's maximum memory at
2054         * some offset and there is no guarantee the result is a power
2055         * of 2, which will cause errors later.
2056         */
2057        const u64 max_memory = __rounddown_pow_of_two(memory_hotplug_max());
2058
2059        /*
2060         * In memory constrained environments, e.g. kdump kernel, the
2061         * DMA window can be larger than available memory, which will
2062         * cause errors later.
2063         */
2064        const u64 window_size = min((u64)pe->table_group.tce32_size, max_memory);
2065
2066        rc = pnv_pci_ioda2_create_table(&pe->table_group, 0,
2067                        IOMMU_PAGE_SHIFT_4K,
2068                        window_size,
2069                        POWERNV_IOMMU_DEFAULT_LEVELS, &tbl);
2070        if (rc) {
2071                pe_err(pe, "Failed to create 32-bit TCE table, err %ld\n",
2072                                rc);
2073                return rc;
2074        }
2075
2076        iommu_init_table(tbl, pe->phb->hose->node);
2077
2078        rc = pnv_pci_ioda2_set_window(&pe->table_group, 0, tbl);
2079        if (rc) {
2080                pe_err(pe, "Failed to configure 32-bit TCE table, err %ld\n",
2081                                rc);
2082                pnv_ioda2_table_free(tbl);
2083                return rc;
2084        }
2085
2086        if (!pnv_iommu_bypass_disabled)
2087                pnv_pci_ioda2_set_bypass(pe, true);
2088
2089        /* OPAL variant of PHB3 invalidated TCEs */
2090        if (pe->phb->ioda.tce_inval_reg)
2091                tbl->it_type |= (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE);
2092
2093        /*
2094         * Setting table base here only for carrying iommu_group
2095         * further down to let iommu_add_device() do the job.
2096         * pnv_pci_ioda_dma_dev_setup will override it later anyway.
2097         */
2098        if (pe->flags & PNV_IODA_PE_DEV)
2099                set_iommu_table_base(&pe->pdev->dev, tbl);
2100
2101        return 0;
2102}
2103
2104#if defined(CONFIG_IOMMU_API) || defined(CONFIG_PCI_IOV)
2105static long pnv_pci_ioda2_unset_window(struct iommu_table_group *table_group,
2106                int num)
2107{
2108        struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
2109                        table_group);
2110        struct pnv_phb *phb = pe->phb;
2111        long ret;
2112
2113        pe_info(pe, "Removing DMA window #%d\n", num);
2114
2115        ret = opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number,
2116                        (pe->pe_number << 1) + num,
2117                        0/* levels */, 0/* table address */,
2118                        0/* table size */, 0/* page size */);
2119        if (ret)
2120                pe_warn(pe, "Unmapping failed, ret = %ld\n", ret);
2121        else
2122                pnv_pci_ioda2_tce_invalidate_entire(pe);
2123
2124        pnv_pci_unlink_table_and_group(table_group->tables[num], table_group);
2125
2126        return ret;
2127}
2128#endif
2129
2130#ifdef CONFIG_IOMMU_API
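/*
 * Worst-case number of bytes needed for the TCE table of the given
 * window geometry, summed over all levels. Exposed via the
 * table_group ops so a caller (presumably the VFIO SPAPR TCE driver)
 * can account for the memory before actually creating a window.
 */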
2131static unsigned long pnv_pci_ioda2_get_table_size(__u32 page_shift,
2132                __u64 window_size, __u32 levels)
2133{
2134        unsigned long bytes = 0;
2135        const unsigned window_shift = ilog2(window_size);
2136        unsigned entries_shift = window_shift - page_shift;
2137        unsigned table_shift = entries_shift + 3;
2138        unsigned long tce_table_size = max(0x1000UL, 1UL << table_shift);
2139        unsigned long direct_table_size;
2140
2141        if (!levels || (levels > POWERNV_IOMMU_MAX_LEVELS) ||
2142                        (window_size > memory_hotplug_max()) ||
2143                        !is_power_of_2(window_size))
2144                return 0;
2145
2146        /* Calculate a direct table size from window_size and levels */
2147        entries_shift = (entries_shift + levels - 1) / levels;
2148        table_shift = entries_shift + 3;
2149        table_shift = max_t(unsigned, table_shift, PAGE_SHIFT);
2150        direct_table_size =  1UL << table_shift;
2151
2152        for ( ; levels; --levels) {
2153                bytes += _ALIGN_UP(tce_table_size, direct_table_size);
2154
2155                tce_table_size /= direct_table_size;
2156                tce_table_size <<= 3;
2157                tce_table_size = _ALIGN_UP(tce_table_size, direct_table_size);
2158        }
2159
2160        return bytes;
2161}
2162
2163static void pnv_ioda2_take_ownership(struct iommu_table_group *table_group)
2164{
2165        struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
2166                                                table_group);
2167        /* Store @tbl as pnv_pci_ioda2_unset_window() resets it */
2168        struct iommu_table *tbl = pe->table_group.tables[0];
2169
2170        pnv_pci_ioda2_set_bypass(pe, false);
2171        pnv_pci_ioda2_unset_window(&pe->table_group, 0);
2172        pnv_ioda2_table_free(tbl);
2173}
2174
2175static void pnv_ioda2_release_ownership(struct iommu_table_group *table_group)
2176{
2177        struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
2178                                                table_group);
2179
2180        pnv_pci_ioda2_setup_default_config(pe);
2181}
2182
2183static struct iommu_table_group_ops pnv_pci_ioda2_ops = {
2184        .get_table_size = pnv_pci_ioda2_get_table_size,
2185        .create_table = pnv_pci_ioda2_create_table,
2186        .set_window = pnv_pci_ioda2_set_window,
2187        .unset_window = pnv_pci_ioda2_unset_window,
2188        .take_ownership = pnv_ioda2_take_ownership,
2189        .release_ownership = pnv_ioda2_release_ownership,
2190};
2191#endif
2192
2193static void pnv_pci_ioda_setup_opal_tce_kill(struct pnv_phb *phb)
2194{
2195        const __be64 *swinvp;
2196
2197        /* OPAL variant of PHB3 invalidated TCEs */
2198        swinvp = of_get_property(phb->hose->dn, "ibm,opal-tce-kill", NULL);
2199        if (!swinvp)
2200                return;
2201
2202        phb->ioda.tce_inval_reg_phys = be64_to_cpup(swinvp);
2203        phb->ioda.tce_inval_reg = ioremap(phb->ioda.tce_inval_reg_phys, 8);
2204}
2205
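/*
 * Recursively allocate one chunk of a (possibly multi-level) TCE
 * table. Each call grabs a block of 1UL << shift bytes holding
 * 1UL << (shift - 3) 8-byte entries; while more levels remain, every
 * entry is pointed at a freshly allocated lower-level chunk tagged
 * with TCE_PCI_READ/WRITE, and the recursion stops early once
 * *current_offset shows that "limit" bytes of leaf table exist.
 */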
2206static __be64 *pnv_pci_ioda2_table_do_alloc_pages(int nid, unsigned shift,
2207                unsigned levels, unsigned long limit,
2208                unsigned long *current_offset, unsigned long *total_allocated)
2209{
2210        struct page *tce_mem = NULL;
2211        __be64 *addr, *tmp;
2212        unsigned order = max_t(unsigned, shift, PAGE_SHIFT) - PAGE_SHIFT;
2213        unsigned long allocated = 1UL << (order + PAGE_SHIFT);
2214        unsigned entries = 1UL << (shift - 3);
2215        long i;
2216
2217        tce_mem = alloc_pages_node(nid, GFP_KERNEL, order);
2218        if (!tce_mem) {
2219                pr_err("Failed to allocate a TCE memory, order=%d\n", order);
2220                return NULL;
2221        }
2222        addr = page_address(tce_mem);
2223        memset(addr, 0, allocated);
2224        *total_allocated += allocated;
2225
2226        --levels;
2227        if (!levels) {
2228                *current_offset += allocated;
2229                return addr;
2230        }
2231
2232        for (i = 0; i < entries; ++i) {
2233                tmp = pnv_pci_ioda2_table_do_alloc_pages(nid, shift,
2234                                levels, limit, current_offset, total_allocated);
2235                if (!tmp)
2236                        break;
2237
2238                addr[i] = cpu_to_be64(__pa(tmp) |
2239                                TCE_PCI_READ | TCE_PCI_WRITE);
2240
2241                if (*current_offset >= limit)
2242                        break;
2243        }
2244
2245        return addr;
2246}
2247
2248static void pnv_pci_ioda2_table_do_free_pages(__be64 *addr,
2249                unsigned long size, unsigned level);
2250
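/*
 * Split the window across TCE levels: the window needs
 * window_size >> page_shift TCEs in total, the entry bits are divided
 * evenly between the levels, and each level shift gains 3 because
 * every entry is 8 bytes; each level is clamped to at least one
 * system page. Illustrative numbers: a 1GB window of 4K pages needs
 * 2^18 TCEs, so with levels = 2 each level holds 2^9 entries, a 4KB
 * chunk, which the PAGE_SHIFT clamp rounds up to one full page where
 * the system page is larger.
 */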
2251static long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
2252                __u32 page_shift, __u64 window_size, __u32 levels,
2253                struct iommu_table *tbl)
2254{
2255        void *addr;
2256        unsigned long offset = 0, level_shift, total_allocated = 0;
2257        const unsigned window_shift = ilog2(window_size);
2258        unsigned entries_shift = window_shift - page_shift;
2259        unsigned table_shift = max_t(unsigned, entries_shift + 3, PAGE_SHIFT);
2260        const unsigned long tce_table_size = 1UL << table_shift;
2261
2262        if (!levels || (levels > POWERNV_IOMMU_MAX_LEVELS))
2263                return -EINVAL;
2264
2265        if ((window_size > memory_hotplug_max()) || !is_power_of_2(window_size))
2266                return -EINVAL;
2267
2268        /* Adjust direct table size from window_size and levels */
2269        entries_shift = (entries_shift + levels - 1) / levels;
2270        level_shift = entries_shift + 3;
2271        level_shift = max_t(unsigned, level_shift, PAGE_SHIFT);
2272
2273        /* Allocate TCE table */
2274        addr = pnv_pci_ioda2_table_do_alloc_pages(nid, level_shift,
2275                        levels, tce_table_size, &offset, &total_allocated);
2276
2277        /* addr==NULL means that the first level allocation failed */
2278        if (!addr)
2279                return -ENOMEM;
2280
2281        /*
2282         * The first level was allocated but some lower level failed
2283         * because we did not allocate as much as we wanted; release
2284         * the partially allocated table.
2285         */
2286        if (offset < tce_table_size) {
2287                pnv_pci_ioda2_table_do_free_pages(addr,
2288                                1ULL << (level_shift - 3), levels - 1);
2289                return -ENOMEM;
2290        }
2291
2292        /* Setup linux iommu table */
2293        pnv_pci_setup_iommu_table(tbl, addr, tce_table_size, bus_offset,
2294                        page_shift);
2295        tbl->it_level_size = 1ULL << (level_shift - 3);
2296        tbl->it_indirect_levels = levels - 1;
2297        tbl->it_allocated_size = total_allocated;
2298
2299        pr_devel("Created TCE table: ws=%08llx ts=%lx @%08llx\n",
2300                        window_size, tce_table_size, bus_offset);
2301
2302        return 0;
2303}
2304
2305static void pnv_pci_ioda2_table_do_free_pages(__be64 *addr,
2306                unsigned long size, unsigned level)
2307{
2308        const unsigned long addr_ul = (unsigned long) addr &
2309                        ~(TCE_PCI_READ | TCE_PCI_WRITE);
2310
2311        if (level) {
2312                long i;
2313                u64 *tmp = (u64 *) addr_ul;
2314
2315                for (i = 0; i < size; ++i) {
2316                        unsigned long hpa = be64_to_cpu(tmp[i]);
2317
2318                        if (!(hpa & (TCE_PCI_READ | TCE_PCI_WRITE)))
2319                                continue;
2320
2321                        pnv_pci_ioda2_table_do_free_pages(__va(hpa), size,
2322                                        level - 1);
2323                }
2324        }
2325
2326        free_pages(addr_ul, get_order(size << 3));
2327}
2328
2329static void pnv_pci_ioda2_table_free_pages(struct iommu_table *tbl)
2330{
2331        const unsigned long size = tbl->it_indirect_levels ?
2332                        tbl->it_level_size : tbl->it_size;
2333
2334        if (!tbl->it_size)
2335                return;
2336
2337        pnv_pci_ioda2_table_do_free_pages((__be64 *)tbl->it_base, size,
2338                        tbl->it_indirect_levels);
2339}
2340
2341static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
2342                                       struct pnv_ioda_pe *pe)
2343{
2344        int64_t rc;
2345
2346        /* We shouldn't already have a 32-bit DMA associated */
2347        if (WARN_ON(pe->tce32_seg >= 0))
2348                return;
2349
2350        /* TVE #1 is selected by PCI address bit 59 */
2351        pe->tce_bypass_base = 1ull << 59;
2352
2353        iommu_register_group(&pe->table_group, phb->hose->global_number,
2354                        pe->pe_number);
2355
2356        /* The PE will reserve all possible 32-bit space */
2357        pe->tce32_seg = 0;
2358        pe_info(pe, "Setting up 32-bit TCE table at 0..%08x\n",
2359                phb->ioda.m32_pci_base);
2360
2361        /* Setup linux iommu table */
2362        pe->table_group.tce32_start = 0;
2363        pe->table_group.tce32_size = phb->ioda.m32_pci_base;
2364        pe->table_group.max_dynamic_windows_supported =
2365                        IOMMU_TABLE_GROUP_MAX_TABLES;
2366        pe->table_group.max_levels = POWERNV_IOMMU_MAX_LEVELS;
2367        pe->table_group.pgsizes = SZ_4K | SZ_64K | SZ_16M;
2368#ifdef CONFIG_IOMMU_API
2369        pe->table_group.ops = &pnv_pci_ioda2_ops;
2370#endif
2371
2372        rc = pnv_pci_ioda2_setup_default_config(pe);
2373        if (rc) {
2374                if (pe->tce32_seg >= 0)
2375                        pe->tce32_seg = -1;
2376                return;
2377        }
2378
2379        if (pe->flags & PNV_IODA_PE_DEV)
2380                iommu_add_device(&pe->pdev->dev);
2381        else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))
2382                pnv_ioda_setup_bus_dma(pe, pe->pbus);
2383}
2384
2385static void pnv_ioda_setup_dma(struct pnv_phb *phb)
2386{
2387        struct pci_controller *hose = phb->hose;
2388        unsigned int residual, remaining, segs, tw, base;
2389        struct pnv_ioda_pe *pe;
2390
2391        /* If we have more PE# than segments available, hand out one
2392         * per PE until we run out and let the rest fail. If not,
2393         * then we assign at least one segment per PE, plus more based
2394         * on the amount of devices under that PE
2395         */
2396        if (phb->ioda.dma_pe_count > phb->ioda.tce32_count)
2397                residual = 0;
2398        else
2399                residual = phb->ioda.tce32_count -
2400                        phb->ioda.dma_pe_count;
2401
2402        pr_info("PCI: Domain %04x has %ld available 32-bit DMA segments\n",
2403                hose->global_number, phb->ioda.tce32_count);
2404        pr_info("PCI: %d PE# for a total weight of %d\n",
2405                phb->ioda.dma_pe_count, phb->ioda.dma_weight);
2406
2407        pnv_pci_ioda_setup_opal_tce_kill(phb);
2408
2409        /* Walk our PE list and configure their DMA segments, hand them
2410         * out one base segment plus any residual segments based on
2411         * weight
2412         */
2413        remaining = phb->ioda.tce32_count;
2414        tw = phb->ioda.dma_weight;
2415        base = 0;
2416        list_for_each_entry(pe, &phb->ioda.pe_dma_list, dma_link) {
2417                if (!pe->dma_weight)
2418                        continue;
2419                if (!remaining) {
2420                        pe_warn(pe, "No DMA32 resources available\n");
2421                        continue;
2422                }
2423                segs = 1;
2424                if (residual) {
2425                        segs += ((pe->dma_weight * residual) + (tw / 2)) / tw;
2426                        if (segs > remaining)
2427                                segs = remaining;
2428                }
2429
2430                /*
2431                 * For the IODA2-compliant PHB3, we needn't care about the
2432                 * weight: all of the available 32-bit DMA space is assigned
2433                 * to the specific PE.
2434                 */
2435                if (phb->type == PNV_PHB_IODA1) {
2436                        pe_info(pe, "DMA weight %d, assigned %d DMA32 segments\n",
2437                                pe->dma_weight, segs);
2438                        pnv_pci_ioda_setup_dma_pe(phb, pe, base, segs);
2439                } else {
2440                        pe_info(pe, "Assign DMA32 space\n");
2441                        segs = 0;
2442                        pnv_pci_ioda2_setup_dma_pe(phb, pe);
2443                }
2444
2445                remaining -= segs;
2446                base += segs;
2447        }
2448}
2449
2450#ifdef CONFIG_PCI_MSI
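/*
 * On IODA2 an MSI EOI has to be reflected to firmware:
 * pnv_ioda2_msi_eoi() calls opal_pci_msi_eoi() before doing the
 * native XICS EOI, and set_msi_irq_chip() arranges for that by
 * cloning the IRQ chip once per PHB and overriding its irq_eoi hook.
 */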
2451static void pnv_ioda2_msi_eoi(struct irq_data *d)
2452{
2453        unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d);
2454        struct irq_chip *chip = irq_data_get_irq_chip(d);
2455        struct pnv_phb *phb = container_of(chip, struct pnv_phb,
2456                                           ioda.irq_chip);
2457        int64_t rc;
2458
2459        rc = opal_pci_msi_eoi(phb->opal_id, hw_irq);
2460        WARN_ON_ONCE(rc);
2461
2462        icp_native_eoi(d);
2463}
2464
2466static void set_msi_irq_chip(struct pnv_phb *phb, unsigned int virq)
2467{
2468        struct irq_data *idata;
2469        struct irq_chip *ichip;
2470
2471        if (phb->type != PNV_PHB_IODA2)
2472                return;
2473
2474        if (!phb->ioda.irq_chip_init) {
2475                /*
2476                 * First time we setup an MSI IRQ, we need to setup the
2477                 * corresponding IRQ chip to route correctly.
2478                 */
2479                idata = irq_get_irq_data(virq);
2480                ichip = irq_data_get_irq_chip(idata);
2481                phb->ioda.irq_chip_init = 1;
2482                phb->ioda.irq_chip = *ichip;
2483                phb->ioda.irq_chip.irq_eoi = pnv_ioda2_msi_eoi;
2484        }
2485        irq_set_chip(virq, &phb->ioda.irq_chip);
2486}
2487
2488#ifdef CONFIG_CXL_BASE
2489
2490struct device_node *pnv_pci_get_phb_node(struct pci_dev *dev)
2491{
2492        struct pci_controller *hose = pci_bus_to_host(dev->bus);
2493
2494        return of_node_get(hose->dn);
2495}
2496EXPORT_SYMBOL(pnv_pci_get_phb_node);
2497
2498int pnv_phb_to_cxl_mode(struct pci_dev *dev, uint64_t mode)
2499{
2500        struct pci_controller *hose = pci_bus_to_host(dev->bus);
2501        struct pnv_phb *phb = hose->private_data;
2502        struct pnv_ioda_pe *pe;
2503        int rc;
2504
2505        pe = pnv_ioda_get_pe(dev);
2506        if (!pe)
2507                return -ENODEV;
2508
2509        pe_info(pe, "Switching PHB to CXL\n");
2510
2511        rc = opal_pci_set_phb_cxl_mode(phb->opal_id, mode, pe->pe_number);
2512        if (rc)
2513                dev_err(&dev->dev, "opal_pci_set_phb_cxl_mode failed: %i\n", rc);
2514
2515        return rc;
2516}
2517EXPORT_SYMBOL(pnv_phb_to_cxl_mode);
2518
2519/* Find the PHB for the cxl dev and allocate MSI hwirqs.
2520 * Returns the absolute hardware IRQ number.
2521 */
2522int pnv_cxl_alloc_hwirqs(struct pci_dev *dev, int num)
2523{
2524        struct pci_controller *hose = pci_bus_to_host(dev->bus);
2525        struct pnv_phb *phb = hose->private_data;
2526        int hwirq = msi_bitmap_alloc_hwirqs(&phb->msi_bmp, num);
2527
2528        if (hwirq < 0) {
2529                dev_warn(&dev->dev, "Failed to find a free MSI\n");
2530                return -ENOSPC;
2531        }
2532
2533        return phb->msi_base + hwirq;
2534}
2535EXPORT_SYMBOL(pnv_cxl_alloc_hwirqs);
2536
2537void pnv_cxl_release_hwirqs(struct pci_dev *dev, int hwirq, int num)
2538{
2539        struct pci_controller *hose = pci_bus_to_host(dev->bus);
2540        struct pnv_phb *phb = hose->private_data;
2541
2542        msi_bitmap_free_hwirqs(&phb->msi_bmp, hwirq - phb->msi_base, num);
2543}
2544EXPORT_SYMBOL(pnv_cxl_release_hwirqs);
2545
2546void pnv_cxl_release_hwirq_ranges(struct cxl_irq_ranges *irqs,
2547                                  struct pci_dev *dev)
2548{
2549        struct pci_controller *hose = pci_bus_to_host(dev->bus);
2550        struct pnv_phb *phb = hose->private_data;
2551        int i, hwirq;
2552
2553        for (i = 1; i < CXL_IRQ_RANGES; i++) {
2554                if (!irqs->range[i])
2555                        continue;
2556                pr_devel("cxl release irq range 0x%x: offset: 0x%lx  limit: %ld\n",
2557                         i, irqs->offset[i],
2558                         irqs->range[i]);
2559                hwirq = irqs->offset[i] - phb->msi_base;
2560                msi_bitmap_free_hwirqs(&phb->msi_bmp, hwirq,
2561                                       irqs->range[i]);
2562        }
2563}
2564EXPORT_SYMBOL(pnv_cxl_release_hwirq_ranges);
2565
2566int pnv_cxl_alloc_hwirq_ranges(struct cxl_irq_ranges *irqs,
2567                               struct pci_dev *dev, int num)
2568{
2569        struct pci_controller *hose = pci_bus_to_host(dev->bus);
2570        struct pnv_phb *phb = hose->private_data;
2571        int i, hwirq, try;
2572
2573        memset(irqs, 0, sizeof(struct cxl_irq_ranges));
2574
2575        /* 0 is reserved for the multiplexed PSL DSI interrupt */
2576        for (i = 1; i < CXL_IRQ_RANGES && num; i++) {
2577                try = num;
2578                while (try) {
2579                        hwirq = msi_bitmap_alloc_hwirqs(&phb->msi_bmp, try);
2580                        if (hwirq >= 0)
2581                                break;
2582                        try /= 2;
2583                }
2584                if (!try)
2585                        goto fail;
2586
2587                irqs->offset[i] = phb->msi_base + hwirq;
2588                irqs->range[i] = try;
2589                pr_devel("cxl alloc irq range 0x%x: offset: 0x%lx  limit: %li\n",
2590                         i, irqs->offset[i], irqs->range[i]);
2591                num -= try;
2592        }
2593        if (num)
2594                goto fail;
2595
2596        return 0;
2597fail:
2598        pnv_cxl_release_hwirq_ranges(irqs, dev);
2599        return -ENOSPC;
2600}
2601EXPORT_SYMBOL(pnv_cxl_alloc_hwirq_ranges);
2602
2603int pnv_cxl_get_irq_count(struct pci_dev *dev)
2604{
2605        struct pci_controller *hose = pci_bus_to_host(dev->bus);
2606        struct pnv_phb *phb = hose->private_data;
2607
2608        return phb->msi_bmp.irq_count;
2609}
2610EXPORT_SYMBOL(pnv_cxl_get_irq_count);
2611
2612int pnv_cxl_ioda_msi_setup(struct pci_dev *dev, unsigned int hwirq,
2613                           unsigned int virq)
2614{
2615        struct pci_controller *hose = pci_bus_to_host(dev->bus);
2616        struct pnv_phb *phb = hose->private_data;
2617        unsigned int xive_num = hwirq - phb->msi_base;
2618        struct pnv_ioda_pe *pe;
2619        int rc;
2620
2621        if (!(pe = pnv_ioda_get_pe(dev)))
2622                return -ENODEV;
2623
2624        /* Assign XIVE to PE */
2625        rc = opal_pci_set_xive_pe(phb->opal_id, pe->pe_number, xive_num);
2626        if (rc) {
2627                pe_warn(pe, "%s: OPAL error %d setting msi_base 0x%x "
2628                        "hwirq 0x%x XIVE 0x%x PE\n",
2629                        pci_name(dev), rc, phb->msi_base, hwirq, xive_num);
2630                return -EIO;
2631        }
2632        set_msi_irq_chip(phb, virq);
2633
2634        return 0;
2635}
2636EXPORT_SYMBOL(pnv_cxl_ioda_msi_setup);
2637#endif
2638
2639static int pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev,
2640                                  unsigned int hwirq, unsigned int virq,
2641                                  unsigned int is_64, struct msi_msg *msg)
2642{
2643        struct pnv_ioda_pe *pe = pnv_ioda_get_pe(dev);
2644        unsigned int xive_num = hwirq - phb->msi_base;
2645        __be32 data;
2646        int rc;
2647
2648        /* No PE assigned ? bail out ... no MSI for you ! */
2649        if (pe == NULL)
2650                return -ENXIO;
2651
2652        /* Check if we have an MVE */
2653        if (pe->mve_number < 0)
2654                return -ENXIO;
2655
2656        /* Force 32-bit MSI on some broken devices */
2657        if (dev->no_64bit_msi)
2658                is_64 = 0;
2659
2660        /* Assign XIVE to PE */
2661        rc = opal_pci_set_xive_pe(phb->opal_id, pe->pe_number, xive_num);
2662        if (rc) {
2663                pr_warn("%s: OPAL error %d setting XIVE %d PE\n",
2664                        pci_name(dev), rc, xive_num);
2665                return -EIO;
2666        }
2667
2668        if (is_64) {
2669                __be64 addr64;
2670
2671                rc = opal_get_msi_64(phb->opal_id, pe->mve_number, xive_num, 1,
2672                                     &addr64, &data);
2673                if (rc) {
2674                        pr_warn("%s: OPAL error %d getting 64-bit MSI data\n",
2675                                pci_name(dev), rc);
2676                        return -EIO;
2677                }
2678                msg->address_hi = be64_to_cpu(addr64) >> 32;
2679                msg->address_lo = be64_to_cpu(addr64) & 0xfffffffful;
2680        } else {
2681                __be32 addr32;
2682
2683                rc = opal_get_msi_32(phb->opal_id, pe->mve_number, xive_num, 1,
2684                                     &addr32, &data);
2685                if (rc) {
2686                        pr_warn("%s: OPAL error %d getting 32-bit MSI data\n",
2687                                pci_name(dev), rc);
2688                        return -EIO;
2689                }
2690                msg->address_hi = 0;
2691                msg->address_lo = be32_to_cpu(addr32);
2692        }
2693        msg->data = be32_to_cpu(data);
2694
2695        set_msi_irq_chip(phb, virq);
2696
2697        pr_devel("%s: %s-bit MSI on hwirq %x (xive #%d),"
2698                 " address=%x_%08x data=%x PE# %d\n",
2699                 pci_name(dev), is_64 ? "64" : "32", hwirq, xive_num,
2700                 msg->address_hi, msg->address_lo, msg->data, pe->pe_number);
2701
2702        return 0;
2703}
2704
2705static void pnv_pci_init_ioda_msis(struct pnv_phb *phb)
2706{
2707        unsigned int count;
2708        const __be32 *prop = of_get_property(phb->hose->dn,
2709                                             "ibm,opal-msi-ranges", NULL);
2710        if (!prop) {
2711                /* BML Fallback */
2712                prop = of_get_property(phb->hose->dn, "msi-ranges", NULL);
2713        }
2714        if (!prop)
2715                return;
2716
2717        phb->msi_base = be32_to_cpup(prop);
2718        count = be32_to_cpup(prop + 1);
2719        if (msi_bitmap_alloc(&phb->msi_bmp, count, phb->hose->dn)) {
2720                pr_err("PCI %d: Failed to allocate MSI bitmap !\n",
2721                       phb->hose->global_number);
2722                return;
2723        }
2724
2725        phb->msi_setup = pnv_pci_ioda_msi_setup;
2726        phb->msi32_support = 1;
2727        pr_info("  Allocated bitmap for %d MSIs (base IRQ 0x%x)\n",
2728                count, phb->msi_base);
2729}
2730#else
2731static void pnv_pci_init_ioda_msis(struct pnv_phb *phb) { }
2732#endif /* CONFIG_PCI_MSI */
2733
2734#ifdef CONFIG_PCI_IOV
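/*
 * Expand the IOV BAR of a PF so that it covers one segment per
 * possible PE: normally total_pe copies of the per-VF size, or
 * roundup_pow_of_two(total_vfs) copies when some VF BAR is bigger
 * than 64MB and the M64_PER_IOV scheme is used instead. The expansion
 * lets an M64 window be segmented so that each VF can land in its own
 * PE.
 */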
2735static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
2736{
2737        struct pci_controller *hose;
2738        struct pnv_phb *phb;
2739        struct resource *res;
2740        int i;
2741        resource_size_t size;
2742        struct pci_dn *pdn;
2743        int mul, total_vfs;
2744
2745        if (!pdev->is_physfn || pdev->is_added)
2746                return;
2747
2748        hose = pci_bus_to_host(pdev->bus);
2749        phb = hose->private_data;
2750
2751        pdn = pci_get_pdn(pdev);
2752        pdn->vfs_expanded = 0;
2753
2754        total_vfs = pci_sriov_get_totalvfs(pdev);
2755        pdn->m64_per_iov = 1;
2756        mul = phb->ioda.total_pe;
2757
2758        for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
2759                res = &pdev->resource[i + PCI_IOV_RESOURCES];
2760                if (!res->flags || res->parent)
2761                        continue;
2762                if (!pnv_pci_is_mem_pref_64(res->flags)) {
2763                        dev_warn(&pdev->dev, " non M64 VF BAR%d: %pR\n",
2764                                 i, res);
2765                        continue;
2766                }
2767
2768                size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES);
2769
2770                /* bigger than 64M */
2771                if (size > (1 << 26)) {
2772                        dev_info(&pdev->dev, "PowerNV: VF BAR%d: %pR IOV size is bigger than 64M, roundup power2\n",
2773                                 i, res);
2774                        pdn->m64_per_iov = M64_PER_IOV;
2775                        mul = roundup_pow_of_two(total_vfs);
2776                        break;
2777                }
2778        }
2779
2780        for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
2781                res = &pdev->resource[i + PCI_IOV_RESOURCES];
2782                if (!res->flags || res->parent)
2783                        continue;
2784                if (!pnv_pci_is_mem_pref_64(res->flags)) {
2785                        dev_warn(&pdev->dev, "Skipping expanding VF BAR%d: %pR\n",
2786                                 i, res);
2787                        continue;
2788                }
2789
2790                dev_dbg(&pdev->dev, " Fixing VF BAR%d: %pR to\n", i, res);
2791                size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES);
2792                res->end = res->start + size * mul - 1;
2793                dev_dbg(&pdev->dev, "                       %pR\n", res);
2794                dev_info(&pdev->dev, "VF BAR%d: %pR (expanded to %d VFs for PE alignment)",
2795                         i, res, mul);
2796        }
2797        pdn->vfs_expanded = mul;
2798}
2799#endif /* CONFIG_PCI_IOV */
2800
2801/*
2802 * This function is supposed to be called on the PEs from top to
2803 * bottom, so the I/O or MMIO segment assigned to a parent PE can
2804 * be overridden by its child PEs if necessary.
2805 */
2806static void pnv_ioda_setup_pe_seg(struct pci_controller *hose,
2807                                  struct pnv_ioda_pe *pe)
2808{
2809        struct pnv_phb *phb = hose->private_data;
2810        struct pci_bus_region region;
2811        struct resource *res;
2812        int i, index;
2813        int rc;
2814
2815        /*
2816         * NOTE: We only care about PCI-bus-based PEs for now.
2817         * PCI-device-based PEs, for example SRIOV VFs, should
2818         * be figured out later.
2819         */
        BUG_ON(!(pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)));

        pci_bus_for_each_resource(pe->pbus, res, i) {
                if (!res || !res->flags ||
                    res->start > res->end)
                        continue;

                if (res->flags & IORESOURCE_IO) {
                        region.start = res->start - phb->ioda.io_pci_base;
                        region.end   = res->end - phb->ioda.io_pci_base;
                        index = region.start / phb->ioda.io_segsize;

                        while (index < phb->ioda.total_pe &&
                               region.start <= region.end) {
                                phb->ioda.io_segmap[index] = pe->pe_number;
                                rc = opal_pci_map_pe_mmio_window(phb->opal_id,
                                        pe->pe_number, OPAL_IO_WINDOW_TYPE, 0, index);
                                if (rc != OPAL_SUCCESS) {
                                        pr_err("%s: OPAL error %d when mapping IO segment #%d to PE#%d\n",
                                               __func__, rc, index, pe->pe_number);
                                        break;
                                }

                                region.start += phb->ioda.io_segsize;
                                index++;
                        }
                } else if ((res->flags & IORESOURCE_MEM) &&
                           !pnv_pci_is_mem_pref_64(res->flags)) {
                        region.start = res->start -
                                       hose->mem_offset[0] -
                                       phb->ioda.m32_pci_base;
                        region.end   = res->end -
                                       hose->mem_offset[0] -
                                       phb->ioda.m32_pci_base;
                        index = region.start / phb->ioda.m32_segsize;

                        while (index < phb->ioda.total_pe &&
                               region.start <= region.end) {
                                phb->ioda.m32_segmap[index] = pe->pe_number;
                                rc = opal_pci_map_pe_mmio_window(phb->opal_id,
                                        pe->pe_number, OPAL_M32_WINDOW_TYPE, 0, index);
                                if (rc != OPAL_SUCCESS) {
                                        pr_err("%s: OPAL error %d when mapping M32 segment #%d to PE#%d\n",
                                               __func__, rc, index, pe->pe_number);
                                        break;
                                }

                                region.start += phb->ioda.m32_segsize;
                                index++;
                        }
                }
        }
}
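
/*
 * Worked example for the mapping loops above (numbers assumed for
 * illustration only): with an M32 segment size of 16MB (0x1000000)
 * and a bus window covering [m32_pci_base + 0x3000000,
 * m32_pci_base + 0x4ffffff], index starts at 0x3000000 / 0x1000000
 * = 3, and segments 3 and 4 are mapped to the PE, region.start
 * advancing by one segment per iteration.
 */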

static void pnv_pci_ioda_setup_seg(void)
{
        struct pci_controller *tmp, *hose;
        struct pnv_phb *phb;
        struct pnv_ioda_pe *pe;

        list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
                phb = hose->private_data;
                list_for_each_entry(pe, &phb->ioda.pe_list, list) {
                        pnv_ioda_setup_pe_seg(hose, pe);
                }
        }
}

static void pnv_pci_ioda_setup_DMA(void)
{
        struct pci_controller *hose, *tmp;
        struct pnv_phb *phb;

        list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
                pnv_ioda_setup_dma(hose->private_data);

                /* Mark the PHB initialization done */
                phb = hose->private_data;
                phb->initialized = 1;
        }
}

static void pnv_pci_ioda_create_dbgfs(void)
{
#ifdef CONFIG_DEBUG_FS
        struct pci_controller *hose, *tmp;
        struct pnv_phb *phb;
        char name[16];

        list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
                phb = hose->private_data;

                sprintf(name, "PCI%04x", hose->global_number);
                phb->dbgfs = debugfs_create_dir(name, powerpc_debugfs_root);
                if (!phb->dbgfs)
                        pr_warn("%s: Error creating debugfs on PHB#%x\n",
                                __func__, hose->global_number);
        }
#endif /* CONFIG_DEBUG_FS */
}

static void pnv_pci_ioda_fixup(void)
{
        pnv_pci_ioda_setup_PEs();
        pnv_pci_ioda_setup_seg();
        pnv_pci_ioda_setup_DMA();

        pnv_pci_ioda_create_dbgfs();

#ifdef CONFIG_EEH
        eeh_init();
        eeh_addr_cache_build();
#endif
}

/*
 * Returns the alignment for I/O or memory windows of P2P bridges,
 * which depends on how the PEs are segmented. For now, we return
 * the I/O or M32 segment size for PE-sensitive P2P bridges.
 * Otherwise, the defaults (4KiB for I/O, 1MiB for memory) are
 * returned.
 *
 * The current PCI bus might be put into one PE, which was created
 * against the parent PCI bridge. In that case, we don't need to
 * enlarge the alignment, which saves some resources.
 */
static resource_size_t pnv_pci_window_alignment(struct pci_bus *bus,
                                                unsigned long type)
{
        struct pci_dev *bridge;
        struct pci_controller *hose = pci_bus_to_host(bus);
        struct pnv_phb *phb = hose->private_data;
        int num_pci_bridges = 0;

        bridge = bus->self;
        while (bridge) {
                if (pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE) {
                        num_pci_bridges++;
                        if (num_pci_bridges >= 2)
                                return 1;
                }

                bridge = bridge->bus->self;
        }

        /* We fall back to M32 if M64 isn't supported */
        if (phb->ioda.m64_segsize &&
            pnv_pci_is_mem_pref_64(type))
                return phb->ioda.m64_segsize;
        if (type & IORESOURCE_MEM)
                return phb->ioda.m32_segsize;

        return phb->ioda.io_segsize;
}
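
/*
 * Example (hypothetical numbers): on a PHB with a 16MB M32 segment
 * size, a P2P bridge's non-prefetchable memory window is aligned to
 * 16MB rather than the usual 1MB, so the window starts and ends on
 * segment boundaries and the bus behind the bridge can own whole
 * segments, i.e. form its own PE.
 */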

#ifdef CONFIG_PCI_IOV
static resource_size_t pnv_pci_iov_resource_alignment(struct pci_dev *pdev,
                                                      int resno)
{
        struct pci_dn *pdn = pci_get_pdn(pdev);
        resource_size_t align, iov_align;

        iov_align = resource_size(&pdev->resource[resno]);
        if (iov_align)
                return iov_align;

        align = pci_iov_resource_size(pdev, resno);
        if (pdn->vfs_expanded)
                return pdn->vfs_expanded * align;

        return align;
}
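
/*
 * Sketch of the effect (values assumed): if the IOV BAR was expanded
 * for 256 VFs and the per-VF size is 1MB, the alignment returned is
 * 256 * 1MB = 256MB, i.e. the whole expanded BAR, so its starting
 * address is suitable for per-VF PE assignment.
 */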
#endif /* CONFIG_PCI_IOV */

/*
 * Prevent enabling devices for which we couldn't properly
 * assign a PE
 */
static bool pnv_pci_enable_device_hook(struct pci_dev *dev)
{
        struct pci_controller *hose = pci_bus_to_host(dev->bus);
        struct pnv_phb *phb = hose->private_data;
        struct pci_dn *pdn;
        /*
         * This function may be called before the PEs have been
         * created, for example during resource reassignment in the
         * PCI probe period. Just skip the check if the PEs aren't
         * ready yet.
         */
        if (!phb->initialized)
                return true;

        pdn = pci_get_pdn(dev);
        if (!pdn || pdn->pe_number == IODA_INVALID_PE)
                return false;

        return true;
}
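
/*
 * For example, a device probed before pnv_pci_ioda_fixup() runs has
 * no PE yet; phb->initialized is still 0, so the hook above returns
 * true and lets the enable proceed. Once the PEs are set up, a device
 * whose pdn carries IODA_INVALID_PE is refused instead.
 */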

static u32 pnv_ioda_bdfn_to_pe(struct pnv_phb *phb, struct pci_bus *bus,
                               u32 devfn)
{
        return phb->ioda.pe_rmap[(bus->number << 8) | devfn];
}
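
/*
 * The RID (bus << 8 | devfn) indexes the reverse map directly: for
 * example, bus 0x05, device 0x02, function 0 gives devfn 0x10 and
 * hence pe_rmap[0x0510].
 */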

static void pnv_pci_ioda_shutdown(struct pci_controller *hose)
{
        struct pnv_phb *phb = hose->private_data;

        opal_pci_reset(phb->opal_id, OPAL_RESET_PCI_IODA_TABLE,
                       OPAL_ASSERT_RESET);
}

static const struct pci_controller_ops pnv_pci_ioda_controller_ops = {
        .dma_dev_setup = pnv_pci_dma_dev_setup,
#ifdef CONFIG_PCI_MSI
        .setup_msi_irqs = pnv_setup_msi_irqs,
        .teardown_msi_irqs = pnv_teardown_msi_irqs,
#endif
        .enable_device_hook = pnv_pci_enable_device_hook,
        .window_alignment = pnv_pci_window_alignment,
        .reset_secondary_bus = pnv_pci_reset_secondary_bus,
        .dma_set_mask = pnv_pci_ioda_dma_set_mask,
        .dma_get_required_mask = pnv_pci_ioda_dma_get_required_mask,
        .shutdown = pnv_pci_ioda_shutdown,
};

static void __init pnv_pci_init_ioda_phb(struct device_node *np,
                                         u64 hub_id, int ioda_type)
{
        struct pci_controller *hose;
        struct pnv_phb *phb;
        unsigned long size, m32map_off, pemap_off, iomap_off = 0;
        const __be64 *prop64;
        const __be32 *prop32;
        int len;
        u64 phb_id;
        void *aux;
        long rc;

        pr_info("Initializing IODA%d OPAL PHB %s\n", ioda_type, np->full_name);

        prop64 = of_get_property(np, "ibm,opal-phbid", NULL);
        if (!prop64) {
                pr_err("  Missing \"ibm,opal-phbid\" property !\n");
                return;
        }
        phb_id = be64_to_cpup(prop64);
        pr_debug("  PHB-ID  : 0x%016llx\n", phb_id);

        phb = memblock_virt_alloc(sizeof(struct pnv_phb), 0);

        /* Allocate PCI controller */
        phb->hose = hose = pcibios_alloc_controller(np);
        if (!phb->hose) {
                pr_err("  Can't allocate PCI controller for %s\n",
                       np->full_name);
                memblock_free(__pa(phb), sizeof(struct pnv_phb));
                return;
        }

        spin_lock_init(&phb->lock);
        prop32 = of_get_property(np, "bus-range", &len);
        if (prop32 && len == 8) {
                hose->first_busno = be32_to_cpu(prop32[0]);
                hose->last_busno = be32_to_cpu(prop32[1]);
        } else {
                pr_warn("  Broken <bus-range> on %s\n", np->full_name);
                hose->first_busno = 0;
                hose->last_busno = 0xff;
        }
        hose->private_data = phb;
        phb->hub_id = hub_id;
        phb->opal_id = phb_id;
        phb->type = ioda_type;
        mutex_init(&phb->ioda.pe_alloc_mutex);

        /* Detect specific models for error handling */
        if (of_device_is_compatible(np, "ibm,p7ioc-pciex"))
                phb->model = PNV_PHB_MODEL_P7IOC;
        else if (of_device_is_compatible(np, "ibm,power8-pciex"))
                phb->model = PNV_PHB_MODEL_PHB3;
        else
                phb->model = PNV_PHB_MODEL_UNKNOWN;

        /* Parse 32-bit and IO ranges (if any) */
        pci_process_bridge_OF_ranges(hose, np, !hose->global_number);

        /* Get registers */
        phb->regs = of_iomap(np, 0);
        if (phb->regs == NULL)
                pr_err("  Failed to map registers !\n");

        /* Initialize more IODA stuff */
        phb->ioda.total_pe = 1;
        prop32 = of_get_property(np, "ibm,opal-num-pes", NULL);
        if (prop32)
                phb->ioda.total_pe = be32_to_cpup(prop32);
        prop32 = of_get_property(np, "ibm,opal-reserved-pe", NULL);
        if (prop32)
                phb->ioda.reserved_pe = be32_to_cpup(prop32);

        /* Parse 64-bit MMIO range */
        pnv_ioda_parse_m64_window(phb);

        phb->ioda.m32_size = resource_size(&hose->mem_resources[0]);
        /* FW has already carved off the top 64K of M32 space (MSI space) */
        phb->ioda.m32_size += 0x10000;

        phb->ioda.m32_segsize = phb->ioda.m32_size / phb->ioda.total_pe;
        phb->ioda.m32_pci_base = hose->mem_resources[0].start - hose->mem_offset[0];
        phb->ioda.io_size = hose->pci_io_size;
        phb->ioda.io_segsize = phb->ioda.io_size / phb->ioda.total_pe;
        phb->ioda.io_pci_base = 0; /* XXX calculate this ? */
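
        /*
         * Sizing example (numbers assumed for illustration): a 2GB M32
         * window split across 256 PEs yields an 8MB m32_segsize, so each
         * PE owns one 8MB slice of the 32-bit MMIO space.
         */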

        /* Allocate aux data & arrays. We don't have IO ports on PHB3 */
        size = _ALIGN_UP(phb->ioda.total_pe / 8, sizeof(unsigned long));
        m32map_off = size;
        size += phb->ioda.total_pe * sizeof(phb->ioda.m32_segmap[0]);
        if (phb->type == PNV_PHB_IODA1) {
                iomap_off = size;
                size += phb->ioda.total_pe * sizeof(phb->ioda.io_segmap[0]);
        }
        pemap_off = size;
        size += phb->ioda.total_pe * sizeof(struct pnv_ioda_pe);
        aux = memblock_virt_alloc(size, 0);
        phb->ioda.pe_alloc = aux;
        phb->ioda.m32_segmap = aux + m32map_off;
        if (phb->type == PNV_PHB_IODA1)
                phb->ioda.io_segmap = aux + iomap_off;
        phb->ioda.pe_array = aux + pemap_off;
        set_bit(phb->ioda.reserved_pe, phb->ioda.pe_alloc);
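
        /*
         * Resulting layout of the single aux allocation (IODA1 shown;
         * IODA2 has no io_segmap):
         *
         *   aux + 0          : pe_alloc   (PE allocation bitmap)
         *   aux + m32map_off : m32_segmap (M32 segment -> PE number)
         *   aux + iomap_off  : io_segmap  (IO segment  -> PE number)
         *   aux + pemap_off  : pe_array   (array of struct pnv_ioda_pe)
         */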

        INIT_LIST_HEAD(&phb->ioda.pe_dma_list);
        INIT_LIST_HEAD(&phb->ioda.pe_list);
        mutex_init(&phb->ioda.pe_list_mutex);

        /* Calculate how many 32-bit TCE segments we have */
        phb->ioda.tce32_count = phb->ioda.m32_pci_base >> 28;
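
        /*
         * Each TCE32 segment covers 256MB (1 << 28) of DMA space below
         * the M32 PCI base; e.g. (value assumed) an m32_pci_base of
         * 0x80000000 gives 0x80000000 >> 28 = 8 segments.
         */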

#if 0 /* We should really do that ... */
        rc = opal_pci_set_phb_mem_window(opal->phb_id,
                                         window_type,
                                         window_num,
                                         starting_real_address,
                                         starting_pci_address,
                                         segment_size);
#endif

        pr_info("  %03d (%03d) PE's M32: 0x%x [segment=0x%x]\n",
                phb->ioda.total_pe, phb->ioda.reserved_pe,
                phb->ioda.m32_size, phb->ioda.m32_segsize);
        if (phb->ioda.m64_size)
                pr_info("                 M64: 0x%lx [segment=0x%lx]\n",
                        phb->ioda.m64_size, phb->ioda.m64_segsize);
        if (phb->ioda.io_size)
                pr_info("                  IO: 0x%x [segment=0x%x]\n",
                        phb->ioda.io_size, phb->ioda.io_segsize);

        phb->hose->ops = &pnv_pci_ops;
        phb->get_pe_state = pnv_ioda_get_pe_state;
        phb->freeze_pe = pnv_ioda_freeze_pe;
        phb->unfreeze_pe = pnv_ioda_unfreeze_pe;

        /* Setup RID -> PE mapping function */
        phb->bdfn_to_pe = pnv_ioda_bdfn_to_pe;

        /* Setup TCEs */
        phb->dma_dev_setup = pnv_pci_ioda_dma_dev_setup;

        /* Setup MSI support */
        pnv_pci_init_ioda_msis(phb);

        /*
         * We pass the PCI probe flag PCI_REASSIGN_ALL_RSRC here to
         * let the PCI core do the resource assignment. The PCI core
         * is expected to apply the correct I/O and MMIO alignment to
         * the P2P bridge BARs so that each PCI bus (excluding the
         * child P2P bridges) can form an individual PE.
         */
        ppc_md.pcibios_fixup = pnv_pci_ioda_fixup;
        hose->controller_ops = pnv_pci_ioda_controller_ops;

#ifdef CONFIG_PCI_IOV
        ppc_md.pcibios_fixup_sriov = pnv_pci_ioda_fixup_iov_resources;
        ppc_md.pcibios_iov_resource_alignment = pnv_pci_iov_resource_alignment;
#endif

        pci_add_flags(PCI_REASSIGN_ALL_RSRC);

        /* Reset IODA tables to a clean state */
        rc = opal_pci_reset(phb_id, OPAL_RESET_PCI_IODA_TABLE, OPAL_ASSERT_RESET);
        if (rc)
                pr_warn("  OPAL Error %ld performing IODA table reset !\n", rc);

        /*
         * If we're running in a kdump kernel, the previous kernel never
         * shut down PCI devices correctly, and the IODA tables have just
         * been cleaned out above. Issue a PHB reset to stop all PCI
         * transactions left over from the previous kernel.
         */
        if (is_kdump_kernel()) {
                pr_info("  Issue PHB reset ...\n");
                pnv_eeh_phb_reset(hose, EEH_RESET_FUNDAMENTAL);
                pnv_eeh_phb_reset(hose, EEH_RESET_DEACTIVATE);
        }

        /* Remove M64 resource if we can't configure it successfully */
        if (!phb->init_m64 || phb->init_m64(phb))
                hose->mem_resources[1].flags = 0;
}

void __init pnv_pci_init_ioda2_phb(struct device_node *np)
{
        pnv_pci_init_ioda_phb(np, 0, PNV_PHB_IODA2);
}

void __init pnv_pci_init_ioda_hub(struct device_node *np)
{
        struct device_node *phbn;
        const __be64 *prop64;
        u64 hub_id;

        pr_info("Probing IODA IO-Hub %s\n", np->full_name);

        prop64 = of_get_property(np, "ibm,opal-hubid", NULL);
        if (!prop64) {
                pr_err(" Missing \"ibm,opal-hubid\" property !\n");
                return;
        }
        hub_id = be64_to_cpup(prop64);
        pr_devel(" HUB-ID : 0x%016llx\n", hub_id);

        /* Count child PHBs */
        for_each_child_of_node(np, phbn) {
                /* Look for IODA1 PHBs */
                if (of_device_is_compatible(phbn, "ibm,ioda-phb"))
                        pnv_pci_init_ioda_phb(phbn, hub_id, PNV_PHB_IODA1);
        }
}