linux/arch/powerpc/platforms/pseries/iommu.c
   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/*
   3 * Copyright (C) 2001 Mike Corrigan & Dave Engebretsen, IBM Corporation
   4 *
   5 * Rewrite, cleanup:
   6 *
   7 * Copyright (C) 2004 Olof Johansson <olof@lixom.net>, IBM Corporation
   8 * Copyright (C) 2006 Olof Johansson <olof@lixom.net>
   9 *
  10 * Dynamic DMA mapping support, pSeries-specific parts, both SMP and LPAR.
  11 */
  12
  13#include <linux/init.h>
  14#include <linux/types.h>
  15#include <linux/slab.h>
  16#include <linux/mm.h>
  17#include <linux/memblock.h>
  18#include <linux/spinlock.h>
  19#include <linux/string.h>
  20#include <linux/pci.h>
  21#include <linux/dma-mapping.h>
  22#include <linux/crash_dump.h>
  23#include <linux/memory.h>
  24#include <linux/of.h>
  25#include <linux/iommu.h>
  26#include <linux/rculist.h>
  27#include <asm/io.h>
  28#include <asm/prom.h>
  29#include <asm/rtas.h>
  30#include <asm/iommu.h>
  31#include <asm/pci-bridge.h>
  32#include <asm/machdep.h>
  33#include <asm/firmware.h>
  34#include <asm/tce.h>
  35#include <asm/ppc-pci.h>
  36#include <asm/udbg.h>
  37#include <asm/mmzone.h>
  38#include <asm/plpar_wrappers.h>
  39
  40#include "pseries.h"
  41
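     /*
      * Indices into the "ibm,ddw-applicable" device-tree property, which
      * holds the RTAS tokens for the query/create/remove PE DMA window
      * calls, and into "ibm,ddw-extensions", whose first cell is the
      * number of extension cells that follow.
      */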
  42enum {
  43        DDW_QUERY_PE_DMA_WIN  = 0,
  44        DDW_CREATE_PE_DMA_WIN = 1,
  45        DDW_REMOVE_PE_DMA_WIN = 2,
  46
  47        DDW_APPLICABLE_SIZE
  48};
  49
  50enum {
  51        DDW_EXT_SIZE = 0,
  52        DDW_EXT_RESET_DMA_WIN = 1,
  53        DDW_EXT_QUERY_OUT_SIZE = 2
  54};
  55
  56static struct iommu_table *iommu_pseries_alloc_table(int node)
  57{
  58        struct iommu_table *tbl;
  59
  60        tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, node);
  61        if (!tbl)
  62                return NULL;
  63
  64        INIT_LIST_HEAD_RCU(&tbl->it_group_list);
  65        kref_init(&tbl->it_kref);
  66        return tbl;
  67}
  68
  69static struct iommu_table_group *iommu_pseries_alloc_group(int node)
  70{
  71        struct iommu_table_group *table_group;
  72
  73        table_group = kzalloc_node(sizeof(*table_group), GFP_KERNEL, node);
  74        if (!table_group)
  75                return NULL;
  76
  77        table_group->tables[0] = iommu_pseries_alloc_table(node);
  78        if (table_group->tables[0])
  79                return table_group;
  80
  81        kfree(table_group);
  82        return NULL;
  83}
  84
  85static void iommu_pseries_free_group(struct iommu_table_group *table_group,
  86                const char *node_name)
  87{
  88        struct iommu_table *tbl;
  89
  90        if (!table_group)
  91                return;
  92
  93        tbl = table_group->tables[0];
  94#ifdef CONFIG_IOMMU_API
  95        if (table_group->group) {
  96                iommu_group_put(table_group->group);
  97                BUG_ON(table_group->group);
  98        }
  99#endif
 100        iommu_tce_table_put(tbl);
 101
 102        kfree(table_group);
 103}
 104
 105static int tce_build_pSeries(struct iommu_table *tbl, long index,
 106                              long npages, unsigned long uaddr,
 107                              enum dma_data_direction direction,
 108                              unsigned long attrs)
 109{
 110        u64 proto_tce;
 111        __be64 *tcep;
 112        u64 rpn;
 113        const unsigned long tceshift = tbl->it_page_shift;
 114        const unsigned long pagesize = IOMMU_PAGE_SIZE(tbl);
 115
 116        proto_tce = TCE_PCI_READ; // Read allowed
 117
 118        if (direction != DMA_TO_DEVICE)
 119                proto_tce |= TCE_PCI_WRITE;
 120
 121        tcep = ((__be64 *)tbl->it_base) + index;
 122
 123        while (npages--) {
 124                /* can't move this out since we might cross MEMBLOCK boundary */
 125                rpn = __pa(uaddr) >> tceshift;
 126                *tcep = cpu_to_be64(proto_tce | rpn << tceshift);
 127
 128                uaddr += pagesize;
 129                tcep++;
 130        }
 131        return 0;
 132}
  133
 135static void tce_free_pSeries(struct iommu_table *tbl, long index, long npages)
 136{
 137        __be64 *tcep;
 138
 139        tcep = ((__be64 *)tbl->it_base) + index;
 140
 141        while (npages--)
 142                *(tcep++) = 0;
 143}
 144
 145static unsigned long tce_get_pseries(struct iommu_table *tbl, long index)
 146{
 147        __be64 *tcep;
 148
 149        tcep = ((__be64 *)tbl->it_base) + index;
 150
 151        return be64_to_cpu(*tcep);
 152}
 153
 154static void tce_free_pSeriesLP(unsigned long liobn, long, long, long);
 155static void tce_freemulti_pSeriesLP(struct iommu_table*, long, long);
 156
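     /*
      * Map @npages pages starting at @uaddr with one H_PUT_TCE hcall per
      * TCE.  If the hypervisor returns H_NOT_ENOUGH_RESOURCES part-way
      * through, the TCEs installed so far are torn down again before the
      * error is returned.
      */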
 157static int tce_build_pSeriesLP(unsigned long liobn, long tcenum, long tceshift,
 158                                long npages, unsigned long uaddr,
 159                                enum dma_data_direction direction,
 160                                unsigned long attrs)
 161{
 162        u64 rc = 0;
 163        u64 proto_tce, tce;
 164        u64 rpn;
 165        int ret = 0;
 166        long tcenum_start = tcenum, npages_start = npages;
 167
 168        rpn = __pa(uaddr) >> tceshift;
 169        proto_tce = TCE_PCI_READ;
 170        if (direction != DMA_TO_DEVICE)
 171                proto_tce |= TCE_PCI_WRITE;
 172
 173        while (npages--) {
 174                tce = proto_tce | rpn << tceshift;
 175                rc = plpar_tce_put((u64)liobn, (u64)tcenum << tceshift, tce);
 176
 177                if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) {
 178                        ret = (int)rc;
 179                        tce_free_pSeriesLP(liobn, tcenum_start, tceshift,
 180                                           (npages_start - (npages + 1)));
 181                        break;
 182                }
 183
 184                if (rc && printk_ratelimit()) {
 185                        printk("tce_build_pSeriesLP: plpar_tce_put failed. rc=%lld\n", rc);
 186                        printk("\tindex   = 0x%llx\n", (u64)liobn);
 187                        printk("\ttcenum  = 0x%llx\n", (u64)tcenum);
  188                        printk("\ttce val = 0x%llx\n", tce);
 189                        dump_stack();
 190                }
 191
 192                tcenum++;
 193                rpn++;
 194        }
 195        return ret;
 196}
 197
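     /*
      * Per-CPU page used to batch TCEs for H_PUT_TCE_INDIRECT, which takes
      * the physical address of a page of TCEs and installs up to
      * 4096/TCE_ENTRY_SIZE (512) of them in a single hcall.  The page is
      * only touched with interrupts disabled, which is what makes the
      * lockless __this_cpu accesses safe.
      */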
 198static DEFINE_PER_CPU(__be64 *, tce_page);
 199
 200static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
 201                                     long npages, unsigned long uaddr,
 202                                     enum dma_data_direction direction,
 203                                     unsigned long attrs)
 204{
 205        u64 rc = 0;
 206        u64 proto_tce;
 207        __be64 *tcep;
 208        u64 rpn;
 209        long l, limit;
 210        long tcenum_start = tcenum, npages_start = npages;
 211        int ret = 0;
 212        unsigned long flags;
 213        const unsigned long tceshift = tbl->it_page_shift;
 214
 215        if ((npages == 1) || !firmware_has_feature(FW_FEATURE_PUT_TCE_IND)) {
 216                return tce_build_pSeriesLP(tbl->it_index, tcenum,
 217                                           tceshift, npages, uaddr,
 218                                           direction, attrs);
 219        }
 220
 221        local_irq_save(flags);  /* to protect tcep and the page behind it */
 222
 223        tcep = __this_cpu_read(tce_page);
 224
 225        /* This is safe to do since interrupts are off when we're called
 226         * from iommu_alloc{,_sg}()
 227         */
 228        if (!tcep) {
 229                tcep = (__be64 *)__get_free_page(GFP_ATOMIC);
 230                /* If allocation fails, fall back to the loop implementation */
 231                if (!tcep) {
 232                        local_irq_restore(flags);
 233                        return tce_build_pSeriesLP(tbl->it_index, tcenum,
 234                                        tceshift,
 235                                        npages, uaddr, direction, attrs);
 236                }
 237                __this_cpu_write(tce_page, tcep);
 238        }
 239
 240        rpn = __pa(uaddr) >> tceshift;
 241        proto_tce = TCE_PCI_READ;
 242        if (direction != DMA_TO_DEVICE)
 243                proto_tce |= TCE_PCI_WRITE;
 244
 245        /* We can map max one pageful of TCEs at a time */
 246        do {
 247                /*
 248                 * Set up the page with TCE data, looping through and setting
 249                 * the values.
 250                 */
 251                limit = min_t(long, npages, 4096/TCE_ENTRY_SIZE);
 252
 253                for (l = 0; l < limit; l++) {
 254                        tcep[l] = cpu_to_be64(proto_tce | rpn << tceshift);
 255                        rpn++;
 256                }
 257
 258                rc = plpar_tce_put_indirect((u64)tbl->it_index,
 259                                            (u64)tcenum << tceshift,
 260                                            (u64)__pa(tcep),
 261                                            limit);
 262
 263                npages -= limit;
 264                tcenum += limit;
 265        } while (npages > 0 && !rc);
 266
 267        local_irq_restore(flags);
 268
 269        if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) {
 270                ret = (int)rc;
 271                tce_freemulti_pSeriesLP(tbl, tcenum_start,
 272                                        (npages_start - (npages + limit)));
 273                return ret;
 274        }
 275
 276        if (rc && printk_ratelimit()) {
 277                printk("tce_buildmulti_pSeriesLP: plpar_tce_put failed. rc=%lld\n", rc);
 278                printk("\tindex   = 0x%llx\n", (u64)tbl->it_index);
 279                printk("\tnpages  = 0x%llx\n", (u64)npages);
 280                printk("\ttce[0] val = 0x%llx\n", tcep[0]);
 281                dump_stack();
 282        }
 283        return ret;
 284}
 285
 286static void tce_free_pSeriesLP(unsigned long liobn, long tcenum, long tceshift,
 287                               long npages)
 288{
 289        u64 rc;
 290
 291        while (npages--) {
 292                rc = plpar_tce_put((u64)liobn, (u64)tcenum << tceshift, 0);
 293
 294                if (rc && printk_ratelimit()) {
 295                        printk("tce_free_pSeriesLP: plpar_tce_put failed. rc=%lld\n", rc);
 296                        printk("\tindex   = 0x%llx\n", (u64)liobn);
 297                        printk("\ttcenum  = 0x%llx\n", (u64)tcenum);
 298                        dump_stack();
 299                }
 300
 301                tcenum++;
 302        }
 303}
  304
 306static void tce_freemulti_pSeriesLP(struct iommu_table *tbl, long tcenum, long npages)
 307{
 308        u64 rc;
 309
 310        if (!firmware_has_feature(FW_FEATURE_STUFF_TCE))
 311                return tce_free_pSeriesLP(tbl->it_index, tcenum,
 312                                          tbl->it_page_shift, npages);
 313
 314        rc = plpar_tce_stuff((u64)tbl->it_index,
 315                             (u64)tcenum << tbl->it_page_shift, 0, npages);
 316
 317        if (rc && printk_ratelimit()) {
 318                printk("tce_freemulti_pSeriesLP: plpar_tce_stuff failed\n");
 319                printk("\trc      = %lld\n", rc);
 320                printk("\tindex   = 0x%llx\n", (u64)tbl->it_index);
 321                printk("\tnpages  = 0x%llx\n", (u64)npages);
 322                dump_stack();
 323        }
 324}
 325
 326static unsigned long tce_get_pSeriesLP(struct iommu_table *tbl, long tcenum)
 327{
 328        u64 rc;
 329        unsigned long tce_ret;
 330
 331        rc = plpar_tce_get((u64)tbl->it_index,
 332                           (u64)tcenum << tbl->it_page_shift, &tce_ret);
 333
 334        if (rc && printk_ratelimit()) {
 335                printk("tce_get_pSeriesLP: plpar_tce_get failed. rc=%lld\n", rc);
 336                printk("\tindex   = 0x%llx\n", (u64)tbl->it_index);
 337                printk("\ttcenum  = 0x%llx\n", (u64)tcenum);
 338                dump_stack();
 339        }
 340
 341        return tce_ret;
 342}
 343
  344/* layout matches the cells of the device-tree property, hence the __be fields */
 345struct dynamic_dma_window_prop {
 346        __be32  liobn;          /* tce table number */
 347        __be64  dma_base;       /* address hi,lo */
 348        __be32  tce_shift;      /* ilog2(tce_page_size) */
 349        __be32  window_shift;   /* ilog2(tce_window_size) */
 350};
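     /*
      * For example (illustrative values only): a 2^33-byte window of 64K
      * TCEs based at DMA address 0x8_0000_0000 would be encoded as the
      * cells <liobn 0x8 0x0 0x10 0x21>.
      */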
 351
 352struct dma_win {
 353        struct device_node *device;
 354        const struct dynamic_dma_window_prop *prop;
 355        struct list_head list;
 356};
 357
 358/* Dynamic DMA Window support */
 359struct ddw_query_response {
 360        u32 windows_available;
 361        u64 largest_available_block;
 362        u32 page_size;
 363        u32 migration_capable;
 364};
 365
 366struct ddw_create_response {
 367        u32 liobn;
 368        u32 addr_hi;
 369        u32 addr_lo;
 370};
 371
 372static LIST_HEAD(dma_win_list);
 373/* prevents races between memory on/offline and window creation */
 374static DEFINE_SPINLOCK(dma_win_list_lock);
 375/* protects initializing window twice for same device */
 376static DEFINE_MUTEX(dma_win_init_mutex);
 377#define DIRECT64_PROPNAME "linux,direct64-ddr-window-info"
 378#define DMA64_PROPNAME "linux,dma64-ddr-window-info"
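     /*
      * DIRECT64_PROPNAME marks a window large enough to map all of RAM at a
      * linear offset (direct mapping); DMA64_PROPNAME marks a smaller 64-bit
      * window that is still used through the IOMMU.  Both are Linux-internal
      * property names, which also lets a new kernel rediscover windows left
      * behind across kexec.
      */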
 379
 380static int tce_clearrange_multi_pSeriesLP(unsigned long start_pfn,
 381                                        unsigned long num_pfn, const void *arg)
 382{
 383        const struct dynamic_dma_window_prop *maprange = arg;
 384        int rc;
 385        u64 tce_size, num_tce, dma_offset, next;
 386        u32 tce_shift;
 387        long limit;
 388
 389        tce_shift = be32_to_cpu(maprange->tce_shift);
 390        tce_size = 1ULL << tce_shift;
 391        next = start_pfn << PAGE_SHIFT;
 392        num_tce = num_pfn << PAGE_SHIFT;
 393
 394        /* round back to the beginning of the tce page size */
 395        num_tce += next & (tce_size - 1);
 396        next &= ~(tce_size - 1);
 397
  398        /* convert to number of TCEs */
 399        num_tce |= tce_size - 1;
 400        num_tce >>= tce_shift;
 401
 402        do {
 403                /*
 404                 * Set up the page with TCE data, looping through and setting
 405                 * the values.
 406                 */
 407                limit = min_t(long, num_tce, 512);
 408                dma_offset = next + be64_to_cpu(maprange->dma_base);
 409
 410                rc = plpar_tce_stuff((u64)be32_to_cpu(maprange->liobn),
 411                                             dma_offset,
 412                                             0, limit);
 413                next += limit * tce_size;
 414                num_tce -= limit;
 415        } while (num_tce > 0 && !rc);
 416
 417        return rc;
 418}
 419
 420static int tce_setrange_multi_pSeriesLP(unsigned long start_pfn,
 421                                        unsigned long num_pfn, const void *arg)
 422{
 423        const struct dynamic_dma_window_prop *maprange = arg;
 424        u64 tce_size, num_tce, dma_offset, next, proto_tce, liobn;
 425        __be64 *tcep;
 426        u32 tce_shift;
 427        u64 rc = 0;
 428        long l, limit;
 429
 430        if (!firmware_has_feature(FW_FEATURE_PUT_TCE_IND)) {
 431                unsigned long tceshift = be32_to_cpu(maprange->tce_shift);
 432                unsigned long dmastart = (start_pfn << PAGE_SHIFT) +
 433                                be64_to_cpu(maprange->dma_base);
 434                unsigned long tcenum = dmastart >> tceshift;
 435                unsigned long npages = num_pfn << PAGE_SHIFT >> tceshift;
 436                void *uaddr = __va(start_pfn << PAGE_SHIFT);
 437
 438                return tce_build_pSeriesLP(be32_to_cpu(maprange->liobn),
 439                                tcenum, tceshift, npages, (unsigned long) uaddr,
 440                                DMA_BIDIRECTIONAL, 0);
 441        }
 442
 443        local_irq_disable();    /* to protect tcep and the page behind it */
 444        tcep = __this_cpu_read(tce_page);
 445
 446        if (!tcep) {
 447                tcep = (__be64 *)__get_free_page(GFP_ATOMIC);
 448                if (!tcep) {
 449                        local_irq_enable();
 450                        return -ENOMEM;
 451                }
 452                __this_cpu_write(tce_page, tcep);
 453        }
 454
 455        proto_tce = TCE_PCI_READ | TCE_PCI_WRITE;
 456
 457        liobn = (u64)be32_to_cpu(maprange->liobn);
 458        tce_shift = be32_to_cpu(maprange->tce_shift);
 459        tce_size = 1ULL << tce_shift;
 460        next = start_pfn << PAGE_SHIFT;
 461        num_tce = num_pfn << PAGE_SHIFT;
 462
 463        /* round back to the beginning of the tce page size */
 464        num_tce += next & (tce_size - 1);
 465        next &= ~(tce_size - 1);
 466
  467        /* convert to number of TCEs */
 468        num_tce |= tce_size - 1;
 469        num_tce >>= tce_shift;
 470
 471        /* We can map max one pageful of TCEs at a time */
 472        do {
 473                /*
 474                 * Set up the page with TCE data, looping through and setting
 475                 * the values.
 476                 */
 477                limit = min_t(long, num_tce, 4096/TCE_ENTRY_SIZE);
 478                dma_offset = next + be64_to_cpu(maprange->dma_base);
 479
 480                for (l = 0; l < limit; l++) {
 481                        tcep[l] = cpu_to_be64(proto_tce | next);
 482                        next += tce_size;
 483                }
 484
 485                rc = plpar_tce_put_indirect(liobn,
 486                                            dma_offset,
 487                                            (u64)__pa(tcep),
 488                                            limit);
 489
 490                num_tce -= limit;
 491        } while (num_tce > 0 && !rc);
 492
 493        /* error cleanup: caller will clear whole range */
 494
 495        local_irq_enable();
 496        return rc;
 497}
 498
 499static int tce_setrange_multi_pSeriesLP_walk(unsigned long start_pfn,
 500                unsigned long num_pfn, void *arg)
 501{
 502        return tce_setrange_multi_pSeriesLP(start_pfn, num_pfn, arg);
 503}
 504
 505static void iommu_table_setparms_common(struct iommu_table *tbl, unsigned long busno,
 506                                        unsigned long liobn, unsigned long win_addr,
 507                                        unsigned long window_size, unsigned long page_shift,
 508                                        void *base, struct iommu_table_ops *table_ops)
 509{
 510        tbl->it_busno = busno;
 511        tbl->it_index = liobn;
 512        tbl->it_offset = win_addr >> page_shift;
 513        tbl->it_size = window_size >> page_shift;
 514        tbl->it_page_shift = page_shift;
 515        tbl->it_base = (unsigned long)base;
 516        tbl->it_blocksize = 16;
 517        tbl->it_type = TCE_PCI;
 518        tbl->it_ops = table_ops;
 519}
 520
 521struct iommu_table_ops iommu_table_pseries_ops;
 522
 523static void iommu_table_setparms(struct pci_controller *phb,
 524                                 struct device_node *dn,
 525                                 struct iommu_table *tbl)
 526{
 527        struct device_node *node;
 528        const unsigned long *basep;
 529        const u32 *sizep;
 530
 531        /* Test if we are going over 2GB of DMA space */
 532        if (phb->dma_window_base_cur + phb->dma_window_size > SZ_2G) {
 533                udbg_printf("PCI_DMA: Unexpected number of IOAs under this PHB.\n");
 534                panic("PCI_DMA: Unexpected number of IOAs under this PHB.\n");
 535        }
 536
 537        node = phb->dn;
 538        basep = of_get_property(node, "linux,tce-base", NULL);
 539        sizep = of_get_property(node, "linux,tce-size", NULL);
 540        if (basep == NULL || sizep == NULL) {
  541                printk(KERN_ERR "PCI_DMA: iommu_table_setparms: %pOF is "
  542                                "missing tce entries!\n", dn);
 543                return;
 544        }
 545
 546        iommu_table_setparms_common(tbl, phb->bus->number, 0, phb->dma_window_base_cur,
 547                                    phb->dma_window_size, IOMMU_PAGE_SHIFT_4K,
 548                                    __va(*basep), &iommu_table_pseries_ops);
 549
 550        if (!is_kdump_kernel())
 551                memset((void *)tbl->it_base, 0, *sizep);
 552
 553        phb->dma_window_base_cur += phb->dma_window_size;
 554}
 555
 556struct iommu_table_ops iommu_table_lpar_multi_ops;
 557
 558/*
 559 * iommu_table_setparms_lpar
 560 *
 561 * Function: On pSeries LPAR systems, return TCE table info, given a pci bus.
 562 */
 563static void iommu_table_setparms_lpar(struct pci_controller *phb,
 564                                      struct device_node *dn,
 565                                      struct iommu_table *tbl,
 566                                      struct iommu_table_group *table_group,
 567                                      const __be32 *dma_window)
 568{
 569        unsigned long offset, size, liobn;
 570
 571        of_parse_dma_window(dn, dma_window, &liobn, &offset, &size);
 572
  573        iommu_table_setparms_common(tbl, phb->bus->number, liobn, offset, size,
  574                                    IOMMU_PAGE_SHIFT_4K, NULL,
  575                                    &iommu_table_lpar_multi_ops);
  576
 577        table_group->tce32_start = offset;
 578        table_group->tce32_size = size;
 579}
 580
 581struct iommu_table_ops iommu_table_pseries_ops = {
 582        .set = tce_build_pSeries,
 583        .clear = tce_free_pSeries,
 584        .get = tce_get_pseries
 585};
 586
 587static void pci_dma_bus_setup_pSeries(struct pci_bus *bus)
 588{
 589        struct device_node *dn;
 590        struct iommu_table *tbl;
 591        struct device_node *isa_dn, *isa_dn_orig;
 592        struct device_node *tmp;
 593        struct pci_dn *pci;
 594        int children;
 595
 596        dn = pci_bus_to_OF_node(bus);
 597
 598        pr_debug("pci_dma_bus_setup_pSeries: setting up bus %pOF\n", dn);
 599
 600        if (bus->self) {
 601                /* This is not a root bus, any setup will be done for the
 602                 * device-side of the bridge in iommu_dev_setup_pSeries().
 603                 */
 604                return;
 605        }
 606        pci = PCI_DN(dn);
 607
 608        /* Check if the ISA bus on the system is under
 609         * this PHB.
 610         */
 611        isa_dn = isa_dn_orig = of_find_node_by_type(NULL, "isa");
 612
 613        while (isa_dn && isa_dn != dn)
 614                isa_dn = isa_dn->parent;
 615
 616        of_node_put(isa_dn_orig);
 617
 618        /* Count number of direct PCI children of the PHB. */
 619        for (children = 0, tmp = dn->child; tmp; tmp = tmp->sibling)
 620                children++;
 621
 622        pr_debug("Children: %d\n", children);
 623
 624        /* Calculate amount of DMA window per slot. Each window must be
 625         * a power of two (due to pci_alloc_consistent requirements).
 626         *
 627         * Keep 256MB aside for PHBs with ISA.
 628         */
 629
 630        if (!isa_dn) {
 631                /* No ISA/IDE - just set window size and return */
 632                pci->phb->dma_window_size = 0x80000000ul; /* To be divided */
 633
 634                while (pci->phb->dma_window_size * children > 0x80000000ul)
 635                        pci->phb->dma_window_size >>= 1;
 636                pr_debug("No ISA/IDE, window size is 0x%llx\n",
 637                         pci->phb->dma_window_size);
 638                pci->phb->dma_window_base_cur = 0;
 639
 640                return;
 641        }
 642
 643        /* If we have ISA, then we probably have an IDE
 644         * controller too. Allocate a 128MB table but
 645         * skip the first 128MB to avoid stepping on ISA
 646         * space.
 647         */
 648        pci->phb->dma_window_size = 0x8000000ul;
 649        pci->phb->dma_window_base_cur = 0x8000000ul;
 650
 651        pci->table_group = iommu_pseries_alloc_group(pci->phb->node);
 652        tbl = pci->table_group->tables[0];
 653
 654        iommu_table_setparms(pci->phb, dn, tbl);
 655
 656        if (!iommu_init_table(tbl, pci->phb->node, 0, 0))
 657                panic("Failed to initialize iommu table");
 658
 659        /* Divide the rest (1.75GB) among the children */
 660        pci->phb->dma_window_size = 0x80000000ul;
 661        while (pci->phb->dma_window_size * children > 0x70000000ul)
 662                pci->phb->dma_window_size >>= 1;
 663
 664        pr_debug("ISA/IDE, window size is 0x%llx\n", pci->phb->dma_window_size);
 665}
 666
 667#ifdef CONFIG_IOMMU_API
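     /*
      * Exchange a TCE with a new value under the table's large_pool lock,
      * returning the previous address and direction.  Exposed as the
      * xchg_no_kill table op for IOMMU API users such as VFIO.
      */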
 668static int tce_exchange_pseries(struct iommu_table *tbl, long index, unsigned
 669                                long *tce, enum dma_data_direction *direction,
 670                                bool realmode)
 671{
 672        long rc;
 673        unsigned long ioba = (unsigned long) index << tbl->it_page_shift;
 674        unsigned long flags, oldtce = 0;
 675        u64 proto_tce = iommu_direction_to_tce_perm(*direction);
 676        unsigned long newtce = *tce | proto_tce;
 677
 678        spin_lock_irqsave(&tbl->large_pool.lock, flags);
 679
 680        rc = plpar_tce_get((u64)tbl->it_index, ioba, &oldtce);
 681        if (!rc)
 682                rc = plpar_tce_put((u64)tbl->it_index, ioba, newtce);
 683
 684        if (!rc) {
 685                *direction = iommu_tce_direction(oldtce);
 686                *tce = oldtce & ~(TCE_PCI_READ | TCE_PCI_WRITE);
 687        }
 688
 689        spin_unlock_irqrestore(&tbl->large_pool.lock, flags);
 690
 691        return rc;
 692}
 693#endif
 694
 695struct iommu_table_ops iommu_table_lpar_multi_ops = {
 696        .set = tce_buildmulti_pSeriesLP,
 697#ifdef CONFIG_IOMMU_API
 698        .xchg_no_kill = tce_exchange_pseries,
 699#endif
 700        .clear = tce_freemulti_pSeriesLP,
 701        .get = tce_get_pSeriesLP
 702};
 703
 704static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus)
 705{
 706        struct iommu_table *tbl;
 707        struct device_node *dn, *pdn;
 708        struct pci_dn *ppci;
 709        const __be32 *dma_window = NULL;
 710
 711        dn = pci_bus_to_OF_node(bus);
 712
 713        pr_debug("pci_dma_bus_setup_pSeriesLP: setting up bus %pOF\n",
 714                 dn);
 715
 716        /*
 717         * Find nearest ibm,dma-window (default DMA window), walking up the
 718         * device tree
 719         */
 720        for (pdn = dn; pdn != NULL; pdn = pdn->parent) {
 721                dma_window = of_get_property(pdn, "ibm,dma-window", NULL);
 722                if (dma_window != NULL)
 723                        break;
 724        }
 725
 726        if (dma_window == NULL) {
 727                pr_debug("  no ibm,dma-window property !\n");
 728                return;
 729        }
 730
 731        ppci = PCI_DN(pdn);
 732
 733        pr_debug("  parent is %pOF, iommu_table: 0x%p\n",
 734                 pdn, ppci->table_group);
 735
 736        if (!ppci->table_group) {
 737                ppci->table_group = iommu_pseries_alloc_group(ppci->phb->node);
 738                tbl = ppci->table_group->tables[0];
 739                iommu_table_setparms_lpar(ppci->phb, pdn, tbl,
 740                                ppci->table_group, dma_window);
 741
 742                if (!iommu_init_table(tbl, ppci->phb->node, 0, 0))
 743                        panic("Failed to initialize iommu table");
 744                iommu_register_group(ppci->table_group,
 745                                pci_domain_nr(bus), 0);
 746                pr_debug("  created table: %p\n", ppci->table_group);
 747        }
 748}
  749
 751static void pci_dma_dev_setup_pSeries(struct pci_dev *dev)
 752{
 753        struct device_node *dn;
 754        struct iommu_table *tbl;
 755
 756        pr_debug("pci_dma_dev_setup_pSeries: %s\n", pci_name(dev));
 757
 758        dn = dev->dev.of_node;
 759
 760        /* If we're the direct child of a root bus, then we need to allocate
 761         * an iommu table ourselves. The bus setup code should have setup
 762         * the window sizes already.
 763         */
 764        if (!dev->bus->self) {
 765                struct pci_controller *phb = PCI_DN(dn)->phb;
 766
 767                pr_debug(" --> first child, no bridge. Allocating iommu table.\n");
 768                PCI_DN(dn)->table_group = iommu_pseries_alloc_group(phb->node);
 769                tbl = PCI_DN(dn)->table_group->tables[0];
 770                iommu_table_setparms(phb, dn, tbl);
 771
 772                if (!iommu_init_table(tbl, phb->node, 0, 0))
 773                        panic("Failed to initialize iommu table");
 774
 775                set_iommu_table_base(&dev->dev, tbl);
 776                return;
 777        }
 778
 779        /* If this device is further down the bus tree, search upwards until
 780         * an already allocated iommu table is found and use that.
 781         */
 782
 783        while (dn && PCI_DN(dn) && PCI_DN(dn)->table_group == NULL)
 784                dn = dn->parent;
 785
 786        if (dn && PCI_DN(dn))
 787                set_iommu_table_base(&dev->dev,
 788                                PCI_DN(dn)->table_group->tables[0]);
 789        else
 790                printk(KERN_WARNING "iommu: Device %s has no iommu table\n",
 791                       pci_name(dev));
 792}
 793
 794static int __read_mostly disable_ddw;
 795
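     /*
      * Booting with "disable_ddw" on the kernel command line disables
      * dynamic DMA windows entirely, leaving devices on their default
      * DMA window.
      */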
 796static int __init disable_ddw_setup(char *str)
 797{
 798        disable_ddw = 1;
 799        printk(KERN_INFO "ppc iommu: disabling ddw.\n");
 800
 801        return 0;
 802}
 803
 804early_param("disable_ddw", disable_ddw_setup);
 805
 806static void clean_dma_window(struct device_node *np, struct dynamic_dma_window_prop *dwp)
 807{
 808        int ret;
 809
 810        ret = tce_clearrange_multi_pSeriesLP(0,
 811                1ULL << (be32_to_cpu(dwp->window_shift) - PAGE_SHIFT), dwp);
 812        if (ret)
 813                pr_warn("%pOF failed to clear tces in window.\n",
 814                        np);
 815        else
 816                pr_debug("%pOF successfully cleared tces in window.\n",
 817                         np);
 818}
 819
 820/*
 821 * Call only if DMA window is clean.
 822 */
 823static void __remove_dma_window(struct device_node *np, u32 *ddw_avail, u64 liobn)
 824{
 825        int ret;
 826
 827        ret = rtas_call(ddw_avail[DDW_REMOVE_PE_DMA_WIN], 1, 1, NULL, liobn);
 828        if (ret)
 829                pr_warn("%pOF: failed to remove DMA window: rtas returned "
 830                        "%d to ibm,remove-pe-dma-window(%x) %llx\n",
 831                        np, ret, ddw_avail[DDW_REMOVE_PE_DMA_WIN], liobn);
 832        else
 833                pr_debug("%pOF: successfully removed DMA window: rtas returned "
 834                        "%d to ibm,remove-pe-dma-window(%x) %llx\n",
 835                        np, ret, ddw_avail[DDW_REMOVE_PE_DMA_WIN], liobn);
 836}
 837
 838static void remove_dma_window(struct device_node *np, u32 *ddw_avail,
 839                              struct property *win)
 840{
 841        struct dynamic_dma_window_prop *dwp;
 842        u64 liobn;
 843
 844        dwp = win->value;
 845        liobn = (u64)be32_to_cpu(dwp->liobn);
 846
 847        clean_dma_window(np, dwp);
 848        __remove_dma_window(np, ddw_avail, liobn);
 849}
 850
 851static int remove_ddw(struct device_node *np, bool remove_prop, const char *win_name)
 852{
 853        struct property *win;
 854        u32 ddw_avail[DDW_APPLICABLE_SIZE];
 855        int ret = 0;
 856
 857        win = of_find_property(np, win_name, NULL);
 858        if (!win)
 859                return -EINVAL;
 860
 861        ret = of_property_read_u32_array(np, "ibm,ddw-applicable",
 862                                         &ddw_avail[0], DDW_APPLICABLE_SIZE);
 863        if (ret)
 864                return 0;
  865
 867        if (win->length >= sizeof(struct dynamic_dma_window_prop))
 868                remove_dma_window(np, ddw_avail, win);
 869
 870        if (!remove_prop)
 871                return 0;
 872
 873        ret = of_remove_property(np, win);
 874        if (ret)
 875                pr_warn("%pOF: failed to remove DMA window property: %d\n",
 876                        np, ret);
 877        return 0;
 878}
 879
 880static bool find_existing_ddw(struct device_node *pdn, u64 *dma_addr, int *window_shift)
 881{
 882        struct dma_win *window;
 883        const struct dynamic_dma_window_prop *dma64;
 884        bool found = false;
 885
 886        spin_lock(&dma_win_list_lock);
 887        /* check if we already created a window and dupe that config if so */
 888        list_for_each_entry(window, &dma_win_list, list) {
 889                if (window->device == pdn) {
 890                        dma64 = window->prop;
 891                        *dma_addr = be64_to_cpu(dma64->dma_base);
 892                        *window_shift = be32_to_cpu(dma64->window_shift);
 893                        found = true;
 894                        break;
 895                }
 896        }
 897        spin_unlock(&dma_win_list_lock);
 898
 899        return found;
 900}
 901
 902static struct dma_win *ddw_list_new_entry(struct device_node *pdn,
 903                                          const struct dynamic_dma_window_prop *dma64)
 904{
 905        struct dma_win *window;
 906
 907        window = kzalloc(sizeof(*window), GFP_KERNEL);
 908        if (!window)
 909                return NULL;
 910
 911        window->device = pdn;
 912        window->prop = dma64;
 913
 914        return window;
 915}
 916
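     /*
      * Walk the device tree for DDW properties left behind by a previous
      * kernel (e.g. across kexec/kdump) and re-add the windows they
      * describe to dma_win_list; malformed properties are removed instead.
      */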
 917static void find_existing_ddw_windows_named(const char *name)
 918{
 919        int len;
 920        struct device_node *pdn;
 921        struct dma_win *window;
 922        const struct dynamic_dma_window_prop *dma64;
 923
 924        for_each_node_with_property(pdn, name) {
 925                dma64 = of_get_property(pdn, name, &len);
 926                if (!dma64 || len < sizeof(*dma64)) {
 927                        remove_ddw(pdn, true, name);
 928                        continue;
 929                }
 930
 931                window = ddw_list_new_entry(pdn, dma64);
 932                if (!window)
 933                        break;
 934
 935                spin_lock(&dma_win_list_lock);
 936                list_add(&window->list, &dma_win_list);
 937                spin_unlock(&dma_win_list_lock);
 938        }
 939}
 940
 941static int find_existing_ddw_windows(void)
 942{
 943        if (!firmware_has_feature(FW_FEATURE_LPAR))
 944                return 0;
 945
 946        find_existing_ddw_windows_named(DIRECT64_PROPNAME);
 947        find_existing_ddw_windows_named(DMA64_PROPNAME);
 948
 949        return 0;
 950}
 951machine_arch_initcall(pseries, find_existing_ddw_windows);
 952
  953/**
  954 * ddw_read_ext - Get the value of a DDW extension
  955 * @np:         device node from which the extension value is to be read.
  956 * @extnum:     index number of the extension.
  957 * @value:      pointer to return value, modified when extension is available.
  958 *
  959 * Checks if "ibm,ddw-extensions" exists for this node, and gets the value
  960 * at index 'extnum'. It can also be used just to check that an extension
  961 * exists, by passing value == NULL.
  962 *
  963 * Returns:
  964 *      0 if the extension was successfully read,
  965 *      -EINVAL if "ibm,ddw-extensions" does not exist,
  966 *      -ENODATA if "ibm,ddw-extensions" does not have a value, and
  967 *      -EOVERFLOW if "ibm,ddw-extensions" does not contain this extension.
  968 */
 969static inline int ddw_read_ext(const struct device_node *np, int extnum,
 970                               u32 *value)
 971{
 972        static const char propname[] = "ibm,ddw-extensions";
 973        u32 count;
 974        int ret;
 975
 976        ret = of_property_read_u32_index(np, propname, DDW_EXT_SIZE, &count);
 977        if (ret)
 978                return ret;
 979
 980        if (count < extnum)
 981                return -EOVERFLOW;
 982
 983        if (!value)
 984                value = &count;
 985
 986        return of_property_read_u32_index(np, propname, extnum, value);
 987}
 988
 989static int query_ddw(struct pci_dev *dev, const u32 *ddw_avail,
 990                     struct ddw_query_response *query,
 991                     struct device_node *parent)
 992{
 993        struct device_node *dn;
 994        struct pci_dn *pdn;
 995        u32 cfg_addr, ext_query, query_out[5];
 996        u64 buid;
 997        int ret, out_sz;
 998
  999        /*
 1000         * From LoPAR level 2.8, "ibm,ddw-extensions" index 3 determines how
 1001         * many output parameters ibm,query-pe-dma-windows will have, ranging
 1002         * from 5 to 6.
 1003         */
1004        ret = ddw_read_ext(parent, DDW_EXT_QUERY_OUT_SIZE, &ext_query);
1005        if (!ret && ext_query == 1)
1006                out_sz = 6;
1007        else
1008                out_sz = 5;
1009
1010        /*
1011         * Get the config address and phb buid of the PE window.
1012         * Rely on eeh to retrieve this for us.
1013         * Retrieve them from the pci device, not the node with the
1014         * dma-window property
1015         */
1016        dn = pci_device_to_OF_node(dev);
1017        pdn = PCI_DN(dn);
1018        buid = pdn->phb->buid;
1019        cfg_addr = ((pdn->busno << 16) | (pdn->devfn << 8));
1020
1021        ret = rtas_call(ddw_avail[DDW_QUERY_PE_DMA_WIN], 3, out_sz, query_out,
1022                        cfg_addr, BUID_HI(buid), BUID_LO(buid));
1023        dev_info(&dev->dev, "ibm,query-pe-dma-windows(%x) %x %x %x returned %d\n",
1024                 ddw_avail[DDW_QUERY_PE_DMA_WIN], cfg_addr, BUID_HI(buid),
1025                 BUID_LO(buid), ret);
1026
1027        switch (out_sz) {
1028        case 5:
1029                query->windows_available = query_out[0];
1030                query->largest_available_block = query_out[1];
1031                query->page_size = query_out[2];
1032                query->migration_capable = query_out[3];
1033                break;
1034        case 6:
1035                query->windows_available = query_out[0];
1036                query->largest_available_block = ((u64)query_out[1] << 32) |
1037                                                 query_out[2];
1038                query->page_size = query_out[3];
1039                query->migration_capable = query_out[4];
1040                break;
1041        }
1042
1043        return ret;
1044}
1045
1046static int create_ddw(struct pci_dev *dev, const u32 *ddw_avail,
1047                        struct ddw_create_response *create, int page_shift,
1048                        int window_shift)
1049{
1050        struct device_node *dn;
1051        struct pci_dn *pdn;
1052        u32 cfg_addr;
1053        u64 buid;
1054        int ret;
1055
1056        /*
1057         * Get the config address and phb buid of the PE window.
1058         * Rely on eeh to retrieve this for us.
1059         * Retrieve them from the pci device, not the node with the
1060         * dma-window property
1061         */
1062        dn = pci_device_to_OF_node(dev);
1063        pdn = PCI_DN(dn);
1064        buid = pdn->phb->buid;
1065        cfg_addr = ((pdn->busno << 16) | (pdn->devfn << 8));
1066
1067        do {
1068                /* extra outputs are LIOBN and dma-addr (hi, lo) */
1069                ret = rtas_call(ddw_avail[DDW_CREATE_PE_DMA_WIN], 5, 4,
1070                                (u32 *)create, cfg_addr, BUID_HI(buid),
1071                                BUID_LO(buid), page_shift, window_shift);
1072        } while (rtas_busy_delay(ret));
1073        dev_info(&dev->dev,
1074                "ibm,create-pe-dma-window(%x) %x %x %x %x %x returned %d "
1075                "(liobn = 0x%x starting addr = %x %x)\n",
1076                 ddw_avail[DDW_CREATE_PE_DMA_WIN], cfg_addr, BUID_HI(buid),
1077                 BUID_LO(buid), page_shift, window_shift, ret, create->liobn,
1078                 create->addr_hi, create->addr_lo);
1079
1080        return ret;
1081}
1082
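     /*
      * PEs for which DDW setup has already failed once.  enable_ddw()
      * consults this list and does not retry, since redoing the window
      * would race with in-flight DMA and can trigger EEH errors.
      */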
1083struct failed_ddw_pdn {
1084        struct device_node *pdn;
1085        struct list_head list;
1086};
1087
1088static LIST_HEAD(failed_ddw_pdn_list);
1089
1090static phys_addr_t ddw_memory_hotplug_max(void)
1091{
1092        phys_addr_t max_addr = memory_hotplug_max();
1093        struct device_node *memory;
1094
1095        /*
 1096         * An "ibm,pmemory" region can appear anywhere in the address space.
1097         * Assuming it is still backed by page structs, set the upper limit
1098         * for the huge DMA window as MAX_PHYSMEM_BITS.
1099         */
1100        if (of_find_node_by_type(NULL, "ibm,pmemory"))
1101                return (sizeof(phys_addr_t) * 8 <= MAX_PHYSMEM_BITS) ?
1102                        (phys_addr_t) -1 : (1ULL << MAX_PHYSMEM_BITS);
1103
1104        for_each_node_by_type(memory, "memory") {
1105                unsigned long start, size;
1106                int n_mem_addr_cells, n_mem_size_cells, len;
1107                const __be32 *memcell_buf;
1108
1109                memcell_buf = of_get_property(memory, "reg", &len);
1110                if (!memcell_buf || len <= 0)
1111                        continue;
1112
1113                n_mem_addr_cells = of_n_addr_cells(memory);
1114                n_mem_size_cells = of_n_size_cells(memory);
1115
1116                start = of_read_number(memcell_buf, n_mem_addr_cells);
1117                memcell_buf += n_mem_addr_cells;
1118                size = of_read_number(memcell_buf, n_mem_size_cells);
1119                memcell_buf += n_mem_size_cells;
1120
1121                max_addr = max_t(phys_addr_t, max_addr, start + size);
1122        }
1123
1124        return max_addr;
1125}
1126
1127/*
1128 * Platforms supporting the DDW option starting with LoPAR level 2.7 implement
1129 * ibm,ddw-extensions, which carries the rtas token for
1130 * ibm,reset-pe-dma-windows.
1131 * That rtas-call can be used to restore the default DMA window for the device.
1132 */
1133static void reset_dma_window(struct pci_dev *dev, struct device_node *par_dn)
1134{
1135        int ret;
1136        u32 cfg_addr, reset_dma_win;
1137        u64 buid;
1138        struct device_node *dn;
1139        struct pci_dn *pdn;
1140
1141        ret = ddw_read_ext(par_dn, DDW_EXT_RESET_DMA_WIN, &reset_dma_win);
1142        if (ret)
1143                return;
1144
1145        dn = pci_device_to_OF_node(dev);
1146        pdn = PCI_DN(dn);
1147        buid = pdn->phb->buid;
1148        cfg_addr = (pdn->busno << 16) | (pdn->devfn << 8);
1149
1150        ret = rtas_call(reset_dma_win, 3, 1, NULL, cfg_addr, BUID_HI(buid),
1151                        BUID_LO(buid));
1152        if (ret)
1153                dev_info(&dev->dev,
 1154                         "ibm,reset-pe-dma-windows(%x) %x %x %x returned %d\n",
1155                         reset_dma_win, cfg_addr, BUID_HI(buid), BUID_LO(buid),
1156                         ret);
1157}
1158
1159/* Return largest page shift based on "IO Page Sizes" output of ibm,query-pe-dma-window. */
1160static int iommu_get_page_shift(u32 query_page_size)
1161{
1162        /* Supported IO page-sizes according to LoPAR */
1163        const int shift[] = {
1164                __builtin_ctzll(SZ_4K),   __builtin_ctzll(SZ_64K), __builtin_ctzll(SZ_16M),
1165                __builtin_ctzll(SZ_32M),  __builtin_ctzll(SZ_64M), __builtin_ctzll(SZ_128M),
1166                __builtin_ctzll(SZ_256M), __builtin_ctzll(SZ_16G)
1167        };
1168
1169        int i = ARRAY_SIZE(shift) - 1;
1170
1171        /*
1172         * On LoPAR, ibm,query-pe-dma-window outputs "IO Page Sizes" using a bit field:
1173         * - bit 31 means 4k pages are supported,
1174         * - bit 30 means 64k pages are supported, and so on.
 1175         * Larger page sizes map more memory with the same number of TCEs, so probe the largest first.
1176         */
1177        for (; i >= 0 ; i--) {
1178                if (query_page_size & (1 << i))
1179                        return shift[i];
1180        }
1181
1182        /* No valid page size found. */
1183        return 0;
1184}
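     /*
      * For example, a query value of 0x3 advertises 4K (bit 31) and 64K
      * (bit 30) pages, so iommu_get_page_shift() above returns 16, the
      * shift of the largest supported size.
      */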
1185
1186static struct property *ddw_property_create(const char *propname, u32 liobn, u64 dma_addr,
1187                                            u32 page_shift, u32 window_shift)
1188{
1189        struct dynamic_dma_window_prop *ddwprop;
1190        struct property *win64;
1191
1192        win64 = kzalloc(sizeof(*win64), GFP_KERNEL);
1193        if (!win64)
1194                return NULL;
1195
1196        win64->name = kstrdup(propname, GFP_KERNEL);
1197        ddwprop = kzalloc(sizeof(*ddwprop), GFP_KERNEL);
1198        win64->value = ddwprop;
1199        win64->length = sizeof(*ddwprop);
1200        if (!win64->name || !win64->value) {
1201                kfree(win64->name);
1202                kfree(win64->value);
1203                kfree(win64);
1204                return NULL;
1205        }
1206
1207        ddwprop->liobn = cpu_to_be32(liobn);
1208        ddwprop->dma_base = cpu_to_be64(dma_addr);
1209        ddwprop->tce_shift = cpu_to_be32(page_shift);
1210        ddwprop->window_shift = cpu_to_be32(window_shift);
1211
1212        return win64;
1213}
1214
1215/*
1216 * If the PE supports dynamic dma windows, and there is space for a table
1217 * that can map all pages in a linear offset, then setup such a table,
1218 * and record the dma-offset in the struct device.
1219 *
1220 * dev: the pci device we are checking
 1221 * pdn: the parent pe node with the ibm,dma-window property
1222 * Future: also check if we can remap the base window for our base page size
1223 *
 1224 * returns true if we can map all pages (direct mapping), false otherwise.
1225 */
1226static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn)
1227{
1228        int len = 0, ret;
1229        int max_ram_len = order_base_2(ddw_memory_hotplug_max());
1230        struct ddw_query_response query;
1231        struct ddw_create_response create;
1232        int page_shift;
1233        u64 win_addr;
1234        const char *win_name;
1235        struct device_node *dn;
1236        u32 ddw_avail[DDW_APPLICABLE_SIZE];
1237        struct dma_win *window;
1238        struct property *win64;
1239        bool ddw_enabled = false;
1240        struct failed_ddw_pdn *fpdn;
1241        bool default_win_removed = false, direct_mapping = false;
1242        bool pmem_present;
1243        struct pci_dn *pci = PCI_DN(pdn);
1244        struct iommu_table *tbl = pci->table_group->tables[0];
1245
1246        dn = of_find_node_by_type(NULL, "ibm,pmemory");
1247        pmem_present = dn != NULL;
1248        of_node_put(dn);
1249
1250        mutex_lock(&dma_win_init_mutex);
1251
1252        if (find_existing_ddw(pdn, &dev->dev.archdata.dma_offset, &len)) {
1253                direct_mapping = (len >= max_ram_len);
1254                ddw_enabled = true;
1255                goto out_unlock;
1256        }
1257
1258        /*
1259         * If we already went through this for a previous function of
1260         * the same device and failed, we don't want to muck with the
1261         * DMA window again, as it will race with in-flight operations
1262         * and can lead to EEHs. The above mutex protects access to the
1263         * list.
1264         */
1265        list_for_each_entry(fpdn, &failed_ddw_pdn_list, list) {
1266                if (fpdn->pdn == pdn)
1267                        goto out_unlock;
1268        }
1269
1270        /*
1271         * the ibm,ddw-applicable property holds the tokens for:
1272         * ibm,query-pe-dma-window
1273         * ibm,create-pe-dma-window
1274         * ibm,remove-pe-dma-window
1275         * for the given node in that order.
1276         * the property is actually in the parent, not the PE
1277         */
1278        ret = of_property_read_u32_array(pdn, "ibm,ddw-applicable",
1279                                         &ddw_avail[0], DDW_APPLICABLE_SIZE);
1280        if (ret)
1281                goto out_failed;
1282
 1283        /*
 1284         * Query if there is a second window of a size that can map the
 1285         * whole partition.  Query returns number of windows, largest
1286         * block assigned to PE (partition endpoint), and two bitmasks
1287         * of page sizes: supported and supported for migrate-dma.
1288         */
1289        dn = pci_device_to_OF_node(dev);
1290        ret = query_ddw(dev, ddw_avail, &query, pdn);
1291        if (ret != 0)
1292                goto out_failed;
1293
1294        /*
1295         * If there is no window available, remove the default DMA window,
1296         * if it's present. This will make all the resources available to the
1297         * new DDW window.
1298         * If anything fails after this, we need to restore it, so also check
1299         * for extensions presence.
1300         */
1301        if (query.windows_available == 0) {
1302                struct property *default_win;
1303                int reset_win_ext;
1304
 1305                /* Replacing the single default window only works if no TCEs are currently allocated in it */
1306                if (iommu_table_in_use(tbl)) {
1307                        dev_warn(&dev->dev, "current IOMMU table in use, can't be replaced.\n");
1308                        goto out_failed;
1309                }
1310
1311                default_win = of_find_property(pdn, "ibm,dma-window", NULL);
1312                if (!default_win)
1313                        goto out_failed;
1314
1315                reset_win_ext = ddw_read_ext(pdn, DDW_EXT_RESET_DMA_WIN, NULL);
1316                if (reset_win_ext)
1317                        goto out_failed;
1318
1319                remove_dma_window(pdn, ddw_avail, default_win);
1320                default_win_removed = true;
1321
1322                /* Query again, to check if the window is available */
1323                ret = query_ddw(dev, ddw_avail, &query, pdn);
1324                if (ret != 0)
1325                        goto out_failed;
1326
1327                if (query.windows_available == 0) {
1328                        /* no windows are available for this device. */
 1329                        dev_dbg(&dev->dev, "no free dynamic windows\n");
1330                        goto out_failed;
1331                }
1332        }
1333
1334        page_shift = iommu_get_page_shift(query.page_size);
1335        if (!page_shift) {
 1336                dev_dbg(&dev->dev, "no supported page size in mask %x\n",
1337                        query.page_size);
1338                goto out_failed;
1339        }
 1340
1342        /*
 1343         * An "ibm,pmemory" region can appear anywhere in the address space.
1344         * Assuming it is still backed by page structs, try MAX_PHYSMEM_BITS
1345         * for the upper limit and fallback to max RAM otherwise but this
1346         * disables device::dma_ops_bypass.
1347         */
1348        len = max_ram_len;
1349        if (pmem_present) {
1350                if (query.largest_available_block >=
1351                    (1ULL << (MAX_PHYSMEM_BITS - page_shift)))
1352                        len = MAX_PHYSMEM_BITS;
1353                else
1354                        dev_info(&dev->dev, "Skipping ibm,pmemory");
1355        }
1356
1357        /* check if the available block * number of ptes will map everything */
1358        if (query.largest_available_block < (1ULL << (len - page_shift))) {
1359                dev_dbg(&dev->dev,
1360                        "can't map partition max 0x%llx with %llu %llu-sized pages\n",
1361                        1ULL << len,
1362                        query.largest_available_block,
1363                        1ULL << page_shift);
1364
1365                len = order_base_2(query.largest_available_block << page_shift);
1366                win_name = DMA64_PROPNAME;
1367        } else {
1368                direct_mapping = true;
1369                win_name = DIRECT64_PROPNAME;
1370        }
1371
1372        ret = create_ddw(dev, ddw_avail, &create, page_shift, len);
1373        if (ret != 0)
1374                goto out_failed;
1375
1376        dev_dbg(&dev->dev, "created tce table LIOBN 0x%x for %pOF\n",
1377                  create.liobn, dn);
1378
1379        win_addr = ((u64)create.addr_hi << 32) | create.addr_lo;
1380        win64 = ddw_property_create(win_name, create.liobn, win_addr, page_shift, len);
1381
1382        if (!win64) {
1383                dev_info(&dev->dev,
1384                         "couldn't allocate property, property name, or value\n");
1385                goto out_remove_win;
1386        }
1387
1388        ret = of_add_property(pdn, win64);
1389        if (ret) {
 1390                dev_err(&dev->dev, "unable to add DMA window property for %pOF: %d\n",
1391                        pdn, ret);
1392                goto out_free_prop;
1393        }
1394
1395        window = ddw_list_new_entry(pdn, win64->value);
1396        if (!window)
1397                goto out_del_prop;
1398
1399        if (direct_mapping) {
1400                /* DDW maps the whole partition, so enable direct DMA mapping */
1401                ret = walk_system_ram_range(0, memblock_end_of_DRAM() >> PAGE_SHIFT,
1402                                            win64->value, tce_setrange_multi_pSeriesLP_walk);
1403                if (ret) {
1404                        dev_info(&dev->dev, "failed to map DMA window for %pOF: %d\n",
1405                                 dn, ret);
1406
 1407                        /* Make sure to clean DDW if any TCE was set */
 1408                        clean_dma_window(pdn, win64->value);
1409                        goto out_del_list;
1410                }
1411        } else {
1412                struct iommu_table *newtbl;
1413                int i;
1414                unsigned long start = 0, end = 0;
1415
1416                for (i = 0; i < ARRAY_SIZE(pci->phb->mem_resources); i++) {
1417                        const unsigned long mask = IORESOURCE_MEM_64 | IORESOURCE_MEM;
1418
1419                        /* Look for MMIO32 */
1420                        if ((pci->phb->mem_resources[i].flags & mask) == IORESOURCE_MEM) {
1421                                start = pci->phb->mem_resources[i].start;
1422                                end = pci->phb->mem_resources[i].end;
1423                                break;
1424                        }
1425                }
1426
1427                /* New table for using DDW instead of the default DMA window */
1428                newtbl = iommu_pseries_alloc_table(pci->phb->node);
1429                if (!newtbl) {
1430                        dev_dbg(&dev->dev, "couldn't create new IOMMU table\n");
1431                        goto out_del_list;
1432                }
1433
1434                iommu_table_setparms_common(newtbl, pci->phb->bus->number, create.liobn, win_addr,
1435                                            1UL << len, page_shift, NULL, &iommu_table_lpar_multi_ops);
1436                iommu_init_table(newtbl, pci->phb->node, start, end);
1437
1438                pci->table_group->tables[1] = newtbl;
1439
 1440                /* Keep default DMA window struct if removed */
1441                if (default_win_removed) {
1442                        tbl->it_size = 0;
1443                        vfree(tbl->it_map);
1444                        tbl->it_map = NULL;
1445                }
1446
1447                set_iommu_table_base(&dev->dev, newtbl);
1448        }
1449
1450        spin_lock(&dma_win_list_lock);
1451        list_add(&window->list, &dma_win_list);
1452        spin_unlock(&dma_win_list_lock);
1453
1454        dev->dev.archdata.dma_offset = win_addr;
1455        ddw_enabled = true;
1456        goto out_unlock;
1457
1458out_del_list:
1459        kfree(window);
1460
1461out_del_prop:
1462        of_remove_property(pdn, win64);
1463
1464out_free_prop:
1465        kfree(win64->name);
1466        kfree(win64->value);
1467        kfree(win64);
1468
1469out_remove_win:
1470        /* DDW is clean, so it's ok to call this directly. */
1471        __remove_dma_window(pdn, ddw_avail, create.liobn);
1472
1473out_failed:
1474        if (default_win_removed)
1475                reset_dma_window(dev, pdn);
1476
1477        fpdn = kzalloc(sizeof(*fpdn), GFP_KERNEL);
1478        if (!fpdn)
1479                goto out_unlock;
1480        fpdn->pdn = pdn;
1481        list_add(&fpdn->list, &failed_ddw_pdn_list);
1482
1483out_unlock:
1484        mutex_unlock(&dma_win_init_mutex);
1485
1486        /*
1487         * If we have persistent memory and the window size is only as big
1488         * as RAM, then we failed to create a window to cover persistent
1489         * memory and need to set the DMA limit.
1490         */
1491        if (pmem_present && ddw_enabled && direct_mapping && len == max_ram_len)
1492                dev->dev.bus_dma_limit = dev->dev.archdata.dma_offset + (1ULL << len);
1493
1494        return ddw_enabled && direct_mapping;
1495}
1496
1497static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev)
1498{
1499        struct device_node *pdn, *dn;
1500        struct iommu_table *tbl;
1501        const __be32 *dma_window = NULL;
1502        struct pci_dn *pci;
1503
1504        pr_debug("pci_dma_dev_setup_pSeriesLP: %s\n", pci_name(dev));
1505
1506        /* dev setup for LPAR is a little tricky, since the device tree might
1507         * contain the dma-window properties per-device and not necessarily
1508         * for the bus. So we need to search upwards in the tree until we
1509         * either hit a dma-window property, OR find a parent with a table
1510         * already allocated.
1511         */
1512        dn = pci_device_to_OF_node(dev);
1513        pr_debug("  node is %pOF\n", dn);
1514
1515        for (pdn = dn; pdn && PCI_DN(pdn) && !PCI_DN(pdn)->table_group;
1516             pdn = pdn->parent) {
1517                dma_window = of_get_property(pdn, "ibm,dma-window", NULL);
1518                if (dma_window)
1519                        break;
1520        }
1521
1522        if (!pdn || !PCI_DN(pdn)) {
1523                printk(KERN_WARNING "pci_dma_dev_setup_pSeriesLP: "
1524                       "no DMA window found for pci dev=%s dn=%pOF\n",
1525                       pci_name(dev), dn);
1526                return;
1527        }
1528        pr_debug("  parent is %pOF\n", pdn);
1529
1530        pci = PCI_DN(pdn);
1531        if (!pci->table_group) {
1532                pci->table_group = iommu_pseries_alloc_group(pci->phb->node);
1533                tbl = pci->table_group->tables[0];
1534                iommu_table_setparms_lpar(pci->phb, pdn, tbl,
1535                                pci->table_group, dma_window);
1536
1537                iommu_init_table(tbl, pci->phb->node, 0, 0);
1538                iommu_register_group(pci->table_group,
1539                                pci_domain_nr(pci->phb->bus), 0);
1540                pr_debug("  created table: %p\n", pci->table_group);
1541        } else {
1542                pr_debug("  found DMA window, table: %p\n", pci->table_group);
1543        }
1544
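            /* Attach the device to the table shared by everything under pdn */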
1545        set_iommu_table_base(&dev->dev, pci->table_group->tables[0]);
1546        iommu_add_device(pci->table_group, &dev->dev);
1547}
1548
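    /*
     * iommu_bypass_supported callback: when a device asks for a 64-bit DMA
     * mask, try to enable a dynamic DMA window (DDW) so it can bypass the
     * default 32-bit TCE table.
     */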
1549static bool iommu_bypass_supported_pSeriesLP(struct pci_dev *pdev, u64 dma_mask)
1550{
1551        struct device_node *dn = pci_device_to_OF_node(pdev), *pdn;
1552        const __be32 *dma_window = NULL;
1553
1554        /* only attempt to use a new window if 64-bit DMA is requested */
1555        if (dma_mask < DMA_BIT_MASK(64))
1556                return false;
1557
1558        dev_dbg(&pdev->dev, "node is %pOF\n", dn);
1559
1560        /*
1561         * the device tree might contain the dma-window properties
1562         * per-device and not necessarily for the bus. So we need to
1563         * search upwards in the tree until we either hit a dma-window
1564         * property, OR find a parent with a table already allocated.
1565         */
1566        for (pdn = dn; pdn && PCI_DN(pdn) && !PCI_DN(pdn)->table_group;
1567                        pdn = pdn->parent) {
1568                dma_window = of_get_property(pdn, "ibm,dma-window", NULL);
1569                if (dma_window)
1570                        break;
1571        }
1572
1573        if (pdn && PCI_DN(pdn))
1574                return enable_ddw(pdev, pdn);
1575
1576        return false;
1577}
1578
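    /*
     * Memory hotplug notifier: add or clear TCEs in every registered
     * dynamic DMA window so the windows keep covering online memory.
     */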
1579static int iommu_mem_notifier(struct notifier_block *nb, unsigned long action,
1580                void *data)
1581{
1582        struct dma_win *window;
1583        struct memory_notify *arg = data;
1584        int ret = 0;
1585
1586        switch (action) {
1587        case MEM_GOING_ONLINE:
1588                spin_lock(&dma_win_list_lock);
1589                list_for_each_entry(window, &dma_win_list, list) {
1590                        ret |= tce_setrange_multi_pSeriesLP(arg->start_pfn,
1591                                        arg->nr_pages, window->prop);
1592                        /* XXX log error */
1593                }
1594                spin_unlock(&dma_win_list_lock);
1595                break;
1596        case MEM_CANCEL_ONLINE:
1597        case MEM_OFFLINE:
1598                spin_lock(&dma_win_list_lock);
1599                list_for_each_entry(window, &dma_win_list, list) {
1600                        ret |= tce_clearrange_multi_pSeriesLP(arg->start_pfn,
1601                                        arg->nr_pages, window->prop);
1602                        /* XXX log error */
1603                }
1604                spin_unlock(&dma_win_list_lock);
1605                break;
1606        default:
1607                break;
1608        }
1609        if (ret && action != MEM_CANCEL_ONLINE)
1610                return NOTIFY_BAD;
1611
1612        return NOTIFY_OK;
1613}
1614
1615static struct notifier_block iommu_mem_nb = {
1616        .notifier_call = iommu_mem_notifier,
1617};
1618
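    /*
     * Device-tree reconfig notifier: when a node is detached, drop its DDW
     * properties, free its IOMMU group and forget its dynamic DMA window.
     */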
1619static int iommu_reconfig_notifier(struct notifier_block *nb, unsigned long action, void *data)
1620{
1621        int err = NOTIFY_OK;
1622        struct of_reconfig_data *rd = data;
1623        struct device_node *np = rd->dn;
1624        struct pci_dn *pci = PCI_DN(np);
1625        struct dma_win *window;
1626
1627        switch (action) {
1628        case OF_RECONFIG_DETACH_NODE:
1629                /*
1630                 * Removing the property will invoke the reconfig
1631                 * notifier again, which causes dead-lock on the
1632                 * read-write semaphore of the notifier chain. So
1633                 * we have to remove the property when releasing
1634                 * the device node.
1635                 */
1636                if (remove_ddw(np, false, DIRECT64_PROPNAME))
1637                        remove_ddw(np, false, DMA64_PROPNAME);
1638
1639                if (pci && pci->table_group)
1640                        iommu_pseries_free_group(pci->table_group,
1641                                        np->full_name);
1642
1643                spin_lock(&dma_win_list_lock);
1644                list_for_each_entry(window, &dma_win_list, list) {
1645                        if (window->device == np) {
1646                                list_del(&window->list);
1647                                kfree(window);
1648                                break;
1649                        }
1650                }
1651                spin_unlock(&dma_win_list_lock);
1652                break;
1653        default:
1654                err = NOTIFY_DONE;
1655                break;
1656        }
1657        return err;
1658}
1659
1660static struct notifier_block iommu_reconfig_nb = {
1661        .notifier_call = iommu_reconfig_notifier,
1662};
1663
1664/* These are called very early. */
1665void iommu_init_early_pSeries(void)
1666{
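            /* Respect a device-tree request to run with the IOMMU disabled */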
1667        if (of_chosen && of_get_property(of_chosen, "linux,iommu-off", NULL))
1668                return;
1669
1670        if (firmware_has_feature(FW_FEATURE_LPAR)) {
1671                pseries_pci_controller_ops.dma_bus_setup = pci_dma_bus_setup_pSeriesLP;
1672                pseries_pci_controller_ops.dma_dev_setup = pci_dma_dev_setup_pSeriesLP;
1673                if (!disable_ddw)
1674                        pseries_pci_controller_ops.iommu_bypass_supported =
1675                                iommu_bypass_supported_pSeriesLP;
1676        } else {
1677                pseries_pci_controller_ops.dma_bus_setup = pci_dma_bus_setup_pSeries;
1678                pseries_pci_controller_ops.dma_dev_setup = pci_dma_dev_setup_pSeries;
1679        }
1680
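            /* Track device-tree and memory hotplug changes that affect DMA windows */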
1682        of_reconfig_notifier_register(&iommu_reconfig_nb);
1683        register_memory_notifier(&iommu_mem_nb);
1684
1685        set_pci_dma_ops(&dma_iommu_ops);
1686}
1687
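    /*
     * "multitce=off" on the command line disables the H_PUT_TCE_INDIRECT and
     * H_STUFF_TCE firmware features, forcing one hypervisor call per TCE.
     */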
1688static int __init disable_multitce(char *str)
1689{
1690        if (strcmp(str, "off") == 0 &&
1691            firmware_has_feature(FW_FEATURE_LPAR) &&
1692            (firmware_has_feature(FW_FEATURE_PUT_TCE_IND) ||
1693             firmware_has_feature(FW_FEATURE_STUFF_TCE))) {
1694                printk(KERN_INFO "Disabling MULTITCE firmware feature\n");
1695                powerpc_firmware_features &=
1696                        ~(FW_FEATURE_PUT_TCE_IND | FW_FEATURE_STUFF_TCE);
1697        }
1698        return 1;
1699}
1700
1701__setup("multitce=", disable_multitce);
1702
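    /*
     * PCI bus notifier: detach devices from their IOMMU group as they are
     * removed from the bus.
     */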
1703static int tce_iommu_bus_notifier(struct notifier_block *nb,
1704                unsigned long action, void *data)
1705{
1706        struct device *dev = data;
1707
1708        switch (action) {
1709        case BUS_NOTIFY_DEL_DEVICE:
1710                iommu_del_device(dev);
1711                return 0;
1712        default:
1713                return 0;
1714        }
1715}
1716
1717static struct notifier_block tce_iommu_bus_nb = {
1718        .notifier_call = tce_iommu_bus_notifier,
1719};
1720
1721static int __init tce_iommu_bus_notifier_init(void)
1722{
1723        bus_register_notifier(&pci_bus_type, &tce_iommu_bus_nb);
1724        return 0;
1725}
1726machine_subsys_initcall_sync(pseries, tce_iommu_bus_notifier_init);
1727