linux/arch/ia64/hp/common/sba_iommu.c
   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/*
   3**  IA64 System Bus Adapter (SBA) I/O MMU manager
   4**
   5**      (c) Copyright 2002-2005 Alex Williamson
   6**      (c) Copyright 2002-2003 Grant Grundler
   7**      (c) Copyright 2002-2005 Hewlett-Packard Company
   8**
   9**      Portions (c) 2000 Grant Grundler (from parisc I/O MMU code)
  10**      Portions (c) 1999 Dave S. Miller (from sparc64 I/O MMU code)
  11**
  12**
  13**
  14** This module initializes the IOC (I/O Controller) found on HP
  15** McKinley machines and their successors.
  16**
  17*/
  18
  19#include <linux/types.h>
  20#include <linux/kernel.h>
  21#include <linux/module.h>
  22#include <linux/spinlock.h>
  23#include <linux/slab.h>
  24#include <linux/init.h>
  25#include <linux/mm.h>
  26#include <linux/string.h>
  27#include <linux/pci.h>
  28#include <linux/proc_fs.h>
  29#include <linux/seq_file.h>
  30#include <linux/acpi.h>
  31#include <linux/efi.h>
  32#include <linux/nodemask.h>
  33#include <linux/bitops.h>         /* hweight64() */
  34#include <linux/crash_dump.h>
  35#include <linux/iommu-helper.h>
  36#include <linux/dma-mapping.h>
  37#include <linux/prefetch.h>
  38
  39#include <asm/delay.h>          /* ia64_get_itc() */
  40#include <asm/io.h>
  41#include <asm/page.h>           /* PAGE_OFFSET */
  42#include <asm/dma.h>
  43
  44#include <asm/acpi-ext.h>
  45
  46extern int swiotlb_late_init_with_default_size (size_t size);
  47
  48#define PFX "IOC: "
  49
  50/*
   51** Enables timing of pdir resource map searches.  Output in /proc.
  52** Disabled by default to optimize performance.
  53*/
  54#undef PDIR_SEARCH_TIMING
  55
  56/*
  57** This option allows cards capable of 64bit DMA to bypass the IOMMU.  If
  58** not defined, all DMA will be 32bit and go through the TLB.
  59** There's potentially a conflict in the bio merge code with us
  60** advertising an iommu, but then bypassing it.  Since I/O MMU bypassing
  61** appears to give more performance than bio-level virtual merging, we'll
  62** do the former for now.  NOTE: BYPASS_SG also needs to be undef'd to
  63** completely restrict DMA to the IOMMU.
  64*/
  65#define ALLOW_IOV_BYPASS
  66
  67/*
  68** This option specifically allows/disallows bypassing scatterlists with
  69** multiple entries.  Coalescing these entries can allow better DMA streaming
  70** and in some cases shows better performance than entirely bypassing the
  71** IOMMU.  Performance increase on the order of 1-2% sequential output/input
  72** using bonnie++ on a RAID0 MD device (sym2 & mpt).
  73*/
  74#undef ALLOW_IOV_BYPASS_SG
  75
  76/*
  77** If a device prefetches beyond the end of a valid pdir entry, it will cause
  78** a hard failure, ie. MCA.  Version 3.0 and later of the zx1 LBA should
  79** disconnect on 4k boundaries and prevent such issues.  If the device is
  80** particularly aggressive, this option will keep the entire pdir valid such
  81** that prefetching will hit a valid address.  This could severely impact
  82** error containment, and is therefore off by default.  The page that is
  83** used for spill-over is poisoned, so that should help debugging somewhat.
  84*/
  85#undef FULL_VALID_PDIR
  86
  87#define ENABLE_MARK_CLEAN
  88
  89/*
  90** The number of debug flags is a clue - this code is fragile.  NOTE: since
  91** tightening the use of res_lock the resource bitmap and actual pdir are no
  92** longer guaranteed to stay in sync.  The sanity checking code isn't going to
  93** like that.
  94*/
  95#undef DEBUG_SBA_INIT
  96#undef DEBUG_SBA_RUN
  97#undef DEBUG_SBA_RUN_SG
  98#undef DEBUG_SBA_RESOURCE
  99#undef ASSERT_PDIR_SANITY
 100#undef DEBUG_LARGE_SG_ENTRIES
 101#undef DEBUG_BYPASS
 102
 103#if defined(FULL_VALID_PDIR) && defined(ASSERT_PDIR_SANITY)
 104#error FULL_VALID_PDIR and ASSERT_PDIR_SANITY are mutually exclusive
 105#endif
 106
 107#define SBA_INLINE      __inline__
 108/* #define SBA_INLINE */
 109
 110#ifdef DEBUG_SBA_INIT
 111#define DBG_INIT(x...)  printk(x)
 112#else
 113#define DBG_INIT(x...)
 114#endif
 115
 116#ifdef DEBUG_SBA_RUN
 117#define DBG_RUN(x...)   printk(x)
 118#else
 119#define DBG_RUN(x...)
 120#endif
 121
 122#ifdef DEBUG_SBA_RUN_SG
 123#define DBG_RUN_SG(x...)        printk(x)
 124#else
 125#define DBG_RUN_SG(x...)
 126#endif
 127
 128
 129#ifdef DEBUG_SBA_RESOURCE
 130#define DBG_RES(x...)   printk(x)
 131#else
 132#define DBG_RES(x...)
 133#endif
 134
 135#ifdef DEBUG_BYPASS
 136#define DBG_BYPASS(x...)        printk(x)
 137#else
 138#define DBG_BYPASS(x...)
 139#endif
 140
 141#ifdef ASSERT_PDIR_SANITY
 142#define ASSERT(expr) \
 143        if(!(expr)) { \
 144                printk( "\n" __FILE__ ":%d: Assertion " #expr " failed!\n",__LINE__); \
 145                panic(#expr); \
 146        }
 147#else
 148#define ASSERT(expr)
 149#endif
 150
 151/*
 152** The number of pdir entries to "free" before issuing
 153** a read to PCOM register to flush out PCOM writes.
 154** Interacts with allocation granularity (ie 4 or 8 entries
 155** allocated and free'd/purged at a time might make this
 156** less interesting).
 157*/
 158#define DELAYED_RESOURCE_CNT    64
 159
 160#define PCI_DEVICE_ID_HP_SX2000_IOC     0x12ec
 161
 162#define ZX1_IOC_ID      ((PCI_DEVICE_ID_HP_ZX1_IOC << 16) | PCI_VENDOR_ID_HP)
 163#define ZX2_IOC_ID      ((PCI_DEVICE_ID_HP_ZX2_IOC << 16) | PCI_VENDOR_ID_HP)
 164#define REO_IOC_ID      ((PCI_DEVICE_ID_HP_REO_IOC << 16) | PCI_VENDOR_ID_HP)
 165#define SX1000_IOC_ID   ((PCI_DEVICE_ID_HP_SX1000_IOC << 16) | PCI_VENDOR_ID_HP)
 166#define SX2000_IOC_ID   ((PCI_DEVICE_ID_HP_SX2000_IOC << 16) | PCI_VENDOR_ID_HP)
 167
 168#define ZX1_IOC_OFFSET  0x1000  /* ACPI reports SBA, we want IOC */
 169
 170#define IOC_FUNC_ID     0x000
 171#define IOC_FCLASS      0x008   /* function class, bist, header, rev... */
 172#define IOC_IBASE       0x300   /* IO TLB */
 173#define IOC_IMASK       0x308
 174#define IOC_PCOM        0x310
 175#define IOC_TCNFG       0x318
 176#define IOC_PDIR_BASE   0x320
 177
 178#define IOC_ROPE0_CFG   0x500
 179#define   IOC_ROPE_AO     0x10  /* Allow "Relaxed Ordering" */
 180
 181
 182/* AGP GART driver looks for this */
 183#define ZX1_SBA_IOMMU_COOKIE    0x0000badbadc0ffeeUL
 184
 185/*
 186** The zx1 IOC supports 4/8/16/64KB page sizes (see TCNFG register)
 187**
  188** Some IOCs (sx1000) can run at the above page sizes, but are
 189** really only supported using the IOC at a 4k page size.
 190**
 191** iovp_size could only be greater than PAGE_SIZE if we are
 192** confident the drivers really only touch the next physical
 193** page iff that driver instance owns it.
 194*/
 195static unsigned long iovp_size;
 196static unsigned long iovp_shift;
 197static unsigned long iovp_mask;
 198
 199struct ioc {
 200        void __iomem    *ioc_hpa;       /* I/O MMU base address */
 201        char            *res_map;       /* resource map, bit == pdir entry */
 202        u64             *pdir_base;     /* physical base address */
 203        unsigned long   ibase;          /* pdir IOV Space base */
 204        unsigned long   imask;          /* pdir IOV Space mask */
 205
 206        unsigned long   *res_hint;      /* next avail IOVP - circular search */
 207        unsigned long   dma_mask;
 208        spinlock_t      res_lock;       /* protects the resource bitmap, but must be held when */
 209                                        /* clearing pdir to prevent races with allocations. */
 210        unsigned int    res_bitshift;   /* from the RIGHT! */
 211        unsigned int    res_size;       /* size of resource map in bytes */
 212#ifdef CONFIG_NUMA
 213        unsigned int    node;           /* node where this IOC lives */
 214#endif
 215#if DELAYED_RESOURCE_CNT > 0
 216        spinlock_t      saved_lock;     /* may want to try to get this on a separate cacheline */
 217                                        /* than res_lock for bigger systems. */
 218        int             saved_cnt;
 219        struct sba_dma_pair {
 220                dma_addr_t      iova;
 221                size_t          size;
 222        } saved[DELAYED_RESOURCE_CNT];
 223#endif
 224
 225#ifdef PDIR_SEARCH_TIMING
 226#define SBA_SEARCH_SAMPLE       0x100
 227        unsigned long avg_search[SBA_SEARCH_SAMPLE];
 228        unsigned long avg_idx;  /* current index into avg_search */
 229#endif
 230
 231        /* Stuff we don't need in performance path */
 232        struct ioc      *next;          /* list of IOC's in system */
 233        acpi_handle     handle;         /* for multiple IOC's */
 234        const char      *name;
 235        unsigned int    func_id;
 236        unsigned int    rev;            /* HW revision of chip */
 237        u32             iov_size;
 238        unsigned int    pdir_size;      /* in bytes, determined by IOV Space size */
 239        struct pci_dev  *sac_only_dev;
 240};
 241
 242static struct ioc *ioc_list, *ioc_found;
 243static int reserve_sba_gart = 1;
 244
 245static SBA_INLINE void sba_mark_invalid(struct ioc *, dma_addr_t, size_t);
 246static SBA_INLINE void sba_free_range(struct ioc *, dma_addr_t, size_t);
 247
 248#define sba_sg_address(sg)      sg_virt((sg))
 249
 250#ifdef FULL_VALID_PDIR
 251static u64 prefetch_spill_page;
 252#endif
 253
 254#ifdef CONFIG_PCI
 255# define GET_IOC(dev)   ((dev_is_pci(dev))                                              \
 256                         ? ((struct ioc *) PCI_CONTROLLER(to_pci_dev(dev))->iommu) : NULL)
 257#else
 258# define GET_IOC(dev)   NULL
 259#endif
 260
 261/*
 262** DMA_CHUNK_SIZE is used by the SCSI mid-layer to break up
 263** (or rather not merge) DMAs into manageable chunks.
  264** On parisc, this is more of a software/tuning constraint
  265** than a HW one. I/O MMU allocation algorithms can be
 266** faster with smaller sizes (to some degree).
 267*/
 268#define DMA_CHUNK_SIZE  (BITS_PER_LONG*iovp_size)
 269
 270#define ROUNDUP(x,y) ((x + ((y)-1)) & ~((y)-1))
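/*
** Illustration only: a hypothetical sba_example_roundup() helper (never
** built) showing how ROUNDUP() and DMA_CHUNK_SIZE behave, assuming a 4KB
** IOV page size and 64-bit longs.
*/
#if 0
static void sba_example_roundup(void)
{
        /* DMA_CHUNK_SIZE == BITS_PER_LONG * iovp_size == 64 * 4KB == 256KB */
        unsigned long sz = ROUNDUP(0x2345UL, 0x1000UL); /* rounds up to 0x3000 */

        (void) sz;
}
#endif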
 271
 272/************************************
 273** SBA register read and write support
 274**
 275** BE WARNED: register writes are posted.
 276**  (ie follow writes which must reach HW with a read)
 277**
 278*/
 279#define READ_REG(addr)       __raw_readq(addr)
 280#define WRITE_REG(val, addr) __raw_writeq(val, addr)
 281
 282#ifdef DEBUG_SBA_INIT
 283
 284/**
 285 * sba_dump_tlb - debugging only - print IOMMU operating parameters
 286 * @hpa: base address of the IOMMU
 287 *
 288 * Print the size/location of the IO MMU PDIR.
 289 */
 290static void
 291sba_dump_tlb(char *hpa)
 292{
 293        DBG_INIT("IO TLB at 0x%p\n", (void *)hpa);
 294        DBG_INIT("IOC_IBASE    : %016lx\n", READ_REG(hpa+IOC_IBASE));
 295        DBG_INIT("IOC_IMASK    : %016lx\n", READ_REG(hpa+IOC_IMASK));
 296        DBG_INIT("IOC_TCNFG    : %016lx\n", READ_REG(hpa+IOC_TCNFG));
 297        DBG_INIT("IOC_PDIR_BASE: %016lx\n", READ_REG(hpa+IOC_PDIR_BASE));
 298        DBG_INIT("\n");
 299}
 300#endif
 301
 302
 303#ifdef ASSERT_PDIR_SANITY
 304
 305/**
 306 * sba_dump_pdir_entry - debugging only - print one IOMMU PDIR entry
 307 * @ioc: IO MMU structure which owns the pdir we are interested in.
  308 * @msg: text to print on the output line.
 309 * @pide: pdir index.
 310 *
 311 * Print one entry of the IO MMU PDIR in human readable form.
 312 */
 313static void
 314sba_dump_pdir_entry(struct ioc *ioc, char *msg, uint pide)
 315{
 316        /* start printing from lowest pde in rval */
 317        u64 *ptr = &ioc->pdir_base[pide  & ~(BITS_PER_LONG - 1)];
 318        unsigned long *rptr = (unsigned long *) &ioc->res_map[(pide >>3) & -sizeof(unsigned long)];
 319        uint rcnt;
 320
 321        printk(KERN_DEBUG "SBA: %s rp %p bit %d rval 0x%lx\n",
 322                 msg, rptr, pide & (BITS_PER_LONG - 1), *rptr);
 323
 324        rcnt = 0;
 325        while (rcnt < BITS_PER_LONG) {
 326                printk(KERN_DEBUG "%s %2d %p %016Lx\n",
 327                       (rcnt == (pide & (BITS_PER_LONG - 1)))
 328                       ? "    -->" : "       ",
 329                       rcnt, ptr, (unsigned long long) *ptr );
 330                rcnt++;
 331                ptr++;
 332        }
 333        printk(KERN_DEBUG "%s", msg);
 334}
 335
 336
 337/**
 338 * sba_check_pdir - debugging only - consistency checker
 339 * @ioc: IO MMU structure which owns the pdir we are interested in.
  340 * @msg: text to print on the output line.
  341 *
  342 * Verify that the resource map and pdir state are consistent
 343 */
 344static int
 345sba_check_pdir(struct ioc *ioc, char *msg)
 346{
 347        u64 *rptr_end = (u64 *) &(ioc->res_map[ioc->res_size]);
 348        u64 *rptr = (u64 *) ioc->res_map;       /* resource map ptr */
 349        u64 *pptr = ioc->pdir_base;     /* pdir ptr */
 350        uint pide = 0;
 351
 352        while (rptr < rptr_end) {
 353                u64 rval;
 354                int rcnt; /* number of bits we might check */
 355
 356                rval = *rptr;
 357                rcnt = 64;
 358
 359                while (rcnt) {
 360                        /* Get last byte and highest bit from that */
 361                        u32 pde = ((u32)((*pptr >> (63)) & 0x1));
 362                        if ((rval & 0x1) ^ pde)
 363                        {
 364                                /*
 365                                ** BUMMER!  -- res_map != pdir --
 366                                ** Dump rval and matching pdir entries
 367                                */
 368                                sba_dump_pdir_entry(ioc, msg, pide);
 369                                return(1);
 370                        }
 371                        rcnt--;
 372                        rval >>= 1;     /* try the next bit */
 373                        pptr++;
 374                        pide++;
 375                }
 376                rptr++; /* look at next word of res_map */
 377        }
 378        /* It'd be nice if we always got here :^) */
 379        return 0;
 380}
 381
 382
 383/**
 384 * sba_dump_sg - debugging only - print Scatter-Gather list
 385 * @ioc: IO MMU structure which owns the pdir we are interested in.
 386 * @startsg: head of the SG list
 387 * @nents: number of entries in SG list
 388 *
 389 * print the SG list so we can verify it's correct by hand.
 390 */
 391static void
 392sba_dump_sg( struct ioc *ioc, struct scatterlist *startsg, int nents)
 393{
 394        while (nents-- > 0) {
 395                printk(KERN_DEBUG " %d : DMA %08lx/%05x CPU %p\n", nents,
 396                       startsg->dma_address, startsg->dma_length,
 397                       sba_sg_address(startsg));
 398                startsg = sg_next(startsg);
 399        }
 400}
 401
 402static void
 403sba_check_sg( struct ioc *ioc, struct scatterlist *startsg, int nents)
 404{
 405        struct scatterlist *the_sg = startsg;
 406        int the_nents = nents;
 407
 408        while (the_nents-- > 0) {
 409                if (sba_sg_address(the_sg) == 0x0UL)
 410                        sba_dump_sg(NULL, startsg, nents);
 411                the_sg = sg_next(the_sg);
 412        }
 413}
 414
 415#endif /* ASSERT_PDIR_SANITY */
 416
 417
 418
 419
 420/**************************************************************
 421*
 422*   I/O Pdir Resource Management
 423*
 424*   Bits set in the resource map are in use.
 425*   Each bit can represent a number of pages.
 426*   LSbs represent lower addresses (IOVA's).
 427*
 428***************************************************************/
 429#define PAGES_PER_RANGE 1       /* could increase this to 4 or 8 if needed */
 430
 431/* Convert from IOVP to IOVA and vice versa. */
 432#define SBA_IOVA(ioc,iovp,offset) ((ioc->ibase) | (iovp) | (offset))
 433#define SBA_IOVP(ioc,iova) ((iova) & ~(ioc->ibase))
 434
 435#define PDIR_ENTRY_SIZE sizeof(u64)
 436
 437#define PDIR_INDEX(iovp)   ((iovp)>>iovp_shift)
 438
 439#define RESMAP_MASK(n)    ~(~0UL << (n))
 440#define RESMAP_IDX_MASK   (sizeof(unsigned long) - 1)
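/*
** Illustration only: a hypothetical sba_example_iova_math() helper (never
** built) showing how an IOVA decomposes under the macros above, assuming
** ibase == 0x40000000 and iovp_shift == 12 (4KB IOV pages).
*/
#if 0
static void sba_example_iova_math(void)
{
        unsigned long iovp   = 5UL << 12;                       /* pdir slot 5 */
        unsigned long offset = 0x123;                           /* byte offset within the IOV page */
        unsigned long iova   = 0x40000000UL | iovp | offset;    /* SBA_IOVA() -> 0x40005123 */

        /* SBA_IOVP() strips ibase:  iova & ~0x40000000UL == 0x5123 */
        /* PDIR_INDEX() recovers the pdir slot:  0x5123 >> 12 == 5  */
        (void) iova;
}
#endif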
 441
 442
 443/**
  444 * For most cases the normal get_order is sufficient; however, it limits us
 445 * to PAGE_SIZE being the minimum mapping alignment and TC flush granularity.
 446 * It only incurs about 1 clock cycle to use this one with the static variable
 447 * and makes the code more intuitive.
 448 */
 449static SBA_INLINE int
 450get_iovp_order (unsigned long size)
 451{
 452        long double d = size - 1;
 453        long order;
 454
 455        order = ia64_getf_exp(d);
 456        order = order - iovp_shift - 0xffff + 1;
 457        if (order < 0)
 458                order = 0;
 459        return order;
 460}
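/*
** Sketch only: an assumed-equivalent, non-ia64 form of get_iovp_order()
** (never built) that uses fls64() from <linux/bitops.h> instead of the
** getf.exp trick above; valid for size >= 1.
*/
#if 0
static int get_iovp_order_generic(unsigned long size)
{
        int order = (int) fls64(size - 1) - (int) iovp_shift;

        return order > 0 ? order : 0;
}
#endif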
 461
 462static unsigned long ptr_to_pide(struct ioc *ioc, unsigned long *res_ptr,
 463                                 unsigned int bitshiftcnt)
 464{
 465        return (((unsigned long)res_ptr - (unsigned long)ioc->res_map) << 3)
 466                + bitshiftcnt;
 467}
 468
 469/**
 470 * sba_search_bitmap - find free space in IO PDIR resource bitmap
 471 * @ioc: IO MMU structure which owns the pdir we are interested in.
 472 * @bits_wanted: number of entries we need.
 473 * @use_hint: use res_hint to indicate where to start looking
 474 *
 475 * Find consecutive free bits in resource bitmap.
 476 * Each bit represents one entry in the IO Pdir.
 477 * Cool perf optimization: search for log2(size) bits at a time.
 478 */
 479static SBA_INLINE unsigned long
 480sba_search_bitmap(struct ioc *ioc, struct device *dev,
 481                  unsigned long bits_wanted, int use_hint)
 482{
 483        unsigned long *res_ptr;
 484        unsigned long *res_end = (unsigned long *) &(ioc->res_map[ioc->res_size]);
 485        unsigned long flags, pide = ~0UL, tpide;
 486        unsigned long boundary_size;
 487        unsigned long shift;
 488        int ret;
 489
 490        ASSERT(((unsigned long) ioc->res_hint & (sizeof(unsigned long) - 1UL)) == 0);
  491        ASSERT((unsigned long *) ioc->res_hint < res_end);
 492
 493        boundary_size = (unsigned long long)dma_get_seg_boundary(dev) + 1;
 494        boundary_size = ALIGN(boundary_size, 1ULL << iovp_shift) >> iovp_shift;
 495
 496        BUG_ON(ioc->ibase & ~iovp_mask);
 497        shift = ioc->ibase >> iovp_shift;
 498
 499        spin_lock_irqsave(&ioc->res_lock, flags);
 500
 501        /* Allow caller to force a search through the entire resource space */
 502        if (likely(use_hint)) {
 503                res_ptr = ioc->res_hint;
 504        } else {
 505                res_ptr = (ulong *)ioc->res_map;
 506                ioc->res_bitshift = 0;
 507        }
 508
 509        /*
 510         * N.B.  REO/Grande defect AR2305 can cause TLB fetch timeouts
 511         * if a TLB entry is purged while in use.  sba_mark_invalid()
 512         * purges IOTLB entries in power-of-two sizes, so we also
 513         * allocate IOVA space in power-of-two sizes.
 514         */
 515        bits_wanted = 1UL << get_iovp_order(bits_wanted << iovp_shift);
 516
 517        if (likely(bits_wanted == 1)) {
 518                unsigned int bitshiftcnt;
 519                for(; res_ptr < res_end ; res_ptr++) {
 520                        if (likely(*res_ptr != ~0UL)) {
 521                                bitshiftcnt = ffz(*res_ptr);
 522                                *res_ptr |= (1UL << bitshiftcnt);
 523                                pide = ptr_to_pide(ioc, res_ptr, bitshiftcnt);
 524                                ioc->res_bitshift = bitshiftcnt + bits_wanted;
 525                                goto found_it;
 526                        }
 527                }
 528                goto not_found;
 529
 530        }
 531        
 532        if (likely(bits_wanted <= BITS_PER_LONG/2)) {
 533                /*
 534                ** Search the resource bit map on well-aligned values.
 535                ** "o" is the alignment.
 536                ** We need the alignment to invalidate I/O TLB using
 537                ** SBA HW features in the unmap path.
 538                */
 539                unsigned long o = 1 << get_iovp_order(bits_wanted << iovp_shift);
 540                uint bitshiftcnt = ROUNDUP(ioc->res_bitshift, o);
 541                unsigned long mask, base_mask;
 542
 543                base_mask = RESMAP_MASK(bits_wanted);
 544                mask = base_mask << bitshiftcnt;
 545
 546                DBG_RES("%s() o %ld %p", __func__, o, res_ptr);
 547                for(; res_ptr < res_end ; res_ptr++)
 548                { 
 549                        DBG_RES("    %p %lx %lx\n", res_ptr, mask, *res_ptr);
 550                        ASSERT(0 != mask);
 551                        for (; mask ; mask <<= o, bitshiftcnt += o) {
 552                                tpide = ptr_to_pide(ioc, res_ptr, bitshiftcnt);
 553                                ret = iommu_is_span_boundary(tpide, bits_wanted,
 554                                                             shift,
 555                                                             boundary_size);
 556                                if ((0 == ((*res_ptr) & mask)) && !ret) {
 557                                        *res_ptr |= mask;     /* mark resources busy! */
 558                                        pide = tpide;
 559                                        ioc->res_bitshift = bitshiftcnt + bits_wanted;
 560                                        goto found_it;
 561                                }
 562                        }
 563
 564                        bitshiftcnt = 0;
 565                        mask = base_mask;
 566
 567                }
 568
 569        } else {
 570                int qwords, bits, i;
 571                unsigned long *end;
 572
 573                qwords = bits_wanted >> 6; /* /64 */
 574                bits = bits_wanted - (qwords * BITS_PER_LONG);
 575
 576                end = res_end - qwords;
 577
 578                for (; res_ptr < end; res_ptr++) {
 579                        tpide = ptr_to_pide(ioc, res_ptr, 0);
 580                        ret = iommu_is_span_boundary(tpide, bits_wanted,
 581                                                     shift, boundary_size);
 582                        if (ret)
 583                                goto next_ptr;
 584                        for (i = 0 ; i < qwords ; i++) {
 585                                if (res_ptr[i] != 0)
 586                                        goto next_ptr;
 587                        }
 588                        if (bits && res_ptr[i] && (__ffs(res_ptr[i]) < bits))
 589                                continue;
 590
 591                        /* Found it, mark it */
 592                        for (i = 0 ; i < qwords ; i++)
 593                                res_ptr[i] = ~0UL;
 594                        res_ptr[i] |= RESMAP_MASK(bits);
 595
 596                        pide = tpide;
 597                        res_ptr += qwords;
 598                        ioc->res_bitshift = bits;
 599                        goto found_it;
 600next_ptr:
 601                        ;
 602                }
 603        }
 604
 605not_found:
 606        prefetch(ioc->res_map);
 607        ioc->res_hint = (unsigned long *) ioc->res_map;
 608        ioc->res_bitshift = 0;
 609        spin_unlock_irqrestore(&ioc->res_lock, flags);
 610        return (pide);
 611
 612found_it:
 613        ioc->res_hint = res_ptr;
 614        spin_unlock_irqrestore(&ioc->res_lock, flags);
 615        return (pide);
 616}
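/*
** Illustration only: a hypothetical sba_example_aligned_search() helper
** (never built) showing the candidate masks used by the "well-aligned"
** search above for a request of 4 pdir entries.
*/
#if 0
static void sba_example_aligned_search(void)
{
        /*
        ** bits_wanted == 4, so o == 4 and RESMAP_MASK(4) == 0xf is slid
        ** along in steps of 4: 0xf, 0xf0, 0xf00, ...  Each candidate is
        ** naturally aligned (needed for the PCOM purge in the unmap path)
        ** and is also checked against iommu_is_span_boundary().
        */
        unsigned long base_mask = RESMAP_MASK(4);       /* 0xf */
        unsigned long mask = base_mask << 8;            /* third candidate: 0xf00 */

        (void) mask;
}
#endif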
 617
 618
 619/**
 620 * sba_alloc_range - find free bits and mark them in IO PDIR resource bitmap
 621 * @ioc: IO MMU structure which owns the pdir we are interested in.
 622 * @size: number of bytes to create a mapping for
 623 *
  624 * Given a size, find consecutive unmarked bits and then mark them in the
  625 * resource bit map.
 626 */
 627static int
 628sba_alloc_range(struct ioc *ioc, struct device *dev, size_t size)
 629{
 630        unsigned int pages_needed = size >> iovp_shift;
 631#ifdef PDIR_SEARCH_TIMING
 632        unsigned long itc_start;
 633#endif
 634        unsigned long pide;
 635
 636        ASSERT(pages_needed);
 637        ASSERT(0 == (size & ~iovp_mask));
 638
 639#ifdef PDIR_SEARCH_TIMING
 640        itc_start = ia64_get_itc();
 641#endif
 642        /*
 643        ** "seek and ye shall find"...praying never hurts either...
 644        */
 645        pide = sba_search_bitmap(ioc, dev, pages_needed, 1);
 646        if (unlikely(pide >= (ioc->res_size << 3))) {
 647                pide = sba_search_bitmap(ioc, dev, pages_needed, 0);
 648                if (unlikely(pide >= (ioc->res_size << 3))) {
 649#if DELAYED_RESOURCE_CNT > 0
 650                        unsigned long flags;
 651
 652                        /*
 653                        ** With delayed resource freeing, we can give this one more shot.  We're
 654                        ** getting close to being in trouble here, so do what we can to make this
 655                        ** one count.
 656                        */
 657                        spin_lock_irqsave(&ioc->saved_lock, flags);
 658                        if (ioc->saved_cnt > 0) {
 659                                struct sba_dma_pair *d;
 660                                int cnt = ioc->saved_cnt;
 661
 662                                d = &(ioc->saved[ioc->saved_cnt - 1]);
 663
 664                                spin_lock(&ioc->res_lock);
 665                                while (cnt--) {
 666                                        sba_mark_invalid(ioc, d->iova, d->size);
 667                                        sba_free_range(ioc, d->iova, d->size);
 668                                        d--;
 669                                }
 670                                ioc->saved_cnt = 0;
 671                                READ_REG(ioc->ioc_hpa+IOC_PCOM);        /* flush purges */
 672                                spin_unlock(&ioc->res_lock);
 673                        }
 674                        spin_unlock_irqrestore(&ioc->saved_lock, flags);
 675
 676                        pide = sba_search_bitmap(ioc, dev, pages_needed, 0);
 677                        if (unlikely(pide >= (ioc->res_size << 3))) {
  678                                printk(KERN_WARNING "%s: I/O MMU @ %p is "
 679                                       "out of mapping resources, %u %u %lx\n",
 680                                       __func__, ioc->ioc_hpa, ioc->res_size,
 681                                       pages_needed, dma_get_seg_boundary(dev));
 682                                return -1;
 683                        }
 684#else
  685                        printk(KERN_WARNING "%s: I/O MMU @ %p is "
 686                               "out of mapping resources, %u %u %lx\n",
 687                               __func__, ioc->ioc_hpa, ioc->res_size,
 688                               pages_needed, dma_get_seg_boundary(dev));
 689                        return -1;
 690#endif
 691                }
 692        }
 693
 694#ifdef PDIR_SEARCH_TIMING
 695        ioc->avg_search[ioc->avg_idx++] = (ia64_get_itc() - itc_start) / pages_needed;
 696        ioc->avg_idx &= SBA_SEARCH_SAMPLE - 1;
 697#endif
 698
 699        prefetchw(&(ioc->pdir_base[pide]));
 700
 701#ifdef ASSERT_PDIR_SANITY
  702        /* verify the first pdir entry's valid byte is clear */
 703        if(0x00 != ((u8 *) ioc->pdir_base)[pide*PDIR_ENTRY_SIZE + 7]) {
 704                sba_dump_pdir_entry(ioc, "sba_search_bitmap() botched it?", pide);
 705        }
 706#endif
 707
 708        DBG_RES("%s(%x) %d -> %lx hint %x/%x\n",
 709                __func__, size, pages_needed, pide,
 710                (uint) ((unsigned long) ioc->res_hint - (unsigned long) ioc->res_map),
 711                ioc->res_bitshift );
 712
 713        return (pide);
 714}
 715
 716
 717/**
 718 * sba_free_range - unmark bits in IO PDIR resource bitmap
 719 * @ioc: IO MMU structure which owns the pdir we are interested in.
 720 * @iova: IO virtual address which was previously allocated.
  721 * @size: number of bytes of the mapping being freed
 722 *
 723 * clear bits in the ioc's resource map
 724 */
 725static SBA_INLINE void
 726sba_free_range(struct ioc *ioc, dma_addr_t iova, size_t size)
 727{
 728        unsigned long iovp = SBA_IOVP(ioc, iova);
 729        unsigned int pide = PDIR_INDEX(iovp);
 730        unsigned int ridx = pide >> 3;  /* convert bit to byte address */
 731        unsigned long *res_ptr = (unsigned long *) &((ioc)->res_map[ridx & ~RESMAP_IDX_MASK]);
 732        int bits_not_wanted = size >> iovp_shift;
 733        unsigned long m;
 734
 735        /* Round up to power-of-two size: see AR2305 note above */
 736        bits_not_wanted = 1UL << get_iovp_order(bits_not_wanted << iovp_shift);
 737        for (; bits_not_wanted > 0 ; res_ptr++) {
 738                
 739                if (unlikely(bits_not_wanted > BITS_PER_LONG)) {
 740
 741                        /* these mappings start 64bit aligned */
 742                        *res_ptr = 0UL;
 743                        bits_not_wanted -= BITS_PER_LONG;
 744                        pide += BITS_PER_LONG;
 745
 746                } else {
 747
 748                        /* 3-bits "bit" address plus 2 (or 3) bits for "byte" == bit in word */
 749                        m = RESMAP_MASK(bits_not_wanted) << (pide & (BITS_PER_LONG - 1));
 750                        bits_not_wanted = 0;
 751
 752                        DBG_RES("%s( ,%x,%x) %x/%lx %x %p %lx\n", __func__, (uint) iova, size,
 753                                bits_not_wanted, m, pide, res_ptr, *res_ptr);
 754
 755                        ASSERT(m != 0);
 756                        ASSERT(bits_not_wanted);
 757                        ASSERT((*res_ptr & m) == m); /* verify same bits are set */
 758                        *res_ptr &= ~m;
 759                }
 760        }
 761}
 762
 763
 764/**************************************************************
 765*
 766*   "Dynamic DMA Mapping" support (aka "Coherent I/O")
 767*
 768***************************************************************/
 769
 770/**
 771 * sba_io_pdir_entry - fill in one IO PDIR entry
 772 * @pdir_ptr:  pointer to IO PDIR entry
 773 * @vba: Virtual CPU address of buffer to map
 774 *
 775 * SBA Mapping Routine
 776 *
 777 * Given a virtual address (vba, arg1) sba_io_pdir_entry()
 778 * loads the I/O PDIR entry pointed to by pdir_ptr (arg0).
 779 * Each IO Pdir entry consists of 8 bytes as shown below
 780 * (LSB == bit 0):
 781 *
 782 *  63                    40                                 11    7        0
 783 * +-+---------------------+----------------------------------+----+--------+
 784 * |V|        U            |            PPN[39:12]            | U  |   FF   |
 785 * +-+---------------------+----------------------------------+----+--------+
 786 *
 787 *  V  == Valid Bit
 788 *  U  == Unused
 789 * PPN == Physical Page Number
 790 *
 791 * The physical address fields are filled with the results of virt_to_phys()
 792 * on the vba.
 793 */
 794
 795#if 1
 796#define sba_io_pdir_entry(pdir_ptr, vba) *pdir_ptr = ((vba & ~0xE000000000000FFFULL)    \
 797                                                      | 0x8000000000000000ULL)
 798#else
 799void SBA_INLINE
 800sba_io_pdir_entry(u64 *pdir_ptr, unsigned long vba)
 801{
 802        *pdir_ptr = ((vba & ~0xE000000000000FFFULL) | 0x80000000000000FFULL);
 803}
 804#endif
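/*
** Illustration only: what sba_io_pdir_entry() produces for one hypothetical
** identity-mapped kernel virtual address (never built).
*/
#if 0
static void sba_example_pdir_entry(void)
{
        u64 vba = 0xe000000012345678ULL;

        /* Mask off the region bits (63:61) and page offset (11:0), then set
        ** the Valid bit (63):
        */
        u64 entry = (vba & ~0xE000000000000FFFULL) | 0x8000000000000000ULL;

        /* entry == 0x8000000012345000: V == 1, PPN taken from the vba */
        (void) entry;
}
#endif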
 805
 806#ifdef ENABLE_MARK_CLEAN
 807/**
 808 * Since DMA is i-cache coherent, any (complete) pages that were written via
 809 * DMA can be marked as "clean" so that lazy_mmu_prot_update() doesn't have to
 810 * flush them when they get mapped into an executable vm-area.
 811 */
 812static void
 813mark_clean (void *addr, size_t size)
 814{
 815        unsigned long pg_addr, end;
 816
 817        pg_addr = PAGE_ALIGN((unsigned long) addr);
 818        end = (unsigned long) addr + size;
 819        while (pg_addr + PAGE_SIZE <= end) {
 820                struct page *page = virt_to_page((void *)pg_addr);
 821                set_bit(PG_arch_1, &page->flags);
 822                pg_addr += PAGE_SIZE;
 823        }
 824}
 825#endif
 826
 827/**
 828 * sba_mark_invalid - invalidate one or more IO PDIR entries
 829 * @ioc: IO MMU structure which owns the pdir we are interested in.
 830 * @iova:  IO Virtual Address mapped earlier
 831 * @byte_cnt:  number of bytes this mapping covers.
 832 *
  833 * Mark the IO PDIR entry(ies) as invalid and invalidate the
  834 * corresponding IO TLB entry. The PCOM (Purge Command Register)
  835 * is used to purge stale entries in the IO TLB when unmapping entries.
  836 *
  837 * The PCOM register supports purging of multiple pages, with a minimum
 838 * of 1 page and a maximum of 2GB. Hardware requires the address be
 839 * aligned to the size of the range being purged. The size of the range
 840 * must be a power of 2. The "Cool perf optimization" in the
 841 * allocation routine helps keep that true.
 842 */
 843static SBA_INLINE void
 844sba_mark_invalid(struct ioc *ioc, dma_addr_t iova, size_t byte_cnt)
 845{
 846        u32 iovp = (u32) SBA_IOVP(ioc,iova);
 847
 848        int off = PDIR_INDEX(iovp);
 849
 850        /* Must be non-zero and rounded up */
 851        ASSERT(byte_cnt > 0);
 852        ASSERT(0 == (byte_cnt & ~iovp_mask));
 853
 854#ifdef ASSERT_PDIR_SANITY
 855        /* Assert first pdir entry is set */
 856        if (!(ioc->pdir_base[off] >> 60)) {
 857                sba_dump_pdir_entry(ioc,"sba_mark_invalid()", PDIR_INDEX(iovp));
 858        }
 859#endif
 860
 861        if (byte_cnt <= iovp_size)
 862        {
 863                ASSERT(off < ioc->pdir_size);
 864
 865                iovp |= iovp_shift;     /* set "size" field for PCOM */
 866
 867#ifndef FULL_VALID_PDIR
 868                /*
 869                ** clear I/O PDIR entry "valid" bit
 870                ** Do NOT clear the rest - save it for debugging.
 871                ** We should only clear bits that have previously
 872                ** been enabled.
 873                */
 874                ioc->pdir_base[off] &= ~(0x80000000000000FFULL);
 875#else
 876                /*
 877                ** If we want to maintain the PDIR as valid, put in
 878                ** the spill page so devices prefetching won't
 879                ** cause a hard fail.
 880                */
 881                ioc->pdir_base[off] = (0x80000000000000FFULL | prefetch_spill_page);
 882#endif
 883        } else {
 884                u32 t = get_iovp_order(byte_cnt) + iovp_shift;
 885
 886                iovp |= t;
 887                ASSERT(t <= 31);   /* 2GB! Max value of "size" field */
 888
 889                do {
 890                        /* verify this pdir entry is enabled */
 891                        ASSERT(ioc->pdir_base[off]  >> 63);
 892#ifndef FULL_VALID_PDIR
 893                        /* clear I/O Pdir entry "valid" bit first */
 894                        ioc->pdir_base[off] &= ~(0x80000000000000FFULL);
 895#else
 896                        ioc->pdir_base[off] = (0x80000000000000FFULL | prefetch_spill_page);
 897#endif
 898                        off++;
 899                        byte_cnt -= iovp_size;
 900                } while (byte_cnt > 0);
 901        }
 902
 903        WRITE_REG(iovp | ioc->ibase, ioc->ioc_hpa+IOC_PCOM);
 904}
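/*
** Illustration only: a hypothetical sba_example_pcom_encoding() helper
** (never built) showing how the PCOM purge value written above is encoded
** for a 32KB unmap, assuming a 4KB IOV page (iovp_shift == 12).
*/
#if 0
static void sba_example_pcom_encoding(void)
{
        /* The purged range must be size-aligned; the low bits of the PCOM
        ** value carry log2(size), the upper bits the IOVP being purged.
        */
        u32 iovp = 0x8000;                      /* 32KB-aligned IOVP */
        u32 log2_size = 12 + 3;                 /* get_iovp_order(32K) + iovp_shift == 15 */
        u64 pcom_val = iovp | log2_size;        /* 0x800f, OR'd with ioc->ibase on write */

        (void) pcom_val;
}
#endif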
 905
 906/**
 907 * sba_map_page - map one buffer and return IOVA for DMA
 908 * @dev: instance of PCI owned by the driver that's asking.
 909 * @page: page to map
 910 * @poff: offset into page
 911 * @size: number of bytes to map
 912 * @dir: dma direction
 913 * @attrs: optional dma attributes
 914 *
 915 * See Documentation/DMA-API-HOWTO.txt
 916 */
 917static dma_addr_t sba_map_page(struct device *dev, struct page *page,
 918                               unsigned long poff, size_t size,
 919                               enum dma_data_direction dir,
 920                               unsigned long attrs)
 921{
 922        struct ioc *ioc;
 923        void *addr = page_address(page) + poff;
 924        dma_addr_t iovp;
 925        dma_addr_t offset;
 926        u64 *pdir_start;
 927        int pide;
 928#ifdef ASSERT_PDIR_SANITY
 929        unsigned long flags;
 930#endif
 931#ifdef ALLOW_IOV_BYPASS
 932        unsigned long pci_addr = virt_to_phys(addr);
 933#endif
 934
 935#ifdef ALLOW_IOV_BYPASS
 936        ASSERT(to_pci_dev(dev)->dma_mask);
 937        /*
 938        ** Check if the PCI device can DMA to ptr... if so, just return ptr
 939        */
 940        if (likely((pci_addr & ~to_pci_dev(dev)->dma_mask) == 0)) {
 941                /*
  942                ** Device can DMA to the buffer directly...
 943                ** just return the PCI address of ptr
 944                */
 945                DBG_BYPASS("sba_map_page() bypass mask/addr: "
 946                           "0x%lx/0x%lx\n",
 947                           to_pci_dev(dev)->dma_mask, pci_addr);
 948                return pci_addr;
 949        }
 950#endif
 951        ioc = GET_IOC(dev);
 952        ASSERT(ioc);
 953
 954        prefetch(ioc->res_hint);
 955
 956        ASSERT(size > 0);
 957        ASSERT(size <= DMA_CHUNK_SIZE);
 958
 959        /* save offset bits */
 960        offset = ((dma_addr_t) (long) addr) & ~iovp_mask;
 961
 962        /* round up to nearest iovp_size */
 963        size = (size + offset + ~iovp_mask) & iovp_mask;
 964
 965#ifdef ASSERT_PDIR_SANITY
 966        spin_lock_irqsave(&ioc->res_lock, flags);
 967        if (sba_check_pdir(ioc,"Check before sba_map_page()"))
 968                panic("Sanity check failed");
 969        spin_unlock_irqrestore(&ioc->res_lock, flags);
 970#endif
 971
 972        pide = sba_alloc_range(ioc, dev, size);
 973        if (pide < 0)
 974                return DMA_MAPPING_ERROR;
 975
 976        iovp = (dma_addr_t) pide << iovp_shift;
 977
 978        DBG_RUN("%s() 0x%p -> 0x%lx\n", __func__, addr, (long) iovp | offset);
 979
 980        pdir_start = &(ioc->pdir_base[pide]);
 981
 982        while (size > 0) {
 983                ASSERT(((u8 *)pdir_start)[7] == 0); /* verify availability */
 984                sba_io_pdir_entry(pdir_start, (unsigned long) addr);
 985
 986                DBG_RUN("     pdir 0x%p %lx\n", pdir_start, *pdir_start);
 987
 988                addr += iovp_size;
 989                size -= iovp_size;
 990                pdir_start++;
 991        }
 992        /* force pdir update */
 993        wmb();
 994
 995        /* form complete address */
 996#ifdef ASSERT_PDIR_SANITY
 997        spin_lock_irqsave(&ioc->res_lock, flags);
 998        sba_check_pdir(ioc,"Check after sba_map_page()");
 999        spin_unlock_irqrestore(&ioc->res_lock, flags);
1000#endif
1001        return SBA_IOVA(ioc, iovp, offset);
1002}
1003
1004#ifdef ENABLE_MARK_CLEAN
1005static SBA_INLINE void
1006sba_mark_clean(struct ioc *ioc, dma_addr_t iova, size_t size)
1007{
1008        u32     iovp = (u32) SBA_IOVP(ioc,iova);
1009        int     off = PDIR_INDEX(iovp);
1010        void    *addr;
1011
1012        if (size <= iovp_size) {
1013                addr = phys_to_virt(ioc->pdir_base[off] &
1014                                    ~0xE000000000000FFFULL);
1015                mark_clean(addr, size);
1016        } else {
1017                do {
1018                        addr = phys_to_virt(ioc->pdir_base[off] &
1019                                            ~0xE000000000000FFFULL);
1020                        mark_clean(addr, min(size, iovp_size));
1021                        off++;
1022                        size -= iovp_size;
1023                } while (size > 0);
1024        }
1025}
1026#endif
1027
1028/**
1029 * sba_unmap_page - unmap one IOVA and free resources
1030 * @dev: instance of PCI owned by the driver that's asking.
1031 * @iova:  IOVA of driver buffer previously mapped.
1032 * @size:  number of bytes mapped in driver buffer.
1033 * @dir:  R/W or both.
1034 * @attrs: optional dma attributes
1035 *
1036 * See Documentation/DMA-API-HOWTO.txt
1037 */
1038static void sba_unmap_page(struct device *dev, dma_addr_t iova, size_t size,
1039                           enum dma_data_direction dir, unsigned long attrs)
1040{
1041        struct ioc *ioc;
1042#if DELAYED_RESOURCE_CNT > 0
1043        struct sba_dma_pair *d;
1044#endif
1045        unsigned long flags;
1046        dma_addr_t offset;
1047
1048        ioc = GET_IOC(dev);
1049        ASSERT(ioc);
1050
1051#ifdef ALLOW_IOV_BYPASS
1052        if (likely((iova & ioc->imask) != ioc->ibase)) {
1053                /*
 1054                ** Address does not fall within the IOVA space, must be bypassing
1055                */
1056                DBG_BYPASS("sba_unmap_page() bypass addr: 0x%lx\n",
1057                           iova);
1058
1059#ifdef ENABLE_MARK_CLEAN
1060                if (dir == DMA_FROM_DEVICE) {
1061                        mark_clean(phys_to_virt(iova), size);
1062                }
1063#endif
1064                return;
1065        }
1066#endif
1067        offset = iova & ~iovp_mask;
1068
1069        DBG_RUN("%s() iovp 0x%lx/%x\n", __func__, (long) iova, size);
1070
1071        iova ^= offset;        /* clear offset bits */
1072        size += offset;
1073        size = ROUNDUP(size, iovp_size);
1074
1075#ifdef ENABLE_MARK_CLEAN
1076        if (dir == DMA_FROM_DEVICE)
1077                sba_mark_clean(ioc, iova, size);
1078#endif
1079
1080#if DELAYED_RESOURCE_CNT > 0
1081        spin_lock_irqsave(&ioc->saved_lock, flags);
1082        d = &(ioc->saved[ioc->saved_cnt]);
1083        d->iova = iova;
1084        d->size = size;
1085        if (unlikely(++(ioc->saved_cnt) >= DELAYED_RESOURCE_CNT)) {
1086                int cnt = ioc->saved_cnt;
1087                spin_lock(&ioc->res_lock);
1088                while (cnt--) {
1089                        sba_mark_invalid(ioc, d->iova, d->size);
1090                        sba_free_range(ioc, d->iova, d->size);
1091                        d--;
1092                }
1093                ioc->saved_cnt = 0;
1094                READ_REG(ioc->ioc_hpa+IOC_PCOM);        /* flush purges */
1095                spin_unlock(&ioc->res_lock);
1096        }
1097        spin_unlock_irqrestore(&ioc->saved_lock, flags);
1098#else /* DELAYED_RESOURCE_CNT == 0 */
1099        spin_lock_irqsave(&ioc->res_lock, flags);
1100        sba_mark_invalid(ioc, iova, size);
1101        sba_free_range(ioc, iova, size);
1102        READ_REG(ioc->ioc_hpa+IOC_PCOM);        /* flush purges */
1103        spin_unlock_irqrestore(&ioc->res_lock, flags);
1104#endif /* DELAYED_RESOURCE_CNT == 0 */
1105}
1106
1107/**
1108 * sba_alloc_coherent - allocate/map shared mem for DMA
1109 * @dev: instance of PCI owned by the driver that's asking.
 1110 * @size:  number of bytes to allocate and map.
1111 * @dma_handle:  IOVA of new buffer.
1112 *
1113 * See Documentation/DMA-API-HOWTO.txt
1114 */
1115static void *
1116sba_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
1117                   gfp_t flags, unsigned long attrs)
1118{
1119        struct page *page;
1120        struct ioc *ioc;
1121        int node = -1;
1122        void *addr;
1123
1124        ioc = GET_IOC(dev);
1125        ASSERT(ioc);
1126#ifdef CONFIG_NUMA
1127        node = ioc->node;
1128#endif
1129
1130        page = alloc_pages_node(node, flags, get_order(size));
1131        if (unlikely(!page))
1132                return NULL;
1133
1134        addr = page_address(page);
1135        memset(addr, 0, size);
1136        *dma_handle = page_to_phys(page);
1137
1138#ifdef ALLOW_IOV_BYPASS
1139        ASSERT(dev->coherent_dma_mask);
1140        /*
1141        ** Check if the PCI device can DMA to ptr... if so, just return ptr
1142        */
1143        if (likely((*dma_handle & ~dev->coherent_dma_mask) == 0)) {
1144                DBG_BYPASS("sba_alloc_coherent() bypass mask/addr: 0x%lx/0x%lx\n",
1145                           dev->coherent_dma_mask, *dma_handle);
1146
1147                return addr;
1148        }
1149#endif
1150
1151        /*
 1152         * If the device can't bypass or bypass is disabled, pass the 32bit fake
 1153         * (SAC-only) device to sba_map_page() to get an iova mapping.
1154         */
1155        *dma_handle = sba_map_page(&ioc->sac_only_dev->dev, page, 0, size,
1156                        DMA_BIDIRECTIONAL, 0);
1157        if (dma_mapping_error(dev, *dma_handle))
1158                return NULL;
1159        return addr;
1160}
1161
1162
1163/**
1164 * sba_free_coherent - free/unmap shared mem for DMA
1165 * @dev: instance of PCI owned by the driver that's asking.
1166 * @size:  number of bytes mapped in driver buffer.
 1167 * @vaddr:  CPU virtual address of "consistent" buffer.
 1168 * @dma_handle:  IO virtual address of "consistent" buffer.
1169 *
1170 * See Documentation/DMA-API-HOWTO.txt
1171 */
1172static void sba_free_coherent(struct device *dev, size_t size, void *vaddr,
1173                              dma_addr_t dma_handle, unsigned long attrs)
1174{
1175        sba_unmap_page(dev, dma_handle, size, 0, 0);
1176        free_pages((unsigned long) vaddr, get_order(size));
1177}
1178
1179
1180/*
 1181** Since 0 is a valid pdir_base index value, we can't use that
 1182** to determine whether a value is valid. Use a flag to indicate
 1183** that the SG list entry contains a valid pdir index.
1184*/
1185#define PIDE_FLAG 0x1UL
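/*
** Illustration only: a hypothetical sba_example_pide_flag() helper (never
** built) showing how a pdir index is stashed in dma_address by
** sba_coalesce_chunks() and decoded by sba_fill_pdir(), assuming
** iovp_shift == 12, pdir index 7 and a first-byte offset of 0x80.
*/
#if 0
static void sba_example_pide_flag(void)
{
        dma_addr_t enc = PIDE_FLAG | (7UL << 12) | 0x80;        /* 0x7081 */
        u32 pide = enc & ~PIDE_FLAG;                            /* 0x7080 */

        /* dma_offset == pide & ~iovp_mask == 0x080  */
        /* pdir slot  == pide >> iovp_shift == 7     */
        (void) pide;
}
#endif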
1186
1187#ifdef DEBUG_LARGE_SG_ENTRIES
1188int dump_run_sg = 0;
1189#endif
1190
1191
1192/**
1193 * sba_fill_pdir - write allocated SG entries into IO PDIR
1194 * @ioc: IO MMU structure which owns the pdir we are interested in.
1195 * @startsg:  list of IOVA/size pairs
1196 * @nents: number of entries in startsg list
1197 *
1198 * Take preprocessed SG list and write corresponding entries
1199 * in the IO PDIR.
1200 */
1201
1202static SBA_INLINE int
1203sba_fill_pdir(
1204        struct ioc *ioc,
1205        struct scatterlist *startsg,
1206        int nents)
1207{
1208        struct scatterlist *dma_sg = startsg;   /* pointer to current DMA */
1209        int n_mappings = 0;
1210        u64 *pdirp = NULL;
1211        unsigned long dma_offset = 0;
1212
1213        while (nents-- > 0) {
1214                int     cnt = startsg->dma_length;
1215                startsg->dma_length = 0;
1216
1217#ifdef DEBUG_LARGE_SG_ENTRIES
1218                if (dump_run_sg)
1219                        printk(" %2d : %08lx/%05x %p\n",
1220                                nents, startsg->dma_address, cnt,
1221                                sba_sg_address(startsg));
1222#else
1223                DBG_RUN_SG(" %d : %08lx/%05x %p\n",
1224                                nents, startsg->dma_address, cnt,
1225                                sba_sg_address(startsg));
1226#endif
1227                /*
1228                ** Look for the start of a new DMA stream
1229                */
1230                if (startsg->dma_address & PIDE_FLAG) {
1231                        u32 pide = startsg->dma_address & ~PIDE_FLAG;
1232                        dma_offset = (unsigned long) pide & ~iovp_mask;
1233                        startsg->dma_address = 0;
1234                        if (n_mappings)
1235                                dma_sg = sg_next(dma_sg);
1236                        dma_sg->dma_address = pide | ioc->ibase;
1237                        pdirp = &(ioc->pdir_base[pide >> iovp_shift]);
1238                        n_mappings++;
1239                }
1240
1241                /*
1242                ** Look for a VCONTIG chunk
1243                */
1244                if (cnt) {
1245                        unsigned long vaddr = (unsigned long) sba_sg_address(startsg);
1246                        ASSERT(pdirp);
1247
1248                        /* Since multiple Vcontig blocks could make up
1249                        ** one DMA stream, *add* cnt to dma_len.
1250                        */
1251                        dma_sg->dma_length += cnt;
1252                        cnt += dma_offset;
1253                        dma_offset=0;   /* only want offset on first chunk */
1254                        cnt = ROUNDUP(cnt, iovp_size);
1255                        do {
1256                                sba_io_pdir_entry(pdirp, vaddr);
1257                                vaddr += iovp_size;
1258                                cnt -= iovp_size;
1259                                pdirp++;
1260                        } while (cnt > 0);
1261                }
1262                startsg = sg_next(startsg);
1263        }
1264        /* force pdir update */
1265        wmb();
1266
1267#ifdef DEBUG_LARGE_SG_ENTRIES
1268        dump_run_sg = 0;
1269#endif
1270        return(n_mappings);
1271}
1272
1273
1274/*
1275** Two address ranges are DMA contiguous *iff* "end of prev" and
1276** "start of next" are both on an IOV page boundary.
1277**
1278** (shift left is a quick trick to mask off upper bits)
1279*/
1280#define DMA_CONTIG(__X, __Y) \
1281        (((((unsigned long) __X) | ((unsigned long) __Y)) << (BITS_PER_LONG - iovp_shift)) == 0UL)
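/*
** Illustration only: a hypothetical sba_example_dma_contig() helper (never
** built).  Shifting left by (BITS_PER_LONG - iovp_shift) keeps only the low
** iovp_shift bits of each address, so the test is true iff both addresses
** sit on an IOV page boundary (values below assume iovp_shift == 12).
*/
#if 0
static void sba_example_dma_contig(void)
{
        int ok  = DMA_CONTIG(0xa000UL, 0xb000UL);       /* both page aligned -> 1 */
        int bad = DMA_CONTIG(0xa800UL, 0xb000UL);       /* 0xa800 is mid-page -> 0 */

        (void) ok;
        (void) bad;
}
#endif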
1282
1283
1284/**
1285 * sba_coalesce_chunks - preprocess the SG list
1286 * @ioc: IO MMU structure which owns the pdir we are interested in.
1287 * @startsg:  list of IOVA/size pairs
1288 * @nents: number of entries in startsg list
1289 *
1290 * First pass is to walk the SG list and determine where the breaks are
1291 * in the DMA stream. Allocates PDIR entries but does not fill them.
1292 * Returns the number of DMA chunks.
1293 *
1294 * Doing the fill separate from the coalescing/allocation keeps the
1295 * code simpler. Future enhancement could make one pass through
1296 * the sglist do both.
1297 */
1298static SBA_INLINE int
1299sba_coalesce_chunks(struct ioc *ioc, struct device *dev,
1300        struct scatterlist *startsg,
1301        int nents)
1302{
1303        struct scatterlist *vcontig_sg;    /* VCONTIG chunk head */
1304        unsigned long vcontig_len;         /* len of VCONTIG chunk */
1305        unsigned long vcontig_end;
1306        struct scatterlist *dma_sg;        /* next DMA stream head */
1307        unsigned long dma_offset, dma_len; /* start/len of DMA stream */
1308        int n_mappings = 0;
1309        unsigned int max_seg_size = dma_get_max_seg_size(dev);
1310        int idx;
1311
1312        while (nents > 0) {
1313                unsigned long vaddr = (unsigned long) sba_sg_address(startsg);
1314
1315                /*
1316                ** Prepare for first/next DMA stream
1317                */
1318                dma_sg = vcontig_sg = startsg;
1319                dma_len = vcontig_len = vcontig_end = startsg->length;
1320                vcontig_end +=  vaddr;
1321                dma_offset = vaddr & ~iovp_mask;
1322
1323                /* PARANOID: clear entries */
1324                startsg->dma_address = startsg->dma_length = 0;
1325
1326                /*
1327                ** This loop terminates one iteration "early" since
1328                ** it's always looking one "ahead".
1329                */
1330                while (--nents > 0) {
1331                        unsigned long vaddr;    /* tmp */
1332
1333                        startsg = sg_next(startsg);
1334
1335                        /* PARANOID */
1336                        startsg->dma_address = startsg->dma_length = 0;
1337
1338                        /* catch brokenness in SCSI layer */
1339                        ASSERT(startsg->length <= DMA_CHUNK_SIZE);
1340
1341                        /*
1342                        ** First make sure current dma stream won't
1343                        ** exceed DMA_CHUNK_SIZE if we coalesce the
1344                        ** next entry.
1345                        */
1346                        if (((dma_len + dma_offset + startsg->length + ~iovp_mask) & iovp_mask)
1347                            > DMA_CHUNK_SIZE)
1348                                break;
1349
1350                        if (dma_len + startsg->length > max_seg_size)
1351                                break;
1352
1353                        /*
1354                        ** Then look for virtually contiguous blocks.
1355                        **
1356                        ** append the next transaction?
1357                        */
1358                        vaddr = (unsigned long) sba_sg_address(startsg);
1359                        if  (vcontig_end == vaddr)
1360                        {
1361                                vcontig_len += startsg->length;
1362                                vcontig_end += startsg->length;
1363                                dma_len     += startsg->length;
1364                                continue;
1365                        }
1366
1367#ifdef DEBUG_LARGE_SG_ENTRIES
1368                        dump_run_sg = (vcontig_len > iovp_size);
1369#endif
1370
1371                        /*
1372                        ** Not virtually contiguous.
1373                        ** Terminate prev chunk.
1374                        ** Start a new chunk.
1375                        **
1376                        ** Once we start a new VCONTIG chunk, dma_offset
1377                        ** can't change. And we need the offset from the first
 1378                        ** chunk - not the last one. Ergo successive chunks
 1379                        ** must start on page boundaries and dovetail
 1380                        ** with their predecessors.
1381                        */
1382                        vcontig_sg->dma_length = vcontig_len;
1383
1384                        vcontig_sg = startsg;
1385                        vcontig_len = startsg->length;
1386
1387                        /*
1388                        ** 3) do the entries end/start on page boundaries?
1389                        **    Don't update vcontig_end until we've checked.
1390                        */
1391                        if (DMA_CONTIG(vcontig_end, vaddr))
1392                        {
1393                                vcontig_end = vcontig_len + vaddr;
1394                                dma_len += vcontig_len;
1395                                continue;
1396                        } else {
1397                                break;
1398                        }
1399                }
1400
1401                /*
1402                ** End of DMA Stream
1403                ** Terminate last VCONTIG block.
1404                ** Allocate space for DMA stream.
1405                */
1406                vcontig_sg->dma_length = vcontig_len;
1407                dma_len = (dma_len + dma_offset + ~iovp_mask) & iovp_mask;
1408                ASSERT(dma_len <= DMA_CHUNK_SIZE);
1409                idx = sba_alloc_range(ioc, dev, dma_len);
1410                if (idx < 0) {
1411                        dma_sg->dma_length = 0;
1412                        return -1;
1413                }
1414                dma_sg->dma_address = (dma_addr_t)(PIDE_FLAG | (idx << iovp_shift)
1415                                                   | dma_offset);
1416                n_mappings++;
1417        }
1418
1419        return n_mappings;
1420}
1421
1422static void sba_unmap_sg_attrs(struct device *dev, struct scatterlist *sglist,
1423                               int nents, enum dma_data_direction dir,
1424                               unsigned long attrs);
1425/**
 1426 * sba_map_sg_attrs - map Scatter/Gather list
1427 * @dev: instance of PCI owned by the driver that's asking.
1428 * @sglist:  array of buffer/length pairs
1429 * @nents:  number of entries in list
1430 * @dir:  R/W or both.
1431 * @attrs: optional dma attributes
1432 *
1433 * See Documentation/DMA-API-HOWTO.txt
1434 */
1435static int sba_map_sg_attrs(struct device *dev, struct scatterlist *sglist,
1436                            int nents, enum dma_data_direction dir,
1437                            unsigned long attrs)
1438{
1439        struct ioc *ioc;
1440        int coalesced, filled = 0;
1441#ifdef ASSERT_PDIR_SANITY
1442        unsigned long flags;
1443#endif
1444#ifdef ALLOW_IOV_BYPASS_SG
1445        struct scatterlist *sg;
1446#endif
1447
1448        DBG_RUN_SG("%s() START %d entries\n", __func__, nents);
1449        ioc = GET_IOC(dev);
1450        ASSERT(ioc);
1451
1452#ifdef ALLOW_IOV_BYPASS_SG
1453        ASSERT(to_pci_dev(dev)->dma_mask);
1454        if (likely((ioc->dma_mask & ~to_pci_dev(dev)->dma_mask) == 0)) {
1455                for_each_sg(sglist, sg, nents, filled) {
1456                        sg->dma_length = sg->length;
1457                        sg->dma_address = virt_to_phys(sba_sg_address(sg));
1458                }
1459                return filled;
1460        }
1461#endif
1462        /* Fast path single entry scatterlists. */
1463        if (nents == 1) {
1464                sglist->dma_length = sglist->length;
1465                sglist->dma_address = sba_map_page(dev, sg_page(sglist),
1466                                sglist->offset, sglist->length, dir, attrs);
1467                if (dma_mapping_error(dev, sglist->dma_address))
1468                        return 0;
1469                return 1;
1470        }
1471
1472#ifdef ASSERT_PDIR_SANITY
1473        spin_lock_irqsave(&ioc->res_lock, flags);
1474        if (sba_check_pdir(ioc,"Check before sba_map_sg_attrs()"))
1475        {
1476                sba_dump_sg(ioc, sglist, nents);
1477                panic("Check before sba_map_sg_attrs()");
1478        }
1479        spin_unlock_irqrestore(&ioc->res_lock, flags);
1480#endif
1481
1482        prefetch(ioc->res_hint);
1483
1484        /*
1485        ** First coalesce the chunks and allocate I/O pdir space
1486        **
1487        ** If this is one DMA stream, we can properly map using the
1488        ** correct virtual address associated with each DMA page.
1489        ** Without this association, we wouldn't have coherent DMA!
1490        ** Access to the virtual address is what forces a two-pass algorithm.
1491        */
1492        coalesced = sba_coalesce_chunks(ioc, dev, sglist, nents);
1493        if (coalesced < 0) {
1494                sba_unmap_sg_attrs(dev, sglist, nents, dir, attrs);
1495                return 0;
1496        }
1497
1498        /*
1499        ** Program the I/O Pdir
1500        **
1501        ** map the virtual addresses to the I/O Pdir
1502        ** o dma_address will contain the pdir index
1503        ** o dma_len will contain the number of bytes to map
1504        ** o address contains the virtual address.
1505        */
1506        filled = sba_fill_pdir(ioc, sglist, nents);
1507
1508#ifdef ASSERT_PDIR_SANITY
1509        spin_lock_irqsave(&ioc->res_lock, flags);
1510        if (sba_check_pdir(ioc,"Check after sba_map_sg_attrs()"))
1511        {
1512                sba_dump_sg(ioc, sglist, nents);
1513                panic("Check after sba_map_sg_attrs()\n");
1514        }
1515        spin_unlock_irqrestore(&ioc->res_lock, flags);
1516#endif
1517
1518        ASSERT(coalesced == filled);
1519        DBG_RUN_SG("%s() DONE %d mappings\n", __func__, filled);
1520
1521        return filled;
1522}
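
/*
** Illustrative sketch, not part of the driver (the hypothetical
** SBA_DOC_EXAMPLES guard is never defined): how a PCI driver reaches
** sba_map_sg_attrs()/sba_unmap_sg_attrs() through the generic DMA API.
** Because the entries are coalesced above, dma_map_sg() may return fewer
** mappings than nents, and only that count may be walked with for_each_sg().
*/
#ifdef SBA_DOC_EXAMPLES
static int sba_example_map_sg(struct device *dev, struct scatterlist *sgl,
                              int nents)
{
        struct scatterlist *sg;
        int i, count;

        count = dma_map_sg(dev, sgl, nents, DMA_TO_DEVICE);
        if (!count)
                return -ENOMEM;

        for_each_sg(sgl, sg, count, i) {
                /* program sg_dma_address(sg) / sg_dma_len(sg) into the HW */
        }

        /* when the I/O completes, unmap with the *original* nents */
        dma_unmap_sg(dev, sgl, nents, DMA_TO_DEVICE);
        return 0;
}
#endif /* SBA_DOC_EXAMPLES */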
1523
1524/**
1525 * sba_unmap_sg_attrs - unmap Scatter/Gather list
1526 * @dev: instance of PCI device owned by the driver that's asking.
1527 * @sglist:  array of buffer/length pairs
1528 * @nents:  number of entries in list
1529 * @dir:  R/W or both.
1530 * @attrs: optional dma attributes
1531 *
1532 * See Documentation/DMA-API-HOWTO.txt
1533 */
1534static void sba_unmap_sg_attrs(struct device *dev, struct scatterlist *sglist,
1535                               int nents, enum dma_data_direction dir,
1536                               unsigned long attrs)
1537{
1538#ifdef ASSERT_PDIR_SANITY
1539        struct ioc *ioc;
1540        unsigned long flags;
1541#endif
1542
1543        DBG_RUN_SG("%s() START %d entries,  %p,%x\n",
1544                   __func__, nents, sba_sg_address(sglist), sglist->length);
1545
1546#ifdef ASSERT_PDIR_SANITY
1547        ioc = GET_IOC(dev);
1548        ASSERT(ioc);
1549
1550        spin_lock_irqsave(&ioc->res_lock, flags);
1551        sba_check_pdir(ioc,"Check before sba_unmap_sg_attrs()");
1552        spin_unlock_irqrestore(&ioc->res_lock, flags);
1553#endif
1554
1555        while (nents && sglist->dma_length) {
1556
1557                sba_unmap_page(dev, sglist->dma_address, sglist->dma_length,
1558                               dir, attrs);
1559                sglist = sg_next(sglist);
1560                nents--;
1561        }
1562
1563        DBG_RUN_SG("%s() DONE (nents %d)\n", __func__,  nents);
1564
1565#ifdef ASSERT_PDIR_SANITY
1566        spin_lock_irqsave(&ioc->res_lock, flags);
1567        sba_check_pdir(ioc,"Check after sba_unmap_sg_attrs()");
1568        spin_unlock_irqrestore(&ioc->res_lock, flags);
1569#endif
1570
1571}
1572
1573/**************************************************************
1574*
1575*   Initialization and claim
1576*
1577***************************************************************/
1578
1579static void
1580ioc_iova_init(struct ioc *ioc)
1581{
1582        int tcnfg;
1583        int agp_found = 0;
1584        struct pci_dev *device = NULL;
1585#ifdef FULL_VALID_PDIR
1586        unsigned long index;
1587#endif
1588
1589        /*
1590        ** Firmware programs the base and size of a "safe IOVA space"
1591        ** (one that doesn't overlap memory or LMMIO space) in the
1592        ** IBASE and IMASK registers.
1593        */
1594        ioc->ibase = READ_REG(ioc->ioc_hpa + IOC_IBASE) & ~0x1UL;
1595        ioc->imask = READ_REG(ioc->ioc_hpa + IOC_IMASK) | 0xFFFFFFFF00000000UL;
1596
1597        ioc->iov_size = ~ioc->imask + 1;
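        /*
        ** Worked example (illustrative values only): if firmware programmed
        ** the low 32 bits of IMASK as 0xC0000000, then after OR-ing in the
        ** upper bits imask == 0xFFFFFFFFC0000000 and
        ** iov_size = ~imask + 1 = 0x40000000, i.e. a 1GB IOVA space.
        */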
1598
1599        DBG_INIT("%s() hpa %p IOV base 0x%lx mask 0x%lx (%dMB)\n",
1600                __func__, ioc->ioc_hpa, ioc->ibase, ioc->imask,
1601                ioc->iov_size >> 20);
1602
1603        switch (iovp_size) {
1604                case  4*1024: tcnfg = 0; break;
1605                case  8*1024: tcnfg = 1; break;
1606                case 16*1024: tcnfg = 2; break;
1607                case 64*1024: tcnfg = 3; break;
1608                default:
1609                        panic(PFX "Unsupported IOTLB page size %ldK",
1610                                iovp_size >> 10);
1611                        break;
1612        }
1613        WRITE_REG(tcnfg, ioc->ioc_hpa + IOC_TCNFG);
1614
1615        ioc->pdir_size = (ioc->iov_size / iovp_size) * PDIR_ENTRY_SIZE;
1616        ioc->pdir_base = (void *) __get_free_pages(GFP_KERNEL,
1617                                                   get_order(ioc->pdir_size));
1618        if (!ioc->pdir_base)
1619                panic(PFX "Couldn't allocate I/O Page Table\n");
1620
1621        memset(ioc->pdir_base, 0, ioc->pdir_size);
1622
1623        DBG_INIT("%s() IOV page size %ldK pdir %p size %x\n", __func__,
1624                iovp_size >> 10, ioc->pdir_base, ioc->pdir_size);
1625
1626        ASSERT(ALIGN((unsigned long) ioc->pdir_base, 4*1024) == (unsigned long) ioc->pdir_base);
1627        WRITE_REG(virt_to_phys(ioc->pdir_base), ioc->ioc_hpa + IOC_PDIR_BASE);
1628
1629        /*
1630        ** If an AGP device is present, only use half of the IOV space
1631        ** for PCI DMA.  Unfortunately we can't know ahead of time
1632        ** whether GART support will actually be used; for now we
1633        ** simply key on an AGP device found in the system.
1634        ** We program the next pdir index after we stop w/ a key for
1635        ** the GART code to handshake on.
1636        */
1637        for_each_pci_dev(device)
1638                agp_found |= pci_find_capability(device, PCI_CAP_ID_AGP);
1639
1640        if (agp_found && reserve_sba_gart) {
1641                printk(KERN_INFO PFX "reserving %dMB of IOVA space at 0x%lx for agpgart\n",
1642                      ioc->iov_size/2 >> 20, ioc->ibase + ioc->iov_size/2);
1643                ioc->pdir_size /= 2;
1644                ((u64 *)ioc->pdir_base)[PDIR_INDEX(ioc->iov_size/2)] = ZX1_SBA_IOMMU_COOKIE;
1645        }
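        /*
        ** Worked example (illustrative values only): with a 1GB IOVA space
        ** and 4K IOVA pages, the upper 512MB is left for agpgart, pdir_size
        ** is halved so the SBA only hands out entries 0..131071, and entry
        ** 131072 (PDIR_INDEX(512MB)) holds ZX1_SBA_IOMMU_COOKIE as the
        ** handshake marker for the GART code.
        */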
1646#ifdef FULL_VALID_PDIR
1647        /*
1648        ** Check to see if the spill page has been allocated; we don't need more
1649        ** than one across multiple SBAs.
1650        */
1651        if (!prefetch_spill_page) {
1652                char *spill_poison = "SBAIOMMU POISON";
1653                int poison_size = 16;
1654                void *poison_addr, *addr;
1655
1656                addr = (void *)__get_free_pages(GFP_KERNEL, get_order(iovp_size));
1657                if (!addr)
1658                        panic(PFX "Couldn't allocate PDIR spill page\n");
1659
1660                poison_addr = addr;
1661                for ( ; (u64) poison_addr < addr + iovp_size; poison_addr += poison_size)
1662                        memcpy(poison_addr, spill_poison, poison_size);
1663
1664                prefetch_spill_page = virt_to_phys(addr);
1665
1666                DBG_INIT("%s() prefetch spill addr: 0x%lx\n", __func__, prefetch_spill_page);
1667        }
1668        /*
1669        ** Set all the PDIR entries valid w/ the spill page as the target
1670        */
1671        for (index = 0 ; index < (ioc->pdir_size / PDIR_ENTRY_SIZE) ; index++)
1672                ((u64 *)ioc->pdir_base)[index] = (0x80000000000000FF | prefetch_spill_page);
1673#endif
1674
1675        /* Clear I/O TLB of any possible entries */
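        /*
        ** Worked example (hedged, illustrative values): the value written
        ** below is ibase | (get_iovp_order(iov_size) + iovp_shift), which
        ** presumably encodes log2 of the range being purged.  With a 1GB
        ** IOVA space and 4K IOVA pages, get_iovp_order(1GB) == 18 and
        ** iovp_shift == 12, so the PCOM write is ibase | 30, i.e. flush
        ** 2^30 bytes starting at ibase.
        */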
1676        WRITE_REG(ioc->ibase | (get_iovp_order(ioc->iov_size) + iovp_shift), ioc->ioc_hpa + IOC_PCOM);
1677        READ_REG(ioc->ioc_hpa + IOC_PCOM);
1678
1679        /* Enable IOVA translation */
1680        WRITE_REG(ioc->ibase | 1, ioc->ioc_hpa + IOC_IBASE);
1681        READ_REG(ioc->ioc_hpa + IOC_IBASE);
1682}
1683
1684static void __init
1685ioc_resource_init(struct ioc *ioc)
1686{
1687        spin_lock_init(&ioc->res_lock);
1688#if DELAYED_RESOURCE_CNT > 0
1689        spin_lock_init(&ioc->saved_lock);
1690#endif
1691
1692        /* resource map size dictated by pdir_size */
1693        ioc->res_size = ioc->pdir_size / PDIR_ENTRY_SIZE; /* entries */
1694        ioc->res_size >>= 3;  /* convert bit count to byte count */
1695        DBG_INIT("%s() res_size 0x%x\n", __func__, ioc->res_size);
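        /*
        ** Worked example (illustrative values only): a 1GB IOVA space with
        ** 4K IOVA pages means 256K pdir entries (a 2MB pdir at 8 bytes per
        ** entry), so the bitmap needs 256K bits, i.e. a 32KB res_map.
        */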
1696
1697        ioc->res_map = (char *) __get_free_pages(GFP_KERNEL,
1698                                                 get_order(ioc->res_size));
1699        if (!ioc->res_map)
1700                panic(PFX "Couldn't allocate resource map\n");
1701
1702        memset(ioc->res_map, 0, ioc->res_size);
1703        /* next available IOVP - circular search */
1704        ioc->res_hint = (unsigned long *) ioc->res_map;
1705
1706#ifdef ASSERT_PDIR_SANITY
1707        /* Mark first bit busy - ie no IOVA 0 */
1708        ioc->res_map[0] = 0x1;
1709        ioc->pdir_base[0] = 0x8000000000000000ULL | ZX1_SBA_IOMMU_COOKIE;
1710#endif
1711#ifdef FULL_VALID_PDIR
1712        /* Mark the last resource used so we don't prefetch beyond IOVA space */
1713        ioc->res_map[ioc->res_size - 1] |= 0x80UL; /* res_map is chars */
1714        ioc->pdir_base[(ioc->pdir_size / PDIR_ENTRY_SIZE) - 1] = (0x80000000000000FF
1715                                                              | prefetch_spill_page);
1716#endif
1717
1718        DBG_INIT("%s() res_map %x %p\n", __func__,
1719                 ioc->res_size, (void *) ioc->res_map);
1720}
1721
1722static void __init
1723ioc_sac_init(struct ioc *ioc)
1724{
1725        struct pci_dev *sac = NULL;
1726        struct pci_controller *controller = NULL;
1727
1728        /*
1729         * pci_alloc_coherent() must return a DMA address which is
1730         * SAC (single address cycle) addressable, so allocate a
1731         * pseudo-device to enforce that.
1732         */
1733        sac = kzalloc(sizeof(*sac), GFP_KERNEL);
1734        if (!sac)
1735                panic(PFX "Couldn't allocate struct pci_dev");
1736
1737        controller = kzalloc(sizeof(*controller), GFP_KERNEL);
1738        if (!controller)
1739                panic(PFX "Couldn't allocate struct pci_controller");
1740
1741        controller->iommu = ioc;
1742        sac->sysdata = controller;
1743        sac->dma_mask = 0xFFFFFFFFUL;
1744#ifdef CONFIG_PCI
1745        sac->dev.bus = &pci_bus_type;
1746#endif
1747        ioc->sac_only_dev = sac;
1748}
1749
1750static void __init
1751ioc_zx1_init(struct ioc *ioc)
1752{
1753        unsigned long rope_config;
1754        unsigned int i;
1755
1756        if (ioc->rev < 0x20)
1757                panic(PFX "IOC 2.0 or later required for IOMMU support\n");
1758
1759        /* 38 bit memory controller + extra bit for range displaced by MMIO */
1760        ioc->dma_mask = (0x1UL << 39) - 1;
1761
1762        /*
1763        ** Clear ROPE(N)_CONFIG AO bit.
1764        ** Disables "NT Ordering" (~= !"Relaxed Ordering")
1765        ** Overrides bit 1 in DMA Hint Sets.
1766        ** Improves netperf UDP_STREAM by ~10% for tg3 on bcm5701.
1767        */
1768        for (i=0; i<(8*8); i+=8) {
1769                rope_config = READ_REG(ioc->ioc_hpa + IOC_ROPE0_CFG + i);
1770                rope_config &= ~IOC_ROPE_AO;
1771                WRITE_REG(rope_config, ioc->ioc_hpa + IOC_ROPE0_CFG + i);
1772        }
1773}
1774
1775typedef void (initfunc)(struct ioc *);
1776
1777struct ioc_iommu {
1778        u32 func_id;
1779        char *name;
1780        initfunc *init;
1781};
1782
1783static struct ioc_iommu ioc_iommu_info[] __initdata = {
1784        { ZX1_IOC_ID, "zx1", ioc_zx1_init },
1785        { ZX2_IOC_ID, "zx2", NULL },
1786        { SX1000_IOC_ID, "sx1000", NULL },
1787        { SX2000_IOC_ID, "sx2000", NULL },
1788};
1789
1790static void __init ioc_init(unsigned long hpa, struct ioc *ioc)
1791{
1792        struct ioc_iommu *info;
1793
1794        ioc->next = ioc_list;
1795        ioc_list = ioc;
1796
1797        ioc->ioc_hpa = ioremap(hpa, 0x1000);
1798
1799        ioc->func_id = READ_REG(ioc->ioc_hpa + IOC_FUNC_ID);
1800        ioc->rev = READ_REG(ioc->ioc_hpa + IOC_FCLASS) & 0xFFUL;
1801        ioc->dma_mask = 0xFFFFFFFFFFFFFFFFUL;   /* conservative */
1802
1803        for (info = ioc_iommu_info; info < ioc_iommu_info + ARRAY_SIZE(ioc_iommu_info); info++) {
1804                if (ioc->func_id == info->func_id) {
1805                        ioc->name = info->name;
1806                        if (info->init)
1807                                (info->init)(ioc);
1808                }
1809        }
1810
1811        iovp_size = (1 << iovp_shift);
1812        iovp_mask = ~(iovp_size - 1);
1813
1814        DBG_INIT("%s: PAGE_SIZE %ldK, iovp_size %ldK\n", __func__,
1815                PAGE_SIZE >> 10, iovp_size >> 10);
1816
1817        if (!ioc->name) {
1818                ioc->name = kmalloc(24, GFP_KERNEL);
1819                if (ioc->name)
1820                        sprintf((char *) ioc->name, "Unknown (%04x:%04x)",
1821                                ioc->func_id & 0xFFFF, (ioc->func_id >> 16) & 0xFFFF);
1822                else
1823                        ioc->name = "Unknown";
1824        }
1825
1826        ioc_iova_init(ioc);
1827        ioc_resource_init(ioc);
1828        ioc_sac_init(ioc);
1829
1830        printk(KERN_INFO PFX
1831                "%s %d.%d HPA 0x%lx IOVA space %dMB at 0x%lx\n",
1832                ioc->name, (ioc->rev >> 4) & 0xF, ioc->rev & 0xF,
1833                hpa, ioc->iov_size >> 20, ioc->ibase);
1834}
1835
1836
1837
1838/**************************************************************************
1839**
1840**   SBA initialization code (HW and SW)
1841**
1842**   o identify SBA chip itself
1843**   o FIXME: initialize DMA hints for reasonable defaults
1844**
1845**************************************************************************/
1846
1847#ifdef CONFIG_PROC_FS
1848static void *
1849ioc_start(struct seq_file *s, loff_t *pos)
1850{
1851        struct ioc *ioc;
1852        loff_t n = *pos;
1853
1854        for (ioc = ioc_list; ioc; ioc = ioc->next)
1855                if (!n--)
1856                        return ioc;
1857
1858        return NULL;
1859}
1860
1861static void *
1862ioc_next(struct seq_file *s, void *v, loff_t *pos)
1863{
1864        struct ioc *ioc = v;
1865
1866        ++*pos;
1867        return ioc->next;
1868}
1869
1870static void
1871ioc_stop(struct seq_file *s, void *v)
1872{
1873}
1874
1875static int
1876ioc_show(struct seq_file *s, void *v)
1877{
1878        struct ioc *ioc = v;
1879        unsigned long *res_ptr = (unsigned long *)ioc->res_map;
1880        int i, used = 0;
1881
1882        seq_printf(s, "Hewlett Packard %s IOC rev %d.%d\n",
1883                ioc->name, ((ioc->rev >> 4) & 0xF), (ioc->rev & 0xF));
1884#ifdef CONFIG_NUMA
1885        if (ioc->node != NUMA_NO_NODE)
1886                seq_printf(s, "NUMA node       : %d\n", ioc->node);
1887#endif
1888        seq_printf(s, "IOVA size       : %ld MB\n", ((ioc->pdir_size >> 3) * iovp_size)/(1024*1024));
1889        seq_printf(s, "IOVA page size  : %ld KB\n", iovp_size/1024);
1890
1891        for (i = 0; i < (ioc->res_size / sizeof(unsigned long)); ++i, ++res_ptr)
1892                used += hweight64(*res_ptr);
1893
1894        seq_printf(s, "PDIR size       : %d entries\n", ioc->pdir_size >> 3);
1895        seq_printf(s, "PDIR used       : %d entries\n", used);
1896
1897#ifdef PDIR_SEARCH_TIMING
1898        {
1899                unsigned long i = 0, avg = 0, min, max;
1900                min = max = ioc->avg_search[0];
1901                for (i = 0; i < SBA_SEARCH_SAMPLE; i++) {
1902                        avg += ioc->avg_search[i];
1903                        if (ioc->avg_search[i] > max) max = ioc->avg_search[i];
1904                        if (ioc->avg_search[i] < min) min = ioc->avg_search[i];
1905                }
1906                avg /= SBA_SEARCH_SAMPLE;
1907                seq_printf(s, "Bitmap search   : %ld/%ld/%ld (min/avg/max CPU Cycles/IOVA page)\n",
1908                           min, avg, max);
1909        }
1910#endif
1911#ifndef ALLOW_IOV_BYPASS
1912        seq_printf(s, "IOVA bypass disabled\n");
1913#endif
1914        return 0;
1915}
1916
1917static const struct seq_operations ioc_seq_ops = {
1918        .start = ioc_start,
1919        .next  = ioc_next,
1920        .stop  = ioc_stop,
1921        .show  = ioc_show
1922};
1923
1924static void __init
1925ioc_proc_init(void)
1926{
1927        struct proc_dir_entry *dir;
1928
1929        dir = proc_mkdir("bus/mckinley", NULL);
1930        if (!dir)
1931                return;
1932
1933        proc_create_seq(ioc_list->name, 0, dir, &ioc_seq_ops);
1934}
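
/*
** Illustrative only (the file name and the numbers depend entirely on the
** system): reading the file created above, e.g. /proc/bus/mckinley/zx1,
** produces output of the form emitted by ioc_show():
**
**   Hewlett Packard zx1 IOC rev 2.3
**   IOVA size       : 1024 MB
**   IOVA page size  : 4 KB
**   PDIR size       : 262144 entries
**   PDIR used       : 23 entries
*/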
1935#endif
1936
1937static void
1938sba_connect_bus(struct pci_bus *bus)
1939{
1940        acpi_handle handle, parent;
1941        acpi_status status;
1942        struct ioc *ioc;
1943
1944        if (!PCI_CONTROLLER(bus))
1945                panic(PFX "no sysdata on bus %d!\n", bus->number);
1946
1947        if (PCI_CONTROLLER(bus)->iommu)
1948                return;
1949
1950        handle = acpi_device_handle(PCI_CONTROLLER(bus)->companion);
1951        if (!handle)
1952                return;
1953
1954        /*
1955         * The IOC scope encloses PCI root bridges in the ACPI
1956         * namespace, so work our way out until we find an IOC we
1957         * claimed previously.
1958         */
1959        do {
1960                for (ioc = ioc_list; ioc; ioc = ioc->next)
1961                        if (ioc->handle == handle) {
1962                                PCI_CONTROLLER(bus)->iommu = ioc;
1963                                return;
1964                        }
1965
1966                status = acpi_get_parent(handle, &parent);
1967                handle = parent;
1968        } while (ACPI_SUCCESS(status));
1969
1970        printk(KERN_WARNING "No IOC for PCI Bus %04x:%02x in ACPI\n", pci_domain_nr(bus), bus->number);
1971}
1972
1973static void __init
1974sba_map_ioc_to_node(struct ioc *ioc, acpi_handle handle)
1975{
1976#ifdef CONFIG_NUMA
1977        unsigned int node;
1978
1979        node = acpi_get_node(handle);
1980        if (node != NUMA_NO_NODE && !node_online(node))
1981                node = NUMA_NO_NODE;
1982
1983        ioc->node = node;
1984#endif
1985}
1986
1987static void __init acpi_sba_ioc_add(struct ioc *ioc)
1988{
1989        acpi_handle handle = ioc->handle;
1990        acpi_status status;
1991        u64 hpa, length;
1992        struct acpi_device_info *adi;
1993
1994        ioc_found = ioc->next;
1995        status = hp_acpi_csr_space(handle, &hpa, &length);
1996        if (ACPI_FAILURE(status))
1997                goto err;
1998
1999        status = acpi_get_object_info(handle, &adi);
2000        if (ACPI_FAILURE(status))
2001                goto err;
2002
2003        /*
2004         * For HWP0001, only SBA appears in ACPI namespace.  It encloses the PCI
2005         * root bridges, and its CSR space includes the IOC function.
2006         */
2007        if (strncmp("HWP0001", adi->hardware_id.string, 7) == 0) {
2008                hpa += ZX1_IOC_OFFSET;
2009                /* zx1 based systems default to kernel page size iommu pages */
2010                if (!iovp_shift)
2011                        iovp_shift = min(PAGE_SHIFT, 16);
2012        }
2013        kfree(adi);
2014
2015        /*
2016         * default anything not caught above or specified on cmdline to 4k
2017         * iommu page size
2018         */
2019        if (!iovp_shift)
2020                iovp_shift = 12;
2021
2022        ioc_init(hpa, ioc);
2023        /* setup NUMA node association */
2024        sba_map_ioc_to_node(ioc, handle);
2025        return;
2026
2027 err:
2028        kfree(ioc);
2029}
2030
2031static const struct acpi_device_id hp_ioc_iommu_device_ids[] = {
2032        {"HWP0001", 0},
2033        {"HWP0004", 0},
2034        {"", 0},
2035};
2036
2037static int acpi_sba_ioc_attach(struct acpi_device *device,
2038                               const struct acpi_device_id *not_used)
2039{
2040        struct ioc *ioc;
2041
2042        ioc = kzalloc(sizeof(*ioc), GFP_KERNEL);
2043        if (!ioc)
2044                return -ENOMEM;
2045
2046        ioc->next = ioc_found;
2047        ioc_found = ioc;
2048        ioc->handle = device->handle;
2049        return 1;
2050}
2051
2052
2053static struct acpi_scan_handler acpi_sba_ioc_handler = {
2054        .ids    = hp_ioc_iommu_device_ids,
2055        .attach = acpi_sba_ioc_attach,
2056};
2057
2058static int __init acpi_sba_ioc_init_acpi(void)
2059{
2060        return acpi_scan_add_handler(&acpi_sba_ioc_handler);
2061}
2062/* This has to run before acpi_scan_init(). */
2063arch_initcall(acpi_sba_ioc_init_acpi);
2064
2065static int __init
2066sba_init(void)
2067{
2068        if (!ia64_platform_is("hpzx1") && !ia64_platform_is("hpzx1_swiotlb"))
2069                return 0;
2070
2071#if defined(CONFIG_IA64_GENERIC)
2072        /* If we are booting a kdump kernel, the sba_iommu will
2073         * cause devices that were not shutdown properly to MCA
2074         * as soon as they are turned back on.  Our only option for
2075         * a successful kdump kernel boot is to use the swiotlb.
2076         */
2077        if (is_kdump_kernel()) {
2078                dma_ops = NULL;
2079                if (swiotlb_late_init_with_default_size(64 * (1<<20)) != 0)
2080                        panic("Unable to initialize software I/O TLB:"
2081                                  " Try machvec=dig boot option");
2082                machvec_init("dig");
2083                return 0;
2084        }
2085#endif
2086
2087        /*
2088         * ioc_found should be populated by the acpi_sba_ioc_handler's .attach()
2089         * routine, but that only happens if acpi_scan_init() has already run.
2090         */
2091        while (ioc_found)
2092                acpi_sba_ioc_add(ioc_found);
2093
2094        if (!ioc_list) {
2095#ifdef CONFIG_IA64_GENERIC
2096                /*
2097                 * If we didn't find something sba_iommu can claim, we
2098                 * need to setup the swiotlb and switch to the dig machvec.
2099                 */
2100                dma_ops = NULL;
2101                if (swiotlb_late_init_with_default_size(64 * (1<<20)) != 0)
2102                        panic("Unable to find SBA IOMMU or initialize "
2103                              "software I/O TLB: Try machvec=dig boot option");
2104                machvec_init("dig");
2105#else
2106                panic("Unable to find SBA IOMMU: Try a generic or DIG kernel");
2107#endif
2108                return 0;
2109        }
2110
2111#if defined(CONFIG_IA64_GENERIC) || defined(CONFIG_IA64_HP_ZX1_SWIOTLB)
2112        /*
2113         * hpzx1_swiotlb needs to have a fairly small swiotlb bounce
2114         * buffer setup to support devices with smaller DMA masks than
2115         * sba_iommu can handle.
2116         */
2117        if (ia64_platform_is("hpzx1_swiotlb")) {
2118                extern void hwsw_init(void);
2119
2120                hwsw_init();
2121        }
2122#endif
2123
2124#ifdef CONFIG_PCI
2125        {
2126                struct pci_bus *b = NULL;
2127                while ((b = pci_find_next_bus(b)) != NULL)
2128                        sba_connect_bus(b);
2129        }
2130#endif
2131
2132#ifdef CONFIG_PROC_FS
2133        ioc_proc_init();
2134#endif
2135        return 0;
2136}
2137
2138subsys_initcall(sba_init); /* must be initialized after ACPI etc., but before any drivers... */
2139
2140static int __init
2141nosbagart(char *str)
2142{
2143        reserve_sba_gart = 0;
2144        return 1;
2145}
2146
2147static int sba_dma_supported (struct device *dev, u64 mask)
2148{
2149        /* make sure it's at least 32bit capable */
2150        return ((mask & 0xFFFFFFFFUL) == 0xFFFFFFFFUL);
2151}
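
/*
** Illustrative sketch, not part of the driver (hypothetical SBA_DOC_EXAMPLES
** guard): what the check above means for a driver.  dma_set_mask() ends up
** calling sba_dma_supported() through the dma_map_ops, so any mask that
** covers the low 32 bits is accepted, while e.g. a 24-bit mask is rejected.
*/
#ifdef SBA_DOC_EXAMPLES
static int sba_example_set_mask(struct device *dev)
{
        /* any mask covering the low 32 bits passes sba_dma_supported() */
        return dma_set_mask(dev, DMA_BIT_MASK(32));
}
#endif /* SBA_DOC_EXAMPLES */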
2152
2153__setup("nosbagart", nosbagart);
2154
2155static int __init
2156sba_page_override(char *str)
2157{
2158        unsigned long page_size;
2159
2160        page_size = memparse(str, &str);
2161        switch (page_size) {
2162                case 4096:
2163                case 8192:
2164                case 16384:
2165                case 65536:
2166                        iovp_shift = ffs(page_size) - 1;
2167                        break;
2168                default:
2169                        printk("%s: unknown/unsupported iommu page size %ld\n",
2170                               __func__, page_size);
2171        }
2172
2173        return 1;
2174}
2175
2176__setup("sbapagesize=", sba_page_override);
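
/*
** Usage example (illustrative): booting with "sbapagesize=16384" (or
** "sbapagesize=16k", since memparse() accepts the k suffix) selects 16K
** IOVA pages: ffs(16384) - 1 == 14, so iovp_shift becomes 14.
*/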
2177
2178const struct dma_map_ops sba_dma_ops = {
2179        .alloc                  = sba_alloc_coherent,
2180        .free                   = sba_free_coherent,
2181        .map_page               = sba_map_page,
2182        .unmap_page             = sba_unmap_page,
2183        .map_sg                 = sba_map_sg_attrs,
2184        .unmap_sg               = sba_unmap_sg_attrs,
2185        .dma_supported          = sba_dma_supported,
2186};
2187
2188void sba_dma_init(void)
2189{
2190        dma_ops = &sba_dma_ops;
2191}
2192