linux/drivers/vfio/pci/vfio_pci_config.c
<<
>>
Prefs
   1/*
   2 * VFIO PCI config space virtualization
   3 *
   4 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
   5 *     Author: Alex Williamson <alex.williamson@redhat.com>
   6 *
   7 * This program is free software; you can redistribute it and/or modify
   8 * it under the terms of the GNU General Public License version 2 as
   9 * published by the Free Software Foundation.
  10 *
  11 * Derived from original vfio:
  12 * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
  13 * Author: Tom Lyon, pugs@cisco.com
  14 */
  15
  16/*
  17 * This code handles reading and writing of PCI configuration registers.
  18 * This is hairy because we want to allow a lot of flexibility to the
  19 * user driver, but cannot trust it with all of the config fields.
  20 * Tables determine which fields can be read and written, as well as
  21 * which fields are 'virtualized' - special actions and translations to
  22 * make it appear to the user that he has control, when in fact things
  23 * must be negotiated with the underlying OS.
  24 */
  25
  26#include <linux/fs.h>
  27#include <linux/pci.h>
  28#include <linux/uaccess.h>
  29#include <linux/vfio.h>
  30#include <linux/slab.h>
  31
  32#include "vfio_pci_private.h"
  33
  34#define PCI_CFG_SPACE_SIZE      256
  35
  36/* Useful "pseudo" capabilities */
  37#define PCI_CAP_ID_BASIC        0
  38#define PCI_CAP_ID_INVALID      0xFF
  39
  40#define is_bar(offset)  \
  41        ((offset >= PCI_BASE_ADDRESS_0 && offset < PCI_BASE_ADDRESS_5 + 4) || \
  42         (offset >= PCI_ROM_ADDRESS && offset < PCI_ROM_ADDRESS + 4))
  43
  44/*
  45 * Lengths of PCI Config Capabilities
  46 *   0: Removed from the user visible capability list
  47 *   FF: Variable length
  48 */
  49static u8 pci_cap_length[] = {
  50        [PCI_CAP_ID_BASIC]      = PCI_STD_HEADER_SIZEOF, /* pci config header */
  51        [PCI_CAP_ID_PM]         = PCI_PM_SIZEOF,
  52        [PCI_CAP_ID_AGP]        = PCI_AGP_SIZEOF,
  53        [PCI_CAP_ID_VPD]        = PCI_CAP_VPD_SIZEOF,
  54        [PCI_CAP_ID_SLOTID]     = 0,            /* bridge - don't care */
  55        [PCI_CAP_ID_MSI]        = 0xFF,         /* 10, 14, 20, or 24 */
  56        [PCI_CAP_ID_CHSWP]      = 0,            /* cpci - not yet */
  57        [PCI_CAP_ID_PCIX]       = 0xFF,         /* 8 or 24 */
  58        [PCI_CAP_ID_HT]         = 0xFF,         /* hypertransport */
  59        [PCI_CAP_ID_VNDR]       = 0xFF,         /* variable */
  60        [PCI_CAP_ID_DBG]        = 0,            /* debug - don't care */
  61        [PCI_CAP_ID_CCRC]       = 0,            /* cpci - not yet */
  62        [PCI_CAP_ID_SHPC]       = 0,            /* hotswap - not yet */
  63        [PCI_CAP_ID_SSVID]      = 0,            /* bridge - don't care */
  64        [PCI_CAP_ID_AGP3]       = 0,            /* AGP8x - not yet */
  65        [PCI_CAP_ID_SECDEV]     = 0,            /* secure device not yet */
  66        [PCI_CAP_ID_EXP]        = 0xFF,         /* 20 or 44 */
  67        [PCI_CAP_ID_MSIX]       = PCI_CAP_MSIX_SIZEOF,
  68        [PCI_CAP_ID_SATA]       = 0xFF,
  69        [PCI_CAP_ID_AF]         = PCI_CAP_AF_SIZEOF,
  70};
  71
  72/*
  73 * Lengths of PCIe/PCI-X Extended Config Capabilities
  74 *   0: Removed or masked from the user visible capabilty list
  75 *   FF: Variable length
  76 */
  77static u16 pci_ext_cap_length[] = {
  78        [PCI_EXT_CAP_ID_ERR]    =       PCI_ERR_ROOT_COMMAND,
  79        [PCI_EXT_CAP_ID_VC]     =       0xFF,
  80        [PCI_EXT_CAP_ID_DSN]    =       PCI_EXT_CAP_DSN_SIZEOF,
  81        [PCI_EXT_CAP_ID_PWR]    =       PCI_EXT_CAP_PWR_SIZEOF,
  82        [PCI_EXT_CAP_ID_RCLD]   =       0,      /* root only - don't care */
  83        [PCI_EXT_CAP_ID_RCILC]  =       0,      /* root only - don't care */
  84        [PCI_EXT_CAP_ID_RCEC]   =       0,      /* root only - don't care */
  85        [PCI_EXT_CAP_ID_MFVC]   =       0xFF,
  86        [PCI_EXT_CAP_ID_VC9]    =       0xFF,   /* same as CAP_ID_VC */
  87        [PCI_EXT_CAP_ID_RCRB]   =       0,      /* root only - don't care */
  88        [PCI_EXT_CAP_ID_VNDR]   =       0xFF,
  89        [PCI_EXT_CAP_ID_CAC]    =       0,      /* obsolete */
  90        [PCI_EXT_CAP_ID_ACS]    =       0xFF,
  91        [PCI_EXT_CAP_ID_ARI]    =       PCI_EXT_CAP_ARI_SIZEOF,
  92        [PCI_EXT_CAP_ID_ATS]    =       PCI_EXT_CAP_ATS_SIZEOF,
  93        [PCI_EXT_CAP_ID_SRIOV]  =       PCI_EXT_CAP_SRIOV_SIZEOF,
  94        [PCI_EXT_CAP_ID_MRIOV]  =       0,      /* not yet */
  95        [PCI_EXT_CAP_ID_MCAST]  =       PCI_EXT_CAP_MCAST_ENDPOINT_SIZEOF,
  96        [PCI_EXT_CAP_ID_PRI]    =       PCI_EXT_CAP_PRI_SIZEOF,
  97        [PCI_EXT_CAP_ID_AMD_XXX] =      0,      /* not yet */
  98        [PCI_EXT_CAP_ID_REBAR]  =       0xFF,
  99        [PCI_EXT_CAP_ID_DPA]    =       0xFF,
 100        [PCI_EXT_CAP_ID_TPH]    =       0xFF,
 101        [PCI_EXT_CAP_ID_LTR]    =       PCI_EXT_CAP_LTR_SIZEOF,
 102        [PCI_EXT_CAP_ID_SECPCI] =       0,      /* not yet */
 103        [PCI_EXT_CAP_ID_PMUX]   =       0,      /* not yet */
 104        [PCI_EXT_CAP_ID_PASID]  =       0,      /* not yet */
 105};
 106
 107/*
 108 * Read/Write Permission Bits - one bit for each bit in capability
 109 * Any field can be read if it exists, but what is read depends on
 110 * whether the field is 'virtualized', or just pass thru to the
 111 * hardware.  Any virtualized field is also virtualized for writes.
 112 * Writes are only permitted if they have a 1 bit here.
 113 */
 114struct perm_bits {
 115        u8      *virt;          /* read/write virtual data, not hw */
 116        u8      *write;         /* writeable bits */
 117        int     (*readfn)(struct vfio_pci_device *vdev, int pos, int count,
 118                          struct perm_bits *perm, int offset, __le32 *val);
 119        int     (*writefn)(struct vfio_pci_device *vdev, int pos, int count,
 120                           struct perm_bits *perm, int offset, __le32 val);
 121};
 122
 123#define NO_VIRT         0
 124#define ALL_VIRT        0xFFFFFFFFU
 125#define NO_WRITE        0
 126#define ALL_WRITE       0xFFFFFFFFU
 127
 128static int vfio_user_config_read(struct pci_dev *pdev, int offset,
 129                                 __le32 *val, int count)
 130{
 131        int ret = -EINVAL;
 132        u32 tmp_val = 0;
 133
 134        switch (count) {
 135        case 1:
 136        {
 137                u8 tmp;
 138                ret = pci_user_read_config_byte(pdev, offset, &tmp);
 139                tmp_val = tmp;
 140                break;
 141        }
 142        case 2:
 143        {
 144                u16 tmp;
 145                ret = pci_user_read_config_word(pdev, offset, &tmp);
 146                tmp_val = tmp;
 147                break;
 148        }
 149        case 4:
 150                ret = pci_user_read_config_dword(pdev, offset, &tmp_val);
 151                break;
 152        }
 153
 154        *val = cpu_to_le32(tmp_val);
 155
 156        return pcibios_err_to_errno(ret);
 157}
 158
 159static int vfio_user_config_write(struct pci_dev *pdev, int offset,
 160                                  __le32 val, int count)
 161{
 162        int ret = -EINVAL;
 163        u32 tmp_val = le32_to_cpu(val);
 164
 165        switch (count) {
 166        case 1:
 167                ret = pci_user_write_config_byte(pdev, offset, tmp_val);
 168                break;
 169        case 2:
 170                ret = pci_user_write_config_word(pdev, offset, tmp_val);
 171                break;
 172        case 4:
 173                ret = pci_user_write_config_dword(pdev, offset, tmp_val);
 174                break;
 175        }
 176
 177        return pcibios_err_to_errno(ret);
 178}
 179
 180static int vfio_default_config_read(struct vfio_pci_device *vdev, int pos,
 181                                    int count, struct perm_bits *perm,
 182                                    int offset, __le32 *val)
 183{
 184        __le32 virt = 0;
 185
 186        memcpy(val, vdev->vconfig + pos, count);
 187
 188        memcpy(&virt, perm->virt + offset, count);
 189
 190        /* Any non-virtualized bits? */
 191        if (cpu_to_le32(~0U >> (32 - (count * 8))) != virt) {
 192                struct pci_dev *pdev = vdev->pdev;
 193                __le32 phys_val = 0;
 194                int ret;
 195
 196                ret = vfio_user_config_read(pdev, pos, &phys_val, count);
 197                if (ret)
 198                        return ret;
 199
 200                *val = (phys_val & ~virt) | (*val & virt);
 201        }
 202
 203        return count;
 204}
 205
 206static int vfio_default_config_write(struct vfio_pci_device *vdev, int pos,
 207                                     int count, struct perm_bits *perm,
 208                                     int offset, __le32 val)
 209{
 210        __le32 virt = 0, write = 0;
 211
 212        memcpy(&write, perm->write + offset, count);
 213
 214        if (!write)
 215                return count; /* drop, no writable bits */
 216
 217        memcpy(&virt, perm->virt + offset, count);
 218
 219        /* Virtualized and writable bits go to vconfig */
 220        if (write & virt) {
 221                __le32 virt_val = 0;
 222
 223                memcpy(&virt_val, vdev->vconfig + pos, count);
 224
 225                virt_val &= ~(write & virt);
 226                virt_val |= (val & (write & virt));
 227
 228                memcpy(vdev->vconfig + pos, &virt_val, count);
 229        }
 230
 231        /* Non-virtualzed and writable bits go to hardware */
 232        if (write & ~virt) {
 233                struct pci_dev *pdev = vdev->pdev;
 234                __le32 phys_val = 0;
 235                int ret;
 236
 237                ret = vfio_user_config_read(pdev, pos, &phys_val, count);
 238                if (ret)
 239                        return ret;
 240
 241                phys_val &= ~(write & ~virt);
 242                phys_val |= (val & (write & ~virt));
 243
 244                ret = vfio_user_config_write(pdev, pos, phys_val, count);
 245                if (ret)
 246                        return ret;
 247        }
 248
 249        return count;
 250}
 251
 252/* Allow direct read from hardware, except for capability next pointer */
 253static int vfio_direct_config_read(struct vfio_pci_device *vdev, int pos,
 254                                   int count, struct perm_bits *perm,
 255                                   int offset, __le32 *val)
 256{
 257        int ret;
 258
 259        ret = vfio_user_config_read(vdev->pdev, pos, val, count);
 260        if (ret)
 261                return pcibios_err_to_errno(ret);
 262
 263        if (pos >= PCI_CFG_SPACE_SIZE) { /* Extended cap header mangling */
 264                if (offset < 4)
 265                        memcpy(val, vdev->vconfig + pos, count);
 266        } else if (pos >= PCI_STD_HEADER_SIZEOF) { /* Std cap mangling */
 267                if (offset == PCI_CAP_LIST_ID && count > 1)
 268                        memcpy(val, vdev->vconfig + pos,
 269                               min(PCI_CAP_FLAGS, count));
 270                else if (offset == PCI_CAP_LIST_NEXT)
 271                        memcpy(val, vdev->vconfig + pos, 1);
 272        }
 273
 274        return count;
 275}
 276
 277/* Raw access skips any kind of virtualization */
 278static int vfio_raw_config_write(struct vfio_pci_device *vdev, int pos,
 279                                 int count, struct perm_bits *perm,
 280                                 int offset, __le32 val)
 281{
 282        int ret;
 283
 284        ret = vfio_user_config_write(vdev->pdev, pos, val, count);
 285        if (ret)
 286                return ret;
 287
 288        return count;
 289}
 290
 291static int vfio_raw_config_read(struct vfio_pci_device *vdev, int pos,
 292                                int count, struct perm_bits *perm,
 293                                int offset, __le32 *val)
 294{
 295        int ret;
 296
 297        ret = vfio_user_config_read(vdev->pdev, pos, val, count);
 298        if (ret)
 299                return pcibios_err_to_errno(ret);
 300
 301        return count;
 302}
 303
 304/* Default capability regions to read-only, no-virtualization */
 305static struct perm_bits cap_perms[PCI_CAP_ID_MAX + 1] = {
 306        [0 ... PCI_CAP_ID_MAX] = { .readfn = vfio_direct_config_read }
 307};
 308static struct perm_bits ecap_perms[PCI_EXT_CAP_ID_MAX + 1] = {
 309        [0 ... PCI_EXT_CAP_ID_MAX] = { .readfn = vfio_direct_config_read }
 310};
 311/*
 312 * Default unassigned regions to raw read-write access.  Some devices
 313 * require this to function as they hide registers between the gaps in
 314 * config space (be2net).  Like MMIO and I/O port registers, we have
 315 * to trust the hardware isolation.
 316 */
 317static struct perm_bits unassigned_perms = {
 318        .readfn = vfio_raw_config_read,
 319        .writefn = vfio_raw_config_write
 320};
 321
 322static void free_perm_bits(struct perm_bits *perm)
 323{
 324        kfree(perm->virt);
 325        kfree(perm->write);
 326        perm->virt = NULL;
 327        perm->write = NULL;
 328}
 329
 330static int alloc_perm_bits(struct perm_bits *perm, int size)
 331{
 332        /*
 333         * Round up all permission bits to the next dword, this lets us
 334         * ignore whether a read/write exceeds the defined capability
 335         * structure.  We can do this because:
 336         *  - Standard config space is already dword aligned
 337         *  - Capabilities are all dword alinged (bits 0:1 of next reserved)
 338         *  - Express capabilities defined as dword aligned
 339         */
 340        size = round_up(size, 4);
 341
 342        /*
 343         * Zero state is
 344         * - All Readable, None Writeable, None Virtualized
 345         */
 346        perm->virt = kzalloc(size, GFP_KERNEL);
 347        perm->write = kzalloc(size, GFP_KERNEL);
 348        if (!perm->virt || !perm->write) {
 349                free_perm_bits(perm);
 350                return -ENOMEM;
 351        }
 352
 353        perm->readfn = vfio_default_config_read;
 354        perm->writefn = vfio_default_config_write;
 355
 356        return 0;
 357}
 358
 359/*
 360 * Helper functions for filling in permission tables
 361 */
 362static inline void p_setb(struct perm_bits *p, int off, u8 virt, u8 write)
 363{
 364        p->virt[off] = virt;
 365        p->write[off] = write;
 366}
 367
 368/* Handle endian-ness - pci and tables are little-endian */
 369static inline void p_setw(struct perm_bits *p, int off, u16 virt, u16 write)
 370{
 371        *(__le16 *)(&p->virt[off]) = cpu_to_le16(virt);
 372        *(__le16 *)(&p->write[off]) = cpu_to_le16(write);
 373}
 374
 375/* Handle endian-ness - pci and tables are little-endian */
 376static inline void p_setd(struct perm_bits *p, int off, u32 virt, u32 write)
 377{
 378        *(__le32 *)(&p->virt[off]) = cpu_to_le32(virt);
 379        *(__le32 *)(&p->write[off]) = cpu_to_le32(write);
 380}
 381
 382/*
 383 * Restore the *real* BARs after we detect a FLR or backdoor reset.
 384 * (backdoor = some device specific technique that we didn't catch)
 385 */
 386static void vfio_bar_restore(struct vfio_pci_device *vdev)
 387{
 388        struct pci_dev *pdev = vdev->pdev;
 389        u32 *rbar = vdev->rbar;
 390        int i;
 391
 392        if (pdev->is_virtfn)
 393                return;
 394
 395        pr_info("%s: %s reset recovery - restoring bars\n",
 396                __func__, dev_name(&pdev->dev));
 397
 398        for (i = PCI_BASE_ADDRESS_0; i <= PCI_BASE_ADDRESS_5; i += 4, rbar++)
 399                pci_user_write_config_dword(pdev, i, *rbar);
 400
 401        pci_user_write_config_dword(pdev, PCI_ROM_ADDRESS, *rbar);
 402}
 403
 404static __le32 vfio_generate_bar_flags(struct pci_dev *pdev, int bar)
 405{
 406        unsigned long flags = pci_resource_flags(pdev, bar);
 407        u32 val;
 408
 409        if (flags & IORESOURCE_IO)
 410                return cpu_to_le32(PCI_BASE_ADDRESS_SPACE_IO);
 411
 412        val = PCI_BASE_ADDRESS_SPACE_MEMORY;
 413
 414        if (flags & IORESOURCE_PREFETCH)
 415                val |= PCI_BASE_ADDRESS_MEM_PREFETCH;
 416
 417        if (flags & IORESOURCE_MEM_64)
 418                val |= PCI_BASE_ADDRESS_MEM_TYPE_64;
 419
 420        return cpu_to_le32(val);
 421}
 422
 423/*
 424 * Pretend we're hardware and tweak the values of the *virtual* PCI BARs
 425 * to reflect the hardware capabilities.  This implements BAR sizing.
 426 */
 427static void vfio_bar_fixup(struct vfio_pci_device *vdev)
 428{
 429        struct pci_dev *pdev = vdev->pdev;
 430        int i;
 431        __le32 *bar;
 432        u64 mask;
 433
 434        bar = (__le32 *)&vdev->vconfig[PCI_BASE_ADDRESS_0];
 435
 436        for (i = PCI_STD_RESOURCES; i <= PCI_STD_RESOURCE_END; i++, bar++) {
 437                if (!pci_resource_start(pdev, i)) {
 438                        *bar = 0; /* Unmapped by host = unimplemented to user */
 439                        continue;
 440                }
 441
 442                mask = ~(pci_resource_len(pdev, i) - 1);
 443
 444                *bar &= cpu_to_le32((u32)mask);
 445                *bar |= vfio_generate_bar_flags(pdev, i);
 446
 447                if (*bar & cpu_to_le32(PCI_BASE_ADDRESS_MEM_TYPE_64)) {
 448                        bar++;
 449                        *bar &= cpu_to_le32((u32)(mask >> 32));
 450                        i++;
 451                }
 452        }
 453
 454        bar = (__le32 *)&vdev->vconfig[PCI_ROM_ADDRESS];
 455
 456        /*
 457         * NB. we expose the actual BAR size here, regardless of whether
 458         * we can read it.  When we report the REGION_INFO for the ROM
 459         * we report what PCI tells us is the actual ROM size.
 460         */
 461        if (pci_resource_start(pdev, PCI_ROM_RESOURCE)) {
 462                mask = ~(pci_resource_len(pdev, PCI_ROM_RESOURCE) - 1);
 463                mask |= PCI_ROM_ADDRESS_ENABLE;
 464                *bar &= cpu_to_le32((u32)mask);
 465        } else
 466                *bar = 0;
 467
 468        vdev->bardirty = false;
 469}
 470
 471static int vfio_basic_config_read(struct vfio_pci_device *vdev, int pos,
 472                                  int count, struct perm_bits *perm,
 473                                  int offset, __le32 *val)
 474{
 475        if (is_bar(offset)) /* pos == offset for basic config */
 476                vfio_bar_fixup(vdev);
 477
 478        count = vfio_default_config_read(vdev, pos, count, perm, offset, val);
 479
 480        /* Mask in virtual memory enable for SR-IOV devices */
 481        if (offset == PCI_COMMAND && vdev->pdev->is_virtfn) {
 482                u16 cmd = le16_to_cpu(*(__le16 *)&vdev->vconfig[PCI_COMMAND]);
 483                u32 tmp_val = le32_to_cpu(*val);
 484
 485                tmp_val |= cmd & PCI_COMMAND_MEMORY;
 486                *val = cpu_to_le32(tmp_val);
 487        }
 488
 489        return count;
 490}
 491
 492static int vfio_basic_config_write(struct vfio_pci_device *vdev, int pos,
 493                                   int count, struct perm_bits *perm,
 494                                   int offset, __le32 val)
 495{
 496        struct pci_dev *pdev = vdev->pdev;
 497        __le16 *virt_cmd;
 498        u16 new_cmd = 0;
 499        int ret;
 500
 501        virt_cmd = (__le16 *)&vdev->vconfig[PCI_COMMAND];
 502
 503        if (offset == PCI_COMMAND) {
 504                bool phys_mem, virt_mem, new_mem, phys_io, virt_io, new_io;
 505                u16 phys_cmd;
 506
 507                ret = pci_user_read_config_word(pdev, PCI_COMMAND, &phys_cmd);
 508                if (ret)
 509                        return ret;
 510
 511                new_cmd = le32_to_cpu(val);
 512
 513                phys_mem = !!(phys_cmd & PCI_COMMAND_MEMORY);
 514                virt_mem = !!(le16_to_cpu(*virt_cmd) & PCI_COMMAND_MEMORY);
 515                new_mem = !!(new_cmd & PCI_COMMAND_MEMORY);
 516
 517                phys_io = !!(phys_cmd & PCI_COMMAND_IO);
 518                virt_io = !!(le16_to_cpu(*virt_cmd) & PCI_COMMAND_IO);
 519                new_io = !!(new_cmd & PCI_COMMAND_IO);
 520
 521                /*
 522                 * If the user is writing mem/io enable (new_mem/io) and we
 523                 * think it's already enabled (virt_mem/io), but the hardware
 524                 * shows it disabled (phys_mem/io, then the device has
 525                 * undergone some kind of backdoor reset and needs to be
 526                 * restored before we allow it to enable the bars.
 527                 * SR-IOV devices will trigger this, but we catch them later
 528                 */
 529                if ((new_mem && virt_mem && !phys_mem) ||
 530                    (new_io && virt_io && !phys_io))
 531                        vfio_bar_restore(vdev);
 532        }
 533
 534        count = vfio_default_config_write(vdev, pos, count, perm, offset, val);
 535        if (count < 0)
 536                return count;
 537
 538        /*
 539         * Save current memory/io enable bits in vconfig to allow for
 540         * the test above next time.
 541         */
 542        if (offset == PCI_COMMAND) {
 543                u16 mask = PCI_COMMAND_MEMORY | PCI_COMMAND_IO;
 544
 545                *virt_cmd &= cpu_to_le16(~mask);
 546                *virt_cmd |= cpu_to_le16(new_cmd & mask);
 547        }
 548
 549        /* Emulate INTx disable */
 550        if (offset >= PCI_COMMAND && offset <= PCI_COMMAND + 1) {
 551                bool virt_intx_disable;
 552
 553                virt_intx_disable = !!(le16_to_cpu(*virt_cmd) &
 554                                       PCI_COMMAND_INTX_DISABLE);
 555
 556                if (virt_intx_disable && !vdev->virq_disabled) {
 557                        vdev->virq_disabled = true;
 558                        vfio_pci_intx_mask(vdev);
 559                } else if (!virt_intx_disable && vdev->virq_disabled) {
 560                        vdev->virq_disabled = false;
 561                        vfio_pci_intx_unmask(vdev);
 562                }
 563        }
 564
 565        if (is_bar(offset))
 566                vdev->bardirty = true;
 567
 568        return count;
 569}
 570
 571/* Permissions for the Basic PCI Header */
 572static int __init init_pci_cap_basic_perm(struct perm_bits *perm)
 573{
 574        if (alloc_perm_bits(perm, PCI_STD_HEADER_SIZEOF))
 575                return -ENOMEM;
 576
 577        perm->readfn = vfio_basic_config_read;
 578        perm->writefn = vfio_basic_config_write;
 579
 580        /* Virtualized for SR-IOV functions, which just have FFFF */
 581        p_setw(perm, PCI_VENDOR_ID, (u16)ALL_VIRT, NO_WRITE);
 582        p_setw(perm, PCI_DEVICE_ID, (u16)ALL_VIRT, NO_WRITE);
 583
 584        /*
 585         * Virtualize INTx disable, we use it internally for interrupt
 586         * control and can emulate it for non-PCI 2.3 devices.
 587         */
 588        p_setw(perm, PCI_COMMAND, PCI_COMMAND_INTX_DISABLE, (u16)ALL_WRITE);
 589
 590        /* Virtualize capability list, we might want to skip/disable */
 591        p_setw(perm, PCI_STATUS, PCI_STATUS_CAP_LIST, NO_WRITE);
 592
 593        /* No harm to write */
 594        p_setb(perm, PCI_CACHE_LINE_SIZE, NO_VIRT, (u8)ALL_WRITE);
 595        p_setb(perm, PCI_LATENCY_TIMER, NO_VIRT, (u8)ALL_WRITE);
 596        p_setb(perm, PCI_BIST, NO_VIRT, (u8)ALL_WRITE);
 597
 598        /* Virtualize all bars, can't touch the real ones */
 599        p_setd(perm, PCI_BASE_ADDRESS_0, ALL_VIRT, ALL_WRITE);
 600        p_setd(perm, PCI_BASE_ADDRESS_1, ALL_VIRT, ALL_WRITE);
 601        p_setd(perm, PCI_BASE_ADDRESS_2, ALL_VIRT, ALL_WRITE);
 602        p_setd(perm, PCI_BASE_ADDRESS_3, ALL_VIRT, ALL_WRITE);
 603        p_setd(perm, PCI_BASE_ADDRESS_4, ALL_VIRT, ALL_WRITE);
 604        p_setd(perm, PCI_BASE_ADDRESS_5, ALL_VIRT, ALL_WRITE);
 605        p_setd(perm, PCI_ROM_ADDRESS, ALL_VIRT, ALL_WRITE);
 606
 607        /* Allow us to adjust capability chain */
 608        p_setb(perm, PCI_CAPABILITY_LIST, (u8)ALL_VIRT, NO_WRITE);
 609
 610        /* Sometimes used by sw, just virtualize */
 611        p_setb(perm, PCI_INTERRUPT_LINE, (u8)ALL_VIRT, (u8)ALL_WRITE);
 612        return 0;
 613}
 614
 615static int vfio_pm_config_write(struct vfio_pci_device *vdev, int pos,
 616                                int count, struct perm_bits *perm,
 617                                int offset, __le32 val)
 618{
 619        count = vfio_default_config_write(vdev, pos, count, perm, offset, val);
 620        if (count < 0)
 621                return count;
 622
 623        if (offset == PCI_PM_CTRL) {
 624                pci_power_t state;
 625
 626                switch (le32_to_cpu(val) & PCI_PM_CTRL_STATE_MASK) {
 627                case 0:
 628                        state = PCI_D0;
 629                        break;
 630                case 1:
 631                        state = PCI_D1;
 632                        break;
 633                case 2:
 634                        state = PCI_D2;
 635                        break;
 636                case 3:
 637                        state = PCI_D3hot;
 638                        break;
 639                }
 640
 641                pci_set_power_state(vdev->pdev, state);
 642        }
 643
 644        return count;
 645}
 646
 647/* Permissions for the Power Management capability */
 648static int __init init_pci_cap_pm_perm(struct perm_bits *perm)
 649{
 650        if (alloc_perm_bits(perm, pci_cap_length[PCI_CAP_ID_PM]))
 651                return -ENOMEM;
 652
 653        perm->writefn = vfio_pm_config_write;
 654
 655        /*
 656         * We always virtualize the next field so we can remove
 657         * capabilities from the chain if we want to.
 658         */
 659        p_setb(perm, PCI_CAP_LIST_NEXT, (u8)ALL_VIRT, NO_WRITE);
 660
 661        /*
 662         * Power management is defined *per function*, so we can let
 663         * the user change power state, but we trap and initiate the
 664         * change ourselves, so the state bits are read-only.
 665         */
 666        p_setd(perm, PCI_PM_CTRL, NO_VIRT, ~PCI_PM_CTRL_STATE_MASK);
 667        return 0;
 668}
 669
 670/* Permissions for PCI-X capability */
 671static int __init init_pci_cap_pcix_perm(struct perm_bits *perm)
 672{
 673        /* Alloc 24, but only 8 are used in v0 */
 674        if (alloc_perm_bits(perm, PCI_CAP_PCIX_SIZEOF_V2))
 675                return -ENOMEM;
 676
 677        p_setb(perm, PCI_CAP_LIST_NEXT, (u8)ALL_VIRT, NO_WRITE);
 678
 679        p_setw(perm, PCI_X_CMD, NO_VIRT, (u16)ALL_WRITE);
 680        p_setd(perm, PCI_X_ECC_CSR, NO_VIRT, ALL_WRITE);
 681        return 0;
 682}
 683
 684/* Permissions for PCI Express capability */
 685static int __init init_pci_cap_exp_perm(struct perm_bits *perm)
 686{
 687        /* Alloc larger of two possible sizes */
 688        if (alloc_perm_bits(perm, PCI_CAP_EXP_ENDPOINT_SIZEOF_V2))
 689                return -ENOMEM;
 690
 691        p_setb(perm, PCI_CAP_LIST_NEXT, (u8)ALL_VIRT, NO_WRITE);
 692
 693        /*
 694         * Allow writes to device control fields (includes FLR!)
 695         * but not to devctl_phantom which could confuse IOMMU
 696         * or to the ARI bit in devctl2 which is set at probe time
 697         */
 698        p_setw(perm, PCI_EXP_DEVCTL, NO_VIRT, ~PCI_EXP_DEVCTL_PHANTOM);
 699        p_setw(perm, PCI_EXP_DEVCTL2, NO_VIRT, ~PCI_EXP_DEVCTL2_ARI);
 700        return 0;
 701}
 702
 703/* Permissions for Advanced Function capability */
 704static int __init init_pci_cap_af_perm(struct perm_bits *perm)
 705{
 706        if (alloc_perm_bits(perm, pci_cap_length[PCI_CAP_ID_AF]))
 707                return -ENOMEM;
 708
 709        p_setb(perm, PCI_CAP_LIST_NEXT, (u8)ALL_VIRT, NO_WRITE);
 710        p_setb(perm, PCI_AF_CTRL, NO_VIRT, PCI_AF_CTRL_FLR);
 711        return 0;
 712}
 713
 714/* Permissions for Advanced Error Reporting extended capability */
 715static int __init init_pci_ext_cap_err_perm(struct perm_bits *perm)
 716{
 717        u32 mask;
 718
 719        if (alloc_perm_bits(perm, pci_ext_cap_length[PCI_EXT_CAP_ID_ERR]))
 720                return -ENOMEM;
 721
 722        /*
 723         * Virtualize the first dword of all express capabilities
 724         * because it includes the next pointer.  This lets us later
 725         * remove capabilities from the chain if we need to.
 726         */
 727        p_setd(perm, 0, ALL_VIRT, NO_WRITE);
 728
 729        /* Writable bits mask */
 730        mask =  PCI_ERR_UNC_TRAIN |             /* Training */
 731                PCI_ERR_UNC_DLP |               /* Data Link Protocol */
 732                PCI_ERR_UNC_SURPDN |            /* Surprise Down */
 733                PCI_ERR_UNC_POISON_TLP |        /* Poisoned TLP */
 734                PCI_ERR_UNC_FCP |               /* Flow Control Protocol */
 735                PCI_ERR_UNC_COMP_TIME |         /* Completion Timeout */
 736                PCI_ERR_UNC_COMP_ABORT |        /* Completer Abort */
 737                PCI_ERR_UNC_UNX_COMP |          /* Unexpected Completion */
 738                PCI_ERR_UNC_RX_OVER |           /* Receiver Overflow */
 739                PCI_ERR_UNC_MALF_TLP |          /* Malformed TLP */
 740                PCI_ERR_UNC_ECRC |              /* ECRC Error Status */
 741                PCI_ERR_UNC_UNSUP |             /* Unsupported Request */
 742                PCI_ERR_UNC_ACSV |              /* ACS Violation */
 743                PCI_ERR_UNC_INTN |              /* internal error */
 744                PCI_ERR_UNC_MCBTLP |            /* MC blocked TLP */
 745                PCI_ERR_UNC_ATOMEG |            /* Atomic egress blocked */
 746                PCI_ERR_UNC_TLPPRE;             /* TLP prefix blocked */
 747        p_setd(perm, PCI_ERR_UNCOR_STATUS, NO_VIRT, mask);
 748        p_setd(perm, PCI_ERR_UNCOR_MASK, NO_VIRT, mask);
 749        p_setd(perm, PCI_ERR_UNCOR_SEVER, NO_VIRT, mask);
 750
 751        mask =  PCI_ERR_COR_RCVR |              /* Receiver Error Status */
 752                PCI_ERR_COR_BAD_TLP |           /* Bad TLP Status */
 753                PCI_ERR_COR_BAD_DLLP |          /* Bad DLLP Status */
 754                PCI_ERR_COR_REP_ROLL |          /* REPLAY_NUM Rollover */
 755                PCI_ERR_COR_REP_TIMER |         /* Replay Timer Timeout */
 756                PCI_ERR_COR_ADV_NFAT |          /* Advisory Non-Fatal */
 757                PCI_ERR_COR_INTERNAL |          /* Corrected Internal */
 758                PCI_ERR_COR_LOG_OVER;           /* Header Log Overflow */
 759        p_setd(perm, PCI_ERR_COR_STATUS, NO_VIRT, mask);
 760        p_setd(perm, PCI_ERR_COR_MASK, NO_VIRT, mask);
 761
 762        mask =  PCI_ERR_CAP_ECRC_GENE |         /* ECRC Generation Enable */
 763                PCI_ERR_CAP_ECRC_CHKE;          /* ECRC Check Enable */
 764        p_setd(perm, PCI_ERR_CAP, NO_VIRT, mask);
 765        return 0;
 766}
 767
 768/* Permissions for Power Budgeting extended capability */
 769static int __init init_pci_ext_cap_pwr_perm(struct perm_bits *perm)
 770{
 771        if (alloc_perm_bits(perm, pci_ext_cap_length[PCI_EXT_CAP_ID_PWR]))
 772                return -ENOMEM;
 773
 774        p_setd(perm, 0, ALL_VIRT, NO_WRITE);
 775
 776        /* Writing the data selector is OK, the info is still read-only */
 777        p_setb(perm, PCI_PWR_DATA, NO_VIRT, (u8)ALL_WRITE);
 778        return 0;
 779}
 780
 781/*
 782 * Initialize the shared permission tables
 783 */
 784void vfio_pci_uninit_perm_bits(void)
 785{
 786        free_perm_bits(&cap_perms[PCI_CAP_ID_BASIC]);
 787
 788        free_perm_bits(&cap_perms[PCI_CAP_ID_PM]);
 789        free_perm_bits(&cap_perms[PCI_CAP_ID_PCIX]);
 790        free_perm_bits(&cap_perms[PCI_CAP_ID_EXP]);
 791        free_perm_bits(&cap_perms[PCI_CAP_ID_AF]);
 792
 793        free_perm_bits(&ecap_perms[PCI_EXT_CAP_ID_ERR]);
 794        free_perm_bits(&ecap_perms[PCI_EXT_CAP_ID_PWR]);
 795}
 796
 797int __init vfio_pci_init_perm_bits(void)
 798{
 799        int ret;
 800
 801        /* Basic config space */
 802        ret = init_pci_cap_basic_perm(&cap_perms[PCI_CAP_ID_BASIC]);
 803
 804        /* Capabilities */
 805        ret |= init_pci_cap_pm_perm(&cap_perms[PCI_CAP_ID_PM]);
 806        cap_perms[PCI_CAP_ID_VPD].writefn = vfio_raw_config_write;
 807        ret |= init_pci_cap_pcix_perm(&cap_perms[PCI_CAP_ID_PCIX]);
 808        cap_perms[PCI_CAP_ID_VNDR].writefn = vfio_raw_config_write;
 809        ret |= init_pci_cap_exp_perm(&cap_perms[PCI_CAP_ID_EXP]);
 810        ret |= init_pci_cap_af_perm(&cap_perms[PCI_CAP_ID_AF]);
 811
 812        /* Extended capabilities */
 813        ret |= init_pci_ext_cap_err_perm(&ecap_perms[PCI_EXT_CAP_ID_ERR]);
 814        ret |= init_pci_ext_cap_pwr_perm(&ecap_perms[PCI_EXT_CAP_ID_PWR]);
 815        ecap_perms[PCI_EXT_CAP_ID_VNDR].writefn = vfio_raw_config_write;
 816
 817        if (ret)
 818                vfio_pci_uninit_perm_bits();
 819
 820        return ret;
 821}
 822
 823static int vfio_find_cap_start(struct vfio_pci_device *vdev, int pos)
 824{
 825        u8 cap;
 826        int base = (pos >= PCI_CFG_SPACE_SIZE) ? PCI_CFG_SPACE_SIZE :
 827                                                 PCI_STD_HEADER_SIZEOF;
 828        cap = vdev->pci_config_map[pos];
 829
 830        if (cap == PCI_CAP_ID_BASIC)
 831                return 0;
 832
 833        /* XXX Can we have to abutting capabilities of the same type? */
 834        while (pos - 1 >= base && vdev->pci_config_map[pos - 1] == cap)
 835                pos--;
 836
 837        return pos;
 838}
 839
 840static int vfio_msi_config_read(struct vfio_pci_device *vdev, int pos,
 841                                int count, struct perm_bits *perm,
 842                                int offset, __le32 *val)
 843{
 844        /* Update max available queue size from msi_qmax */
 845        if (offset <= PCI_MSI_FLAGS && offset + count >= PCI_MSI_FLAGS) {
 846                __le16 *flags;
 847                int start;
 848
 849                start = vfio_find_cap_start(vdev, pos);
 850
 851                flags = (__le16 *)&vdev->vconfig[start];
 852
 853                *flags &= cpu_to_le16(~PCI_MSI_FLAGS_QMASK);
 854                *flags |= cpu_to_le16(vdev->msi_qmax << 1);
 855        }
 856
 857        return vfio_default_config_read(vdev, pos, count, perm, offset, val);
 858}
 859
 860static int vfio_msi_config_write(struct vfio_pci_device *vdev, int pos,
 861                                 int count, struct perm_bits *perm,
 862                                 int offset, __le32 val)
 863{
 864        count = vfio_default_config_write(vdev, pos, count, perm, offset, val);
 865        if (count < 0)
 866                return count;
 867
 868        /* Fixup and write configured queue size and enable to hardware */
 869        if (offset <= PCI_MSI_FLAGS && offset + count >= PCI_MSI_FLAGS) {
 870                __le16 *pflags;
 871                u16 flags;
 872                int start, ret;
 873
 874                start = vfio_find_cap_start(vdev, pos);
 875
 876                pflags = (__le16 *)&vdev->vconfig[start + PCI_MSI_FLAGS];
 877
 878                flags = le16_to_cpu(*pflags);
 879
 880                /* MSI is enabled via ioctl */
 881                if  (!is_msi(vdev))
 882                        flags &= ~PCI_MSI_FLAGS_ENABLE;
 883
 884                /* Check queue size */
 885                if ((flags & PCI_MSI_FLAGS_QSIZE) >> 4 > vdev->msi_qmax) {
 886                        flags &= ~PCI_MSI_FLAGS_QSIZE;
 887                        flags |= vdev->msi_qmax << 4;
 888                }
 889
 890                /* Write back to virt and to hardware */
 891                *pflags = cpu_to_le16(flags);
 892                ret = pci_user_write_config_word(vdev->pdev,
 893                                                 start + PCI_MSI_FLAGS,
 894                                                 flags);
 895                if (ret)
 896                        return pcibios_err_to_errno(ret);
 897        }
 898
 899        return count;
 900}
 901
 902/*
 903 * MSI determination is per-device, so this routine gets used beyond
 904 * initialization time. Don't add __init
 905 */
 906static int init_pci_cap_msi_perm(struct perm_bits *perm, int len, u16 flags)
 907{
 908        if (alloc_perm_bits(perm, len))
 909                return -ENOMEM;
 910
 911        perm->readfn = vfio_msi_config_read;
 912        perm->writefn = vfio_msi_config_write;
 913
 914        p_setb(perm, PCI_CAP_LIST_NEXT, (u8)ALL_VIRT, NO_WRITE);
 915
 916        /*
 917         * The upper byte of the control register is reserved,
 918         * just setup the lower byte.
 919         */
 920        p_setb(perm, PCI_MSI_FLAGS, (u8)ALL_VIRT, (u8)ALL_WRITE);
 921        p_setd(perm, PCI_MSI_ADDRESS_LO, ALL_VIRT, ALL_WRITE);
 922        if (flags & PCI_MSI_FLAGS_64BIT) {
 923                p_setd(perm, PCI_MSI_ADDRESS_HI, ALL_VIRT, ALL_WRITE);
 924                p_setw(perm, PCI_MSI_DATA_64, (u16)ALL_VIRT, (u16)ALL_WRITE);
 925                if (flags & PCI_MSI_FLAGS_MASKBIT) {
 926                        p_setd(perm, PCI_MSI_MASK_64, NO_VIRT, ALL_WRITE);
 927                        p_setd(perm, PCI_MSI_PENDING_64, NO_VIRT, ALL_WRITE);
 928                }
 929        } else {
 930                p_setw(perm, PCI_MSI_DATA_32, (u16)ALL_VIRT, (u16)ALL_WRITE);
 931                if (flags & PCI_MSI_FLAGS_MASKBIT) {
 932                        p_setd(perm, PCI_MSI_MASK_32, NO_VIRT, ALL_WRITE);
 933                        p_setd(perm, PCI_MSI_PENDING_32, NO_VIRT, ALL_WRITE);
 934                }
 935        }
 936        return 0;
 937}
 938
 939/* Determine MSI CAP field length; initialize msi_perms on 1st call per vdev */
 940static int vfio_msi_cap_len(struct vfio_pci_device *vdev, u8 pos)
 941{
 942        struct pci_dev *pdev = vdev->pdev;
 943        int len, ret;
 944        u16 flags;
 945
 946        ret = pci_read_config_word(pdev, pos + PCI_MSI_FLAGS, &flags);
 947        if (ret)
 948                return pcibios_err_to_errno(ret);
 949
 950        len = 10; /* Minimum size */
 951        if (flags & PCI_MSI_FLAGS_64BIT)
 952                len += 4;
 953        if (flags & PCI_MSI_FLAGS_MASKBIT)
 954                len += 10;
 955
 956        if (vdev->msi_perm)
 957                return len;
 958
 959        vdev->msi_perm = kmalloc(sizeof(struct perm_bits), GFP_KERNEL);
 960        if (!vdev->msi_perm)
 961                return -ENOMEM;
 962
 963        ret = init_pci_cap_msi_perm(vdev->msi_perm, len, flags);
 964        if (ret)
 965                return ret;
 966
 967        return len;
 968}
 969
 970/* Determine extended capability length for VC (2 & 9) and MFVC */
 971static int vfio_vc_cap_len(struct vfio_pci_device *vdev, u16 pos)
 972{
 973        struct pci_dev *pdev = vdev->pdev;
 974        u32 tmp;
 975        int ret, evcc, phases, vc_arb;
 976        int len = PCI_CAP_VC_BASE_SIZEOF;
 977
 978        ret = pci_read_config_dword(pdev, pos + PCI_VC_PORT_REG1, &tmp);
 979        if (ret)
 980                return pcibios_err_to_errno(ret);
 981
 982        evcc = tmp & PCI_VC_REG1_EVCC; /* extended vc count */
 983        ret = pci_read_config_dword(pdev, pos + PCI_VC_PORT_REG2, &tmp);
 984        if (ret)
 985                return pcibios_err_to_errno(ret);
 986
 987        if (tmp & PCI_VC_REG2_128_PHASE)
 988                phases = 128;
 989        else if (tmp & PCI_VC_REG2_64_PHASE)
 990                phases = 64;
 991        else if (tmp & PCI_VC_REG2_32_PHASE)
 992                phases = 32;
 993        else
 994                phases = 0;
 995
 996        vc_arb = phases * 4;
 997
 998        /*
 999         * Port arbitration tables are root & switch only;
1000         * function arbitration tables are function 0 only.
1001         * In either case, we'll never let user write them so
1002         * we don't care how big they are
1003         */
1004        len += (1 + evcc) * PCI_CAP_VC_PER_VC_SIZEOF;
1005        if (vc_arb) {
1006                len = round_up(len, 16);
1007                len += vc_arb / 8;
1008        }
1009        return len;
1010}
1011
1012static int vfio_cap_len(struct vfio_pci_device *vdev, u8 cap, u8 pos)
1013{
1014        struct pci_dev *pdev = vdev->pdev;
1015        u32 dword;
1016        u16 word;
1017        u8 byte;
1018        int ret;
1019
1020        switch (cap) {
1021        case PCI_CAP_ID_MSI:
1022                return vfio_msi_cap_len(vdev, pos);
1023        case PCI_CAP_ID_PCIX:
1024                ret = pci_read_config_word(pdev, pos + PCI_X_CMD, &word);
1025                if (ret)
1026                        return pcibios_err_to_errno(ret);
1027
1028                if (PCI_X_CMD_VERSION(word)) {
1029                        /* Test for extended capabilities */
1030                        pci_read_config_dword(pdev, PCI_CFG_SPACE_SIZE, &dword);
1031                        vdev->extended_caps = (dword != 0);
1032                        return PCI_CAP_PCIX_SIZEOF_V2;
1033                } else
1034                        return PCI_CAP_PCIX_SIZEOF_V0;
1035        case PCI_CAP_ID_VNDR:
1036                /* length follows next field */
1037                ret = pci_read_config_byte(pdev, pos + PCI_CAP_FLAGS, &byte);
1038                if (ret)
1039                        return pcibios_err_to_errno(ret);
1040
1041                return byte;
1042        case PCI_CAP_ID_EXP:
1043                /* Test for extended capabilities */
1044                pci_read_config_dword(pdev, PCI_CFG_SPACE_SIZE, &dword);
1045                vdev->extended_caps = (dword != 0);
1046
1047                /* length based on version */
1048                if ((pcie_caps_reg(pdev) & PCI_EXP_FLAGS_VERS) == 1)
1049                        return PCI_CAP_EXP_ENDPOINT_SIZEOF_V1;
1050                else
1051                        return PCI_CAP_EXP_ENDPOINT_SIZEOF_V2;
1052        case PCI_CAP_ID_HT:
1053                ret = pci_read_config_byte(pdev, pos + 3, &byte);
1054                if (ret)
1055                        return pcibios_err_to_errno(ret);
1056
1057                return (byte & HT_3BIT_CAP_MASK) ?
1058                        HT_CAP_SIZEOF_SHORT : HT_CAP_SIZEOF_LONG;
1059        case PCI_CAP_ID_SATA:
1060                ret = pci_read_config_byte(pdev, pos + PCI_SATA_REGS, &byte);
1061                if (ret)
1062                        return pcibios_err_to_errno(ret);
1063
1064                byte &= PCI_SATA_REGS_MASK;
1065                if (byte == PCI_SATA_REGS_INLINE)
1066                        return PCI_SATA_SIZEOF_LONG;
1067                else
1068                        return PCI_SATA_SIZEOF_SHORT;
1069        default:
1070                pr_warn("%s: %s unknown length for pci cap 0x%x@0x%x\n",
1071                        dev_name(&pdev->dev), __func__, cap, pos);
1072        }
1073
1074        return 0;
1075}
1076
1077static int vfio_ext_cap_len(struct vfio_pci_device *vdev, u16 ecap, u16 epos)
1078{
1079        struct pci_dev *pdev = vdev->pdev;
1080        u8 byte;
1081        u32 dword;
1082        int ret;
1083
1084        switch (ecap) {
1085        case PCI_EXT_CAP_ID_VNDR:
1086                ret = pci_read_config_dword(pdev, epos + PCI_VSEC_HDR, &dword);
1087                if (ret)
1088                        return pcibios_err_to_errno(ret);
1089
1090                return dword >> PCI_VSEC_HDR_LEN_SHIFT;
1091        case PCI_EXT_CAP_ID_VC:
1092        case PCI_EXT_CAP_ID_VC9:
1093        case PCI_EXT_CAP_ID_MFVC:
1094                return vfio_vc_cap_len(vdev, epos);
1095        case PCI_EXT_CAP_ID_ACS:
1096                ret = pci_read_config_byte(pdev, epos + PCI_ACS_CAP, &byte);
1097                if (ret)
1098                        return pcibios_err_to_errno(ret);
1099
1100                if (byte & PCI_ACS_EC) {
1101                        int bits;
1102
1103                        ret = pci_read_config_byte(pdev,
1104                                                   epos + PCI_ACS_EGRESS_BITS,
1105                                                   &byte);
1106                        if (ret)
1107                                return pcibios_err_to_errno(ret);
1108
1109                        bits = byte ? round_up(byte, 32) : 256;
1110                        return 8 + (bits / 8);
1111                }
1112                return 8;
1113
1114        case PCI_EXT_CAP_ID_REBAR:
1115                ret = pci_read_config_byte(pdev, epos + PCI_REBAR_CTRL, &byte);
1116                if (ret)
1117                        return pcibios_err_to_errno(ret);
1118
1119                byte &= PCI_REBAR_CTRL_NBAR_MASK;
1120                byte >>= PCI_REBAR_CTRL_NBAR_SHIFT;
1121
1122                return 4 + (byte * 8);
1123        case PCI_EXT_CAP_ID_DPA:
1124                ret = pci_read_config_byte(pdev, epos + PCI_DPA_CAP, &byte);
1125                if (ret)
1126                        return pcibios_err_to_errno(ret);
1127
1128                byte &= PCI_DPA_CAP_SUBSTATE_MASK;
1129                byte = round_up(byte + 1, 4);
1130                return PCI_DPA_BASE_SIZEOF + byte;
1131        case PCI_EXT_CAP_ID_TPH:
1132                ret = pci_read_config_dword(pdev, epos + PCI_TPH_CAP, &dword);
1133                if (ret)
1134                        return pcibios_err_to_errno(ret);
1135
1136                if ((dword & PCI_TPH_CAP_LOC_MASK) == PCI_TPH_LOC_CAP) {
1137                        int sts;
1138
1139                        sts = byte & PCI_TPH_CAP_ST_MASK;
1140                        sts >>= PCI_TPH_CAP_ST_SHIFT;
1141                        return PCI_TPH_BASE_SIZEOF + round_up(sts * 2, 4);
1142                }
1143                return PCI_TPH_BASE_SIZEOF;
1144        default:
1145                pr_warn("%s: %s unknown length for pci ecap 0x%x@0x%x\n",
1146                        dev_name(&pdev->dev), __func__, ecap, epos);
1147        }
1148
1149        return 0;
1150}
1151
1152static int vfio_fill_vconfig_bytes(struct vfio_pci_device *vdev,
1153                                   int offset, int size)
1154{
1155        struct pci_dev *pdev = vdev->pdev;
1156        int ret = 0;
1157
1158        /*
1159         * We try to read physical config space in the largest chunks
1160         * we can, assuming that all of the fields support dword access.
1161         * pci_save_state() makes this same assumption and seems to do ok.
1162         */
1163        while (size) {
1164                int filled;
1165
1166                if (size >= 4 && !(offset % 4)) {
1167                        __le32 *dwordp = (__le32 *)&vdev->vconfig[offset];
1168                        u32 dword;
1169
1170                        ret = pci_read_config_dword(pdev, offset, &dword);
1171                        if (ret)
1172                                return ret;
1173                        *dwordp = cpu_to_le32(dword);
1174                        filled = 4;
1175                } else if (size >= 2 && !(offset % 2)) {
1176                        __le16 *wordp = (__le16 *)&vdev->vconfig[offset];
1177                        u16 word;
1178
1179                        ret = pci_read_config_word(pdev, offset, &word);
1180                        if (ret)
1181                                return ret;
1182                        *wordp = cpu_to_le16(word);
1183                        filled = 2;
1184                } else {
1185                        u8 *byte = &vdev->vconfig[offset];
1186                        ret = pci_read_config_byte(pdev, offset, byte);
1187                        if (ret)
1188                                return ret;
1189                        filled = 1;
1190                }
1191
1192                offset += filled;
1193                size -= filled;
1194        }
1195
1196        return ret;
1197}
1198
1199static int vfio_cap_init(struct vfio_pci_device *vdev)
1200{
1201        struct pci_dev *pdev = vdev->pdev;
1202        u8 *map = vdev->pci_config_map;
1203        u16 status;
1204        u8 pos, *prev, cap;
1205        int loops, ret, caps = 0;
1206
1207        /* Any capabilities? */
1208        ret = pci_read_config_word(pdev, PCI_STATUS, &status);
1209        if (ret)
1210                return ret;
1211
1212        if (!(status & PCI_STATUS_CAP_LIST))
1213                return 0; /* Done */
1214
1215        ret = pci_read_config_byte(pdev, PCI_CAPABILITY_LIST, &pos);
1216        if (ret)
1217                return ret;
1218
1219        /* Mark the previous position in case we want to skip a capability */
1220        prev = &vdev->vconfig[PCI_CAPABILITY_LIST];
1221
1222        /* We can bound our loop, capabilities are dword aligned */
1223        loops = (PCI_CFG_SPACE_SIZE - PCI_STD_HEADER_SIZEOF) / PCI_CAP_SIZEOF;
1224        while (pos && loops--) {
1225                u8 next;
1226                int i, len = 0;
1227
1228                ret = pci_read_config_byte(pdev, pos, &cap);
1229                if (ret)
1230                        return ret;
1231
1232                ret = pci_read_config_byte(pdev,
1233                                           pos + PCI_CAP_LIST_NEXT, &next);
1234                if (ret)
1235                        return ret;
1236
1237                if (cap <= PCI_CAP_ID_MAX) {
1238                        len = pci_cap_length[cap];
1239                        if (len == 0xFF) { /* Variable length */
1240                                len = vfio_cap_len(vdev, cap, pos);
1241                                if (len < 0)
1242                                        return len;
1243                        }
1244                }
1245
1246                if (!len) {
1247                        pr_info("%s: %s hiding cap 0x%x\n",
1248                                __func__, dev_name(&pdev->dev), cap);
1249                        *prev = next;
1250                        pos = next;
1251                        continue;
1252                }
1253
1254                /* Sanity check, do we overlap other capabilities? */
1255                for (i = 0; i < len; i++) {
1256                        if (likely(map[pos + i] == PCI_CAP_ID_INVALID))
1257                                continue;
1258
1259                        pr_warn("%s: %s pci config conflict @0x%x, was cap 0x%x now cap 0x%x\n",
1260                                __func__, dev_name(&pdev->dev),
1261                                pos + i, map[pos + i], cap);
1262                }
1263
1264                memset(map + pos, cap, len);
1265                ret = vfio_fill_vconfig_bytes(vdev, pos, len);
1266                if (ret)
1267                        return ret;
1268
1269                prev = &vdev->vconfig[pos + PCI_CAP_LIST_NEXT];
1270                pos = next;
1271                caps++;
1272        }
1273
1274        /* If we didn't fill any capabilities, clear the status flag */
1275        if (!caps) {
1276                __le16 *vstatus = (__le16 *)&vdev->vconfig[PCI_STATUS];
1277                *vstatus &= ~cpu_to_le16(PCI_STATUS_CAP_LIST);
1278        }
1279
1280        return 0;
1281}
1282
1283static int vfio_ecap_init(struct vfio_pci_device *vdev)
1284{
1285        struct pci_dev *pdev = vdev->pdev;
1286        u8 *map = vdev->pci_config_map;
1287        u16 epos;
1288        __le32 *prev = NULL;
1289        int loops, ret, ecaps = 0;
1290
1291        if (!vdev->extended_caps)
1292                return 0;
1293
1294        epos = PCI_CFG_SPACE_SIZE;
1295
1296        loops = (pdev->cfg_size - PCI_CFG_SPACE_SIZE) / PCI_CAP_SIZEOF;
1297
1298        while (loops-- && epos >= PCI_CFG_SPACE_SIZE) {
1299                u32 header;
1300                u16 ecap;
1301                int i, len = 0;
1302                bool hidden = false;
1303
1304                ret = pci_read_config_dword(pdev, epos, &header);
1305                if (ret)
1306                        return ret;
1307
1308                ecap = PCI_EXT_CAP_ID(header);
1309
1310                if (ecap <= PCI_EXT_CAP_ID_MAX) {
1311                        len = pci_ext_cap_length[ecap];
1312                        if (len == 0xFF) {
1313                                len = vfio_ext_cap_len(vdev, ecap, epos);
1314                                if (len < 0)
1315                                        return ret;
1316                        }
1317                }
1318
1319                if (!len) {
1320                        pr_info("%s: %s hiding ecap 0x%x@0x%x\n",
1321                                __func__, dev_name(&pdev->dev), ecap, epos);
1322
1323                        /* If not the first in the chain, we can skip over it */
1324                        if (prev) {
1325                                u32 val = epos = PCI_EXT_CAP_NEXT(header);
1326                                *prev &= cpu_to_le32(~(0xffcU << 20));
1327                                *prev |= cpu_to_le32(val << 20);
1328                                continue;
1329                        }
1330
1331                        /*
1332                         * Otherwise, fill in a placeholder, the direct
1333                         * readfn will virtualize this automatically
1334                         */
1335                        len = PCI_CAP_SIZEOF;
1336                        hidden = true;
1337                }
1338
1339                for (i = 0; i < len; i++) {
1340                        if (likely(map[epos + i] == PCI_CAP_ID_INVALID))
1341                                continue;
1342
1343                        pr_warn("%s: %s pci config conflict @0x%x, was ecap 0x%x now ecap 0x%x\n",
1344                                __func__, dev_name(&pdev->dev),
1345                                epos + i, map[epos + i], ecap);
1346                }
1347
1348                /*
1349                 * Even though ecap is 2 bytes, we're currently a long way
1350                 * from exceeding 1 byte capabilities.  If we ever make it
1351                 * up to 0xFF we'll need to up this to a two-byte, byte map.
1352                 */
1353                BUILD_BUG_ON(PCI_EXT_CAP_ID_MAX >= PCI_CAP_ID_INVALID);
1354
1355                memset(map + epos, ecap, len);
1356                ret = vfio_fill_vconfig_bytes(vdev, epos, len);
1357                if (ret)
1358                        return ret;
1359
1360                /*
1361                 * If we're just using this capability to anchor the list,
1362                 * hide the real ID.  Only count real ecaps.  XXX PCI spec
1363                 * indicates to use cap id = 0, version = 0, next = 0 if
1364                 * ecaps are absent, hope users check all the way to next.
1365                 */
1366                if (hidden)
1367                        *(__le32 *)&vdev->vconfig[epos] &=
1368                                cpu_to_le32((0xffcU << 20));
1369                else
1370                        ecaps++;
1371
1372                prev = (__le32 *)&vdev->vconfig[epos];
1373                epos = PCI_EXT_CAP_NEXT(header);
1374        }
1375
1376        if (!ecaps)
1377                *(u32 *)&vdev->vconfig[PCI_CFG_SPACE_SIZE] = 0;
1378
1379        return 0;
1380}
1381
1382/*
1383 * For each device we allocate a pci_config_map that indicates the
1384 * capability occupying each dword and thus the struct perm_bits we
1385 * use for read and write.  We also allocate a virtualized config
1386 * space which tracks reads and writes to bits that we emulate for
1387 * the user.  Initial values filled from device.
1388 *
1389 * Using shared stuct perm_bits between all vfio-pci devices saves
1390 * us from allocating cfg_size buffers for virt and write for every
1391 * device.  We could remove vconfig and allocate individual buffers
1392 * for each area requring emulated bits, but the array of pointers
1393 * would be comparable in size (at least for standard config space).
1394 */
1395int vfio_config_init(struct vfio_pci_device *vdev)
1396{
1397        struct pci_dev *pdev = vdev->pdev;
1398        u8 *map, *vconfig;
1399        int ret;
1400
1401        /*
1402         * Config space, caps and ecaps are all dword aligned, so we could
1403         * use one byte per dword to record the type.  However, there are
1404         * no requiremenst on the length of a capability, so the gap between
1405         * capabilities needs byte granularity.
1406         */
1407        map = kmalloc(pdev->cfg_size, GFP_KERNEL);
1408        if (!map)
1409                return -ENOMEM;
1410
1411        vconfig = kmalloc(pdev->cfg_size, GFP_KERNEL);
1412        if (!vconfig) {
1413                kfree(map);
1414                return -ENOMEM;
1415        }
1416
1417        vdev->pci_config_map = map;
1418        vdev->vconfig = vconfig;
1419
1420        memset(map, PCI_CAP_ID_BASIC, PCI_STD_HEADER_SIZEOF);
1421        memset(map + PCI_STD_HEADER_SIZEOF, PCI_CAP_ID_INVALID,
1422               pdev->cfg_size - PCI_STD_HEADER_SIZEOF);
1423
1424        ret = vfio_fill_vconfig_bytes(vdev, 0, PCI_STD_HEADER_SIZEOF);
1425        if (ret)
1426                goto out;
1427
1428        vdev->bardirty = true;
1429
1430        /*
1431         * XXX can we just pci_load_saved_state/pci_restore_state?
1432         * may need to rebuild vconfig after that
1433         */
1434
1435        /* For restore after reset */
1436        vdev->rbar[0] = le32_to_cpu(*(__le32 *)&vconfig[PCI_BASE_ADDRESS_0]);
1437        vdev->rbar[1] = le32_to_cpu(*(__le32 *)&vconfig[PCI_BASE_ADDRESS_1]);
1438        vdev->rbar[2] = le32_to_cpu(*(__le32 *)&vconfig[PCI_BASE_ADDRESS_2]);
1439        vdev->rbar[3] = le32_to_cpu(*(__le32 *)&vconfig[PCI_BASE_ADDRESS_3]);
1440        vdev->rbar[4] = le32_to_cpu(*(__le32 *)&vconfig[PCI_BASE_ADDRESS_4]);
1441        vdev->rbar[5] = le32_to_cpu(*(__le32 *)&vconfig[PCI_BASE_ADDRESS_5]);
1442        vdev->rbar[6] = le32_to_cpu(*(__le32 *)&vconfig[PCI_ROM_ADDRESS]);
1443
1444        if (pdev->is_virtfn) {
1445                *(__le16 *)&vconfig[PCI_VENDOR_ID] = cpu_to_le16(pdev->vendor);
1446                *(__le16 *)&vconfig[PCI_DEVICE_ID] = cpu_to_le16(pdev->device);
1447        }
1448
1449        ret = vfio_cap_init(vdev);
1450        if (ret)
1451                goto out;
1452
1453        ret = vfio_ecap_init(vdev);
1454        if (ret)
1455                goto out;
1456
1457        return 0;
1458
1459out:
1460        kfree(map);
1461        vdev->pci_config_map = NULL;
1462        kfree(vconfig);
1463        vdev->vconfig = NULL;
1464        return pcibios_err_to_errno(ret);
1465}
1466
1467void vfio_config_free(struct vfio_pci_device *vdev)
1468{
1469        kfree(vdev->vconfig);
1470        vdev->vconfig = NULL;
1471        kfree(vdev->pci_config_map);
1472        vdev->pci_config_map = NULL;
1473        kfree(vdev->msi_perm);
1474        vdev->msi_perm = NULL;
1475}
1476
1477/*
1478 * Find the remaining number of bytes in a dword that match the given
1479 * position.  Stop at either the end of the capability or the dword boundary.
1480 */
1481static size_t vfio_pci_cap_remaining_dword(struct vfio_pci_device *vdev,
1482                                           loff_t pos)
1483{
1484        u8 cap = vdev->pci_config_map[pos];
1485        size_t i;
1486
1487        for (i = 1; (pos + i) % 4 && vdev->pci_config_map[pos + i] == cap; i++)
1488                /* nop */;
1489
1490        return i;
1491}
1492
1493static ssize_t vfio_config_do_rw(struct vfio_pci_device *vdev, char __user *buf,
1494                                 size_t count, loff_t *ppos, bool iswrite)
1495{
1496        struct pci_dev *pdev = vdev->pdev;
1497        struct perm_bits *perm;
1498        __le32 val = 0;
1499        int cap_start = 0, offset;
1500        u8 cap_id;
1501        ssize_t ret;
1502
1503        if (*ppos < 0 || *ppos >= pdev->cfg_size ||
1504            *ppos + count > pdev->cfg_size)
1505                return -EFAULT;
1506
1507        /*
1508         * Chop accesses into aligned chunks containing no more than a
1509         * single capability.  Caller increments to the next chunk.
1510         */
1511        count = min(count, vfio_pci_cap_remaining_dword(vdev, *ppos));
1512        if (count >= 4 && !(*ppos % 4))
1513                count = 4;
1514        else if (count >= 2 && !(*ppos % 2))
1515                count = 2;
1516        else
1517                count = 1;
1518
1519        ret = count;
1520
1521        cap_id = vdev->pci_config_map[*ppos];
1522
1523        if (cap_id == PCI_CAP_ID_INVALID) {
1524                perm = &unassigned_perms;
1525                cap_start = *ppos;
1526        } else {
1527                if (*ppos >= PCI_CFG_SPACE_SIZE) {
1528                        WARN_ON(cap_id > PCI_EXT_CAP_ID_MAX);
1529
1530                        perm = &ecap_perms[cap_id];
1531                        cap_start = vfio_find_cap_start(vdev, *ppos);
1532                } else {
1533                        WARN_ON(cap_id > PCI_CAP_ID_MAX);
1534
1535                        perm = &cap_perms[cap_id];
1536
1537                        if (cap_id == PCI_CAP_ID_MSI)
1538                                perm = vdev->msi_perm;
1539
1540                        if (cap_id > PCI_CAP_ID_BASIC)
1541                                cap_start = vfio_find_cap_start(vdev, *ppos);
1542                }
1543        }
1544
1545        WARN_ON(!cap_start && cap_id != PCI_CAP_ID_BASIC);
1546        WARN_ON(cap_start > *ppos);
1547
1548        offset = *ppos - cap_start;
1549
1550        if (iswrite) {
1551                if (!perm->writefn)
1552                        return ret;
1553
1554                if (copy_from_user(&val, buf, count))
1555                        return -EFAULT;
1556
1557                ret = perm->writefn(vdev, *ppos, count, perm, offset, val);
1558        } else {
1559                if (perm->readfn) {
1560                        ret = perm->readfn(vdev, *ppos, count,
1561                                           perm, offset, &val);
1562                        if (ret < 0)
1563                                return ret;
1564                }
1565
1566                if (copy_to_user(buf, &val, count))
1567                        return -EFAULT;
1568        }
1569
1570        return ret;
1571}
1572
1573ssize_t vfio_pci_config_rw(struct vfio_pci_device *vdev, char __user *buf,
1574                           size_t count, loff_t *ppos, bool iswrite)
1575{
1576        size_t done = 0;
1577        int ret = 0;
1578        loff_t pos = *ppos;
1579
1580        pos &= VFIO_PCI_OFFSET_MASK;
1581
1582        while (count) {
1583                ret = vfio_config_do_rw(vdev, buf, count, &pos, iswrite);
1584                if (ret < 0)
1585                        return ret;
1586
1587                count -= ret;
1588                done += ret;
1589                buf += ret;
1590                pos += ret;
1591        }
1592
1593        *ppos += done;
1594
1595        return done;
1596}
1597