linux/drivers/pci/host/pci-hyperv.c
<<
>>
Prefs
   1/*
   2 * Copyright (c) Microsoft Corporation.
   3 *
   4 * Author:
   5 *   Jake Oshins <jakeo@microsoft.com>
   6 *
   7 * This driver acts as a paravirtual front-end for PCI Express root buses.
   8 * When a PCI Express function (either an entire device or an SR-IOV
   9 * Virtual Function) is being passed through to the VM, this driver exposes
  10 * a new bus to the guest VM.  This is modeled as a root PCI bus because
  11 * no bridges are being exposed to the VM.  In fact, with a "Generation 2"
  12 * VM within Hyper-V, there may seem to be no PCI bus at all in the VM
  13 * until a device as been exposed using this driver.
  14 *
  15 * Each root PCI bus has its own PCI domain, which is called "Segment" in
  16 * the PCI Firmware Specifications.  Thus while each device passed through
  17 * to the VM using this front-end will appear at "device 0", the domain will
  18 * be unique.  Typically, each bus will have one PCI function on it, though
  19 * this driver does support more than one.
  20 *
  21 * In order to map the interrupts from the device through to the guest VM,
  22 * this driver also implements an IRQ Domain, which handles interrupts (either
  23 * MSI or MSI-X) associated with the functions on the bus.  As interrupts are
  24 * set up, torn down, or reaffined, this driver communicates with the
  25 * underlying hypervisor to adjust the mappings in the I/O MMU so that each
  26 * interrupt will be delivered to the correct virtual processor at the right
  27 * vector.  This driver does not support level-triggered (line-based)
  28 * interrupts, and will report that the Interrupt Line register in the
  29 * function's configuration space is zero.
  30 *
  31 * The rest of this driver mostly maps PCI concepts onto underlying Hyper-V
  32 * facilities.  For instance, the configuration space of a function exposed
  33 * by Hyper-V is mapped into a single page of memory space, and the
  34 * read and write handlers for config space must be aware of this mechanism.
  35 * Similarly, device setup and teardown involves messages sent to and from
  36 * the PCI back-end driver in Hyper-V.
  37 *
  38 * This program is free software; you can redistribute it and/or modify it
  39 * under the terms of the GNU General Public License version 2 as published
  40 * by the Free Software Foundation.
  41 *
  42 * This program is distributed in the hope that it will be useful, but
  43 * WITHOUT ANY WARRANTY; without even the implied warranty of
  44 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
  45 * NON INFRINGEMENT.  See the GNU General Public License for more
  46 * details.
  47 *
  48 */
  49
  50#include <linux/kernel.h>
  51#include <linux/module.h>
  52#include <linux/pci.h>
  53#include <linux/semaphore.h>
  54#include <linux/irqdomain.h>
  55#include <asm/irqdomain.h>
  56#include <asm/apic.h>
  57#include <linux/msi.h>
  58#include <linux/hyperv.h>
  59#include <asm/mshyperv.h>
  60
  61/*
  62 * Protocol versions. The low word is the minor version, the high word the
  63 * major version.
  64 */
  65
  66#define PCI_MAKE_VERSION(major, minor) ((u32)(((major) << 16) | (major)))
  67#define PCI_MAJOR_VERSION(version) ((u32)(version) >> 16)
  68#define PCI_MINOR_VERSION(version) ((u32)(version) & 0xff)
  69
  70enum {
  71        PCI_PROTOCOL_VERSION_1_1 = PCI_MAKE_VERSION(1, 1),
  72        PCI_PROTOCOL_VERSION_CURRENT = PCI_PROTOCOL_VERSION_1_1
  73};
  74
  75#define PCI_CONFIG_MMIO_LENGTH  0x2000
  76#define CFG_PAGE_OFFSET 0x1000
  77#define CFG_PAGE_SIZE (PCI_CONFIG_MMIO_LENGTH - CFG_PAGE_OFFSET)
  78
  79#define MAX_SUPPORTED_MSI_MESSAGES 0x400
  80
  81/*
  82 * Message Types
  83 */
  84
  85enum pci_message_type {
  86        /*
  87         * Version 1.1
  88         */
  89        PCI_MESSAGE_BASE                = 0x42490000,
  90        PCI_BUS_RELATIONS               = PCI_MESSAGE_BASE + 0,
  91        PCI_QUERY_BUS_RELATIONS         = PCI_MESSAGE_BASE + 1,
  92        PCI_POWER_STATE_CHANGE          = PCI_MESSAGE_BASE + 4,
  93        PCI_QUERY_RESOURCE_REQUIREMENTS = PCI_MESSAGE_BASE + 5,
  94        PCI_QUERY_RESOURCE_RESOURCES    = PCI_MESSAGE_BASE + 6,
  95        PCI_BUS_D0ENTRY                 = PCI_MESSAGE_BASE + 7,
  96        PCI_BUS_D0EXIT                  = PCI_MESSAGE_BASE + 8,
  97        PCI_READ_BLOCK                  = PCI_MESSAGE_BASE + 9,
  98        PCI_WRITE_BLOCK                 = PCI_MESSAGE_BASE + 0xA,
  99        PCI_EJECT                       = PCI_MESSAGE_BASE + 0xB,
 100        PCI_QUERY_STOP                  = PCI_MESSAGE_BASE + 0xC,
 101        PCI_REENABLE                    = PCI_MESSAGE_BASE + 0xD,
 102        PCI_QUERY_STOP_FAILED           = PCI_MESSAGE_BASE + 0xE,
 103        PCI_EJECTION_COMPLETE           = PCI_MESSAGE_BASE + 0xF,
 104        PCI_RESOURCES_ASSIGNED          = PCI_MESSAGE_BASE + 0x10,
 105        PCI_RESOURCES_RELEASED          = PCI_MESSAGE_BASE + 0x11,
 106        PCI_INVALIDATE_BLOCK            = PCI_MESSAGE_BASE + 0x12,
 107        PCI_QUERY_PROTOCOL_VERSION      = PCI_MESSAGE_BASE + 0x13,
 108        PCI_CREATE_INTERRUPT_MESSAGE    = PCI_MESSAGE_BASE + 0x14,
 109        PCI_DELETE_INTERRUPT_MESSAGE    = PCI_MESSAGE_BASE + 0x15,
 110        PCI_MESSAGE_MAXIMUM
 111};
 112
 113/*
 114 * Structures defining the virtual PCI Express protocol.
 115 */
 116
 117union pci_version {
 118        struct {
 119                u16 minor_version;
 120                u16 major_version;
 121        } parts;
 122        u32 version;
 123} __packed;
 124
 125/*
 126 * Function numbers are 8-bits wide on Express, as interpreted through ARI,
 127 * which is all this driver does.  This representation is the one used in
 128 * Windows, which is what is expected when sending this back and forth with
 129 * the Hyper-V parent partition.
 130 */
 131union win_slot_encoding {
 132        struct {
 133                u32     func:8;
 134                u32     reserved:24;
 135        } bits;
 136        u32 slot;
 137} __packed;
 138
 139/*
 140 * Pretty much as defined in the PCI Specifications.
 141 */
 142struct pci_function_description {
 143        u16     v_id;   /* vendor ID */
 144        u16     d_id;   /* device ID */
 145        u8      rev;
 146        u8      prog_intf;
 147        u8      subclass;
 148        u8      base_class;
 149        u32     subsystem_id;
 150        union win_slot_encoding win_slot;
 151        u32     ser;    /* serial number */
 152} __packed;
 153
 154/**
 155 * struct hv_msi_desc
 156 * @vector:             IDT entry
 157 * @delivery_mode:      As defined in Intel's Programmer's
 158 *                      Reference Manual, Volume 3, Chapter 8.
 159 * @vector_count:       Number of contiguous entries in the
 160 *                      Interrupt Descriptor Table that are
 161 *                      occupied by this Message-Signaled
 162 *                      Interrupt. For "MSI", as first defined
 163 *                      in PCI 2.2, this can be between 1 and
 164 *                      32. For "MSI-X," as first defined in PCI
 165 *                      3.0, this must be 1, as each MSI-X table
 166 *                      entry would have its own descriptor.
 167 * @reserved:           Empty space
 168 * @cpu_mask:           All the target virtual processors.
 169 */
 170struct hv_msi_desc {
 171        u8      vector;
 172        u8      delivery_mode;
 173        u16     vector_count;
 174        u32     reserved;
 175        u64     cpu_mask;
 176} __packed;
 177
 178/**
 179 * struct tran_int_desc
 180 * @reserved:           unused, padding
 181 * @vector_count:       same as in hv_msi_desc
 182 * @data:               This is the "data payload" value that is
 183 *                      written by the device when it generates
 184 *                      a message-signaled interrupt, either MSI
 185 *                      or MSI-X.
 186 * @address:            This is the address to which the data
 187 *                      payload is written on interrupt
 188 *                      generation.
 189 */
 190struct tran_int_desc {
 191        u16     reserved;
 192        u16     vector_count;
 193        u32     data;
 194        u64     address;
 195} __packed;
 196
 197/*
 198 * A generic message format for virtual PCI.
 199 * Specific message formats are defined later in the file.
 200 */
 201
 202struct pci_message {
 203        u32 type;
 204} __packed;
 205
 206struct pci_child_message {
 207        struct pci_message message_type;
 208        union win_slot_encoding wslot;
 209} __packed;
 210
 211struct pci_incoming_message {
 212        struct vmpacket_descriptor hdr;
 213        struct pci_message message_type;
 214} __packed;
 215
 216struct pci_response {
 217        struct vmpacket_descriptor hdr;
 218        s32 status;                     /* negative values are failures */
 219} __packed;
 220
 221struct pci_packet {
 222        void (*completion_func)(void *context, struct pci_response *resp,
 223                                int resp_packet_size);
 224        void *compl_ctxt;
 225
 226        struct pci_message message[0];
 227};
 228
 229/*
 230 * Specific message types supporting the PCI protocol.
 231 */
 232
 233/*
 234 * Version negotiation message. Sent from the guest to the host.
 235 * The guest is free to try different versions until the host
 236 * accepts the version.
 237 *
 238 * pci_version: The protocol version requested.
 239 * is_last_attempt: If TRUE, this is the last version guest will request.
 240 * reservedz: Reserved field, set to zero.
 241 */
 242
 243struct pci_version_request {
 244        struct pci_message message_type;
 245        enum pci_message_type protocol_version;
 246} __packed;
 247
 248/*
 249 * Bus D0 Entry.  This is sent from the guest to the host when the virtual
 250 * bus (PCI Express port) is ready for action.
 251 */
 252
 253struct pci_bus_d0_entry {
 254        struct pci_message message_type;
 255        u32 reserved;
 256        u64 mmio_base;
 257} __packed;
 258
 259struct pci_bus_relations {
 260        struct pci_incoming_message incoming;
 261        u32 device_count;
 262        struct pci_function_description func[0];
 263} __packed;
 264
 265struct pci_q_res_req_response {
 266        struct vmpacket_descriptor hdr;
 267        s32 status;                     /* negative values are failures */
 268        u32 probed_bar[6];
 269} __packed;
 270
 271struct pci_set_power {
 272        struct pci_message message_type;
 273        union win_slot_encoding wslot;
 274        u32 power_state;                /* In Windows terms */
 275        u32 reserved;
 276} __packed;
 277
 278struct pci_set_power_response {
 279        struct vmpacket_descriptor hdr;
 280        s32 status;                     /* negative values are failures */
 281        union win_slot_encoding wslot;
 282        u32 resultant_state;            /* In Windows terms */
 283        u32 reserved;
 284} __packed;
 285
 286struct pci_resources_assigned {
 287        struct pci_message message_type;
 288        union win_slot_encoding wslot;
 289        u8 memory_range[0x14][6];       /* not used here */
 290        u32 msi_descriptors;
 291        u32 reserved[4];
 292} __packed;
 293
 294struct pci_create_interrupt {
 295        struct pci_message message_type;
 296        union win_slot_encoding wslot;
 297        struct hv_msi_desc int_desc;
 298} __packed;
 299
 300struct pci_create_int_response {
 301        struct pci_response response;
 302        u32 reserved;
 303        struct tran_int_desc int_desc;
 304} __packed;
 305
 306struct pci_delete_interrupt {
 307        struct pci_message message_type;
 308        union win_slot_encoding wslot;
 309        struct tran_int_desc int_desc;
 310} __packed;
 311
 312struct pci_dev_incoming {
 313        struct pci_incoming_message incoming;
 314        union win_slot_encoding wslot;
 315} __packed;
 316
 317struct pci_eject_response {
 318        struct pci_message message_type;
 319        union win_slot_encoding wslot;
 320        u32 status;
 321} __packed;
 322
 323static int pci_ring_size = (4 * PAGE_SIZE);
 324
 325/*
 326 * Definitions or interrupt steering hypercall.
 327 */
 328#define HV_PARTITION_ID_SELF            ((u64)-1)
 329#define HVCALL_RETARGET_INTERRUPT       0x7e
 330
 331struct retarget_msi_interrupt {
 332        u64     partition_id;           /* use "self" */
 333        u64     device_id;
 334        u32     source;                 /* 1 for MSI(-X) */
 335        u32     reserved1;
 336        u32     address;
 337        u32     data;
 338        u64     reserved2;
 339        u32     vector;
 340        u32     flags;
 341        u64     vp_mask;
 342} __packed;
 343
 344/*
 345 * Driver specific state.
 346 */
 347
 348enum hv_pcibus_state {
 349        hv_pcibus_init = 0,
 350        hv_pcibus_probed,
 351        hv_pcibus_installed,
 352        hv_pcibus_maximum
 353};
 354
 355struct hv_pcibus_device {
 356        struct pci_sysdata sysdata;
 357        enum hv_pcibus_state state;
 358        atomic_t remove_lock;
 359        struct hv_device *hdev;
 360        resource_size_t low_mmio_space;
 361        resource_size_t high_mmio_space;
 362        struct resource *mem_config;
 363        struct resource *low_mmio_res;
 364        struct resource *high_mmio_res;
 365        struct completion *survey_event;
 366        struct completion remove_event;
 367        struct pci_bus *pci_bus;
 368        spinlock_t config_lock; /* Avoid two threads writing index page */
 369        spinlock_t device_list_lock;    /* Protect lists below */
 370        void __iomem *cfg_addr;
 371
 372        struct semaphore enum_sem;
 373        struct list_head resources_for_children;
 374
 375        struct list_head children;
 376        struct list_head dr_list;
 377
 378        struct msi_domain_info msi_info;
 379        struct msi_controller msi_chip;
 380        struct irq_domain *irq_domain;
 381        struct retarget_msi_interrupt retarget_msi_interrupt_params;
 382        spinlock_t retarget_msi_interrupt_lock;
 383};
 384
 385/*
 386 * Tracks "Device Relations" messages from the host, which must be both
 387 * processed in order and deferred so that they don't run in the context
 388 * of the incoming packet callback.
 389 */
 390struct hv_dr_work {
 391        struct work_struct wrk;
 392        struct hv_pcibus_device *bus;
 393};
 394
 395struct hv_dr_state {
 396        struct list_head list_entry;
 397        u32 device_count;
 398        struct pci_function_description func[0];
 399};
 400
 401enum hv_pcichild_state {
 402        hv_pcichild_init = 0,
 403        hv_pcichild_requirements,
 404        hv_pcichild_resourced,
 405        hv_pcichild_ejecting,
 406        hv_pcichild_maximum
 407};
 408
 409enum hv_pcidev_ref_reason {
 410        hv_pcidev_ref_invalid = 0,
 411        hv_pcidev_ref_initial,
 412        hv_pcidev_ref_by_slot,
 413        hv_pcidev_ref_packet,
 414        hv_pcidev_ref_pnp,
 415        hv_pcidev_ref_childlist,
 416        hv_pcidev_irqdata,
 417        hv_pcidev_ref_max
 418};
 419
 420struct hv_pci_dev {
 421        /* List protected by pci_rescan_remove_lock */
 422        struct list_head list_entry;
 423        atomic_t refs;
 424        enum hv_pcichild_state state;
 425        struct pci_function_description desc;
 426        bool reported_missing;
 427        struct hv_pcibus_device *hbus;
 428        struct work_struct wrk;
 429
 430        /*
 431         * What would be observed if one wrote 0xFFFFFFFF to a BAR and then
 432         * read it back, for each of the BAR offsets within config space.
 433         */
 434        u32 probed_bar[6];
 435};
 436
 437struct hv_pci_compl {
 438        struct completion host_event;
 439        s32 completion_status;
 440};
 441
 442/**
 443 * hv_pci_generic_compl() - Invoked for a completion packet
 444 * @context:            Set up by the sender of the packet.
 445 * @resp:               The response packet
 446 * @resp_packet_size:   Size in bytes of the packet
 447 *
 448 * This function is used to trigger an event and report status
 449 * for any message for which the completion packet contains a
 450 * status and nothing else.
 451 */
 452static void hv_pci_generic_compl(void *context, struct pci_response *resp,
 453                                 int resp_packet_size)
 454{
 455        struct hv_pci_compl *comp_pkt = context;
 456
 457        if (resp_packet_size >= offsetofend(struct pci_response, status))
 458                comp_pkt->completion_status = resp->status;
 459        else
 460                comp_pkt->completion_status = -1;
 461
 462        complete(&comp_pkt->host_event);
 463}
 464
 465static struct hv_pci_dev *get_pcichild_wslot(struct hv_pcibus_device *hbus,
 466                                                u32 wslot);
 467static void get_pcichild(struct hv_pci_dev *hv_pcidev,
 468                         enum hv_pcidev_ref_reason reason);
 469static void put_pcichild(struct hv_pci_dev *hv_pcidev,
 470                         enum hv_pcidev_ref_reason reason);
 471
 472static void get_hvpcibus(struct hv_pcibus_device *hv_pcibus);
 473static void put_hvpcibus(struct hv_pcibus_device *hv_pcibus);
 474
 475/**
 476 * devfn_to_wslot() - Convert from Linux PCI slot to Windows
 477 * @devfn:      The Linux representation of PCI slot
 478 *
 479 * Windows uses a slightly different representation of PCI slot.
 480 *
 481 * Return: The Windows representation
 482 */
 483static u32 devfn_to_wslot(int devfn)
 484{
 485        union win_slot_encoding wslot;
 486
 487        wslot.slot = 0;
 488        wslot.bits.func = PCI_SLOT(devfn) | (PCI_FUNC(devfn) << 5);
 489
 490        return wslot.slot;
 491}
 492
 493/**
 494 * wslot_to_devfn() - Convert from Windows PCI slot to Linux
 495 * @wslot:      The Windows representation of PCI slot
 496 *
 497 * Windows uses a slightly different representation of PCI slot.
 498 *
 499 * Return: The Linux representation
 500 */
 501static int wslot_to_devfn(u32 wslot)
 502{
 503        union win_slot_encoding slot_no;
 504
 505        slot_no.slot = wslot;
 506        return PCI_DEVFN(0, slot_no.bits.func);
 507}
 508
 509/*
 510 * PCI Configuration Space for these root PCI buses is implemented as a pair
 511 * of pages in memory-mapped I/O space.  Writing to the first page chooses
 512 * the PCI function being written or read.  Once the first page has been
 513 * written to, the following page maps in the entire configuration space of
 514 * the function.
 515 */
 516
 517/**
 518 * _hv_pcifront_read_config() - Internal PCI config read
 519 * @hpdev:      The PCI driver's representation of the device
 520 * @where:      Offset within config space
 521 * @size:       Size of the transfer
 522 * @val:        Pointer to the buffer receiving the data
 523 */
 524static void _hv_pcifront_read_config(struct hv_pci_dev *hpdev, int where,
 525                                     int size, u32 *val)
 526{
 527        unsigned long flags;
 528        void __iomem *addr = hpdev->hbus->cfg_addr + CFG_PAGE_OFFSET + where;
 529
 530        /*
 531         * If the attempt is to read the IDs or the ROM BAR, simulate that.
 532         */
 533        if (where + size <= PCI_COMMAND) {
 534                memcpy(val, ((u8 *)&hpdev->desc.v_id) + where, size);
 535        } else if (where >= PCI_CLASS_REVISION && where + size <=
 536                   PCI_CACHE_LINE_SIZE) {
 537                memcpy(val, ((u8 *)&hpdev->desc.rev) + where -
 538                       PCI_CLASS_REVISION, size);
 539        } else if (where >= PCI_SUBSYSTEM_VENDOR_ID && where + size <=
 540                   PCI_ROM_ADDRESS) {
 541                memcpy(val, (u8 *)&hpdev->desc.subsystem_id + where -
 542                       PCI_SUBSYSTEM_VENDOR_ID, size);
 543        } else if (where >= PCI_ROM_ADDRESS && where + size <=
 544                   PCI_CAPABILITY_LIST) {
 545                /* ROM BARs are unimplemented */
 546                *val = 0;
 547        } else if (where >= PCI_INTERRUPT_LINE && where + size <=
 548                   PCI_INTERRUPT_PIN) {
 549                /*
 550                 * Interrupt Line and Interrupt PIN are hard-wired to zero
 551                 * because this front-end only supports message-signaled
 552                 * interrupts.
 553                 */
 554                *val = 0;
 555        } else if (where + size <= CFG_PAGE_SIZE) {
 556                spin_lock_irqsave(&hpdev->hbus->config_lock, flags);
 557                /* Choose the function to be read. (See comment above) */
 558                writel(hpdev->desc.win_slot.slot, hpdev->hbus->cfg_addr);
 559                /* Make sure the function was chosen before we start reading. */
 560                mb();
 561                /* Read from that function's config space. */
 562                switch (size) {
 563                case 1:
 564                        *val = readb(addr);
 565                        break;
 566                case 2:
 567                        *val = readw(addr);
 568                        break;
 569                default:
 570                        *val = readl(addr);
 571                        break;
 572                }
 573                /*
 574                 * Make sure the write was done before we release the spinlock
 575                 * allowing consecutive reads/writes.
 576                 */
 577                mb();
 578                spin_unlock_irqrestore(&hpdev->hbus->config_lock, flags);
 579        } else {
 580                dev_err(&hpdev->hbus->hdev->device,
 581                        "Attempt to read beyond a function's config space.\n");
 582        }
 583}
 584
 585/**
 586 * _hv_pcifront_write_config() - Internal PCI config write
 587 * @hpdev:      The PCI driver's representation of the device
 588 * @where:      Offset within config space
 589 * @size:       Size of the transfer
 590 * @val:        The data being transferred
 591 */
 592static void _hv_pcifront_write_config(struct hv_pci_dev *hpdev, int where,
 593                                      int size, u32 val)
 594{
 595        unsigned long flags;
 596        void __iomem *addr = hpdev->hbus->cfg_addr + CFG_PAGE_OFFSET + where;
 597
 598        if (where >= PCI_SUBSYSTEM_VENDOR_ID &&
 599            where + size <= PCI_CAPABILITY_LIST) {
 600                /* SSIDs and ROM BARs are read-only */
 601        } else if (where >= PCI_COMMAND && where + size <= CFG_PAGE_SIZE) {
 602                spin_lock_irqsave(&hpdev->hbus->config_lock, flags);
 603                /* Choose the function to be written. (See comment above) */
 604                writel(hpdev->desc.win_slot.slot, hpdev->hbus->cfg_addr);
 605                /* Make sure the function was chosen before we start writing. */
 606                wmb();
 607                /* Write to that function's config space. */
 608                switch (size) {
 609                case 1:
 610                        writeb(val, addr);
 611                        break;
 612                case 2:
 613                        writew(val, addr);
 614                        break;
 615                default:
 616                        writel(val, addr);
 617                        break;
 618                }
 619                /*
 620                 * Make sure the write was done before we release the spinlock
 621                 * allowing consecutive reads/writes.
 622                 */
 623                mb();
 624                spin_unlock_irqrestore(&hpdev->hbus->config_lock, flags);
 625        } else {
 626                dev_err(&hpdev->hbus->hdev->device,
 627                        "Attempt to write beyond a function's config space.\n");
 628        }
 629}
 630
 631/**
 632 * hv_pcifront_read_config() - Read configuration space
 633 * @bus: PCI Bus structure
 634 * @devfn: Device/function
 635 * @where: Offset from base
 636 * @size: Byte/word/dword
 637 * @val: Value to be read
 638 *
 639 * Return: PCIBIOS_SUCCESSFUL on success
 640 *         PCIBIOS_DEVICE_NOT_FOUND on failure
 641 */
 642static int hv_pcifront_read_config(struct pci_bus *bus, unsigned int devfn,
 643                                   int where, int size, u32 *val)
 644{
 645        struct hv_pcibus_device *hbus =
 646                container_of(bus->sysdata, struct hv_pcibus_device, sysdata);
 647        struct hv_pci_dev *hpdev;
 648
 649        hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(devfn));
 650        if (!hpdev)
 651                return PCIBIOS_DEVICE_NOT_FOUND;
 652
 653        _hv_pcifront_read_config(hpdev, where, size, val);
 654
 655        put_pcichild(hpdev, hv_pcidev_ref_by_slot);
 656        return PCIBIOS_SUCCESSFUL;
 657}
 658
 659/**
 660 * hv_pcifront_write_config() - Write configuration space
 661 * @bus: PCI Bus structure
 662 * @devfn: Device/function
 663 * @where: Offset from base
 664 * @size: Byte/word/dword
 665 * @val: Value to be written to device
 666 *
 667 * Return: PCIBIOS_SUCCESSFUL on success
 668 *         PCIBIOS_DEVICE_NOT_FOUND on failure
 669 */
 670static int hv_pcifront_write_config(struct pci_bus *bus, unsigned int devfn,
 671                                    int where, int size, u32 val)
 672{
 673        struct hv_pcibus_device *hbus =
 674            container_of(bus->sysdata, struct hv_pcibus_device, sysdata);
 675        struct hv_pci_dev *hpdev;
 676
 677        hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(devfn));
 678        if (!hpdev)
 679                return PCIBIOS_DEVICE_NOT_FOUND;
 680
 681        _hv_pcifront_write_config(hpdev, where, size, val);
 682
 683        put_pcichild(hpdev, hv_pcidev_ref_by_slot);
 684        return PCIBIOS_SUCCESSFUL;
 685}
 686
 687/* PCIe operations */
 688static struct pci_ops hv_pcifront_ops = {
 689        .read  = hv_pcifront_read_config,
 690        .write = hv_pcifront_write_config,
 691};
 692
 693/* Interrupt management hooks */
 694static void hv_int_desc_free(struct hv_pci_dev *hpdev,
 695                             struct tran_int_desc *int_desc)
 696{
 697        struct pci_delete_interrupt *int_pkt;
 698        struct {
 699                struct pci_packet pkt;
 700                u8 buffer[sizeof(struct pci_delete_interrupt)];
 701        } ctxt;
 702
 703        memset(&ctxt, 0, sizeof(ctxt));
 704        int_pkt = (struct pci_delete_interrupt *)&ctxt.pkt.message;
 705        int_pkt->message_type.type =
 706                PCI_DELETE_INTERRUPT_MESSAGE;
 707        int_pkt->wslot.slot = hpdev->desc.win_slot.slot;
 708        int_pkt->int_desc = *int_desc;
 709        vmbus_sendpacket(hpdev->hbus->hdev->channel, int_pkt, sizeof(*int_pkt),
 710                         (unsigned long)&ctxt.pkt, VM_PKT_DATA_INBAND, 0);
 711        kfree(int_desc);
 712}
 713
 714/**
 715 * hv_msi_free() - Free the MSI.
 716 * @domain:     The interrupt domain pointer
 717 * @info:       Extra MSI-related context
 718 * @irq:        Identifies the IRQ.
 719 *
 720 * The Hyper-V parent partition and hypervisor are tracking the
 721 * messages that are in use, keeping the interrupt redirection
 722 * table up to date.  This callback sends a message that frees
 723 * the IRT entry and related tracking nonsense.
 724 */
 725static void hv_msi_free(struct irq_domain *domain, struct msi_domain_info *info,
 726                        unsigned int irq)
 727{
 728        struct hv_pcibus_device *hbus;
 729        struct hv_pci_dev *hpdev;
 730        struct pci_dev *pdev;
 731        struct tran_int_desc *int_desc;
 732        struct irq_data *irq_data = irq_domain_get_irq_data(domain, irq);
 733        struct msi_desc *msi = irq_data_get_msi_desc(irq_data);
 734
 735        pdev = msi_desc_to_pci_dev(msi);
 736        hbus = info->data;
 737        int_desc = irq_data_get_irq_chip_data(irq_data);
 738        if (!int_desc)
 739                return;
 740
 741        irq_data->chip_data = NULL;
 742        hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(pdev->devfn));
 743        if (!hpdev) {
 744                kfree(int_desc);
 745                return;
 746        }
 747
 748        hv_int_desc_free(hpdev, int_desc);
 749        put_pcichild(hpdev, hv_pcidev_ref_by_slot);
 750}
 751
 752static int hv_set_affinity(struct irq_data *data, const struct cpumask *dest,
 753                           bool force)
 754{
 755        struct irq_data *parent = data->parent_data;
 756
 757        return parent->chip->irq_set_affinity(parent, dest, force);
 758}
 759
 760static void hv_irq_mask(struct irq_data *data)
 761{
 762        pci_msi_mask_irq(data);
 763}
 764
 765/**
 766 * hv_irq_unmask() - "Unmask" the IRQ by setting its current
 767 * affinity.
 768 * @data:       Describes the IRQ
 769 *
 770 * Build new a destination for the MSI and make a hypercall to
 771 * update the Interrupt Redirection Table. "Device Logical ID"
 772 * is built out of this PCI bus's instance GUID and the function
 773 * number of the device.
 774 */
 775static void hv_irq_unmask(struct irq_data *data)
 776{
 777        struct msi_desc *msi_desc = irq_data_get_msi_desc(data);
 778        struct irq_cfg *cfg = irqd_cfg(data);
 779        struct retarget_msi_interrupt *params;
 780        struct hv_pcibus_device *hbus;
 781        struct cpumask *dest;
 782        struct pci_bus *pbus;
 783        struct pci_dev *pdev;
 784        int cpu;
 785        unsigned long flags;
 786
 787        dest = irq_data_get_affinity_mask(data);
 788        pdev = msi_desc_to_pci_dev(msi_desc);
 789        pbus = pdev->bus;
 790        hbus = container_of(pbus->sysdata, struct hv_pcibus_device, sysdata);
 791
 792        spin_lock_irqsave(&hbus->retarget_msi_interrupt_lock, flags);
 793
 794        params = &hbus->retarget_msi_interrupt_params;
 795        memset(params, 0, sizeof(*params));
 796        params->partition_id = HV_PARTITION_ID_SELF;
 797        params->source = 1; /* MSI(-X) */
 798        params->address = msi_desc->msg.address_lo;
 799        params->data = msi_desc->msg.data;
 800        params->device_id = (hbus->hdev->dev_instance.b[5] << 24) |
 801                           (hbus->hdev->dev_instance.b[4] << 16) |
 802                           (hbus->hdev->dev_instance.b[7] << 8) |
 803                           (hbus->hdev->dev_instance.b[6] & 0xf8) |
 804                           PCI_FUNC(pdev->devfn);
 805        params->vector = cfg->vector;
 806
 807        for_each_cpu_and(cpu, dest, cpu_online_mask)
 808                params->vp_mask |= (1ULL << vmbus_cpu_number_to_vp_number(cpu));
 809
 810        hv_do_hypercall(HVCALL_RETARGET_INTERRUPT, params, NULL);
 811
 812        spin_unlock_irqrestore(&hbus->retarget_msi_interrupt_lock, flags);
 813
 814        pci_msi_unmask_irq(data);
 815}
 816
 817struct compose_comp_ctxt {
 818        struct hv_pci_compl comp_pkt;
 819        struct tran_int_desc int_desc;
 820};
 821
 822static void hv_pci_compose_compl(void *context, struct pci_response *resp,
 823                                 int resp_packet_size)
 824{
 825        struct compose_comp_ctxt *comp_pkt = context;
 826        struct pci_create_int_response *int_resp =
 827                (struct pci_create_int_response *)resp;
 828
 829        comp_pkt->comp_pkt.completion_status = resp->status;
 830        comp_pkt->int_desc = int_resp->int_desc;
 831        complete(&comp_pkt->comp_pkt.host_event);
 832}
 833
 834/**
 835 * hv_compose_msi_msg() - Supplies a valid MSI address/data
 836 * @data:       Everything about this MSI
 837 * @msg:        Buffer that is filled in by this function
 838 *
 839 * This function unpacks the IRQ looking for target CPU set, IDT
 840 * vector and mode and sends a message to the parent partition
 841 * asking for a mapping for that tuple in this partition.  The
 842 * response supplies a data value and address to which that data
 843 * should be written to trigger that interrupt.
 844 */
 845static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
 846{
 847        struct irq_cfg *cfg = irqd_cfg(data);
 848        struct hv_pcibus_device *hbus;
 849        struct hv_pci_dev *hpdev;
 850        struct pci_bus *pbus;
 851        struct pci_dev *pdev;
 852        struct pci_create_interrupt *int_pkt;
 853        struct compose_comp_ctxt comp;
 854        struct tran_int_desc *int_desc;
 855        struct cpumask *affinity;
 856        struct {
 857                struct pci_packet pkt;
 858                u8 buffer[sizeof(struct pci_create_interrupt)];
 859        } ctxt;
 860        int cpu;
 861        int ret;
 862
 863        pdev = msi_desc_to_pci_dev(irq_data_get_msi_desc(data));
 864        pbus = pdev->bus;
 865        hbus = container_of(pbus->sysdata, struct hv_pcibus_device, sysdata);
 866        hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(pdev->devfn));
 867        if (!hpdev)
 868                goto return_null_message;
 869
 870        /* Free any previous message that might have already been composed. */
 871        if (data->chip_data) {
 872                int_desc = data->chip_data;
 873                data->chip_data = NULL;
 874                hv_int_desc_free(hpdev, int_desc);
 875        }
 876
 877        int_desc = kzalloc(sizeof(*int_desc), GFP_KERNEL);
 878        if (!int_desc)
 879                goto drop_reference;
 880
 881        memset(&ctxt, 0, sizeof(ctxt));
 882        init_completion(&comp.comp_pkt.host_event);
 883        ctxt.pkt.completion_func = hv_pci_compose_compl;
 884        ctxt.pkt.compl_ctxt = &comp;
 885        int_pkt = (struct pci_create_interrupt *)&ctxt.pkt.message;
 886        int_pkt->message_type.type = PCI_CREATE_INTERRUPT_MESSAGE;
 887        int_pkt->wslot.slot = hpdev->desc.win_slot.slot;
 888        int_pkt->int_desc.vector = cfg->vector;
 889        int_pkt->int_desc.vector_count = 1;
 890        int_pkt->int_desc.delivery_mode =
 891                (apic->irq_delivery_mode == dest_LowestPrio) ? 1 : 0;
 892
 893        /*
 894         * This bit doesn't have to work on machines with more than 64
 895         * processors because Hyper-V only supports 64 in a guest.
 896         */
 897        affinity = irq_data_get_affinity_mask(data);
 898        for_each_cpu_and(cpu, affinity, cpu_online_mask) {
 899                int_pkt->int_desc.cpu_mask |=
 900                        (1ULL << vmbus_cpu_number_to_vp_number(cpu));
 901        }
 902
 903        ret = vmbus_sendpacket(hpdev->hbus->hdev->channel, int_pkt,
 904                               sizeof(*int_pkt), (unsigned long)&ctxt.pkt,
 905                               VM_PKT_DATA_INBAND,
 906                               VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
 907        if (ret)
 908                goto free_int_desc;
 909
 910        wait_for_completion(&comp.comp_pkt.host_event);
 911
 912        if (comp.comp_pkt.completion_status < 0) {
 913                dev_err(&hbus->hdev->device,
 914                        "Request for interrupt failed: 0x%x",
 915                        comp.comp_pkt.completion_status);
 916                goto free_int_desc;
 917        }
 918
 919        /*
 920         * Record the assignment so that this can be unwound later. Using
 921         * irq_set_chip_data() here would be appropriate, but the lock it takes
 922         * is already held.
 923         */
 924        *int_desc = comp.int_desc;
 925        data->chip_data = int_desc;
 926
 927        /* Pass up the result. */
 928        msg->address_hi = comp.int_desc.address >> 32;
 929        msg->address_lo = comp.int_desc.address & 0xffffffff;
 930        msg->data = comp.int_desc.data;
 931
 932        put_pcichild(hpdev, hv_pcidev_ref_by_slot);
 933        return;
 934
 935free_int_desc:
 936        kfree(int_desc);
 937drop_reference:
 938        put_pcichild(hpdev, hv_pcidev_ref_by_slot);
 939return_null_message:
 940        msg->address_hi = 0;
 941        msg->address_lo = 0;
 942        msg->data = 0;
 943}
 944
 945/* HW Interrupt Chip Descriptor */
 946static struct irq_chip hv_msi_irq_chip = {
 947        .name                   = "Hyper-V PCIe MSI",
 948        .irq_compose_msi_msg    = hv_compose_msi_msg,
 949        .irq_set_affinity       = hv_set_affinity,
 950        .irq_ack                = irq_chip_ack_parent,
 951        .irq_mask               = hv_irq_mask,
 952        .irq_unmask             = hv_irq_unmask,
 953};
 954
 955static irq_hw_number_t hv_msi_domain_ops_get_hwirq(struct msi_domain_info *info,
 956                                                   msi_alloc_info_t *arg)
 957{
 958        return arg->msi_hwirq;
 959}
 960
 961static struct msi_domain_ops hv_msi_ops = {
 962        .get_hwirq      = hv_msi_domain_ops_get_hwirq,
 963        .msi_prepare    = pci_msi_prepare,
 964        .set_desc       = pci_msi_set_desc,
 965        .msi_free       = hv_msi_free,
 966};
 967
 968/**
 969 * hv_pcie_init_irq_domain() - Initialize IRQ domain
 970 * @hbus:       The root PCI bus
 971 *
 972 * This function creates an IRQ domain which will be used for
 973 * interrupts from devices that have been passed through.  These
 974 * devices only support MSI and MSI-X, not line-based interrupts
 975 * or simulations of line-based interrupts through PCIe's
 976 * fabric-layer messages.  Because interrupts are remapped, we
 977 * can support multi-message MSI here.
 978 *
 979 * Return: '0' on success and error value on failure
 980 */
 981static int hv_pcie_init_irq_domain(struct hv_pcibus_device *hbus)
 982{
 983        hbus->msi_info.chip = &hv_msi_irq_chip;
 984        hbus->msi_info.ops = &hv_msi_ops;
 985        hbus->msi_info.flags = (MSI_FLAG_USE_DEF_DOM_OPS |
 986                MSI_FLAG_USE_DEF_CHIP_OPS | MSI_FLAG_MULTI_PCI_MSI |
 987                MSI_FLAG_PCI_MSIX);
 988        hbus->msi_info.handler = handle_edge_irq;
 989        hbus->msi_info.handler_name = "edge";
 990        hbus->msi_info.data = hbus;
 991        hbus->irq_domain = pci_msi_create_irq_domain(hbus->sysdata.fwnode,
 992                                                     &hbus->msi_info,
 993                                                     x86_vector_domain);
 994        if (!hbus->irq_domain) {
 995                dev_err(&hbus->hdev->device,
 996                        "Failed to build an MSI IRQ domain\n");
 997                return -ENODEV;
 998        }
 999
1000        return 0;
1001}
1002
1003/**
1004 * get_bar_size() - Get the address space consumed by a BAR
1005 * @bar_val:    Value that a BAR returned after -1 was written
1006 *              to it.
1007 *
1008 * This function returns the size of the BAR, rounded up to 1
1009 * page.  It has to be rounded up because the hypervisor's page
1010 * table entry that maps the BAR into the VM can't specify an
1011 * offset within a page.  The invariant is that the hypervisor
1012 * must place any BARs of smaller than page length at the
1013 * beginning of a page.
1014 *
1015 * Return:      Size in bytes of the consumed MMIO space.
1016 */
1017static u64 get_bar_size(u64 bar_val)
1018{
1019        return round_up((1 + ~(bar_val & PCI_BASE_ADDRESS_MEM_MASK)),
1020                        PAGE_SIZE);
1021}
1022
1023/**
1024 * survey_child_resources() - Total all MMIO requirements
1025 * @hbus:       Root PCI bus, as understood by this driver
1026 */
1027static void survey_child_resources(struct hv_pcibus_device *hbus)
1028{
1029        struct list_head *iter;
1030        struct hv_pci_dev *hpdev;
1031        resource_size_t bar_size = 0;
1032        unsigned long flags;
1033        struct completion *event;
1034        u64 bar_val;
1035        int i;
1036
1037        /* If nobody is waiting on the answer, don't compute it. */
1038        event = xchg(&hbus->survey_event, NULL);
1039        if (!event)
1040                return;
1041
1042        /* If the answer has already been computed, go with it. */
1043        if (hbus->low_mmio_space || hbus->high_mmio_space) {
1044                complete(event);
1045                return;
1046        }
1047
1048        spin_lock_irqsave(&hbus->device_list_lock, flags);
1049
1050        /*
1051         * Due to an interesting quirk of the PCI spec, all memory regions
1052         * for a child device are a power of 2 in size and aligned in memory,
1053         * so it's sufficient to just add them up without tracking alignment.
1054         */
1055        list_for_each(iter, &hbus->children) {
1056                hpdev = container_of(iter, struct hv_pci_dev, list_entry);
1057                for (i = 0; i < 6; i++) {
1058                        if (hpdev->probed_bar[i] & PCI_BASE_ADDRESS_SPACE_IO)
1059                                dev_err(&hbus->hdev->device,
1060                                        "There's an I/O BAR in this list!\n");
1061
1062                        if (hpdev->probed_bar[i] != 0) {
1063                                /*
1064                                 * A probed BAR has all the upper bits set that
1065                                 * can be changed.
1066                                 */
1067
1068                                bar_val = hpdev->probed_bar[i];
1069                                if (bar_val & PCI_BASE_ADDRESS_MEM_TYPE_64)
1070                                        bar_val |=
1071                                        ((u64)hpdev->probed_bar[++i] << 32);
1072                                else
1073                                        bar_val |= 0xffffffff00000000ULL;
1074
1075                                bar_size = get_bar_size(bar_val);
1076
1077                                if (bar_val & PCI_BASE_ADDRESS_MEM_TYPE_64)
1078                                        hbus->high_mmio_space += bar_size;
1079                                else
1080                                        hbus->low_mmio_space += bar_size;
1081                        }
1082                }
1083        }
1084
1085        spin_unlock_irqrestore(&hbus->device_list_lock, flags);
1086        complete(event);
1087}
1088
1089/**
1090 * prepopulate_bars() - Fill in BARs with defaults
1091 * @hbus:       Root PCI bus, as understood by this driver
1092 *
1093 * The core PCI driver code seems much, much happier if the BARs
1094 * for a device have values upon first scan. So fill them in.
1095 * The algorithm below works down from large sizes to small,
1096 * attempting to pack the assignments optimally. The assumption,
1097 * enforced in other parts of the code, is that the beginning of
1098 * the memory-mapped I/O space will be aligned on the largest
1099 * BAR size.
1100 */
1101static void prepopulate_bars(struct hv_pcibus_device *hbus)
1102{
1103        resource_size_t high_size = 0;
1104        resource_size_t low_size = 0;
1105        resource_size_t high_base = 0;
1106        resource_size_t low_base = 0;
1107        resource_size_t bar_size;
1108        struct hv_pci_dev *hpdev;
1109        struct list_head *iter;
1110        unsigned long flags;
1111        u64 bar_val;
1112        u32 command;
1113        bool high;
1114        int i;
1115
1116        if (hbus->low_mmio_space) {
1117                low_size = 1ULL << (63 - __builtin_clzll(hbus->low_mmio_space));
1118                low_base = hbus->low_mmio_res->start;
1119        }
1120
1121        if (hbus->high_mmio_space) {
1122                high_size = 1ULL <<
1123                        (63 - __builtin_clzll(hbus->high_mmio_space));
1124                high_base = hbus->high_mmio_res->start;
1125        }
1126
1127        spin_lock_irqsave(&hbus->device_list_lock, flags);
1128
1129        /* Pick addresses for the BARs. */
1130        do {
1131                list_for_each(iter, &hbus->children) {
1132                        hpdev = container_of(iter, struct hv_pci_dev,
1133                                             list_entry);
1134                        for (i = 0; i < 6; i++) {
1135                                bar_val = hpdev->probed_bar[i];
1136                                if (bar_val == 0)
1137                                        continue;
1138                                high = bar_val & PCI_BASE_ADDRESS_MEM_TYPE_64;
1139                                if (high) {
1140                                        bar_val |=
1141                                                ((u64)hpdev->probed_bar[i + 1]
1142                                                 << 32);
1143                                } else {
1144                                        bar_val |= 0xffffffffULL << 32;
1145                                }
1146                                bar_size = get_bar_size(bar_val);
1147                                if (high) {
1148                                        if (high_size != bar_size) {
1149                                                i++;
1150                                                continue;
1151                                        }
1152                                        _hv_pcifront_write_config(hpdev,
1153                                                PCI_BASE_ADDRESS_0 + (4 * i),
1154                                                4,
1155                                                (u32)(high_base & 0xffffff00));
1156                                        i++;
1157                                        _hv_pcifront_write_config(hpdev,
1158                                                PCI_BASE_ADDRESS_0 + (4 * i),
1159                                                4, (u32)(high_base >> 32));
1160                                        high_base += bar_size;
1161                                } else {
1162                                        if (low_size != bar_size)
1163                                                continue;
1164                                        _hv_pcifront_write_config(hpdev,
1165                                                PCI_BASE_ADDRESS_0 + (4 * i),
1166                                                4,
1167                                                (u32)(low_base & 0xffffff00));
1168                                        low_base += bar_size;
1169                                }
1170                        }
1171                        if (high_size <= 1 && low_size <= 1) {
1172                                /* Set the memory enable bit. */
1173                                _hv_pcifront_read_config(hpdev, PCI_COMMAND, 2,
1174                                                         &command);
1175                                command |= PCI_COMMAND_MEMORY;
1176                                _hv_pcifront_write_config(hpdev, PCI_COMMAND, 2,
1177                                                          command);
1178                                break;
1179                        }
1180                }
1181
1182                high_size >>= 1;
1183                low_size >>= 1;
1184        }  while (high_size || low_size);
1185
1186        spin_unlock_irqrestore(&hbus->device_list_lock, flags);
1187}
1188
1189/**
1190 * create_root_hv_pci_bus() - Expose a new root PCI bus
1191 * @hbus:       Root PCI bus, as understood by this driver
1192 *
1193 * Return: 0 on success, -errno on failure
1194 */
1195static int create_root_hv_pci_bus(struct hv_pcibus_device *hbus)
1196{
1197        /* Register the device */
1198        hbus->pci_bus = pci_create_root_bus(&hbus->hdev->device,
1199                                            0, /* bus number is always zero */
1200                                            &hv_pcifront_ops,
1201                                            &hbus->sysdata,
1202                                            &hbus->resources_for_children);
1203        if (!hbus->pci_bus)
1204                return -ENODEV;
1205
1206        hbus->pci_bus->msi = &hbus->msi_chip;
1207        hbus->pci_bus->msi->dev = &hbus->hdev->device;
1208
1209        pci_scan_child_bus(hbus->pci_bus);
1210        pci_bus_assign_resources(hbus->pci_bus);
1211        pci_bus_add_devices(hbus->pci_bus);
1212        hbus->state = hv_pcibus_installed;
1213        return 0;
1214}
1215
1216struct q_res_req_compl {
1217        struct completion host_event;
1218        struct hv_pci_dev *hpdev;
1219};
1220
1221/**
1222 * q_resource_requirements() - Query Resource Requirements
1223 * @context:            The completion context.
1224 * @resp:               The response that came from the host.
1225 * @resp_packet_size:   The size in bytes of resp.
1226 *
1227 * This function is invoked on completion of a Query Resource
1228 * Requirements packet.
1229 */
1230static void q_resource_requirements(void *context, struct pci_response *resp,
1231                                    int resp_packet_size)
1232{
1233        struct q_res_req_compl *completion = context;
1234        struct pci_q_res_req_response *q_res_req =
1235                (struct pci_q_res_req_response *)resp;
1236        int i;
1237
1238        if (resp->status < 0) {
1239                dev_err(&completion->hpdev->hbus->hdev->device,
1240                        "query resource requirements failed: %x\n",
1241                        resp->status);
1242        } else {
1243                for (i = 0; i < 6; i++) {
1244                        completion->hpdev->probed_bar[i] =
1245                                q_res_req->probed_bar[i];
1246                }
1247        }
1248
1249        complete(&completion->host_event);
1250}
1251
1252static void get_pcichild(struct hv_pci_dev *hpdev,
1253                            enum hv_pcidev_ref_reason reason)
1254{
1255        atomic_inc(&hpdev->refs);
1256}
1257
1258static void put_pcichild(struct hv_pci_dev *hpdev,
1259                            enum hv_pcidev_ref_reason reason)
1260{
1261        if (atomic_dec_and_test(&hpdev->refs))
1262                kfree(hpdev);
1263}
1264
1265/**
1266 * new_pcichild_device() - Create a new child device
1267 * @hbus:       The internal struct tracking this root PCI bus.
1268 * @desc:       The information supplied so far from the host
1269 *              about the device.
1270 *
1271 * This function creates the tracking structure for a new child
1272 * device and kicks off the process of figuring out what it is.
1273 *
1274 * Return: Pointer to the new tracking struct
1275 */
1276static struct hv_pci_dev *new_pcichild_device(struct hv_pcibus_device *hbus,
1277                struct pci_function_description *desc)
1278{
1279        struct hv_pci_dev *hpdev;
1280        struct pci_child_message *res_req;
1281        struct q_res_req_compl comp_pkt;
1282        struct {
1283                struct pci_packet init_packet;
1284                u8 buffer[sizeof(struct pci_child_message)];
1285        } pkt;
1286        unsigned long flags;
1287        int ret;
1288
1289        hpdev = kzalloc(sizeof(*hpdev), GFP_ATOMIC);
1290        if (!hpdev)
1291                return NULL;
1292
1293        hpdev->hbus = hbus;
1294
1295        memset(&pkt, 0, sizeof(pkt));
1296        init_completion(&comp_pkt.host_event);
1297        comp_pkt.hpdev = hpdev;
1298        pkt.init_packet.compl_ctxt = &comp_pkt;
1299        pkt.init_packet.completion_func = q_resource_requirements;
1300        res_req = (struct pci_child_message *)&pkt.init_packet.message;
1301        res_req->message_type.type = PCI_QUERY_RESOURCE_REQUIREMENTS;
1302        res_req->wslot.slot = desc->win_slot.slot;
1303
1304        ret = vmbus_sendpacket(hbus->hdev->channel, res_req,
1305                               sizeof(struct pci_child_message),
1306                               (unsigned long)&pkt.init_packet,
1307                               VM_PKT_DATA_INBAND,
1308                               VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
1309        if (ret)
1310                goto error;
1311
1312        wait_for_completion(&comp_pkt.host_event);
1313
1314        hpdev->desc = *desc;
1315        get_pcichild(hpdev, hv_pcidev_ref_initial);
1316        get_pcichild(hpdev, hv_pcidev_ref_childlist);
1317        spin_lock_irqsave(&hbus->device_list_lock, flags);
1318        list_add_tail(&hpdev->list_entry, &hbus->children);
1319        spin_unlock_irqrestore(&hbus->device_list_lock, flags);
1320        return hpdev;
1321
1322error:
1323        kfree(hpdev);
1324        return NULL;
1325}
1326
1327/**
1328 * get_pcichild_wslot() - Find device from slot
1329 * @hbus:       Root PCI bus, as understood by this driver
1330 * @wslot:      Location on the bus
1331 *
1332 * This function looks up a PCI device and returns the internal
1333 * representation of it.  It acquires a reference on it, so that
1334 * the device won't be deleted while somebody is using it.  The
1335 * caller is responsible for calling put_pcichild() to release
1336 * this reference.
1337 *
1338 * Return:      Internal representation of a PCI device
1339 */
1340static struct hv_pci_dev *get_pcichild_wslot(struct hv_pcibus_device *hbus,
1341                                             u32 wslot)
1342{
1343        unsigned long flags;
1344        struct hv_pci_dev *iter, *hpdev = NULL;
1345
1346        spin_lock_irqsave(&hbus->device_list_lock, flags);
1347        list_for_each_entry(iter, &hbus->children, list_entry) {
1348                if (iter->desc.win_slot.slot == wslot) {
1349                        hpdev = iter;
1350                        get_pcichild(hpdev, hv_pcidev_ref_by_slot);
1351                        break;
1352                }
1353        }
1354        spin_unlock_irqrestore(&hbus->device_list_lock, flags);
1355
1356        return hpdev;
1357}
1358
1359/**
1360 * pci_devices_present_work() - Handle new list of child devices
1361 * @work:       Work struct embedded in struct hv_dr_work
1362 *
1363 * "Bus Relations" is the Windows term for "children of this
1364 * bus."  The terminology is preserved here for people trying to
1365 * debug the interaction between Hyper-V and Linux.  This
1366 * function is called when the parent partition reports a list
1367 * of functions that should be observed under this PCI Express
1368 * port (bus).
1369 *
1370 * This function updates the list, and must tolerate being
1371 * called multiple times with the same information.  The typical
1372 * number of child devices is one, with very atypical cases
1373 * involving three or four, so the algorithms used here can be
1374 * simple and inefficient.
1375 *
1376 * It must also treat the omission of a previously observed device as
1377 * notification that the device no longer exists.
1378 *
1379 * Note that this function is a work item, and it may not be
1380 * invoked in the order that it was queued.  Back to back
1381 * updates of the list of present devices may involve queuing
1382 * multiple work items, and this one may run before ones that
1383 * were sent later. As such, this function only does something
1384 * if is the last one in the queue.
1385 */
1386static void pci_devices_present_work(struct work_struct *work)
1387{
1388        u32 child_no;
1389        bool found;
1390        struct list_head *iter;
1391        struct pci_function_description *new_desc;
1392        struct hv_pci_dev *hpdev;
1393        struct hv_pcibus_device *hbus;
1394        struct list_head removed;
1395        struct hv_dr_work *dr_wrk;
1396        struct hv_dr_state *dr = NULL;
1397        unsigned long flags;
1398
1399        dr_wrk = container_of(work, struct hv_dr_work, wrk);
1400        hbus = dr_wrk->bus;
1401        kfree(dr_wrk);
1402
1403        INIT_LIST_HEAD(&removed);
1404
1405        if (down_interruptible(&hbus->enum_sem)) {
1406                put_hvpcibus(hbus);
1407                return;
1408        }
1409
1410        /* Pull this off the queue and process it if it was the last one. */
1411        spin_lock_irqsave(&hbus->device_list_lock, flags);
1412        while (!list_empty(&hbus->dr_list)) {
1413                dr = list_first_entry(&hbus->dr_list, struct hv_dr_state,
1414                                      list_entry);
1415                list_del(&dr->list_entry);
1416
1417                /* Throw this away if the list still has stuff in it. */
1418                if (!list_empty(&hbus->dr_list)) {
1419                        kfree(dr);
1420                        continue;
1421                }
1422        }
1423        spin_unlock_irqrestore(&hbus->device_list_lock, flags);
1424
1425        if (!dr) {
1426                up(&hbus->enum_sem);
1427                put_hvpcibus(hbus);
1428                return;
1429        }
1430
1431        /* First, mark all existing children as reported missing. */
1432        spin_lock_irqsave(&hbus->device_list_lock, flags);
1433        list_for_each(iter, &hbus->children) {
1434                        hpdev = container_of(iter, struct hv_pci_dev,
1435                                             list_entry);
1436                        hpdev->reported_missing = true;
1437        }
1438        spin_unlock_irqrestore(&hbus->device_list_lock, flags);
1439
1440        /* Next, add back any reported devices. */
1441        for (child_no = 0; child_no < dr->device_count; child_no++) {
1442                found = false;
1443                new_desc = &dr->func[child_no];
1444
1445                spin_lock_irqsave(&hbus->device_list_lock, flags);
1446                list_for_each(iter, &hbus->children) {
1447                        hpdev = container_of(iter, struct hv_pci_dev,
1448                                             list_entry);
1449                        if ((hpdev->desc.win_slot.slot ==
1450                             new_desc->win_slot.slot) &&
1451                            (hpdev->desc.v_id == new_desc->v_id) &&
1452                            (hpdev->desc.d_id == new_desc->d_id) &&
1453                            (hpdev->desc.ser == new_desc->ser)) {
1454                                hpdev->reported_missing = false;
1455                                found = true;
1456                        }
1457                }
1458                spin_unlock_irqrestore(&hbus->device_list_lock, flags);
1459
1460                if (!found) {
1461                        hpdev = new_pcichild_device(hbus, new_desc);
1462                        if (!hpdev)
1463                                dev_err(&hbus->hdev->device,
1464                                        "couldn't record a child device.\n");
1465                }
1466        }
1467
1468        /* Move missing children to a list on the stack. */
1469        spin_lock_irqsave(&hbus->device_list_lock, flags);
1470        do {
1471                found = false;
1472                list_for_each(iter, &hbus->children) {
1473                        hpdev = container_of(iter, struct hv_pci_dev,
1474                                             list_entry);
1475                        if (hpdev->reported_missing) {
1476                                found = true;
1477                                put_pcichild(hpdev, hv_pcidev_ref_childlist);
1478                                list_move_tail(&hpdev->list_entry, &removed);
1479                                break;
1480                        }
1481                }
1482        } while (found);
1483        spin_unlock_irqrestore(&hbus->device_list_lock, flags);
1484
1485        /* Delete everything that should no longer exist. */
1486        while (!list_empty(&removed)) {
1487                hpdev = list_first_entry(&removed, struct hv_pci_dev,
1488                                         list_entry);
1489                list_del(&hpdev->list_entry);
1490                put_pcichild(hpdev, hv_pcidev_ref_initial);
1491        }
1492
1493        /* Tell the core to rescan bus because there may have been changes. */
1494        if (hbus->state == hv_pcibus_installed) {
1495                pci_lock_rescan_remove();
1496                pci_scan_child_bus(hbus->pci_bus);
1497                pci_unlock_rescan_remove();
1498        } else {
1499                survey_child_resources(hbus);
1500        }
1501
1502        up(&hbus->enum_sem);
1503        put_hvpcibus(hbus);
1504        kfree(dr);
1505}
1506
1507/**
1508 * hv_pci_devices_present() - Handles list of new children
1509 * @hbus:       Root PCI bus, as understood by this driver
1510 * @relations:  Packet from host listing children
1511 *
1512 * This function is invoked whenever a new list of devices for
1513 * this bus appears.
1514 */
1515static void hv_pci_devices_present(struct hv_pcibus_device *hbus,
1516                                   struct pci_bus_relations *relations)
1517{
1518        struct hv_dr_state *dr;
1519        struct hv_dr_work *dr_wrk;
1520        unsigned long flags;
1521
1522        dr_wrk = kzalloc(sizeof(*dr_wrk), GFP_NOWAIT);
1523        if (!dr_wrk)
1524                return;
1525
1526        dr = kzalloc(offsetof(struct hv_dr_state, func) +
1527                     (sizeof(struct pci_function_description) *
1528                      (relations->device_count)), GFP_NOWAIT);
1529        if (!dr)  {
1530                kfree(dr_wrk);
1531                return;
1532        }
1533
1534        INIT_WORK(&dr_wrk->wrk, pci_devices_present_work);
1535        dr_wrk->bus = hbus;
1536        dr->device_count = relations->device_count;
1537        if (dr->device_count != 0) {
1538                memcpy(dr->func, relations->func,
1539                       sizeof(struct pci_function_description) *
1540                       dr->device_count);
1541        }
1542
1543        spin_lock_irqsave(&hbus->device_list_lock, flags);
1544        list_add_tail(&dr->list_entry, &hbus->dr_list);
1545        spin_unlock_irqrestore(&hbus->device_list_lock, flags);
1546
1547        get_hvpcibus(hbus);
1548        schedule_work(&dr_wrk->wrk);
1549}
1550
1551/**
1552 * hv_eject_device_work() - Asynchronously handles ejection
1553 * @work:       Work struct embedded in internal device struct
1554 *
1555 * This function handles ejecting a device.  Windows will
1556 * attempt to gracefully eject a device, waiting 60 seconds to
1557 * hear back from the guest OS that this completed successfully.
1558 * If this timer expires, the device will be forcibly removed.
1559 */
1560static void hv_eject_device_work(struct work_struct *work)
1561{
1562        struct pci_eject_response *ejct_pkt;
1563        struct hv_pci_dev *hpdev;
1564        struct pci_dev *pdev;
1565        unsigned long flags;
1566        int wslot;
1567        struct {
1568                struct pci_packet pkt;
1569                u8 buffer[sizeof(struct pci_eject_response)];
1570        } ctxt;
1571
1572        hpdev = container_of(work, struct hv_pci_dev, wrk);
1573
1574        if (hpdev->state != hv_pcichild_ejecting) {
1575                put_pcichild(hpdev, hv_pcidev_ref_pnp);
1576                return;
1577        }
1578
1579        /*
1580         * Ejection can come before or after the PCI bus has been set up, so
1581         * attempt to find it and tear down the bus state, if it exists.  This
1582         * must be done without constructs like pci_domain_nr(hbus->pci_bus)
1583         * because hbus->pci_bus may not exist yet.
1584         */
1585        wslot = wslot_to_devfn(hpdev->desc.win_slot.slot);
1586        pdev = pci_get_domain_bus_and_slot(hpdev->hbus->sysdata.domain, 0,
1587                                           wslot);
1588        if (pdev) {
1589                pci_stop_and_remove_bus_device(pdev);
1590                pci_dev_put(pdev);
1591        }
1592
1593        spin_lock_irqsave(&hpdev->hbus->device_list_lock, flags);
1594        list_del(&hpdev->list_entry);
1595        spin_unlock_irqrestore(&hpdev->hbus->device_list_lock, flags);
1596
1597        memset(&ctxt, 0, sizeof(ctxt));
1598        ejct_pkt = (struct pci_eject_response *)&ctxt.pkt.message;
1599        ejct_pkt->message_type.type = PCI_EJECTION_COMPLETE;
1600        ejct_pkt->wslot.slot = hpdev->desc.win_slot.slot;
1601        vmbus_sendpacket(hpdev->hbus->hdev->channel, ejct_pkt,
1602                         sizeof(*ejct_pkt), (unsigned long)&ctxt.pkt,
1603                         VM_PKT_DATA_INBAND, 0);
1604
1605        put_pcichild(hpdev, hv_pcidev_ref_childlist);
1606        put_pcichild(hpdev, hv_pcidev_ref_pnp);
1607        put_hvpcibus(hpdev->hbus);
1608}
1609
1610/**
1611 * hv_pci_eject_device() - Handles device ejection
1612 * @hpdev:      Internal device tracking struct
1613 *
1614 * This function is invoked when an ejection packet arrives.  It
1615 * just schedules work so that we don't re-enter the packet
1616 * delivery code handling the ejection.
1617 */
1618static void hv_pci_eject_device(struct hv_pci_dev *hpdev)
1619{
1620        hpdev->state = hv_pcichild_ejecting;
1621        get_pcichild(hpdev, hv_pcidev_ref_pnp);
1622        INIT_WORK(&hpdev->wrk, hv_eject_device_work);
1623        get_hvpcibus(hpdev->hbus);
1624        schedule_work(&hpdev->wrk);
1625}
1626
1627/**
1628 * hv_pci_onchannelcallback() - Handles incoming packets
1629 * @context:    Internal bus tracking struct
1630 *
1631 * This function is invoked whenever the host sends a packet to
1632 * this channel (which is private to this root PCI bus).
1633 */
1634static void hv_pci_onchannelcallback(void *context)
1635{
1636        const int packet_size = 0x100;
1637        int ret;
1638        struct hv_pcibus_device *hbus = context;
1639        u32 bytes_recvd;
1640        u64 req_id;
1641        struct vmpacket_descriptor *desc;
1642        unsigned char *buffer;
1643        int bufferlen = packet_size;
1644        struct pci_packet *comp_packet;
1645        struct pci_response *response;
1646        struct pci_incoming_message *new_message;
1647        struct pci_bus_relations *bus_rel;
1648        struct pci_dev_incoming *dev_message;
1649        struct hv_pci_dev *hpdev;
1650
1651        buffer = kmalloc(bufferlen, GFP_ATOMIC);
1652        if (!buffer)
1653                return;
1654
1655        while (1) {
1656                ret = vmbus_recvpacket_raw(hbus->hdev->channel, buffer,
1657                                           bufferlen, &bytes_recvd, &req_id);
1658
1659                if (ret == -ENOBUFS) {
1660                        kfree(buffer);
1661                        /* Handle large packet */
1662                        bufferlen = bytes_recvd;
1663                        buffer = kmalloc(bytes_recvd, GFP_ATOMIC);
1664                        if (!buffer)
1665                                return;
1666                        continue;
1667                }
1668
1669                /* Zero length indicates there are no more packets. */
1670                if (ret || !bytes_recvd)
1671                        break;
1672
1673                /*
1674                 * All incoming packets must be at least as large as a
1675                 * response.
1676                 */
1677                if (bytes_recvd <= sizeof(struct pci_response))
1678                        continue;
1679                desc = (struct vmpacket_descriptor *)buffer;
1680
1681                switch (desc->type) {
1682                case VM_PKT_COMP:
1683
1684                        /*
1685                         * The host is trusted, and thus it's safe to interpret
1686                         * this transaction ID as a pointer.
1687                         */
1688                        comp_packet = (struct pci_packet *)req_id;
1689                        response = (struct pci_response *)buffer;
1690                        comp_packet->completion_func(comp_packet->compl_ctxt,
1691                                                     response,
1692                                                     bytes_recvd);
1693                        break;
1694
1695                case VM_PKT_DATA_INBAND:
1696
1697                        new_message = (struct pci_incoming_message *)buffer;
1698                        switch (new_message->message_type.type) {
1699                        case PCI_BUS_RELATIONS:
1700
1701                                bus_rel = (struct pci_bus_relations *)buffer;
1702                                if (bytes_recvd <
1703                                    offsetof(struct pci_bus_relations, func) +
1704                                    (sizeof(struct pci_function_description) *
1705                                     (bus_rel->device_count))) {
1706                                        dev_err(&hbus->hdev->device,
1707                                                "bus relations too small\n");
1708                                        break;
1709                                }
1710
1711                                hv_pci_devices_present(hbus, bus_rel);
1712                                break;
1713
1714                        case PCI_EJECT:
1715
1716                                dev_message = (struct pci_dev_incoming *)buffer;
1717                                hpdev = get_pcichild_wslot(hbus,
1718                                                      dev_message->wslot.slot);
1719                                if (hpdev) {
1720                                        hv_pci_eject_device(hpdev);
1721                                        put_pcichild(hpdev,
1722                                                        hv_pcidev_ref_by_slot);
1723                                }
1724                                break;
1725
1726                        default:
1727                                dev_warn(&hbus->hdev->device,
1728                                        "Unimplemented protocol message %x\n",
1729                                        new_message->message_type.type);
1730                                break;
1731                        }
1732                        break;
1733
1734                default:
1735                        dev_err(&hbus->hdev->device,
1736                                "unhandled packet type %d, tid %llx len %d\n",
1737                                desc->type, req_id, bytes_recvd);
1738                        break;
1739                }
1740        }
1741
1742        kfree(buffer);
1743}
1744
1745/**
1746 * hv_pci_protocol_negotiation() - Set up protocol
1747 * @hdev:       VMBus's tracking struct for this root PCI bus
1748 *
1749 * This driver is intended to support running on Windows 10
1750 * (server) and later versions. It will not run on earlier
1751 * versions, as they assume that many of the operations which
1752 * Linux needs accomplished with a spinlock held were done via
1753 * asynchronous messaging via VMBus.  Windows 10 increases the
1754 * surface area of PCI emulation so that these actions can take
1755 * place by suspending a virtual processor for their duration.
1756 *
1757 * This function negotiates the channel protocol version,
1758 * failing if the host doesn't support the necessary protocol
1759 * level.
1760 */
1761static int hv_pci_protocol_negotiation(struct hv_device *hdev)
1762{
1763        struct pci_version_request *version_req;
1764        struct hv_pci_compl comp_pkt;
1765        struct pci_packet *pkt;
1766        int ret;
1767
1768        /*
1769         * Initiate the handshake with the host and negotiate
1770         * a version that the host can support. We start with the
1771         * highest version number and go down if the host cannot
1772         * support it.
1773         */
1774        pkt = kzalloc(sizeof(*pkt) + sizeof(*version_req), GFP_KERNEL);
1775        if (!pkt)
1776                return -ENOMEM;
1777
1778        init_completion(&comp_pkt.host_event);
1779        pkt->completion_func = hv_pci_generic_compl;
1780        pkt->compl_ctxt = &comp_pkt;
1781        version_req = (struct pci_version_request *)&pkt->message;
1782        version_req->message_type.type = PCI_QUERY_PROTOCOL_VERSION;
1783        version_req->protocol_version = PCI_PROTOCOL_VERSION_CURRENT;
1784
1785        ret = vmbus_sendpacket(hdev->channel, version_req,
1786                               sizeof(struct pci_version_request),
1787                               (unsigned long)pkt, VM_PKT_DATA_INBAND,
1788                               VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
1789        if (ret)
1790                goto exit;
1791
1792        wait_for_completion(&comp_pkt.host_event);
1793
1794        if (comp_pkt.completion_status < 0) {
1795                dev_err(&hdev->device,
1796                        "PCI Pass-through VSP failed version request %x\n",
1797                        comp_pkt.completion_status);
1798                ret = -EPROTO;
1799                goto exit;
1800        }
1801
1802        ret = 0;
1803
1804exit:
1805        kfree(pkt);
1806        return ret;
1807}
1808
1809/**
1810 * hv_pci_free_bridge_windows() - Release memory regions for the
1811 * bus
1812 * @hbus:       Root PCI bus, as understood by this driver
1813 */
1814static void hv_pci_free_bridge_windows(struct hv_pcibus_device *hbus)
1815{
1816        /*
1817         * Set the resources back to the way they looked when they
1818         * were allocated by setting IORESOURCE_BUSY again.
1819         */
1820
1821        if (hbus->low_mmio_space && hbus->low_mmio_res) {
1822                hbus->low_mmio_res->flags |= IORESOURCE_BUSY;
1823                vmbus_free_mmio(hbus->low_mmio_res->start,
1824                                resource_size(hbus->low_mmio_res));
1825        }
1826
1827        if (hbus->high_mmio_space && hbus->high_mmio_res) {
1828                hbus->high_mmio_res->flags |= IORESOURCE_BUSY;
1829                vmbus_free_mmio(hbus->high_mmio_res->start,
1830                                resource_size(hbus->high_mmio_res));
1831        }
1832}
1833
1834/**
1835 * hv_pci_allocate_bridge_windows() - Allocate memory regions
1836 * for the bus
1837 * @hbus:       Root PCI bus, as understood by this driver
1838 *
1839 * This function calls vmbus_allocate_mmio(), which is itself a
1840 * bit of a compromise.  Ideally, we might change the pnp layer
1841 * in the kernel such that it comprehends either PCI devices
1842 * which are "grandchildren of ACPI," with some intermediate bus
1843 * node (in this case, VMBus) or change it such that it
1844 * understands VMBus.  The pnp layer, however, has been declared
1845 * deprecated, and not subject to change.
1846 *
1847 * The workaround, implemented here, is to ask VMBus to allocate
1848 * MMIO space for this bus.  VMBus itself knows which ranges are
1849 * appropriate by looking at its own ACPI objects.  Then, after
1850 * these ranges are claimed, they're modified to look like they
1851 * would have looked if the ACPI and pnp code had allocated
1852 * bridge windows.  These descriptors have to exist in this form
1853 * in order to satisfy the code which will get invoked when the
1854 * endpoint PCI function driver calls request_mem_region() or
1855 * request_mem_region_exclusive().
1856 *
1857 * Return: 0 on success, -errno on failure
1858 */
1859static int hv_pci_allocate_bridge_windows(struct hv_pcibus_device *hbus)
1860{
1861        resource_size_t align;
1862        int ret;
1863
1864        if (hbus->low_mmio_space) {
1865                align = 1ULL << (63 - __builtin_clzll(hbus->low_mmio_space));
1866                ret = vmbus_allocate_mmio(&hbus->low_mmio_res, hbus->hdev, 0,
1867                                          (u64)(u32)0xffffffff,
1868                                          hbus->low_mmio_space,
1869                                          align, false);
1870                if (ret) {
1871                        dev_err(&hbus->hdev->device,
1872                                "Need %#llx of low MMIO space. Consider reconfiguring the VM.\n",
1873                                hbus->low_mmio_space);
1874                        return ret;
1875                }
1876
1877                /* Modify this resource to become a bridge window. */
1878                hbus->low_mmio_res->flags |= IORESOURCE_WINDOW;
1879                hbus->low_mmio_res->flags &= ~IORESOURCE_BUSY;
1880                pci_add_resource(&hbus->resources_for_children,
1881                                 hbus->low_mmio_res);
1882        }
1883
1884        if (hbus->high_mmio_space) {
1885                align = 1ULL << (63 - __builtin_clzll(hbus->high_mmio_space));
1886                ret = vmbus_allocate_mmio(&hbus->high_mmio_res, hbus->hdev,
1887                                          0x100000000, -1,
1888                                          hbus->high_mmio_space, align,
1889                                          false);
1890                if (ret) {
1891                        dev_err(&hbus->hdev->device,
1892                                "Need %#llx of high MMIO space. Consider reconfiguring the VM.\n",
1893                                hbus->high_mmio_space);
1894                        goto release_low_mmio;
1895                }
1896
1897                /* Modify this resource to become a bridge window. */
1898                hbus->high_mmio_res->flags |= IORESOURCE_WINDOW;
1899                hbus->high_mmio_res->flags &= ~IORESOURCE_BUSY;
1900                pci_add_resource(&hbus->resources_for_children,
1901                                 hbus->high_mmio_res);
1902        }
1903
1904        return 0;
1905
1906release_low_mmio:
1907        if (hbus->low_mmio_res) {
1908                vmbus_free_mmio(hbus->low_mmio_res->start,
1909                                resource_size(hbus->low_mmio_res));
1910        }
1911
1912        return ret;
1913}
1914
1915/**
1916 * hv_allocate_config_window() - Find MMIO space for PCI Config
1917 * @hbus:       Root PCI bus, as understood by this driver
1918 *
1919 * This function claims memory-mapped I/O space for accessing
1920 * configuration space for the functions on this bus.
1921 *
1922 * Return: 0 on success, -errno on failure
1923 */
1924static int hv_allocate_config_window(struct hv_pcibus_device *hbus)
1925{
1926        int ret;
1927
1928        /*
1929         * Set up a region of MMIO space to use for accessing configuration
1930         * space.
1931         */
1932        ret = vmbus_allocate_mmio(&hbus->mem_config, hbus->hdev, 0, -1,
1933                                  PCI_CONFIG_MMIO_LENGTH, 0x1000, false);
1934        if (ret)
1935                return ret;
1936
1937        /*
1938         * vmbus_allocate_mmio() gets used for allocating both device endpoint
1939         * resource claims (those which cannot be overlapped) and the ranges
1940         * which are valid for the children of this bus, which are intended
1941         * to be overlapped by those children.  Set the flag on this claim
1942         * meaning that this region can't be overlapped.
1943         */
1944
1945        hbus->mem_config->flags |= IORESOURCE_BUSY;
1946
1947        return 0;
1948}
1949
1950static void hv_free_config_window(struct hv_pcibus_device *hbus)
1951{
1952        vmbus_free_mmio(hbus->mem_config->start, PCI_CONFIG_MMIO_LENGTH);
1953}
1954
1955/**
1956 * hv_pci_enter_d0() - Bring the "bus" into the D0 power state
1957 * @hdev:       VMBus's tracking struct for this root PCI bus
1958 *
1959 * Return: 0 on success, -errno on failure
1960 */
1961static int hv_pci_enter_d0(struct hv_device *hdev)
1962{
1963        struct hv_pcibus_device *hbus = hv_get_drvdata(hdev);
1964        struct pci_bus_d0_entry *d0_entry;
1965        struct hv_pci_compl comp_pkt;
1966        struct pci_packet *pkt;
1967        int ret;
1968
1969        /*
1970         * Tell the host that the bus is ready to use, and moved into the
1971         * powered-on state.  This includes telling the host which region
1972         * of memory-mapped I/O space has been chosen for configuration space
1973         * access.
1974         */
1975        pkt = kzalloc(sizeof(*pkt) + sizeof(*d0_entry), GFP_KERNEL);
1976        if (!pkt)
1977                return -ENOMEM;
1978
1979        init_completion(&comp_pkt.host_event);
1980        pkt->completion_func = hv_pci_generic_compl;
1981        pkt->compl_ctxt = &comp_pkt;
1982        d0_entry = (struct pci_bus_d0_entry *)&pkt->message;
1983        d0_entry->message_type.type = PCI_BUS_D0ENTRY;
1984        d0_entry->mmio_base = hbus->mem_config->start;
1985
1986        ret = vmbus_sendpacket(hdev->channel, d0_entry, sizeof(*d0_entry),
1987                               (unsigned long)pkt, VM_PKT_DATA_INBAND,
1988                               VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
1989        if (ret)
1990                goto exit;
1991
1992        wait_for_completion(&comp_pkt.host_event);
1993
1994        if (comp_pkt.completion_status < 0) {
1995                dev_err(&hdev->device,
1996                        "PCI Pass-through VSP failed D0 Entry with status %x\n",
1997                        comp_pkt.completion_status);
1998                ret = -EPROTO;
1999                goto exit;
2000        }
2001
2002        ret = 0;
2003
2004exit:
2005        kfree(pkt);
2006        return ret;
2007}
2008
2009/**
2010 * hv_pci_query_relations() - Ask host to send list of child
2011 * devices
2012 * @hdev:       VMBus's tracking struct for this root PCI bus
2013 *
2014 * Return: 0 on success, -errno on failure
2015 */
2016static int hv_pci_query_relations(struct hv_device *hdev)
2017{
2018        struct hv_pcibus_device *hbus = hv_get_drvdata(hdev);
2019        struct pci_message message;
2020        struct completion comp;
2021        int ret;
2022
2023        /* Ask the host to send along the list of child devices */
2024        init_completion(&comp);
2025        if (cmpxchg(&hbus->survey_event, NULL, &comp))
2026                return -ENOTEMPTY;
2027
2028        memset(&message, 0, sizeof(message));
2029        message.type = PCI_QUERY_BUS_RELATIONS;
2030
2031        ret = vmbus_sendpacket(hdev->channel, &message, sizeof(message),
2032                               0, VM_PKT_DATA_INBAND, 0);
2033        if (ret)
2034                return ret;
2035
2036        wait_for_completion(&comp);
2037        return 0;
2038}
2039
2040/**
2041 * hv_send_resources_allocated() - Report local resource choices
2042 * @hdev:       VMBus's tracking struct for this root PCI bus
2043 *
2044 * The host OS is expecting to be sent a request as a message
2045 * which contains all the resources that the device will use.
2046 * The response contains those same resources, "translated"
2047 * which is to say, the values which should be used by the
2048 * hardware, when it delivers an interrupt.  (MMIO resources are
2049 * used in local terms.)  This is nice for Windows, and lines up
2050 * with the FDO/PDO split, which doesn't exist in Linux.  Linux
2051 * is deeply expecting to scan an emulated PCI configuration
2052 * space.  So this message is sent here only to drive the state
2053 * machine on the host forward.
2054 *
2055 * Return: 0 on success, -errno on failure
2056 */
2057static int hv_send_resources_allocated(struct hv_device *hdev)
2058{
2059        struct hv_pcibus_device *hbus = hv_get_drvdata(hdev);
2060        struct pci_resources_assigned *res_assigned;
2061        struct hv_pci_compl comp_pkt;
2062        struct hv_pci_dev *hpdev;
2063        struct pci_packet *pkt;
2064        u32 wslot;
2065        int ret;
2066
2067        pkt = kmalloc(sizeof(*pkt) + sizeof(*res_assigned), GFP_KERNEL);
2068        if (!pkt)
2069                return -ENOMEM;
2070
2071        ret = 0;
2072
2073        for (wslot = 0; wslot < 256; wslot++) {
2074                hpdev = get_pcichild_wslot(hbus, wslot);
2075                if (!hpdev)
2076                        continue;
2077
2078                memset(pkt, 0, sizeof(*pkt) + sizeof(*res_assigned));
2079                init_completion(&comp_pkt.host_event);
2080                pkt->completion_func = hv_pci_generic_compl;
2081                pkt->compl_ctxt = &comp_pkt;
2082                res_assigned = (struct pci_resources_assigned *)&pkt->message;
2083                res_assigned->message_type.type = PCI_RESOURCES_ASSIGNED;
2084                res_assigned->wslot.slot = hpdev->desc.win_slot.slot;
2085
2086                put_pcichild(hpdev, hv_pcidev_ref_by_slot);
2087
2088                ret = vmbus_sendpacket(
2089                        hdev->channel, &pkt->message,
2090                        sizeof(*res_assigned),
2091                        (unsigned long)pkt,
2092                        VM_PKT_DATA_INBAND,
2093                        VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
2094                if (ret)
2095                        break;
2096
2097                wait_for_completion(&comp_pkt.host_event);
2098
2099                if (comp_pkt.completion_status < 0) {
2100                        ret = -EPROTO;
2101                        dev_err(&hdev->device,
2102                                "resource allocated returned 0x%x",
2103                                comp_pkt.completion_status);
2104                        break;
2105                }
2106        }
2107
2108        kfree(pkt);
2109        return ret;
2110}
2111
2112/**
2113 * hv_send_resources_released() - Report local resources
2114 * released
2115 * @hdev:       VMBus's tracking struct for this root PCI bus
2116 *
2117 * Return: 0 on success, -errno on failure
2118 */
2119static int hv_send_resources_released(struct hv_device *hdev)
2120{
2121        struct hv_pcibus_device *hbus = hv_get_drvdata(hdev);
2122        struct pci_child_message pkt;
2123        struct hv_pci_dev *hpdev;
2124        u32 wslot;
2125        int ret;
2126
2127        for (wslot = 0; wslot < 256; wslot++) {
2128                hpdev = get_pcichild_wslot(hbus, wslot);
2129                if (!hpdev)
2130                        continue;
2131
2132                memset(&pkt, 0, sizeof(pkt));
2133                pkt.message_type.type = PCI_RESOURCES_RELEASED;
2134                pkt.wslot.slot = hpdev->desc.win_slot.slot;
2135
2136                put_pcichild(hpdev, hv_pcidev_ref_by_slot);
2137
2138                ret = vmbus_sendpacket(hdev->channel, &pkt, sizeof(pkt), 0,
2139                                       VM_PKT_DATA_INBAND, 0);
2140                if (ret)
2141                        return ret;
2142        }
2143
2144        return 0;
2145}
2146
2147static void get_hvpcibus(struct hv_pcibus_device *hbus)
2148{
2149        atomic_inc(&hbus->remove_lock);
2150}
2151
2152static void put_hvpcibus(struct hv_pcibus_device *hbus)
2153{
2154        if (atomic_dec_and_test(&hbus->remove_lock))
2155                complete(&hbus->remove_event);
2156}
2157
2158/**
2159 * hv_pci_probe() - New VMBus channel probe, for a root PCI bus
2160 * @hdev:       VMBus's tracking struct for this root PCI bus
2161 * @dev_id:     Identifies the device itself
2162 *
2163 * Return: 0 on success, -errno on failure
2164 */
2165static int hv_pci_probe(struct hv_device *hdev,
2166                        const struct hv_vmbus_device_id *dev_id)
2167{
2168        struct hv_pcibus_device *hbus;
2169        int ret;
2170
2171        hbus = kzalloc(sizeof(*hbus), GFP_KERNEL);
2172        if (!hbus)
2173                return -ENOMEM;
2174
2175        /*
2176         * The PCI bus "domain" is what is called "segment" in ACPI and
2177         * other specs.  Pull it from the instance ID, to get something
2178         * unique.  Bytes 8 and 9 are what is used in Windows guests, so
2179         * do the same thing for consistency.  Note that, since this code
2180         * only runs in a Hyper-V VM, Hyper-V can (and does) guarantee
2181         * that (1) the only domain in use for something that looks like
2182         * a physical PCI bus (which is actually emulated by the
2183         * hypervisor) is domain 0 and (2) there will be no overlap
2184         * between domains derived from these instance IDs in the same
2185         * VM.
2186         */
2187        hbus->sysdata.domain = hdev->dev_instance.b[9] |
2188                               hdev->dev_instance.b[8] << 8;
2189
2190        hbus->hdev = hdev;
2191        atomic_inc(&hbus->remove_lock);
2192        INIT_LIST_HEAD(&hbus->children);
2193        INIT_LIST_HEAD(&hbus->dr_list);
2194        INIT_LIST_HEAD(&hbus->resources_for_children);
2195        spin_lock_init(&hbus->config_lock);
2196        spin_lock_init(&hbus->device_list_lock);
2197        spin_lock_init(&hbus->retarget_msi_interrupt_lock);
2198        sema_init(&hbus->enum_sem, 1);
2199        init_completion(&hbus->remove_event);
2200
2201        ret = vmbus_open(hdev->channel, pci_ring_size, pci_ring_size, NULL, 0,
2202                         hv_pci_onchannelcallback, hbus);
2203        if (ret)
2204                goto free_bus;
2205
2206        hv_set_drvdata(hdev, hbus);
2207
2208        ret = hv_pci_protocol_negotiation(hdev);
2209        if (ret)
2210                goto close;
2211
2212        ret = hv_allocate_config_window(hbus);
2213        if (ret)
2214                goto close;
2215
2216        hbus->cfg_addr = ioremap(hbus->mem_config->start,
2217                                 PCI_CONFIG_MMIO_LENGTH);
2218        if (!hbus->cfg_addr) {
2219                dev_err(&hdev->device,
2220                        "Unable to map a virtual address for config space\n");
2221                ret = -ENOMEM;
2222                goto free_config;
2223        }
2224
2225        hbus->sysdata.fwnode = irq_domain_alloc_fwnode(hbus);
2226        if (!hbus->sysdata.fwnode) {
2227                ret = -ENOMEM;
2228                goto unmap;
2229        }
2230
2231        ret = hv_pcie_init_irq_domain(hbus);
2232        if (ret)
2233                goto free_fwnode;
2234
2235        ret = hv_pci_query_relations(hdev);
2236        if (ret)
2237                goto free_irq_domain;
2238
2239        ret = hv_pci_enter_d0(hdev);
2240        if (ret)
2241                goto free_irq_domain;
2242
2243        ret = hv_pci_allocate_bridge_windows(hbus);
2244        if (ret)
2245                goto free_irq_domain;
2246
2247        ret = hv_send_resources_allocated(hdev);
2248        if (ret)
2249                goto free_windows;
2250
2251        prepopulate_bars(hbus);
2252
2253        hbus->state = hv_pcibus_probed;
2254
2255        ret = create_root_hv_pci_bus(hbus);
2256        if (ret)
2257                goto free_windows;
2258
2259        return 0;
2260
2261free_windows:
2262        hv_pci_free_bridge_windows(hbus);
2263free_irq_domain:
2264        irq_domain_remove(hbus->irq_domain);
2265free_fwnode:
2266        irq_domain_free_fwnode(hbus->sysdata.fwnode);
2267unmap:
2268        iounmap(hbus->cfg_addr);
2269free_config:
2270        hv_free_config_window(hbus);
2271close:
2272        vmbus_close(hdev->channel);
2273free_bus:
2274        kfree(hbus);
2275        return ret;
2276}
2277
2278static void hv_pci_bus_exit(struct hv_device *hdev)
2279{
2280        struct hv_pcibus_device *hbus = hv_get_drvdata(hdev);
2281        struct {
2282                struct pci_packet teardown_packet;
2283                u8 buffer[sizeof(struct pci_message)];
2284        } pkt;
2285        struct pci_bus_relations relations;
2286        struct hv_pci_compl comp_pkt;
2287        int ret;
2288
2289        /*
2290         * After the host sends the RESCIND_CHANNEL message, it doesn't
2291         * access the per-channel ringbuffer any longer.
2292         */
2293        if (hdev->channel->rescind)
2294                return;
2295
2296        /* Delete any children which might still exist. */
2297        memset(&relations, 0, sizeof(relations));
2298        hv_pci_devices_present(hbus, &relations);
2299
2300        ret = hv_send_resources_released(hdev);
2301        if (ret)
2302                dev_err(&hdev->device,
2303                        "Couldn't send resources released packet(s)\n");
2304
2305        memset(&pkt.teardown_packet, 0, sizeof(pkt.teardown_packet));
2306        init_completion(&comp_pkt.host_event);
2307        pkt.teardown_packet.completion_func = hv_pci_generic_compl;
2308        pkt.teardown_packet.compl_ctxt = &comp_pkt;
2309        pkt.teardown_packet.message[0].type = PCI_BUS_D0EXIT;
2310
2311        ret = vmbus_sendpacket(hdev->channel, &pkt.teardown_packet.message,
2312                               sizeof(struct pci_message),
2313                               (unsigned long)&pkt.teardown_packet,
2314                               VM_PKT_DATA_INBAND,
2315                               VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
2316        if (!ret)
2317                wait_for_completion_timeout(&comp_pkt.host_event, 10 * HZ);
2318}
2319
2320/**
2321 * hv_pci_remove() - Remove routine for this VMBus channel
2322 * @hdev:       VMBus's tracking struct for this root PCI bus
2323 *
2324 * Return: 0 on success, -errno on failure
2325 */
2326static int hv_pci_remove(struct hv_device *hdev)
2327{
2328        struct hv_pcibus_device *hbus;
2329
2330        hbus = hv_get_drvdata(hdev);
2331        if (hbus->state == hv_pcibus_installed) {
2332                /* Remove the bus from PCI's point of view. */
2333                pci_lock_rescan_remove();
2334                pci_stop_root_bus(hbus->pci_bus);
2335                pci_remove_root_bus(hbus->pci_bus);
2336                pci_unlock_rescan_remove();
2337        }
2338
2339        hv_pci_bus_exit(hdev);
2340
2341        vmbus_close(hdev->channel);
2342
2343        iounmap(hbus->cfg_addr);
2344        hv_free_config_window(hbus);
2345        pci_free_resource_list(&hbus->resources_for_children);
2346        hv_pci_free_bridge_windows(hbus);
2347        irq_domain_remove(hbus->irq_domain);
2348        irq_domain_free_fwnode(hbus->sysdata.fwnode);
2349        put_hvpcibus(hbus);
2350        wait_for_completion(&hbus->remove_event);
2351        kfree(hbus);
2352        return 0;
2353}
2354
2355static const struct hv_vmbus_device_id hv_pci_id_table[] = {
2356        /* PCI Pass-through Class ID */
2357        /* 44C4F61D-4444-4400-9D52-802E27EDE19F */
2358        { HV_PCIE_GUID, },
2359        { },
2360};
2361
2362MODULE_DEVICE_TABLE(vmbus, hv_pci_id_table);
2363
2364static struct hv_driver hv_pci_drv = {
2365        .name           = "hv_pci",
2366        .id_table       = hv_pci_id_table,
2367        .probe          = hv_pci_probe,
2368        .remove         = hv_pci_remove,
2369};
2370
2371static void __exit exit_hv_pci_drv(void)
2372{
2373        vmbus_driver_unregister(&hv_pci_drv);
2374}
2375
2376static int __init init_hv_pci_drv(void)
2377{
2378        return vmbus_driver_register(&hv_pci_drv);
2379}
2380
2381module_init(init_hv_pci_drv);
2382module_exit(exit_hv_pci_drv);
2383
2384MODULE_DESCRIPTION("Hyper-V PCI");
2385MODULE_LICENSE("GPL v2");
2386