linux/drivers/pci/controller/pci-hyperv.c
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * Copyright (c) Microsoft Corporation.
   4 *
   5 * Author:
   6 *   Jake Oshins <jakeo@microsoft.com>
   7 *
   8 * This driver acts as a paravirtual front-end for PCI Express root buses.
   9 * When a PCI Express function (either an entire device or an SR-IOV
  10 * Virtual Function) is being passed through to the VM, this driver exposes
  11 * a new bus to the guest VM.  This is modeled as a root PCI bus because
  12 * no bridges are being exposed to the VM.  In fact, with a "Generation 2"
  13 * VM within Hyper-V, there may seem to be no PCI bus at all in the VM
   14 * until a device has been exposed using this driver.
  15 *
  16 * Each root PCI bus has its own PCI domain, which is called "Segment" in
  17 * the PCI Firmware Specifications.  Thus while each device passed through
  18 * to the VM using this front-end will appear at "device 0", the domain will
  19 * be unique.  Typically, each bus will have one PCI function on it, though
  20 * this driver does support more than one.
  21 *
  22 * In order to map the interrupts from the device through to the guest VM,
  23 * this driver also implements an IRQ Domain, which handles interrupts (either
  24 * MSI or MSI-X) associated with the functions on the bus.  As interrupts are
   25 * set up, torn down, or retargeted, this driver communicates with the
  26 * underlying hypervisor to adjust the mappings in the I/O MMU so that each
  27 * interrupt will be delivered to the correct virtual processor at the right
  28 * vector.  This driver does not support level-triggered (line-based)
  29 * interrupts, and will report that the Interrupt Line register in the
  30 * function's configuration space is zero.
  31 *
  32 * The rest of this driver mostly maps PCI concepts onto underlying Hyper-V
  33 * facilities.  For instance, the configuration space of a function exposed
  34 * by Hyper-V is mapped into a single page of memory space, and the
  35 * read and write handlers for config space must be aware of this mechanism.
  36 * Similarly, device setup and teardown involves messages sent to and from
  37 * the PCI back-end driver in Hyper-V.
  38 */
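
/*
 * For example (a hypothetical guest view, consistent with the description
 * above): a passed-through VF might appear in the guest as PCI address
 * b8c0:00:00.0, where "b8c0" is that bus's private PCI domain (segment),
 * the bus number is 0, and the function sits at device 0, function 0.
 */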
  39
  40#include <linux/kernel.h>
  41#include <linux/module.h>
  42#include <linux/pci.h>
  43#include <linux/delay.h>
  44#include <linux/semaphore.h>
  45#include <linux/irqdomain.h>
  46#include <asm/irqdomain.h>
  47#include <asm/apic.h>
  48#include <linux/irq.h>
  49#include <linux/msi.h>
  50#include <linux/hyperv.h>
  51#include <linux/refcount.h>
  52#include <asm/mshyperv.h>
  53
  54/*
  55 * Protocol versions. The low word is the minor version, the high word the
  56 * major version.
  57 */
  58
  59#define PCI_MAKE_VERSION(major, minor) ((u32)(((major) << 16) | (minor)))
  60#define PCI_MAJOR_VERSION(version) ((u32)(version) >> 16)
   61#define PCI_MINOR_VERSION(version) ((u32)(version) & 0xffff)
  62
  63enum pci_protocol_version_t {
  64        PCI_PROTOCOL_VERSION_1_1 = PCI_MAKE_VERSION(1, 1),      /* Win10 */
  65        PCI_PROTOCOL_VERSION_1_2 = PCI_MAKE_VERSION(1, 2),      /* RS1 */
  66};
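
/*
 * For illustration: PCI_PROTOCOL_VERSION_1_2 is PCI_MAKE_VERSION(1, 2) ==
 * (1 << 16) | 2 == 0x00010002, from which PCI_MAJOR_VERSION() recovers 1
 * and PCI_MINOR_VERSION() recovers 2.
 */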
  67
  68#define CPU_AFFINITY_ALL        -1ULL
  69
  70/*
   71 * Supported protocol versions, in probing order - highest version
   72 * first.
  73 */
  74static enum pci_protocol_version_t pci_protocol_versions[] = {
  75        PCI_PROTOCOL_VERSION_1_2,
  76        PCI_PROTOCOL_VERSION_1_1,
  77};
  78
  79#define PCI_CONFIG_MMIO_LENGTH  0x2000
  80#define CFG_PAGE_OFFSET 0x1000
  81#define CFG_PAGE_SIZE (PCI_CONFIG_MMIO_LENGTH - CFG_PAGE_OFFSET)
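
/*
 * Resulting layout of each bus's config window (a sketch; see also the
 * comment above _hv_pcifront_read_config() below):
 *
 *   [0x0000, 0x1000)  function-select page: write a win_slot value here
 *   [0x1000, 0x2000)  config space of the selected function, CFG_PAGE_SIZE
 *                     bytes starting at CFG_PAGE_OFFSET
 */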
  82
  83#define MAX_SUPPORTED_MSI_MESSAGES 0x400
  84
  85#define STATUS_REVISION_MISMATCH 0xC0000059
  86
   87/* Space for a 32-bit serial number as a decimal string: 10 digits + NUL */
  88#define SLOT_NAME_SIZE 11
  89
  90/*
  91 * Message Types
  92 */
  93
  94enum pci_message_type {
  95        /*
  96         * Version 1.1
  97         */
  98        PCI_MESSAGE_BASE                = 0x42490000,
  99        PCI_BUS_RELATIONS               = PCI_MESSAGE_BASE + 0,
 100        PCI_QUERY_BUS_RELATIONS         = PCI_MESSAGE_BASE + 1,
 101        PCI_POWER_STATE_CHANGE          = PCI_MESSAGE_BASE + 4,
 102        PCI_QUERY_RESOURCE_REQUIREMENTS = PCI_MESSAGE_BASE + 5,
 103        PCI_QUERY_RESOURCE_RESOURCES    = PCI_MESSAGE_BASE + 6,
 104        PCI_BUS_D0ENTRY                 = PCI_MESSAGE_BASE + 7,
 105        PCI_BUS_D0EXIT                  = PCI_MESSAGE_BASE + 8,
 106        PCI_READ_BLOCK                  = PCI_MESSAGE_BASE + 9,
 107        PCI_WRITE_BLOCK                 = PCI_MESSAGE_BASE + 0xA,
 108        PCI_EJECT                       = PCI_MESSAGE_BASE + 0xB,
 109        PCI_QUERY_STOP                  = PCI_MESSAGE_BASE + 0xC,
 110        PCI_REENABLE                    = PCI_MESSAGE_BASE + 0xD,
 111        PCI_QUERY_STOP_FAILED           = PCI_MESSAGE_BASE + 0xE,
 112        PCI_EJECTION_COMPLETE           = PCI_MESSAGE_BASE + 0xF,
 113        PCI_RESOURCES_ASSIGNED          = PCI_MESSAGE_BASE + 0x10,
 114        PCI_RESOURCES_RELEASED          = PCI_MESSAGE_BASE + 0x11,
 115        PCI_INVALIDATE_BLOCK            = PCI_MESSAGE_BASE + 0x12,
 116        PCI_QUERY_PROTOCOL_VERSION      = PCI_MESSAGE_BASE + 0x13,
 117        PCI_CREATE_INTERRUPT_MESSAGE    = PCI_MESSAGE_BASE + 0x14,
 118        PCI_DELETE_INTERRUPT_MESSAGE    = PCI_MESSAGE_BASE + 0x15,
 119        PCI_RESOURCES_ASSIGNED2         = PCI_MESSAGE_BASE + 0x16,
 120        PCI_CREATE_INTERRUPT_MESSAGE2   = PCI_MESSAGE_BASE + 0x17,
 121        PCI_DELETE_INTERRUPT_MESSAGE2   = PCI_MESSAGE_BASE + 0x18, /* unused */
 122        PCI_MESSAGE_MAXIMUM
 123};
 124
 125/*
 126 * Structures defining the virtual PCI Express protocol.
 127 */
 128
 129union pci_version {
 130        struct {
 131                u16 minor_version;
 132                u16 major_version;
 133        } parts;
 134        u32 version;
 135} __packed;
 136
 137/*
 138 * Function numbers are 8-bits wide on Express, as interpreted through ARI,
 139 * which is all this driver does.  This representation is the one used in
 140 * Windows, which is what is expected when sending this back and forth with
 141 * the Hyper-V parent partition.
 142 */
 143union win_slot_encoding {
 144        struct {
 145                u32     dev:5;
 146                u32     func:3;
 147                u32     reserved:24;
 148        } bits;
 149        u32 slot;
 150} __packed;
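
/*
 * Worked example, assuming the little-endian bitfield layout used on the
 * architectures this driver supports: Linux devfn 0x0a is device 1,
 * function 2, so bits.dev = 1 (bits 0-4) and bits.func = 2 (bits 5-7),
 * giving wslot.slot == 0x41.  devfn_to_wslot() and wslot_to_devfn() below
 * perform these conversions.
 */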
 151
 152/*
 153 * Pretty much as defined in the PCI Specifications.
 154 */
 155struct pci_function_description {
 156        u16     v_id;   /* vendor ID */
 157        u16     d_id;   /* device ID */
 158        u8      rev;
 159        u8      prog_intf;
 160        u8      subclass;
 161        u8      base_class;
 162        u32     subsystem_id;
 163        union win_slot_encoding win_slot;
 164        u32     ser;    /* serial number */
 165} __packed;
 166
 167/**
 168 * struct hv_msi_desc
 169 * @vector:             IDT entry
 170 * @delivery_mode:      As defined in Intel's Programmer's
 171 *                      Reference Manual, Volume 3, Chapter 8.
 172 * @vector_count:       Number of contiguous entries in the
 173 *                      Interrupt Descriptor Table that are
 174 *                      occupied by this Message-Signaled
 175 *                      Interrupt. For "MSI", as first defined
 176 *                      in PCI 2.2, this can be between 1 and
 177 *                      32. For "MSI-X," as first defined in PCI
 178 *                      3.0, this must be 1, as each MSI-X table
 179 *                      entry would have its own descriptor.
 180 * @reserved:           Empty space
 181 * @cpu_mask:           All the target virtual processors.
 182 */
 183struct hv_msi_desc {
 184        u8      vector;
 185        u8      delivery_mode;
 186        u16     vector_count;
 187        u32     reserved;
 188        u64     cpu_mask;
 189} __packed;
 190
 191/**
 192 * struct hv_msi_desc2 - 1.2 version of hv_msi_desc
 193 * @vector:             IDT entry
 194 * @delivery_mode:      As defined in Intel's Programmer's
 195 *                      Reference Manual, Volume 3, Chapter 8.
 196 * @vector_count:       Number of contiguous entries in the
 197 *                      Interrupt Descriptor Table that are
 198 *                      occupied by this Message-Signaled
 199 *                      Interrupt. For "MSI", as first defined
 200 *                      in PCI 2.2, this can be between 1 and
 201 *                      32. For "MSI-X," as first defined in PCI
 202 *                      3.0, this must be 1, as each MSI-X table
 203 *                      entry would have its own descriptor.
 204 * @processor_count:    number of bits enabled in array.
 205 * @processor_array:    All the target virtual processors.
 206 */
 207struct hv_msi_desc2 {
 208        u8      vector;
 209        u8      delivery_mode;
 210        u16     vector_count;
 211        u16     processor_count;
 212        u16     processor_array[32];
 213} __packed;
 214
 215/**
 216 * struct tran_int_desc
 217 * @reserved:           unused, padding
 218 * @vector_count:       same as in hv_msi_desc
 219 * @data:               This is the "data payload" value that is
 220 *                      written by the device when it generates
 221 *                      a message-signaled interrupt, either MSI
 222 *                      or MSI-X.
 223 * @address:            This is the address to which the data
 224 *                      payload is written on interrupt
 225 *                      generation.
 226 */
 227struct tran_int_desc {
 228        u16     reserved;
 229        u16     vector_count;
 230        u32     data;
 231        u64     address;
 232} __packed;
 233
 234/*
 235 * A generic message format for virtual PCI.
 236 * Specific message formats are defined later in the file.
 237 */
 238
 239struct pci_message {
 240        u32 type;
 241} __packed;
 242
 243struct pci_child_message {
 244        struct pci_message message_type;
 245        union win_slot_encoding wslot;
 246} __packed;
 247
 248struct pci_incoming_message {
 249        struct vmpacket_descriptor hdr;
 250        struct pci_message message_type;
 251} __packed;
 252
 253struct pci_response {
 254        struct vmpacket_descriptor hdr;
 255        s32 status;                     /* negative values are failures */
 256} __packed;
 257
 258struct pci_packet {
 259        void (*completion_func)(void *context, struct pci_response *resp,
 260                                int resp_packet_size);
 261        void *compl_ctxt;
 262
 263        struct pci_message message[0];
 264};
 265
 266/*
 267 * Specific message types supporting the PCI protocol.
 268 */
 269
 270/*
 271 * Version negotiation message. Sent from the guest to the host.
 272 * The guest is free to try different versions until the host
 273 * accepts the version.
 274 *
  275 * protocol_version: The protocol version requested.
 278 */
 279
 280struct pci_version_request {
 281        struct pci_message message_type;
 282        u32 protocol_version;
 283} __packed;
 284
 285/*
 286 * Bus D0 Entry.  This is sent from the guest to the host when the virtual
 287 * bus (PCI Express port) is ready for action.
 288 */
 289
 290struct pci_bus_d0_entry {
 291        struct pci_message message_type;
 292        u32 reserved;
 293        u64 mmio_base;
 294} __packed;
 295
 296struct pci_bus_relations {
 297        struct pci_incoming_message incoming;
 298        u32 device_count;
 299        struct pci_function_description func[0];
 300} __packed;
 301
 302struct pci_q_res_req_response {
 303        struct vmpacket_descriptor hdr;
 304        s32 status;                     /* negative values are failures */
 305        u32 probed_bar[PCI_STD_NUM_BARS];
 306} __packed;
 307
 308struct pci_set_power {
 309        struct pci_message message_type;
 310        union win_slot_encoding wslot;
 311        u32 power_state;                /* In Windows terms */
 312        u32 reserved;
 313} __packed;
 314
 315struct pci_set_power_response {
 316        struct vmpacket_descriptor hdr;
 317        s32 status;                     /* negative values are failures */
 318        union win_slot_encoding wslot;
 319        u32 resultant_state;            /* In Windows terms */
 320        u32 reserved;
 321} __packed;
 322
 323struct pci_resources_assigned {
 324        struct pci_message message_type;
 325        union win_slot_encoding wslot;
 326        u8 memory_range[0x14][6];       /* not used here */
 327        u32 msi_descriptors;
 328        u32 reserved[4];
 329} __packed;
 330
 331struct pci_resources_assigned2 {
 332        struct pci_message message_type;
 333        union win_slot_encoding wslot;
 334        u8 memory_range[0x14][6];       /* not used here */
 335        u32 msi_descriptor_count;
 336        u8 reserved[70];
 337} __packed;
 338
 339struct pci_create_interrupt {
 340        struct pci_message message_type;
 341        union win_slot_encoding wslot;
 342        struct hv_msi_desc int_desc;
 343} __packed;
 344
 345struct pci_create_int_response {
 346        struct pci_response response;
 347        u32 reserved;
 348        struct tran_int_desc int_desc;
 349} __packed;
 350
 351struct pci_create_interrupt2 {
 352        struct pci_message message_type;
 353        union win_slot_encoding wslot;
 354        struct hv_msi_desc2 int_desc;
 355} __packed;
 356
 357struct pci_delete_interrupt {
 358        struct pci_message message_type;
 359        union win_slot_encoding wslot;
 360        struct tran_int_desc int_desc;
 361} __packed;
 362
 363/*
 364 * Note: the VM must pass a valid block id, wslot and bytes_requested.
 365 */
 366struct pci_read_block {
 367        struct pci_message message_type;
 368        u32 block_id;
 369        union win_slot_encoding wslot;
 370        u32 bytes_requested;
 371} __packed;
 372
 373struct pci_read_block_response {
 374        struct vmpacket_descriptor hdr;
 375        u32 status;
 376        u8 bytes[HV_CONFIG_BLOCK_SIZE_MAX];
 377} __packed;
 378
 379/*
 380 * Note: the VM must pass a valid block id, wslot and byte_count.
 381 */
 382struct pci_write_block {
 383        struct pci_message message_type;
 384        u32 block_id;
 385        union win_slot_encoding wslot;
 386        u32 byte_count;
 387        u8 bytes[HV_CONFIG_BLOCK_SIZE_MAX];
 388} __packed;
 389
 390struct pci_dev_inval_block {
 391        struct pci_incoming_message incoming;
 392        union win_slot_encoding wslot;
 393        u64 block_mask;
 394} __packed;
 395
 396struct pci_dev_incoming {
 397        struct pci_incoming_message incoming;
 398        union win_slot_encoding wslot;
 399} __packed;
 400
 401struct pci_eject_response {
 402        struct pci_message message_type;
 403        union win_slot_encoding wslot;
 404        u32 status;
 405} __packed;
 406
 407static int pci_ring_size = (4 * PAGE_SIZE);
 408
 409/*
  410 * Definitions for the interrupt steering hypercall.
 411 */
 412#define HV_PARTITION_ID_SELF            ((u64)-1)
 413#define HVCALL_RETARGET_INTERRUPT       0x7e
 414
 415struct hv_interrupt_entry {
 416        u32     source;                 /* 1 for MSI(-X) */
 417        u32     reserved1;
 418        u32     address;
 419        u32     data;
 420};
 421
 422/*
 423 * flags for hv_device_interrupt_target.flags
 424 */
 425#define HV_DEVICE_INTERRUPT_TARGET_MULTICAST            1
 426#define HV_DEVICE_INTERRUPT_TARGET_PROCESSOR_SET        2
 427
 428struct hv_device_interrupt_target {
 429        u32     vector;
 430        u32     flags;
 431        union {
 432                u64              vp_mask;
 433                struct hv_vpset vp_set;
 434        };
 435};
 436
 437struct retarget_msi_interrupt {
 438        u64     partition_id;           /* use "self" */
 439        u64     device_id;
 440        struct hv_interrupt_entry int_entry;
 441        u64     reserved2;
 442        struct hv_device_interrupt_target int_target;
 443} __packed __aligned(8);
 444
 445/*
 446 * Driver specific state.
 447 */
 448
 449enum hv_pcibus_state {
 450        hv_pcibus_init = 0,
 451        hv_pcibus_probed,
 452        hv_pcibus_installed,
 453        hv_pcibus_removing,
 454        hv_pcibus_removed,
 455        hv_pcibus_maximum
 456};
 457
 458struct hv_pcibus_device {
 459        struct pci_sysdata sysdata;
 460        /* Protocol version negotiated with the host */
 461        enum pci_protocol_version_t protocol_version;
 462        enum hv_pcibus_state state;
 463        refcount_t remove_lock;
 464        struct hv_device *hdev;
 465        resource_size_t low_mmio_space;
 466        resource_size_t high_mmio_space;
 467        struct resource *mem_config;
 468        struct resource *low_mmio_res;
 469        struct resource *high_mmio_res;
 470        struct completion *survey_event;
 471        struct completion remove_event;
 472        struct pci_bus *pci_bus;
 473        spinlock_t config_lock; /* Avoid two threads writing index page */
 474        spinlock_t device_list_lock;    /* Protect lists below */
 475        void __iomem *cfg_addr;
 476
 477        struct list_head resources_for_children;
 478
 479        struct list_head children;
 480        struct list_head dr_list;
 481
 482        struct msi_domain_info msi_info;
 483        struct msi_controller msi_chip;
 484        struct irq_domain *irq_domain;
 485
 486        spinlock_t retarget_msi_interrupt_lock;
 487
 488        struct workqueue_struct *wq;
 489
 490        /* hypercall arg, must not cross page boundary */
 491        struct retarget_msi_interrupt retarget_msi_interrupt_params;
 492
 493        /*
 494         * Don't put anything here: retarget_msi_interrupt_params must be last
 495         */
 496};
 497
 498/*
 499 * Tracks "Device Relations" messages from the host, which must be both
 500 * processed in order and deferred so that they don't run in the context
 501 * of the incoming packet callback.
 502 */
 503struct hv_dr_work {
 504        struct work_struct wrk;
 505        struct hv_pcibus_device *bus;
 506};
 507
 508struct hv_dr_state {
 509        struct list_head list_entry;
 510        u32 device_count;
 511        struct pci_function_description func[0];
 512};
 513
 514enum hv_pcichild_state {
 515        hv_pcichild_init = 0,
 516        hv_pcichild_requirements,
 517        hv_pcichild_resourced,
 518        hv_pcichild_ejecting,
 519        hv_pcichild_maximum
 520};
 521
 522struct hv_pci_dev {
 523        /* List protected by pci_rescan_remove_lock */
 524        struct list_head list_entry;
 525        refcount_t refs;
 526        enum hv_pcichild_state state;
 527        struct pci_slot *pci_slot;
 528        struct pci_function_description desc;
 529        bool reported_missing;
 530        struct hv_pcibus_device *hbus;
 531        struct work_struct wrk;
 532
 533        void (*block_invalidate)(void *context, u64 block_mask);
 534        void *invalidate_context;
 535
 536        /*
 537         * What would be observed if one wrote 0xFFFFFFFF to a BAR and then
 538         * read it back, for each of the BAR offsets within config space.
 539         */
 540        u32 probed_bar[PCI_STD_NUM_BARS];
 541};
 542
 543struct hv_pci_compl {
 544        struct completion host_event;
 545        s32 completion_status;
 546};
 547
 548static void hv_pci_onchannelcallback(void *context);
 549
 550/**
 551 * hv_pci_generic_compl() - Invoked for a completion packet
 552 * @context:            Set up by the sender of the packet.
 553 * @resp:               The response packet
 554 * @resp_packet_size:   Size in bytes of the packet
 555 *
 556 * This function is used to trigger an event and report status
 557 * for any message for which the completion packet contains a
 558 * status and nothing else.
 559 */
 560static void hv_pci_generic_compl(void *context, struct pci_response *resp,
 561                                 int resp_packet_size)
 562{
 563        struct hv_pci_compl *comp_pkt = context;
 564
 565        if (resp_packet_size >= offsetofend(struct pci_response, status))
 566                comp_pkt->completion_status = resp->status;
 567        else
 568                comp_pkt->completion_status = -1;
 569
 570        complete(&comp_pkt->host_event);
 571}
 572
 573static struct hv_pci_dev *get_pcichild_wslot(struct hv_pcibus_device *hbus,
 574                                                u32 wslot);
 575
 576static void get_pcichild(struct hv_pci_dev *hpdev)
 577{
 578        refcount_inc(&hpdev->refs);
 579}
 580
 581static void put_pcichild(struct hv_pci_dev *hpdev)
 582{
 583        if (refcount_dec_and_test(&hpdev->refs))
 584                kfree(hpdev);
 585}
 586
 587static void get_hvpcibus(struct hv_pcibus_device *hv_pcibus);
 588static void put_hvpcibus(struct hv_pcibus_device *hv_pcibus);
 589
 590/*
 591 * There is no good way to get notified from vmbus_onoffer_rescind(),
 592 * so let's use polling here, since this is not a hot path.
 593 */
 594static int wait_for_response(struct hv_device *hdev,
 595                             struct completion *comp)
 596{
 597        while (true) {
 598                if (hdev->channel->rescind) {
 599                        dev_warn_once(&hdev->device, "The device is gone.\n");
 600                        return -ENODEV;
 601                }
 602
 603                if (wait_for_completion_timeout(comp, HZ / 10))
 604                        break;
 605        }
 606
 607        return 0;
 608}
 609
 610/**
 611 * devfn_to_wslot() - Convert from Linux PCI slot to Windows
 612 * @devfn:      The Linux representation of PCI slot
 613 *
 614 * Windows uses a slightly different representation of PCI slot.
 615 *
 616 * Return: The Windows representation
 617 */
 618static u32 devfn_to_wslot(int devfn)
 619{
 620        union win_slot_encoding wslot;
 621
 622        wslot.slot = 0;
 623        wslot.bits.dev = PCI_SLOT(devfn);
 624        wslot.bits.func = PCI_FUNC(devfn);
 625
 626        return wslot.slot;
 627}
 628
 629/**
 630 * wslot_to_devfn() - Convert from Windows PCI slot to Linux
 631 * @wslot:      The Windows representation of PCI slot
 632 *
 633 * Windows uses a slightly different representation of PCI slot.
 634 *
 635 * Return: The Linux representation
 636 */
 637static int wslot_to_devfn(u32 wslot)
 638{
 639        union win_slot_encoding slot_no;
 640
 641        slot_no.slot = wslot;
 642        return PCI_DEVFN(slot_no.bits.dev, slot_no.bits.func);
 643}
 644
 645/*
 646 * PCI Configuration Space for these root PCI buses is implemented as a pair
 647 * of pages in memory-mapped I/O space.  Writing to the first page chooses
 648 * the PCI function being written or read.  Once the first page has been
 649 * written to, the following page maps in the entire configuration space of
 650 * the function.
 651 */
 652
 653/**
 654 * _hv_pcifront_read_config() - Internal PCI config read
 655 * @hpdev:      The PCI driver's representation of the device
 656 * @where:      Offset within config space
 657 * @size:       Size of the transfer
 658 * @val:        Pointer to the buffer receiving the data
 659 */
 660static void _hv_pcifront_read_config(struct hv_pci_dev *hpdev, int where,
 661                                     int size, u32 *val)
 662{
 663        unsigned long flags;
 664        void __iomem *addr = hpdev->hbus->cfg_addr + CFG_PAGE_OFFSET + where;
 665
 666        /*
 667         * If the attempt is to read the IDs or the ROM BAR, simulate that.
 668         */
 669        if (where + size <= PCI_COMMAND) {
 670                memcpy(val, ((u8 *)&hpdev->desc.v_id) + where, size);
 671        } else if (where >= PCI_CLASS_REVISION && where + size <=
 672                   PCI_CACHE_LINE_SIZE) {
 673                memcpy(val, ((u8 *)&hpdev->desc.rev) + where -
 674                       PCI_CLASS_REVISION, size);
 675        } else if (where >= PCI_SUBSYSTEM_VENDOR_ID && where + size <=
 676                   PCI_ROM_ADDRESS) {
 677                memcpy(val, (u8 *)&hpdev->desc.subsystem_id + where -
 678                       PCI_SUBSYSTEM_VENDOR_ID, size);
 679        } else if (where >= PCI_ROM_ADDRESS && where + size <=
 680                   PCI_CAPABILITY_LIST) {
 681                /* ROM BARs are unimplemented */
 682                *val = 0;
 683        } else if (where >= PCI_INTERRUPT_LINE && where + size <=
 684                   PCI_INTERRUPT_PIN) {
 685                /*
 686                 * Interrupt Line and Interrupt PIN are hard-wired to zero
 687                 * because this front-end only supports message-signaled
 688                 * interrupts.
 689                 */
 690                *val = 0;
 691        } else if (where + size <= CFG_PAGE_SIZE) {
 692                spin_lock_irqsave(&hpdev->hbus->config_lock, flags);
 693                /* Choose the function to be read. (See comment above) */
 694                writel(hpdev->desc.win_slot.slot, hpdev->hbus->cfg_addr);
 695                /* Make sure the function was chosen before we start reading. */
 696                mb();
 697                /* Read from that function's config space. */
 698                switch (size) {
 699                case 1:
 700                        *val = readb(addr);
 701                        break;
 702                case 2:
 703                        *val = readw(addr);
 704                        break;
 705                default:
 706                        *val = readl(addr);
 707                        break;
 708                }
 709                /*
 710                 * Make sure the read was done before we release the spinlock
 711                 * allowing consecutive reads/writes.
 712                 */
 713                mb();
 714                spin_unlock_irqrestore(&hpdev->hbus->config_lock, flags);
 715        } else {
 716                dev_err(&hpdev->hbus->hdev->device,
 717                        "Attempt to read beyond a function's config space.\n");
 718        }
 719}
 720
 721static u16 hv_pcifront_get_vendor_id(struct hv_pci_dev *hpdev)
 722{
 723        u16 ret;
 724        unsigned long flags;
 725        void __iomem *addr = hpdev->hbus->cfg_addr + CFG_PAGE_OFFSET +
 726                             PCI_VENDOR_ID;
 727
 728        spin_lock_irqsave(&hpdev->hbus->config_lock, flags);
 729
 730        /* Choose the function to be read. (See comment above) */
 731        writel(hpdev->desc.win_slot.slot, hpdev->hbus->cfg_addr);
 732        /* Make sure the function was chosen before we start reading. */
 733        mb();
 734        /* Read from that function's config space. */
 735        ret = readw(addr);
 736        /*
 737         * mb() is not required here, because the spin_unlock_irqrestore()
 738         * is a barrier.
 739         */
 740
 741        spin_unlock_irqrestore(&hpdev->hbus->config_lock, flags);
 742
 743        return ret;
 744}
 745
 746/**
 747 * _hv_pcifront_write_config() - Internal PCI config write
 748 * @hpdev:      The PCI driver's representation of the device
 749 * @where:      Offset within config space
 750 * @size:       Size of the transfer
 751 * @val:        The data being transferred
 752 */
 753static void _hv_pcifront_write_config(struct hv_pci_dev *hpdev, int where,
 754                                      int size, u32 val)
 755{
 756        unsigned long flags;
 757        void __iomem *addr = hpdev->hbus->cfg_addr + CFG_PAGE_OFFSET + where;
 758
 759        if (where >= PCI_SUBSYSTEM_VENDOR_ID &&
 760            where + size <= PCI_CAPABILITY_LIST) {
 761                /* SSIDs and ROM BARs are read-only */
 762        } else if (where >= PCI_COMMAND && where + size <= CFG_PAGE_SIZE) {
 763                spin_lock_irqsave(&hpdev->hbus->config_lock, flags);
 764                /* Choose the function to be written. (See comment above) */
 765                writel(hpdev->desc.win_slot.slot, hpdev->hbus->cfg_addr);
 766                /* Make sure the function was chosen before we start writing. */
 767                wmb();
 768                /* Write to that function's config space. */
 769                switch (size) {
 770                case 1:
 771                        writeb(val, addr);
 772                        break;
 773                case 2:
 774                        writew(val, addr);
 775                        break;
 776                default:
 777                        writel(val, addr);
 778                        break;
 779                }
 780                /*
 781                 * Make sure the write was done before we release the spinlock
 782                 * allowing consecutive reads/writes.
 783                 */
 784                mb();
 785                spin_unlock_irqrestore(&hpdev->hbus->config_lock, flags);
 786        } else {
 787                dev_err(&hpdev->hbus->hdev->device,
 788                        "Attempt to write beyond a function's config space.\n");
 789        }
 790}
 791
 792/**
 793 * hv_pcifront_read_config() - Read configuration space
 794 * @bus: PCI Bus structure
 795 * @devfn: Device/function
 796 * @where: Offset from base
 797 * @size: Byte/word/dword
 798 * @val: Value to be read
 799 *
 800 * Return: PCIBIOS_SUCCESSFUL on success
 801 *         PCIBIOS_DEVICE_NOT_FOUND on failure
 802 */
 803static int hv_pcifront_read_config(struct pci_bus *bus, unsigned int devfn,
 804                                   int where, int size, u32 *val)
 805{
 806        struct hv_pcibus_device *hbus =
 807                container_of(bus->sysdata, struct hv_pcibus_device, sysdata);
 808        struct hv_pci_dev *hpdev;
 809
 810        hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(devfn));
 811        if (!hpdev)
 812                return PCIBIOS_DEVICE_NOT_FOUND;
 813
 814        _hv_pcifront_read_config(hpdev, where, size, val);
 815
 816        put_pcichild(hpdev);
 817        return PCIBIOS_SUCCESSFUL;
 818}
 819
 820/**
 821 * hv_pcifront_write_config() - Write configuration space
 822 * @bus: PCI Bus structure
 823 * @devfn: Device/function
 824 * @where: Offset from base
 825 * @size: Byte/word/dword
 826 * @val: Value to be written to device
 827 *
 828 * Return: PCIBIOS_SUCCESSFUL on success
 829 *         PCIBIOS_DEVICE_NOT_FOUND on failure
 830 */
 831static int hv_pcifront_write_config(struct pci_bus *bus, unsigned int devfn,
 832                                    int where, int size, u32 val)
 833{
 834        struct hv_pcibus_device *hbus =
 835            container_of(bus->sysdata, struct hv_pcibus_device, sysdata);
 836        struct hv_pci_dev *hpdev;
 837
 838        hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(devfn));
 839        if (!hpdev)
 840                return PCIBIOS_DEVICE_NOT_FOUND;
 841
 842        _hv_pcifront_write_config(hpdev, where, size, val);
 843
 844        put_pcichild(hpdev);
 845        return PCIBIOS_SUCCESSFUL;
 846}
 847
 848/* PCIe operations */
 849static struct pci_ops hv_pcifront_ops = {
 850        .read  = hv_pcifront_read_config,
 851        .write = hv_pcifront_write_config,
 852};
 853
 854/*
 855 * Paravirtual backchannel
 856 *
 857 * Hyper-V SR-IOV provides a backchannel mechanism in software for
 858 * communication between a VF driver and a PF driver.  These
 859 * "configuration blocks" are similar in concept to PCI configuration space,
 860 * but instead of doing reads and writes in 32-bit chunks through a very slow
 861 * path, packets of up to 128 bytes can be sent or received asynchronously.
 862 *
 863 * Nearly every SR-IOV device contains just such a communications channel in
 864 * hardware, so using this one in software is usually optional.  Using the
 865 * software channel, however, allows driver implementers to leverage software
 866 * tools that fuzz the communications channel looking for vulnerabilities.
 867 *
 868 * The usage model for these packets puts the responsibility for reading or
 869 * writing on the VF driver.  The VF driver sends a read or a write packet,
 870 * indicating which "block" is being referred to by number.
 871 *
 872 * If the PF driver wishes to initiate communication, it can "invalidate" one or
  873 * more of the first 64 blocks.  This invalidation is delivered via a callback
  874 * that the VF driver supplied to this driver.
 875 *
 876 * No protocol is implied, except that supplied by the PF and VF drivers.
 877 */
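
/*
 * A minimal usage sketch from a hypothetical VF driver (vf_ctx and
 * vf_invalidate_cb are illustrative names, not part of this file):
 *
 *	u8 data[HV_CONFIG_BLOCK_SIZE_MAX];
 *	unsigned int returned;
 *
 *	if (hv_read_config_block(pdev, data, sizeof(data), 0, &returned) == 0)
 *		hv_write_config_block(pdev, data, returned, 0);
 *	hv_register_block_invalidate(pdev, vf_ctx, vf_invalidate_cb);
 */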
 878
 879struct hv_read_config_compl {
 880        struct hv_pci_compl comp_pkt;
 881        void *buf;
 882        unsigned int len;
 883        unsigned int bytes_returned;
 884};
 885
 886/**
 887 * hv_pci_read_config_compl() - Invoked when a response packet
 888 * for a read config block operation arrives.
 889 * @context:            Identifies the read config operation
 890 * @resp:               The response packet itself
 891 * @resp_packet_size:   Size in bytes of the response packet
 892 */
 893static void hv_pci_read_config_compl(void *context, struct pci_response *resp,
 894                                     int resp_packet_size)
 895{
 896        struct hv_read_config_compl *comp = context;
 897        struct pci_read_block_response *read_resp =
 898                (struct pci_read_block_response *)resp;
 899        unsigned int data_len, hdr_len;
 900
 901        hdr_len = offsetof(struct pci_read_block_response, bytes);
 902        if (resp_packet_size < hdr_len) {
 903                comp->comp_pkt.completion_status = -1;
 904                goto out;
 905        }
 906
 907        data_len = resp_packet_size - hdr_len;
 908        if (data_len > 0 && read_resp->status == 0) {
 909                comp->bytes_returned = min(comp->len, data_len);
 910                memcpy(comp->buf, read_resp->bytes, comp->bytes_returned);
 911        } else {
 912                comp->bytes_returned = 0;
 913        }
 914
 915        comp->comp_pkt.completion_status = read_resp->status;
 916out:
 917        complete(&comp->comp_pkt.host_event);
 918}
 919
 920/**
 921 * hv_read_config_block() - Sends a read config block request to
 922 * the back-end driver running in the Hyper-V parent partition.
 923 * @pdev:               The PCI driver's representation for this device.
 924 * @buf:                Buffer into which the config block will be copied.
 925 * @len:                Size in bytes of buf.
 926 * @block_id:           Identifies the config block which has been requested.
 927 * @bytes_returned:     Size which came back from the back-end driver.
 928 *
 929 * Return: 0 on success, -errno on failure
 930 */
 931int hv_read_config_block(struct pci_dev *pdev, void *buf, unsigned int len,
 932                         unsigned int block_id, unsigned int *bytes_returned)
 933{
 934        struct hv_pcibus_device *hbus =
 935                container_of(pdev->bus->sysdata, struct hv_pcibus_device,
 936                             sysdata);
 937        struct {
 938                struct pci_packet pkt;
 939                char buf[sizeof(struct pci_read_block)];
 940        } pkt;
 941        struct hv_read_config_compl comp_pkt;
 942        struct pci_read_block *read_blk;
 943        int ret;
 944
 945        if (len == 0 || len > HV_CONFIG_BLOCK_SIZE_MAX)
 946                return -EINVAL;
 947
 948        init_completion(&comp_pkt.comp_pkt.host_event);
 949        comp_pkt.buf = buf;
 950        comp_pkt.len = len;
 951
 952        memset(&pkt, 0, sizeof(pkt));
 953        pkt.pkt.completion_func = hv_pci_read_config_compl;
 954        pkt.pkt.compl_ctxt = &comp_pkt;
 955        read_blk = (struct pci_read_block *)&pkt.pkt.message;
 956        read_blk->message_type.type = PCI_READ_BLOCK;
 957        read_blk->wslot.slot = devfn_to_wslot(pdev->devfn);
 958        read_blk->block_id = block_id;
 959        read_blk->bytes_requested = len;
 960
 961        ret = vmbus_sendpacket(hbus->hdev->channel, read_blk,
 962                               sizeof(*read_blk), (unsigned long)&pkt.pkt,
 963                               VM_PKT_DATA_INBAND,
 964                               VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
 965        if (ret)
 966                return ret;
 967
 968        ret = wait_for_response(hbus->hdev, &comp_pkt.comp_pkt.host_event);
 969        if (ret)
 970                return ret;
 971
 972        if (comp_pkt.comp_pkt.completion_status != 0 ||
 973            comp_pkt.bytes_returned == 0) {
 974                dev_err(&hbus->hdev->device,
 975                        "Read Config Block failed: 0x%x, bytes_returned=%d\n",
 976                        comp_pkt.comp_pkt.completion_status,
 977                        comp_pkt.bytes_returned);
 978                return -EIO;
 979        }
 980
 981        *bytes_returned = comp_pkt.bytes_returned;
 982        return 0;
 983}
 984
 985/**
 986 * hv_pci_write_config_compl() - Invoked when a response packet for a write
 987 * config block operation arrives.
 988 * @context:            Identifies the write config operation
 989 * @resp:               The response packet itself
 990 * @resp_packet_size:   Size in bytes of the response packet
 991 */
 992static void hv_pci_write_config_compl(void *context, struct pci_response *resp,
 993                                      int resp_packet_size)
 994{
 995        struct hv_pci_compl *comp_pkt = context;
 996
 997        comp_pkt->completion_status = resp->status;
 998        complete(&comp_pkt->host_event);
 999}
1000
1001/**
1002 * hv_write_config_block() - Sends a write config block request to the
1003 * back-end driver running in the Hyper-V parent partition.
1004 * @pdev:               The PCI driver's representation for this device.
1005 * @buf:                Buffer from which the config block will be copied.
1006 * @len:                Size in bytes of buf.
1007 * @block_id:           Identifies the config block which is being written.
1008 *
1009 * Return: 0 on success, -errno on failure
1010 */
1011int hv_write_config_block(struct pci_dev *pdev, void *buf, unsigned int len,
1012                          unsigned int block_id)
1013{
1014        struct hv_pcibus_device *hbus =
1015                container_of(pdev->bus->sysdata, struct hv_pcibus_device,
1016                             sysdata);
1017        struct {
1018                struct pci_packet pkt;
1019                char buf[sizeof(struct pci_write_block)];
1020                u32 reserved;
1021        } pkt;
1022        struct hv_pci_compl comp_pkt;
1023        struct pci_write_block *write_blk;
1024        u32 pkt_size;
1025        int ret;
1026
1027        if (len == 0 || len > HV_CONFIG_BLOCK_SIZE_MAX)
1028                return -EINVAL;
1029
1030        init_completion(&comp_pkt.host_event);
1031
1032        memset(&pkt, 0, sizeof(pkt));
1033        pkt.pkt.completion_func = hv_pci_write_config_compl;
1034        pkt.pkt.compl_ctxt = &comp_pkt;
1035        write_blk = (struct pci_write_block *)&pkt.pkt.message;
1036        write_blk->message_type.type = PCI_WRITE_BLOCK;
1037        write_blk->wslot.slot = devfn_to_wslot(pdev->devfn);
1038        write_blk->block_id = block_id;
1039        write_blk->byte_count = len;
1040        memcpy(write_blk->bytes, buf, len);
1041        pkt_size = offsetof(struct pci_write_block, bytes) + len;
1042        /*
1043         * This quirk is required on some hosts shipped around 2018, because
1044         * these hosts don't check the pkt_size correctly (new hosts have been
1045         * fixed since early 2019). The quirk is also safe on very old hosts
1046         * and new hosts, because, on them, what really matters is the length
1047         * specified in write_blk->byte_count.
1048         */
1049        pkt_size += sizeof(pkt.reserved);
1050
1051        ret = vmbus_sendpacket(hbus->hdev->channel, write_blk, pkt_size,
1052                               (unsigned long)&pkt.pkt, VM_PKT_DATA_INBAND,
1053                               VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
1054        if (ret)
1055                return ret;
1056
1057        ret = wait_for_response(hbus->hdev, &comp_pkt.host_event);
1058        if (ret)
1059                return ret;
1060
1061        if (comp_pkt.completion_status != 0) {
1062                dev_err(&hbus->hdev->device,
1063                        "Write Config Block failed: 0x%x\n",
1064                        comp_pkt.completion_status);
1065                return -EIO;
1066        }
1067
1068        return 0;
1069}
1070
1071/**
1072 * hv_register_block_invalidate() - Invoked when a config block invalidation
1073 * arrives from the back-end driver.
1074 * @pdev:               The PCI driver's representation for this device.
1075 * @context:            Identifies the device.
1076 * @block_invalidate:   Identifies all of the blocks being invalidated.
1077 *
1078 * Return: 0 on success, -errno on failure
1079 */
1080int hv_register_block_invalidate(struct pci_dev *pdev, void *context,
1081                                 void (*block_invalidate)(void *context,
1082                                                          u64 block_mask))
1083{
1084        struct hv_pcibus_device *hbus =
1085                container_of(pdev->bus->sysdata, struct hv_pcibus_device,
1086                             sysdata);
1087        struct hv_pci_dev *hpdev;
1088
1089        hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(pdev->devfn));
1090        if (!hpdev)
1091                return -ENODEV;
1092
1093        hpdev->block_invalidate = block_invalidate;
1094        hpdev->invalidate_context = context;
1095
1096        put_pcichild(hpdev);
1097        return 0;
 1098}
 1099
1101/* Interrupt management hooks */
1102static void hv_int_desc_free(struct hv_pci_dev *hpdev,
1103                             struct tran_int_desc *int_desc)
1104{
1105        struct pci_delete_interrupt *int_pkt;
1106        struct {
1107                struct pci_packet pkt;
1108                u8 buffer[sizeof(struct pci_delete_interrupt)];
1109        } ctxt;
1110
1111        memset(&ctxt, 0, sizeof(ctxt));
1112        int_pkt = (struct pci_delete_interrupt *)&ctxt.pkt.message;
1113        int_pkt->message_type.type =
1114                PCI_DELETE_INTERRUPT_MESSAGE;
1115        int_pkt->wslot.slot = hpdev->desc.win_slot.slot;
1116        int_pkt->int_desc = *int_desc;
1117        vmbus_sendpacket(hpdev->hbus->hdev->channel, int_pkt, sizeof(*int_pkt),
1118                         (unsigned long)&ctxt.pkt, VM_PKT_DATA_INBAND, 0);
1119        kfree(int_desc);
1120}
1121
1122/**
1123 * hv_msi_free() - Free the MSI.
1124 * @domain:     The interrupt domain pointer
1125 * @info:       Extra MSI-related context
1126 * @irq:        Identifies the IRQ.
1127 *
1128 * The Hyper-V parent partition and hypervisor are tracking the
1129 * messages that are in use, keeping the interrupt redirection
1130 * table up to date.  This callback sends a message that frees
1131 * the IRT entry and related tracking nonsense.
1132 */
1133static void hv_msi_free(struct irq_domain *domain, struct msi_domain_info *info,
1134                        unsigned int irq)
1135{
1136        struct hv_pcibus_device *hbus;
1137        struct hv_pci_dev *hpdev;
1138        struct pci_dev *pdev;
1139        struct tran_int_desc *int_desc;
1140        struct irq_data *irq_data = irq_domain_get_irq_data(domain, irq);
1141        struct msi_desc *msi = irq_data_get_msi_desc(irq_data);
1142
1143        pdev = msi_desc_to_pci_dev(msi);
1144        hbus = info->data;
1145        int_desc = irq_data_get_irq_chip_data(irq_data);
1146        if (!int_desc)
1147                return;
1148
1149        irq_data->chip_data = NULL;
1150        hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(pdev->devfn));
1151        if (!hpdev) {
1152                kfree(int_desc);
1153                return;
1154        }
1155
1156        hv_int_desc_free(hpdev, int_desc);
1157        put_pcichild(hpdev);
1158}
1159
1160static int hv_set_affinity(struct irq_data *data, const struct cpumask *dest,
1161                           bool force)
1162{
1163        struct irq_data *parent = data->parent_data;
1164
1165        return parent->chip->irq_set_affinity(parent, dest, force);
1166}
1167
1168static void hv_irq_mask(struct irq_data *data)
1169{
1170        pci_msi_mask_irq(data);
1171}
1172
1173/**
1174 * hv_irq_unmask() - "Unmask" the IRQ by setting its current
1175 * affinity.
1176 * @data:       Describes the IRQ
1177 *
 1178 * Build a new destination for the MSI and make a hypercall to
1179 * update the Interrupt Redirection Table. "Device Logical ID"
1180 * is built out of this PCI bus's instance GUID and the function
1181 * number of the device.
1182 */
1183static void hv_irq_unmask(struct irq_data *data)
1184{
1185        struct msi_desc *msi_desc = irq_data_get_msi_desc(data);
1186        struct irq_cfg *cfg = irqd_cfg(data);
1187        struct retarget_msi_interrupt *params;
1188        struct hv_pcibus_device *hbus;
1189        struct cpumask *dest;
1190        cpumask_var_t tmp;
1191        struct pci_bus *pbus;
1192        struct pci_dev *pdev;
1193        unsigned long flags;
1194        u32 var_size = 0;
1195        int cpu, nr_bank;
1196        u64 res;
1197
1198        dest = irq_data_get_effective_affinity_mask(data);
1199        pdev = msi_desc_to_pci_dev(msi_desc);
1200        pbus = pdev->bus;
1201        hbus = container_of(pbus->sysdata, struct hv_pcibus_device, sysdata);
1202
1203        spin_lock_irqsave(&hbus->retarget_msi_interrupt_lock, flags);
1204
1205        params = &hbus->retarget_msi_interrupt_params;
1206        memset(params, 0, sizeof(*params));
1207        params->partition_id = HV_PARTITION_ID_SELF;
1208        params->int_entry.source = 1; /* MSI(-X) */
1209        params->int_entry.address = msi_desc->msg.address_lo;
1210        params->int_entry.data = msi_desc->msg.data;
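        /*
         * The "Device Logical ID" is assembled from bytes of this bus's
         * VMBus instance GUID plus the device's PCI function number (see
         * the comment above this function).
         */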
1211        params->device_id = (hbus->hdev->dev_instance.b[5] << 24) |
1212                           (hbus->hdev->dev_instance.b[4] << 16) |
1213                           (hbus->hdev->dev_instance.b[7] << 8) |
1214                           (hbus->hdev->dev_instance.b[6] & 0xf8) |
1215                           PCI_FUNC(pdev->devfn);
1216        params->int_target.vector = cfg->vector;
1217
1218        /*
1219         * Honoring apic->irq_delivery_mode set to dest_Fixed by
1220         * setting the HV_DEVICE_INTERRUPT_TARGET_MULTICAST flag results in a
1221         * spurious interrupt storm. Not doing so does not seem to have a
1222         * negative effect (yet?).
1223         */
1224
1225        if (hbus->protocol_version >= PCI_PROTOCOL_VERSION_1_2) {
1226                /*
1227                 * PCI_PROTOCOL_VERSION_1_2 supports the VP_SET version of the
1228                 * HVCALL_RETARGET_INTERRUPT hypercall, which also coincides
1229                 * with >64 VP support.
1230                 * ms_hyperv.hints & HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED
1231                 * is not sufficient for this hypercall.
1232                 */
1233                params->int_target.flags |=
1234                        HV_DEVICE_INTERRUPT_TARGET_PROCESSOR_SET;
1235
1236                if (!alloc_cpumask_var(&tmp, GFP_ATOMIC)) {
1237                        res = 1;
1238                        goto exit_unlock;
1239                }
1240
1241                cpumask_and(tmp, dest, cpu_online_mask);
1242                nr_bank = cpumask_to_vpset(&params->int_target.vp_set, tmp);
1243                free_cpumask_var(tmp);
1244
1245                if (nr_bank <= 0) {
1246                        res = 1;
1247                        goto exit_unlock;
1248                }
1249
1250                /*
1251                 * var-sized hypercall, var-size starts after vp_mask (thus
1252                 * vp_set.format does not count, but vp_set.valid_bank_mask
1253                 * does).
1254                 */
1255                var_size = 1 + nr_bank;
1256        } else {
1257                for_each_cpu_and(cpu, dest, cpu_online_mask) {
1258                        params->int_target.vp_mask |=
1259                                (1ULL << hv_cpu_number_to_vp_number(cpu));
1260                }
1261        }
1262
1263        res = hv_do_hypercall(HVCALL_RETARGET_INTERRUPT | (var_size << 17),
1264                              params, NULL);
1265
1266exit_unlock:
1267        spin_unlock_irqrestore(&hbus->retarget_msi_interrupt_lock, flags);
1268
1269        if (res) {
1270                dev_err(&hbus->hdev->device,
1271                        "%s() failed: %#llx", __func__, res);
1272                return;
1273        }
1274
1275        pci_msi_unmask_irq(data);
1276}
1277
1278struct compose_comp_ctxt {
1279        struct hv_pci_compl comp_pkt;
1280        struct tran_int_desc int_desc;
1281};
1282
1283static void hv_pci_compose_compl(void *context, struct pci_response *resp,
1284                                 int resp_packet_size)
1285{
1286        struct compose_comp_ctxt *comp_pkt = context;
1287        struct pci_create_int_response *int_resp =
1288                (struct pci_create_int_response *)resp;
1289
1290        comp_pkt->comp_pkt.completion_status = resp->status;
1291        comp_pkt->int_desc = int_resp->int_desc;
1292        complete(&comp_pkt->comp_pkt.host_event);
1293}
1294
1295static u32 hv_compose_msi_req_v1(
1296        struct pci_create_interrupt *int_pkt, struct cpumask *affinity,
1297        u32 slot, u8 vector)
1298{
1299        int_pkt->message_type.type = PCI_CREATE_INTERRUPT_MESSAGE;
1300        int_pkt->wslot.slot = slot;
1301        int_pkt->int_desc.vector = vector;
1302        int_pkt->int_desc.vector_count = 1;
1303        int_pkt->int_desc.delivery_mode = dest_Fixed;
1304
1305        /*
1306         * Create MSI w/ dummy vCPU set, overwritten by subsequent retarget in
1307         * hv_irq_unmask().
1308         */
1309        int_pkt->int_desc.cpu_mask = CPU_AFFINITY_ALL;
1310
1311        return sizeof(*int_pkt);
1312}
1313
1314static u32 hv_compose_msi_req_v2(
1315        struct pci_create_interrupt2 *int_pkt, struct cpumask *affinity,
1316        u32 slot, u8 vector)
1317{
1318        int cpu;
1319
1320        int_pkt->message_type.type = PCI_CREATE_INTERRUPT_MESSAGE2;
1321        int_pkt->wslot.slot = slot;
1322        int_pkt->int_desc.vector = vector;
1323        int_pkt->int_desc.vector_count = 1;
1324        int_pkt->int_desc.delivery_mode = dest_Fixed;
1325
1326        /*
1327         * Create MSI w/ dummy vCPU set targeting just one vCPU, overwritten
1328         * by subsequent retarget in hv_irq_unmask().
1329         */
1330        cpu = cpumask_first_and(affinity, cpu_online_mask);
1331        int_pkt->int_desc.processor_array[0] =
1332                hv_cpu_number_to_vp_number(cpu);
1333        int_pkt->int_desc.processor_count = 1;
1334
1335        return sizeof(*int_pkt);
1336}
1337
1338/**
1339 * hv_compose_msi_msg() - Supplies a valid MSI address/data
1340 * @data:       Everything about this MSI
1341 * @msg:        Buffer that is filled in by this function
1342 *
1343 * This function unpacks the IRQ looking for target CPU set, IDT
1344 * vector and mode and sends a message to the parent partition
1345 * asking for a mapping for that tuple in this partition.  The
1346 * response supplies a data value and address to which that data
1347 * should be written to trigger that interrupt.
1348 */
1349static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
1350{
1351        struct irq_cfg *cfg = irqd_cfg(data);
1352        struct hv_pcibus_device *hbus;
1353        struct hv_pci_dev *hpdev;
1354        struct pci_bus *pbus;
1355        struct pci_dev *pdev;
1356        struct cpumask *dest;
1357        unsigned long flags;
1358        struct compose_comp_ctxt comp;
1359        struct tran_int_desc *int_desc;
1360        struct {
1361                struct pci_packet pci_pkt;
1362                union {
1363                        struct pci_create_interrupt v1;
1364                        struct pci_create_interrupt2 v2;
1365                } int_pkts;
1366        } __packed ctxt;
1367
1368        u32 size;
1369        int ret;
1370
1371        pdev = msi_desc_to_pci_dev(irq_data_get_msi_desc(data));
1372        dest = irq_data_get_effective_affinity_mask(data);
1373        pbus = pdev->bus;
1374        hbus = container_of(pbus->sysdata, struct hv_pcibus_device, sysdata);
1375        hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(pdev->devfn));
1376        if (!hpdev)
1377                goto return_null_message;
1378
1379        /* Free any previous message that might have already been composed. */
1380        if (data->chip_data) {
1381                int_desc = data->chip_data;
1382                data->chip_data = NULL;
1383                hv_int_desc_free(hpdev, int_desc);
1384        }
1385
1386        int_desc = kzalloc(sizeof(*int_desc), GFP_ATOMIC);
1387        if (!int_desc)
1388                goto drop_reference;
1389
1390        memset(&ctxt, 0, sizeof(ctxt));
1391        init_completion(&comp.comp_pkt.host_event);
1392        ctxt.pci_pkt.completion_func = hv_pci_compose_compl;
1393        ctxt.pci_pkt.compl_ctxt = &comp;
1394
1395        switch (hbus->protocol_version) {
1396        case PCI_PROTOCOL_VERSION_1_1:
1397                size = hv_compose_msi_req_v1(&ctxt.int_pkts.v1,
1398                                        dest,
1399                                        hpdev->desc.win_slot.slot,
1400                                        cfg->vector);
1401                break;
1402
1403        case PCI_PROTOCOL_VERSION_1_2:
1404                size = hv_compose_msi_req_v2(&ctxt.int_pkts.v2,
1405                                        dest,
1406                                        hpdev->desc.win_slot.slot,
1407                                        cfg->vector);
1408                break;
1409
1410        default:
1411                /* As we only negotiate protocol versions known to this driver,
 1412                 * this path should never be hit. However, this is not a hot
 1413                 * path, so we print a message to aid future updates.
1414                 */
1415                dev_err(&hbus->hdev->device,
1416                        "Unexpected vPCI protocol, update driver.");
1417                goto free_int_desc;
1418        }
1419
1420        ret = vmbus_sendpacket(hpdev->hbus->hdev->channel, &ctxt.int_pkts,
1421                               size, (unsigned long)&ctxt.pci_pkt,
1422                               VM_PKT_DATA_INBAND,
1423                               VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
1424        if (ret) {
1425                dev_err(&hbus->hdev->device,
1426                        "Sending request for interrupt failed: 0x%x",
 1427                        ret);
1428                goto free_int_desc;
1429        }
1430
1431        /*
1432         * Since this function is called with IRQ locks held, can't
1433         * do normal wait for completion; instead poll.
1434         */
1435        while (!try_wait_for_completion(&comp.comp_pkt.host_event)) {
1436                /* 0xFFFF means an invalid PCI VENDOR ID. */
1437                if (hv_pcifront_get_vendor_id(hpdev) == 0xFFFF) {
1438                        dev_err_once(&hbus->hdev->device,
 1439                                     "the device is gone\n");
1440                        goto free_int_desc;
1441                }
1442
1443                /*
1444                 * When the higher level interrupt code calls us with
1445                 * interrupt disabled, we must poll the channel by calling
1446                 * the channel callback directly when channel->target_cpu is
1447                 * the current CPU. When the higher level interrupt code
1448                 * calls us with interrupt enabled, let's add the
1449                 * local_irq_save()/restore() to avoid race:
1450                 * hv_pci_onchannelcallback() can also run in tasklet.
1451                 */
1452                local_irq_save(flags);
1453
1454                if (hbus->hdev->channel->target_cpu == smp_processor_id())
1455                        hv_pci_onchannelcallback(hbus);
1456
1457                local_irq_restore(flags);
1458
1459                if (hpdev->state == hv_pcichild_ejecting) {
1460                        dev_err_once(&hbus->hdev->device,
1461                                     "the device is being ejected\n");
1462                        goto free_int_desc;
1463                }
1464
1465                udelay(100);
1466        }
1467
1468        if (comp.comp_pkt.completion_status < 0) {
1469                dev_err(&hbus->hdev->device,
1470                        "Request for interrupt failed: 0x%x\n",
1471                        comp.comp_pkt.completion_status);
1472                goto free_int_desc;
1473        }
1474
1475        /*
1476         * Record the assignment so that this can be unwound later. Using
1477         * irq_set_chip_data() here would be appropriate, but the lock it takes
1478         * is already held.
1479         */
1480        *int_desc = comp.int_desc;
1481        data->chip_data = int_desc;
1482
1483        /* Pass up the result. */
1484        msg->address_hi = comp.int_desc.address >> 32;
1485        msg->address_lo = comp.int_desc.address & 0xffffffff;
1486        msg->data = comp.int_desc.data;
1487
1488        put_pcichild(hpdev);
1489        return;
1490
1491free_int_desc:
1492        kfree(int_desc);
1493drop_reference:
1494        put_pcichild(hpdev);
1495return_null_message:
1496        msg->address_hi = 0;
1497        msg->address_lo = 0;
1498        msg->data = 0;
1499}
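/*
 * Editorial sketch, not part of the driver: the polling pattern used in
 * hv_compose_msi_msg() above, reduced to its skeleton.  Because the
 * caller may hold raw IRQ locks, the code spins on
 * try_wait_for_completion() with a short udelay() instead of sleeping in
 * wait_for_completion().  The "gone" callback is hypothetical.
 */
#if 0
static void poll_completion_sketch(struct completion *event,
                                   bool (*gone)(void *), void *ctx)
{
        while (!try_wait_for_completion(event)) {
                if (gone(ctx))  /* hypothetical "peer vanished" check */
                        return; /* give up; the reply will never come */
                udelay(100);    /* brief busy-wait, as in the code above */
        }
}
#endif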
1500
1501/* HW Interrupt Chip Descriptor */
1502static struct irq_chip hv_msi_irq_chip = {
1503        .name                   = "Hyper-V PCIe MSI",
1504        .irq_compose_msi_msg    = hv_compose_msi_msg,
1505        .irq_set_affinity       = hv_set_affinity,
1506        .irq_ack                = irq_chip_ack_parent,
1507        .irq_mask               = hv_irq_mask,
1508        .irq_unmask             = hv_irq_unmask,
1509};
1510
1511static irq_hw_number_t hv_msi_domain_ops_get_hwirq(struct msi_domain_info *info,
1512                                                   msi_alloc_info_t *arg)
1513{
1514        return arg->msi_hwirq;
1515}
1516
1517static struct msi_domain_ops hv_msi_ops = {
1518        .get_hwirq      = hv_msi_domain_ops_get_hwirq,
1519        .msi_prepare    = pci_msi_prepare,
1520        .set_desc       = pci_msi_set_desc,
1521        .msi_free       = hv_msi_free,
1522};
1523
1524/**
1525 * hv_pcie_init_irq_domain() - Initialize IRQ domain
1526 * @hbus:       The root PCI bus
1527 *
1528 * This function creates an IRQ domain which will be used for
1529 * interrupts from devices that have been passed through.  These
1530 * devices only support MSI and MSI-X, not line-based interrupts
1531 * or simulations of line-based interrupts through PCIe's
1532 * fabric-layer messages.  Because interrupts are remapped, we
1533 * can support multi-message MSI here.
1534 *
1535 * Return: '0' on success and error value on failure
1536 */
1537static int hv_pcie_init_irq_domain(struct hv_pcibus_device *hbus)
1538{
1539        hbus->msi_info.chip = &hv_msi_irq_chip;
1540        hbus->msi_info.ops = &hv_msi_ops;
1541        hbus->msi_info.flags = (MSI_FLAG_USE_DEF_DOM_OPS |
1542                MSI_FLAG_USE_DEF_CHIP_OPS | MSI_FLAG_MULTI_PCI_MSI |
1543                MSI_FLAG_PCI_MSIX);
1544        hbus->msi_info.handler = handle_edge_irq;
1545        hbus->msi_info.handler_name = "edge";
1546        hbus->msi_info.data = hbus;
1547        hbus->irq_domain = pci_msi_create_irq_domain(hbus->sysdata.fwnode,
1548                                                     &hbus->msi_info,
1549                                                     x86_vector_domain);
1550        if (!hbus->irq_domain) {
1551                dev_err(&hbus->hdev->device,
1552                        "Failed to build an MSI IRQ domain\n");
1553                return -ENODEV;
1554        }
1555
1556        return 0;
1557}
1558
1559/**
1560 * get_bar_size() - Get the address space consumed by a BAR
1561 * @bar_val:    Value that a BAR returned after -1 was written
1562 *              to it.
1563 *
1564 * This function returns the size of the BAR, rounded up to 1
1565 * page.  It has to be rounded up because the hypervisor's page
1566 * table entry that maps the BAR into the VM can't specify an
1567 * offset within a page.  The invariant is that the hypervisor
1568 * must place any BAR smaller than a page at the beginning
1569 * of a page.
1570 *
1571 * Return:      Size in bytes of the consumed MMIO space.
1572 */
1573static u64 get_bar_size(u64 bar_val)
1574{
1575        return round_up((1 + ~(bar_val & PCI_BASE_ADDRESS_MEM_MASK)),
1576                        PAGE_SIZE);
1577}
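/*
 * Worked example for get_bar_size(), editorial and illustrative only: a
 * 32-bit memory BAR covering 4KB probes as 0xfffff000 once the low flag
 * bits are masked (the callers above also set the upper 32 bits), so
 * 1 + ~(bar_val & PCI_BASE_ADDRESS_MEM_MASK) = 0x1000 = 4096, and
 * round_up(4096, PAGE_SIZE) stays 4096 with 4KB pages.  A 2KB BAR
 * (masked probe value 0xfffff800) computes to 2048 and is rounded up to
 * a full 4096-byte page.
 */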
1578
1579/**
1580 * survey_child_resources() - Total all MMIO requirements
1581 * @hbus:       Root PCI bus, as understood by this driver
1582 */
1583static void survey_child_resources(struct hv_pcibus_device *hbus)
1584{
1585        struct hv_pci_dev *hpdev;
1586        resource_size_t bar_size = 0;
1587        unsigned long flags;
1588        struct completion *event;
1589        u64 bar_val;
1590        int i;
1591
1592        /* If nobody is waiting on the answer, don't compute it. */
1593        event = xchg(&hbus->survey_event, NULL);
1594        if (!event)
1595                return;
1596
1597        /* If the answer has already been computed, go with it. */
1598        if (hbus->low_mmio_space || hbus->high_mmio_space) {
1599                complete(event);
1600                return;
1601        }
1602
1603        spin_lock_irqsave(&hbus->device_list_lock, flags);
1604
1605        /*
1606         * Due to an interesting quirk of the PCI spec, all memory regions
1607         * for a child device are a power of 2 in size and aligned in memory,
1608         * so it's sufficient to just add them up without tracking alignment.
1609         */
1610        list_for_each_entry(hpdev, &hbus->children, list_entry) {
1611                for (i = 0; i < PCI_STD_NUM_BARS; i++) {
1612                        if (hpdev->probed_bar[i] & PCI_BASE_ADDRESS_SPACE_IO)
1613                                dev_err(&hbus->hdev->device,
1614                                        "There's an I/O BAR in this list!\n");
1615
1616                        if (hpdev->probed_bar[i] != 0) {
1617                                /*
1618                                 * A probed BAR has all the upper bits set that
1619                                 * can be changed.
1620                                 */
1621
1622                                bar_val = hpdev->probed_bar[i];
1623                                if (bar_val & PCI_BASE_ADDRESS_MEM_TYPE_64)
1624                                        bar_val |=
1625                                        ((u64)hpdev->probed_bar[++i] << 32);
1626                                else
1627                                        bar_val |= 0xffffffff00000000ULL;
1628
1629                                bar_size = get_bar_size(bar_val);
1630
1631                                if (bar_val & PCI_BASE_ADDRESS_MEM_TYPE_64)
1632                                        hbus->high_mmio_space += bar_size;
1633                                else
1634                                        hbus->low_mmio_space += bar_size;
1635                        }
1636                }
1637        }
1638
1639        spin_unlock_irqrestore(&hbus->device_list_lock, flags);
1640        complete(event);
1641}
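/*
 * Editorial note on the summation above: every PCI memory BAR is a
 * power of two in size and naturally aligned, so a total such as
 * 64KB + 4KB + 4KB = 0x12000 can be packed without alignment padding,
 * provided the allocations are made from the largest BAR down to the
 * smallest -- which is exactly what prepopulate_bars() below does.
 */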
1642
1643/**
1644 * prepopulate_bars() - Fill in BARs with defaults
1645 * @hbus:       Root PCI bus, as understood by this driver
1646 *
1647 * The core PCI driver code seems much, much happier if the BARs
1648 * for a device have values upon first scan. So fill them in.
1649 * The algorithm below works down from large sizes to small,
1650 * attempting to pack the assignments optimally. The assumption,
1651 * enforced in other parts of the code, is that the beginning of
1652 * the memory-mapped I/O space will be aligned on the largest
1653 * BAR size.
1654 */
1655static void prepopulate_bars(struct hv_pcibus_device *hbus)
1656{
1657        resource_size_t high_size = 0;
1658        resource_size_t low_size = 0;
1659        resource_size_t high_base = 0;
1660        resource_size_t low_base = 0;
1661        resource_size_t bar_size;
1662        struct hv_pci_dev *hpdev;
1663        unsigned long flags;
1664        u64 bar_val;
1665        u32 command;
1666        bool high;
1667        int i;
1668
1669        if (hbus->low_mmio_space) {
1670                low_size = 1ULL << (63 - __builtin_clzll(hbus->low_mmio_space));
1671                low_base = hbus->low_mmio_res->start;
1672        }
1673
1674        if (hbus->high_mmio_space) {
1675                high_size = 1ULL <<
1676                        (63 - __builtin_clzll(hbus->high_mmio_space));
1677                high_base = hbus->high_mmio_res->start;
1678        }
1679
1680        spin_lock_irqsave(&hbus->device_list_lock, flags);
1681
1682        /*
1683         * Clear the memory enable bit, in case it's already set. This occurs
1684         * in the suspend path of hibernation, where the device is suspended,
1685         * resumed and suspended again: see hibernation_snapshot() and
1686         * hibernation_platform_enter().
1687         *
1688         * If the memory enable bit is already set, Hyper-V silently ignores
1689         * the below BAR updates, and the related PCI device driver cannot
1690         * work, because reading from the device register(s) always returns
1691         * 0xFFFFFFFF.
1692         */
1693        list_for_each_entry(hpdev, &hbus->children, list_entry) {
1694                _hv_pcifront_read_config(hpdev, PCI_COMMAND, 2, &command);
1695                command &= ~PCI_COMMAND_MEMORY;
1696                _hv_pcifront_write_config(hpdev, PCI_COMMAND, 2, command);
1697        }
1698
1699        /* Pick addresses for the BARs. */
1700        do {
1701                list_for_each_entry(hpdev, &hbus->children, list_entry) {
1702                        for (i = 0; i < PCI_STD_NUM_BARS; i++) {
1703                                bar_val = hpdev->probed_bar[i];
1704                                if (bar_val == 0)
1705                                        continue;
1706                                high = bar_val & PCI_BASE_ADDRESS_MEM_TYPE_64;
1707                                if (high) {
1708                                        bar_val |=
1709                                                ((u64)hpdev->probed_bar[i + 1]
1710                                                 << 32);
1711                                } else {
1712                                        bar_val |= 0xffffffffULL << 32;
1713                                }
1714                                bar_size = get_bar_size(bar_val);
1715                                if (high) {
1716                                        if (high_size != bar_size) {
1717                                                i++;
1718                                                continue;
1719                                        }
1720                                        _hv_pcifront_write_config(hpdev,
1721                                                PCI_BASE_ADDRESS_0 + (4 * i),
1722                                                4,
1723                                                (u32)(high_base & 0xffffff00));
1724                                        i++;
1725                                        _hv_pcifront_write_config(hpdev,
1726                                                PCI_BASE_ADDRESS_0 + (4 * i),
1727                                                4, (u32)(high_base >> 32));
1728                                        high_base += bar_size;
1729                                } else {
1730                                        if (low_size != bar_size)
1731                                                continue;
1732                                        _hv_pcifront_write_config(hpdev,
1733                                                PCI_BASE_ADDRESS_0 + (4 * i),
1734                                                4,
1735                                                (u32)(low_base & 0xffffff00));
1736                                        low_base += bar_size;
1737                                }
1738                        }
1739                        if (high_size <= 1 && low_size <= 1) {
1740                                /* Set the memory enable bit. */
1741                                _hv_pcifront_read_config(hpdev, PCI_COMMAND, 2,
1742                                                         &command);
1743                                command |= PCI_COMMAND_MEMORY;
1744                                _hv_pcifront_write_config(hpdev, PCI_COMMAND, 2,
1745                                                          command);
1746                                break;
1747                        }
1748                }
1749
1750                high_size >>= 1;
1751                low_size >>= 1;
1752        } while (high_size || low_size);
1753
1754        spin_unlock_irqrestore(&hbus->device_list_lock, flags);
1755}
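/*
 * Editorial walkthrough of the packing loop above (illustrative): with
 * low_mmio_space = 0x12000 (one 64KB BAR plus two 4KB BARs), low_size
 * starts at 0x10000, the largest power of two not exceeding the total.
 * The first pass places only the 64KB BAR; low_size then halves through
 * 0x8000, 0x4000 and 0x2000 with no matches, and both 4KB BARs are
 * placed on the 0x1000 pass.  Each pass lays down equal-sized,
 * naturally aligned blocks back to back, so no padding is required.
 */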
1756
1757/*
1758 * Assign entries in sysfs pci slot directory.
1759 *
1760 * Note that this function does not need to lock the children list
1761 * because it is called from pci_devices_present_work which
1762 * is serialized with hv_eject_device_work because they are on the
1763 * same ordered workqueue. Therefore hbus->children list will not change
1764 * even when pci_create_slot sleeps.
1765 */
1766static void hv_pci_assign_slots(struct hv_pcibus_device *hbus)
1767{
1768        struct hv_pci_dev *hpdev;
1769        char name[SLOT_NAME_SIZE];
1770        int slot_nr;
1771
1772        list_for_each_entry(hpdev, &hbus->children, list_entry) {
1773                if (hpdev->pci_slot)
1774                        continue;
1775
1776                slot_nr = PCI_SLOT(wslot_to_devfn(hpdev->desc.win_slot.slot));
1777                snprintf(name, SLOT_NAME_SIZE, "%u", hpdev->desc.ser);
1778                hpdev->pci_slot = pci_create_slot(hbus->pci_bus, slot_nr,
1779                                          name, NULL);
1780                if (IS_ERR(hpdev->pci_slot)) {
1781                        pr_warn("pci_create_slot %s failed\n", name);
1782                        hpdev->pci_slot = NULL;
1783                }
1784        }
1785}
1786
1787/*
1788 * Remove entries in sysfs pci slot directory.
1789 */
1790static void hv_pci_remove_slots(struct hv_pcibus_device *hbus)
1791{
1792        struct hv_pci_dev *hpdev;
1793
1794        list_for_each_entry(hpdev, &hbus->children, list_entry) {
1795                if (!hpdev->pci_slot)
1796                        continue;
1797                pci_destroy_slot(hpdev->pci_slot);
1798                hpdev->pci_slot = NULL;
1799        }
1800}
1801
1802/**
1803 * create_root_hv_pci_bus() - Expose a new root PCI bus
1804 * @hbus:       Root PCI bus, as understood by this driver
1805 *
1806 * Return: 0 on success, -errno on failure
1807 */
1808static int create_root_hv_pci_bus(struct hv_pcibus_device *hbus)
1809{
1810        /* Register the device */
1811        hbus->pci_bus = pci_create_root_bus(&hbus->hdev->device,
1812                                            0, /* bus number is always zero */
1813                                            &hv_pcifront_ops,
1814                                            &hbus->sysdata,
1815                                            &hbus->resources_for_children);
1816        if (!hbus->pci_bus)
1817                return -ENODEV;
1818
1819        hbus->pci_bus->msi = &hbus->msi_chip;
1820        hbus->pci_bus->msi->dev = &hbus->hdev->device;
1821
1822        pci_lock_rescan_remove();
1823        pci_scan_child_bus(hbus->pci_bus);
1824        pci_bus_assign_resources(hbus->pci_bus);
1825        hv_pci_assign_slots(hbus);
1826        pci_bus_add_devices(hbus->pci_bus);
1827        pci_unlock_rescan_remove();
1828        hbus->state = hv_pcibus_installed;
1829        return 0;
1830}
1831
1832struct q_res_req_compl {
1833        struct completion host_event;
1834        struct hv_pci_dev *hpdev;
1835};
1836
1837/**
1838 * q_resource_requirements() - Query Resource Requirements
1839 * @context:            The completion context.
1840 * @resp:               The response that came from the host.
1841 * @resp_packet_size:   The size in bytes of resp.
1842 *
1843 * This function is invoked on completion of a Query Resource
1844 * Requirements packet.
1845 */
1846static void q_resource_requirements(void *context, struct pci_response *resp,
1847                                    int resp_packet_size)
1848{
1849        struct q_res_req_compl *completion = context;
1850        struct pci_q_res_req_response *q_res_req =
1851                (struct pci_q_res_req_response *)resp;
1852        int i;
1853
1854        if (resp->status < 0) {
1855                dev_err(&completion->hpdev->hbus->hdev->device,
1856                        "query resource requirements failed: %x\n",
1857                        resp->status);
1858        } else {
1859                for (i = 0; i < PCI_STD_NUM_BARS; i++) {
1860                        completion->hpdev->probed_bar[i] =
1861                                q_res_req->probed_bar[i];
1862                }
1863        }
1864
1865        complete(&completion->host_event);
1866}
1867
1868/**
1869 * new_pcichild_device() - Create a new child device
1870 * @hbus:       The internal struct tracking this root PCI bus.
1871 * @desc:       The information supplied so far from the host
1872 *              about the device.
1873 *
1874 * This function creates the tracking structure for a new child
1875 * device and kicks off the process of figuring out what it is.
1876 *
1877 * Return: Pointer to the new tracking struct
1878 */
1879static struct hv_pci_dev *new_pcichild_device(struct hv_pcibus_device *hbus,
1880                struct pci_function_description *desc)
1881{
1882        struct hv_pci_dev *hpdev;
1883        struct pci_child_message *res_req;
1884        struct q_res_req_compl comp_pkt;
1885        struct {
1886                struct pci_packet init_packet;
1887                u8 buffer[sizeof(struct pci_child_message)];
1888        } pkt;
1889        unsigned long flags;
1890        int ret;
1891
1892        hpdev = kzalloc(sizeof(*hpdev), GFP_KERNEL);
1893        if (!hpdev)
1894                return NULL;
1895
1896        hpdev->hbus = hbus;
1897
1898        memset(&pkt, 0, sizeof(pkt));
1899        init_completion(&comp_pkt.host_event);
1900        comp_pkt.hpdev = hpdev;
1901        pkt.init_packet.compl_ctxt = &comp_pkt;
1902        pkt.init_packet.completion_func = q_resource_requirements;
1903        res_req = (struct pci_child_message *)&pkt.init_packet.message;
1904        res_req->message_type.type = PCI_QUERY_RESOURCE_REQUIREMENTS;
1905        res_req->wslot.slot = desc->win_slot.slot;
1906
1907        ret = vmbus_sendpacket(hbus->hdev->channel, res_req,
1908                               sizeof(struct pci_child_message),
1909                               (unsigned long)&pkt.init_packet,
1910                               VM_PKT_DATA_INBAND,
1911                               VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
1912        if (ret)
1913                goto error;
1914
1915        if (wait_for_response(hbus->hdev, &comp_pkt.host_event))
1916                goto error;
1917
1918        hpdev->desc = *desc;
1919        refcount_set(&hpdev->refs, 1);
1920        get_pcichild(hpdev);
1921        spin_lock_irqsave(&hbus->device_list_lock, flags);
1922
1923        list_add_tail(&hpdev->list_entry, &hbus->children);
1924        spin_unlock_irqrestore(&hbus->device_list_lock, flags);
1925        return hpdev;
1926
1927error:
1928        kfree(hpdev);
1929        return NULL;
1930}
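/*
 * Editorial note on the pattern above: struct pci_packet ends in a
 * flexible message area, so callers allocate it together with a buffer
 * sized for the specific message and cast &pkt.init_packet.message to
 * the concrete message type.  The packet's address is passed to
 * vmbus_sendpacket() as the transaction ID, which is how
 * hv_pci_onchannelcallback() later finds the right completion_func when
 * the host's response arrives.
 */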
1931
1932/**
1933 * get_pcichild_wslot() - Find device from slot
1934 * @hbus:       Root PCI bus, as understood by this driver
1935 * @wslot:      Location on the bus
1936 *
1937 * This function looks up a PCI device and returns the internal
1938 * representation of it.  It acquires a reference on it, so that
1939 * the device won't be deleted while somebody is using it.  The
1940 * caller is responsible for calling put_pcichild() to release
1941 * this reference.
1942 *
1943 * Return:      Internal representation of a PCI device
1944 */
1945static struct hv_pci_dev *get_pcichild_wslot(struct hv_pcibus_device *hbus,
1946                                             u32 wslot)
1947{
1948        unsigned long flags;
1949        struct hv_pci_dev *iter, *hpdev = NULL;
1950
1951        spin_lock_irqsave(&hbus->device_list_lock, flags);
1952        list_for_each_entry(iter, &hbus->children, list_entry) {
1953                if (iter->desc.win_slot.slot == wslot) {
1954                        hpdev = iter;
1955                        get_pcichild(hpdev);
1956                        break;
1957                }
1958        }
1959        spin_unlock_irqrestore(&hbus->device_list_lock, flags);
1960
1961        return hpdev;
1962}
1963
1964/**
1965 * pci_devices_present_work() - Handle new list of child devices
1966 * @work:       Work struct embedded in struct hv_dr_work
1967 *
1968 * "Bus Relations" is the Windows term for "children of this
1969 * bus."  The terminology is preserved here for people trying to
1970 * debug the interaction between Hyper-V and Linux.  This
1971 * function is called when the parent partition reports a list
1972 * of functions that should be observed under this PCI Express
1973 * port (bus).
1974 *
1975 * This function updates the list, and must tolerate being
1976 * called multiple times with the same information.  The typical
1977 * number of child devices is one, with very atypical cases
1978 * involving three or four, so the algorithms used here can be
1979 * simple and inefficient.
1980 *
1981 * It must also treat the omission of a previously observed device as
1982 * notification that the device no longer exists.
1983 *
1984 * Note that this function is serialized with hv_eject_device_work(),
1985 * because both are pushed to the ordered workqueue hbus->wq.
1986 */
1987static void pci_devices_present_work(struct work_struct *work)
1988{
1989        u32 child_no;
1990        bool found;
1991        struct pci_function_description *new_desc;
1992        struct hv_pci_dev *hpdev;
1993        struct hv_pcibus_device *hbus;
1994        struct list_head removed;
1995        struct hv_dr_work *dr_wrk;
1996        struct hv_dr_state *dr = NULL;
1997        unsigned long flags;
1998
1999        dr_wrk = container_of(work, struct hv_dr_work, wrk);
2000        hbus = dr_wrk->bus;
2001        kfree(dr_wrk);
2002
2003        INIT_LIST_HEAD(&removed);
2004
2005        /* Pull this off the queue and process it if it was the last one. */
2006        spin_lock_irqsave(&hbus->device_list_lock, flags);
2007        while (!list_empty(&hbus->dr_list)) {
2008                dr = list_first_entry(&hbus->dr_list, struct hv_dr_state,
2009                                      list_entry);
2010                list_del(&dr->list_entry);
2011
2012                /* Throw this away if the list still has stuff in it. */
2013                if (!list_empty(&hbus->dr_list)) {
2014                        kfree(dr);
2015                        continue;
2016                }
2017        }
2018        spin_unlock_irqrestore(&hbus->device_list_lock, flags);
2019
2020        if (!dr) {
2021                put_hvpcibus(hbus);
2022                return;
2023        }
2024
2025        /* First, mark all existing children as reported missing. */
2026        spin_lock_irqsave(&hbus->device_list_lock, flags);
2027        list_for_each_entry(hpdev, &hbus->children, list_entry) {
2028                hpdev->reported_missing = true;
2029        }
2030        spin_unlock_irqrestore(&hbus->device_list_lock, flags);
2031
2032        /* Next, add back any reported devices. */
2033        for (child_no = 0; child_no < dr->device_count; child_no++) {
2034                found = false;
2035                new_desc = &dr->func[child_no];
2036
2037                spin_lock_irqsave(&hbus->device_list_lock, flags);
2038                list_for_each_entry(hpdev, &hbus->children, list_entry) {
2039                        if ((hpdev->desc.win_slot.slot == new_desc->win_slot.slot) &&
2040                            (hpdev->desc.v_id == new_desc->v_id) &&
2041                            (hpdev->desc.d_id == new_desc->d_id) &&
2042                            (hpdev->desc.ser == new_desc->ser)) {
2043                                hpdev->reported_missing = false;
2044                                found = true;
2045                        }
2046                }
2047                spin_unlock_irqrestore(&hbus->device_list_lock, flags);
2048
2049                if (!found) {
2050                        hpdev = new_pcichild_device(hbus, new_desc);
2051                        if (!hpdev)
2052                                dev_err(&hbus->hdev->device,
2053                                        "couldn't record a child device.\n");
2054                }
2055        }
2056
2057        /* Move missing children to a list on the stack. */
2058        spin_lock_irqsave(&hbus->device_list_lock, flags);
2059        do {
2060                found = false;
2061                list_for_each_entry(hpdev, &hbus->children, list_entry) {
2062                        if (hpdev->reported_missing) {
2063                                found = true;
2064                                put_pcichild(hpdev);
2065                                list_move_tail(&hpdev->list_entry, &removed);
2066                                break;
2067                        }
2068                }
2069        } while (found);
2070        spin_unlock_irqrestore(&hbus->device_list_lock, flags);
2071
2072        /* Delete everything that should no longer exist. */
2073        while (!list_empty(&removed)) {
2074                hpdev = list_first_entry(&removed, struct hv_pci_dev,
2075                                         list_entry);
2076                list_del(&hpdev->list_entry);
2077
2078                if (hpdev->pci_slot)
2079                        pci_destroy_slot(hpdev->pci_slot);
2080
2081                put_pcichild(hpdev);
2082        }
2083
2084        switch (hbus->state) {
2085        case hv_pcibus_installed:
2086                /*
2087                 * Tell the core to rescan bus
2088                 * because there may have been changes.
2089                 */
2090                pci_lock_rescan_remove();
2091                pci_scan_child_bus(hbus->pci_bus);
2092                hv_pci_assign_slots(hbus);
2093                pci_unlock_rescan_remove();
2094                break;
2095
2096        case hv_pcibus_init:
2097        case hv_pcibus_probed:
2098                survey_child_resources(hbus);
2099                break;
2100
2101        default:
2102                break;
2103        }
2104
2105        put_hvpcibus(hbus);
2106        kfree(dr);
2107}
2108
2109/**
2110 * hv_pci_devices_present() - Handles list of new children
2111 * @hbus:       Root PCI bus, as understood by this driver
2112 * @relations:  Packet from host listing children
2113 *
2114 * This function is invoked whenever a new list of devices for
2115 * this bus appears.
2116 */
2117static void hv_pci_devices_present(struct hv_pcibus_device *hbus,
2118                                   struct pci_bus_relations *relations)
2119{
2120        struct hv_dr_state *dr;
2121        struct hv_dr_work *dr_wrk;
2122        unsigned long flags;
2123        bool pending_dr;
2124
2125        if (hbus->state == hv_pcibus_removing) {
2126                dev_info(&hbus->hdev->device,
2127                         "PCI VMBus BUS_RELATIONS: ignored\n");
2128                return;
2129        }
2130
2131        dr_wrk = kzalloc(sizeof(*dr_wrk), GFP_NOWAIT);
2132        if (!dr_wrk)
2133                return;
2134
2135        dr = kzalloc(offsetof(struct hv_dr_state, func) +
2136                     (sizeof(struct pci_function_description) *
2137                      (relations->device_count)), GFP_NOWAIT);
2138        if (!dr)  {
2139                kfree(dr_wrk);
2140                return;
2141        }
2142
2143        INIT_WORK(&dr_wrk->wrk, pci_devices_present_work);
2144        dr_wrk->bus = hbus;
2145        dr->device_count = relations->device_count;
2146        if (dr->device_count != 0) {
2147                memcpy(dr->func, relations->func,
2148                       sizeof(struct pci_function_description) *
2149                       dr->device_count);
2150        }
2151
2152        spin_lock_irqsave(&hbus->device_list_lock, flags);
2153        /*
2154         * If pending_dr is true, we have already queued a work,
2155         * which will see the new dr. Otherwise, we need to
2156         * queue a new work.
2157         */
2158        pending_dr = !list_empty(&hbus->dr_list);
2159        list_add_tail(&dr->list_entry, &hbus->dr_list);
2160        spin_unlock_irqrestore(&hbus->device_list_lock, flags);
2161
2162        if (pending_dr) {
2163                kfree(dr_wrk);
2164        } else {
2165                get_hvpcibus(hbus);
2166                queue_work(hbus->wq, &dr_wrk->wrk);
2167        }
2168}
2169
2170/**
2171 * hv_eject_device_work() - Asynchronously handles ejection
2172 * @work:       Work struct embedded in internal device struct
2173 *
2174 * This function handles ejecting a device.  Windows will
2175 * attempt to gracefully eject a device, waiting 60 seconds to
2176 * hear back from the guest OS that this completed successfully.
2177 * If this timer expires, the device will be forcibly removed.
2178 */
2179static void hv_eject_device_work(struct work_struct *work)
2180{
2181        struct pci_eject_response *ejct_pkt;
2182        struct hv_pcibus_device *hbus;
2183        struct hv_pci_dev *hpdev;
2184        struct pci_dev *pdev;
2185        unsigned long flags;
2186        int wslot;
2187        struct {
2188                struct pci_packet pkt;
2189                u8 buffer[sizeof(struct pci_eject_response)];
2190        } ctxt;
2191
2192        hpdev = container_of(work, struct hv_pci_dev, wrk);
2193        hbus = hpdev->hbus;
2194
2195        WARN_ON(hpdev->state != hv_pcichild_ejecting);
2196
2197        /*
2198         * Ejection can come before or after the PCI bus has been set up, so
2199         * attempt to find it and tear down the bus state, if it exists.  This
2200         * must be done without constructs like pci_domain_nr(hbus->pci_bus)
2201         * because hbus->pci_bus may not exist yet.
2202         */
2203        wslot = wslot_to_devfn(hpdev->desc.win_slot.slot);
2204        pdev = pci_get_domain_bus_and_slot(hbus->sysdata.domain, 0, wslot);
2205        if (pdev) {
2206                pci_lock_rescan_remove();
2207                pci_stop_and_remove_bus_device(pdev);
2208                pci_dev_put(pdev);
2209                pci_unlock_rescan_remove();
2210        }
2211
2212        spin_lock_irqsave(&hbus->device_list_lock, flags);
2213        list_del(&hpdev->list_entry);
2214        spin_unlock_irqrestore(&hbus->device_list_lock, flags);
2215
2216        if (hpdev->pci_slot)
2217                pci_destroy_slot(hpdev->pci_slot);
2218
2219        memset(&ctxt, 0, sizeof(ctxt));
2220        ejct_pkt = (struct pci_eject_response *)&ctxt.pkt.message;
2221        ejct_pkt->message_type.type = PCI_EJECTION_COMPLETE;
2222        ejct_pkt->wslot.slot = hpdev->desc.win_slot.slot;
2223        vmbus_sendpacket(hbus->hdev->channel, ejct_pkt,
2224                         sizeof(*ejct_pkt), (unsigned long)&ctxt.pkt,
2225                         VM_PKT_DATA_INBAND, 0);
2226
2227        /* For the get_pcichild() in hv_pci_eject_device() */
2228        put_pcichild(hpdev);
2229        /* For the two refs acquired in new_pcichild_device() */
2230        put_pcichild(hpdev);
2231        put_pcichild(hpdev);
2232        /* hpdev has been freed. Do not use it any more. */
2233
2234        put_hvpcibus(hbus);
2235}
2236
2237/**
2238 * hv_pci_eject_device() - Handles device ejection
2239 * @hpdev:      Internal device tracking struct
2240 *
2241 * This function is invoked when an ejection packet arrives.  It
2242 * just schedules work so that we don't re-enter the packet
2243 * delivery code handling the ejection.
2244 */
2245static void hv_pci_eject_device(struct hv_pci_dev *hpdev)
2246{
2247        struct hv_pcibus_device *hbus = hpdev->hbus;
2248        struct hv_device *hdev = hbus->hdev;
2249
2250        if (hbus->state == hv_pcibus_removing) {
2251                dev_info(&hdev->device, "PCI VMBus EJECT: ignored\n");
2252                return;
2253        }
2254
2255        hpdev->state = hv_pcichild_ejecting;
2256        get_pcichild(hpdev);
2257        INIT_WORK(&hpdev->wrk, hv_eject_device_work);
2258        get_hvpcibus(hbus);
2259        queue_work(hbus->wq, &hpdev->wrk);
2260}
2261
2262/**
2263 * hv_pci_onchannelcallback() - Handles incoming packets
2264 * @context:    Internal bus tracking struct
2265 *
2266 * This function is invoked whenever the host sends a packet to
2267 * this channel (which is private to this root PCI bus).
2268 */
2269static void hv_pci_onchannelcallback(void *context)
2270{
2271        const int packet_size = 0x100;
2272        int ret;
2273        struct hv_pcibus_device *hbus = context;
2274        u32 bytes_recvd;
2275        u64 req_id;
2276        struct vmpacket_descriptor *desc;
2277        unsigned char *buffer;
2278        int bufferlen = packet_size;
2279        struct pci_packet *comp_packet;
2280        struct pci_response *response;
2281        struct pci_incoming_message *new_message;
2282        struct pci_bus_relations *bus_rel;
2283        struct pci_dev_inval_block *inval;
2284        struct pci_dev_incoming *dev_message;
2285        struct hv_pci_dev *hpdev;
2286
2287        buffer = kmalloc(bufferlen, GFP_ATOMIC);
2288        if (!buffer)
2289                return;
2290
2291        while (1) {
2292                ret = vmbus_recvpacket_raw(hbus->hdev->channel, buffer,
2293                                           bufferlen, &bytes_recvd, &req_id);
2294
2295                if (ret == -ENOBUFS) {
2296                        kfree(buffer);
2297                        /* Handle large packet */
2298                        bufferlen = bytes_recvd;
2299                        buffer = kmalloc(bytes_recvd, GFP_ATOMIC);
2300                        if (!buffer)
2301                                return;
2302                        continue;
2303                }
2304
2305                /* Zero length indicates there are no more packets. */
2306                if (ret || !bytes_recvd)
2307                        break;
2308
2309                /*
2310                 * All incoming packets must be at least as large as a
2311                 * response.
2312                 */
2313                if (bytes_recvd <= sizeof(struct pci_response))
2314                        continue;
2315                desc = (struct vmpacket_descriptor *)buffer;
2316
2317                switch (desc->type) {
2318                case VM_PKT_COMP:
2319
2320                        /*
2321                         * The host is trusted, and thus it's safe to interpret
2322                         * this transaction ID as a pointer.
2323                         */
2324                        comp_packet = (struct pci_packet *)req_id;
2325                        response = (struct pci_response *)buffer;
2326                        comp_packet->completion_func(comp_packet->compl_ctxt,
2327                                                     response,
2328                                                     bytes_recvd);
2329                        break;
2330
2331                case VM_PKT_DATA_INBAND:
2332
2333                        new_message = (struct pci_incoming_message *)buffer;
2334                        switch (new_message->message_type.type) {
2335                        case PCI_BUS_RELATIONS:
2336
2337                                bus_rel = (struct pci_bus_relations *)buffer;
2338                                if (bytes_recvd <
2339                                    offsetof(struct pci_bus_relations, func) +
2340                                    (sizeof(struct pci_function_description) *
2341                                     (bus_rel->device_count))) {
2342                                        dev_err(&hbus->hdev->device,
2343                                                "bus relations too small\n");
2344                                        break;
2345                                }
2346
2347                                hv_pci_devices_present(hbus, bus_rel);
2348                                break;
2349
2350                        case PCI_EJECT:
2351
2352                                dev_message = (struct pci_dev_incoming *)buffer;
2353                                hpdev = get_pcichild_wslot(hbus,
2354                                                      dev_message->wslot.slot);
2355                                if (hpdev) {
2356                                        hv_pci_eject_device(hpdev);
2357                                        put_pcichild(hpdev);
2358                                }
2359                                break;
2360
2361                        case PCI_INVALIDATE_BLOCK:
2362
2363                                inval = (struct pci_dev_inval_block *)buffer;
2364                                hpdev = get_pcichild_wslot(hbus,
2365                                                           inval->wslot.slot);
2366                                if (hpdev) {
2367                                        if (hpdev->block_invalidate) {
2368                                                hpdev->block_invalidate(
2369                                                    hpdev->invalidate_context,
2370                                                    inval->block_mask);
2371                                        }
2372                                        put_pcichild(hpdev);
2373                                }
2374                                break;
2375
2376                        default:
2377                                dev_warn(&hbus->hdev->device,
2378                                        "Unimplemented protocol message %x\n",
2379                                        new_message->message_type.type);
2380                                break;
2381                        }
2382                        break;
2383
2384                default:
2385                        dev_err(&hbus->hdev->device,
2386                                "unhandled packet type %d, tid %llx len %d\n",
2387                                desc->type, req_id, bytes_recvd);
2388                        break;
2389                }
2390        }
2391
2392        kfree(buffer);
2393}
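/*
 * Editorial note on the receive loop above: it relies on
 * vmbus_recvpacket_raw() reporting the required size in bytes_recvd
 * when it returns -ENOBUFS, so the loop frees the 0x100-byte initial
 * buffer, reallocates at the reported size and retries.
 */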
2394
2395/**
2396 * hv_pci_protocol_negotiation() - Set up protocol
2397 * @hdev:       VMBus's tracking struct for this root PCI bus
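 * @version:     Array of protocol versions to offer, in order of
 *               preference (highest first)
 * @num_version: Number of entries in the @version array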
2398 *
2399 * This driver is intended to support running on Windows 10
2400 * (server) and later versions. It will not run on earlier
2401 * versions, as they assume that many of the operations which
2402 * Linux needs accomplished with a spinlock held were done via
2403 * asynchronous messaging via VMBus.  Windows 10 increases the
2404 * surface area of PCI emulation so that these actions can take
2405 * place by suspending a virtual processor for their duration.
2406 *
2407 * This function negotiates the channel protocol version,
2408 * failing if the host doesn't support the necessary protocol
2409 * level.
2410 */
2411static int hv_pci_protocol_negotiation(struct hv_device *hdev,
2412                                       enum pci_protocol_version_t version[],
2413                                       int num_version)
2414{
2415        struct hv_pcibus_device *hbus = hv_get_drvdata(hdev);
2416        struct pci_version_request *version_req;
2417        struct hv_pci_compl comp_pkt;
2418        struct pci_packet *pkt;
2419        int ret;
2420        int i;
2421
2422        /*
2423         * Initiate the handshake with the host and negotiate
2424         * a version that the host can support. We start with the
2425         * highest version number and go down if the host cannot
2426         * support it.
2427         */
2428        pkt = kzalloc(sizeof(*pkt) + sizeof(*version_req), GFP_KERNEL);
2429        if (!pkt)
2430                return -ENOMEM;
2431
2432        init_completion(&comp_pkt.host_event);
2433        pkt->completion_func = hv_pci_generic_compl;
2434        pkt->compl_ctxt = &comp_pkt;
2435        version_req = (struct pci_version_request *)&pkt->message;
2436        version_req->message_type.type = PCI_QUERY_PROTOCOL_VERSION;
2437
2438        for (i = 0; i < num_version; i++) {
2439                version_req->protocol_version = version[i];
2440                ret = vmbus_sendpacket(hdev->channel, version_req,
2441                                sizeof(struct pci_version_request),
2442                                (unsigned long)pkt, VM_PKT_DATA_INBAND,
2443                                VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
2444                if (!ret)
2445                        ret = wait_for_response(hdev, &comp_pkt.host_event);
2446
2447                if (ret) {
2448                        dev_err(&hdev->device,
2449                                "PCI Pass-through VSP failed to request version: %d\n",
2450                                ret);
2451                        goto exit;
2452                }
2453
2454                if (comp_pkt.completion_status >= 0) {
2455                        hbus->protocol_version = version[i];
2456                        dev_info(&hdev->device,
2457                                "PCI VMBus probing: Using version %#x\n",
2458                                hbus->protocol_version);
2459                        goto exit;
2460                }
2461
2462                if (comp_pkt.completion_status != STATUS_REVISION_MISMATCH) {
2463                        dev_err(&hdev->device,
2464                                "PCI Pass-through VSP failed version request: %#x\n",
2465                                comp_pkt.completion_status);
2466                        ret = -EPROTO;
2467                        goto exit;
2468                }
2469
2470                reinit_completion(&comp_pkt.host_event);
2471        }
2472
2473        dev_err(&hdev->device,
2474                "PCI pass-through VSP failed to find a supported version\n");
2475        ret = -EPROTO;
2476
2477exit:
2478        kfree(pkt);
2479        return ret;
2480}
2481
2482/**
2483 * hv_pci_free_bridge_windows() - Release memory regions for the
2484 * bus
2485 * @hbus:       Root PCI bus, as understood by this driver
2486 */
2487static void hv_pci_free_bridge_windows(struct hv_pcibus_device *hbus)
2488{
2489        /*
2490         * Set the resources back to the way they looked when they
2491         * were allocated by setting IORESOURCE_BUSY again.
2492         */
2493
2494        if (hbus->low_mmio_space && hbus->low_mmio_res) {
2495                hbus->low_mmio_res->flags |= IORESOURCE_BUSY;
2496                vmbus_free_mmio(hbus->low_mmio_res->start,
2497                                resource_size(hbus->low_mmio_res));
2498        }
2499
2500        if (hbus->high_mmio_space && hbus->high_mmio_res) {
2501                hbus->high_mmio_res->flags |= IORESOURCE_BUSY;
2502                vmbus_free_mmio(hbus->high_mmio_res->start,
2503                                resource_size(hbus->high_mmio_res));
2504        }
2505}
2506
2507/**
2508 * hv_pci_allocate_bridge_windows() - Allocate memory regions
2509 * for the bus
2510 * @hbus:       Root PCI bus, as understood by this driver
2511 *
2512 * This function calls vmbus_allocate_mmio(), which is itself a
2513 * bit of a compromise.  Ideally, we might change the pnp layer
2514 * in the kernel such that it comprehends either PCI devices
2515 * which are "grandchildren of ACPI," with some intermediate bus
2516 * node (in this case, VMBus) or change it such that it
2517 * understands VMBus.  The pnp layer, however, has been declared
2518 * deprecated, and not subject to change.
2519 *
2520 * The workaround, implemented here, is to ask VMBus to allocate
2521 * MMIO space for this bus.  VMBus itself knows which ranges are
2522 * appropriate by looking at its own ACPI objects.  Then, after
2523 * these ranges are claimed, they're modified to look like they
2524 * would have looked if the ACPI and pnp code had allocated
2525 * bridge windows.  These descriptors have to exist in this form
2526 * in order to satisfy the code which will get invoked when the
2527 * endpoint PCI function driver calls request_mem_region() or
2528 * request_mem_region_exclusive().
2529 *
2530 * Return: 0 on success, -errno on failure
2531 */
2532static int hv_pci_allocate_bridge_windows(struct hv_pcibus_device *hbus)
2533{
2534        resource_size_t align;
2535        int ret;
2536
2537        if (hbus->low_mmio_space) {
2538                align = 1ULL << (63 - __builtin_clzll(hbus->low_mmio_space));
2539                ret = vmbus_allocate_mmio(&hbus->low_mmio_res, hbus->hdev, 0,
2540                                          (u64)(u32)0xffffffff,
2541                                          hbus->low_mmio_space,
2542                                          align, false);
2543                if (ret) {
2544                        dev_err(&hbus->hdev->device,
2545                                "Need %#llx of low MMIO space. Consider reconfiguring the VM.\n",
2546                                hbus->low_mmio_space);
2547                        return ret;
2548                }
2549
2550                /* Modify this resource to become a bridge window. */
2551                hbus->low_mmio_res->flags |= IORESOURCE_WINDOW;
2552                hbus->low_mmio_res->flags &= ~IORESOURCE_BUSY;
2553                pci_add_resource(&hbus->resources_for_children,
2554                                 hbus->low_mmio_res);
2555        }
2556
2557        if (hbus->high_mmio_space) {
2558                align = 1ULL << (63 - __builtin_clzll(hbus->high_mmio_space));
2559                ret = vmbus_allocate_mmio(&hbus->high_mmio_res, hbus->hdev,
2560                                          0x100000000, -1,
2561                                          hbus->high_mmio_space, align,
2562                                          false);
2563                if (ret) {
2564                        dev_err(&hbus->hdev->device,
2565                                "Need %#llx of high MMIO space. Consider reconfiguring the VM.\n",
2566                                hbus->high_mmio_space);
2567                        goto release_low_mmio;
2568                }
2569
2570                /* Modify this resource to become a bridge window. */
2571                hbus->high_mmio_res->flags |= IORESOURCE_WINDOW;
2572                hbus->high_mmio_res->flags &= ~IORESOURCE_BUSY;
2573                pci_add_resource(&hbus->resources_for_children,
2574                                 hbus->high_mmio_res);
2575        }
2576
2577        return 0;
2578
2579release_low_mmio:
2580        if (hbus->low_mmio_res) {
2581                vmbus_free_mmio(hbus->low_mmio_res->start,
2582                                resource_size(hbus->low_mmio_res));
2583        }
2584
2585        return ret;
2586}
2587
2588/**
2589 * hv_allocate_config_window() - Find MMIO space for PCI Config
2590 * @hbus:       Root PCI bus, as understood by this driver
2591 *
2592 * This function claims memory-mapped I/O space for accessing
2593 * configuration space for the functions on this bus.
2594 *
2595 * Return: 0 on success, -errno on failure
2596 */
2597static int hv_allocate_config_window(struct hv_pcibus_device *hbus)
2598{
2599        int ret;
2600
2601        /*
2602         * Set up a region of MMIO space to use for accessing configuration
2603         * space.
2604         */
2605        ret = vmbus_allocate_mmio(&hbus->mem_config, hbus->hdev, 0, -1,
2606                                  PCI_CONFIG_MMIO_LENGTH, 0x1000, false);
2607        if (ret)
2608                return ret;
2609
2610        /*
2611         * vmbus_allocate_mmio() gets used for allocating both device endpoint
2612         * resource claims (those which cannot be overlapped) and the ranges
2613         * which are valid for the children of this bus, which are intended
2614         * to be overlapped by those children.  Set the flag on this claim
2615         * meaning that this region can't be overlapped.
2616         */
2617
2618        hbus->mem_config->flags |= IORESOURCE_BUSY;
2619
2620        return 0;
2621}
2622
2623static void hv_free_config_window(struct hv_pcibus_device *hbus)
2624{
2625        vmbus_free_mmio(hbus->mem_config->start, PCI_CONFIG_MMIO_LENGTH);
2626}
2627
2628/**
2629 * hv_pci_enter_d0() - Bring the "bus" into the D0 power state
2630 * @hdev:       VMBus's tracking struct for this root PCI bus
2631 *
2632 * Return: 0 on success, -errno on failure
2633 */
2634static int hv_pci_enter_d0(struct hv_device *hdev)
2635{
2636        struct hv_pcibus_device *hbus = hv_get_drvdata(hdev);
2637        struct pci_bus_d0_entry *d0_entry;
2638        struct hv_pci_compl comp_pkt;
2639        struct pci_packet *pkt;
2640        int ret;
2641
2642        /*
2643         * Tell the host that the bus is ready to use, and moved into the
2644         * powered-on state.  This includes telling the host which region
2645         * of memory-mapped I/O space has been chosen for configuration space
2646         * access.
2647         */
2648        pkt = kzalloc(sizeof(*pkt) + sizeof(*d0_entry), GFP_KERNEL);
2649        if (!pkt)
2650                return -ENOMEM;
2651
2652        init_completion(&comp_pkt.host_event);
2653        pkt->completion_func = hv_pci_generic_compl;
2654        pkt->compl_ctxt = &comp_pkt;
2655        d0_entry = (struct pci_bus_d0_entry *)&pkt->message;
2656        d0_entry->message_type.type = PCI_BUS_D0ENTRY;
2657        d0_entry->mmio_base = hbus->mem_config->start;
2658
2659        ret = vmbus_sendpacket(hdev->channel, d0_entry, sizeof(*d0_entry),
2660                               (unsigned long)pkt, VM_PKT_DATA_INBAND,
2661                               VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
2662        if (!ret)
2663                ret = wait_for_response(hdev, &comp_pkt.host_event);
2664
2665        if (ret)
2666                goto exit;
2667
2668        if (comp_pkt.completion_status < 0) {
2669                dev_err(&hdev->device,
2670                        "PCI Pass-through VSP failed D0 Entry with status %x\n",
2671                        comp_pkt.completion_status);
2672                ret = -EPROTO;
2673                goto exit;
2674        }
2675
2676        ret = 0;
2677
2678exit:
2679        kfree(pkt);
2680        return ret;
2681}
2682
2683/**
2684 * hv_pci_query_relations() - Ask host to send list of child
2685 * devices
2686 * @hdev:       VMBus's tracking struct for this root PCI bus
2687 *
2688 * Return: 0 on success, -errno on failure
2689 */
2690static int hv_pci_query_relations(struct hv_device *hdev)
2691{
2692        struct hv_pcibus_device *hbus = hv_get_drvdata(hdev);
2693        struct pci_message message;
2694        struct completion comp;
2695        int ret;
2696
2697        /* Ask the host to send along the list of child devices */
2698        init_completion(&comp);
2699        if (cmpxchg(&hbus->survey_event, NULL, &comp))
2700                return -ENOTEMPTY;
2701
2702        memset(&message, 0, sizeof(message));
2703        message.type = PCI_QUERY_BUS_RELATIONS;
2704
2705        ret = vmbus_sendpacket(hdev->channel, &message, sizeof(message),
2706                               0, VM_PKT_DATA_INBAND, 0);
2707        if (!ret)
2708                ret = wait_for_response(hdev, &comp);
2709
2710        return ret;
2711}
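/*
 * Editorial note: hv_pci_query_relations() and survey_child_resources()
 * cooperate through hbus->survey_event as a single-slot handshake:
 *
 *      waiter:   cmpxchg(&hbus->survey_event, NULL, &comp)
 *                claims the slot, failing if a waiter is registered;
 *      surveyor: xchg(&hbus->survey_event, NULL)
 *                takes the waiter out, so it is completed at most once.
 */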
2712
2713/**
2714 * hv_send_resources_allocated() - Report local resource choices
2715 * @hdev:       VMBus's tracking struct for this root PCI bus
2716 *
2717 * The host OS is expecting to be sent a request as a message
2718 * which contains all the resources that the device will use.
2719 * The response contains those same resources, "translated,"
2720 * which is to say, the values which should be used by the
2721 * hardware when it delivers an interrupt.  (MMIO resources are
2722 * used in local terms.)  This is nice for Windows, and lines up
2723 * with the FDO/PDO split, which doesn't exist in Linux.  Linux
2724 * is deeply expecting to scan an emulated PCI configuration
2725 * space.  So this message is sent here only to drive the state
2726 * machine on the host forward.
2727 *
2728 * Return: 0 on success, -errno on failure
2729 */
2730static int hv_send_resources_allocated(struct hv_device *hdev)
2731{
2732        struct hv_pcibus_device *hbus = hv_get_drvdata(hdev);
2733        struct pci_resources_assigned *res_assigned;
2734        struct pci_resources_assigned2 *res_assigned2;
2735        struct hv_pci_compl comp_pkt;
2736        struct hv_pci_dev *hpdev;
2737        struct pci_packet *pkt;
2738        size_t size_res;
2739        u32 wslot;
2740        int ret;
2741
2742        size_res = (hbus->protocol_version < PCI_PROTOCOL_VERSION_1_2)
2743                        ? sizeof(*res_assigned) : sizeof(*res_assigned2);
2744
2745        pkt = kmalloc(sizeof(*pkt) + size_res, GFP_KERNEL);
2746        if (!pkt)
2747                return -ENOMEM;
2748
2749        ret = 0;
2750
2751        for (wslot = 0; wslot < 256; wslot++) {
2752                hpdev = get_pcichild_wslot(hbus, wslot);
2753                if (!hpdev)
2754                        continue;
2755
2756                memset(pkt, 0, sizeof(*pkt) + size_res);
2757                init_completion(&comp_pkt.host_event);
2758                pkt->completion_func = hv_pci_generic_compl;
2759                pkt->compl_ctxt = &comp_pkt;
2760
2761                if (hbus->protocol_version < PCI_PROTOCOL_VERSION_1_2) {
2762                        res_assigned =
2763                                (struct pci_resources_assigned *)&pkt->message;
2764                        res_assigned->message_type.type =
2765                                PCI_RESOURCES_ASSIGNED;
2766                        res_assigned->wslot.slot = hpdev->desc.win_slot.slot;
2767                } else {
2768                        res_assigned2 =
2769                                (struct pci_resources_assigned2 *)&pkt->message;
2770                        res_assigned2->message_type.type =
2771                                PCI_RESOURCES_ASSIGNED2;
2772                        res_assigned2->wslot.slot = hpdev->desc.win_slot.slot;
2773                }
2774                put_pcichild(hpdev);
2775
2776                ret = vmbus_sendpacket(hdev->channel, &pkt->message,
2777                                size_res, (unsigned long)pkt,
2778                                VM_PKT_DATA_INBAND,
2779                                VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
2780                if (!ret)
2781                        ret = wait_for_response(hdev, &comp_pkt.host_event);
2782                if (ret)
2783                        break;
2784
2785                if (comp_pkt.completion_status < 0) {
2786                        ret = -EPROTO;
2787                        dev_err(&hdev->device,
2788                                "resource allocation returned 0x%x\n",
2789                                comp_pkt.completion_status);
2790                        break;
2791                }
2792        }
2793
2794        kfree(pkt);
2795        return ret;
2796}
2797
2798/**
2799 * hv_send_resources_released() - Report local resources
2800 * released
2801 * @hdev:       VMBus's tracking struct for this root PCI bus
2802 *
2803 * Return: 0 on success, -errno on failure
2804 */
2805static int hv_send_resources_released(struct hv_device *hdev)
2806{
2807        struct hv_pcibus_device *hbus = hv_get_drvdata(hdev);
2808        struct pci_child_message pkt;
2809        struct hv_pci_dev *hpdev;
2810        u32 wslot;
2811        int ret;
2812
2813        for (wslot = 0; wslot < 256; wslot++) {
2814                hpdev = get_pcichild_wslot(hbus, wslot);
2815                if (!hpdev)
2816                        continue;
2817
2818                memset(&pkt, 0, sizeof(pkt));
2819                pkt.message_type.type = PCI_RESOURCES_RELEASED;
2820                pkt.wslot.slot = hpdev->desc.win_slot.slot;
2821
2822                put_pcichild(hpdev);
2823
2824                ret = vmbus_sendpacket(hdev->channel, &pkt, sizeof(pkt), 0,
2825                                       VM_PKT_DATA_INBAND, 0);
2826                if (ret)
2827                        return ret;
2828        }
2829
2830        return 0;
2831}
2832
2833static void get_hvpcibus(struct hv_pcibus_device *hbus)
2834{
2835        refcount_inc(&hbus->remove_lock);
2836}
2837
2838static void put_hvpcibus(struct hv_pcibus_device *hbus)
2839{
2840        if (refcount_dec_and_test(&hbus->remove_lock))
2841                complete(&hbus->remove_event);
2842}
2843
2844#define HVPCI_DOM_MAP_SIZE (64 * 1024)
2845static DECLARE_BITMAP(hvpci_dom_map, HVPCI_DOM_MAP_SIZE);
2846
2847/*
2848 * PCI domain number 0 is used by emulated devices on Gen1 VMs, so define 0
2849 * as invalid for passthrough PCI devices of this driver.
2850 */
2851#define HVPCI_DOM_INVALID 0
2852
2853/**
2854 * hv_get_dom_num() - Get a valid PCI domain number
2855 * @dom: Requested domain number
2856 *
2857 * Check if the requested PCI domain number is in use, and return another
2858 * free number if it is.
2859 *
2860 * Return: domain number on success, HVPCI_DOM_INVALID on failure
2861 */
2862static u16 hv_get_dom_num(u16 dom)
2863{
2864        unsigned int i;
2865
2866        if (test_and_set_bit(dom, hvpci_dom_map) == 0)
2867                return dom;
2868
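            /*
             * The requested number is already taken; atomically claim the
             * first free bit instead.  test_and_set_bit() makes this safe
             * against concurrent probes racing for the same domain number.
             */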
2869        for_each_clear_bit(i, hvpci_dom_map, HVPCI_DOM_MAP_SIZE) {
2870                if (test_and_set_bit(i, hvpci_dom_map) == 0)
2871                        return i;
2872        }
2873
2874        return HVPCI_DOM_INVALID;
2875}
2876
2877/**
2878 * hv_put_dom_num() - Mark the PCI domain number as free
2879 * @dom: Domain number to be freed
2880 */
2881static void hv_put_dom_num(u16 dom)
2882{
2883        clear_bit(dom, hvpci_dom_map);
2884}
2885
2886/**
2887 * hv_pci_probe() - New VMBus channel probe for a root PCI bus
2888 * @hdev:       VMBus's tracking struct for this root PCI bus
2889 * @dev_id:     Identifies the device itself
2890 *
2891 * Return: 0 on success, -errno on failure
2892 */
2893static int hv_pci_probe(struct hv_device *hdev,
2894                        const struct hv_vmbus_device_id *dev_id)
2895{
2896        struct hv_pcibus_device *hbus;
2897        u16 dom_req, dom;
2898        char *name;
2899        int ret;
2900
2901        /*
2902         * hv_pcibus_device contains the hypercall arguments for retargeting in
2903         * hv_irq_unmask(). Those must not cross a page boundary.
2904         */
2905        BUILD_BUG_ON(sizeof(*hbus) > HV_HYP_PAGE_SIZE);
2906
2907        /*
2908         * Since commit 59bb47985c1d ("mm, sl[aou]b: guarantee natural
2909         * alignment for kmalloc(power-of-two)"), kzalloc() is able to allocate
2910         * a 4KB buffer that is guaranteed to be 4KB-aligned. Here the size and
2911         * alignment of hbus is important because hbus's field
2912         * retarget_msi_interrupt_params must not cross a 4KB page boundary.
2913         *
2914         * Here we prefer kzalloc to get_zeroed_page(), because a buffer
2915         * allocated by the latter is not tracked and scanned by kmemleak, and
2916         * hence kmemleak reports the pointer contained in the hbus buffer
2917         * (i.e. the hpdev struct, which is created in new_pcichild_device() and
2918         * is tracked by hbus->children) as a memory leak (false positive).
2919         *
2920         * If the kernel doesn't have 59bb47985c1d, get_zeroed_page() *must* be
2921         * used to allocate the hbus buffer and we can avoid the kmemleak false
2922         * positive by using kmemleak_alloc() and kmemleak_free() to ask
2923         * kmemleak to track and scan the hbus buffer.
2924         */
2925        hbus = kzalloc(HV_HYP_PAGE_SIZE, GFP_KERNEL);
2926        if (!hbus)
2927                return -ENOMEM;
2928        hbus->state = hv_pcibus_init;
2929
2930        /*
2931         * The PCI bus "domain" is what is called "segment" in ACPI and other
2932         * specs. Pull it from the instance ID, to get something usually
2933         * unique. In the rare case of a collision, hv_get_dom_num() will
2934         * pick another number that is not in use.
2935         *
2936         * Note that, since this code only runs in a Hyper-V VM, Hyper-V
2937         * together with this guest driver can guarantee that (1) The only
2938         * domain used by Gen1 VMs for something that looks like a physical
2939         * PCI bus (which is actually emulated by the hypervisor) is domain 0.
2940         * (2) There will be no overlap between domains (after fixing possible
2941         * collisions) in the same VM.
2942         */
2943        dom_req = hdev->dev_instance.b[5] << 8 | hdev->dev_instance.b[4];
2944        dom = hv_get_dom_num(dom_req);
2945
2946        if (dom == HVPCI_DOM_INVALID) {
2947                dev_err(&hdev->device,
2948                        "Unable to use dom# 0x%hx or other numbers\n", dom_req);
2949                ret = -EINVAL;
2950                goto free_bus;
2951        }
2952
2953        if (dom != dom_req)
2954                dev_info(&hdev->device,
2955                         "PCI dom# 0x%hx has collision, using 0x%hx\n",
2956                         dom_req, dom);
2957
2958        hbus->sysdata.domain = dom;
2959
2960        hbus->hdev = hdev;
2961        refcount_set(&hbus->remove_lock, 1);
2962        INIT_LIST_HEAD(&hbus->children);
2963        INIT_LIST_HEAD(&hbus->dr_list);
2964        INIT_LIST_HEAD(&hbus->resources_for_children);
2965        spin_lock_init(&hbus->config_lock);
2966        spin_lock_init(&hbus->device_list_lock);
2967        spin_lock_init(&hbus->retarget_msi_interrupt_lock);
2968        init_completion(&hbus->remove_event);
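            /*
             * An ordered workqueue executes one work item at a time, so
             * device-present and eject work runs in the order the host sent
             * the messages.
             */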
2969        hbus->wq = alloc_ordered_workqueue("hv_pci_%x", 0,
2970                                           hbus->sysdata.domain);
2971        if (!hbus->wq) {
2972                ret = -ENOMEM;
2973                goto free_dom;
2974        }
2975
2976        ret = vmbus_open(hdev->channel, pci_ring_size, pci_ring_size, NULL, 0,
2977                         hv_pci_onchannelcallback, hbus);
2978        if (ret)
2979                goto destroy_wq;
2980
2981        hv_set_drvdata(hdev, hbus);
2982
2983        ret = hv_pci_protocol_negotiation(hdev, pci_protocol_versions,
2984                                          ARRAY_SIZE(pci_protocol_versions));
2985        if (ret)
2986                goto close;
2987
2988        ret = hv_allocate_config_window(hbus);
2989        if (ret)
2990                goto close;
2991
2992        hbus->cfg_addr = ioremap(hbus->mem_config->start,
2993                                 PCI_CONFIG_MMIO_LENGTH);
2994        if (!hbus->cfg_addr) {
2995                dev_err(&hdev->device,
2996                        "Unable to map a virtual address for config space\n");
2997                ret = -ENOMEM;
2998                goto free_config;
2999        }
3000
3001        name = kasprintf(GFP_KERNEL, "%pUL", &hdev->dev_instance);
3002        if (!name) {
3003                ret = -ENOMEM;
3004                goto unmap;
3005        }
3006
3007        hbus->sysdata.fwnode = irq_domain_alloc_named_fwnode(name);
3008        kfree(name);
3009        if (!hbus->sysdata.fwnode) {
3010                ret = -ENOMEM;
3011                goto unmap;
3012        }
3013
3014        ret = hv_pcie_init_irq_domain(hbus);
3015        if (ret)
3016                goto free_fwnode;
3017
3018        ret = hv_pci_query_relations(hdev);
3019        if (ret)
3020                goto free_irq_domain;
3021
3022        ret = hv_pci_enter_d0(hdev);
3023        if (ret)
3024                goto free_irq_domain;
3025
3026        ret = hv_pci_allocate_bridge_windows(hbus);
3027        if (ret)
3028                goto free_irq_domain;
3029
3030        ret = hv_send_resources_allocated(hdev);
3031        if (ret)
3032                goto free_windows;
3033
3034        prepopulate_bars(hbus);
3035
3036        hbus->state = hv_pcibus_probed;
3037
3038        ret = create_root_hv_pci_bus(hbus);
3039        if (ret)
3040                goto free_windows;
3041
3042        return 0;
3043
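            /* Error paths: unwind in the reverse order of the setup steps above. */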
3044free_windows:
3045        hv_pci_free_bridge_windows(hbus);
3046free_irq_domain:
3047        irq_domain_remove(hbus->irq_domain);
3048free_fwnode:
3049        irq_domain_free_fwnode(hbus->sysdata.fwnode);
3050unmap:
3051        iounmap(hbus->cfg_addr);
3052free_config:
3053        hv_free_config_window(hbus);
3054close:
3055        vmbus_close(hdev->channel);
3056destroy_wq:
3057        destroy_workqueue(hbus->wq);
3058free_dom:
3059        hv_put_dom_num(hbus->sysdata.domain);
3060free_bus:
3061        kfree(hbus);
3062        return ret;
3063}
3064
3065static int hv_pci_bus_exit(struct hv_device *hdev, bool hibernating)
3066{
3067        struct hv_pcibus_device *hbus = hv_get_drvdata(hdev);
3068        struct {
3069                struct pci_packet teardown_packet;
3070                u8 buffer[sizeof(struct pci_message)];
3071        } pkt;
3072        struct pci_bus_relations relations;
3073        struct hv_pci_compl comp_pkt;
3074        int ret;
3075
3076        /*
3077         * After the host sends the RESCIND_CHANNEL message, it doesn't
3078         * access the per-channel ringbuffer any longer.
3079         */
3080        if (hdev->channel->rescind)
3081                return 0;
3082
3083        if (!hibernating) {
3084                /* Delete any children which might still exist. */
3085                memset(&relations, 0, sizeof(relations));
3086                hv_pci_devices_present(hbus, &relations);
3087        }
3088
3089        ret = hv_send_resources_released(hdev);
3090        if (ret) {
3091                dev_err(&hdev->device,
3092                        "Couldn't send resources released packet(s)\n");
3093                return ret;
3094        }
3095
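            /*
             * Tell the host the bus is leaving D0 and wait up to 10 seconds
             * for the host to acknowledge the teardown.
             */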
3096        memset(&pkt.teardown_packet, 0, sizeof(pkt.teardown_packet));
3097        init_completion(&comp_pkt.host_event);
3098        pkt.teardown_packet.completion_func = hv_pci_generic_compl;
3099        pkt.teardown_packet.compl_ctxt = &comp_pkt;
3100        pkt.teardown_packet.message[0].type = PCI_BUS_D0EXIT;
3101
3102        ret = vmbus_sendpacket(hdev->channel, &pkt.teardown_packet.message,
3103                               sizeof(struct pci_message),
3104                               (unsigned long)&pkt.teardown_packet,
3105                               VM_PKT_DATA_INBAND,
3106                               VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
3107        if (ret)
3108                return ret;
3109
3110        if (wait_for_completion_timeout(&comp_pkt.host_event, 10 * HZ) == 0)
3111                return -ETIMEDOUT;
3112
3113        return 0;
3114}
3115
3116/**
3117 * hv_pci_remove() - Remove routine for this VMBus channel
3118 * @hdev:       VMBus's tracking struct for this root PCI bus
3119 *
3120 * Return: 0 on success, -errno on failure
3121 */
3122static int hv_pci_remove(struct hv_device *hdev)
3123{
3124        struct hv_pcibus_device *hbus;
3125        int ret;
3126
3127        hbus = hv_get_drvdata(hdev);
3128        if (hbus->state == hv_pcibus_installed) {
3129                /* Remove the bus from PCI's point of view. */
3130                pci_lock_rescan_remove();
3131                pci_stop_root_bus(hbus->pci_bus);
3132                hv_pci_remove_slots(hbus);
3133                pci_remove_root_bus(hbus->pci_bus);
3134                pci_unlock_rescan_remove();
3135                hbus->state = hv_pcibus_removed;
3136        }
3137
3138        ret = hv_pci_bus_exit(hdev, false);
3139
3140        vmbus_close(hdev->channel);
3141
3142        iounmap(hbus->cfg_addr);
3143        hv_free_config_window(hbus);
3144        pci_free_resource_list(&hbus->resources_for_children);
3145        hv_pci_free_bridge_windows(hbus);
3146        irq_domain_remove(hbus->irq_domain);
3147        irq_domain_free_fwnode(hbus->sysdata.fwnode);
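            /*
             * Drop the initial remove_lock reference and wait for any
             * remaining users (e.g. in-flight work items) to finish.
             */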
3148        put_hvpcibus(hbus);
3149        wait_for_completion(&hbus->remove_event);
3150        destroy_workqueue(hbus->wq);
3151
3152        hv_put_dom_num(hbus->sysdata.domain);
3153
3154        kfree(hbus);
3155        return ret;
3156}
3157
3158static int hv_pci_suspend(struct hv_device *hdev)
3159{
3160        struct hv_pcibus_device *hbus = hv_get_drvdata(hdev);
3161        enum hv_pcibus_state old_state;
3162        int ret;
3163
3164        /*
3165         * hv_pci_suspend() must make sure there are no pending work items
3166         * before calling vmbus_close(), since it runs in a process context
3167         * as a callback in dpm_suspend().  When it starts to run, the channel
3168         * callback hv_pci_onchannelcallback(), which runs in a tasklet
3169         * context, can still be running concurrently and scheduling new work
3170         * items onto hbus->wq in hv_pci_devices_present() and
3171         * hv_pci_eject_device(), and the work item handlers can access the
3172         * vmbus channel, which hv_pci_suspend() may be closing at the same
3173         * time; e.g. the work item handler pci_devices_present_work() ->
3174         * new_pcichild_device() writes to the vmbus channel.
3175         *
3176         * To eliminate the race, hv_pci_suspend() disables the channel
3177         * callback tasklet, sets hbus->state to hv_pcibus_removing, and
3178         * re-enables the tasklet. This way, when hv_pci_suspend() proceeds,
3179         * it knows that no new work item can be scheduled, and then it flushes
3180         * hbus->wq and safely closes the vmbus channel.
3181         */
3182        tasklet_disable(&hdev->channel->callback_event);
3183
3184        /* Change the hbus state to prevent new work items. */
3185        old_state = hbus->state;
3186        if (hbus->state == hv_pcibus_installed)
3187                hbus->state = hv_pcibus_removing;
3188
3189        tasklet_enable(&hdev->channel->callback_event);
3190
3191        if (old_state != hv_pcibus_installed)
3192                return -EINVAL;
3193
3194        flush_workqueue(hbus->wq);
3195
3196        ret = hv_pci_bus_exit(hdev, true);
3197        if (ret)
3198                return ret;
3199
3200        vmbus_close(hdev->channel);
3201
3202        return 0;
3203}
3204
3205static int hv_pci_resume(struct hv_device *hdev)
3206{
3207        struct hv_pcibus_device *hbus = hv_get_drvdata(hdev);
3208        enum pci_protocol_version_t version[1];
3209        int ret;
3210
3211        hbus->state = hv_pcibus_init;
3212
3213        ret = vmbus_open(hdev->channel, pci_ring_size, pci_ring_size, NULL, 0,
3214                         hv_pci_onchannelcallback, hbus);
3215        if (ret)
3216                return ret;
3217
3218        /* Only use the version that was in use before hibernation. */
3219        version[0] = hbus->protocol_version;
3220        ret = hv_pci_protocol_negotiation(hdev, version, 1);
3221        if (ret)
3222                goto out;
3223
3224        ret = hv_pci_query_relations(hdev);
3225        if (ret)
3226                goto out;
3227
3228        ret = hv_pci_enter_d0(hdev);
3229        if (ret)
3230                goto out;
3231
3232        ret = hv_send_resources_allocated(hdev);
3233        if (ret)
3234                goto out;
3235
3236        prepopulate_bars(hbus);
3237
3238        hbus->state = hv_pcibus_installed;
3239        return 0;
3240out:
3241        vmbus_close(hdev->channel);
3242        return ret;
3243}
3244
3245static const struct hv_vmbus_device_id hv_pci_id_table[] = {
3246        /* PCI Pass-through Class ID */
3247        /* 44C4F61D-4444-4400-9D52-802E27EDE19F */
3248        { HV_PCIE_GUID, },
3249        { },
3250};
3251
3252MODULE_DEVICE_TABLE(vmbus, hv_pci_id_table);
3253
3254static struct hv_driver hv_pci_drv = {
3255        .name           = "hv_pci",
3256        .id_table       = hv_pci_id_table,
3257        .probe          = hv_pci_probe,
3258        .remove         = hv_pci_remove,
3259        .suspend        = hv_pci_suspend,
3260        .resume         = hv_pci_resume,
3261};
3262
3263static void __exit exit_hv_pci_drv(void)
3264{
3265        vmbus_driver_unregister(&hv_pci_drv);
3266
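            /* Unhook the PCI block read/write accessors installed in init_hv_pci_drv(). */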
3267        hvpci_block_ops.read_block = NULL;
3268        hvpci_block_ops.write_block = NULL;
3269        hvpci_block_ops.reg_blk_invalidate = NULL;
3270}
3271
3272static int __init init_hv_pci_drv(void)
3273{
3274        /* Set the invalid domain number's bit, so it will not be used */
3275        set_bit(HVPCI_DOM_INVALID, hvpci_dom_map);
3276
3277        /* Initialize PCI block r/w interface */
3278        hvpci_block_ops.read_block = hv_read_config_block;
3279        hvpci_block_ops.write_block = hv_write_config_block;
3280        hvpci_block_ops.reg_blk_invalidate = hv_register_block_invalidate;
3281
3282        return vmbus_driver_register(&hv_pci_drv);
3283}
3284
3285module_init(init_hv_pci_drv);
3286module_exit(exit_hv_pci_drv);
3287
3288MODULE_DESCRIPTION("Hyper-V PCI");
3289MODULE_LICENSE("GPL v2");
3290