qemu/hw/i386/amd_iommu.c
/*
 * QEMU emulation of AMD IOMMU (AMD-Vi)
 *
 * Copyright (C) 2011 Eduard - Gabriel Munteanu
 * Copyright (C) 2015, 2016 David Kiarie Kahurani
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, see <http://www.gnu.org/licenses/>.
 *
 * Cache implementation inspired by hw/i386/intel_iommu.c
 */

#include "qemu/osdep.h"
#include "hw/i386/pc.h"
#include "hw/pci/msi.h"
#include "hw/pci/pci_bus.h"
#include "migration/vmstate.h"
#include "amd_iommu.h"
#include "qapi/error.h"
#include "qemu/error-report.h"
#include "hw/i386/apic_internal.h"
#include "trace.h"
#include "hw/i386/apic-msidef.h"

/* used AMD-Vi MMIO registers */
const char *amdvi_mmio_low[] = {
    "AMDVI_MMIO_DEVTAB_BASE",
    "AMDVI_MMIO_CMDBUF_BASE",
    "AMDVI_MMIO_EVTLOG_BASE",
    "AMDVI_MMIO_CONTROL",
    "AMDVI_MMIO_EXCL_BASE",
    "AMDVI_MMIO_EXCL_LIMIT",
    "AMDVI_MMIO_EXT_FEATURES",
    "AMDVI_MMIO_PPR_BASE",
    "UNHANDLED"
};
const char *amdvi_mmio_high[] = {
    "AMDVI_MMIO_COMMAND_HEAD",
    "AMDVI_MMIO_COMMAND_TAIL",
    "AMDVI_MMIO_EVTLOG_HEAD",
    "AMDVI_MMIO_EVTLOG_TAIL",
    "AMDVI_MMIO_STATUS",
    "AMDVI_MMIO_PPR_HEAD",
    "AMDVI_MMIO_PPR_TAIL",
    "UNHANDLED"
};

struct AMDVIAddressSpace {
    uint8_t bus_num;            /* bus number                           */
    uint8_t devfn;              /* device function                      */
    AMDVIState *iommu_state;    /* AMDVI - one per machine              */
    MemoryRegion root;          /* AMDVI Root memory map region         */
    IOMMUMemoryRegion iommu;    /* Device's address translation region  */
    MemoryRegion iommu_ir;      /* Device's interrupt remapping region  */
    AddressSpace as;            /* device's corresponding address space */
};

/* AMDVI cache entry */
typedef struct AMDVIIOTLBEntry {
    uint16_t domid;             /* assigned domain id  */
    uint16_t devid;             /* device owning entry */
    uint64_t perms;             /* access permissions  */
    uint64_t translated_addr;   /* translated address  */
    uint64_t page_mask;         /* physical page size  */
} AMDVIIOTLBEntry;

/* configure MMIO registers at startup/reset */
static void amdvi_set_quad(AMDVIState *s, hwaddr addr, uint64_t val,
                           uint64_t romask, uint64_t w1cmask)
{
    stq_le_p(&s->mmior[addr], val);
    stq_le_p(&s->romask[addr], romask);
    stq_le_p(&s->w1cmask[addr], w1cmask);
}

static uint16_t amdvi_readw(AMDVIState *s, hwaddr addr)
{
    return lduw_le_p(&s->mmior[addr]);
}

static uint32_t amdvi_readl(AMDVIState *s, hwaddr addr)
{
    return ldl_le_p(&s->mmior[addr]);
}

static uint64_t amdvi_readq(AMDVIState *s, hwaddr addr)
{
    return ldq_le_p(&s->mmior[addr]);
}

/* internal write, bypassing the ro/w1c masks */
static void amdvi_writeq_raw(AMDVIState *s, hwaddr addr, uint64_t val)
{
    stq_le_p(&s->mmior[addr], val);
}

/* external write */
static void amdvi_writew(AMDVIState *s, hwaddr addr, uint16_t val)
{
    uint16_t romask = lduw_le_p(&s->romask[addr]);
    uint16_t w1cmask = lduw_le_p(&s->w1cmask[addr]);
    uint16_t oldval = lduw_le_p(&s->mmior[addr]);
    stw_le_p(&s->mmior[addr],
            ((oldval & romask) | (val & ~romask)) & ~(val & w1cmask));
}

static void amdvi_writel(AMDVIState *s, hwaddr addr, uint32_t val)
{
    uint32_t romask = ldl_le_p(&s->romask[addr]);
    uint32_t w1cmask = ldl_le_p(&s->w1cmask[addr]);
    uint32_t oldval = ldl_le_p(&s->mmior[addr]);
    stl_le_p(&s->mmior[addr],
            ((oldval & romask) | (val & ~romask)) & ~(val & w1cmask));
}

static void amdvi_writeq(AMDVIState *s, hwaddr addr, uint64_t val)
{
    uint64_t romask = ldq_le_p(&s->romask[addr]);
    uint64_t w1cmask = ldq_le_p(&s->w1cmask[addr]);
    uint64_t oldval = ldq_le_p(&s->mmior[addr]);
    stq_le_p(&s->mmior[addr],
            ((oldval & romask) | (val & ~romask)) & ~(val & w1cmask));
}
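
/*
 * Illustrative example (values made up, not part of the device model) of
 * how romask and w1cmask shape an external write. Suppose a register was
 * initialised with amdvi_set_quad(s, reg, 0x10, 0x10, 0x08), i.e. bit 4
 * is read-only and bit 3 is write-1-to-clear, bit 3 is currently set
 * (oldval = 0x18), and the guest writes 0x0c:
 *
 *   oldval & romask  = 0x10   read-only bits keep their stored value
 *   val & ~romask    = 0x0c   writable bits take the new value
 *   ~(val & w1cmask) = ~0x08  writing 1 to bit 3 clears it
 *   result           = (0x10 | 0x0c) & ~0x08 = 0x14
 */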

/* test whether any of the given bits are set in a 64-bit register */
static bool amdvi_test_mask(AMDVIState *s, hwaddr addr, uint64_t val)
{
    return amdvi_readq(s, addr) & val;
}

/* OR a 64-bit register with a 64-bit value storing result in the register */
static void amdvi_assign_orq(AMDVIState *s, hwaddr addr, uint64_t val)
{
    amdvi_writeq_raw(s, addr, amdvi_readq(s, addr) | val);
}

/* AND a 64-bit register with a 64-bit value storing result in the register */
static void amdvi_assign_andq(AMDVIState *s, hwaddr addr, uint64_t val)
{
    amdvi_writeq_raw(s, addr, amdvi_readq(s, addr) & val);
}

static void amdvi_generate_msi_interrupt(AMDVIState *s)
{
    MSIMessage msg = {};
    MemTxAttrs attrs = {
        .requester_id = pci_requester_id(&s->pci.dev)
    };

    if (msi_enabled(&s->pci.dev)) {
        msg = msi_get_message(&s->pci.dev, 0);
        address_space_stl_le(&address_space_memory, msg.address, msg.data,
                             attrs, NULL);
    }
}

static void amdvi_log_event(AMDVIState *s, uint64_t *evt)
{
    /* event logging not enabled */
    if (!s->evtlog_enabled || amdvi_test_mask(s, AMDVI_MMIO_STATUS,
        AMDVI_MMIO_STATUS_EVT_OVF)) {
        return;
    }

    /* event log buffer full */
    if (s->evtlog_tail >= s->evtlog_len) {
        amdvi_assign_orq(s, AMDVI_MMIO_STATUS, AMDVI_MMIO_STATUS_EVT_OVF);
        /* generate interrupt */
        amdvi_generate_msi_interrupt(s);
        return;
    }

    if (dma_memory_write(&address_space_memory, s->evtlog + s->evtlog_tail,
                         evt, AMDVI_EVENT_LEN)) {
        trace_amdvi_evntlog_fail(s->evtlog, s->evtlog_tail);
    }

    s->evtlog_tail += AMDVI_EVENT_LEN;
    amdvi_assign_orq(s, AMDVI_MMIO_STATUS, AMDVI_MMIO_STATUS_COMP_INT);
    amdvi_generate_msi_interrupt(s);
}
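
/*
 * A note on the event log flow above: entries are AMDVI_EVENT_LEN (16)
 * bytes and the log does not wrap; once evtlog_tail reaches evtlog_len
 * the overflow status bit is latched and logging stops until software
 * clears the condition. Both the append path and the overflow path
 * raise the device MSI so the guest driver is notified either way.
 */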

static void amdvi_setevent_bits(uint64_t *buffer, uint64_t value, int start,
                                int length)
{
    int index = start / 64, bitpos = start % 64;
    uint64_t mask = MAKE_64BIT_MASK(bitpos, length);
    buffer[index] &= ~mask;
    buffer[index] |= (value << bitpos) & mask;
}
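
/*
 * Worked example (illustrative): packing a 4-bit value 0xA at bit 55 of
 * the event buffer with amdvi_setevent_bits(evt, 0xA, 55, 4):
 *   index  = 55 / 64 = 0, bitpos = 55 % 64 = 55
 *   mask   = MAKE_64BIT_MASK(55, 4), i.e. bits 55..58 of evt[0]
 *   evt[0] = (evt[0] & ~mask) | ((0xA << 55) & mask)
 * A start of 64 or more lands in evt[1] by the same arithmetic.
 */
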
/*
 * AMD-Vi event structure
 *    0:15   -> DeviceID
 *    55:62  -> event type + miscellaneous info
 *    64:127 -> related address
 */
static void amdvi_encode_event(uint64_t *evt, uint16_t devid, uint64_t addr,
                               uint16_t info)
{
    amdvi_setevent_bits(evt, devid, 0, 16);
    amdvi_setevent_bits(evt, info, 55, 8);
    amdvi_setevent_bits(evt, addr, 64, 64);
}
/* log an error encountered during a page walk
 *
 * @addr: virtual address in translation request
 */
static void amdvi_page_fault(AMDVIState *s, uint16_t devid,
                             hwaddr addr, uint16_t info)
{
    uint64_t evt[4] = { 0 };

    info |= AMDVI_EVENT_IOPF_I | AMDVI_EVENT_IOPF;
    amdvi_encode_event(evt, devid, addr, info);
    amdvi_log_event(s, evt);
    pci_word_test_and_set_mask(s->pci.dev.config + PCI_STATUS,
            PCI_STATUS_SIG_TARGET_ABORT);
}
/*
 * log a master abort accessing device table
 *  @devtab : address of device table entry
 *  @info : error flags
 */
static void amdvi_log_devtab_error(AMDVIState *s, uint16_t devid,
                                   hwaddr devtab, uint16_t info)
{
    uint64_t evt[4] = { 0 };

    info |= AMDVI_EVENT_DEV_TAB_HW_ERROR;

    amdvi_encode_event(evt, devid, devtab, info);
    amdvi_log_event(s, evt);
    pci_word_test_and_set_mask(s->pci.dev.config + PCI_STATUS,
            PCI_STATUS_SIG_TARGET_ABORT);
}
/* log an event trying to access command buffer
 *   @addr : address that couldn't be accessed
 */
static void amdvi_log_command_error(AMDVIState *s, hwaddr addr)
{
    uint64_t evt[4] = { 0 }, info = AMDVI_EVENT_COMMAND_HW_ERROR;

    amdvi_encode_event(evt, 0, addr, info);
    amdvi_log_event(s, evt);
    pci_word_test_and_set_mask(s->pci.dev.config + PCI_STATUS,
            PCI_STATUS_SIG_TARGET_ABORT);
}
/* log an illegal command event
 *   @addr : address of illegal command
 */
static void amdvi_log_illegalcom_error(AMDVIState *s, uint16_t info,
                                       hwaddr addr)
{
    uint64_t evt[4] = { 0 };

    info |= AMDVI_EVENT_ILLEGAL_COMMAND_ERROR;
    amdvi_encode_event(evt, 0, addr, info);
    amdvi_log_event(s, evt);
}
/* log an error accessing device table
 *
 *  @devid : device owning the table entry
 *  @devtab : address of device table entry
 *  @info : error flags
 */
static void amdvi_log_illegaldevtab_error(AMDVIState *s, uint16_t devid,
                                          hwaddr addr, uint16_t info)
{
    uint64_t evt[4] = { 0 };

    info |= AMDVI_EVENT_ILLEGAL_DEVTAB_ENTRY;
    amdvi_encode_event(evt, devid, addr, info);
    amdvi_log_event(s, evt);
}
/* log an error accessing a PTE entry
 * @addr : address that couldn't be accessed
 */
static void amdvi_log_pagetab_error(AMDVIState *s, uint16_t devid,
                                    hwaddr addr, uint16_t info)
{
    uint64_t evt[4] = { 0 };

    info |= AMDVI_EVENT_PAGE_TAB_HW_ERROR;
    amdvi_encode_event(evt, devid, addr, info);
    amdvi_log_event(s, evt);
    pci_word_test_and_set_mask(s->pci.dev.config + PCI_STATUS,
            PCI_STATUS_SIG_TARGET_ABORT);
}

static gboolean amdvi_uint64_equal(gconstpointer v1, gconstpointer v2)
{
    return *((const uint64_t *)v1) == *((const uint64_t *)v2);
}

static guint amdvi_uint64_hash(gconstpointer v)
{
    return (guint)*(const uint64_t *)v;
}

static AMDVIIOTLBEntry *amdvi_iotlb_lookup(AMDVIState *s, hwaddr addr,
                                           uint64_t devid)
{
    uint64_t key = (addr >> AMDVI_PAGE_SHIFT_4K) |
                   ((uint64_t)(devid) << AMDVI_DEVID_SHIFT);
    return g_hash_table_lookup(s->iotlb, &key);
}
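
/*
 * The cache key packs the guest frame number and the 16-bit BDF into a
 * single 64-bit value. As a hypothetical example, devid 0x0010 (bus 0,
 * slot 2, function 0) touching gpa 0x123456000 yields
 *   key = (0x123456000 >> 12) | ((uint64_t)0x0010 << AMDVI_DEVID_SHIFT)
 * i.e. gfn 0x123456 in the low bits and the devid above it, so two
 * devices caching the same page never collide.
 */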

static void amdvi_iotlb_reset(AMDVIState *s)
{
    assert(s->iotlb);
    trace_amdvi_iotlb_reset();
    g_hash_table_remove_all(s->iotlb);
}

static gboolean amdvi_iotlb_remove_by_devid(gpointer key, gpointer value,
                                            gpointer user_data)
{
    AMDVIIOTLBEntry *entry = (AMDVIIOTLBEntry *)value;
    uint16_t devid = *(uint16_t *)user_data;
    return entry->devid == devid;
}

static void amdvi_iotlb_remove_page(AMDVIState *s, hwaddr addr,
                                    uint64_t devid)
{
    uint64_t key = (addr >> AMDVI_PAGE_SHIFT_4K) |
                   ((uint64_t)(devid) << AMDVI_DEVID_SHIFT);
    g_hash_table_remove(s->iotlb, &key);
}

static void amdvi_update_iotlb(AMDVIState *s, uint16_t devid,
                               uint64_t gpa, IOMMUTLBEntry to_cache,
                               uint16_t domid)
{
    uint64_t gfn = gpa >> AMDVI_PAGE_SHIFT_4K;

    /* don't cache erroneous translations */
    if (to_cache.perm != IOMMU_NONE) {
        AMDVIIOTLBEntry *entry = g_new(AMDVIIOTLBEntry, 1);
        uint64_t *key = g_new(uint64_t, 1);

        trace_amdvi_cache_update(domid, PCI_BUS_NUM(devid), PCI_SLOT(devid),
                PCI_FUNC(devid), gpa, to_cache.translated_addr);

        if (g_hash_table_size(s->iotlb) >= AMDVI_IOTLB_MAX_SIZE) {
            amdvi_iotlb_reset(s);
        }

        entry->domid = domid;
        entry->perms = to_cache.perm;
        entry->translated_addr = to_cache.translated_addr;
        entry->page_mask = to_cache.addr_mask;
        *key = gfn | ((uint64_t)(devid) << AMDVI_DEVID_SHIFT);
        g_hash_table_replace(s->iotlb, key, entry);
    }
}

static void amdvi_completion_wait(AMDVIState *s, uint64_t *cmd)
{
    /* pad the last 3 bits */
    hwaddr addr = cpu_to_le64(extract64(cmd[0], 3, 49)) << 3;
    uint64_t data = cpu_to_le64(cmd[1]);

    if (extract64(cmd[0], 51, 8)) {
        amdvi_log_illegalcom_error(s, extract64(cmd[0], 60, 4),
                                   s->cmdbuf + s->cmdbuf_head);
    }
    if (extract64(cmd[0], 0, 1)) {
        if (dma_memory_write(&address_space_memory, addr, &data,
            AMDVI_COMPLETION_DATA_SIZE)) {
            trace_amdvi_completion_wait_fail(addr);
        }
    }
    /* set completion interrupt */
    if (extract64(cmd[0], 1, 1)) {
        amdvi_assign_orq(s, AMDVI_MMIO_STATUS, AMDVI_MMIO_STATUS_COMP_INT);
        /* generate interrupt */
        amdvi_generate_msi_interrupt(s);
    }
    trace_amdvi_completion_wait(addr, data);
}

/* log error without aborting since Linux seems to be using reserved bits */
static void amdvi_inval_devtab_entry(AMDVIState *s, uint64_t *cmd)
{
    uint16_t devid = cpu_to_le16((uint16_t)extract64(cmd[0], 0, 16));

    /* This command should invalidate internal caches, of which there are
     * none in this model; only the reserved bits need checking.
     */
    if (extract64(cmd[0], 16, 44) || cmd[1]) {
        amdvi_log_illegalcom_error(s, extract64(cmd[0], 60, 4),
                                   s->cmdbuf + s->cmdbuf_head);
    }
    trace_amdvi_devtab_inval(PCI_BUS_NUM(devid), PCI_SLOT(devid),
                             PCI_FUNC(devid));
}

static void amdvi_complete_ppr(AMDVIState *s, uint64_t *cmd)
{
    if (extract64(cmd[0], 15, 16) || extract64(cmd[0], 19, 8) ||
        extract64(cmd[1], 0, 2) || extract64(cmd[1], 3, 29)
        || extract64(cmd[1], 47, 16)) {
        amdvi_log_illegalcom_error(s, extract64(cmd[0], 60, 4),
                                   s->cmdbuf + s->cmdbuf_head);
    }
    trace_amdvi_ppr_exec();
}

static void amdvi_inval_all(AMDVIState *s, uint64_t *cmd)
{
    if (extract64(cmd[0], 0, 60) || cmd[1]) {
        amdvi_log_illegalcom_error(s, extract64(cmd[0], 60, 4),
                                   s->cmdbuf + s->cmdbuf_head);
    }

    amdvi_iotlb_reset(s);
    trace_amdvi_all_inval();
}

static gboolean amdvi_iotlb_remove_by_domid(gpointer key, gpointer value,
                                            gpointer user_data)
{
    AMDVIIOTLBEntry *entry = (AMDVIIOTLBEntry *)value;
    uint16_t domid = *(uint16_t *)user_data;
    return entry->domid == domid;
}

/* we don't have devid - we can't remove pages by address */
static void amdvi_inval_pages(AMDVIState *s, uint64_t *cmd)
{
    uint16_t domid = cpu_to_le16((uint16_t)extract64(cmd[0], 32, 16));

    if (extract64(cmd[0], 20, 12) || extract64(cmd[0], 16, 12) ||
        extract64(cmd[0], 3, 10)) {
        amdvi_log_illegalcom_error(s, extract64(cmd[0], 60, 4),
                                   s->cmdbuf + s->cmdbuf_head);
    }

    g_hash_table_foreach_remove(s->iotlb, amdvi_iotlb_remove_by_domid,
                                &domid);
    trace_amdvi_pages_inval(domid);
}

static void amdvi_prefetch_pages(AMDVIState *s, uint64_t *cmd)
{
    if (extract64(cmd[0], 16, 8) || extract64(cmd[0], 20, 8) ||
        extract64(cmd[1], 1, 1) || extract64(cmd[1], 3, 1) ||
        extract64(cmd[1], 5, 7)) {
        amdvi_log_illegalcom_error(s, extract64(cmd[0], 60, 4),
                                   s->cmdbuf + s->cmdbuf_head);
    }

    trace_amdvi_prefetch_pages();
}

static void amdvi_inval_inttable(AMDVIState *s, uint64_t *cmd)
{
    if (extract64(cmd[0], 16, 16) || cmd[1]) {
        amdvi_log_illegalcom_error(s, extract64(cmd[0], 60, 4),
                                   s->cmdbuf + s->cmdbuf_head);
        return;
    }

    trace_amdvi_intr_inval();
}

/* FIXME: Try to work with the specified size instead of all the pages
 * when the S bit is on
 */
static void iommu_inval_iotlb(AMDVIState *s, uint64_t *cmd)
{
    uint16_t devid = extract64(cmd[0], 0, 16);

    if (extract64(cmd[1], 1, 1) || extract64(cmd[1], 3, 9)) {
        amdvi_log_illegalcom_error(s, extract64(cmd[0], 60, 4),
                                   s->cmdbuf + s->cmdbuf_head);
        return;
    }

    if (extract64(cmd[1], 0, 1)) {
        g_hash_table_foreach_remove(s->iotlb, amdvi_iotlb_remove_by_devid,
                                    &devid);
    } else {
        amdvi_iotlb_remove_page(s, cpu_to_le64(extract64(cmd[1], 12, 52)) << 12,
                                devid);
    }
    trace_amdvi_iotlb_inval();
}

/* not honouring reserved bits is regarded as an illegal command */
static void amdvi_cmdbuf_exec(AMDVIState *s)
{
    uint64_t cmd[2];

    if (dma_memory_read(&address_space_memory, s->cmdbuf + s->cmdbuf_head,
        cmd, AMDVI_COMMAND_SIZE)) {
        trace_amdvi_command_read_fail(s->cmdbuf, s->cmdbuf_head);
        amdvi_log_command_error(s, s->cmdbuf + s->cmdbuf_head);
        return;
    }

    switch (extract64(cmd[0], 60, 4)) {
    case AMDVI_CMD_COMPLETION_WAIT:
        amdvi_completion_wait(s, cmd);
        break;
    case AMDVI_CMD_INVAL_DEVTAB_ENTRY:
        amdvi_inval_devtab_entry(s, cmd);
        break;
    case AMDVI_CMD_INVAL_AMDVI_PAGES:
        amdvi_inval_pages(s, cmd);
        break;
    case AMDVI_CMD_INVAL_IOTLB_PAGES:
        iommu_inval_iotlb(s, cmd);
        break;
    case AMDVI_CMD_INVAL_INTR_TABLE:
        amdvi_inval_inttable(s, cmd);
        break;
    case AMDVI_CMD_PREFETCH_AMDVI_PAGES:
        amdvi_prefetch_pages(s, cmd);
        break;
    case AMDVI_CMD_COMPLETE_PPR_REQUEST:
        amdvi_complete_ppr(s, cmd);
        break;
    case AMDVI_CMD_INVAL_AMDVI_ALL:
        amdvi_inval_all(s, cmd);
        break;
    default:
        trace_amdvi_unhandled_command(extract64(cmd[0], 60, 4));
        /* log illegal command */
        amdvi_log_illegalcom_error(s, extract64(cmd[0], 60, 4),
                                   s->cmdbuf + s->cmdbuf_head);
    }
}

static void amdvi_cmdbuf_run(AMDVIState *s)
{
    if (!s->cmdbuf_enabled) {
        trace_amdvi_command_error(amdvi_readq(s, AMDVI_MMIO_CONTROL));
        return;
    }

    /* check if there is work to do. */
    while (s->cmdbuf_head != s->cmdbuf_tail) {
        trace_amdvi_command_exec(s->cmdbuf_head, s->cmdbuf_tail, s->cmdbuf);
        amdvi_cmdbuf_exec(s);
        s->cmdbuf_head += AMDVI_COMMAND_SIZE;
        amdvi_writeq_raw(s, AMDVI_MMIO_COMMAND_HEAD, s->cmdbuf_head);

        /* wrap head pointer */
        if (s->cmdbuf_head >= s->cmdbuf_len * AMDVI_COMMAND_SIZE) {
            s->cmdbuf_head = 0;
        }
    }
}
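
/*
 * Sketch of the ring arithmetic above with made-up numbers: assume a
 * 4-entry command buffer (cmdbuf_len = 4) and 16-byte commands. With
 * head = 0x30 and tail = 0x10, the loop executes the command at
 * cmdbuf + 0x30, advances head to 0x40, wraps it to 0 because
 * 0x40 >= 4 * 16, executes the command at cmdbuf + 0x00, and stops when
 * head reaches tail. The guest produces at tail; we consume at head.
 */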

static void amdvi_mmio_trace(hwaddr addr, unsigned size)
{
    uint8_t index = (addr & ~0x2000) / 8;

    if ((addr & 0x2000)) {
        /* high table */
        index = index >= AMDVI_MMIO_REGS_HIGH ? AMDVI_MMIO_REGS_HIGH : index;
        trace_amdvi_mmio_read(amdvi_mmio_high[index], addr, size, addr & ~0x07);
    } else {
        index = index >= AMDVI_MMIO_REGS_LOW ? AMDVI_MMIO_REGS_LOW : index;
        trace_amdvi_mmio_read(amdvi_mmio_low[index], addr, size, addr & ~0x07);
    }
}

static uint64_t amdvi_mmio_read(void *opaque, hwaddr addr, unsigned size)
{
    AMDVIState *s = opaque;
    uint64_t val = -1;

    if (addr + size > AMDVI_MMIO_SIZE) {
        trace_amdvi_mmio_read_invalid(AMDVI_MMIO_SIZE, addr, size);
        return (uint64_t)-1;
    }

    if (size == 2) {
        val = amdvi_readw(s, addr);
    } else if (size == 4) {
        val = amdvi_readl(s, addr);
    } else if (size == 8) {
        val = amdvi_readq(s, addr);
    }
    amdvi_mmio_trace(addr, size);

    return val;
}

static void amdvi_handle_control_write(AMDVIState *s)
{
    uint64_t control = amdvi_readq(s, AMDVI_MMIO_CONTROL);

    s->enabled = !!(control & AMDVI_MMIO_CONTROL_AMDVIEN);
    s->ats_enabled = !!(control & AMDVI_MMIO_CONTROL_HTTUNEN);
    s->evtlog_enabled = s->enabled && !!(control &
                        AMDVI_MMIO_CONTROL_EVENTLOGEN);

    s->evtlog_intr = !!(control & AMDVI_MMIO_CONTROL_EVENTINTEN);
    s->completion_wait_intr = !!(control & AMDVI_MMIO_CONTROL_COMWAITINTEN);
    s->cmdbuf_enabled = s->enabled && !!(control &
                        AMDVI_MMIO_CONTROL_CMDBUFLEN);
    s->ga_enabled = !!(control & AMDVI_MMIO_CONTROL_GAEN);

    /* update the flags depending on the control register */
    if (s->cmdbuf_enabled) {
        amdvi_assign_orq(s, AMDVI_MMIO_STATUS, AMDVI_MMIO_STATUS_CMDBUF_RUN);
    } else {
        amdvi_assign_andq(s, AMDVI_MMIO_STATUS, ~AMDVI_MMIO_STATUS_CMDBUF_RUN);
    }
    if (s->evtlog_enabled) {
        amdvi_assign_orq(s, AMDVI_MMIO_STATUS, AMDVI_MMIO_STATUS_EVT_RUN);
    } else {
        amdvi_assign_andq(s, AMDVI_MMIO_STATUS, ~AMDVI_MMIO_STATUS_EVT_RUN);
    }

    trace_amdvi_control_status(control);
    amdvi_cmdbuf_run(s);
}

static inline void amdvi_handle_devtab_write(AMDVIState *s)
{
    uint64_t val = amdvi_readq(s, AMDVI_MMIO_DEVICE_TABLE);
    s->devtab = (val & AMDVI_MMIO_DEVTAB_BASE_MASK);

    /* set device table length; the size field encodes (n + 1) * 4KB of table */
    s->devtab_len = ((val & AMDVI_MMIO_DEVTAB_SIZE_MASK) + 1) *
                    (AMDVI_MMIO_DEVTAB_SIZE_UNIT /
                     AMDVI_MMIO_DEVTAB_ENTRY_SIZE);
}
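
/*
 * Worked example for the computation above, assuming the 4KB size unit
 * and 32-byte device table entries from amd_iommu.h: a size field of 1
 * selects (1 + 1) * 4KB of table, i.e. (1 + 1) * (4096 / 32) = 256
 * device table entries.
 */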

static inline void amdvi_handle_cmdhead_write(AMDVIState *s)
{
    s->cmdbuf_head = amdvi_readq(s, AMDVI_MMIO_COMMAND_HEAD)
                     & AMDVI_MMIO_CMDBUF_HEAD_MASK;
    amdvi_cmdbuf_run(s);
}

static inline void amdvi_handle_cmdbase_write(AMDVIState *s)
{
    s->cmdbuf = amdvi_readq(s, AMDVI_MMIO_COMMAND_BASE)
                & AMDVI_MMIO_CMDBUF_BASE_MASK;
    s->cmdbuf_len = 1UL << (amdvi_readq(s, AMDVI_MMIO_CMDBUF_SIZE_BYTE)
                    & AMDVI_MMIO_CMDBUF_SIZE_MASK);
    s->cmdbuf_head = s->cmdbuf_tail = 0;
}

static inline void amdvi_handle_cmdtail_write(AMDVIState *s)
{
    s->cmdbuf_tail = amdvi_readq(s, AMDVI_MMIO_COMMAND_TAIL)
                     & AMDVI_MMIO_CMDBUF_TAIL_MASK;
    amdvi_cmdbuf_run(s);
}

static inline void amdvi_handle_excllim_write(AMDVIState *s)
{
    uint64_t val = amdvi_readq(s, AMDVI_MMIO_EXCL_LIMIT);
    s->excl_limit = (val & AMDVI_MMIO_EXCL_LIMIT_MASK) |
                    AMDVI_MMIO_EXCL_LIMIT_LOW;
}

static inline void amdvi_handle_evtbase_write(AMDVIState *s)
{
    uint64_t val = amdvi_readq(s, AMDVI_MMIO_EVENT_BASE);
    s->evtlog = val & AMDVI_MMIO_EVTLOG_BASE_MASK;
    s->evtlog_len = 1UL << (amdvi_readq(s, AMDVI_MMIO_EVTLOG_SIZE_BYTE)
                    & AMDVI_MMIO_EVTLOG_SIZE_MASK);
}

static inline void amdvi_handle_evttail_write(AMDVIState *s)
{
    uint64_t val = amdvi_readq(s, AMDVI_MMIO_EVENT_TAIL);
    s->evtlog_tail = val & AMDVI_MMIO_EVTLOG_TAIL_MASK;
}

static inline void amdvi_handle_evthead_write(AMDVIState *s)
{
    uint64_t val = amdvi_readq(s, AMDVI_MMIO_EVENT_HEAD);
    s->evtlog_head = val & AMDVI_MMIO_EVTLOG_HEAD_MASK;
}

static inline void amdvi_handle_pprbase_write(AMDVIState *s)
{
    uint64_t val = amdvi_readq(s, AMDVI_MMIO_PPR_BASE);
    s->ppr_log = val & AMDVI_MMIO_PPRLOG_BASE_MASK;
    s->pprlog_len = 1UL << (amdvi_readq(s, AMDVI_MMIO_PPRLOG_SIZE_BYTE)
                    & AMDVI_MMIO_PPRLOG_SIZE_MASK);
}

static inline void amdvi_handle_pprhead_write(AMDVIState *s)
{
    uint64_t val = amdvi_readq(s, AMDVI_MMIO_PPR_HEAD);
    s->pprlog_head = val & AMDVI_MMIO_PPRLOG_HEAD_MASK;
}

static inline void amdvi_handle_pprtail_write(AMDVIState *s)
{
    uint64_t val = amdvi_readq(s, AMDVI_MMIO_PPR_TAIL);
    s->pprlog_tail = val & AMDVI_MMIO_PPRLOG_TAIL_MASK;
}

/* FIXME: the register handlers run on each partial write, so software that
 * writes a register in chunks of one or two bytes may trigger a handler
 * while the register is only partially updated; Linux writes these
 * registers in 4-byte chunks, which the code below copes with.
 */
static void amdvi_mmio_reg_write(AMDVIState *s, unsigned size, uint64_t val,
                                 hwaddr addr)
{
    if (size == 2) {
        amdvi_writew(s, addr, val);
    } else if (size == 4) {
        amdvi_writel(s, addr, val);
    } else if (size == 8) {
        amdvi_writeq(s, addr, val);
    }
}

static void amdvi_mmio_write(void *opaque, hwaddr addr, uint64_t val,
                             unsigned size)
{
    AMDVIState *s = opaque;
    unsigned long offset = addr & 0x07;

    if (addr + size > AMDVI_MMIO_SIZE) {
        trace_amdvi_mmio_write("error: addr outside region: max ",
                (uint64_t)AMDVI_MMIO_SIZE, size, val, offset);
        return;
    }

    amdvi_mmio_trace(addr, size);
    switch (addr & ~0x07) {
    case AMDVI_MMIO_CONTROL:
        amdvi_mmio_reg_write(s, size, val, addr);
        amdvi_handle_control_write(s);
        break;
    case AMDVI_MMIO_DEVICE_TABLE:
        amdvi_mmio_reg_write(s, size, val, addr);
        /* set device table address; this also suffers from the inability
         * to tell whether software is done writing
         */
        if (offset || (size == 8)) {
            amdvi_handle_devtab_write(s);
        }
        break;
    case AMDVI_MMIO_COMMAND_HEAD:
        amdvi_mmio_reg_write(s, size, val, addr);
        amdvi_handle_cmdhead_write(s);
        break;
    case AMDVI_MMIO_COMMAND_BASE:
        amdvi_mmio_reg_write(s, size, val, addr);
        /* FIXME - make sure System Software has finished writing, in case
         * it writes in chunks of less than 8 bytes, in a robust way. For
         * now, this hack works for the Linux driver.
         */
        if (offset || (size == 8)) {
            amdvi_handle_cmdbase_write(s);
        }
        break;
    case AMDVI_MMIO_COMMAND_TAIL:
        amdvi_mmio_reg_write(s, size, val, addr);
        amdvi_handle_cmdtail_write(s);
        break;
    case AMDVI_MMIO_EVENT_BASE:
        amdvi_mmio_reg_write(s, size, val, addr);
        amdvi_handle_evtbase_write(s);
        break;
    case AMDVI_MMIO_EVENT_HEAD:
        amdvi_mmio_reg_write(s, size, val, addr);
        amdvi_handle_evthead_write(s);
        break;
    case AMDVI_MMIO_EVENT_TAIL:
        amdvi_mmio_reg_write(s, size, val, addr);
        amdvi_handle_evttail_write(s);
        break;
    case AMDVI_MMIO_EXCL_LIMIT:
        amdvi_mmio_reg_write(s, size, val, addr);
        amdvi_handle_excllim_write(s);
        break;
    /* PPR log base - unused for now */
    case AMDVI_MMIO_PPR_BASE:
        amdvi_mmio_reg_write(s, size, val, addr);
        amdvi_handle_pprbase_write(s);
        break;
    /* PPR log head - also unused for now */
    case AMDVI_MMIO_PPR_HEAD:
        amdvi_mmio_reg_write(s, size, val, addr);
        amdvi_handle_pprhead_write(s);
        break;
    /* PPR log tail - unused for now */
    case AMDVI_MMIO_PPR_TAIL:
        amdvi_mmio_reg_write(s, size, val, addr);
        amdvi_handle_pprtail_write(s);
        break;
    }
}

static inline uint64_t amdvi_get_perms(uint64_t entry)
{
    return (entry & (AMDVI_DEV_PERM_READ | AMDVI_DEV_PERM_WRITE)) >>
           AMDVI_DEV_PERM_SHIFT;
}

/* validate that reserved bits are honoured */
static bool amdvi_validate_dte(AMDVIState *s, uint16_t devid,
                               uint64_t *dte)
{
    if ((dte[0] & AMDVI_DTE_LOWER_QUAD_RESERVED)
        || (dte[1] & AMDVI_DTE_MIDDLE_QUAD_RESERVED)
        || (dte[2] & AMDVI_DTE_UPPER_QUAD_RESERVED) || dte[3]) {
        amdvi_log_illegaldevtab_error(s, devid,
                                      s->devtab +
                                      devid * AMDVI_DEVTAB_ENTRY_SIZE, 0);
        return false;
    }

    return true;
}

/* get a device table entry given the devid */
static bool amdvi_get_dte(AMDVIState *s, int devid, uint64_t *entry)
{
    uint32_t offset = devid * AMDVI_DEVTAB_ENTRY_SIZE;
    int i;

    if (dma_memory_read(&address_space_memory, s->devtab + offset, entry,
        AMDVI_DEVTAB_ENTRY_SIZE)) {
        trace_amdvi_dte_get_fail(s->devtab, offset);
        /* log error accessing dte */
        amdvi_log_devtab_error(s, devid, s->devtab + offset, 0);
        return false;
    }

    /* the DTE is four little-endian quadwords */
    for (i = 0; i < 4; i++) {
        entry[i] = le64_to_cpu(entry[i]);
    }
    if (!amdvi_validate_dte(s, devid, entry)) {
        trace_amdvi_invalid_dte(entry[0]);
        return false;
    }

    return true;
}

/* get pte translation mode */
static inline uint8_t get_pte_translation_mode(uint64_t pte)
{
    return (pte >> AMDVI_DEV_MODE_RSHIFT) & AMDVI_DEV_MODE_MASK;
}

static inline uint64_t pte_override_page_mask(uint64_t pte)
{
    uint8_t page_mask = 12;
    uint64_t addr = (pte & AMDVI_DEV_PT_ROOT_MASK) ^ AMDVI_DEV_PT_ROOT_MASK;
    /* find the first zero bit */
    while (addr & 1) {
        page_mask++;
        addr = addr >> 1;
    }

    return ~((1ULL << page_mask) - 1);
}

static inline uint64_t pte_get_page_mask(uint64_t oldlevel)
{
    return ~((1ULL << ((oldlevel * 9) + 3)) - 1);
}
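
/*
 * The mask math, spelled out (illustrative): each level translates 9
 * address bits on top of the 12-bit page offset, so
 *   pte_get_page_mask(1) = ~((1ULL << 12) - 1)   -> 4KB pages
 *   pte_get_page_mask(2) = ~((1ULL << 21) - 1)   -> 2MB pages
 *   pte_get_page_mask(3) = ~((1ULL << 30) - 1)   -> 1GB pages
 */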

static inline uint64_t amdvi_get_pte_entry(AMDVIState *s, uint64_t pte_addr,
                                           uint16_t devid)
{
    uint64_t pte;

    if (dma_memory_read(&address_space_memory, pte_addr, &pte, sizeof(pte))) {
        trace_amdvi_get_pte_hwerror(pte_addr);
        amdvi_log_pagetab_error(s, devid, pte_addr, 0);
        return 0;
    }

    return le64_to_cpu(pte);
}

static void amdvi_page_walk(AMDVIAddressSpace *as, uint64_t *dte,
                            IOMMUTLBEntry *ret, unsigned perms,
                            hwaddr addr)
{
    unsigned level, present, pte_perms, oldlevel;
    uint64_t pte = dte[0], pte_addr, page_mask;

    /* make sure the DTE has TV = 1 */
    if (pte & AMDVI_DEV_TRANSLATION_VALID) {
        level = get_pte_translation_mode(pte);
        if (level >= 7) {
            trace_amdvi_mode_invalid(level, addr);
            return;
        }
        if (level == 0) {
            goto no_remap;
        }

        /* walk until we reach a leaf PTE or a PTE encoding a huge page */
        while (level > 0) {
            pte_perms = amdvi_get_perms(pte);
            present = pte & 1;
            if (!present || perms != (perms & pte_perms)) {
                amdvi_page_fault(as->iommu_state, as->devfn, addr, perms);
                trace_amdvi_page_fault(addr);
                return;
            }

            /* go to the next lower level */
            pte_addr = pte & AMDVI_DEV_PT_ROOT_MASK;
            /* add offset and load pte */
            pte_addr += ((addr >> (3 + 9 * level)) & 0x1FF) << 3;
            pte = amdvi_get_pte_entry(as->iommu_state, pte_addr, as->devfn);
            if (!pte) {
                return;
            }
            oldlevel = level;
            level = get_pte_translation_mode(pte);
            if (level == 0x7) {
                break;
            }
        }

        if (level == 0x7) {
            page_mask = pte_override_page_mask(pte);
        } else {
            page_mask = pte_get_page_mask(oldlevel);
        }

        /* get access permissions from pte */
        ret->iova = addr & page_mask;
        ret->translated_addr = (pte & AMDVI_DEV_PT_ROOT_MASK) & page_mask;
        ret->addr_mask = ~page_mask;
        ret->perm = amdvi_get_perms(pte);
        return;
    }
no_remap:
    ret->iova = addr & AMDVI_PAGE_MASK_4K;
    ret->translated_addr = addr & AMDVI_PAGE_MASK_4K;
    ret->addr_mask = ~AMDVI_PAGE_MASK_4K;
    ret->perm = amdvi_get_perms(pte);
}
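
/*
 * Index extraction in the walk above, as a worked example with a made-up
 * address: for addr = 0x40201000 at level = 2,
 *   (addr >> (3 + 9 * 2)) & 0x1FF = (0x40201000 >> 21) & 0x1FF = 0x1
 * selects entry 1 of the level-2 table, and the final '<< 3' converts
 * the entry index into a byte offset (8 bytes per PTE).
 */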

static void amdvi_do_translate(AMDVIAddressSpace *as, hwaddr addr,
                               bool is_write, IOMMUTLBEntry *ret)
{
    AMDVIState *s = as->iommu_state;
    uint16_t devid = PCI_BUILD_BDF(as->bus_num, as->devfn);
    AMDVIIOTLBEntry *iotlb_entry = amdvi_iotlb_lookup(s, addr, devid);
    uint64_t entry[4];

    if (iotlb_entry) {
        trace_amdvi_iotlb_hit(PCI_BUS_NUM(devid), PCI_SLOT(devid),
                PCI_FUNC(devid), addr, iotlb_entry->translated_addr);
        ret->iova = addr & ~iotlb_entry->page_mask;
        ret->translated_addr = iotlb_entry->translated_addr;
        ret->addr_mask = iotlb_entry->page_mask;
        ret->perm = iotlb_entry->perms;
        return;
    }

    if (!amdvi_get_dte(s, devid, entry)) {
        return;
    }

    /* devices with V = 0 are not translated */
    if (!(entry[0] & AMDVI_DEV_VALID)) {
        goto out;
    }

    amdvi_page_walk(as, entry, ret,
                    is_write ? AMDVI_PERM_WRITE : AMDVI_PERM_READ, addr);

    amdvi_update_iotlb(s, devid, addr, *ret,
                       entry[1] & AMDVI_DEV_DOMID_ID_MASK);
    return;

out:
    ret->iova = addr & AMDVI_PAGE_MASK_4K;
    ret->translated_addr = addr & AMDVI_PAGE_MASK_4K;
    ret->addr_mask = ~AMDVI_PAGE_MASK_4K;
    ret->perm = IOMMU_RW;
}

static inline bool amdvi_is_interrupt_addr(hwaddr addr)
{
    return addr >= AMDVI_INT_ADDR_FIRST && addr <= AMDVI_INT_ADDR_LAST;
}
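
/*
 * AMDVI_INT_ADDR_FIRST is the 0xFEE00000 MSI window: DMA writes landing
 * there are interrupt messages rather than memory accesses, which is why
 * amdvi_translate() below maps that page write-only and the higher
 * priority iommu_ir subregion intercepts the actual write.
 */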

static IOMMUTLBEntry amdvi_translate(IOMMUMemoryRegion *iommu, hwaddr addr,
                                     IOMMUAccessFlags flag, int iommu_idx)
{
    AMDVIAddressSpace *as = container_of(iommu, AMDVIAddressSpace, iommu);
    AMDVIState *s = as->iommu_state;
    IOMMUTLBEntry ret = {
        .target_as = &address_space_memory,
        .iova = addr,
        .translated_addr = 0,
        .addr_mask = ~(hwaddr)0,
        .perm = IOMMU_NONE
    };

    if (!s->enabled) {
        /* AMDVI disabled - corresponds to iommu=off, not to a failure
         * to provide any parameter
         */
        ret.iova = addr & AMDVI_PAGE_MASK_4K;
        ret.translated_addr = addr & AMDVI_PAGE_MASK_4K;
        ret.addr_mask = ~AMDVI_PAGE_MASK_4K;
        ret.perm = IOMMU_RW;
        return ret;
    } else if (amdvi_is_interrupt_addr(addr)) {
        ret.iova = addr & AMDVI_PAGE_MASK_4K;
        ret.translated_addr = addr & AMDVI_PAGE_MASK_4K;
        ret.addr_mask = ~AMDVI_PAGE_MASK_4K;
        ret.perm = IOMMU_WO;
        return ret;
    }

    amdvi_do_translate(as, addr, flag & IOMMU_WO, &ret);
    trace_amdvi_translation_result(as->bus_num, PCI_SLOT(as->devfn),
            PCI_FUNC(as->devfn), addr, ret.translated_addr);
    return ret;
}

static int amdvi_get_irte(AMDVIState *s, MSIMessage *origin, uint64_t *dte,
                          union irte *irte, uint16_t devid)
{
    uint64_t irte_root, offset;

    irte_root = dte[2] & AMDVI_IR_PHYS_ADDR_MASK;
    offset = (origin->data & AMDVI_IRTE_OFFSET) << 2;

    trace_amdvi_ir_irte(irte_root, offset);

    if (dma_memory_read(&address_space_memory, irte_root + offset,
                        irte, sizeof(*irte))) {
        trace_amdvi_ir_err("failed to get irte");
        return -AMDVI_IR_GET_IRTE;
    }

    trace_amdvi_ir_irte_val(irte->val);

    return 0;
}

static int amdvi_int_remap_legacy(AMDVIState *iommu,
                                  MSIMessage *origin,
                                  MSIMessage *translated,
                                  uint64_t *dte,
                                  X86IOMMUIrq *irq,
                                  uint16_t sid)
{
    int ret;
    union irte irte;

    /* get interrupt remapping table entry */
    ret = amdvi_get_irte(iommu, origin, dte, &irte, sid);
    if (ret < 0) {
        return ret;
    }

    if (!irte.fields.valid) {
        trace_amdvi_ir_target_abort("RemapEn is disabled");
        return -AMDVI_IR_TARGET_ABORT;
    }

    if (irte.fields.guest_mode) {
        error_report_once("guest mode is not zero");
        return -AMDVI_IR_ERR;
    }

    if (irte.fields.int_type > AMDVI_IOAPIC_INT_TYPE_ARBITRATED) {
        error_report_once("reserved int_type");
        return -AMDVI_IR_ERR;
    }

    irq->delivery_mode = irte.fields.int_type;
    irq->vector = irte.fields.vector;
    irq->dest_mode = irte.fields.dm;
    irq->redir_hint = irte.fields.rq_eoi;
    irq->dest = irte.fields.destination;

    return 0;
}

static int amdvi_get_irte_ga(AMDVIState *s, MSIMessage *origin, uint64_t *dte,
                             struct irte_ga *irte, uint16_t devid)
{
    uint64_t irte_root, offset;

    irte_root = dte[2] & AMDVI_IR_PHYS_ADDR_MASK;
    offset = (origin->data & AMDVI_IRTE_OFFSET) << 4;
    trace_amdvi_ir_irte(irte_root, offset);

    if (dma_memory_read(&address_space_memory, irte_root + offset,
                        irte, sizeof(*irte))) {
        trace_amdvi_ir_err("failed to get irte_ga");
        return -AMDVI_IR_GET_IRTE;
    }

    trace_amdvi_ir_irte_ga_val(irte->hi.val, irte->lo.val);
    return 0;
}
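
/*
 * Offset scaling in the two lookups, side by side (illustrative): the low
 * bits of the MSI data give the IRTE index; a legacy IRTE is 4 bytes and
 * a guest-APIC IRTE is 16 bytes, so for index 2 amdvi_get_irte() reads at
 * irte_root + (2 << 2) = +8 while amdvi_get_irte_ga() reads at
 * irte_root + (2 << 4) = +32.
 */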

static int amdvi_int_remap_ga(AMDVIState *iommu,
                              MSIMessage *origin,
                              MSIMessage *translated,
                              uint64_t *dte,
                              X86IOMMUIrq *irq,
                              uint16_t sid)
{
    int ret;
    struct irte_ga irte;

    /* get interrupt remapping table entry */
    ret = amdvi_get_irte_ga(iommu, origin, dte, &irte, sid);
    if (ret < 0) {
        return ret;
    }

    if (!irte.lo.fields_remap.valid) {
        trace_amdvi_ir_target_abort("RemapEn is disabled");
        return -AMDVI_IR_TARGET_ABORT;
    }

    if (irte.lo.fields_remap.guest_mode) {
        error_report_once("guest mode is not zero");
        return -AMDVI_IR_ERR;
    }

    if (irte.lo.fields_remap.int_type > AMDVI_IOAPIC_INT_TYPE_ARBITRATED) {
        error_report_once("reserved int_type is set");
        return -AMDVI_IR_ERR;
    }

    irq->delivery_mode = irte.lo.fields_remap.int_type;
    irq->vector = irte.hi.fields.vector;
    irq->dest_mode = irte.lo.fields_remap.dm;
    irq->redir_hint = irte.lo.fields_remap.rq_eoi;
    irq->dest = irte.lo.fields_remap.destination;

    return 0;
}

static int __amdvi_int_remap_msi(AMDVIState *iommu,
                                 MSIMessage *origin,
                                 MSIMessage *translated,
                                 uint64_t *dte,
                                 X86IOMMUIrq *irq,
                                 uint16_t sid)
{
    int ret;
    uint8_t int_ctl;

    int_ctl = (dte[2] >> AMDVI_IR_INTCTL_SHIFT) & 3;
    trace_amdvi_ir_intctl(int_ctl);

    switch (int_ctl) {
    case AMDVI_IR_INTCTL_PASS:
        memcpy(translated, origin, sizeof(*origin));
        return 0;
    case AMDVI_IR_INTCTL_REMAP:
        break;
    case AMDVI_IR_INTCTL_ABORT:
        trace_amdvi_ir_target_abort("int_ctl abort");
        return -AMDVI_IR_TARGET_ABORT;
    default:
        trace_amdvi_ir_err("int_ctl reserved");
        return -AMDVI_IR_ERR;
    }

    if (iommu->ga_enabled) {
        ret = amdvi_int_remap_ga(iommu, origin, translated, dte, irq, sid);
    } else {
        ret = amdvi_int_remap_legacy(iommu, origin, translated, dte, irq, sid);
    }

    return ret;
}

/* Interrupt remapping for MSI/MSI-X entry */
static int amdvi_int_remap_msi(AMDVIState *iommu,
                               MSIMessage *origin,
                               MSIMessage *translated,
                               uint16_t sid)
{
    int ret = 0;
    uint64_t pass = 0;
    uint64_t dte[4] = { 0 };
    X86IOMMUIrq irq = { 0 };
    uint8_t dest_mode, delivery_mode;

    assert(origin && translated);

    /*
     * When the IOMMU is enabled, an interrupt remap request will come
     * either from the IO-APIC or a PCI device. If the interrupt is from a
     * PCI device it carries a valid requester id, but if it is from the
     * IO-APIC the requester id will be invalid.
     */
    if (sid == X86_IOMMU_SID_INVALID) {
        sid = AMDVI_IOAPIC_SB_DEVID;
    }

    trace_amdvi_ir_remap_msi_req(origin->address, origin->data, sid);

    /* check if device table entry is set before we go further. */
    if (!iommu || !iommu->devtab_len) {
        memcpy(translated, origin, sizeof(*origin));
        goto out;
    }

    if (!amdvi_get_dte(iommu, sid, dte)) {
        return -AMDVI_IR_ERR;
    }

    /* Check if IR is enabled in DTE */
    if (!(dte[2] & AMDVI_IR_REMAP_ENABLE)) {
        memcpy(translated, origin, sizeof(*origin));
        goto out;
    }

    /* validate that we are configured with intremap=on */
    if (!x86_iommu_ir_supported(X86_IOMMU_DEVICE(iommu))) {
        trace_amdvi_err("Interrupt remapping is enabled in the guest but "
                        "not in the host. Use intremap=on to enable interrupt "
                        "remapping in amd-iommu.");
        return -AMDVI_IR_ERR;
    }

    if (origin->address & AMDVI_MSI_ADDR_HI_MASK) {
        trace_amdvi_err("MSI address high 32 bits non-zero when "
                        "Interrupt Remapping enabled.");
        return -AMDVI_IR_ERR;
    }

    if ((origin->address & AMDVI_MSI_ADDR_LO_MASK) != APIC_DEFAULT_ADDRESS) {
        trace_amdvi_err("MSI is not from IOAPIC.");
        return -AMDVI_IR_ERR;
    }

    /*
     * Bits [10:8] of the MSI data register encode the upstream interrupt
     * type.
     *
     * See MSI/MSI-X format:
     * https://pdfs.semanticscholar.org/presentation/9420/c279e942eca568157711ef5c92b800c40a79.pdf
     * (page 5)
     */
    delivery_mode = (origin->data >> MSI_DATA_DELIVERY_MODE_SHIFT) & 7;

    switch (delivery_mode) {
    case AMDVI_IOAPIC_INT_TYPE_FIXED:
    case AMDVI_IOAPIC_INT_TYPE_ARBITRATED:
        trace_amdvi_ir_delivery_mode("fixed/arbitrated");
        ret = __amdvi_int_remap_msi(iommu, origin, translated, dte, &irq, sid);
        if (ret < 0) {
            goto remap_fail;
        } else {
            /* Translate IRQ to MSI messages */
            x86_iommu_irq_to_msi_message(&irq, translated);
            goto out;
        }
        break;
    case AMDVI_IOAPIC_INT_TYPE_SMI:
        error_report("SMI is not supported!");
        ret = -AMDVI_IR_ERR;
        break;
    case AMDVI_IOAPIC_INT_TYPE_NMI:
        pass = dte[3] & AMDVI_DEV_NMI_PASS_MASK;
        trace_amdvi_ir_delivery_mode("nmi");
        break;
    case AMDVI_IOAPIC_INT_TYPE_INIT:
        pass = dte[3] & AMDVI_DEV_INT_PASS_MASK;
        trace_amdvi_ir_delivery_mode("init");
        break;
    case AMDVI_IOAPIC_INT_TYPE_EINT:
        pass = dte[3] & AMDVI_DEV_EINT_PASS_MASK;
        trace_amdvi_ir_delivery_mode("eint");
        break;
    default:
        trace_amdvi_ir_delivery_mode("unsupported delivery_mode");
        ret = -AMDVI_IR_ERR;
        break;
    }

    if (ret < 0) {
        goto remap_fail;
    }

    /*
     * The MSI address register bit[2] is used to get the destination
     * mode. The dest_mode 1 is valid for fixed and arbitrated interrupts
     * only.
     */
    dest_mode = (origin->address >> MSI_ADDR_DEST_MODE_SHIFT) & 1;
    if (dest_mode) {
        trace_amdvi_ir_err("invalid dest_mode");
        ret = -AMDVI_IR_ERR;
        goto remap_fail;
    }

    if (pass) {
        memcpy(translated, origin, sizeof(*origin));
    } else {
        trace_amdvi_ir_err("passthrough is not enabled");
        ret = -AMDVI_IR_ERR;
        goto remap_fail;
    }

out:
    trace_amdvi_ir_remap_msi(origin->address, origin->data,
                             translated->address, translated->data);
    return 0;

remap_fail:
    return ret;
}

static int amdvi_int_remap(X86IOMMUState *iommu,
                           MSIMessage *origin,
                           MSIMessage *translated,
                           uint16_t sid)
{
    return amdvi_int_remap_msi(AMD_IOMMU_DEVICE(iommu), origin,
                               translated, sid);
}

static MemTxResult amdvi_mem_ir_write(void *opaque, hwaddr addr,
                                      uint64_t value, unsigned size,
                                      MemTxAttrs attrs)
{
    int ret;
    MSIMessage from = { 0, 0 }, to = { 0, 0 };
    uint16_t sid = AMDVI_IOAPIC_SB_DEVID;

    from.address = (uint64_t) addr + AMDVI_INT_ADDR_FIRST;
    from.data = (uint32_t) value;

    trace_amdvi_mem_ir_write_req(addr, value, size);

    if (!attrs.unspecified) {
        /* We have explicit Source ID */
        sid = attrs.requester_id;
    }

    ret = amdvi_int_remap_msi(opaque, &from, &to, sid);
    if (ret < 0) {
        /* TODO: log the event using IOMMU log event interface */
        error_report_once("failed to remap interrupt from devid 0x%x", sid);
        return MEMTX_ERROR;
    }

    apic_get_class()->send_msi(&to);

    trace_amdvi_mem_ir_write(to.address, to.data);
    return MEMTX_OK;
}

static MemTxResult amdvi_mem_ir_read(void *opaque, hwaddr addr,
                                     uint64_t *data, unsigned size,
                                     MemTxAttrs attrs)
{
    return MEMTX_OK;
}

static const MemoryRegionOps amdvi_ir_ops = {
    .read_with_attrs = amdvi_mem_ir_read,
    .write_with_attrs = amdvi_mem_ir_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
    .impl = {
        .min_access_size = 4,
        .max_access_size = 4,
    },
    .valid = {
        .min_access_size = 4,
        .max_access_size = 4,
    }
};

static AddressSpace *amdvi_host_dma_iommu(PCIBus *bus, void *opaque, int devfn)
{
    char name[128];
    AMDVIState *s = opaque;
    AMDVIAddressSpace **iommu_as, *amdvi_dev_as;
    int bus_num = pci_bus_num(bus);

    iommu_as = s->address_spaces[bus_num];

    /* allocate memory during the first run */
    if (!iommu_as) {
        iommu_as = g_malloc0(sizeof(AMDVIAddressSpace *) * PCI_DEVFN_MAX);
        s->address_spaces[bus_num] = iommu_as;
    }

    /* set up AMD-Vi region */
    if (!iommu_as[devfn]) {
        snprintf(name, sizeof(name), "amd_iommu_devfn_%d", devfn);

        iommu_as[devfn] = g_malloc0(sizeof(AMDVIAddressSpace));
        iommu_as[devfn]->bus_num = (uint8_t)bus_num;
        iommu_as[devfn]->devfn = (uint8_t)devfn;
        iommu_as[devfn]->iommu_state = s;

        amdvi_dev_as = iommu_as[devfn];

        /*
         * Memory region relationships look like this (the address ranges
         * show only the lower 32 bits, for brevity):
         *
         * |-----------------+-------------------+----------|
         * | Name            | Address range     | Priority |
         * |-----------------+-------------------+----------+
         * | amdvi_root      | 00000000-ffffffff |        0 |
         * |  amdvi_iommu    | 00000000-ffffffff |        1 |
         * |  amdvi_iommu_ir | fee00000-feefffff |       64 |
         * |-----------------+-------------------+----------|
         */
        memory_region_init_iommu(&amdvi_dev_as->iommu,
                                 sizeof(amdvi_dev_as->iommu),
                                 TYPE_AMD_IOMMU_MEMORY_REGION,
                                 OBJECT(s),
                                 "amd_iommu", UINT64_MAX);
        memory_region_init(&amdvi_dev_as->root, OBJECT(s),
                           "amdvi_root", UINT64_MAX);
        address_space_init(&amdvi_dev_as->as, &amdvi_dev_as->root, name);
        memory_region_init_io(&amdvi_dev_as->iommu_ir, OBJECT(s),
                              &amdvi_ir_ops, s, "amd_iommu_ir",
                              AMDVI_INT_ADDR_SIZE);
        memory_region_add_subregion_overlap(&amdvi_dev_as->root,
                                            AMDVI_INT_ADDR_FIRST,
                                            &amdvi_dev_as->iommu_ir,
                                            64);
        memory_region_add_subregion_overlap(&amdvi_dev_as->root, 0,
                                            MEMORY_REGION(&amdvi_dev_as->iommu),
                                            1);
    }
    return &iommu_as[devfn]->as;
}

static const MemoryRegionOps mmio_mem_ops = {
    .read = amdvi_mmio_read,
    .write = amdvi_mmio_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
    .impl = {
        .min_access_size = 1,
        .max_access_size = 8,
        .unaligned = false,
    },
    .valid = {
        .min_access_size = 1,
        .max_access_size = 8,
    }
};

static int amdvi_iommu_notify_flag_changed(IOMMUMemoryRegion *iommu,
                                           IOMMUNotifierFlag old,
                                           IOMMUNotifierFlag new,
                                           Error **errp)
{
    AMDVIAddressSpace *as = container_of(iommu, AMDVIAddressSpace, iommu);

    if (new & IOMMU_NOTIFIER_MAP) {
        error_setg(errp,
                   "device %02x.%02x.%x requires iommu notifier which is not "
                   "currently supported", as->bus_num, PCI_SLOT(as->devfn),
                   PCI_FUNC(as->devfn));
        return -EINVAL;
    }
    return 0;
}

static void amdvi_init(AMDVIState *s)
{
    amdvi_iotlb_reset(s);

    s->devtab_len = 0;
    s->cmdbuf_len = 0;
    s->cmdbuf_head = 0;
    s->cmdbuf_tail = 0;
    s->evtlog_head = 0;
    s->evtlog_tail = 0;
    s->excl_enabled = false;
    s->excl_allow = false;
    s->mmio_enabled = false;
    s->enabled = false;
    s->ats_enabled = false;
    s->cmdbuf_enabled = false;

    /* reset MMIO */
    memset(s->mmior, 0, AMDVI_MMIO_SIZE);
    amdvi_set_quad(s, AMDVI_MMIO_EXT_FEATURES, AMDVI_EXT_FEATURES,
            0xffffffffffffffef, 0);
    amdvi_set_quad(s, AMDVI_MMIO_STATUS, 0, 0x98, 0x67);

    /* reset device ident */
    pci_config_set_vendor_id(s->pci.dev.config, PCI_VENDOR_ID_AMD);
    pci_config_set_prog_interface(s->pci.dev.config, 0);
    pci_config_set_device_id(s->pci.dev.config, s->devid);
    pci_config_set_class(s->pci.dev.config, 0x0806);

    /* reset AMDVI specific capabilities, all r/o */
    pci_set_long(s->pci.dev.config + s->capab_offset, AMDVI_CAPAB_FEATURES);
    pci_set_long(s->pci.dev.config + s->capab_offset + AMDVI_CAPAB_BAR_LOW,
                 s->mmio.addr & ~(0xffff0000));
    pci_set_long(s->pci.dev.config + s->capab_offset + AMDVI_CAPAB_BAR_HIGH,
                (s->mmio.addr & ~(0xffff)) >> 16);
    pci_set_long(s->pci.dev.config + s->capab_offset + AMDVI_CAPAB_RANGE,
                 0xff000000);
    pci_set_long(s->pci.dev.config + s->capab_offset + AMDVI_CAPAB_MISC,
            AMDVI_MAX_PH_ADDR | AMDVI_MAX_GVA_ADDR | AMDVI_MAX_VA_ADDR);
}

static void amdvi_reset(DeviceState *dev)
{
    AMDVIState *s = AMD_IOMMU_DEVICE(dev);

    msi_reset(&s->pci.dev);
    amdvi_init(s);
}

static void amdvi_realize(DeviceState *dev, Error **errp)
{
    int ret = 0;
    AMDVIState *s = AMD_IOMMU_DEVICE(dev);
    X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(dev);
    MachineState *ms = MACHINE(qdev_get_machine());
    PCMachineState *pcms = PC_MACHINE(ms);
    X86MachineState *x86ms = X86_MACHINE(ms);
    PCIBus *bus = pcms->bus;

    s->iotlb = g_hash_table_new_full(amdvi_uint64_hash,
                                     amdvi_uint64_equal, g_free, g_free);

    /* This device should take care of IOMMU PCI properties */
    x86_iommu->type = TYPE_AMD;
    qdev_set_parent_bus(DEVICE(&s->pci), &bus->qbus);
    object_property_set_bool(OBJECT(&s->pci), true, "realized", errp);
    ret = pci_add_capability(&s->pci.dev, AMDVI_CAPAB_ID_SEC, 0,
                             AMDVI_CAPAB_SIZE, errp);
    if (ret < 0) {
        return;
    }
    s->capab_offset = ret;

    ret = pci_add_capability(&s->pci.dev, PCI_CAP_ID_MSI, 0,
                             AMDVI_CAPAB_REG_SIZE, errp);
    if (ret < 0) {
        return;
    }
    ret = pci_add_capability(&s->pci.dev, PCI_CAP_ID_HT, 0,
                             AMDVI_CAPAB_REG_SIZE, errp);
    if (ret < 0) {
        return;
    }

    /* Pseudo address space under root PCI bus. */
    x86ms->ioapic_as = amdvi_host_dma_iommu(bus, s, AMDVI_IOAPIC_SB_DEVID);

    /* set up MMIO */
    memory_region_init_io(&s->mmio, OBJECT(s), &mmio_mem_ops, s, "amdvi-mmio",
                          AMDVI_MMIO_SIZE);

    sysbus_init_mmio(SYS_BUS_DEVICE(s), &s->mmio);
    sysbus_mmio_map(SYS_BUS_DEVICE(s), 0, AMDVI_BASE_ADDR);
    pci_setup_iommu(bus, amdvi_host_dma_iommu, s);
    s->devid = object_property_get_int(OBJECT(&s->pci), "addr", errp);
    msi_init(&s->pci.dev, 0, 1, true, false, errp);
    amdvi_init(s);
}

static const VMStateDescription vmstate_amdvi = {
    .name = "amd-iommu",
    .unmigratable = 1
};

static void amdvi_instance_init(Object *obj)
{
    AMDVIState *s = AMD_IOMMU_DEVICE(obj);

    object_initialize(&s->pci, sizeof(s->pci), TYPE_AMD_IOMMU_PCI);
}

static void amdvi_class_init(ObjectClass *klass, void *data)
{
    DeviceClass *dc = DEVICE_CLASS(klass);
    X86IOMMUClass *dc_class = X86_IOMMU_CLASS(klass);

    dc->reset = amdvi_reset;
    dc->vmsd = &vmstate_amdvi;
    dc->hotpluggable = false;
    dc_class->realize = amdvi_realize;
    dc_class->int_remap = amdvi_int_remap;
    /* Supported by the pc-q35-* machine types */
    dc->user_creatable = true;
    set_bit(DEVICE_CATEGORY_MISC, dc->categories);
    dc->desc = "AMD IOMMU (AMD-Vi) DMA Remapping device";
}

static const TypeInfo amdvi = {
    .name = TYPE_AMD_IOMMU_DEVICE,
    .parent = TYPE_X86_IOMMU_DEVICE,
    .instance_size = sizeof(AMDVIState),
    .instance_init = amdvi_instance_init,
    .class_init = amdvi_class_init
};

static const TypeInfo amdviPCI = {
    .name = "AMDVI-PCI",
    .parent = TYPE_PCI_DEVICE,
    .instance_size = sizeof(AMDVIPCIState),
    .interfaces = (InterfaceInfo[]) {
        { INTERFACE_CONVENTIONAL_PCI_DEVICE },
        { },
    },
};

static void amdvi_iommu_memory_region_class_init(ObjectClass *klass, void *data)
{
    IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_CLASS(klass);

    imrc->translate = amdvi_translate;
    imrc->notify_flag_changed = amdvi_iommu_notify_flag_changed;
}

static const TypeInfo amdvi_iommu_memory_region_info = {
    .parent = TYPE_IOMMU_MEMORY_REGION,
    .name = TYPE_AMD_IOMMU_MEMORY_REGION,
    .class_init = amdvi_iommu_memory_region_class_init,
};

static void amdviPCI_register_types(void)
{
    type_register_static(&amdviPCI);
    type_register_static(&amdvi);
    type_register_static(&amdvi_iommu_memory_region_info);
}

type_init(amdviPCI_register_types);