qemu/hw/i386/amd_iommu.c
/*
 * QEMU emulation of AMD IOMMU (AMD-Vi)
 *
 * Copyright (C) 2011 Eduard - Gabriel Munteanu
 * Copyright (C) 2015, 2016 David Kiarie Kahurani
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.

 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.

 * You should have received a copy of the GNU General Public License along
 * with this program; if not, see <http://www.gnu.org/licenses/>.
 *
 * Cache implementation inspired by hw/i386/intel_iommu.c
 */

#include "qemu/osdep.h"
#include "hw/i386/pc.h"
#include "hw/pci/msi.h"
#include "hw/pci/pci_bus.h"
#include "migration/vmstate.h"
#include "amd_iommu.h"
#include "qapi/error.h"
#include "qemu/error-report.h"
#include "hw/i386/apic_internal.h"
#include "trace.h"
#include "hw/i386/apic-msidef.h"

/* used AMD-Vi MMIO registers */
const char *amdvi_mmio_low[] = {
    "AMDVI_MMIO_DEVTAB_BASE",
    "AMDVI_MMIO_CMDBUF_BASE",
    "AMDVI_MMIO_EVTLOG_BASE",
    "AMDVI_MMIO_CONTROL",
    "AMDVI_MMIO_EXCL_BASE",
    "AMDVI_MMIO_EXCL_LIMIT",
    "AMDVI_MMIO_EXT_FEATURES",
    "AMDVI_MMIO_PPR_BASE",
    "UNHANDLED"
};
const char *amdvi_mmio_high[] = {
    "AMDVI_MMIO_COMMAND_HEAD",
    "AMDVI_MMIO_COMMAND_TAIL",
    "AMDVI_MMIO_EVTLOG_HEAD",
    "AMDVI_MMIO_EVTLOG_TAIL",
    "AMDVI_MMIO_STATUS",
    "AMDVI_MMIO_PPR_HEAD",
    "AMDVI_MMIO_PPR_TAIL",
    "UNHANDLED"
};

struct AMDVIAddressSpace {
    uint8_t bus_num;            /* bus number                           */
    uint8_t devfn;              /* device function                      */
    AMDVIState *iommu_state;    /* AMDVI - one per machine              */
    MemoryRegion root;          /* AMDVI Root memory map region         */
    IOMMUMemoryRegion iommu;    /* Device's address translation region  */
    MemoryRegion iommu_ir;      /* Device's interrupt remapping region  */
    AddressSpace as;            /* device's corresponding address space */
};

/* AMDVI cache entry */
typedef struct AMDVIIOTLBEntry {
    uint16_t domid;             /* assigned domain id  */
    uint16_t devid;             /* device owning entry */
    uint64_t perms;             /* access permissions  */
    uint64_t translated_addr;   /* translated address  */
    uint64_t page_mask;         /* physical page size  */
} AMDVIIOTLBEntry;

/* configure MMIO registers at startup/reset */
static void amdvi_set_quad(AMDVIState *s, hwaddr addr, uint64_t val,
                           uint64_t romask, uint64_t w1cmask)
{
    stq_le_p(&s->mmior[addr], val);
    stq_le_p(&s->romask[addr], romask);
    stq_le_p(&s->w1cmask[addr], w1cmask);
}

static uint16_t amdvi_readw(AMDVIState *s, hwaddr addr)
{
    return lduw_le_p(&s->mmior[addr]);
}

static uint32_t amdvi_readl(AMDVIState *s, hwaddr addr)
{
    return ldl_le_p(&s->mmior[addr]);
}

static uint64_t amdvi_readq(AMDVIState *s, hwaddr addr)
{
    return ldq_le_p(&s->mmior[addr]);
}

/* internal write */
static void amdvi_writeq_raw(AMDVIState *s, hwaddr addr, uint64_t val)
{
    stq_le_p(&s->mmior[addr], val);
}

/* external write */
static void amdvi_writew(AMDVIState *s, hwaddr addr, uint16_t val)
{
    uint16_t romask = lduw_le_p(&s->romask[addr]);
    uint16_t w1cmask = lduw_le_p(&s->w1cmask[addr]);
    uint16_t oldval = lduw_le_p(&s->mmior[addr]);
    stw_le_p(&s->mmior[addr],
            ((oldval & romask) | (val & ~romask)) & ~(val & w1cmask));
}

static void amdvi_writel(AMDVIState *s, hwaddr addr, uint32_t val)
{
    uint32_t romask = ldl_le_p(&s->romask[addr]);
    uint32_t w1cmask = ldl_le_p(&s->w1cmask[addr]);
    uint32_t oldval = ldl_le_p(&s->mmior[addr]);
    stl_le_p(&s->mmior[addr],
            ((oldval & romask) | (val & ~romask)) & ~(val & w1cmask));
}

static void amdvi_writeq(AMDVIState *s, hwaddr addr, uint64_t val)
{
    uint64_t romask = ldq_le_p(&s->romask[addr]);
    uint64_t w1cmask = ldq_le_p(&s->w1cmask[addr]);
    uint64_t oldval = ldq_le_p(&s->mmior[addr]);
    stq_le_p(&s->mmior[addr],
            ((oldval & romask) | (val & ~romask)) & ~(val & w1cmask));
}
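
/*
 * Worked example of the update rule above (the values are illustrative,
 * not taken from a real register): with romask = 0xF0, w1cmask = 0x0F,
 * oldval = 0xA5 and val = 0x13, the read-only bits keep their old
 * contents (0xA5 & 0xF0 = 0xA0), the writable bits take the new value
 * (0x13 & ~0xF0 = 0x03), and any write-1-to-clear bit written as 1 is
 * then cleared ((0xA0 | 0x03) & ~0x03), leaving 0xA0 in the register.
 */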

/* test whether any of the given mask bits are set in a 64-bit register */
static bool amdvi_test_mask(AMDVIState *s, hwaddr addr, uint64_t val)
{
    return amdvi_readq(s, addr) & val;
}

/* OR a 64-bit register with a 64-bit value storing result in the register */
static void amdvi_assign_orq(AMDVIState *s, hwaddr addr, uint64_t val)
{
    amdvi_writeq_raw(s, addr, amdvi_readq(s, addr) | val);
}

/* AND a 64-bit register with a 64-bit value storing result in the register */
static void amdvi_assign_andq(AMDVIState *s, hwaddr addr, uint64_t val)
{
    amdvi_writeq_raw(s, addr, amdvi_readq(s, addr) & val);
}

static void amdvi_generate_msi_interrupt(AMDVIState *s)
{
    MSIMessage msg = {};
    MemTxAttrs attrs = {
        .requester_id = pci_requester_id(&s->pci.dev)
    };

    if (msi_enabled(&s->pci.dev)) {
        msg = msi_get_message(&s->pci.dev, 0);
        address_space_stl_le(&address_space_memory, msg.address, msg.data,
                             attrs, NULL);
    }
}

static void amdvi_log_event(AMDVIState *s, uint64_t *evt)
{
    /* event logging not enabled */
    if (!s->evtlog_enabled || amdvi_test_mask(s, AMDVI_MMIO_STATUS,
        AMDVI_MMIO_STATUS_EVT_OVF)) {
        return;
    }

    /* event log buffer full */
    if (s->evtlog_tail >= s->evtlog_len) {
        amdvi_assign_orq(s, AMDVI_MMIO_STATUS, AMDVI_MMIO_STATUS_EVT_OVF);
        /* generate interrupt */
        amdvi_generate_msi_interrupt(s);
        return;
    }

    if (dma_memory_write(&address_space_memory, s->evtlog + s->evtlog_tail,
                         evt, AMDVI_EVENT_LEN)) {
        trace_amdvi_evntlog_fail(s->evtlog, s->evtlog_tail);
    }

    s->evtlog_tail += AMDVI_EVENT_LEN;
    amdvi_assign_orq(s, AMDVI_MMIO_STATUS, AMDVI_MMIO_STATUS_COMP_INT);
    amdvi_generate_msi_interrupt(s);
}

static void amdvi_setevent_bits(uint64_t *buffer, uint64_t value, int start,
                                int length)
{
    int index = start / 64, bitpos = start % 64;
    uint64_t mask = MAKE_64BIT_MASK(start, length);
    buffer[index] &= ~mask;
    buffer[index] |= (value << bitpos) & mask;
}
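
/*
 * Note: amdvi_setevent_bits() can only update bits within a single
 * 64-bit word of the event; a field whose start + length crosses a
 * 64-bit boundary is silently truncated to the bits that fit in
 * buffer[start / 64], so the 64-bit address written at bit 63 by
 * amdvi_encode_event() below only lands in the first word's top bit.
 */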
/*
 * AMDVi event structure
 *    0:15   -> DeviceID
 *    55:62  -> event type + miscellaneous info
 *    63:126 -> related address
 */
static void amdvi_encode_event(uint64_t *evt, uint16_t devid, uint64_t addr,
                               uint16_t info)
{
    amdvi_setevent_bits(evt, devid, 0, 16);
    amdvi_setevent_bits(evt, info, 55, 8);
    amdvi_setevent_bits(evt, addr, 63, 64);
}
/* log an error encountered during a page walk
 *
 * @addr: virtual address in translation request
 */
static void amdvi_page_fault(AMDVIState *s, uint16_t devid,
                             hwaddr addr, uint16_t info)
{
    uint64_t evt[4];

    info |= AMDVI_EVENT_IOPF_I | AMDVI_EVENT_IOPF;
    amdvi_encode_event(evt, devid, addr, info);
    amdvi_log_event(s, evt);
    pci_word_test_and_set_mask(s->pci.dev.config + PCI_STATUS,
            PCI_STATUS_SIG_TARGET_ABORT);
}
/*
 * log a master abort accessing device table
 *  @devtab : address of device table entry
 *  @info : error flags
 */
static void amdvi_log_devtab_error(AMDVIState *s, uint16_t devid,
                                   hwaddr devtab, uint16_t info)
{
    uint64_t evt[4];

    info |= AMDVI_EVENT_DEV_TAB_HW_ERROR;

    amdvi_encode_event(evt, devid, devtab, info);
    amdvi_log_event(s, evt);
    pci_word_test_and_set_mask(s->pci.dev.config + PCI_STATUS,
            PCI_STATUS_SIG_TARGET_ABORT);
}
/* log an event trying to access command buffer
 *   @addr : address that couldn't be accessed
 */
static void amdvi_log_command_error(AMDVIState *s, hwaddr addr)
{
    uint64_t evt[4], info = AMDVI_EVENT_COMMAND_HW_ERROR;

    amdvi_encode_event(evt, 0, addr, info);
    amdvi_log_event(s, evt);
    pci_word_test_and_set_mask(s->pci.dev.config + PCI_STATUS,
            PCI_STATUS_SIG_TARGET_ABORT);
}
/* log an illegal command event
 *   @addr : address of illegal command
 */
static void amdvi_log_illegalcom_error(AMDVIState *s, uint16_t info,
                                       hwaddr addr)
{
    uint64_t evt[4];

    info |= AMDVI_EVENT_ILLEGAL_COMMAND_ERROR;
    amdvi_encode_event(evt, 0, addr, info);
    amdvi_log_event(s, evt);
}
/* log an error accessing device table
 *
 *  @devid : device owning the table entry
 *  @devtab : address of device table entry
 *  @info : error flags
 */
static void amdvi_log_illegaldevtab_error(AMDVIState *s, uint16_t devid,
                                          hwaddr addr, uint16_t info)
{
    uint64_t evt[4];

    info |= AMDVI_EVENT_ILLEGAL_DEVTAB_ENTRY;
    amdvi_encode_event(evt, devid, addr, info);
    amdvi_log_event(s, evt);
}
/* log an error accessing a PTE entry
 * @addr : address that couldn't be accessed
 */
static void amdvi_log_pagetab_error(AMDVIState *s, uint16_t devid,
                                    hwaddr addr, uint16_t info)
{
    uint64_t evt[4];

    info |= AMDVI_EVENT_PAGE_TAB_HW_ERROR;
    amdvi_encode_event(evt, devid, addr, info);
    amdvi_log_event(s, evt);
    pci_word_test_and_set_mask(s->pci.dev.config + PCI_STATUS,
             PCI_STATUS_SIG_TARGET_ABORT);
}

static gboolean amdvi_uint64_equal(gconstpointer v1, gconstpointer v2)
{
    return *((const uint64_t *)v1) == *((const uint64_t *)v2);
}

static guint amdvi_uint64_hash(gconstpointer v)
{
    return (guint)*(const uint64_t *)v;
}

static AMDVIIOTLBEntry *amdvi_iotlb_lookup(AMDVIState *s, hwaddr addr,
                                           uint64_t devid)
{
    uint64_t key = (addr >> AMDVI_PAGE_SHIFT_4K) |
                   ((uint64_t)(devid) << AMDVI_DEVID_SHIFT);
    return g_hash_table_lookup(s->iotlb, &key);
}
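
/*
 * IOTLB keys pack the requester BDF and the guest frame number into one
 * 64-bit value: the devid sits above AMDVI_DEVID_SHIFT and the
 * 4K-aligned frame number occupies the low bits. For example (with an
 * illustrative BDF and iova), devid 0x0010 and iova 0x12345000 yield
 * the key (0x12345000 >> AMDVI_PAGE_SHIFT_4K) |
 * (0x10ULL << AMDVI_DEVID_SHIFT).
 */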

static void amdvi_iotlb_reset(AMDVIState *s)
{
    assert(s->iotlb);
    trace_amdvi_iotlb_reset();
    g_hash_table_remove_all(s->iotlb);
}

static gboolean amdvi_iotlb_remove_by_devid(gpointer key, gpointer value,
                                            gpointer user_data)
{
    AMDVIIOTLBEntry *entry = (AMDVIIOTLBEntry *)value;
    uint16_t devid = *(uint16_t *)user_data;
    return entry->devid == devid;
}

static void amdvi_iotlb_remove_page(AMDVIState *s, hwaddr addr,
                                    uint64_t devid)
{
    uint64_t key = (addr >> AMDVI_PAGE_SHIFT_4K) |
                   ((uint64_t)(devid) << AMDVI_DEVID_SHIFT);
    g_hash_table_remove(s->iotlb, &key);
}

static void amdvi_update_iotlb(AMDVIState *s, uint16_t devid,
                               uint64_t gpa, IOMMUTLBEntry to_cache,
                               uint16_t domid)
{
    /* don't cache erroneous translations */
    if (to_cache.perm != IOMMU_NONE) {
        AMDVIIOTLBEntry *entry = g_new(AMDVIIOTLBEntry, 1);
        uint64_t *key = g_new(uint64_t, 1);
        uint64_t gfn = gpa >> AMDVI_PAGE_SHIFT_4K;

        trace_amdvi_cache_update(domid, PCI_BUS_NUM(devid), PCI_SLOT(devid),
                PCI_FUNC(devid), gpa, to_cache.translated_addr);

        if (g_hash_table_size(s->iotlb) >= AMDVI_IOTLB_MAX_SIZE) {
            amdvi_iotlb_reset(s);
        }

        entry->domid = domid;
        entry->perms = to_cache.perm;
        entry->translated_addr = to_cache.translated_addr;
        entry->page_mask = to_cache.addr_mask;
        *key = gfn | ((uint64_t)(devid) << AMDVI_DEVID_SHIFT);
        g_hash_table_replace(s->iotlb, key, entry);
    }
}
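
/*
 * COMPLETION_WAIT command layout, as consumed below: bit 0 of cmd[0] is
 * the store bit (write the 64-bit datum in cmd[1] to the address held
 * in bits 3..51 of cmd[0], shifted left by 3), bit 1 requests a
 * completion interrupt, bits 52..59 are reserved (a set bit there is
 * logged as an illegal command), and bits 60..63 hold the opcode.
 */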

static void amdvi_completion_wait(AMDVIState *s, uint64_t *cmd)
{
    /* pad the last 3 bits */
    hwaddr addr = cpu_to_le64(extract64(cmd[0], 3, 49)) << 3;
    uint64_t data = cpu_to_le64(cmd[1]);

    if (extract64(cmd[0], 52, 8)) {
        amdvi_log_illegalcom_error(s, extract64(cmd[0], 60, 4),
                                   s->cmdbuf + s->cmdbuf_head);
    }
    if (extract64(cmd[0], 0, 1)) {
        if (dma_memory_write(&address_space_memory, addr, &data,
            AMDVI_COMPLETION_DATA_SIZE)) {
            trace_amdvi_completion_wait_fail(addr);
        }
    }
    /* set completion interrupt */
    if (extract64(cmd[0], 1, 1)) {
        amdvi_assign_orq(s, AMDVI_MMIO_STATUS, AMDVI_MMIO_STATUS_COMP_INT);
        /* generate interrupt */
        amdvi_generate_msi_interrupt(s);
    }
    trace_amdvi_completion_wait(addr, data);
}

/* log error without aborting since Linux seems to be using reserved bits */
static void amdvi_inval_devtab_entry(AMDVIState *s, uint64_t *cmd)
{
    uint16_t devid = cpu_to_le16((uint16_t)extract64(cmd[0], 0, 16));

    /* This command should invalidate internal caches, of which this
     * model keeps none */
    if (extract64(cmd[0], 16, 44) || cmd[1]) {
        amdvi_log_illegalcom_error(s, extract64(cmd[0], 60, 4),
                                   s->cmdbuf + s->cmdbuf_head);
    }
    trace_amdvi_devtab_inval(PCI_BUS_NUM(devid), PCI_SLOT(devid),
                             PCI_FUNC(devid));
}

static void amdvi_complete_ppr(AMDVIState *s, uint64_t *cmd)
{
    if (extract64(cmd[0], 16, 16) || extract64(cmd[0], 52, 8) ||
        extract64(cmd[1], 0, 2) || extract64(cmd[1], 3, 29) ||
        extract64(cmd[1], 48, 16)) {
        amdvi_log_illegalcom_error(s, extract64(cmd[0], 60, 4),
                                   s->cmdbuf + s->cmdbuf_head);
    }
    trace_amdvi_ppr_exec();
}

static void amdvi_inval_all(AMDVIState *s, uint64_t *cmd)
{
    if (extract64(cmd[0], 0, 60) || cmd[1]) {
        amdvi_log_illegalcom_error(s, extract64(cmd[0], 60, 4),
                                   s->cmdbuf + s->cmdbuf_head);
    }

    amdvi_iotlb_reset(s);
    trace_amdvi_all_inval();
}

static gboolean amdvi_iotlb_remove_by_domid(gpointer key, gpointer value,
                                            gpointer user_data)
{
    AMDVIIOTLBEntry *entry = (AMDVIIOTLBEntry *)value;
    uint16_t domid = *(uint16_t *)user_data;
    return entry->domid == domid;
}

/* we don't have devid - we can't remove pages by address */
static void amdvi_inval_pages(AMDVIState *s, uint64_t *cmd)
{
    uint16_t domid = cpu_to_le16((uint16_t)extract64(cmd[0], 32, 16));

    if (extract64(cmd[0], 20, 12) || extract64(cmd[0], 48, 12) ||
        extract64(cmd[1], 3, 9)) {
        amdvi_log_illegalcom_error(s, extract64(cmd[0], 60, 4),
                                   s->cmdbuf + s->cmdbuf_head);
    }

    g_hash_table_foreach_remove(s->iotlb, amdvi_iotlb_remove_by_domid,
                                &domid);
    trace_amdvi_pages_inval(domid);
}

static void amdvi_prefetch_pages(AMDVIState *s, uint64_t *cmd)
{
    if (extract64(cmd[0], 16, 8) || extract64(cmd[0], 52, 8) ||
        extract64(cmd[1], 1, 1) || extract64(cmd[1], 3, 1) ||
        extract64(cmd[1], 5, 7)) {
        amdvi_log_illegalcom_error(s, extract64(cmd[0], 60, 4),
                                   s->cmdbuf + s->cmdbuf_head);
    }

    trace_amdvi_prefetch_pages();
}

static void amdvi_inval_inttable(AMDVIState *s, uint64_t *cmd)
{
    if (extract64(cmd[0], 16, 44) || cmd[1]) {
        amdvi_log_illegalcom_error(s, extract64(cmd[0], 60, 4),
                                   s->cmdbuf + s->cmdbuf_head);
        return;
    }

    trace_amdvi_intr_inval();
}

/* FIXME: Try to work with the specified size instead of all the pages
 * when the S bit is on
 */
static void iommu_inval_iotlb(AMDVIState *s, uint64_t *cmd)
{
    uint16_t devid = extract64(cmd[0], 0, 16);

    if (extract64(cmd[1], 1, 1) || extract64(cmd[1], 3, 1) ||
        extract64(cmd[1], 6, 6)) {
        amdvi_log_illegalcom_error(s, extract64(cmd[0], 60, 4),
                                   s->cmdbuf + s->cmdbuf_head);
        return;
    }

    if (extract64(cmd[1], 0, 1)) {
        g_hash_table_foreach_remove(s->iotlb, amdvi_iotlb_remove_by_devid,
                                    &devid);
    } else {
        amdvi_iotlb_remove_page(s, cpu_to_le64(extract64(cmd[1], 12, 52)) << 12,
                                cpu_to_le16(extract64(cmd[1], 0, 16)));
    }
    trace_amdvi_iotlb_inval();
}
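
/*
 * Commands are 128-bit entries (read into uint64_t cmd[2] below) pulled
 * from the guest command ring; the opcode lives in bits 60..63 of the
 * first 64-bit word, which is what the switch below dispatches on.
 */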

/* not honouring reserved bits is regarded as an illegal command */
static void amdvi_cmdbuf_exec(AMDVIState *s)
{
    uint64_t cmd[2];

    if (dma_memory_read(&address_space_memory, s->cmdbuf + s->cmdbuf_head,
        cmd, AMDVI_COMMAND_SIZE)) {
        trace_amdvi_command_read_fail(s->cmdbuf, s->cmdbuf_head);
        amdvi_log_command_error(s, s->cmdbuf + s->cmdbuf_head);
        return;
    }

    switch (extract64(cmd[0], 60, 4)) {
    case AMDVI_CMD_COMPLETION_WAIT:
        amdvi_completion_wait(s, cmd);
        break;
    case AMDVI_CMD_INVAL_DEVTAB_ENTRY:
        amdvi_inval_devtab_entry(s, cmd);
        break;
    case AMDVI_CMD_INVAL_AMDVI_PAGES:
        amdvi_inval_pages(s, cmd);
        break;
    case AMDVI_CMD_INVAL_IOTLB_PAGES:
        iommu_inval_iotlb(s, cmd);
        break;
    case AMDVI_CMD_INVAL_INTR_TABLE:
        amdvi_inval_inttable(s, cmd);
        break;
    case AMDVI_CMD_PREFETCH_AMDVI_PAGES:
        amdvi_prefetch_pages(s, cmd);
        break;
    case AMDVI_CMD_COMPLETE_PPR_REQUEST:
        amdvi_complete_ppr(s, cmd);
        break;
    case AMDVI_CMD_INVAL_AMDVI_ALL:
        amdvi_inval_all(s, cmd);
        break;
    default:
        trace_amdvi_unhandled_command(extract64(cmd[0], 60, 4));
        /* log illegal command */
        amdvi_log_illegalcom_error(s, extract64(cmd[0], 60, 4),
                                   s->cmdbuf + s->cmdbuf_head);
    }
}

static void amdvi_cmdbuf_run(AMDVIState *s)
{
    if (!s->cmdbuf_enabled) {
        trace_amdvi_command_error(amdvi_readq(s, AMDVI_MMIO_CONTROL));
        return;
    }

    /* check if there is work to do. */
    while (s->cmdbuf_head != s->cmdbuf_tail) {
        trace_amdvi_command_exec(s->cmdbuf_head, s->cmdbuf_tail, s->cmdbuf);
        amdvi_cmdbuf_exec(s);
        s->cmdbuf_head += AMDVI_COMMAND_SIZE;
        amdvi_writeq_raw(s, AMDVI_MMIO_COMMAND_HEAD, s->cmdbuf_head);

        /* wrap head pointer */
        if (s->cmdbuf_head >= s->cmdbuf_len * AMDVI_COMMAND_SIZE) {
            s->cmdbuf_head = 0;
        }
    }
}
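
/*
 * The command ring head/tail are byte offsets into the guest buffer,
 * advancing in AMDVI_COMMAND_SIZE (16-byte) steps; with cmdbuf_len
 * entries the head wraps at cmdbuf_len * 16 bytes, so e.g. a 256-entry
 * ring wraps the head back to 0 once it reaches 4096.
 */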

static void amdvi_mmio_trace(hwaddr addr, unsigned size)
{
    uint8_t index = (addr & ~0x2000) / 8;

    if ((addr & 0x2000)) {
        /* high table */
        index = index >= AMDVI_MMIO_REGS_HIGH ? AMDVI_MMIO_REGS_HIGH : index;
        trace_amdvi_mmio_read(amdvi_mmio_high[index], addr, size, addr & ~0x07);
    } else {
        index = index >= AMDVI_MMIO_REGS_LOW ? AMDVI_MMIO_REGS_LOW : index;
        trace_amdvi_mmio_read(amdvi_mmio_low[index], addr, size, addr & ~0x07);
    }
}

static uint64_t amdvi_mmio_read(void *opaque, hwaddr addr, unsigned size)
{
    AMDVIState *s = opaque;

    uint64_t val = -1;
    if (addr + size > AMDVI_MMIO_SIZE) {
        trace_amdvi_mmio_read_invalid(AMDVI_MMIO_SIZE, addr, size);
        return (uint64_t)-1;
    }

    if (size == 2) {
        val = amdvi_readw(s, addr);
    } else if (size == 4) {
        val = amdvi_readl(s, addr);
    } else if (size == 8) {
        val = amdvi_readq(s, addr);
    }
    amdvi_mmio_trace(addr, size);

    return val;
}

static void amdvi_handle_control_write(AMDVIState *s)
{
    unsigned long control = amdvi_readq(s, AMDVI_MMIO_CONTROL);
    s->enabled = !!(control & AMDVI_MMIO_CONTROL_AMDVIEN);

    s->ats_enabled = !!(control & AMDVI_MMIO_CONTROL_HTTUNEN);
    s->evtlog_enabled = s->enabled && !!(control &
                        AMDVI_MMIO_CONTROL_EVENTLOGEN);

    s->evtlog_intr = !!(control & AMDVI_MMIO_CONTROL_EVENTINTEN);
    s->completion_wait_intr = !!(control & AMDVI_MMIO_CONTROL_COMWAITINTEN);
    s->cmdbuf_enabled = s->enabled && !!(control &
                        AMDVI_MMIO_CONTROL_CMDBUFLEN);
    s->ga_enabled = !!(control & AMDVI_MMIO_CONTROL_GAEN);

    /* update the flags depending on the control register */
    if (s->cmdbuf_enabled) {
        amdvi_assign_orq(s, AMDVI_MMIO_STATUS, AMDVI_MMIO_STATUS_CMDBUF_RUN);
    } else {
        amdvi_assign_andq(s, AMDVI_MMIO_STATUS, ~AMDVI_MMIO_STATUS_CMDBUF_RUN);
    }
    if (s->evtlog_enabled) {
        amdvi_assign_orq(s, AMDVI_MMIO_STATUS, AMDVI_MMIO_STATUS_EVT_RUN);
    } else {
        amdvi_assign_andq(s, AMDVI_MMIO_STATUS, ~AMDVI_MMIO_STATUS_EVT_RUN);
    }

    trace_amdvi_control_status(control);
    amdvi_cmdbuf_run(s);
}

static inline void amdvi_handle_devtab_write(AMDVIState *s)
{
    uint64_t val = amdvi_readq(s, AMDVI_MMIO_DEVICE_TABLE);
    s->devtab = (val & AMDVI_MMIO_DEVTAB_BASE_MASK);

    /* set device table length (the size field counts 4 KiB units,
     * minus one) */
    s->devtab_len = ((val & AMDVI_MMIO_DEVTAB_SIZE_MASK) + 1) *
                    (AMDVI_MMIO_DEVTAB_SIZE_UNIT /
                     AMDVI_MMIO_DEVTAB_ENTRY_SIZE);
}
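
/*
 * Example of the computation above: a size field of 1 selects two
 * 4 KiB pages of device table, i.e. (1 + 1) *
 * (AMDVI_MMIO_DEVTAB_SIZE_UNIT / AMDVI_MMIO_DEVTAB_ENTRY_SIZE) =
 * 2 * 128 = 256 entries, assuming the usual 4096-byte unit and
 * 32-byte entry size from amd_iommu.h.
 */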

static inline void amdvi_handle_cmdhead_write(AMDVIState *s)
{
    s->cmdbuf_head = amdvi_readq(s, AMDVI_MMIO_COMMAND_HEAD)
                     & AMDVI_MMIO_CMDBUF_HEAD_MASK;
    amdvi_cmdbuf_run(s);
}

static inline void amdvi_handle_cmdbase_write(AMDVIState *s)
{
    s->cmdbuf = amdvi_readq(s, AMDVI_MMIO_COMMAND_BASE)
                & AMDVI_MMIO_CMDBUF_BASE_MASK;
    s->cmdbuf_len = 1UL << (amdvi_readq(s, AMDVI_MMIO_CMDBUF_SIZE_BYTE)
                    & AMDVI_MMIO_CMDBUF_SIZE_MASK);
    s->cmdbuf_head = s->cmdbuf_tail = 0;
}

static inline void amdvi_handle_cmdtail_write(AMDVIState *s)
{
    s->cmdbuf_tail = amdvi_readq(s, AMDVI_MMIO_COMMAND_TAIL)
                     & AMDVI_MMIO_CMDBUF_TAIL_MASK;
    amdvi_cmdbuf_run(s);
}

static inline void amdvi_handle_excllim_write(AMDVIState *s)
{
    uint64_t val = amdvi_readq(s, AMDVI_MMIO_EXCL_LIMIT);
    s->excl_limit = (val & AMDVI_MMIO_EXCL_LIMIT_MASK) |
                    AMDVI_MMIO_EXCL_LIMIT_LOW;
}

static inline void amdvi_handle_evtbase_write(AMDVIState *s)
{
    uint64_t val = amdvi_readq(s, AMDVI_MMIO_EVENT_BASE);
    s->evtlog = val & AMDVI_MMIO_EVTLOG_BASE_MASK;
    s->evtlog_len = 1UL << (amdvi_readq(s, AMDVI_MMIO_EVTLOG_SIZE_BYTE)
                    & AMDVI_MMIO_EVTLOG_SIZE_MASK);
}

static inline void amdvi_handle_evttail_write(AMDVIState *s)
{
    uint64_t val = amdvi_readq(s, AMDVI_MMIO_EVENT_TAIL);
    s->evtlog_tail = val & AMDVI_MMIO_EVTLOG_TAIL_MASK;
}

static inline void amdvi_handle_evthead_write(AMDVIState *s)
{
    uint64_t val = amdvi_readq(s, AMDVI_MMIO_EVENT_HEAD);
    s->evtlog_head = val & AMDVI_MMIO_EVTLOG_HEAD_MASK;
}

static inline void amdvi_handle_pprbase_write(AMDVIState *s)
{
    uint64_t val = amdvi_readq(s, AMDVI_MMIO_PPR_BASE);
    s->ppr_log = val & AMDVI_MMIO_PPRLOG_BASE_MASK;
    s->pprlog_len = 1UL << (amdvi_readq(s, AMDVI_MMIO_PPRLOG_SIZE_BYTE)
                    & AMDVI_MMIO_PPRLOG_SIZE_MASK);
}

static inline void amdvi_handle_pprhead_write(AMDVIState *s)
{
    uint64_t val = amdvi_readq(s, AMDVI_MMIO_PPR_HEAD);
    s->pprlog_head = val & AMDVI_MMIO_PPRLOG_HEAD_MASK;
}

static inline void amdvi_handle_pprtail_write(AMDVIState *s)
{
    uint64_t val = amdvi_readq(s, AMDVI_MMIO_PPR_TAIL);
    s->pprlog_tail = val & AMDVI_MMIO_PPRLOG_TAIL_MASK;
}

/* FIXME: something might go wrong if system software writes these
 * registers in chunks of one byte; Linux writes in 4-byte chunks, so
 * this currently works with Linux, but it will definitely be busted
 * if software reads/writes 8 bytes
 */
static void amdvi_mmio_reg_write(AMDVIState *s, unsigned size, uint64_t val,
                                 hwaddr addr)
{
    if (size == 2) {
        amdvi_writew(s, addr, val);
    } else if (size == 4) {
        amdvi_writel(s, addr, val);
    } else if (size == 8) {
        amdvi_writeq(s, addr, val);
    }
}

static void amdvi_mmio_write(void *opaque, hwaddr addr, uint64_t val,
                             unsigned size)
{
    AMDVIState *s = opaque;
    unsigned long offset = addr & 0x07;

    if (addr + size > AMDVI_MMIO_SIZE) {
        trace_amdvi_mmio_write("error: addr outside region: max ",
                (uint64_t)AMDVI_MMIO_SIZE, size, val, offset);
        return;
    }

    amdvi_mmio_trace(addr, size);
    switch (addr & ~0x07) {
    case AMDVI_MMIO_CONTROL:
        amdvi_mmio_reg_write(s, size, val, addr);
        amdvi_handle_control_write(s);
        break;
    case AMDVI_MMIO_DEVICE_TABLE:
        amdvi_mmio_reg_write(s, size, val, addr);
        /* set device table address; this also suffers from the
         * inability to tell whether software is done writing
         */
        if (offset || (size == 8)) {
            amdvi_handle_devtab_write(s);
        }
        break;
    case AMDVI_MMIO_COMMAND_HEAD:
        amdvi_mmio_reg_write(s, size, val, addr);
        amdvi_handle_cmdhead_write(s);
        break;
    case AMDVI_MMIO_COMMAND_BASE:
        amdvi_mmio_reg_write(s, size, val, addr);
        /* FIXME: make sure system software has finished writing, in a
         * robust way, in case it writes in chunks of less than 8 bytes.
         * For now, this hack works for the Linux driver.
         */
        if (offset || (size == 8)) {
            amdvi_handle_cmdbase_write(s);
        }
        break;
    case AMDVI_MMIO_COMMAND_TAIL:
        amdvi_mmio_reg_write(s, size, val, addr);
        amdvi_handle_cmdtail_write(s);
        break;
    case AMDVI_MMIO_EVENT_BASE:
        amdvi_mmio_reg_write(s, size, val, addr);
        amdvi_handle_evtbase_write(s);
        break;
    case AMDVI_MMIO_EVENT_HEAD:
        amdvi_mmio_reg_write(s, size, val, addr);
        amdvi_handle_evthead_write(s);
        break;
    case AMDVI_MMIO_EVENT_TAIL:
        amdvi_mmio_reg_write(s, size, val, addr);
        amdvi_handle_evttail_write(s);
        break;
    case AMDVI_MMIO_EXCL_LIMIT:
        amdvi_mmio_reg_write(s, size, val, addr);
        amdvi_handle_excllim_write(s);
        break;
        /* PPR log base - unused for now */
    case AMDVI_MMIO_PPR_BASE:
        amdvi_mmio_reg_write(s, size, val, addr);
        amdvi_handle_pprbase_write(s);
        break;
        /* PPR log head - also unused for now */
    case AMDVI_MMIO_PPR_HEAD:
        amdvi_mmio_reg_write(s, size, val, addr);
        amdvi_handle_pprhead_write(s);
        break;
        /* PPR log tail - unused for now */
    case AMDVI_MMIO_PPR_TAIL:
        amdvi_mmio_reg_write(s, size, val, addr);
        amdvi_handle_pprtail_write(s);
        break;
    }
}

static inline uint64_t amdvi_get_perms(uint64_t entry)
{
    return (entry & (AMDVI_DEV_PERM_READ | AMDVI_DEV_PERM_WRITE)) >>
           AMDVI_DEV_PERM_SHIFT;
}

/* validate that reserved bits are honoured */
static bool amdvi_validate_dte(AMDVIState *s, uint16_t devid,
                               uint64_t *dte)
{
    if ((dte[0] & AMDVI_DTE_LOWER_QUAD_RESERVED)
        || (dte[1] & AMDVI_DTE_MIDDLE_QUAD_RESERVED)
        || (dte[2] & AMDVI_DTE_UPPER_QUAD_RESERVED) || dte[3]) {
        amdvi_log_illegaldevtab_error(s, devid,
                                      s->devtab +
                                      devid * AMDVI_DEVTAB_ENTRY_SIZE, 0);
        return false;
    }

    return true;
}

/* get a device table entry given the devid */
static bool amdvi_get_dte(AMDVIState *s, int devid, uint64_t *entry)
{
    uint32_t offset = devid * AMDVI_DEVTAB_ENTRY_SIZE;

    if (dma_memory_read(&address_space_memory, s->devtab + offset, entry,
        AMDVI_DEVTAB_ENTRY_SIZE)) {
        trace_amdvi_dte_get_fail(s->devtab, offset);
        /* log error accessing dte */
        amdvi_log_devtab_error(s, devid, s->devtab + offset, 0);
        return false;
    }

    *entry = le64_to_cpu(*entry);
    if (!amdvi_validate_dte(s, devid, entry)) {
        trace_amdvi_invalid_dte(entry[0]);
        return false;
    }

    return true;
}

/* get pte translation mode */
static inline uint8_t get_pte_translation_mode(uint64_t pte)
{
    return (pte >> AMDVI_DEV_MODE_RSHIFT) & AMDVI_DEV_MODE_MASK;
}

static inline uint64_t pte_override_page_mask(uint64_t pte)
{
    uint8_t page_mask = 13;
    uint64_t addr = (pte & AMDVI_DEV_PT_ROOT_MASK) >> 12;
    /* find the first zero bit */
    while (addr & 1) {
        page_mask++;
        addr = addr >> 1;
    }

    return ~((1ULL << page_mask) - 1);
}

static inline uint64_t pte_get_page_mask(uint64_t oldlevel)
{
    return ~((1UL << ((oldlevel * 9) + 3)) - 1);
}
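
/*
 * Page sizes implied by the helpers above: pte_get_page_mask() maps a
 * level-1 entry to a 4 KiB page (9 * 1 + 3 = 12 mask bits), level 2 to
 * 2 MiB and level 3 to 1 GiB. For level-7 entries,
 * pte_override_page_mask() grows the page from a base of 8 KiB, each
 * consecutive low 1 bit of the PTE address doubling the size.
 */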

static inline uint64_t amdvi_get_pte_entry(AMDVIState *s, uint64_t pte_addr,
                                           uint16_t devid)
{
    uint64_t pte;

    if (dma_memory_read(&address_space_memory, pte_addr, &pte, sizeof(pte))) {
        trace_amdvi_get_pte_hwerror(pte_addr);
        amdvi_log_pagetab_error(s, devid, pte_addr, 0);
        pte = 0;
        return pte;
    }

    pte = le64_to_cpu(pte);
    return pte;
}

static void amdvi_page_walk(AMDVIAddressSpace *as, uint64_t *dte,
                            IOMMUTLBEntry *ret, unsigned perms,
                            hwaddr addr)
{
    unsigned level, present, pte_perms, oldlevel;
    uint64_t pte = dte[0], pte_addr, page_mask;

    /* make sure the DTE has TV = 1 */
    if (pte & AMDVI_DEV_TRANSLATION_VALID) {
        level = get_pte_translation_mode(pte);
        if (level >= 7) {
            trace_amdvi_mode_invalid(level, addr);
            return;
        }
        if (level == 0) {
            goto no_remap;
        }

        /* walk until we reach a leaf PTE or a PTE that encodes a huge page */
        while (level > 0) {
            pte_perms = amdvi_get_perms(pte);
            present = pte & 1;
            if (!present || perms != (perms & pte_perms)) {
                amdvi_page_fault(as->iommu_state, as->devfn, addr, perms);
                trace_amdvi_page_fault(addr);
                return;
            }

            /* go to the next lower level */
            pte_addr = pte & AMDVI_DEV_PT_ROOT_MASK;
            /* add offset and load pte */
            pte_addr += ((addr >> (3 + 9 * level)) & 0x1FF) << 3;
            pte = amdvi_get_pte_entry(as->iommu_state, pte_addr, as->devfn);
            if (!pte) {
                return;
            }
            oldlevel = level;
            level = get_pte_translation_mode(pte);
            if (level == 0x7) {
                break;
            }
        }

        if (level == 0x7) {
            page_mask = pte_override_page_mask(pte);
        } else {
            page_mask = pte_get_page_mask(oldlevel);
        }

        /* get access permissions from pte */
        ret->iova = addr & page_mask;
        ret->translated_addr = (pte & AMDVI_DEV_PT_ROOT_MASK) & page_mask;
        ret->addr_mask = ~page_mask;
        ret->perm = amdvi_get_perms(pte);
        return;
    }
no_remap:
    ret->iova = addr & AMDVI_PAGE_MASK_4K;
    ret->translated_addr = addr & AMDVI_PAGE_MASK_4K;
    ret->addr_mask = ~AMDVI_PAGE_MASK_4K;
    ret->perm = amdvi_get_perms(pte);
}
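
/*
 * Index selection in the walk above: ((addr >> (3 + 9 * level)) & 0x1FF)
 * picks the 9-bit table index for the current level, so a level-3 walk
 * uses iova bits 30..38, then 21..29, then 12..20 - the classic
 * 512-entry-per-level radix layout.
 */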

static void amdvi_do_translate(AMDVIAddressSpace *as, hwaddr addr,
                               bool is_write, IOMMUTLBEntry *ret)
{
    AMDVIState *s = as->iommu_state;
    uint16_t devid = PCI_BUILD_BDF(as->bus_num, as->devfn);
    AMDVIIOTLBEntry *iotlb_entry = amdvi_iotlb_lookup(s, addr, devid);
    uint64_t entry[4];

    if (iotlb_entry) {
        trace_amdvi_iotlb_hit(PCI_BUS_NUM(devid), PCI_SLOT(devid),
                PCI_FUNC(devid), addr, iotlb_entry->translated_addr);
        ret->iova = addr & ~iotlb_entry->page_mask;
        ret->translated_addr = iotlb_entry->translated_addr;
        ret->addr_mask = iotlb_entry->page_mask;
        ret->perm = iotlb_entry->perms;
        return;
    }

    if (!amdvi_get_dte(s, devid, entry)) {
        return;
    }

    /* devices with V = 0 are not translated */
    if (!(entry[0] & AMDVI_DEV_VALID)) {
        goto out;
    }

    amdvi_page_walk(as, entry, ret,
                    is_write ? AMDVI_PERM_WRITE : AMDVI_PERM_READ, addr);

    amdvi_update_iotlb(s, devid, addr, *ret,
                       entry[1] & AMDVI_DEV_DOMID_ID_MASK);
    return;

out:
    ret->iova = addr & AMDVI_PAGE_MASK_4K;
    ret->translated_addr = addr & AMDVI_PAGE_MASK_4K;
    ret->addr_mask = ~AMDVI_PAGE_MASK_4K;
    ret->perm = IOMMU_RW;
}

static inline bool amdvi_is_interrupt_addr(hwaddr addr)
{
    return addr >= AMDVI_INT_ADDR_FIRST && addr <= AMDVI_INT_ADDR_LAST;
}
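
/*
 * AMDVI_INT_ADDR_FIRST..AMDVI_INT_ADDR_LAST is the conventional x86
 * interrupt window at fee00000-feefffff (cf. the region table further
 * down); DMA writes landing there are treated as MSIs and routed
 * through interrupt remapping rather than through the page tables.
 */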

static IOMMUTLBEntry amdvi_translate(IOMMUMemoryRegion *iommu, hwaddr addr,
                                     IOMMUAccessFlags flag, int iommu_idx)
{
    AMDVIAddressSpace *as = container_of(iommu, AMDVIAddressSpace, iommu);
    AMDVIState *s = as->iommu_state;
    IOMMUTLBEntry ret = {
        .target_as = &address_space_memory,
        .iova = addr,
        .translated_addr = 0,
        .addr_mask = ~(hwaddr)0,
        .perm = IOMMU_NONE
    };

    if (!s->enabled) {
        /* AMDVI disabled - this corresponds to iommu=off, not to a
         * failure to provide any parameter
         */
        ret.iova = addr & AMDVI_PAGE_MASK_4K;
        ret.translated_addr = addr & AMDVI_PAGE_MASK_4K;
        ret.addr_mask = ~AMDVI_PAGE_MASK_4K;
        ret.perm = IOMMU_RW;
        return ret;
    } else if (amdvi_is_interrupt_addr(addr)) {
        ret.iova = addr & AMDVI_PAGE_MASK_4K;
        ret.translated_addr = addr & AMDVI_PAGE_MASK_4K;
        ret.addr_mask = ~AMDVI_PAGE_MASK_4K;
        ret.perm = IOMMU_WO;
        return ret;
    }

    amdvi_do_translate(as, addr, flag & IOMMU_WO, &ret);
    trace_amdvi_translation_result(as->bus_num, PCI_SLOT(as->devfn),
            PCI_FUNC(as->devfn), addr, ret.translated_addr);
    return ret;
}

static int amdvi_get_irte(AMDVIState *s, MSIMessage *origin, uint64_t *dte,
                          union irte *irte, uint16_t devid)
{
    uint64_t irte_root, offset;

    irte_root = dte[2] & AMDVI_IR_PHYS_ADDR_MASK;
    offset = (origin->data & AMDVI_IRTE_OFFSET) << 2;

    trace_amdvi_ir_irte(irte_root, offset);

    if (dma_memory_read(&address_space_memory, irte_root + offset,
                        irte, sizeof(*irte))) {
        trace_amdvi_ir_err("failed to get irte");
        return -AMDVI_IR_GET_IRTE;
    }

    trace_amdvi_ir_irte_val(irte->val);

    return 0;
}

static int amdvi_int_remap_legacy(AMDVIState *iommu,
                                  MSIMessage *origin,
                                  MSIMessage *translated,
                                  uint64_t *dte,
                                  X86IOMMUIrq *irq,
                                  uint16_t sid)
{
    int ret;
    union irte irte;

    /* get interrupt remapping table */
    ret = amdvi_get_irte(iommu, origin, dte, &irte, sid);
    if (ret < 0) {
        return ret;
    }

    if (!irte.fields.valid) {
        trace_amdvi_ir_target_abort("RemapEn is disabled");
        return -AMDVI_IR_TARGET_ABORT;
    }

    if (irte.fields.guest_mode) {
        error_report_once("guest mode is not zero");
        return -AMDVI_IR_ERR;
    }

    if (irte.fields.int_type > AMDVI_IOAPIC_INT_TYPE_ARBITRATED) {
        error_report_once("reserved int_type");
        return -AMDVI_IR_ERR;
    }

    irq->delivery_mode = irte.fields.int_type;
    irq->vector = irte.fields.vector;
    irq->dest_mode = irte.fields.dm;
    irq->redir_hint = irte.fields.rq_eoi;
    irq->dest = irte.fields.destination;

    return 0;
}
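
/*
 * The legacy and guest-APIC interrupt remapping formats differ in entry
 * size: legacy IRTEs are 4 bytes (hence the << 2 applied to the offset
 * in amdvi_get_irte() above) while GA IRTEs are 16 bytes (the << 4
 * below), so the same data-register index selects the corresponding
 * entry in either table layout.
 */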

static int amdvi_get_irte_ga(AMDVIState *s, MSIMessage *origin, uint64_t *dte,
                             struct irte_ga *irte, uint16_t devid)
{
    uint64_t irte_root, offset;

    irte_root = dte[2] & AMDVI_IR_PHYS_ADDR_MASK;
    offset = (origin->data & AMDVI_IRTE_OFFSET) << 4;
    trace_amdvi_ir_irte(irte_root, offset);

    if (dma_memory_read(&address_space_memory, irte_root + offset,
                        irte, sizeof(*irte))) {
        trace_amdvi_ir_err("failed to get irte_ga");
        return -AMDVI_IR_GET_IRTE;
    }

    trace_amdvi_ir_irte_ga_val(irte->hi.val, irte->lo.val);
    return 0;
}

static int amdvi_int_remap_ga(AMDVIState *iommu,
                              MSIMessage *origin,
                              MSIMessage *translated,
                              uint64_t *dte,
                              X86IOMMUIrq *irq,
                              uint16_t sid)
{
    int ret;
    struct irte_ga irte;

    /* get interrupt remapping table */
    ret = amdvi_get_irte_ga(iommu, origin, dte, &irte, sid);
    if (ret < 0) {
        return ret;
    }

    if (!irte.lo.fields_remap.valid) {
        trace_amdvi_ir_target_abort("RemapEn is disabled");
        return -AMDVI_IR_TARGET_ABORT;
    }

    if (irte.lo.fields_remap.guest_mode) {
        error_report_once("guest mode is not zero");
        return -AMDVI_IR_ERR;
    }

    if (irte.lo.fields_remap.int_type > AMDVI_IOAPIC_INT_TYPE_ARBITRATED) {
        error_report_once("reserved int_type is set");
        return -AMDVI_IR_ERR;
    }

    irq->delivery_mode = irte.lo.fields_remap.int_type;
    irq->vector = irte.hi.fields.vector;
    irq->dest_mode = irte.lo.fields_remap.dm;
    irq->redir_hint = irte.lo.fields_remap.rq_eoi;
    irq->dest = irte.lo.fields_remap.destination;

    return 0;
}

static int __amdvi_int_remap_msi(AMDVIState *iommu,
                                 MSIMessage *origin,
                                 MSIMessage *translated,
                                 uint64_t *dte,
                                 X86IOMMUIrq *irq,
                                 uint16_t sid)
{
    int ret;
    uint8_t int_ctl;

    int_ctl = (dte[2] >> AMDVI_IR_INTCTL_SHIFT) & 3;
    trace_amdvi_ir_intctl(int_ctl);

    switch (int_ctl) {
    case AMDVI_IR_INTCTL_PASS:
        memcpy(translated, origin, sizeof(*origin));
        return 0;
    case AMDVI_IR_INTCTL_REMAP:
        break;
    case AMDVI_IR_INTCTL_ABORT:
        trace_amdvi_ir_target_abort("int_ctl abort");
        return -AMDVI_IR_TARGET_ABORT;
    default:
        trace_amdvi_ir_err("int_ctl reserved");
        return -AMDVI_IR_ERR;
    }

    if (iommu->ga_enabled) {
        ret = amdvi_int_remap_ga(iommu, origin, translated, dte, irq, sid);
    } else {
        ret = amdvi_int_remap_legacy(iommu, origin, translated, dte, irq, sid);
    }

    return ret;
}

/* Interrupt remapping for MSI/MSI-X entry */
static int amdvi_int_remap_msi(AMDVIState *iommu,
                               MSIMessage *origin,
                               MSIMessage *translated,
                               uint16_t sid)
{
    int ret = 0;
    uint64_t pass = 0;
    uint64_t dte[4] = { 0 };
    X86IOMMUIrq irq = { 0 };
    uint8_t dest_mode, delivery_mode;

    assert(origin && translated);

    /*
     * When the IOMMU is enabled, an interrupt remap request can come
     * either from the IO-APIC or from a PCI device. An interrupt from
     * a PCI device carries a valid requester id, while one from the
     * IO-APIC does not.
     */
    if (sid == X86_IOMMU_SID_INVALID) {
        sid = AMDVI_IOAPIC_SB_DEVID;
    }

    trace_amdvi_ir_remap_msi_req(origin->address, origin->data, sid);

    /* check if device table entry is set before we go further. */
    if (!iommu || !iommu->devtab_len) {
        memcpy(translated, origin, sizeof(*origin));
        goto out;
    }

    if (!amdvi_get_dte(iommu, sid, dte)) {
        return -AMDVI_IR_ERR;
    }

    /* Check if IR is enabled in DTE */
    if (!(dte[2] & AMDVI_IR_REMAP_ENABLE)) {
        memcpy(translated, origin, sizeof(*origin));
        goto out;
    }

    /* validate that we are configured with intremap=on */
    if (!x86_iommu_ir_supported(X86_IOMMU_DEVICE(iommu))) {
        trace_amdvi_err("Interrupt remapping is enabled in the guest but "
                        "not in the host. Use intremap=on to enable interrupt "
                        "remapping in amd-iommu.");
        return -AMDVI_IR_ERR;
    }

    if (origin->address & AMDVI_MSI_ADDR_HI_MASK) {
        trace_amdvi_err("MSI address high 32 bits non-zero when "
                        "Interrupt Remapping enabled.");
        return -AMDVI_IR_ERR;
    }

    if ((origin->address & AMDVI_MSI_ADDR_LO_MASK) != APIC_DEFAULT_ADDRESS) {
        trace_amdvi_err("MSI is not from IOAPIC.");
        return -AMDVI_IR_ERR;
    }

    /*
     * Bits [10:8] of the MSI data register are used to get the upstream
     * interrupt type.
     *
     * See MSI/MSI-X format:
     * https://pdfs.semanticscholar.org/presentation/9420/c279e942eca568157711ef5c92b800c40a79.pdf
     * (page 5)
     */
    delivery_mode = (origin->data >> MSI_DATA_DELIVERY_MODE_SHIFT) & 7;

    switch (delivery_mode) {
    case AMDVI_IOAPIC_INT_TYPE_FIXED:
    case AMDVI_IOAPIC_INT_TYPE_ARBITRATED:
        trace_amdvi_ir_delivery_mode("fixed/arbitrated");
        ret = __amdvi_int_remap_msi(iommu, origin, translated, dte, &irq, sid);
        if (ret < 0) {
            goto remap_fail;
        } else {
            /* Translate IRQ to MSI messages */
            x86_iommu_irq_to_msi_message(&irq, translated);
            goto out;
        }
        break;
    case AMDVI_IOAPIC_INT_TYPE_SMI:
        error_report("SMI is not supported!");
        ret = -AMDVI_IR_ERR;
        break;
    case AMDVI_IOAPIC_INT_TYPE_NMI:
        pass = dte[3] & AMDVI_DEV_NMI_PASS_MASK;
        trace_amdvi_ir_delivery_mode("nmi");
        break;
    case AMDVI_IOAPIC_INT_TYPE_INIT:
        pass = dte[3] & AMDVI_DEV_INT_PASS_MASK;
        trace_amdvi_ir_delivery_mode("init");
        break;
    case AMDVI_IOAPIC_INT_TYPE_EINT:
        pass = dte[3] & AMDVI_DEV_EINT_PASS_MASK;
        trace_amdvi_ir_delivery_mode("eint");
        break;
    default:
        trace_amdvi_ir_delivery_mode("unsupported delivery_mode");
        ret = -AMDVI_IR_ERR;
        break;
    }

    if (ret < 0) {
        goto remap_fail;
    }

    /*
     * The MSI address register bit[2] is used to get the destination
     * mode. The dest_mode 1 is valid for fixed and arbitrated interrupts
     * only.
     */
    dest_mode = (origin->address >> MSI_ADDR_DEST_MODE_SHIFT) & 1;
    if (dest_mode) {
        trace_amdvi_ir_err("invalid dest_mode");
        ret = -AMDVI_IR_ERR;
        goto remap_fail;
    }

    if (pass) {
        memcpy(translated, origin, sizeof(*origin));
    } else {
        trace_amdvi_ir_err("passthrough is not enabled");
        ret = -AMDVI_IR_ERR;
        goto remap_fail;
    }

out:
    trace_amdvi_ir_remap_msi(origin->address, origin->data,
                             translated->address, translated->data);
    return 0;

remap_fail:
    return ret;
}

static int amdvi_int_remap(X86IOMMUState *iommu,
                           MSIMessage *origin,
                           MSIMessage *translated,
                           uint16_t sid)
{
    return amdvi_int_remap_msi(AMD_IOMMU_DEVICE(iommu), origin,
                               translated, sid);
}

static MemTxResult amdvi_mem_ir_write(void *opaque, hwaddr addr,
                                      uint64_t value, unsigned size,
                                      MemTxAttrs attrs)
{
    int ret;
    MSIMessage from = { 0, 0 }, to = { 0, 0 };
    uint16_t sid = AMDVI_IOAPIC_SB_DEVID;

    from.address = (uint64_t) addr + AMDVI_INT_ADDR_FIRST;
    from.data = (uint32_t) value;

    trace_amdvi_mem_ir_write_req(addr, value, size);

    if (!attrs.unspecified) {
        /* We have explicit Source ID */
        sid = attrs.requester_id;
    }

    ret = amdvi_int_remap_msi(opaque, &from, &to, sid);
    if (ret < 0) {
        /* TODO: log the event using IOMMU log event interface */
        error_report_once("failed to remap interrupt from devid 0x%x", sid);
        return MEMTX_ERROR;
    }

    apic_get_class()->send_msi(&to);

    trace_amdvi_mem_ir_write(to.address, to.data);
    return MEMTX_OK;
}

static MemTxResult amdvi_mem_ir_read(void *opaque, hwaddr addr,
                                     uint64_t *data, unsigned size,
                                     MemTxAttrs attrs)
{
    return MEMTX_OK;
}

static const MemoryRegionOps amdvi_ir_ops = {
    .read_with_attrs = amdvi_mem_ir_read,
    .write_with_attrs = amdvi_mem_ir_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
    .impl = {
        .min_access_size = 4,
        .max_access_size = 4,
    },
    .valid = {
        .min_access_size = 4,
        .max_access_size = 4,
    }
};

static AddressSpace *amdvi_host_dma_iommu(PCIBus *bus, void *opaque, int devfn)
{
    char name[128];
    AMDVIState *s = opaque;
    AMDVIAddressSpace **iommu_as, *amdvi_dev_as;
    int bus_num = pci_bus_num(bus);

    iommu_as = s->address_spaces[bus_num];

    /* allocate memory during the first run */
    if (!iommu_as) {
        iommu_as = g_malloc0(sizeof(AMDVIAddressSpace *) * PCI_DEVFN_MAX);
        s->address_spaces[bus_num] = iommu_as;
    }

    /* set up AMD-Vi region */
    if (!iommu_as[devfn]) {
        snprintf(name, sizeof(name), "amd_iommu_devfn_%d", devfn);

        iommu_as[devfn] = g_malloc0(sizeof(AMDVIAddressSpace));
        iommu_as[devfn]->bus_num = (uint8_t)bus_num;
        iommu_as[devfn]->devfn = (uint8_t)devfn;
        iommu_as[devfn]->iommu_state = s;

        amdvi_dev_as = iommu_as[devfn];

        /*
         * Memory region relationships look like this (the address range
         * shows only the lower 32 bits to keep it short):
         *
         * |-----------------+-------------------+----------|
         * | Name            | Address range     | Priority |
         * |-----------------+-------------------+----------+
         * | amdvi_root      | 00000000-ffffffff |        0 |
         * |  amdvi_iommu    | 00000000-ffffffff |        1 |
         * |  amdvi_iommu_ir | fee00000-feefffff |       64 |
         * |-----------------+-------------------+----------|
         */
        memory_region_init_iommu(&amdvi_dev_as->iommu,
                                 sizeof(amdvi_dev_as->iommu),
                                 TYPE_AMD_IOMMU_MEMORY_REGION,
                                 OBJECT(s),
                                 "amd_iommu", UINT64_MAX);
        memory_region_init(&amdvi_dev_as->root, OBJECT(s),
                           "amdvi_root", UINT64_MAX);
        address_space_init(&amdvi_dev_as->as, &amdvi_dev_as->root, name);
        memory_region_init_io(&amdvi_dev_as->iommu_ir, OBJECT(s),
                              &amdvi_ir_ops, s, "amd_iommu_ir",
                              AMDVI_INT_ADDR_SIZE);
        memory_region_add_subregion_overlap(&amdvi_dev_as->root,
                                            AMDVI_INT_ADDR_FIRST,
                                            &amdvi_dev_as->iommu_ir,
                                            64);
        memory_region_add_subregion_overlap(&amdvi_dev_as->root, 0,
                                            MEMORY_REGION(&amdvi_dev_as->iommu),
                                            1);
    }
    return &iommu_as[devfn]->as;
}

static const MemoryRegionOps mmio_mem_ops = {
    .read = amdvi_mmio_read,
    .write = amdvi_mmio_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
    .impl = {
        .min_access_size = 1,
        .max_access_size = 8,
        .unaligned = false,
    },
    .valid = {
        .min_access_size = 1,
        .max_access_size = 8,
    }
};

static int amdvi_iommu_notify_flag_changed(IOMMUMemoryRegion *iommu,
                                           IOMMUNotifierFlag old,
                                           IOMMUNotifierFlag new,
                                           Error **errp)
{
    AMDVIAddressSpace *as = container_of(iommu, AMDVIAddressSpace, iommu);

    if (new & IOMMU_NOTIFIER_MAP) {
        error_setg(errp,
                   "device %02x.%02x.%x requires iommu notifier which is not "
                   "currently supported", as->bus_num, PCI_SLOT(as->devfn),
                   PCI_FUNC(as->devfn));
        return -EINVAL;
    }
    return 0;
}

static void amdvi_init(AMDVIState *s)
{
    amdvi_iotlb_reset(s);

    s->devtab_len = 0;
    s->cmdbuf_len = 0;
    s->cmdbuf_head = 0;
    s->cmdbuf_tail = 0;
    s->evtlog_head = 0;
    s->evtlog_tail = 0;
    s->excl_enabled = false;
    s->excl_allow = false;
    s->mmio_enabled = false;
    s->enabled = false;
    s->ats_enabled = false;
    s->cmdbuf_enabled = false;

    /* reset MMIO */
    memset(s->mmior, 0, AMDVI_MMIO_SIZE);
    amdvi_set_quad(s, AMDVI_MMIO_EXT_FEATURES, AMDVI_EXT_FEATURES,
            0xffffffffffffffef, 0);
    amdvi_set_quad(s, AMDVI_MMIO_STATUS, 0, 0x98, 0x67);

    /* reset device ident */
    pci_config_set_vendor_id(s->pci.dev.config, PCI_VENDOR_ID_AMD);
    pci_config_set_prog_interface(s->pci.dev.config, 00);
    pci_config_set_device_id(s->pci.dev.config, s->devid);
    pci_config_set_class(s->pci.dev.config, 0x0806);

    /* reset AMDVI specific capabilities, all r/o */
    pci_set_long(s->pci.dev.config + s->capab_offset, AMDVI_CAPAB_FEATURES);
    pci_set_long(s->pci.dev.config + s->capab_offset + AMDVI_CAPAB_BAR_LOW,
                 s->mmio.addr & ~(0xffff0000));
    pci_set_long(s->pci.dev.config + s->capab_offset + AMDVI_CAPAB_BAR_HIGH,
                (s->mmio.addr & ~(0xffff)) >> 16);
    pci_set_long(s->pci.dev.config + s->capab_offset + AMDVI_CAPAB_RANGE,
                 0xff000000);
    pci_set_long(s->pci.dev.config + s->capab_offset + AMDVI_CAPAB_MISC, 0);
    pci_set_long(s->pci.dev.config + s->capab_offset + AMDVI_CAPAB_MISC,
            AMDVI_MAX_PH_ADDR | AMDVI_MAX_GVA_ADDR | AMDVI_MAX_VA_ADDR);
}

static void amdvi_sysbus_reset(DeviceState *dev)
{
    AMDVIState *s = AMD_IOMMU_DEVICE(dev);

    msi_reset(&s->pci.dev);
    amdvi_init(s);
}

static void amdvi_sysbus_realize(DeviceState *dev, Error **errp)
{
    int ret = 0;
    AMDVIState *s = AMD_IOMMU_DEVICE(dev);
    MachineState *ms = MACHINE(qdev_get_machine());
    PCMachineState *pcms = PC_MACHINE(ms);
    X86MachineState *x86ms = X86_MACHINE(ms);
    PCIBus *bus = pcms->bus;

    s->iotlb = g_hash_table_new_full(amdvi_uint64_hash,
                                     amdvi_uint64_equal, g_free, g_free);

    /* This device should take care of IOMMU PCI properties */
    if (!qdev_realize(DEVICE(&s->pci), &bus->qbus, errp)) {
        return;
    }
    ret = pci_add_capability(&s->pci.dev, AMDVI_CAPAB_ID_SEC, 0,
                             AMDVI_CAPAB_SIZE, errp);
    if (ret < 0) {
        return;
    }
    s->capab_offset = ret;

    ret = pci_add_capability(&s->pci.dev, PCI_CAP_ID_MSI, 0,
                             AMDVI_CAPAB_REG_SIZE, errp);
    if (ret < 0) {
        return;
    }
    ret = pci_add_capability(&s->pci.dev, PCI_CAP_ID_HT, 0,
                             AMDVI_CAPAB_REG_SIZE, errp);
    if (ret < 0) {
        return;
    }

    /* Pseudo address space under root PCI bus. */
    x86ms->ioapic_as = amdvi_host_dma_iommu(bus, s, AMDVI_IOAPIC_SB_DEVID);

    /* set up MMIO */
    memory_region_init_io(&s->mmio, OBJECT(s), &mmio_mem_ops, s, "amdvi-mmio",
                          AMDVI_MMIO_SIZE);

    sysbus_init_mmio(SYS_BUS_DEVICE(s), &s->mmio);
    sysbus_mmio_map(SYS_BUS_DEVICE(s), 0, AMDVI_BASE_ADDR);
    pci_setup_iommu(bus, amdvi_host_dma_iommu, s);
    s->devid = object_property_get_int(OBJECT(&s->pci), "addr", &error_abort);
    msi_init(&s->pci.dev, 0, 1, true, false, errp);
    amdvi_init(s);
}

static const VMStateDescription vmstate_amdvi_sysbus = {
    .name = "amd-iommu",
    .unmigratable = 1
};

static void amdvi_sysbus_instance_init(Object *klass)
{
    AMDVIState *s = AMD_IOMMU_DEVICE(klass);

    object_initialize(&s->pci, sizeof(s->pci), TYPE_AMD_IOMMU_PCI);
}

static void amdvi_sysbus_class_init(ObjectClass *klass, void *data)
{
    DeviceClass *dc = DEVICE_CLASS(klass);
    X86IOMMUClass *dc_class = X86_IOMMU_DEVICE_CLASS(klass);

    dc->reset = amdvi_sysbus_reset;
    dc->vmsd = &vmstate_amdvi_sysbus;
    dc->hotpluggable = false;
    dc_class->realize = amdvi_sysbus_realize;
    dc_class->int_remap = amdvi_int_remap;
    /* Supported by the pc-q35-* machine types */
    dc->user_creatable = true;
    set_bit(DEVICE_CATEGORY_MISC, dc->categories);
    dc->desc = "AMD IOMMU (AMD-Vi) DMA Remapping device";
}

static const TypeInfo amdvi_sysbus = {
    .name = TYPE_AMD_IOMMU_DEVICE,
    .parent = TYPE_X86_IOMMU_DEVICE,
    .instance_size = sizeof(AMDVIState),
    .instance_init = amdvi_sysbus_instance_init,
    .class_init = amdvi_sysbus_class_init
};

static void amdvi_pci_class_init(ObjectClass *klass, void *data)
{
    DeviceClass *dc = DEVICE_CLASS(klass);

    set_bit(DEVICE_CATEGORY_MISC, dc->categories);
    dc->desc = "AMD IOMMU (AMD-Vi) DMA Remapping device";
}

static const TypeInfo amdvi_pci = {
    .name = TYPE_AMD_IOMMU_PCI,
    .parent = TYPE_PCI_DEVICE,
    .instance_size = sizeof(AMDVIPCIState),
    .class_init = amdvi_pci_class_init,
    .interfaces = (InterfaceInfo[]) {
        { INTERFACE_CONVENTIONAL_PCI_DEVICE },
        { },
    },
};

static void amdvi_iommu_memory_region_class_init(ObjectClass *klass, void *data)
{
    IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_CLASS(klass);

    imrc->translate = amdvi_translate;
    imrc->notify_flag_changed = amdvi_iommu_notify_flag_changed;
}

static const TypeInfo amdvi_iommu_memory_region_info = {
    .parent = TYPE_IOMMU_MEMORY_REGION,
    .name = TYPE_AMD_IOMMU_MEMORY_REGION,
    .class_init = amdvi_iommu_memory_region_class_init,
};

static void amdvi_register_types(void)
{
    type_register_static(&amdvi_pci);
    type_register_static(&amdvi_sysbus);
    type_register_static(&amdvi_iommu_memory_region_info);
}

type_init(amdvi_register_types);