qemu/exec.c
   1/*
   2 *  Virtual page mapping
   3 *
   4 *  Copyright (c) 2003 Fabrice Bellard
   5 *
   6 * This library is free software; you can redistribute it and/or
   7 * modify it under the terms of the GNU Lesser General Public
   8 * License as published by the Free Software Foundation; either
   9 * version 2 of the License, or (at your option) any later version.
  10 *
  11 * This library is distributed in the hope that it will be useful,
  12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14 * Lesser General Public License for more details.
  15 *
  16 * You should have received a copy of the GNU Lesser General Public
  17 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
  18 */
  19#include "qemu/osdep.h"
  20#include "qapi/error.h"
  21#ifndef _WIN32
  22#endif
  23
  24#include "qemu/cutils.h"
  25#include "cpu.h"
  26#include "exec/exec-all.h"
  27#include "exec/target_page.h"
  28#include "tcg.h"
  29#include "hw/qdev-core.h"
  30#include "hw/qdev-properties.h"
  31#if !defined(CONFIG_USER_ONLY)
  32#include "hw/boards.h"
  33#include "hw/xen/xen.h"
  34#endif
  35#include "sysemu/kvm.h"
  36#include "sysemu/sysemu.h"
  37#include "qemu/timer.h"
  38#include "qemu/config-file.h"
  39#include "qemu/error-report.h"
  40#if defined(CONFIG_USER_ONLY)
  41#include "qemu.h"
  42#else /* !CONFIG_USER_ONLY */
  43#include "hw/hw.h"
  44#include "exec/memory.h"
  45#include "exec/ioport.h"
  46#include "sysemu/dma.h"
  47#include "sysemu/numa.h"
  48#include "sysemu/hw_accel.h"
  49#include "exec/address-spaces.h"
  50#include "sysemu/xen-mapcache.h"
  51#include "trace-root.h"
  52
  53#ifdef CONFIG_FALLOCATE_PUNCH_HOLE
  54#include <fcntl.h>
  55#include <linux/falloc.h>
  56#endif
  57
  58#endif
  59#include "qemu/rcu_queue.h"
  60#include "qemu/main-loop.h"
  61#include "translate-all.h"
  62#include "sysemu/replay.h"
  63
  64#include "exec/memory-internal.h"
  65#include "exec/ram_addr.h"
  66#include "exec/log.h"
  67
  68#include "migration/vmstate.h"
  69
  70#include "qemu/range.h"
  71#ifndef _WIN32
  72#include "qemu/mmap-alloc.h"
  73#endif
  74
  75#include "monitor/monitor.h"
  76
  77//#define DEBUG_SUBPAGE
  78
  79#if !defined(CONFIG_USER_ONLY)
  80/* ram_list is read under rcu_read_lock()/rcu_read_unlock().  Writes
  81 * are protected by the ramlist lock.
  82 */
  83RAMList ram_list = { .blocks = QLIST_HEAD_INITIALIZER(ram_list.blocks) };
  84
  85static MemoryRegion *system_memory;
  86static MemoryRegion *system_io;
  87
  88AddressSpace address_space_io;
  89AddressSpace address_space_memory;
  90
  91MemoryRegion io_mem_rom, io_mem_notdirty;
  92static MemoryRegion io_mem_unassigned;
  93
  94/* RAM is pre-allocated and passed into qemu_ram_alloc_from_ptr */
  95#define RAM_PREALLOC   (1 << 0)
  96
  97/* RAM is mmap-ed with MAP_SHARED */
  98#define RAM_SHARED     (1 << 1)
  99
 100/* Only a portion of RAM (used_length) is actually used, and migrated.
 101 * This used_length size can change across reboots.
 102 */
 103#define RAM_RESIZEABLE (1 << 2)
 104
 105/* RAM is backed by an mmapped file.
 106 */
 107#define RAM_FILE (1 << 3)
 108#endif
 109
 110#ifdef TARGET_PAGE_BITS_VARY
 111int target_page_bits;
 112bool target_page_bits_decided;
 113#endif
 114
 115struct CPUTailQ cpus = QTAILQ_HEAD_INITIALIZER(cpus);
 116/* current CPU in the current thread. It is only valid inside
 117   cpu_exec() */
 118__thread CPUState *current_cpu;
 119/* 0 = Do not count executed instructions.
 120   1 = Precise instruction counting.
 121   2 = Adaptive rate instruction counting.  */
 122int use_icount;
 123
 124uintptr_t qemu_host_page_size;
 125intptr_t qemu_host_page_mask;
 126
 127bool set_preferred_target_page_bits(int bits)
 128{
 129    /* The target page size is the lowest common denominator for all
 130     * the CPUs in the system, so we can only make it smaller, never
 131     * larger. And we can't make it smaller once we've committed to
 132     * a particular size.
 133     */
 134#ifdef TARGET_PAGE_BITS_VARY
 135    assert(bits >= TARGET_PAGE_BITS_MIN);
 136    if (target_page_bits == 0 || target_page_bits > bits) {
 137        if (target_page_bits_decided) {
 138            return false;
 139        }
 140        target_page_bits = bits;
 141    }
 142#endif
 143    return true;
 144}
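
/*
 * Worked example (illustrative only, not part of the build): how the
 * "shrink, never grow" rule above behaves on a target compiled with
 * TARGET_PAGE_BITS_VARY.  The page-size values are hypothetical.
 */
#if 0
static void example_target_page_bits(void)
{
    set_preferred_target_page_bits(16); /* first CPU: 64K pages  -> 16       */
    set_preferred_target_page_bits(12); /* second CPU: 4K pages  -> 12       */
    finalize_target_page_bits();        /* committed: TARGET_PAGE_BITS == 12 */
    set_preferred_target_page_bits(16); /* larger: no change, returns true   */
    set_preferred_target_page_bits(10); /* smaller after commit: false       */
}
#endif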
 145
 146#if !defined(CONFIG_USER_ONLY)
 147
 148static void finalize_target_page_bits(void)
 149{
 150#ifdef TARGET_PAGE_BITS_VARY
 151    if (target_page_bits == 0) {
 152        target_page_bits = TARGET_PAGE_BITS_MIN;
 153    }
 154    target_page_bits_decided = true;
 155#endif
 156}
 157
 158typedef struct PhysPageEntry PhysPageEntry;
 159
 160struct PhysPageEntry {
 161    /* How many bits to skip to the next level (in units of L2_SIZE). 0 for a leaf. */
 162    uint32_t skip : 6;
 163    /* index into phys_sections (!skip) or phys_map_nodes (skip) */
 164    uint32_t ptr : 26;
 165};
 166
 167#define PHYS_MAP_NODE_NIL (((uint32_t)~0) >> 6)
 168
 169/* Size of the L2 (and L3, etc) page tables.  */
 170#define ADDR_SPACE_BITS 64
 171
 172#define P_L2_BITS 9
 173#define P_L2_SIZE (1 << P_L2_BITS)
 174
 175#define P_L2_LEVELS (((ADDR_SPACE_BITS - TARGET_PAGE_BITS - 1) / P_L2_BITS) + 1)
 176
 177typedef PhysPageEntry Node[P_L2_SIZE];
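
/*
 * Worked example (for exposition only): with ADDR_SPACE_BITS = 64,
 * P_L2_BITS = 9 and, say, TARGET_PAGE_BITS = 12, the formula above gives
 * P_L2_LEVELS = ((64 - 12 - 1) / 9) + 1 = (51 / 9) + 1 = 6.  The 52-bit
 * page index is then consumed in six 9-bit chunks, the topmost chunk only
 * partially used, exactly as phys_page_find() does with
 * (index >> (i * P_L2_BITS)) & (P_L2_SIZE - 1).
 */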
 178
 179typedef struct PhysPageMap {
 180    struct rcu_head rcu;
 181
 182    unsigned sections_nb;
 183    unsigned sections_nb_alloc;
 184    unsigned nodes_nb;
 185    unsigned nodes_nb_alloc;
 186    Node *nodes;
 187    MemoryRegionSection *sections;
 188} PhysPageMap;
 189
 190struct AddressSpaceDispatch {
 191    MemoryRegionSection *mru_section;
 192    /* This is a multi-level map on the physical address space.
 193     * The bottom level has pointers to MemoryRegionSections.
 194     */
 195    PhysPageEntry phys_map;
 196    PhysPageMap map;
 197};
 198
 199#define SUBPAGE_IDX(addr) ((addr) & ~TARGET_PAGE_MASK)
 200typedef struct subpage_t {
 201    MemoryRegion iomem;
 202    FlatView *fv;
 203    hwaddr base;
 204    uint16_t sub_section[];
 205} subpage_t;
 206
 207#define PHYS_SECTION_UNASSIGNED 0
 208#define PHYS_SECTION_NOTDIRTY 1
 209#define PHYS_SECTION_ROM 2
 210#define PHYS_SECTION_WATCH 3
 211
 212static void io_mem_init(void);
 213static void memory_map_init(void);
 214static void tcg_commit(MemoryListener *listener);
 215
 216static MemoryRegion io_mem_watch;
 217
 218/**
 219 * CPUAddressSpace: all the information a CPU needs about an AddressSpace
 220 * @cpu: the CPU whose AddressSpace this is
 221 * @as: the AddressSpace itself
 222 * @memory_dispatch: its dispatch pointer (cached, RCU protected)
 223 * @tcg_as_listener: listener for tracking changes to the AddressSpace
 224 */
 225struct CPUAddressSpace {
 226    CPUState *cpu;
 227    AddressSpace *as;
 228    struct AddressSpaceDispatch *memory_dispatch;
 229    MemoryListener tcg_as_listener;
 230};
 231
 232struct DirtyBitmapSnapshot {
 233    ram_addr_t start;
 234    ram_addr_t end;
 235    unsigned long dirty[];
 236};
 237
 238#endif
 239
 240#if !defined(CONFIG_USER_ONLY)
 241
 242static void phys_map_node_reserve(PhysPageMap *map, unsigned nodes)
 243{
 244    static unsigned alloc_hint = 16;
 245    if (map->nodes_nb + nodes > map->nodes_nb_alloc) {
 246        map->nodes_nb_alloc = MAX(map->nodes_nb_alloc, alloc_hint);
 247        map->nodes_nb_alloc = MAX(map->nodes_nb_alloc, map->nodes_nb + nodes);
 248        map->nodes = g_renew(Node, map->nodes, map->nodes_nb_alloc);
 249        alloc_hint = map->nodes_nb_alloc;
 250    }
 251}
 252
 253static uint32_t phys_map_node_alloc(PhysPageMap *map, bool leaf)
 254{
 255    unsigned i;
 256    uint32_t ret;
 257    PhysPageEntry e;
 258    PhysPageEntry *p;
 259
 260    ret = map->nodes_nb++;
 261    p = map->nodes[ret];
 262    assert(ret != PHYS_MAP_NODE_NIL);
 263    assert(ret != map->nodes_nb_alloc);
 264
 265    e.skip = leaf ? 0 : 1;
 266    e.ptr = leaf ? PHYS_SECTION_UNASSIGNED : PHYS_MAP_NODE_NIL;
 267    for (i = 0; i < P_L2_SIZE; ++i) {
 268        memcpy(&p[i], &e, sizeof(e));
 269    }
 270    return ret;
 271}
 272
 273static void phys_page_set_level(PhysPageMap *map, PhysPageEntry *lp,
 274                                hwaddr *index, hwaddr *nb, uint16_t leaf,
 275                                int level)
 276{
 277    PhysPageEntry *p;
 278    hwaddr step = (hwaddr)1 << (level * P_L2_BITS);
 279
 280    if (lp->skip && lp->ptr == PHYS_MAP_NODE_NIL) {
 281        lp->ptr = phys_map_node_alloc(map, level == 0);
 282    }
 283    p = map->nodes[lp->ptr];
 284    lp = &p[(*index >> (level * P_L2_BITS)) & (P_L2_SIZE - 1)];
 285
 286    while (*nb && lp < &p[P_L2_SIZE]) {
 287        if ((*index & (step - 1)) == 0 && *nb >= step) {
 288            lp->skip = 0;
 289            lp->ptr = leaf;
 290            *index += step;
 291            *nb -= step;
 292        } else {
 293            phys_page_set_level(map, lp, index, nb, leaf, level - 1);
 294        }
 295        ++lp;
 296    }
 297}
 298
 299static void phys_page_set(AddressSpaceDispatch *d,
 300                          hwaddr index, hwaddr nb,
 301                          uint16_t leaf)
 302{
 303    /* Wildly overreserve - it doesn't matter much. */
 304    phys_map_node_reserve(&d->map, 3 * P_L2_LEVELS);
 305
 306    phys_page_set_level(&d->map, &d->phys_map, &index, &nb, leaf, P_L2_LEVELS - 1);
 307}
 308
 309/* Compact a non-leaf page entry. Detect that the entry has a single child,
 310 * and update our entry so lookups can skip it and go directly to the destination.
 311 */
 312static void phys_page_compact(PhysPageEntry *lp, Node *nodes)
 313{
 314    unsigned valid_ptr = P_L2_SIZE;
 315    int valid = 0;
 316    PhysPageEntry *p;
 317    int i;
 318
 319    if (lp->ptr == PHYS_MAP_NODE_NIL) {
 320        return;
 321    }
 322
 323    p = nodes[lp->ptr];
 324    for (i = 0; i < P_L2_SIZE; i++) {
 325        if (p[i].ptr == PHYS_MAP_NODE_NIL) {
 326            continue;
 327        }
 328
 329        valid_ptr = i;
 330        valid++;
 331        if (p[i].skip) {
 332            phys_page_compact(&p[i], nodes);
 333        }
 334    }
 335
 336    /* We can only compress if there's only one child. */
 337    if (valid != 1) {
 338        return;
 339    }
 340
 341    assert(valid_ptr < P_L2_SIZE);
 342
 343    /* Don't compress if it won't fit in the # of bits we have. */
 344    if (lp->skip + p[valid_ptr].skip >= (1 << 3)) {
 345        return;
 346    }
 347
 348    lp->ptr = p[valid_ptr].ptr;
 349    if (!p[valid_ptr].skip) {
 350        /* If our only child is a leaf, make this a leaf. */
 351        /* By design, we should have made this node a leaf to begin with so we
 352         * should never reach here.
 353         * But since it's so simple to handle this, let's do it just in case we
 354         * change this rule.
 355         */
 356        lp->skip = 0;
 357    } else {
 358        lp->skip += p[valid_ptr].skip;
 359    }
 360}
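
/*
 * Worked example (for exposition only): suppose an entry with skip = 1
 * points to a node whose only valid child itself has skip = 2.  After
 * compaction the entry inherits the child's ptr and its skip becomes
 * 1 + 2 = 3, so a later phys_page_find() walk descends those three levels
 * with a single node dereference instead of two.
 */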
 361
 362void address_space_dispatch_compact(AddressSpaceDispatch *d)
 363{
 364    if (d->phys_map.skip) {
 365        phys_page_compact(&d->phys_map, d->map.nodes);
 366    }
 367}
 368
 369static inline bool section_covers_addr(const MemoryRegionSection *section,
 370                                       hwaddr addr)
 371{
 372    /* Memory topology clips a memory region to [0, 2^64); size.hi > 0 means
 373     * the section must cover the entire address space.
 374     */
 375    return int128_gethi(section->size) ||
 376           range_covers_byte(section->offset_within_address_space,
 377                             int128_getlo(section->size), addr);
 378}
 379
 380static MemoryRegionSection *phys_page_find(AddressSpaceDispatch *d, hwaddr addr)
 381{
 382    PhysPageEntry lp = d->phys_map, *p;
 383    Node *nodes = d->map.nodes;
 384    MemoryRegionSection *sections = d->map.sections;
 385    hwaddr index = addr >> TARGET_PAGE_BITS;
 386    int i;
 387
 388    for (i = P_L2_LEVELS; lp.skip && (i -= lp.skip) >= 0;) {
 389        if (lp.ptr == PHYS_MAP_NODE_NIL) {
 390            return &sections[PHYS_SECTION_UNASSIGNED];
 391        }
 392        p = nodes[lp.ptr];
 393        lp = p[(index >> (i * P_L2_BITS)) & (P_L2_SIZE - 1)];
 394    }
 395
 396    if (section_covers_addr(&sections[lp.ptr], addr)) {
 397        return &sections[lp.ptr];
 398    } else {
 399        return &sections[PHYS_SECTION_UNASSIGNED];
 400    }
 401}
 402
 403bool memory_region_is_unassigned(MemoryRegion *mr)
 404{
 405    return mr != &io_mem_rom && mr != &io_mem_notdirty && !mr->rom_device
 406        && mr != &io_mem_watch;
 407}
 408
 409/* Called from RCU critical section */
 410static MemoryRegionSection *address_space_lookup_region(AddressSpaceDispatch *d,
 411                                                        hwaddr addr,
 412                                                        bool resolve_subpage)
 413{
 414    MemoryRegionSection *section = atomic_read(&d->mru_section);
 415    subpage_t *subpage;
 416
 417    if (!section || section == &d->map.sections[PHYS_SECTION_UNASSIGNED] ||
 418        !section_covers_addr(section, addr)) {
 419        section = phys_page_find(d, addr);
 420        atomic_set(&d->mru_section, section);
 421    }
 422    if (resolve_subpage && section->mr->subpage) {
 423        subpage = container_of(section->mr, subpage_t, iomem);
 424        section = &d->map.sections[subpage->sub_section[SUBPAGE_IDX(addr)]];
 425    }
 426    return section;
 427}
 428
 429/* Called from RCU critical section */
 430static MemoryRegionSection *
 431address_space_translate_internal(AddressSpaceDispatch *d, hwaddr addr, hwaddr *xlat,
 432                                 hwaddr *plen, bool resolve_subpage)
 433{
 434    MemoryRegionSection *section;
 435    MemoryRegion *mr;
 436    Int128 diff;
 437
 438    section = address_space_lookup_region(d, addr, resolve_subpage);
 439    /* Compute offset within MemoryRegionSection */
 440    addr -= section->offset_within_address_space;
 441
 442    /* Compute offset within MemoryRegion */
 443    *xlat = addr + section->offset_within_region;
 444
 445    mr = section->mr;
 446
 447    /* MMIO registers can be expected to perform full-width accesses based only
 448     * on their address, without considering adjacent registers that could
 449     * decode to completely different MemoryRegions.  When such registers
 450     * exist (e.g. I/O ports 0xcf8 and 0xcf9 on most PC chipsets), MMIO
 451     * regions overlap wildly.  For this reason we cannot clamp the accesses
 452     * here.
 453     *
 454     * If the length is small (as is the case for address_space_ldl/stl),
 455     * everything works fine.  If the incoming length is large, however,
 456     * the caller really has to do the clamping through memory_access_size.
 457     */
 458    if (memory_region_is_ram(mr)) {
 459        diff = int128_sub(section->size, int128_make64(addr));
 460        *plen = int128_get64(int128_min(diff, int128_make64(*plen)));
 461    }
 462    return section;
 463}
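
/*
 * Worked example (for exposition only): for a RAM section of 0x10000 bytes
 * and a lookup at offset 0xf000 into it with an incoming *plen of 0x4000,
 * diff = 0x10000 - 0xf000 = 0x1000, so *plen is clamped to 0x1000 and the
 * caller must translate again for the remaining 0x3000 bytes.  For non-RAM
 * (MMIO) sections no clamping happens here, as explained above, and large
 * accesses are split later via memory_access_size().
 */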
 464
 465/**
 466 * flatview_do_translate - translate an address in FlatView
 467 *
 468 * @fv: the flat view that we want to translate on
 469 * @addr: the address to be translated in above address space
 470 * @xlat: the translated address offset within the memory region. It
 471 *        cannot be @NULL.
 472 * @plen_out: valid read/write length of the translated address. It
 473 *            can be @NULL when we don't care about it.
 474 * @page_mask_out: page mask for the translated address. This is only
 475 *            meaningful for IOMMU-translated addresses, where huge
 476 *            pages may be mapped and the mask reflects their size.
 477 *            It can be @NULL if we don't care about it.
 478 * @is_write: whether the translation operation is for write
 479 * @is_mmio: whether this can be MMIO, set true if it can
 480 *
 481 * This function is called from RCU critical section
 482 */
 483static MemoryRegionSection flatview_do_translate(FlatView *fv,
 484                                                 hwaddr addr,
 485                                                 hwaddr *xlat,
 486                                                 hwaddr *plen_out,
 487                                                 hwaddr *page_mask_out,
 488                                                 bool is_write,
 489                                                 bool is_mmio,
 490                                                 AddressSpace **target_as,
 491                                                 MemTxAttrs *attr)
 492{
 493    IOMMUTLBEntry iotlb;
 494    MemoryRegionSection *section;
 495    IOMMUMemoryRegion *iommu_mr;
 496    IOMMUMemoryRegionClass *imrc;
 497    hwaddr page_mask = (hwaddr)(-1);
 498    hwaddr plen = (hwaddr)(-1);
 499
 500    if (plen_out) {
 501        plen = *plen_out;
 502    }
 503
 504    for (;;) {
 505        section = address_space_translate_internal(
 506                flatview_to_dispatch(fv), addr, &addr,
 507                &plen, is_mmio);
 508
 509        iommu_mr = memory_region_get_iommu(section->mr);
 510        if (!iommu_mr) {
 511            break;
 512        }
 513        imrc = memory_region_get_iommu_class_nocheck(iommu_mr);
 514
 515        if (imrc->translate_attr) {
 516            iotlb = imrc->translate_attr(iommu_mr, addr, is_write, attr);
 517        } else {
 518            iotlb = imrc->translate(iommu_mr, addr, is_write ?
 519                                    IOMMU_WO : IOMMU_RO);
 520        }
 521        addr = ((iotlb.translated_addr & ~iotlb.addr_mask)
 522                | (addr & iotlb.addr_mask));
 523        page_mask &= iotlb.addr_mask;
 524        plen = MIN(plen, (addr | iotlb.addr_mask) - addr + 1);
 525        if (!(iotlb.perm & (1 << is_write))) {
 526            goto translate_fail;
 527        }
 528
 529        fv = address_space_to_flatview(iotlb.target_as);
 530        *target_as = iotlb.target_as;
 531    }
 532
 533    *xlat = addr;
 534
 535    if (page_mask == (hwaddr)(-1)) {
 536        /* Not behind an IOMMU, use default page size. */
 537        page_mask = ~TARGET_PAGE_MASK;
 538    }
 539
 540    if (page_mask_out) {
 541        *page_mask_out = page_mask;
 542    }
 543
 544    if (plen_out) {
 545        *plen_out = plen;
 546    }
 547
 548    return *section;
 549
 550translate_fail:
 551    return (MemoryRegionSection) { .mr = &io_mem_unassigned };
 552}
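
/*
 * Worked example (for exposition only): assume the IOMMU maps a 2MB page,
 * so iotlb.addr_mask = 0x1fffff, and it translates an input addr of
 * 0x40012345 to a page at 0x80200000.  The recombination above yields
 *     addr = (0x80200000 & ~0x1fffff) | (0x40012345 & 0x1fffff)
 *          = 0x80200000 | 0x00012345 = 0x80212345,
 * page_mask becomes 0x1fffff, and plen is clamped so the access cannot
 * run past the end of that 2MB page.
 */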
 553
 554/* Called from RCU critical section */
 555IOMMUTLBEntry address_space_get_iotlb_entry(AddressSpace *as, hwaddr addr,
 556                                            bool is_write)
 557{
 558    MemoryRegionSection section;
 559    hwaddr xlat, page_mask;
 560
 561    /*
 562     * This can never be MMIO; we don't really care about plen here,
 563     * only about the page mask.
 564     */
 565    section = flatview_do_translate(address_space_to_flatview(as), addr, &xlat,
 566                                    NULL, &page_mask, is_write, false, &as,
 567                                    &MEMTXATTRS_UNSPECIFIED);
 568
 569    /* Illegal translation */
 570    if (section.mr == &io_mem_unassigned) {
 571        goto iotlb_fail;
 572    }
 573
 574    /* Convert memory region offset into address space offset */
 575    xlat += section.offset_within_address_space -
 576        section.offset_within_region;
 577
 578    return (IOMMUTLBEntry) {
 579        .target_as = as,
 580        .iova = addr & ~page_mask,
 581        .translated_addr = xlat & ~page_mask,
 582        .addr_mask = page_mask,
 583        /* IOTLBs are for DMA, and DMA is only allowed to RAM. */
 584        .perm = IOMMU_RW,
 585    };
 586
 587iotlb_fail:
 588    return (IOMMUTLBEntry) {0};
 589}
 590
 591/* Called from RCU critical section */
 592MemoryRegion *flatview_translate(FlatView *fv, hwaddr addr, hwaddr *xlat,
 593                                 hwaddr *plen, bool is_write, MemTxAttrs *attr)
 594{
 595    MemoryRegion *mr;
 596    MemoryRegionSection section;
 597    AddressSpace *as = NULL;
 598
 599    /* This can be MMIO, so setup MMIO bit. */
 600    section = flatview_do_translate(fv, addr, xlat, plen, NULL,
 601                                    is_write, true, &as, attr);
 602    mr = section.mr;
 603
 604    if (xen_enabled() && memory_access_is_direct(mr, is_write)) {
 605        hwaddr page = ((addr & TARGET_PAGE_MASK) + TARGET_PAGE_SIZE) - addr;
 606        *plen = MIN(page, *plen);
 607    }
 608
 609    return mr;
 610}
 611
 612/* Called from RCU critical section */
 613MemoryRegionSection *
 614address_space_translate_for_iotlb(CPUState *cpu, int asidx, hwaddr addr,
 615                                  hwaddr *xlat, hwaddr *plen, int *prot,
 616                                  MemTxAttrs *attr)
 617{
 618    MemoryRegionSection *section;
 619    IOMMUMemoryRegion *iommu_mr;
 620    IOMMUMemoryRegionClass *imrc;
 621    AddressSpace *as = cpu->cpu_ases[asidx].as;
 622
 623    IOMMUTLBEntry iotlb;
 624    struct {
 625        MemoryRegionSection *section;
 626        hwaddr addr;
 627        hwaddr len;
 628    } root =  { .section = NULL, .addr = addr};
 629    AddressSpace *orig_as = as;
 630    hwaddr len = *plen;
 631
 632    assert(prot);
 633
 634    for (;;) {
 635        /* Xilinx: We want to ensure the AddressSpaceDispatch is updated on
 636         * each loop, to keep the XMPU happy.
 637         */
 638        FlatView *fv = address_space_to_flatview(as);
 639        AddressSpaceDispatch *d = flatview_to_dispatch(fv);
 640        section = address_space_translate_internal(d, addr, &addr, plen, false);
 641
 642        iommu_mr = memory_region_get_iommu(section->mr);
 643        if (!iommu_mr) {
 644            break;
 645        }
 646        imrc = memory_region_get_iommu_class_nocheck(iommu_mr);
 647
 648        /* FIXME: these are not necessarily accesses, so is_write doesn't make
 649           sense!  */
 650        if (imrc->translate_attr) {
 651            iotlb = imrc->translate_attr(iommu_mr, addr, false, attr);
 652        } else {
 653            iotlb = imrc->translate(iommu_mr, addr, false);
 654        }
 655        addr = ((iotlb.translated_addr & ~iotlb.addr_mask)
 656                | (addr & iotlb.addr_mask));
 657        len = MIN(len, (addr | iotlb.addr_mask) - addr + 1);
 658        as = iotlb.target_as;
 659
 660        if (!root.section && orig_as != as) {
 661            root.section = section;
 662            root.len = *plen;
 663        }
 664    }
 665
 666    *plen = len;
 667    *xlat = addr;
 668
 669    /* If the IOMMU translated addr into IO in a different AS, refer to
 670     * the IOMMU itself and do a slow translated access at access time.
 671     * TODO: If the iotlb could record dst AS, this wouldn't be needed.
 672     */
 673    if (!memory_region_is_ram(section->mr) && as != orig_as) {
 674        *plen = root.len;
 675        *xlat = root.addr;
 676        section = root.section;
 677    }
 678    /* qemu_log("as=%s mr=%p addr=%lx len=%lx\n", as->name, section->mr, *xlat, *plen); */
 679
 680    return section;
 681}
 682#endif
 683
 684#if !defined(CONFIG_USER_ONLY)
 685
 686static int cpu_common_post_load(void *opaque, int version_id)
 687{
 688    CPUState *cpu = opaque;
 689
 690    /* 0x01 was CPU_INTERRUPT_EXIT. This line can be removed when the
 691       version_id is increased. */
 692    cpu->interrupt_request &= ~0x01;
 693    tlb_flush(cpu);
 694
 695    return 0;
 696}
 697
 698static int cpu_common_pre_load(void *opaque)
 699{
 700    CPUState *cpu = opaque;
 701
 702    cpu->exception_index = -1;
 703
 704    return 0;
 705}
 706
 707static bool cpu_common_exception_index_needed(void *opaque)
 708{
 709    CPUState *cpu = opaque;
 710
 711    return tcg_enabled() && cpu->exception_index != -1;
 712}
 713
 714static const VMStateDescription vmstate_cpu_common_exception_index = {
 715    .name = "cpu_common/exception_index",
 716    .version_id = 1,
 717    .minimum_version_id = 1,
 718    .needed = cpu_common_exception_index_needed,
 719    .fields = (VMStateField[]) {
 720        VMSTATE_INT32(exception_index, CPUState),
 721        VMSTATE_END_OF_LIST()
 722    }
 723};
 724
 725static bool cpu_common_crash_occurred_needed(void *opaque)
 726{
 727    CPUState *cpu = opaque;
 728
 729    return cpu->crash_occurred;
 730}
 731
 732static const VMStateDescription vmstate_cpu_common_crash_occurred = {
 733    .name = "cpu_common/crash_occurred",
 734    .version_id = 1,
 735    .minimum_version_id = 1,
 736    .needed = cpu_common_crash_occurred_needed,
 737    .fields = (VMStateField[]) {
 738        VMSTATE_BOOL(crash_occurred, CPUState),
 739        VMSTATE_END_OF_LIST()
 740    }
 741};
 742
 743const VMStateDescription vmstate_cpu_common = {
 744    .name = "cpu_common",
 745    .version_id = 1,
 746    .minimum_version_id = 1,
 747    .pre_load = cpu_common_pre_load,
 748    .post_load = cpu_common_post_load,
 749    .fields = (VMStateField[]) {
 750        VMSTATE_UINT32(halted, CPUState),
 751        VMSTATE_UINT32(interrupt_request, CPUState),
 752        VMSTATE_END_OF_LIST()
 753    },
 754    .subsections = (const VMStateDescription*[]) {
 755        &vmstate_cpu_common_exception_index,
 756        &vmstate_cpu_common_crash_occurred,
 757        NULL
 758    }
 759};
 760
 761#endif
 762
 763CPUState *qemu_get_cpu(int index)
 764{
 765    CPUState *cpu;
 766
 767    CPU_FOREACH(cpu) {
 768        if (cpu->cpu_index == index) {
 769            return cpu;
 770        }
 771    }
 772
 773    return NULL;
 774}
 775
 776#if !defined(CONFIG_USER_ONLY)
 777void cpu_address_space_init(CPUState *cpu, AddressSpace *as, int asidx)
 778{
 779    CPUAddressSpace *newas;
 780
 781    /* Target code should have set num_ases before calling us */
 782    assert(asidx < cpu->num_ases);
 783
 784    if (asidx == 0) {
 785        /* address space 0 gets the convenience alias */
 786        cpu->as = as;
 787    }
 788
 789    /* KVM cannot currently support multiple address spaces. */
 790    assert(asidx == 0 || !kvm_enabled());
 791
 792    if (!cpu->cpu_ases) {
 793        cpu->cpu_ases = g_new0(CPUAddressSpace, cpu->num_ases);
 794    }
 795
 796    newas = &cpu->cpu_ases[asidx];
 797    newas->cpu = cpu;
 798    newas->as = as;
 799    if (tcg_enabled()) {
 800        newas->tcg_as_listener.commit = tcg_commit;
 801        memory_listener_register(&newas->tcg_as_listener, as);
 802    }
 803}
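
/*
 * Illustrative sketch (not part of the build): how a target with two
 * address spaces might wire them up at CPU realize time.  The AddressSpace
 * parameters are hypothetical; the calls follow the API above.
 */
#if 0
static void example_init_two_address_spaces(CPUState *cpu,
                                            AddressSpace *as_nonsecure,
                                            AddressSpace *as_secure)
{
    cpu->num_ases = 2;                             /* set before the first call */
    cpu_address_space_init(cpu, as_nonsecure, 0);  /* also becomes cpu->as      */
    cpu_address_space_init(cpu, as_secure, 1);     /* KVM asserts asidx == 0    */
}
#endif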
 804
 805AddressSpace *cpu_get_address_space(CPUState *cpu, int asidx)
 806{
 807    /* Return the AddressSpace corresponding to the specified index */
 808    return cpu->cpu_ases[asidx].as;
 809}
 810#endif
 811
 812void cpu_exec_unrealizefn(CPUState *cpu)
 813{
 814    CPUClass *cc = CPU_GET_CLASS(cpu);
 815
 816    cpu_list_remove(cpu);
 817
 818    if (cc->vmsd != NULL) {
 819        vmstate_unregister(NULL, cc->vmsd, cpu);
 820    }
 821    if (qdev_get_vmsd(DEVICE(cpu)) == NULL) {
 822        vmstate_unregister(NULL, &vmstate_cpu_common, cpu);
 823    }
 824}
 825
 826Property cpu_common_props[] = {
 827#ifndef CONFIG_USER_ONLY
 828    /* Create a memory property for softmmu CPU object,
 829     * so users can wire up its memory. (This can't go in qom/cpu.c
 830     * because that file is compiled only once for both user-mode
 831     * and system builds.) The default if no link is set up is to use
 832     * the system address space.
 833     */
 834    DEFINE_PROP_LINK("memory", CPUState, memory, TYPE_MEMORY_REGION,
 835                     MemoryRegion *),
 836#endif
 837    DEFINE_PROP_STRING("gdb-id", CPUState, gdb_id),
 838    DEFINE_PROP_END_OF_LIST(),
 839};
 840
 841void cpu_exec_initfn(CPUState *cpu)
 842{
 843    cpu->as = NULL;
 844    cpu->num_ases = 0;
 845
 846#ifndef CONFIG_USER_ONLY
 847    cpu->thread_id = qemu_get_thread_id();
 848    cpu->memory = system_memory;
 849    object_ref(OBJECT(cpu->memory));
 850#endif
 851}
 852
 853void cpu_exec_realizefn(CPUState *cpu, Error **errp)
 854{
 855    CPUClass *cc = CPU_GET_CLASS(cpu);
 856    static bool tcg_target_initialized;
 857
 858    cpu_list_add(cpu);
 859
 860    if (tcg_enabled() && !tcg_target_initialized) {
 861        tcg_target_initialized = true;
 862        cc->tcg_initialize();
 863    }
 864
 865#ifndef CONFIG_USER_ONLY
 866    if (qdev_get_vmsd(DEVICE(cpu)) == NULL) {
 867        vmstate_register(NULL, cpu->cpu_index, &vmstate_cpu_common, cpu);
 868    }
 869    if (cc->vmsd != NULL) {
 870        vmstate_register(NULL, cpu->cpu_index, cc->vmsd, cpu);
 871    }
 872#endif
 873}
 874
 875#if defined(CONFIG_USER_ONLY)
 876static void breakpoint_invalidate(CPUState *cpu, target_ulong pc)
 877{
 878    mmap_lock();
 879    tb_lock();
 880    tb_invalidate_phys_page_range(pc, pc + 1, 0);
 881    tb_unlock();
 882    mmap_unlock();
 883}
 884#else
 885static void breakpoint_invalidate(CPUState *cpu, target_ulong pc)
 886{
 887    MemTxAttrs attrs;
 888    hwaddr phys = cpu_get_phys_page_attrs_debug(cpu, pc, &attrs);
 889    int asidx = cpu_asidx_from_attrs(cpu, attrs);
 890    if (phys != -1) {
 891        /* Locks grabbed by tb_invalidate_phys_addr */
 892        tb_invalidate_phys_addr(cpu->cpu_ases[asidx].as,
 893                                phys | (pc & ~TARGET_PAGE_MASK));
 894    }
 895}
 896#endif
 897
 898#if defined(CONFIG_USER_ONLY)
 899void cpu_watchpoint_remove_all(CPUState *cpu, int mask)
 900
 901{
 902}
 903
 904int cpu_watchpoint_remove(CPUState *cpu, vaddr addr, vaddr len,
 905                          int flags)
 906{
 907    return -ENOSYS;
 908}
 909
 910void cpu_watchpoint_remove_by_ref(CPUState *cpu, CPUWatchpoint *watchpoint)
 911{
 912}
 913
 914int cpu_watchpoint_insert(CPUState *cpu, vaddr addr, vaddr len,
 915                          int flags, CPUWatchpoint **watchpoint)
 916{
 917    return -ENOSYS;
 918}
 919#else
 920/* Add a watchpoint.  */
 921int cpu_watchpoint_insert(CPUState *cpu, vaddr addr, vaddr len,
 922                          int flags, CPUWatchpoint **watchpoint)
 923{
 924    CPUWatchpoint *wp;
 925
 926    /* forbid ranges which are empty or run off the end of the address space */
 927    if (len == 0 || (addr + len - 1) < addr) {
 928        error_report("tried to set invalid watchpoint at %"
 929                     VADDR_PRIx ", len=%" VADDR_PRIu, addr, len);
 930        return -EINVAL;
 931    }
 932    wp = g_malloc(sizeof(*wp));
 933
 934    wp->vaddr = addr;
 935    wp->len = len;
 936    wp->flags = flags;
 937
 938    /* keep all GDB-injected watchpoints in front */
 939    if (flags & BP_GDB) {
 940        QTAILQ_INSERT_HEAD(&cpu->watchpoints, wp, entry);
 941    } else {
 942        QTAILQ_INSERT_TAIL(&cpu->watchpoints, wp, entry);
 943    }
 944
 945    tlb_flush_page(cpu, addr);
 946
 947    if (watchpoint)
 948        *watchpoint = wp;
 949    return 0;
 950}
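
/*
 * Illustrative sketch (not part of the build): how a debug front end might
 * install and later drop a 4-byte write watchpoint with the API above.
 */
#if 0
static void example_write_watchpoint(CPUState *cpu, vaddr addr)
{
    CPUWatchpoint *wp;

    if (cpu_watchpoint_insert(cpu, addr, 4, BP_MEM_WRITE | BP_GDB, &wp) == 0) {
        /* ... run the guest; accesses to this page now take the slow path
         * and are checked against the watchpoint list ... */
        cpu_watchpoint_remove_by_ref(cpu, wp);
    }
}
#endif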
 951
 952/* Remove a specific watchpoint.  */
 953int cpu_watchpoint_remove(CPUState *cpu, vaddr addr, vaddr len,
 954                          int flags)
 955{
 956    CPUWatchpoint *wp;
 957
 958    QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) {
 959        if (addr == wp->vaddr && len == wp->len
 960                && flags == (wp->flags & ~BP_WATCHPOINT_HIT)) {
 961            cpu_watchpoint_remove_by_ref(cpu, wp);
 962            return 0;
 963        }
 964    }
 965    return -ENOENT;
 966}
 967
 968/* Remove a specific watchpoint by reference.  */
 969void cpu_watchpoint_remove_by_ref(CPUState *cpu, CPUWatchpoint *watchpoint)
 970{
 971    QTAILQ_REMOVE(&cpu->watchpoints, watchpoint, entry);
 972
 973    tlb_flush_page(cpu, watchpoint->vaddr);
 974
 975    g_free(watchpoint);
 976}
 977
 978/* Remove all matching watchpoints.  */
 979void cpu_watchpoint_remove_all(CPUState *cpu, int mask)
 980{
 981    CPUWatchpoint *wp, *next;
 982
 983    QTAILQ_FOREACH_SAFE(wp, &cpu->watchpoints, entry, next) {
 984        if (wp->flags & mask) {
 985            cpu_watchpoint_remove_by_ref(cpu, wp);
 986        }
 987    }
 988}
 989
 990/* Return true if this watchpoint address matches the specified
 991 * access (ie the address range covered by the watchpoint overlaps
 992 * partially or completely with the address range covered by the
 993 * access).
 994 */
 995static inline bool cpu_watchpoint_address_matches(CPUWatchpoint *wp,
 996                                                  vaddr addr,
 997                                                  vaddr len)
 998{
 999    /* We know the lengths are non-zero, but a little caution is
1000     * required to avoid errors in the case where the range ends
1001     * exactly at the top of the address space and so addr + len
1002     * wraps round to zero.
1003     */
1004    vaddr wpend = wp->vaddr + wp->len - 1;
1005    vaddr addrend = addr + len - 1;
1006
1007    return !(addr > wpend || wp->vaddr > addrend);
1008}
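
/*
 * Worked example (for exposition only): a watchpoint at the very top of the
 * address space, vaddr 0xfffffffffffffffc with len 4, gives
 * wpend = 0xffffffffffffffff; computing vaddr + len directly would wrap to
 * 0.  With inclusive ends, an access at 0xfffffffffffffffe of length 2 has
 * addrend = 0xffffffffffffffff, neither start exceeds the other range's
 * end, and the ranges correctly report an overlap.
 */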
1009
1010#endif
1011
1012/* Add a breakpoint.  */
1013int cpu_breakpoint_insert(CPUState *cpu, vaddr pc, int flags,
1014                          CPUBreakpoint **breakpoint)
1015{
1016    CPUBreakpoint *bp;
1017
1018    bp = g_malloc(sizeof(*bp));
1019
1020    bp->pc = pc;
1021    bp->flags = flags;
1022
1023    /* keep all GDB-injected breakpoints in front */
1024    if (flags & BP_GDB) {
1025        QTAILQ_INSERT_HEAD(&cpu->breakpoints, bp, entry);
1026    } else {
1027        QTAILQ_INSERT_TAIL(&cpu->breakpoints, bp, entry);
1028    }
1029
1030    breakpoint_invalidate(cpu, pc);
1031
1032    if (breakpoint) {
1033        *breakpoint = bp;
1034    }
1035    return 0;
1036}
1037
1038/* Remove a specific breakpoint.  */
1039int cpu_breakpoint_remove(CPUState *cpu, vaddr pc, int flags)
1040{
1041    CPUBreakpoint *bp;
1042
1043    QTAILQ_FOREACH(bp, &cpu->breakpoints, entry) {
1044        if (bp->pc == pc && bp->flags == flags) {
1045            cpu_breakpoint_remove_by_ref(cpu, bp);
1046            return 0;
1047        }
1048    }
1049    return -ENOENT;
1050}
1051
1052/* Remove a specific breakpoint by reference.  */
1053void cpu_breakpoint_remove_by_ref(CPUState *cpu, CPUBreakpoint *breakpoint)
1054{
1055    QTAILQ_REMOVE(&cpu->breakpoints, breakpoint, entry);
1056
1057    breakpoint_invalidate(cpu, breakpoint->pc);
1058
1059    g_free(breakpoint);
1060}
1061
1062/* Remove all matching breakpoints. */
1063void cpu_breakpoint_remove_all(CPUState *cpu, int mask)
1064{
1065    CPUBreakpoint *bp, *next;
1066
1067    QTAILQ_FOREACH_SAFE(bp, &cpu->breakpoints, entry, next) {
1068        if (bp->flags & mask) {
1069            cpu_breakpoint_remove_by_ref(cpu, bp);
1070        }
1071    }
1072}
1073
1074/* enable or disable single step mode. EXCP_DEBUG is returned by the
1075   CPU loop after each instruction */
1076void cpu_single_step(CPUState *cpu, int enabled)
1077{
1078    if (cpu->singlestep_enabled != enabled) {
1079        cpu->singlestep_enabled = enabled;
1080        if (kvm_enabled()) {
1081            kvm_update_guest_debug(cpu, 0);
1082        } else {
1083            /* must flush all the translated code to avoid inconsistencies */
1084            /* XXX: only flush what is necessary */
1085            tb_flush(cpu);
1086        }
1087    }
1088}
1089
1090void cpu_abort(CPUState *cpu, const char *fmt, ...)
1091{
1092    va_list ap;
1093    va_list ap2;
1094
1095    va_start(ap, fmt);
1096    va_copy(ap2, ap);
1097    fprintf(stderr, "qemu: fatal: ");
1098    vfprintf(stderr, fmt, ap);
1099    fprintf(stderr, "\n");
1100    cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_FPU | CPU_DUMP_CCOP);
1101    if (qemu_log_separate()) {
1102        qemu_log_lock();
1103        qemu_log("qemu: fatal: ");
1104        qemu_log_vprintf(fmt, ap2);
1105        qemu_log("\n");
1106        log_cpu_state(cpu, CPU_DUMP_FPU | CPU_DUMP_CCOP);
1107        qemu_log_flush();
1108        qemu_log_unlock();
1109        qemu_log_close();
1110    }
1111    va_end(ap2);
1112    va_end(ap);
1113    replay_finish();
1114#if defined(CONFIG_USER_ONLY)
1115    {
1116        struct sigaction act;
1117        sigfillset(&act.sa_mask);
1118        act.sa_handler = SIG_DFL;
1119        sigaction(SIGABRT, &act, NULL);
1120    }
1121#endif
1122    abort();
1123}
1124
1125#if !defined(CONFIG_USER_ONLY)
1126/* Called from RCU critical section */
1127static RAMBlock *qemu_get_ram_block(ram_addr_t addr)
1128{
1129    RAMBlock *block;
1130
1131    block = atomic_rcu_read(&ram_list.mru_block);
1132    if (block && addr - block->offset < block->max_length) {
1133        return block;
1134    }
1135    RAMBLOCK_FOREACH(block) {
1136        if (addr - block->offset < block->max_length) {
1137            goto found;
1138        }
1139    }
1140
1141    fprintf(stderr, "Bad ram offset %" PRIx64 "\n", (uint64_t)addr);
1142    abort();
1143
1144found:
1145    /* It is safe to write mru_block outside the iothread lock.  This
1146     * is what happens:
1147     *
1148     *     mru_block = xxx
1149     *     rcu_read_unlock()
1150     *                                        xxx removed from list
1151     *                  rcu_read_lock()
1152     *                  read mru_block
1153     *                                        mru_block = NULL;
1154     *                                        call_rcu(reclaim_ramblock, xxx);
1155     *                  rcu_read_unlock()
1156     *
1157     * atomic_rcu_set is not needed here.  The block was already published
1158     * when it was placed into the list.  Here we're just making an extra
1159     * copy of the pointer.
1160     */
1161    ram_list.mru_block = block;
1162    return block;
1163}
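
/*
 * Illustrative sketch (not part of the build): the reader-side pattern that
 * callers of qemu_get_ram_block() follow, per the comment above.  The block
 * and host pointer are only guaranteed valid inside the RCU read section.
 */
#if 0
static void example_touch_guest_byte(ram_addr_t addr, uint8_t val)
{
    RAMBlock *block;
    uint8_t *host;

    rcu_read_lock();
    block = qemu_get_ram_block(addr);
    host = ramblock_ptr(block, addr - block->offset);
    *host = val;                 /* use the mapping only while read-locked */
    rcu_read_unlock();
}
#endif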
1164
1165static void tlb_reset_dirty_range_all(ram_addr_t start, ram_addr_t length)
1166{
1167    CPUState *cpu;
1168    ram_addr_t start1;
1169    RAMBlock *block;
1170    ram_addr_t end;
1171
1172    end = TARGET_PAGE_ALIGN(start + length);
1173    start &= TARGET_PAGE_MASK;
1174
1175    rcu_read_lock();
1176    block = qemu_get_ram_block(start);
1177    assert(block == qemu_get_ram_block(end - 1));
1178    start1 = (uintptr_t)ramblock_ptr(block, start - block->offset);
1179    CPU_FOREACH(cpu) {
1180        tlb_reset_dirty(cpu, start1, length);
1181    }
1182    rcu_read_unlock();
1183}
1184
1185/* Note: start and end must be within the same ram block.  */
1186bool cpu_physical_memory_test_and_clear_dirty(ram_addr_t start,
1187                                              ram_addr_t length,
1188                                              unsigned client)
1189{
1190    DirtyMemoryBlocks *blocks;
1191    unsigned long end, page;
1192    bool dirty = false;
1193
1194    if (length == 0) {
1195        return false;
1196    }
1197
1198    end = TARGET_PAGE_ALIGN(start + length) >> TARGET_PAGE_BITS;
1199    page = start >> TARGET_PAGE_BITS;
1200
1201    rcu_read_lock();
1202
1203    blocks = atomic_rcu_read(&ram_list.dirty_memory[client]);
1204
1205    while (page < end) {
1206        unsigned long idx = page / DIRTY_MEMORY_BLOCK_SIZE;
1207        unsigned long offset = page % DIRTY_MEMORY_BLOCK_SIZE;
1208        unsigned long num = MIN(end - page, DIRTY_MEMORY_BLOCK_SIZE - offset);
1209
1210        dirty |= bitmap_test_and_clear_atomic(blocks->blocks[idx],
1211                                              offset, num);
1212        page += num;
1213    }
1214
1215    rcu_read_unlock();
1216
1217    if (dirty && tcg_enabled()) {
1218        tlb_reset_dirty_range_all(start, length);
1219    }
1220
1221    return dirty;
1222}
1223
1224DirtyBitmapSnapshot *cpu_physical_memory_snapshot_and_clear_dirty
1225     (ram_addr_t start, ram_addr_t length, unsigned client)
1226{
1227    DirtyMemoryBlocks *blocks;
1228    unsigned long align = 1UL << (TARGET_PAGE_BITS + BITS_PER_LEVEL);
1229    ram_addr_t first = QEMU_ALIGN_DOWN(start, align);
1230    ram_addr_t last  = QEMU_ALIGN_UP(start + length, align);
1231    DirtyBitmapSnapshot *snap;
1232    unsigned long page, end, dest;
1233
1234    snap = g_malloc0(sizeof(*snap) +
1235                     ((last - first) >> (TARGET_PAGE_BITS + 3)));
1236    snap->start = first;
1237    snap->end   = last;
1238
1239    page = first >> TARGET_PAGE_BITS;
1240    end  = last  >> TARGET_PAGE_BITS;
1241    dest = 0;
1242
1243    rcu_read_lock();
1244
1245    blocks = atomic_rcu_read(&ram_list.dirty_memory[client]);
1246
1247    while (page < end) {
1248        unsigned long idx = page / DIRTY_MEMORY_BLOCK_SIZE;
1249        unsigned long offset = page % DIRTY_MEMORY_BLOCK_SIZE;
1250        unsigned long num = MIN(end - page, DIRTY_MEMORY_BLOCK_SIZE - offset);
1251
1252        assert(QEMU_IS_ALIGNED(offset, (1 << BITS_PER_LEVEL)));
1253        assert(QEMU_IS_ALIGNED(num,    (1 << BITS_PER_LEVEL)));
1254        offset >>= BITS_PER_LEVEL;
1255
1256        bitmap_copy_and_clear_atomic(snap->dirty + dest,
1257                                     blocks->blocks[idx] + offset,
1258                                     num);
1259        page += num;
1260        dest += num >> BITS_PER_LEVEL;
1261    }
1262
1263    rcu_read_unlock();
1264
1265    if (tcg_enabled()) {
1266        tlb_reset_dirty_range_all(start, length);
1267    }
1268
1269    return snap;
1270}
1271
1272bool cpu_physical_memory_snapshot_get_dirty(DirtyBitmapSnapshot *snap,
1273                                            ram_addr_t start,
1274                                            ram_addr_t length)
1275{
1276    unsigned long page, end;
1277
1278    assert(start >= snap->start);
1279    assert(start + length <= snap->end);
1280
1281    end = TARGET_PAGE_ALIGN(start + length - snap->start) >> TARGET_PAGE_BITS;
1282    page = (start - snap->start) >> TARGET_PAGE_BITS;
1283
1284    while (page < end) {
1285        if (test_bit(page, snap->dirty)) {
1286            return true;
1287        }
1288        page++;
1289    }
1290    return false;
1291}
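
/*
 * Illustrative sketch (not part of the build): the two-step use of the
 * snapshot API above, roughly as a display-update path would use it.  The
 * caller owns the snapshot and releases it with g_free().
 */
#if 0
static void example_scan_vga_dirty(ram_addr_t start, ram_addr_t length)
{
    DirtyBitmapSnapshot *snap;

    snap = cpu_physical_memory_snapshot_and_clear_dirty(start, length,
                                                        DIRTY_MEMORY_VGA);
    if (cpu_physical_memory_snapshot_get_dirty(snap, start, TARGET_PAGE_SIZE)) {
        /* ... redraw the first page of the range ... */
    }
    g_free(snap);
}
#endif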
1292
1293/* Called from RCU critical section */
1294hwaddr memory_region_section_get_iotlb(CPUState *cpu,
1295                                       MemoryRegionSection *section,
1296                                       target_ulong vaddr,
1297                                       hwaddr paddr, hwaddr xlat,
1298                                       int prot,
1299                                       target_ulong *address)
1300{
1301    hwaddr iotlb;
1302    CPUWatchpoint *wp;
1303
1304    if (memory_region_is_ram(section->mr)) {
1305        /* Normal RAM.  */
1306        iotlb = memory_region_get_ram_addr(section->mr) + xlat;
1307        if (!section->readonly) {
1308            iotlb |= PHYS_SECTION_NOTDIRTY;
1309        } else {
1310            iotlb |= PHYS_SECTION_ROM;
1311        }
1312    } else {
1313        AddressSpaceDispatch *d;
1314
1315        d = flatview_to_dispatch(section->fv);
1316        iotlb = section - d->map.sections;
1317        iotlb += xlat;
1318    }
1319
1320    /* Make accesses to pages with watchpoints go via the
1321       watchpoint trap routines.  */
1322    QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) {
1323        if (cpu_watchpoint_address_matches(wp, vaddr, TARGET_PAGE_SIZE)) {
1324            /* Avoid trapping reads of pages with a write breakpoint. */
1325            if ((prot & PAGE_WRITE) || (wp->flags & BP_MEM_READ)) {
1326                iotlb = PHYS_SECTION_WATCH + paddr;
1327                *address |= TLB_MMIO;
1328                break;
1329            }
1330        }
1331    }
1332
1333    return iotlb;
1334}
1335#endif /* defined(CONFIG_USER_ONLY) */
1336
1337#if !defined(CONFIG_USER_ONLY)
1338
1339static int subpage_register (subpage_t *mmio, uint32_t start, uint32_t end,
1340                             uint16_t section);
1341static subpage_t *subpage_init(FlatView *fv, hwaddr base);
1342
1343static void *(*phys_mem_alloc)(size_t size, uint64_t *align) =
1344                               qemu_anon_ram_alloc;
1345
1346/*
1347 * Set a custom physical guest memory allocator.
1348 * Accelerators with unusual needs may need this.  Hopefully, we can
1349 * get rid of it eventually.
1350 */
1351void phys_mem_set_alloc(void *(*alloc)(size_t, uint64_t *align))
1352{
1353    phys_mem_alloc = alloc;
1354}
1355
1356static uint16_t phys_section_add(PhysPageMap *map,
1357                                 MemoryRegionSection *section)
1358{
1359    /* The physical section number is ORed with a page-aligned
1360     * pointer to produce the iotlb entries.  Thus it should
1361     * never overflow into the page-aligned value.
1362     */
1363    assert(map->sections_nb < TARGET_PAGE_SIZE);
1364
1365    if (map->sections_nb == map->sections_nb_alloc) {
1366        map->sections_nb_alloc = MAX(map->sections_nb_alloc * 2, 16);
1367        map->sections = g_renew(MemoryRegionSection, map->sections,
1368                                map->sections_nb_alloc);
1369    }
1370    map->sections[map->sections_nb] = *section;
1371    memory_region_ref(section->mr);
1372    return map->sections_nb++;
1373}
1374
1375static void phys_section_destroy(MemoryRegion *mr)
1376{
1377    bool have_sub_page = mr->subpage;
1378
1379    memory_region_unref(mr);
1380
1381    if (have_sub_page) {
1382        subpage_t *subpage = container_of(mr, subpage_t, iomem);
1383        object_unref(OBJECT(&subpage->iomem));
1384        g_free(subpage);
1385    }
1386}
1387
1388static void phys_sections_free(PhysPageMap *map)
1389{
1390    while (map->sections_nb > 0) {
1391        MemoryRegionSection *section = &map->sections[--map->sections_nb];
1392        phys_section_destroy(section->mr);
1393    }
1394    g_free(map->sections);
1395    g_free(map->nodes);
1396}
1397
1398static void register_subpage(FlatView *fv, MemoryRegionSection *section)
1399{
1400    AddressSpaceDispatch *d = flatview_to_dispatch(fv);
1401    subpage_t *subpage;
1402    hwaddr base = section->offset_within_address_space
1403        & TARGET_PAGE_MASK;
1404    MemoryRegionSection *existing = phys_page_find(d, base);
1405    MemoryRegionSection subsection = {
1406        .offset_within_address_space = base,
1407        .size = int128_make64(TARGET_PAGE_SIZE),
1408    };
1409    hwaddr start, end;
1410
1411    assert(existing->mr->subpage || existing->mr == &io_mem_unassigned);
1412
1413    if (!(existing->mr->subpage)) {
1414        subpage = subpage_init(fv, base);
1415        subsection.fv = fv;
1416        subsection.mr = &subpage->iomem;
1417        phys_page_set(d, base >> TARGET_PAGE_BITS, 1,
1418                      phys_section_add(&d->map, &subsection));
1419    } else {
1420        subpage = container_of(existing->mr, subpage_t, iomem);
1421    }
1422    start = section->offset_within_address_space & ~TARGET_PAGE_MASK;
1423    end = start + int128_get64(section->size) - 1;
1424    subpage_register(subpage, start, end,
1425                     phys_section_add(&d->map, section));
1426}
1427
1428
1429static void register_multipage(FlatView *fv,
1430                               MemoryRegionSection *section)
1431{
1432    AddressSpaceDispatch *d = flatview_to_dispatch(fv);
1433    hwaddr start_addr = section->offset_within_address_space;
1434    uint16_t section_index = phys_section_add(&d->map, section);
1435    uint64_t num_pages = int128_get64(int128_rshift(section->size,
1436                                                    TARGET_PAGE_BITS));
1437
1438    assert(num_pages);
1439    phys_page_set(d, start_addr >> TARGET_PAGE_BITS, num_pages, section_index);
1440}
1441
1442void flatview_add_to_dispatch(FlatView *fv, MemoryRegionSection *section)
1443{
1444    MemoryRegionSection now = *section, remain = *section;
1445    Int128 page_size = int128_make64(TARGET_PAGE_SIZE);
1446
1447    if (now.offset_within_address_space & ~TARGET_PAGE_MASK) {
1448        uint64_t left = TARGET_PAGE_ALIGN(now.offset_within_address_space)
1449                       - now.offset_within_address_space;
1450
1451        now.size = int128_min(int128_make64(left), now.size);
1452        register_subpage(fv, &now);
1453    } else {
1454        now.size = int128_zero();
1455    }
1456    while (int128_ne(remain.size, now.size)) {
1457        remain.size = int128_sub(remain.size, now.size);
1458        remain.offset_within_address_space += int128_get64(now.size);
1459        remain.offset_within_region += int128_get64(now.size);
1460        now = remain;
1461        if (int128_lt(remain.size, page_size)) {
1462            register_subpage(fv, &now);
1463        } else if (remain.offset_within_address_space & ~TARGET_PAGE_MASK) {
1464            now.size = page_size;
1465            register_subpage(fv, &now);
1466        } else {
1467            now.size = int128_and(now.size, int128_neg(page_size));
1468            register_multipage(fv, &now);
1469        }
1470    }
1471}
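
/*
 * Worked example (for exposition only): with 4K target pages, a section at
 * offset_within_address_space 0x800 of size 0x3000 is registered by the
 * loop above in three pieces:
 *   - subpage   [0x0800, 0x0fff]  (unaligned head),
 *   - multipage [0x1000, 0x2fff]  (page-aligned middle),
 *   - subpage   [0x3000, 0x37ff]  (tail shorter than a page).
 */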
1472
1473void qemu_flush_coalesced_mmio_buffer(void)
1474{
1475    if (kvm_enabled())
1476        kvm_flush_coalesced_mmio_buffer();
1477}
1478
1479void qemu_mutex_lock_ramlist(void)
1480{
1481    qemu_mutex_lock(&ram_list.mutex);
1482}
1483
1484void qemu_mutex_unlock_ramlist(void)
1485{
1486    qemu_mutex_unlock(&ram_list.mutex);
1487}
1488
1489void ram_block_dump(Monitor *mon)
1490{
1491    RAMBlock *block;
1492    char *psize;
1493
1494    rcu_read_lock();
1495    monitor_printf(mon, "%24s %8s  %18s %18s %18s\n",
1496                   "Block Name", "PSize", "Offset", "Used", "Total");
1497    RAMBLOCK_FOREACH(block) {
1498        psize = size_to_str(block->page_size);
1499        monitor_printf(mon, "%24s %8s  0x%016" PRIx64 " 0x%016" PRIx64
1500                       " 0x%016" PRIx64 "\n", block->idstr, psize,
1501                       (uint64_t)block->offset,
1502                       (uint64_t)block->used_length,
1503                       (uint64_t)block->max_length);
1504        g_free(psize);
1505    }
1506    rcu_read_unlock();
1507}
1508
1509#ifdef __linux__
1510/*
1511 * FIXME TOCTTOU: this iterates over memory backends' mem-path, which
1512 * may or may not name the same files / on the same filesystem now as
1513 * when we actually open and map them.  Iterate over the file
1514 * descriptors instead, and use qemu_fd_getpagesize().
1515 */
1516static int find_max_supported_pagesize(Object *obj, void *opaque)
1517{
1518    char *mem_path;
1519    long *hpsize_min = opaque;
1520
1521    if (object_dynamic_cast(obj, TYPE_MEMORY_BACKEND)) {
1522        mem_path = object_property_get_str(obj, "mem-path", NULL);
1523        if (mem_path) {
1524            long hpsize = qemu_mempath_getpagesize(mem_path);
1525            if (hpsize < *hpsize_min) {
1526                *hpsize_min = hpsize;
1527            }
1528        } else {
1529            *hpsize_min = getpagesize();
1530        }
1531    }
1532
1533    return 0;
1534}
1535
1536long qemu_getrampagesize(void)
1537{
1538    long hpsize = LONG_MAX;
1539    long mainrampagesize;
1540    Object *memdev_root;
1541
1542    if (mem_path) {
1543        mainrampagesize = qemu_mempath_getpagesize(mem_path);
1544    } else {
1545        mainrampagesize = getpagesize();
1546    }
1547
1548    /* It's possible we have memory-backend objects with
1549     * hugepage-backed RAM. These may get mapped into the system
1550     * address space via -numa parameters or memory hotplug
1551     * hooks. We want to take these into account, but we
1552     * also want to make sure the supported hugepage
1553     * sizes are applicable across the entire range of memory
1554     * we may boot from, so we take the minimum across all
1555     * backends, and assume normal pages where a
1556     * backend isn't backed by hugepages.
1557     */
1558    memdev_root = object_resolve_path("/objects", NULL);
1559    if (memdev_root) {
1560        object_child_foreach(memdev_root, find_max_supported_pagesize, &hpsize);
1561    }
1562    if (hpsize == LONG_MAX) {
1563        /* No additional memory regions found ==> Report main RAM page size */
1564        return mainrampagesize;
1565    }
1566
1567    /* If NUMA is disabled or the NUMA nodes are not backed with a
1568     * memory-backend, then there is at least one node using "normal" RAM,
1569     * so if its page size is smaller we have got to report that size instead.
1570     */
1571    if (hpsize > mainrampagesize &&
1572        (nb_numa_nodes == 0 || numa_info[0].node_memdev == NULL)) {
1573        static bool warned;
1574        if (!warned) {
1575            error_report("Huge page support disabled (n/a for main memory).");
1576            warned = true;
1577        }
1578        return mainrampagesize;
1579    }
1580
1581    return hpsize;
1582}
1583#else
1584long qemu_getrampagesize(void)
1585{
1586    return getpagesize();
1587}
1588#endif
1589
1590static int64_t get_file_size(int fd)
1591{
1592    int64_t size = lseek(fd, 0, SEEK_END);
1593    if (size < 0) {
1594        return -errno;
1595    }
1596    return size;
1597}
1598
1599static int file_ram_open(const char *path,
1600                         const char *region_name,
1601                         bool *created,
1602                         Error **errp)
1603{
1604    char *filename;
1605    char *sanitized_name;
1606    char *c;
1607    int fd = -1;
1608
1609    *created = false;
1610    for (;;) {
1611        fd = open(path, O_RDWR);
1612        if (fd >= 0) {
1613            /* @path names an existing file, use it */
1614            break;
1615        }
1616        if (errno == ENOENT) {
1617            /* @path names a file that doesn't exist, create it */
1618            fd = open(path, O_RDWR | O_CREAT | O_EXCL, 0644);
1619            if (fd >= 0) {
1620                *created = true;
1621                break;
1622            }
1623        } else if (errno == EISDIR) {
1624            /* @path names a directory, create a file there */
1625            /* Make name safe to use with mkstemp by replacing '/' with '_'. */
1626            sanitized_name = g_strdup(region_name);
1627            for (c = sanitized_name; *c != '\0'; c++) {
1628                if (*c == '/') {
1629                    *c = '_';
1630                }
1631            }
1632
1633            filename = g_strdup_printf("%s/qemu_back_mem.%s.XXXXXX", path,
1634                                       sanitized_name);
1635            g_free(sanitized_name);
1636
1637#ifdef _WIN32
1638            fd = _open(_mktemp(filename), _O_CREAT | _O_RDWR);
1639#else
1640            fd = mkstemp(filename);
1641#endif
1642            if (fd >= 0) {
1643                unlink(filename);
1644                g_free(filename);
1645                break;
1646            }
1647            g_free(filename);
1648        }
1649        if (errno != EEXIST && errno != EINTR) {
1650            error_setg_errno(errp, errno,
1651                             "can't open backing store %s for guest RAM",
1652                             path);
1653            return -1;
1654        }
1655        /*
1656         * Try again on EINTR and EEXIST.  The latter happens when
1657         * something else creates the file between our two open().
1658         */
1659    }
1660
1661    return fd;
1662}
1663
1664static void *file_ram_alloc(RAMBlock *block,
1665                            ram_addr_t memory,
1666                            int fd,
1667                            bool truncate,
1668                            Error **errp)
1669{
1670    void *area;
1671
1672#ifdef _WIN32
1673    SYSTEM_INFO SysInfo;
1674    GetSystemInfo(&SysInfo);
1675    block->page_size = SysInfo.dwPageSize;
1676#else
1677    block->page_size = qemu_fd_getpagesize(fd);
1678#endif
1679    block->mr->align = block->page_size;
1680
1681#if defined(__s390x__)
1682    if (kvm_enabled()) {
1683        block->mr->align = MAX(block->mr->align, QEMU_VMALLOC_ALIGN);
1684    }
1685#endif
1686
1687    if (memory < block->page_size) {
1688        error_setg(errp, "memory size 0x" RAM_ADDR_FMT " must be equal to "
1689                   "or larger than page size 0x%zx",
1690                   memory, block->page_size);
1691        return NULL;
1692    }
1693
1694    memory = ROUND_UP(memory, block->page_size);
1695
1696    /*
1697     * ftruncate is not supported by hugetlbfs in older
1698     * hosts, so don't bother bailing out on errors.
1699     * If anything goes wrong with it under other filesystems,
1700     * mmap will fail.
1701     *
1702     * Do not truncate the non-empty backend file to avoid corrupting
1703     * the existing data in the file. Disabling shrinking is not
1704     * enough. For example, the current vNVDIMM implementation stores
1705     * the guest NVDIMM labels at the end of the backend file. If the
1706     * backend file is later extended, QEMU will not be able to find
1707     * those labels. Therefore, extending the non-empty backend file
1708     * is disabled as well.
1709     */
1710    if (truncate && ftruncate(fd, memory)) {
1711        perror("ftruncate");
1712    }
1713
1714#ifdef _WIN32
1715    HANDLE fd_temp = (HANDLE)_get_osfhandle(fd);
1716    HANDLE hMapFile = CreateFileMapping(fd_temp, NULL, PAGE_READWRITE,
1717                                        0, memory, NULL);
1718    area = MapViewOfFile(hMapFile, FILE_MAP_ALL_ACCESS, 0, 0, 0);
1719    if (area == NULL) {
1720#else
1721    area = qemu_ram_mmap(fd, memory, block->mr->align,
1722                         block->flags & RAM_SHARED);
1723    if (area == MAP_FAILED) {
1724#endif
1725        error_setg_errno(errp, errno,
1726                         "unable to map backing store for guest RAM");
1727        return NULL;
1728    }
1729
1730    if (mem_prealloc) {
1731        os_mem_prealloc(fd, area, memory, smp_cpus, errp);
1732        if (errp && *errp) {
1733#ifndef _WIN32
1734            qemu_ram_munmap(area, memory);
1735#endif
1736            return NULL;
1737        }
1738    }
1739
1740    block->fd = fd;
1741    return area;
1742}
1743
1744/* Called with the ramlist lock held.  */
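    /*
     * Best-fit search over ram_addr_t space: for every existing block,
     * compute the gap between its end and the closest block above it, and
     * return the start of the smallest gap that still fits @size.
     */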
1745static ram_addr_t find_ram_offset(ram_addr_t size)
1746{
1747    RAMBlock *block, *next_block;
1748    ram_addr_t offset = RAM_ADDR_MAX, mingap = RAM_ADDR_MAX;
1749
 1750    assert(size != 0); /* it would hand out the same offset multiple times */
1751
1752    if (QLIST_EMPTY_RCU(&ram_list.blocks)) {
1753        return 0;
1754    }
1755
1756    RAMBLOCK_FOREACH(block) {
1757        ram_addr_t end, next = RAM_ADDR_MAX;
1758
1759        end = block->offset + block->max_length;
1760
1761        RAMBLOCK_FOREACH(next_block) {
1762            if (next_block->offset >= end) {
1763                next = MIN(next, next_block->offset);
1764            }
1765        }
1766        if (next - end >= size && next - end < mingap) {
1767            offset = end;
1768            mingap = next - end;
1769        }
1770    }
1771
1772    if (offset == RAM_ADDR_MAX) {
1773        fprintf(stderr, "Failed to find gap of requested size: %" PRIu64 "\n",
1774                (uint64_t)size);
1775        abort();
1776    }
1777
1778    return offset;
1779}
1780
1781unsigned long last_ram_page(void)
1782{
1783    RAMBlock *block;
1784    ram_addr_t last = 0;
1785
1786    rcu_read_lock();
1787    RAMBLOCK_FOREACH(block) {
1788        last = MAX(last, block->offset + block->max_length);
1789    }
1790    rcu_read_unlock();
1791    return last >> TARGET_PAGE_BITS;
1792}
1793
1794static void qemu_ram_setup_dump(void *addr, ram_addr_t size)
1795{
1796    int ret;
1797
 1798    /* Use MADV_DONTDUMP if the user doesn't want the guest memory in the core dump */
1799    if (!machine_dump_guest_core(current_machine)) {
1800        ret = qemu_madvise(addr, size, QEMU_MADV_DONTDUMP);
1801        if (ret) {
1802            perror("qemu_madvise");
1803            fprintf(stderr, "madvise doesn't support MADV_DONTDUMP, "
1804                            "but dump_guest_core=off specified\n");
1805        }
1806    }
1807}
1808
1809const char *qemu_ram_get_idstr(RAMBlock *rb)
1810{
1811    return rb->idstr;
1812}
1813
1814bool qemu_ram_is_shared(RAMBlock *rb)
1815{
1816    return rb->flags & RAM_SHARED;
1817}
1818
1819/* Called with iothread lock held.  */
1820void qemu_ram_set_idstr(RAMBlock *new_block, const char *name, DeviceState *dev)
1821{
1822    RAMBlock *block;
1823
1824    assert(new_block);
1825    assert(!new_block->idstr[0]);
1826
1827    if (dev) {
1828        char *id = qdev_get_dev_path(dev);
1829        if (id) {
1830            snprintf(new_block->idstr, sizeof(new_block->idstr), "%s/", id);
1831            g_free(id);
1832        }
1833    }
1834    pstrcat(new_block->idstr, sizeof(new_block->idstr), name);
1835
1836    rcu_read_lock();
1837    RAMBLOCK_FOREACH(block) {
1838        if (block != new_block &&
1839            !strcmp(block->idstr, new_block->idstr)) {
1840            fprintf(stderr, "RAMBlock \"%s\" already registered, abort!\n",
1841                    new_block->idstr);
1842            abort();
1843        }
1844    }
1845    rcu_read_unlock();
1846}
1847
1848/* Called with iothread lock held.  */
1849void qemu_ram_unset_idstr(RAMBlock *block)
1850{
 1851    /* FIXME: arch_init.c assumes that this is not called during
1852     * migration.  Ignore the problem since hot-unplug during migration
1853     * does not work anyway.
1854     */
1855    if (block) {
1856        memset(block->idstr, 0, sizeof(block->idstr));
1857    }
1858}
1859
1860size_t qemu_ram_pagesize(RAMBlock *rb)
1861{
1862    return rb->page_size;
1863}
1864
 1865/* Returns the largest page size in use */
1866size_t qemu_ram_pagesize_largest(void)
1867{
1868    RAMBlock *block;
1869    size_t largest = 0;
1870
1871    RAMBLOCK_FOREACH(block) {
1872        largest = MAX(largest, qemu_ram_pagesize(block));
1873    }
1874
1875    return largest;
1876}
1877
1878static int memory_try_enable_merging(void *addr, size_t len)
1879{
1880    if (!machine_mem_merge(current_machine)) {
1881        /* disabled by the user */
1882        return 0;
1883    }
1884
1885    return qemu_madvise(addr, len, QEMU_MADV_MERGEABLE);
1886}
1887
1888/* Only legal before guest might have detected the memory size: e.g. on
1889 * incoming migration, or right after reset.
1890 *
 1891 * As the memory core doesn't know how memory is accessed, it is up to the
 1892 * resize callback to update device state and/or add assertions to detect
1893 * misuse, if necessary.
1894 */
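    /*
     * Illustrative sketch of a resize callback (hypothetical device, not
     * taken from this tree):
     *
     *     static void my_dev_ram_resized(const char *id, uint64_t length,
     *                                    void *host)
     *     {
     *         // refresh the device's cached view of the RAM size here
     *     }
     *
     * registered via qemu_ram_alloc_resizeable(size, maxsz,
     * my_dev_ram_resized, mr, errp) further down in this file.
     */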
1895int qemu_ram_resize(RAMBlock *block, ram_addr_t newsize, Error **errp)
1896{
1897    assert(block);
1898
1899    newsize = HOST_PAGE_ALIGN(newsize);
1900
1901    if (block->used_length == newsize) {
1902        return 0;
1903    }
1904
1905    if (!(block->flags & RAM_RESIZEABLE)) {
1906        error_setg_errno(errp, EINVAL,
1907                         "Length mismatch: %s: 0x" RAM_ADDR_FMT
1908                         " in != 0x" RAM_ADDR_FMT, block->idstr,
1909                         newsize, block->used_length);
1910        return -EINVAL;
1911    }
1912
1913    if (block->max_length < newsize) {
1914        error_setg_errno(errp, EINVAL,
1915                         "Length too large: %s: 0x" RAM_ADDR_FMT
1916                         " > 0x" RAM_ADDR_FMT, block->idstr,
1917                         newsize, block->max_length);
1918        return -EINVAL;
1919    }
1920
1921    cpu_physical_memory_clear_dirty_range(block->offset, block->used_length);
1922    block->used_length = newsize;
1923    cpu_physical_memory_set_dirty_range(block->offset, block->used_length,
1924                                        DIRTY_CLIENTS_ALL);
1925    memory_region_set_size(block->mr, newsize);
1926    if (block->resized) {
1927        block->resized(block->idstr, newsize, block->host);
1928    }
1929    return 0;
1930}
1931
1932/* Called with ram_list.mutex held */
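    /*
     * Grow the dirty-memory bitmaps (one per DIRTY_MEMORY_* client) without
     * disturbing RCU readers: allocate a larger DirtyMemoryBlocks array,
     * share the existing bitmaps by copying their pointers, append fresh
     * bitmaps for the new range, publish the array with atomic_rcu_set(),
     * and free the old array only after a grace period.
     */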
1933static void dirty_memory_extend(ram_addr_t old_ram_size,
1934                                ram_addr_t new_ram_size)
1935{
1936    ram_addr_t old_num_blocks = DIV_ROUND_UP(old_ram_size,
1937                                             DIRTY_MEMORY_BLOCK_SIZE);
1938    ram_addr_t new_num_blocks = DIV_ROUND_UP(new_ram_size,
1939                                             DIRTY_MEMORY_BLOCK_SIZE);
1940    int i;
1941
1942    /* Only need to extend if block count increased */
1943    if (new_num_blocks <= old_num_blocks) {
1944        return;
1945    }
1946
1947    for (i = 0; i < DIRTY_MEMORY_NUM; i++) {
1948        DirtyMemoryBlocks *old_blocks;
1949        DirtyMemoryBlocks *new_blocks;
1950        int j;
1951
1952        old_blocks = atomic_rcu_read(&ram_list.dirty_memory[i]);
1953        new_blocks = g_malloc(sizeof(*new_blocks) +
1954                              sizeof(new_blocks->blocks[0]) * new_num_blocks);
1955
1956        if (old_num_blocks) {
1957            memcpy(new_blocks->blocks, old_blocks->blocks,
1958                   old_num_blocks * sizeof(old_blocks->blocks[0]));
1959        }
1960
1961        for (j = old_num_blocks; j < new_num_blocks; j++) {
1962            new_blocks->blocks[j] = bitmap_new(DIRTY_MEMORY_BLOCK_SIZE);
1963        }
1964
1965        atomic_rcu_set(&ram_list.dirty_memory[i], new_blocks);
1966
1967        if (old_blocks) {
1968            g_free_rcu(old_blocks, rcu);
1969        }
1970    }
1971}
1972
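    /*
     * Second half of RAM block creation, shared by all qemu_ram_alloc_*()
     * variants: pick an offset in ram_addr_t space, allocate the host memory
     * if the caller has not already done so (via xen_ram_alloc() or
     * phys_mem_alloc()), grow the dirty bitmaps, insert the block into the
     * RCU list (kept sorted from biggest to smallest), bump the list version,
     * mark the new range dirty and apply the madvise hints.
     */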
1973static void ram_block_add(RAMBlock *new_block, Error **errp)
1974{
1975    RAMBlock *block;
1976    RAMBlock *last_block = NULL;
1977    ram_addr_t old_ram_size, new_ram_size;
1978    Error *err = NULL;
1979
1980    old_ram_size = last_ram_page();
1981
1982    qemu_mutex_lock_ramlist();
1983    new_block->offset = find_ram_offset(new_block->max_length);
1984
1985    if (!new_block->host) {
1986        if (xen_enabled()) {
1987            xen_ram_alloc(new_block->offset, new_block->max_length,
1988                          new_block->mr, &err);
1989            if (err) {
1990                error_propagate(errp, err);
1991                qemu_mutex_unlock_ramlist();
1992                return;
1993            }
1994        } else {
1995            new_block->host = phys_mem_alloc(new_block->max_length,
1996                                             &new_block->mr->align);
1997            if (!new_block->host) {
1998                error_setg_errno(errp, errno,
1999                                 "cannot set up guest memory '%s'",
2000                                 memory_region_name(new_block->mr));
2001                qemu_mutex_unlock_ramlist();
2002                return;
2003            }
2004            memory_try_enable_merging(new_block->host, new_block->max_length);
2005        }
2006    }
2007
2008    new_ram_size = MAX(old_ram_size,
2009              (new_block->offset + new_block->max_length) >> TARGET_PAGE_BITS);
2010    if (new_ram_size > old_ram_size) {
2011        dirty_memory_extend(old_ram_size, new_ram_size);
2012    }
2013    /* Keep the list sorted from biggest to smallest block.  Unlike QTAILQ,
2014     * QLIST (which has an RCU-friendly variant) does not have insertion at
2015     * tail, so save the last element in last_block.
2016     */
2017    RAMBLOCK_FOREACH(block) {
2018        last_block = block;
2019        if (block->max_length < new_block->max_length) {
2020            break;
2021        }
2022    }
2023    if (block) {
2024        QLIST_INSERT_BEFORE_RCU(block, new_block, next);
2025    } else if (last_block) {
2026        QLIST_INSERT_AFTER_RCU(last_block, new_block, next);
2027    } else { /* list is empty */
2028        QLIST_INSERT_HEAD_RCU(&ram_list.blocks, new_block, next);
2029    }
2030    ram_list.mru_block = NULL;
2031
2032    /* Write list before version */
2033    smp_wmb();
2034    ram_list.version++;
2035    qemu_mutex_unlock_ramlist();
2036
2037    cpu_physical_memory_set_dirty_range(new_block->offset,
2038                                        new_block->used_length,
2039                                        DIRTY_CLIENTS_ALL);
2040
2041    if (new_block->host) {
2042        qemu_ram_setup_dump(new_block->host, new_block->max_length);
2043        qemu_madvise(new_block->host, new_block->max_length, QEMU_MADV_HUGEPAGE);
2044        /* MADV_DONTFORK is also needed by KVM in absence of synchronous MMU */
2045        qemu_madvise(new_block->host, new_block->max_length, QEMU_MADV_DONTFORK);
2046        ram_block_notify_add(new_block->host, new_block->max_length);
2047    }
2048}
2049
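    /*
     * Create a RAM block backed by an already-open file descriptor.  The
     * requested size is host-page aligned and checked against the file size;
     * the file itself is only grown when it is currently empty, see
     * file_ram_alloc().
     */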
2050RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t size, MemoryRegion *mr,
2051                                 bool share, int fd,
2052                                 Error **errp)
2053{
2054    RAMBlock *new_block;
2055    Error *local_err = NULL;
2056    int64_t file_size;
2057
2058    if (xen_enabled()) {
2059        error_setg(errp, "-mem-path not supported with Xen");
2060        return NULL;
2061    }
2062
2063    if (kvm_enabled() && !kvm_has_sync_mmu()) {
2064        error_setg(errp,
2065                   "host lacks kvm mmu notifiers, -mem-path unsupported");
2066        return NULL;
2067    }
2068
2069    if (phys_mem_alloc != qemu_anon_ram_alloc) {
2070        /*
2071         * file_ram_alloc() needs to allocate just like
2072         * phys_mem_alloc, but we haven't bothered to provide
2073         * a hook there.
2074         */
2075        error_setg(errp,
2076                   "-mem-path not supported with this accelerator");
2077        return NULL;
2078    }
2079
2080    size = HOST_PAGE_ALIGN(size);
2081    file_size = get_file_size(fd);
2082    if (file_size > 0 && file_size < size) {
2083        error_setg(errp, "backing store %s size 0x%" PRIx64
2084                   " does not match 'size' option 0x" RAM_ADDR_FMT,
2085                   mem_path, file_size, size);
2086        return NULL;
2087    }
2088
2089    new_block = g_malloc0(sizeof(*new_block));
2090    new_block->mr = mr;
2091    new_block->used_length = size;
2092    new_block->max_length = size;
2093    new_block->flags = share ? RAM_SHARED : 0;
2094    new_block->host = file_ram_alloc(new_block, size, fd, !file_size, errp);
2095    if (!new_block->host) {
2096        g_free(new_block);
2097        return NULL;
2098    }
2099
2100    ram_block_add(new_block, &local_err);
2101    if (local_err) {
2102        g_free(new_block);
2103        error_propagate(errp, local_err);
2104        return NULL;
2105    }
2106    return new_block;
2107
2108}
2109
2110
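    /*
     * Convenience wrapper around file_ram_open() and qemu_ram_alloc_from_fd()
     * used for -mem-path / memory-backend-file style backends.  Illustrative
     * call (hypothetical path and size variable):
     *
     *     RAMBlock *rb = qemu_ram_alloc_from_file(ram_size, mr, true,
     *                                             "/dev/hugepages/guest",
     *                                             &err);
     *
     * On failure the backing file is unlinked again if we created it.
     */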
2111RAMBlock *qemu_ram_alloc_from_file(ram_addr_t size, MemoryRegion *mr,
2112                                   bool share, const char *mem_path,
2113                                   Error **errp)
2114{
2115    int fd;
2116    bool created;
2117    RAMBlock *block;
2118
2119    fd = file_ram_open(mem_path, memory_region_name(mr), &created, errp);
2120    if (fd < 0) {
2121        return NULL;
2122    }
2123
2124    block = qemu_ram_alloc_from_fd(size, mr, share, fd, errp);
2125    if (!block) {
2126        if (created) {
2127            unlink(mem_path);
2128        }
2129        close(fd);
2130        return NULL;
2131    }
2132
2133    return block;
2134}
2135
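    /*
     * Common backend for the qemu_ram_alloc*() entry points below: fill in
     * the RAMBlock fields (RAM_PREALLOC when the caller supplies the host
     * pointer, RAM_RESIZEABLE for resizeable blocks) and hand the block to
     * ram_block_add().
     */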
2136static
2137RAMBlock *qemu_ram_alloc_internal(ram_addr_t size, ram_addr_t max_size,
2138                                  void (*resized)(const char*,
2139                                                  uint64_t length,
2140                                                  void *host),
2141                                  void *host, bool resizeable,
2142                                  MemoryRegion *mr, Error **errp)
2143{
2144    RAMBlock *new_block;
2145    Error *local_err = NULL;
2146
2147    size = HOST_PAGE_ALIGN(size);
2148    max_size = HOST_PAGE_ALIGN(max_size);
2149    new_block = g_malloc0(sizeof(*new_block));
2150    new_block->mr = mr;
2151    new_block->resized = resized;
2152    new_block->used_length = size;
2153    new_block->max_length = max_size;
2154    assert(max_size >= size);
2155    new_block->fd = -1;
2156    new_block->page_size = getpagesize();
2157    new_block->host = host;
2158    if (host) {
2159        new_block->flags |= RAM_PREALLOC;
2160    }
2161    if (resizeable) {
2162        new_block->flags |= RAM_RESIZEABLE;
2163    }
2164    ram_block_add(new_block, &local_err);
2165    if (local_err) {
2166        g_free(new_block);
2167        error_propagate(errp, local_err);
2168        return NULL;
2169    }
2170    return new_block;
2171}
2172
2173RAMBlock *qemu_ram_alloc_from_ptr(ram_addr_t size, void *host,
2174                                   MemoryRegion *mr, Error **errp)
2175{
2176    return qemu_ram_alloc_internal(size, size, NULL, host, false, mr, errp);
2177}
2178
2179RAMBlock *qemu_ram_alloc(ram_addr_t size, MemoryRegion *mr, Error **errp)
2180{
2181    return qemu_ram_alloc_internal(size, size, NULL, NULL, false, mr, errp);
2182}
2183
2184RAMBlock *qemu_ram_alloc_resizeable(ram_addr_t size, ram_addr_t maxsz,
2185                                     void (*resized)(const char*,
2186                                                     uint64_t length,
2187                                                     void *host),
2188                                     MemoryRegion *mr, Error **errp)
2189{
2190    return qemu_ram_alloc_internal(size, maxsz, resized, NULL, true, mr, errp);
2191}
2192
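    /*
     * RCU callback scheduled by qemu_ram_free(): by the time this runs, a
     * grace period has elapsed and no reader can still hold a pointer into
     * the block, so the host memory can be unmapped or freed and the
     * RAMBlock itself released.
     */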
2193static void reclaim_ramblock(RAMBlock *block)
2194{
2195    if (block->flags & RAM_PREALLOC) {
2196        ;
2197    } else if (xen_enabled()) {
2198        xen_invalidate_map_cache_entry(block->host);
2199#ifndef _WIN32
2200    } else if (block->fd >= 0) {
2201        qemu_ram_munmap(block->host, block->max_length);
2202        close(block->fd);
2203#endif
2204    } else {
2205        qemu_anon_ram_free(block->host, block->max_length);
2206    }
2207    g_free(block);
2208}
2209
2210void qemu_ram_free(RAMBlock *block)
2211{
2212    if (!block) {
2213        return;
2214    }
2215
2216    if (block->host) {
2217        ram_block_notify_remove(block->host, block->max_length);
2218    }
2219
2220    qemu_mutex_lock_ramlist();
2221    QLIST_REMOVE_RCU(block, next);
2222    ram_list.mru_block = NULL;
2223    /* Write list before version */
2224    smp_wmb();
2225    ram_list.version++;
2226    call_rcu(block, reclaim_ramblock, rcu);
2227    qemu_mutex_unlock_ramlist();
2228}
2229
2230#ifndef _WIN32
2231void qemu_ram_remap(ram_addr_t addr, ram_addr_t length)
2232{
2233    RAMBlock *block;
2234    ram_addr_t offset;
2235    int flags;
2236    void *area, *vaddr;
2237
2238    RAMBLOCK_FOREACH(block) {
2239        offset = addr - block->offset;
2240        if (offset < block->max_length) {
2241            vaddr = ramblock_ptr(block, offset);
2242            if (block->flags & RAM_PREALLOC) {
2243                ;
2244            } else if (xen_enabled()) {
2245                abort();
2246            } else {
2247                flags = MAP_FIXED;
2248                if (block->fd >= 0) {
2249                    flags |= (block->flags & RAM_SHARED ?
2250                              MAP_SHARED : MAP_PRIVATE);
2251                    area = mmap(vaddr, length, PROT_READ | PROT_WRITE,
2252                                flags, block->fd, offset);
2253                } else {
2254                    /*
2255                     * Remap needs to match alloc.  Accelerators that
2256                     * set phys_mem_alloc never remap.  If they did,
2257                     * we'd need a remap hook here.
2258                     */
2259                    assert(phys_mem_alloc == qemu_anon_ram_alloc);
2260
2261                    flags |= MAP_PRIVATE | MAP_ANONYMOUS;
2262                    area = mmap(vaddr, length, PROT_READ | PROT_WRITE,
2263                                flags, -1, 0);
2264                }
2265                if (area != vaddr) {
2266                    fprintf(stderr, "Could not remap addr: "
2267                            RAM_ADDR_FMT "@" RAM_ADDR_FMT "\n",
2268                            length, addr);
2269                    exit(1);
2270                }
2271                memory_try_enable_merging(vaddr, length);
2272                qemu_ram_setup_dump(vaddr, length);
2273            }
2274        }
2275    }
2276}
2277#endif /* !_WIN32 */
2278
2279/* Return a host pointer to ram allocated with qemu_ram_alloc.
2280 * This should not be used for general purpose DMA.  Use address_space_map
2281 * or address_space_rw instead. For local memory (e.g. video ram) that the
2282 * device owns, use memory_region_get_ram_ptr.
2283 *
2284 * Called within RCU critical section.
2285 */
2286void *qemu_map_ram_ptr(RAMBlock *ram_block, ram_addr_t addr)
2287{
2288    RAMBlock *block = ram_block;
2289
2290    if (block == NULL) {
2291        block = qemu_get_ram_block(addr);
2292        addr -= block->offset;
2293    }
2294
2295    if (xen_enabled() && block->host == NULL) {
 2296        /* We need to check whether the requested address is in RAM
 2297         * because we don't want to map the entire guest memory in QEMU.
 2298         * In that case, just map until the end of the page.
2299         */
2300        if (block->offset == 0) {
2301            return xen_map_cache(addr, 0, 0, false);
2302        }
2303
2304        block->host = xen_map_cache(block->offset, block->max_length, 1, false);
2305    }
2306    return ramblock_ptr(block, addr);
2307}
2308
2309/* Return a host pointer to guest's ram. Similar to qemu_map_ram_ptr
2310 * but takes a size argument.
2311 *
2312 * Called within RCU critical section.
2313 */
2314static void *qemu_ram_ptr_length(RAMBlock *ram_block, ram_addr_t addr,
2315                                 hwaddr *size, bool lock)
2316{
2317    RAMBlock *block = ram_block;
2318    if (*size == 0) {
2319        return NULL;
2320    }
2321
2322    if (block == NULL) {
2323        block = qemu_get_ram_block(addr);
2324        addr -= block->offset;
2325    }
2326    *size = MIN(*size, block->max_length - addr);
2327
2328    if (xen_enabled() && block->host == NULL) {
 2329        /* We need to check whether the requested address is in RAM
 2330         * because we don't want to map the entire guest memory in QEMU.
 2331         * In that case, just map the requested area.
2332         */
2333        if (block->offset == 0) {
2334            return xen_map_cache(addr, *size, lock, lock);
2335        }
2336
2337        block->host = xen_map_cache(block->offset, block->max_length, 1, lock);
2338    }
2339
2340    return ramblock_ptr(block, addr);
2341}
2342
2343/*
 2344 * Translates a host ptr back to a RAMBlock and an offset in that
 2345 * RAMBlock.
 2346 *
 2347 * ptr: Host pointer to look up
 2348 * round_offset: If true, round the result offset down to a page boundary
 2349 * *offset: set to the result offset within the RAMBlock (the corresponding
 2350 *          ram_addr is block->offset + *offset)
2351 *
2352 * Returns: RAMBlock (or NULL if not found)
2353 *
2354 * By the time this function returns, the returned pointer is not protected
2355 * by RCU anymore.  If the caller is not within an RCU critical section and
2356 * does not hold the iothread lock, it must have other means of protecting the
2357 * pointer, such as a reference to the region that includes the incoming
2358 * ram_addr_t.
2359 */
2360RAMBlock *qemu_ram_block_from_host(void *ptr, bool round_offset,
2361                                   ram_addr_t *offset)
2362{
2363    RAMBlock *block;
2364    uint8_t *host = ptr;
2365
2366    if (xen_enabled()) {
2367        ram_addr_t ram_addr;
2368        rcu_read_lock();
2369        ram_addr = xen_ram_addr_from_mapcache(ptr);
2370        block = qemu_get_ram_block(ram_addr);
2371        if (block) {
2372            *offset = ram_addr - block->offset;
2373        }
2374        rcu_read_unlock();
2375        return block;
2376    }
2377
2378    rcu_read_lock();
2379    block = atomic_rcu_read(&ram_list.mru_block);
2380    if (block && block->host && host - block->host < block->max_length) {
2381        goto found;
2382    }
2383
2384    RAMBLOCK_FOREACH(block) {
 2385        /* This case happens when the block is not mapped. */
2386        if (block->host == NULL) {
2387            continue;
2388        }
2389        if (host - block->host < block->max_length) {
2390            goto found;
2391        }
2392    }
2393
2394    rcu_read_unlock();
2395    return NULL;
2396
2397found:
2398    *offset = (host - block->host);
2399    if (round_offset) {
2400        *offset &= TARGET_PAGE_MASK;
2401    }
2402    rcu_read_unlock();
2403    return block;
2404}
2405
2406/*
2407 * Finds the named RAMBlock
2408 *
2409 * name: The name of RAMBlock to find
2410 *
2411 * Returns: RAMBlock (or NULL if not found)
2412 */
2413RAMBlock *qemu_ram_block_by_name(const char *name)
2414{
2415    RAMBlock *block;
2416
2417    RAMBLOCK_FOREACH(block) {
2418        if (!strcmp(name, block->idstr)) {
2419            return block;
2420        }
2421    }
2422
2423    return NULL;
2424}
2425
2426/* Some of the softmmu routines need to translate from a host pointer
2427   (typically a TLB entry) back to a ram offset.  */
2428ram_addr_t qemu_ram_addr_from_host(void *ptr)
2429{
2430    RAMBlock *block;
2431    ram_addr_t offset;
2432
2433    block = qemu_ram_block_from_host(ptr, false, &offset);
2434    if (!block) {
2435        return RAM_ADDR_INVALID;
2436    }
2437
2438    return block->offset + offset;
2439}
2440
2441/* Called within RCU critical section. */
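    /*
     * Protocol for writing to RAM whose DIRTY_MEMORY_CODE bit is still
     * clear: call memory_notdirty_write_prepare() first so any TBs
     * translated from the target range are invalidated under tb_lock,
     * perform the store, then call memory_notdirty_write_complete() to set
     * the dirty bits and, once the page is no longer clean, drop the
     * notdirty TLB entry.  notdirty_mem_write() below is the canonical user.
     */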
2442void memory_notdirty_write_prepare(NotDirtyInfo *ndi,
2443                          CPUState *cpu,
2444                          vaddr mem_vaddr,
2445                          ram_addr_t ram_addr,
2446                          unsigned size)
2447{
2448    ndi->cpu = cpu;
2449    ndi->ram_addr = ram_addr;
2450    ndi->mem_vaddr = mem_vaddr;
2451    ndi->size = size;
2452    ndi->locked = false;
2453
2454    assert(tcg_enabled());
2455    if (!cpu_physical_memory_get_dirty_flag(ram_addr, DIRTY_MEMORY_CODE)) {
2456        ndi->locked = true;
2457        tb_lock();
2458        tb_invalidate_phys_page_fast(ram_addr, size);
2459    }
2460}
2461
2462/* Called within RCU critical section. */
2463void memory_notdirty_write_complete(NotDirtyInfo *ndi)
2464{
2465    if (ndi->locked) {
2466        tb_unlock();
2467    }
2468
2469    /* Set both VGA and migration bits for simplicity and to remove
2470     * the notdirty callback faster.
2471     */
2472    cpu_physical_memory_set_dirty_range(ndi->ram_addr, ndi->size,
2473                                        DIRTY_CLIENTS_NOCODE);
2474    /* we remove the notdirty callback only if the code has been
2475       flushed */
2476    if (!cpu_physical_memory_is_clean(ndi->ram_addr)) {
2477        tlb_set_dirty(ndi->cpu, ndi->mem_vaddr);
2478    }
2479}
2480
2481/* Called within RCU critical section.  */
2482static void notdirty_mem_write(void *opaque, hwaddr ram_addr,
2483                               uint64_t val, unsigned size)
2484{
2485    NotDirtyInfo ndi;
2486
2487    memory_notdirty_write_prepare(&ndi, current_cpu, current_cpu->mem_io_vaddr,
2488                         ram_addr, size);
2489
2490    switch (size) {
2491    case 1:
2492        stb_p(qemu_map_ram_ptr(NULL, ram_addr), val);
2493        break;
2494    case 2:
2495        stw_p(qemu_map_ram_ptr(NULL, ram_addr), val);
2496        break;
2497    case 4:
2498        stl_p(qemu_map_ram_ptr(NULL, ram_addr), val);
2499        break;
2500    case 8:
2501        stq_p(qemu_map_ram_ptr(NULL, ram_addr), val);
2502        break;
2503    default:
2504        abort();
2505    }
2506    memory_notdirty_write_complete(&ndi);
2507}
2508
2509static bool notdirty_mem_accepts(void *opaque, hwaddr addr,
2510                                 unsigned size, bool is_write)
2511{
2512    return is_write;
2513}
2514
2515static const MemoryRegionOps notdirty_mem_ops = {
2516    .write = notdirty_mem_write,
2517    .valid.accepts = notdirty_mem_accepts,
2518    .endianness = DEVICE_NATIVE_ENDIAN,
2519    .valid = {
2520        .min_access_size = 1,
2521        .max_access_size = 8,
2522        .unaligned = false,
2523    },
2524    .impl = {
2525        .min_access_size = 1,
2526        .max_access_size = 8,
2527        .unaligned = false,
2528    },
2529};
2530
2531/* Generate a debug exception if a watchpoint has been hit.  */
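    /*
     * Recover the guest virtual address from cpu->mem_io_vaddr plus the
     * in-page offset, then walk cpu->watchpoints looking for a match of the
     * given access type.  A hit either raises EXCP_DEBUG before the access
     * (BP_STOP_BEFORE_ACCESS) or forces retranslation of a single
     * instruction so the access completes before the debug exception is
     * delivered.
     */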
2532static void check_watchpoint(int offset, int len, MemTxAttrs attrs, int flags)
2533{
2534    CPUState *cpu = current_cpu;
2535    CPUClass *cc = CPU_GET_CLASS(cpu);
2536    target_ulong vaddr;
2537    CPUWatchpoint *wp;
2538
2539    assert(tcg_enabled());
2540    if (cpu->watchpoint_hit) {
2541        /* We re-entered the check after replacing the TB. Now raise
 2542         * the debug interrupt so that it will trigger after the
2543         * current instruction. */
2544        cpu_interrupt(cpu, CPU_INTERRUPT_DEBUG);
2545        return;
2546    }
2547    vaddr = (cpu->mem_io_vaddr & TARGET_PAGE_MASK) + offset;
2548    vaddr = cc->adjust_watchpoint_address(cpu, vaddr, len);
2549    QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) {
2550        if (cpu_watchpoint_address_matches(wp, vaddr, len)
2551            && (wp->flags & flags)) {
2552            if (flags == BP_MEM_READ) {
2553                wp->flags |= BP_WATCHPOINT_HIT_READ;
2554            } else {
2555                wp->flags |= BP_WATCHPOINT_HIT_WRITE;
2556            }
2557            wp->hitaddr = vaddr;
2558            wp->hitattrs = attrs;
2559            if (!cpu->watchpoint_hit) {
2560                if (wp->flags & BP_CPU &&
2561                    !cc->debug_check_watchpoint(cpu, wp)) {
2562                    wp->flags &= ~BP_WATCHPOINT_HIT;
2563                    continue;
2564                }
2565                cpu->watchpoint_hit = wp;
2566
2567                /* Both tb_lock and iothread_mutex will be reset when
2568                 * cpu_loop_exit or cpu_loop_exit_noexc longjmp
2569                 * back into the cpu_exec main loop.
2570                 */
2571                tb_lock();
2572                tb_check_watchpoint(cpu);
2573                if (wp->flags & BP_STOP_BEFORE_ACCESS) {
2574                    cpu->exception_index = EXCP_DEBUG;
2575                    cpu_loop_exit(cpu);
2576                } else {
2577                    /* Force execution of one insn next time.  */
2578                    cpu->cflags_next_tb = 1 | curr_cflags();
2579                    cpu_loop_exit_noexc(cpu);
2580                }
2581            }
2582        } else {
2583            wp->flags &= ~BP_WATCHPOINT_HIT;
2584        }
2585    }
2586}
2587
2588/* Watchpoint access routines.  Watchpoints are inserted using TLB tricks,
2589   so these check for a hit then pass through to the normal out-of-line
2590   phys routines.  */
2591static MemTxResult watch_mem_read(void *opaque, hwaddr addr, uint64_t *pdata,
2592                                  unsigned size, MemTxAttrs attrs)
2593{
2594    MemTxResult res;
2595    uint64_t data;
2596    int asidx = cpu_asidx_from_attrs(current_cpu, attrs);
2597    AddressSpace *as = current_cpu->cpu_ases[asidx].as;
2598
2599    check_watchpoint(addr & ~TARGET_PAGE_MASK, size, attrs, BP_MEM_READ);
2600    switch (size) {
2601    case 1:
2602        data = address_space_ldub(as, addr, attrs, &res);
2603        break;
2604    case 2:
2605        data = address_space_lduw(as, addr, attrs, &res);
2606        break;
2607    case 4:
2608        data = address_space_ldl(as, addr, attrs, &res);
2609        break;
2610    case 8:
2611        data = address_space_ldq(as, addr, attrs, &res);
2612        break;
2613    default: abort();
2614    }
2615    *pdata = data;
2616    return res;
2617}
2618
2619static MemTxResult watch_mem_write(void *opaque, hwaddr addr,
2620                                   uint64_t val, unsigned size,
2621                                   MemTxAttrs attrs)
2622{
2623    MemTxResult res;
2624    int asidx = cpu_asidx_from_attrs(current_cpu, attrs);
2625    AddressSpace *as = current_cpu->cpu_ases[asidx].as;
2626
2627    check_watchpoint(addr & ~TARGET_PAGE_MASK, size, attrs, BP_MEM_WRITE);
2628    switch (size) {
2629    case 1:
2630        address_space_stb(as, addr, val, attrs, &res);
2631        break;
2632    case 2:
2633        address_space_stw(as, addr, val, attrs, &res);
2634        break;
2635    case 4:
2636        address_space_stl(as, addr, val, attrs, &res);
2637        break;
2638    case 8:
2639        address_space_stq(as, addr, val, attrs, &res);
2640        break;
2641    default: abort();
2642    }
2643    return res;
2644}
2645
2646static const MemoryRegionOps watch_mem_ops = {
2647    .read_with_attrs = watch_mem_read,
2648    .write_with_attrs = watch_mem_write,
2649    .endianness = DEVICE_NATIVE_ENDIAN,
2650    .valid = {
2651        .min_access_size = 1,
2652        .max_access_size = 8,
2653        .unaligned = false,
2654    },
2655    .impl = {
2656        .min_access_size = 1,
2657        .max_access_size = 8,
2658        .unaligned = false,
2659    },
2660};
2661
2662static MemTxResult flatview_write(FlatView *fv, hwaddr addr, MemTxAttrs attrs,
2663                                  const uint8_t *buf, int len);
2664static bool flatview_access_valid(FlatView *fv, hwaddr addr, int len,
2665                                  bool is_write);
2666
2667static MemTxResult subpage_read(void *opaque, hwaddr addr, uint64_t *data,
2668                                unsigned len, MemTxAttrs attrs)
2669{
2670    subpage_t *subpage = opaque;
2671    uint8_t buf[8];
2672    MemTxResult res;
2673
2674#if defined(DEBUG_SUBPAGE)
2675    printf("%s: subpage %p len %u addr " TARGET_FMT_plx "\n", __func__,
2676           subpage, len, addr);
2677#endif
2678    res = flatview_read(subpage->fv, addr + subpage->base, attrs, buf, len);
2679    if (res) {
2680        return res;
2681    }
2682    switch (len) {
2683    case 1:
2684        *data = ldub_p(buf);
2685        return MEMTX_OK;
2686    case 2:
2687        *data = lduw_p(buf);
2688        return MEMTX_OK;
2689    case 4:
2690        *data = ldl_p(buf);
2691        return MEMTX_OK;
2692    case 8:
2693        *data = ldq_p(buf);
2694        return MEMTX_OK;
2695    default:
2696        abort();
2697    }
2698}
2699
2700static MemTxResult subpage_write(void *opaque, hwaddr addr,
2701                                 uint64_t value, unsigned len, MemTxAttrs attrs)
2702{
2703    subpage_t *subpage = opaque;
2704    uint8_t buf[8];
2705
2706#if defined(DEBUG_SUBPAGE)
2707    printf("%s: subpage %p len %u addr " TARGET_FMT_plx
2708           " value %"PRIx64"\n",
2709           __func__, subpage, len, addr, value);
2710#endif
2711    switch (len) {
2712    case 1:
2713        stb_p(buf, value);
2714        break;
2715    case 2:
2716        stw_p(buf, value);
2717        break;
2718    case 4:
2719        stl_p(buf, value);
2720        break;
2721    case 8:
2722        stq_p(buf, value);
2723        break;
2724    default:
2725        abort();
2726    }
2727    return flatview_write(subpage->fv, addr + subpage->base, attrs, buf, len);
2728}
2729
2730static bool subpage_accepts(void *opaque, hwaddr addr,
2731                            unsigned len, bool is_write)
2732{
2733    subpage_t *subpage = opaque;
2734#if defined(DEBUG_SUBPAGE)
2735    printf("%s: subpage %p %c len %u addr " TARGET_FMT_plx "\n",
2736           __func__, subpage, is_write ? 'w' : 'r', len, addr);
2737#endif
2738
2739    return flatview_access_valid(subpage->fv, addr + subpage->base,
2740                                 len, is_write);
2741}
2742
2743static const MemoryRegionOps subpage_ops = {
2744    .read_with_attrs = subpage_read,
2745    .write_with_attrs = subpage_write,
2746    .impl.min_access_size = 1,
2747    .impl.max_access_size = 8,
2748    .valid.min_access_size = 1,
2749    .valid.max_access_size = 8,
2750    .valid.accepts = subpage_accepts,
2751    .endianness = DEVICE_NATIVE_ENDIAN,
2752};
2753
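    /*
     * Subpages provide sub-TARGET_PAGE_SIZE dispatch granularity: one
     * subpage_t covers a whole target page and maps each offset (via
     * SUBPAGE_IDX) to a section index in sub_section[], so accesses are
     * routed back through flatview_read()/flatview_write() on the owning
     * FlatView.
     */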
2754static int subpage_register (subpage_t *mmio, uint32_t start, uint32_t end,
2755                             uint16_t section)
2756{
2757    int idx, eidx;
2758
2759    if (start >= TARGET_PAGE_SIZE || end >= TARGET_PAGE_SIZE)
2760        return -1;
2761    idx = SUBPAGE_IDX(start);
2762    eidx = SUBPAGE_IDX(end);
2763#if defined(DEBUG_SUBPAGE)
2764    printf("%s: %p start %08x end %08x idx %08x eidx %08x section %d\n",
2765           __func__, mmio, start, end, idx, eidx, section);
2766#endif
2767    for (; idx <= eidx; idx++) {
2768        mmio->sub_section[idx] = section;
2769    }
2770
2771    return 0;
2772}
2773
2774static subpage_t *subpage_init(FlatView *fv, hwaddr base)
2775{
2776    subpage_t *mmio;
2777
2778    mmio = g_malloc0(sizeof(subpage_t) + TARGET_PAGE_SIZE * sizeof(uint16_t));
2779    mmio->fv = fv;
2780    mmio->base = base;
2781    memory_region_init_io(&mmio->iomem, NULL, &subpage_ops, mmio,
2782                          NULL, TARGET_PAGE_SIZE);
2783    mmio->iomem.subpage = true;
2784#if defined(DEBUG_SUBPAGE)
2785    printf("%s: %p base " TARGET_FMT_plx " len %08x\n", __func__,
2786           mmio, base, TARGET_PAGE_SIZE);
2787#endif
2788    subpage_register(mmio, 0, TARGET_PAGE_SIZE-1, PHYS_SECTION_UNASSIGNED);
2789
2790    return mmio;
2791}
2792
2793static uint16_t dummy_section(PhysPageMap *map, FlatView *fv, MemoryRegion *mr)
2794{
2795    assert(fv);
2796    MemoryRegionSection section = {
2797        .fv = fv,
2798        .mr = mr,
2799        .offset_within_address_space = 0,
2800        .offset_within_region = 0,
2801        .size = int128_2_64(),
2802    };
2803
2804    return phys_section_add(map, &section);
2805}
2806
2807MemoryRegion *iotlb_to_region(CPUState *cpu, hwaddr index, MemTxAttrs attrs)
2808{
2809    int asidx = cpu_asidx_from_attrs(cpu, attrs);
2810    CPUAddressSpace *cpuas = &cpu->cpu_ases[asidx];
2811    AddressSpaceDispatch *d = atomic_rcu_read(&cpuas->memory_dispatch);
2812    MemoryRegionSection *sections = d->map.sections;
2813
2814    return sections[index & ~TARGET_PAGE_MASK].mr;
2815}
2816
2817static void io_mem_init(void)
2818{
2819    memory_region_init_io(&io_mem_rom, NULL, &unassigned_mem_ops, NULL, NULL, UINT64_MAX);
2820    memory_region_init_io(&io_mem_unassigned, NULL, &unassigned_mem_ops, NULL,
2821                          NULL, UINT64_MAX);
2822
2823    /* io_mem_notdirty calls tb_invalidate_phys_page_fast,
2824     * which can be called without the iothread mutex.
2825     */
2826    memory_region_init_io(&io_mem_notdirty, NULL, &notdirty_mem_ops, NULL,
2827                          NULL, UINT64_MAX);
2828    memory_region_clear_global_locking(&io_mem_notdirty);
2829
2830    memory_region_init_io(&io_mem_watch, NULL, &watch_mem_ops, NULL,
2831                          NULL, UINT64_MAX);
2832}
2833
2834AddressSpaceDispatch *address_space_dispatch_new(FlatView *fv)
2835{
2836    AddressSpaceDispatch *d = g_new0(AddressSpaceDispatch, 1);
2837    uint16_t n;
2838
2839    n = dummy_section(&d->map, fv, &io_mem_unassigned);
2840    assert(n == PHYS_SECTION_UNASSIGNED);
2841    n = dummy_section(&d->map, fv, &io_mem_notdirty);
2842    assert(n == PHYS_SECTION_NOTDIRTY);
2843    n = dummy_section(&d->map, fv, &io_mem_rom);
2844    assert(n == PHYS_SECTION_ROM);
2845    n = dummy_section(&d->map, fv, &io_mem_watch);
2846    assert(n == PHYS_SECTION_WATCH);
2847
2848    d->phys_map  = (PhysPageEntry) { .ptr = PHYS_MAP_NODE_NIL, .skip = 1 };
2849
2850    return d;
2851}
2852
2853void address_space_dispatch_free(AddressSpaceDispatch *d)
2854{
2855    phys_sections_free(&d->map);
2856    g_free(d);
2857}
2858
2859static void tcg_commit(MemoryListener *listener)
2860{
2861    CPUAddressSpace *cpuas;
2862    AddressSpaceDispatch *d;
2863
2864    /* since each CPU stores ram addresses in its TLB cache, we must
2865       reset the modified entries */
2866    cpuas = container_of(listener, CPUAddressSpace, tcg_as_listener);
2867    cpu_reloading_memory_map();
2868    /* The CPU and TLB are protected by the iothread lock.
2869     * We reload the dispatch pointer now because cpu_reloading_memory_map()
2870     * may have split the RCU critical section.
2871     */
2872    d = address_space_to_dispatch(cpuas->as);
2873    atomic_rcu_set(&cpuas->memory_dispatch, d);
2874    tlb_flush(cpuas->cpu);
2875}
2876
2877static void memory_map_init(void)
2878{
2879    system_memory = g_malloc(sizeof(*system_memory));
2880
2881    memory_region_init(system_memory, NULL, "system", UINT64_MAX);
2882    address_space_init(&address_space_memory, system_memory, "memory");
2883
2884    system_io = g_malloc(sizeof(*system_io));
2885    memory_region_init_io(system_io, NULL, &unassigned_io_ops, NULL, "io",
2886                          65536);
2887    address_space_init(&address_space_io, system_io, "I/O");
2888}
2889
2890MemoryRegion *get_system_memory(void)
2891{
2892    return system_memory;
2893}
2894
2895MemoryRegion *get_system_io(void)
2896{
2897    return system_io;
2898}
2899
2900#endif /* !defined(CONFIG_USER_ONLY) */
2901
2902/* physical memory access (slow version, mainly for debug) */
2903#if defined(CONFIG_USER_ONLY)
2904int cpu_memory_rw_debug(CPUState *cpu, target_ulong addr,
2905                        uint8_t *buf, int len, int is_write)
2906{
2907    int l, flags;
2908    target_ulong page;
2909    void * p;
2910
2911    while (len > 0) {
2912        page = addr & TARGET_PAGE_MASK;
2913        l = (page + TARGET_PAGE_SIZE) - addr;
2914        if (l > len)
2915            l = len;
2916        flags = page_get_flags(page);
2917        if (!(flags & PAGE_VALID))
2918            return -1;
2919        if (is_write) {
2920            if (!(flags & PAGE_WRITE))
2921                return -1;
2922            /* XXX: this code should not depend on lock_user */
2923            if (!(p = lock_user(VERIFY_WRITE, addr, l, 0)))
2924                return -1;
2925            memcpy(p, buf, l);
2926            unlock_user(p, addr, l);
2927        } else {
2928            if (!(flags & PAGE_READ))
2929                return -1;
2930            /* XXX: this code should not depend on lock_user */
2931            if (!(p = lock_user(VERIFY_READ, addr, l, 1)))
2932                return -1;
2933            memcpy(buf, p, l);
2934            unlock_user(p, addr, 0);
2935        }
2936        len -= l;
2937        buf += l;
2938        addr += l;
2939    }
2940    return 0;
2941}
2942
2943void cpu_set_mr(Object *obj, Visitor *v, void *opaque,
2944                const char *name, Error **errp)
2945{
2946}
2947
2948#else
2949
2950void cpu_set_mr(Object *obj, Visitor *v, void *opaque,
2951                const char *name, Error **errp)
2952{
2953    CPUState *cpu = CPU(obj);
2954    Error *local_err = NULL;
2955    char *path = NULL;
2956
2957    visit_type_str(v, name, &path, &local_err);
2958
2959    if (!local_err && strcmp(path, "") != 0) {
2960        cpu->memory = MEMORY_REGION(object_resolve_link(obj, name, path,
2961                                &local_err));
2962    }
2963
2964    if (local_err) {
2965        error_propagate(errp, local_err);
2966        return;
2967    }
2968
2969    object_ref(OBJECT(cpu->memory));
2970    cpu->as = g_malloc0(sizeof(AddressSpace));
2971    address_space_init(cpu->as, cpu->memory, NULL);
2972}
2973
2974static void invalidate_and_set_dirty(MemoryRegion *mr, hwaddr addr,
2975                                     hwaddr length)
2976{
2977    uint8_t dirty_log_mask = memory_region_get_dirty_log_mask(mr);
2978    addr += memory_region_get_ram_addr(mr);
2979
2980    /* No early return if dirty_log_mask is or becomes 0, because
2981     * cpu_physical_memory_set_dirty_range will still call
2982     * xen_modified_memory.
2983     */
2984    if (dirty_log_mask) {
2985        dirty_log_mask =
2986            cpu_physical_memory_range_includes_clean(addr, length, dirty_log_mask);
2987    }
2988    if (dirty_log_mask & (1 << DIRTY_MEMORY_CODE)) {
2989        assert(tcg_enabled());
2990        tb_lock();
2991        tb_invalidate_phys_range(addr, addr + length);
2992        tb_unlock();
2993        dirty_log_mask &= ~(1 << DIRTY_MEMORY_CODE);
2994    }
2995    cpu_physical_memory_set_dirty_range(addr, length, dirty_log_mask);
2996}
2997
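    /*
     * Clamp an access of length @l at @addr to what the region supports: no
     * larger than valid.max_access_size (default 4), no larger than the
     * natural alignment of @addr unless the region allows unaligned
     * accesses, and always a power of two.  For example, l = 6 at an address
     * ending in ...2 with max_access_size 4 is clamped to 2.
     */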
2998static int memory_access_size(MemoryRegion *mr, unsigned l, hwaddr addr)
2999{
3000    unsigned access_size_max = mr->ops->valid.max_access_size;
3001
3002    /* Regions are assumed to support 1-4 byte accesses unless
3003       otherwise specified.  */
3004    if (access_size_max == 0) {
3005        access_size_max = 4;
3006    }
3007
3008    /* Bound the maximum access by the alignment of the address.  */
3009    if (!mr->ops->impl.unaligned) {
3010        unsigned align_size_max = addr & -addr;
3011        if (align_size_max != 0 && align_size_max < access_size_max) {
3012            access_size_max = align_size_max;
3013        }
3014    }
3015
3016    /* Don't attempt accesses larger than the maximum.  */
3017    if (l > access_size_max) {
3018        l = access_size_max;
3019    }
3020    l = pow2floor(l);
3021
3022    return l;
3023}
3024
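    /*
     * Take the global iothread lock before dispatching to a device that
     * still depends on it (mr->global_locking), and flush any coalesced MMIO
     * so buffered writes reach the device first.  Returns true when the
     * caller must release the lock again after the access.
     */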
3025static bool prepare_mmio_access(MemoryRegion *mr)
3026{
3027    bool unlocked = !qemu_mutex_iothread_locked();
3028    bool release_lock = false;
3029
3030    if (unlocked && mr->global_locking) {
3031        qemu_mutex_lock_iothread();
3032        unlocked = false;
3033        release_lock = true;
3034    }
3035    if (mr->flush_coalesced_mmio) {
3036        if (unlocked) {
3037            qemu_mutex_lock_iothread();
3038        }
3039        qemu_flush_coalesced_mmio_buffer();
3040        if (unlocked) {
3041            qemu_mutex_unlock_iothread();
3042        }
3043    }
3044
3045    return release_lock;
3046}
3047
3048/* Called within RCU critical section.  */
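    /*
     * Core write loop: for MMIO, split the buffer into the largest accesses
     * the region accepts and dispatch them (falling back to the region's
     * ->access hook when one is provided); for RAM, memcpy into the host
     * pointer and mark the range dirty.  The FlatView is re-consulted after
     * every chunk because each translation is only valid for @l bytes.
     */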
3049static MemTxResult flatview_write_continue(FlatView *fv, hwaddr addr,
3050                                           MemTxAttrs attrs,
3051                                           const uint8_t *buf,
3052                                           int len, hwaddr addr1,
3053                                           hwaddr l, MemoryRegion *mr)
3054{
3055    uint8_t *ptr;
3056    uint64_t val;
3057    MemTxResult result = MEMTX_OK;
3058    bool release_lock = false;
3059
3060    for (;;) {
3061        if (!memory_access_is_direct(mr, true)) {
3062            release_lock |= prepare_mmio_access(mr);
3063            l = memory_access_size(mr, l, addr1);
3064            /* XXX: could force current_cpu to NULL to avoid
3065               potential bugs */
3066            switch (l) {
3067            case 8:
3068                /* 64 bit write access */
3069                val = ldq_p(buf);
3070                result |= memory_region_dispatch_write(mr, addr1, val, 8,
3071                                                       attrs);
3072                break;
3073            case 4:
3074                /* 32 bit write access */
3075                val = (uint32_t)ldl_p(buf);
3076                result |= memory_region_dispatch_write(mr, addr1, val, 4,
3077                                                       attrs);
3078                break;
3079            case 2:
3080                /* 16 bit write access */
3081                val = lduw_p(buf);
3082                result |= memory_region_dispatch_write(mr, addr1, val, 2,
3083                                                       attrs);
3084                break;
3085            case 1:
3086                /* 8 bit write access */
3087                val = ldub_p(buf);
3088                result |= memory_region_dispatch_write(mr, addr1, val, 1,
3089                                                       attrs);
3090                break;
3091            default:
3092                if (mr->ops->access) {
3093                    MemoryTransaction tr = {
3094                        .data.p8 = (uint8_t *) buf,
3095                        .rw = true,
3096                        .addr = addr1,
3097                        .size = l,
3098                        .attr = attrs,
3099                        .opaque = mr->opaque,
3100                    };
3101                    mr->ops->access(&tr);
3102                } else {
3103                    abort();
3104                }
3105            }
3106        } else {
3107            /* RAM case */
3108            ptr = qemu_ram_ptr_length(mr->ram_block, addr1, &l, false);
3109            memcpy(ptr, buf, l);
3110            invalidate_and_set_dirty(mr, addr1, l);
3111        }
3112
3113        if (release_lock) {
3114            qemu_mutex_unlock_iothread();
3115            release_lock = false;
3116        }
3117
3118        len -= l;
3119        buf += l;
3120        addr += l;
3121
3122        if (!len) {
3123            break;
3124        }
3125
3126        l = len;
3127        mr = flatview_translate(fv, addr, &addr1, &l, true, &attrs);
3128    }
3129
3130    return result;
3131}
3132
3133static MemTxResult flatview_write(FlatView *fv, hwaddr addr, MemTxAttrs attrs,
3134                                  const uint8_t *buf, int len)
3135{
3136    hwaddr l;
3137    hwaddr addr1;
3138    MemoryRegion *mr;
3139    MemTxResult result = MEMTX_OK;
3140
3141    if (len > 0) {
3142        rcu_read_lock();
3143        l = len;
3144        mr = flatview_translate(fv, addr, &addr1, &l, true, &attrs);
3145        result = flatview_write_continue(fv, addr, attrs, buf, len,
3146                                         addr1, l, mr);
3147        rcu_read_unlock();
3148    }
3149
3150    return result;
3151}
3152
3153MemTxResult address_space_write(AddressSpace *as, hwaddr addr,
3154                                              MemTxAttrs attrs,
3155                                              const uint8_t *buf, int len)
3156{
3157    return flatview_write(address_space_to_flatview(as), addr, attrs, buf, len);
3158}
3159
3160/* Called within RCU critical section.  */
3161MemTxResult flatview_read_continue(FlatView *fv, hwaddr addr,
3162                                   MemTxAttrs attrs, uint8_t *buf,
3163                                   int len, hwaddr addr1, hwaddr l,
3164                                   MemoryRegion *mr)
3165{
3166    uint8_t *ptr;
3167    uint64_t val;
3168    MemTxResult result = MEMTX_OK;
3169    bool release_lock = false;
3170
3171    for (;;) {
3172        if (!memory_access_is_direct(mr, false)) {
3173            /* I/O case */
3174            release_lock |= prepare_mmio_access(mr);
3175            l = memory_access_size(mr, l, addr1);
3176            switch (l) {
3177            case 8:
3178                /* 64 bit read access */
3179                result |= memory_region_dispatch_read(mr, addr1, &val, 8,
3180                                                      attrs);
3181                stq_p(buf, val);
3182                break;
3183            case 4:
3184                /* 32 bit read access */
3185                result |= memory_region_dispatch_read(mr, addr1, &val, 4,
3186                                                      attrs);
3187                stl_p(buf, val);
3188                break;
3189            case 2:
3190                /* 16 bit read access */
3191                result |= memory_region_dispatch_read(mr, addr1, &val, 2,
3192                                                      attrs);
3193                stw_p(buf, val);
3194                break;
3195            case 1:
3196                /* 8 bit read access */
3197                result |= memory_region_dispatch_read(mr, addr1, &val, 1,
3198                                                      attrs);
3199                stb_p(buf, val);
3200                break;
3201            default:
3202                if (mr->ops->access) {
3203                    MemoryTransaction tr = {
3204                        .data.p8 = buf,
3205                        .rw = false,
3206                        .addr = addr1,
3207                        .size = l,
3208                        .attr = attrs,
3209                        .opaque = mr->opaque,
3210                    };
3211                    mr->ops->access(&tr);
3212                } else {
3213                    abort();
3214                }
3215            }
3216        } else {
3217            /* RAM case */
3218            ptr = qemu_ram_ptr_length(mr->ram_block, addr1, &l, false);
3219            memcpy(buf, ptr, l);
3220        }
3221
3222        if (release_lock) {
3223            qemu_mutex_unlock_iothread();
3224            release_lock = false;
3225        }
3226
3227        len -= l;
3228        buf += l;
3229        addr += l;
3230
3231        if (!len) {
3232            break;
3233        }
3234
3235        l = len;
3236        mr = flatview_translate(fv, addr, &addr1, &l, false, &attrs);
3237    }
3238
3239    return result;
3240}
3241
3242MemTxResult flatview_read_full(FlatView *fv, hwaddr addr,
3243                               MemTxAttrs attrs, uint8_t *buf, int len)
3244{
3245    hwaddr l;
3246    hwaddr addr1;
3247    MemoryRegion *mr;
3248    MemTxResult result = MEMTX_OK;
3249
3250    if (len > 0) {
3251        rcu_read_lock();
3252        l = len;
3253        mr = flatview_translate(fv, addr, &addr1, &l, false, &attrs);
3254        result = flatview_read_continue(fv, addr, attrs, buf, len,
3255                                        addr1, l, mr);
3256        rcu_read_unlock();
3257    }
3258
3259    return result;
3260}
3261
3262static MemTxResult flatview_rw(FlatView *fv, hwaddr addr, MemTxAttrs attrs,
3263                               uint8_t *buf, int len, bool is_write)
3264{
3265    if (is_write) {
3266        return flatview_write(fv, addr, attrs, (uint8_t *)buf, len);
3267    } else {
3268        return flatview_read(fv, addr, attrs, (uint8_t *)buf, len);
3269    }
3270}
3271
3272MemTxResult address_space_rw(AddressSpace *as, hwaddr addr,
3273                             MemTxAttrs attrs, uint8_t *buf,
3274                             int len, bool is_write)
3275{
3276    return flatview_rw(address_space_to_flatview(as),
3277                       addr, attrs, buf, len, is_write);
3278}
3279
3280void cpu_physical_memory_rw(hwaddr addr, uint8_t *buf,
3281                            int len, int is_write)
3282{
3283    address_space_rw(&address_space_memory, addr, MEMTXATTRS_UNSPECIFIED,
3284                     buf, len, is_write);
3285}
3286
3287enum write_rom_type {
3288    WRITE_DATA,
3289    FLUSH_CACHE,
3290};
3291
3292static inline void cpu_physical_memory_write_rom_internal(AddressSpace *as,
3293    hwaddr addr, const uint8_t *buf, int len, enum write_rom_type type)
3294{
3295    hwaddr l;
3296    uint8_t *ptr;
3297    hwaddr addr1;
3298    MemoryRegion *mr;
3299
3300    rcu_read_lock();
3301    while (len > 0) {
3302        l = len;
3303        mr = address_space_translate(as, addr, &addr1, &l, true);
3304
3305        if (!(memory_region_is_ram(mr) ||
3306              memory_region_is_romd(mr))) {
3307            if (type == WRITE_DATA) {
3308                address_space_rw(as, addr, MEMTXATTRS_UNSPECIFIED,
 3309                                 (uint8_t *) buf, l, true);
3310            } else {
3311                l = memory_access_size(mr, l, addr1);
3312            }
3313        } else {
3314            /* ROM/RAM case */
3315            ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
3316            switch (type) {
3317            case WRITE_DATA:
3318                memcpy(ptr, buf, l);
3319                invalidate_and_set_dirty(mr, addr1, l);
3320                break;
3321            case FLUSH_CACHE:
3322                flush_icache_range((uintptr_t)ptr, (uintptr_t)ptr + l);
3323                break;
3324            }
3325        }
3326        len -= l;
3327        buf += l;
3328        addr += l;
3329    }
3330    rcu_read_unlock();
3331}
3332
 3333/* Used for ROM loading: can write in RAM and ROM */
3334void cpu_physical_memory_write_rom(AddressSpace *as, hwaddr addr,
3335                                   const uint8_t *buf, int len)
3336{
3337    cpu_physical_memory_write_rom_internal(as, addr, buf, len, WRITE_DATA);
3338}
3339
3340void cpu_flush_icache_range(hwaddr start, int len)
3341{
3342    /*
3343     * This function should do the same thing as an icache flush that was
3344     * triggered from within the guest. For TCG we are always cache coherent,
3345     * so there is no need to flush anything. For KVM / Xen we need to flush
3346     * the host's instruction cache at least.
3347     */
3348    if (tcg_enabled()) {
3349        return;
3350    }
3351
3352    cpu_physical_memory_write_rom_internal(&address_space_memory,
3353                                           start, NULL, len, FLUSH_CACHE);
3354}
3355
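    /*
     * When a mapping request cannot be satisfied with a direct host pointer
     * (typically MMIO), address_space_map() falls back to the single bounce
     * buffer below; if that is already in use, the mapping fails.  Callers
     * that see NULL can register a bottom half with cpu_register_map_client()
     * to be scheduled once the bounce buffer is free again and a retry is
     * likely to succeed.
     */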
3356typedef struct {
3357    MemoryRegion *mr;
3358    void *buffer;
3359    hwaddr addr;
3360    hwaddr len;
3361    bool in_use;
3362} BounceBuffer;
3363
3364static BounceBuffer bounce;
3365
3366typedef struct MapClient {
3367    QEMUBH *bh;
3368    QLIST_ENTRY(MapClient) link;
3369} MapClient;
3370
3371QemuMutex map_client_list_lock;
3372static QLIST_HEAD(map_client_list, MapClient) map_client_list
3373    = QLIST_HEAD_INITIALIZER(map_client_list);
3374
3375static void cpu_unregister_map_client_do(MapClient *client)
3376{
3377    QLIST_REMOVE(client, link);
3378    g_free(client);
3379}
3380
3381static void cpu_notify_map_clients_locked(void)
3382{
3383    MapClient *client;
3384
3385    while (!QLIST_EMPTY(&map_client_list)) {
3386        client = QLIST_FIRST(&map_client_list);
3387        qemu_bh_schedule(client->bh);
3388        cpu_unregister_map_client_do(client);
3389    }
3390}
3391
3392void cpu_register_map_client(QEMUBH *bh)
3393{
3394    MapClient *client = g_malloc(sizeof(*client));
3395
3396    qemu_mutex_lock(&map_client_list_lock);
3397    client->bh = bh;
3398    QLIST_INSERT_HEAD(&map_client_list, client, link);
3399    if (!atomic_read(&bounce.in_use)) {
3400        cpu_notify_map_clients_locked();
3401    }
3402    qemu_mutex_unlock(&map_client_list_lock);
3403}
3404
3405void cpu_exec_init_all(void)
3406{
3407    qemu_mutex_init(&ram_list.mutex);
3408    /* The data structures we set up here depend on knowing the page size,
3409     * so no more changes can be made after this point.
3410     * In an ideal world, nothing we did before we had finished the
3411     * machine setup would care about the target page size, and we could
3412     * do this much later, rather than requiring board models to state
3413     * up front what their requirements are.
3414     */
3415    finalize_target_page_bits();
3416    io_mem_init();
3417    memory_map_init();
3418    qemu_mutex_init(&map_client_list_lock);
3419}
3420
3421void cpu_unregister_map_client(QEMUBH *bh)
3422{
3423    MapClient *client;
3424
3425    qemu_mutex_lock(&map_client_list_lock);
3426    QLIST_FOREACH(client, &map_client_list, link) {
3427        if (client->bh == bh) {
3428            cpu_unregister_map_client_do(client);
3429            break;
3430        }
3431    }
3432    qemu_mutex_unlock(&map_client_list_lock);
3433}
3434
3435static void cpu_notify_map_clients(void)
3436{
3437    qemu_mutex_lock(&map_client_list_lock);
3438    cpu_notify_map_clients_locked();
3439    qemu_mutex_unlock(&map_client_list_lock);
3440}
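
/* Editor's note: illustrative sketch only, not part of QEMU.  It shows the
 * retry protocol around the single bounce buffer: when address_space_map()
 * returns NULL, register a bottom half with cpu_register_map_client(); it is
 * scheduled once from cpu_notify_map_clients() when the buffer is released.
 * ExampleDev, example_dev_try_dma() and retry_bh are hypothetical.
 */
#if 0 /* illustrative sketch, not compiled */
typedef struct ExampleDev {
    AddressSpace *dma_as;
    QEMUBH *retry_bh;          /* created with qemu_bh_new() at realize time */
} ExampleDev;

static void example_dev_try_dma(ExampleDev *dev, hwaddr addr, hwaddr size)
{
    hwaddr plen = size;
    void *p = address_space_map(dev->dma_as, addr, &plen, true);

    if (!p) {
        /* Bounce buffer busy: the BH will retry when it is free again. */
        cpu_register_map_client(dev->retry_bh);
        return;
    }
    /* ... fill p[0 .. plen) ... */
    address_space_unmap(dev->dma_as, p, plen, true, plen);
}
#endif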
3441
3442static bool flatview_access_valid(FlatView *fv, hwaddr addr, int len,
3443                                  bool is_write)
3444{
3445    MemoryRegion *mr;
3446    hwaddr l, xlat;
3447
3448    rcu_read_lock();
3449    while (len > 0) {
3450        l = len;
3451        mr = flatview_translate(fv, addr, &xlat, &l, is_write,
3452                                &MEMTXATTRS_UNSPECIFIED);
3453        if (!memory_access_is_direct(mr, is_write)) {
3454            l = memory_access_size(mr, l, addr);
3455            if (!memory_region_access_valid(mr, xlat, l, is_write)) {
3456                rcu_read_unlock();
3457                return false;
3458            }
3459        }
3460
3461        len -= l;
3462        addr += l;
3463    }
3464    rcu_read_unlock();
3465    return true;
3466}
3467
3468bool address_space_access_valid(AddressSpace *as, hwaddr addr,
3469                                int len, bool is_write)
3470{
3471    return flatview_access_valid(address_space_to_flatview(as),
3472                                 addr, len, is_write);
3473}
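
/* Editor's note: illustrative sketch only, not part of QEMU.  A caller can
 * probe a whole range with address_space_access_valid() and fail cleanly up
 * front instead of finding out half-way through a multi-part transfer.
 * example_checked_write() is hypothetical.
 */
#if 0 /* illustrative sketch, not compiled */
static bool example_checked_write(AddressSpace *as, hwaddr addr,
                                  const uint8_t *buf, int len)
{
    if (!address_space_access_valid(as, addr, len, true)) {
        return false;                      /* range is not fully writable */
    }
    address_space_write(as, addr, MEMTXATTRS_UNSPECIFIED, buf, len);
    return true;
}
#endif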
3474
3475static hwaddr
3476flatview_extend_translation(FlatView *fv, hwaddr addr,
3477                            hwaddr target_len,
3478                            MemoryRegion *mr, hwaddr base, hwaddr len,
3479                            bool is_write)
3480{
3481    hwaddr done = 0;
3482    hwaddr xlat;
3483    MemoryRegion *this_mr;
3484
3485    for (;;) {
3486        target_len -= len;
3487        addr += len;
3488        done += len;
3489        if (target_len == 0) {
3490            return done;
3491        }
3492
3493        len = target_len;
3494        this_mr = flatview_translate(fv, addr, &xlat,
3495                                     &len, is_write,
3496                                     &MEMTXATTRS_UNSPECIFIED);
3497        if (this_mr != mr || xlat != base + done) {
3498            return done;
3499        }
3500    }
3501}
3502
3503/* Map a physical memory region into a host virtual address.
3504 * May map a subset of the requested range, given by and returned in *plen.
3505 * May return NULL if resources needed to perform the mapping are exhausted.
3506 * Use only for reads OR writes - not for read-modify-write operations.
3507 * Use cpu_register_map_client() to know when retrying the map operation is
3508 * likely to succeed.
3509 */
3510void *address_space_map(AddressSpace *as,
3511                        hwaddr addr,
3512                        hwaddr *plen,
3513                        bool is_write)
3514{
3515    hwaddr len = *plen;
3516    hwaddr l, xlat;
3517    MemoryRegion *mr;
3518    void *ptr;
3519    FlatView *fv = address_space_to_flatview(as);
3520
3521    if (len == 0) {
3522        return NULL;
3523    }
3524
3525    l = len;
3526    rcu_read_lock();
3527    mr = flatview_translate(fv, addr, &xlat, &l, is_write,
3528                            &MEMTXATTRS_UNSPECIFIED);
3529
3530    if (!memory_access_is_direct(mr, is_write)) {
3531        if (atomic_xchg(&bounce.in_use, true)) {
3532            rcu_read_unlock();
3533            return NULL;
3534        }
3535        /* Avoid unbounded allocations */
3536        l = MIN(l, TARGET_PAGE_SIZE);
3537        bounce.buffer = qemu_memalign(TARGET_PAGE_SIZE, l);
3538        bounce.addr = addr;
3539        bounce.len = l;
3540
3541        memory_region_ref(mr);
3542        bounce.mr = mr;
3543        if (!is_write) {
3544            flatview_read(fv, addr, MEMTXATTRS_UNSPECIFIED,
3545                          bounce.buffer, l);
3546        }
3547
3548        rcu_read_unlock();
3549        *plen = l;
3550        return bounce.buffer;
3551    }
3552
3554    memory_region_ref(mr);
3555    *plen = flatview_extend_translation(fv, addr, len, mr, xlat,
3556                                        l, is_write);
3557    ptr = qemu_ram_ptr_length(mr->ram_block, xlat, plen, true);
3558    rcu_read_unlock();
3559
3560    return ptr;
3561}
3562
3563/* Unmaps a memory region previously mapped by address_space_map().
3564 * Will also mark the memory as dirty if is_write == 1.  access_len gives
3565 * the amount of memory that was actually read or written by the caller.
3566 */
3567void address_space_unmap(AddressSpace *as, void *buffer, hwaddr len,
3568                         int is_write, hwaddr access_len)
3569{
3570    if (buffer != bounce.buffer) {
3571        MemoryRegion *mr;
3572        ram_addr_t addr1;
3573
3574        mr = memory_region_from_host(buffer, &addr1);
3575        assert(mr != NULL);
3576        if (is_write) {
3577            invalidate_and_set_dirty(mr, addr1, access_len);
3578        }
3579        if (xen_enabled()) {
3580            xen_invalidate_map_cache_entry(buffer);
3581        }
3582        memory_region_unref(mr);
3583        return;
3584    }
3585    if (is_write) {
3586        address_space_write(as, bounce.addr, MEMTXATTRS_UNSPECIFIED,
3587                            bounce.buffer, access_len);
3588    }
3589    qemu_vfree(bounce.buffer);
3590    bounce.buffer = NULL;
3591    memory_region_unref(bounce.mr);
3592    atomic_mb_set(&bounce.in_use, false);
3593    cpu_notify_map_clients();
3594}
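
/* Editor's note: illustrative sketch only, not part of QEMU.  The canonical
 * address_space_map()/address_space_unmap() pattern: the mapping may come
 * back shorter than requested (*plen is updated), and access_len on unmap
 * should reflect how much was actually touched.  example_peek_u8() is
 * hypothetical.
 */
#if 0 /* illustrative sketch, not compiled */
static int example_peek_u8(AddressSpace *as, hwaddr addr, uint8_t *out)
{
    hwaddr plen = 1;
    void *p = address_space_map(as, addr, &plen, false /* read */);

    if (!p) {
        return -1;            /* nothing mapped or bounce buffer exhausted */
    }
    *out = *(uint8_t *)p;
    address_space_unmap(as, p, plen, false, 1 /* access_len */);
    return 0;
}
#endif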
3595
3596void *cpu_physical_memory_map(hwaddr addr,
3597                              hwaddr *plen,
3598                              int is_write)
3599{
3600    return address_space_map(&address_space_memory, addr, plen, is_write);
3601}
3602
3603void cpu_physical_memory_unmap(void *buffer, hwaddr len,
3604                               int is_write, hwaddr access_len)
3605{
3606    return address_space_unmap(&address_space_memory, buffer, len, is_write, access_len);
3607}
3608
3609#define ARG1_DECL                AddressSpace *as
3610#define ARG1                     as
3611#define SUFFIX
3612#define TRANSLATE(...)           address_space_translate(as, __VA_ARGS__)
3613#define IS_DIRECT(mr, is_write)  memory_access_is_direct(mr, is_write)
3614#define MAP_RAM(mr, ofs)         qemu_map_ram_ptr((mr)->ram_block, ofs)
3615#define INVALIDATE(mr, ofs, len) invalidate_and_set_dirty(mr, ofs, len)
3616#define RCU_READ_LOCK(...)       rcu_read_lock()
3617#define RCU_READ_UNLOCK(...)     rcu_read_unlock()
3618#include "memory_ldst.inc.c"
3619
3620int64_t address_space_cache_init(MemoryRegionCache *cache,
3621                                 AddressSpace *as,
3622                                 hwaddr addr,
3623                                 hwaddr len,
3624                                 bool is_write)
3625{
3626    cache->len = len;
3627    cache->as = as;
3628    cache->xlat = addr;
3629    return len;
3630}
3631
3632void address_space_cache_invalidate(MemoryRegionCache *cache,
3633                                    hwaddr addr,
3634                                    hwaddr access_len)
3635{
3636}
3637
3638void address_space_cache_destroy(MemoryRegionCache *cache)
3639{
3640    cache->as = NULL;
3641}
3642
3643#define ARG1_DECL                MemoryRegionCache *cache
3644#define ARG1                     cache
3645#define SUFFIX                   _cached
3646#define TRANSLATE(addr, ...)     \
3647    address_space_translate(cache->as, cache->xlat + (addr), __VA_ARGS__)
3648#define IS_DIRECT(mr, is_write)  true
3649#define MAP_RAM(mr, ofs)         qemu_map_ram_ptr((mr)->ram_block, ofs)
3650#define INVALIDATE(mr, ofs, len) invalidate_and_set_dirty(mr, ofs, len)
3651#define RCU_READ_LOCK()          rcu_read_lock()
3652#define RCU_READ_UNLOCK()        rcu_read_unlock()
3653#include "memory_ldst.inc.c"
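
/* Editor's note: illustrative sketch only, not part of QEMU.  It shows the
 * MemoryRegionCache lifecycle with the "_cached" accessors generated by the
 * memory_ldst.inc.c inclusion above; in this version the cache is just a
 * record of the AddressSpace and base offset.  ldl_phys_cached() is one of
 * the generated helpers; example_read_desc_word() is hypothetical.
 */
#if 0 /* illustrative sketch, not compiled */
static uint32_t example_read_desc_word(AddressSpace *as, hwaddr table,
                                       hwaddr offset)
{
    MemoryRegionCache cache;
    uint32_t val;

    address_space_cache_init(&cache, as, table, offset + 4, false /* read */);
    val = ldl_phys_cached(&cache, offset);  /* 32-bit target-endian load */
    address_space_cache_destroy(&cache);
    return val;
}
#endif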
3654
3655/* Virtual memory access for debug (includes writing to ROM). */
3656int cpu_memory_rw_debug(CPUState *cpu, target_ulong addr,
3657                        uint8_t *buf, int len, int is_write)
3658{
3659    int l;
3660    hwaddr phys_addr;
3661    target_ulong page;
3662
3663    cpu_synchronize_state(cpu);
3664    while (len > 0) {
3665        int asidx;
3666        MemTxAttrs attrs;
3667
3668        page = addr & TARGET_PAGE_MASK;
3669        phys_addr = cpu_get_phys_page_attrs_debug(cpu, page, &attrs);
3670        asidx = cpu_asidx_from_attrs(cpu, attrs);
3671        /* if no physical page mapped, return an error */
3672        if (phys_addr == -1)
3673            return -1;
3674        l = (page + TARGET_PAGE_SIZE) - addr;
3675        if (l > len)
3676            l = len;
3677        phys_addr += (addr & ~TARGET_PAGE_MASK);
3678        if (is_write) {
3679            cpu_physical_memory_write_rom(cpu->cpu_ases[asidx].as,
3680                                          phys_addr, buf, l);
3681        } else {
3682            address_space_rw(cpu->cpu_ases[asidx].as, phys_addr,
3683                             MEMTXATTRS_UNSPECIFIED,
3684                             buf, l, 0);
3685        }
3686        len -= l;
3687        buf += l;
3688        addr += l;
3689    }
3690    return 0;
3691}
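
/* Editor's note: illustrative sketch only, not part of QEMU.  This is how a
 * debugger front end (e.g. the gdbstub) reads guest-virtual memory: the debug
 * path translates each page via the CPU's page tables and uses the ROM-safe
 * writer for stores.  example_debug_read_u32() is hypothetical.
 */
#if 0 /* illustrative sketch, not compiled */
static int example_debug_read_u32(CPUState *cpu, target_ulong vaddr,
                                  uint32_t *out)
{
    return cpu_memory_rw_debug(cpu, vaddr, (uint8_t *)out, sizeof(*out),
                               0 /* is_write */);
}
#endif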
3692
3693/*
3694 * Allows code that needs to deal with migration bitmaps etc. to still be
3695 * built target-independent.
3696 */
3697size_t qemu_target_page_size(void)
3698{
3699    return TARGET_PAGE_SIZE;
3700}
3701
3702int qemu_target_page_bits(void)
3703{
3704    return TARGET_PAGE_BITS;
3705}
3706
3707int qemu_target_page_bits_min(void)
3708{
3709    return TARGET_PAGE_BITS_MIN;
3710}
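
/* Editor's note: illustrative sketch only, not part of QEMU.  Target-
 * independent code such as migration sizes its per-page data structures with
 * these helpers rather than using TARGET_PAGE_SIZE/TARGET_PAGE_BITS directly.
 * example_dirty_bitmap_pages() is hypothetical.
 */
#if 0 /* illustrative sketch, not compiled */
static uint64_t example_dirty_bitmap_pages(uint64_t ram_bytes)
{
    /* One bit per target page, rounding the RAM size up to a full page. */
    return (ram_bytes + qemu_target_page_size() - 1) >> qemu_target_page_bits();
}
#endif
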
3711#endif
3712
3713/*
3714 * A helper function for the _utterly broken_ virtio device model to find out if
3715 * it's running on a big endian machine. Don't do this at home kids!
3716 */
3717bool target_words_bigendian(void);
3718bool target_words_bigendian(void)
3719{
3720#if defined(TARGET_WORDS_BIGENDIAN)
3721    return true;
3722#else
3723    return false;
3724#endif
3725}
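
/* Editor's note: illustrative sketch only, not part of QEMU.  Legacy virtio
 * framing uses the guest's native byte order; target_words_bigendian() lets
 * otherwise target-independent code ask what that is.  example_to_guest16()
 * is hypothetical and assumes qemu/bswap.h is available.
 */
#if 0 /* illustrative sketch, not compiled */
static uint16_t example_to_guest16(uint16_t host_val)
{
    return target_words_bigendian() ? cpu_to_be16(host_val)
                                    : cpu_to_le16(host_val);
}
#endif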
3726
3727#ifndef CONFIG_USER_ONLY
3728bool cpu_physical_memory_is_io(hwaddr phys_addr)
3729{
3730    MemoryRegion *mr;
3731    hwaddr l = 1;
3732    bool res;
3733
3734    rcu_read_lock();
3735    mr = address_space_translate(&address_space_memory,
3736                                 phys_addr, &phys_addr, &l, false);
3737
3738    res = !(memory_region_is_ram(mr) || memory_region_is_romd(mr));
3739    rcu_read_unlock();
3740    return res;
3741}
3742
3743int qemu_ram_foreach_block(RAMBlockIterFunc func, void *opaque)
3744{
3745    RAMBlock *block;
3746    int ret = 0;
3747
3748    rcu_read_lock();
3749    RAMBLOCK_FOREACH(block) {
3750        ret = func(block->idstr, block->host, block->offset,
3751                   block->used_length, opaque);
3752        if (ret) {
3753            break;
3754        }
3755    }
3756    rcu_read_unlock();
3757    return ret;
3758}
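
/* Editor's note: illustrative sketch only, not part of QEMU.  A callback for
 * qemu_ram_foreach_block() receives each block's idstr, host pointer, offset
 * and used length; returning non-zero stops the iteration.  This one sums the
 * used length of every block; both example functions are hypothetical.
 */
#if 0 /* illustrative sketch, not compiled */
static int example_sum_ram_cb(const char *block_name, void *host_addr,
                              ram_addr_t offset, ram_addr_t length,
                              void *opaque)
{
    *(uint64_t *)opaque += length;
    return 0;                               /* keep iterating */
}

static uint64_t example_total_ram_in_use(void)
{
    uint64_t total = 0;

    qemu_ram_foreach_block(example_sum_ram_cb, &total);
    return total;
}
#endif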
3759
3760/*
3761 * Unmap pages of memory from start to start+length such that
3762 * they a) read as 0 and b) trigger whatever fault mechanism
3763 * the OS provides for postcopy.
3764 * The pages must be unmapped by the end of the function.
3765 * Returns: 0 on success, non-zero on failure.
3766 *
3767 */
3768int ram_block_discard_range(RAMBlock *rb, uint64_t start, size_t length)
3769{
3770    int ret = -1;
3771
3772    uint8_t *host_startaddr = rb->host + start;
3773
3774    if ((uintptr_t)host_startaddr & (rb->page_size - 1)) {
3775        error_report("ram_block_discard_range: Unaligned start address: %p",
3776                     host_startaddr);
3777        goto err;
3778    }
3779
3780    if ((start + length) <= rb->used_length) {
3781        uint8_t *host_endaddr = host_startaddr + length;
3782        if ((uintptr_t)host_endaddr & (rb->page_size - 1)) {
3783            error_report("ram_block_discard_range: Unaligned end address: %p",
3784                         host_endaddr);
3785            goto err;
3786        }
3787
3788        errno = ENOTSUP; /* If we are missing MADVISE etc */
3789
3790        if (rb->page_size == qemu_host_page_size) {
3791#if defined(CONFIG_MADVISE)
3792            /* Note: we need the madvise MADV_DONTNEED behaviour of
3793             * definitely freeing the page, so that it reads back as zero.
3794             */
3795            ret = madvise(host_startaddr, length, MADV_DONTNEED);
3796#endif
3797        } else {
3798            /* Huge page case: madvise(MADV_DONTNEED) does not work here,
3799             * but FALLOC_FL_PUNCH_HOLE on the backing huge page file has
3800             * the equivalent effect.
3801             */
3802#ifdef CONFIG_FALLOCATE_PUNCH_HOLE
3803            ret = fallocate(rb->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
3804                            start, length);
3805#endif
3806        }
3807        if (ret) {
3808            ret = -errno;
3809            error_report("ram_block_discard_range: Failed to discard range "
3810                         "%s:%" PRIx64 " +%zx (%d)",
3811                         rb->idstr, start, length, ret);
3812        }
3813    } else {
3814        error_report("ram_block_discard_range: Overrun block '%s' (%" PRIu64
3815                     "/%zx/" RAM_ADDR_FMT")",
3816                     rb->idstr, start, length, rb->used_length);
3817    }
3818
3819err:
3820    return ret;
3821}
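
/* Editor's note: illustrative sketch only, not part of QEMU.  Postcopy-style
 * callers discard ranges aligned to the block's page size so the pages read
 * back as zero and fault again when next touched.  example_discard_one_page()
 * is hypothetical.
 */
#if 0 /* illustrative sketch, not compiled */
static int example_discard_one_page(RAMBlock *rb, uint64_t page_index)
{
    uint64_t start = page_index * rb->page_size;

    /* Both start and length must be multiples of rb->page_size. */
    return ram_block_discard_range(rb, start, rb->page_size);
}
#endif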
3822
3823#endif
3824
3825void cpu_halt_update(CPUState *cpu)
3826{
3827    bool val;
3828    bool need_lock = !qemu_mutex_iothread_locked();
3829
3830    val = cpu->reset_pin || cpu->halt_pin || cpu->arch_halt_pin;
3831
3832    if (need_lock) {
3833        qemu_mutex_lock_iothread();
3834    }
3835
3836    if (val) {
3837        cpu_interrupt(cpu, CPU_INTERRUPT_HALT);
3838    } else {
3839        cpu_reset_interrupt(cpu, CPU_INTERRUPT_HALT);
3840        cpu_interrupt(cpu, CPU_INTERRUPT_EXITTB);
3841    }
3842
3843    cpu->exception_index = -1;
3844
3845    if (need_lock) {
3846        qemu_mutex_unlock_iothread();
3847    }
3848}
3849
3850void cpu_reset_gpio(void *opaque, int irq, int level)
3851{
3852    CPUState *cpu = CPU(opaque);
3853    int old_reset_pin = cpu->reset_pin;
3854
3855    if (level == cpu->reset_pin) {
3856        return;
3857    }
3858
3859    /* On hardware, when the reset pin is asserted the CPU resets and stays
3860     * in reset until the pin is lowered. As we don't model a held-in-reset
3861     * state, we do it a little differently: while the reset pin is high,
3862     * cpu_halt_update() halts the CPU, but the CPU is not reset. Once the
3863     * pin is lowered we reset the CPU and then let it run, as long as no
3864     * halt pin is set. This avoids having to reset twice, which can cause
3865     * issues with MTTCG.
3866     */
3867    cpu->reset_pin = level;
3868    if (old_reset_pin && !cpu->reset_pin) {
3869        cpu_reset(cpu);
3870    }
3871
3872    cpu_halt_update(cpu);
3873}
3874
3875void cpu_halt_gpio(void *opaque, int irq, int level)
3876{
3877    CPUState *cpu = CPU(opaque);
3878
3879    cpu->halt_pin = level;
3880    cpu_halt_update(cpu);
3881}
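
/* Editor's note: illustrative sketch only, not part of QEMU.  Board code can
 * expose cpu_reset_gpio()/cpu_halt_gpio() as input GPIO lines; pulsing the
 * reset line high and then low resets the CPU exactly once, per the comment
 * in cpu_reset_gpio().  Assumes hw/irq.h (qemu_allocate_irq()/qemu_set_irq())
 * is available, as in softmmu builds; the example function is hypothetical.
 */
#if 0 /* illustrative sketch, not compiled */
static void example_wire_and_pulse_reset(CPUState *cpu)
{
    qemu_irq reset = qemu_allocate_irq(cpu_reset_gpio, cpu, 0);

    qemu_set_irq(reset, 1);   /* assert: CPU halts via cpu_halt_update() */
    qemu_set_irq(reset, 0);   /* deassert: CPU is reset and may run again */
}
#endif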
3882
3883void page_size_init(void)
3884{
3885    /* NOTE: we can always suppose that qemu_host_page_size >=
3886     * TARGET_PAGE_SIZE. */
3887    if (qemu_host_page_size == 0) {
3888        qemu_host_page_size = qemu_real_host_page_size;
3889    }
3890    if (qemu_host_page_size < TARGET_PAGE_SIZE) {
3891        qemu_host_page_size = TARGET_PAGE_SIZE;
3892    }
3893    qemu_host_page_mask = -(intptr_t)qemu_host_page_size;
3894}
3895
3896#if !defined(CONFIG_USER_ONLY)
3897
3898static void mtree_print_phys_entries(fprintf_function mon, void *f,
3899                                     int start, int end, int skip, int ptr)
3900{
3901    if (start == end - 1) {
3902        mon(f, "\t%3d      ", start);
3903    } else {
3904        mon(f, "\t%3d..%-3d ", start, end - 1);
3905    }
3906    mon(f, " skip=%d ", skip);
3907    if (ptr == PHYS_MAP_NODE_NIL) {
3908        mon(f, " ptr=NIL");
3909    } else if (!skip) {
3910        mon(f, " ptr=#%d", ptr);
3911    } else {
3912        mon(f, " ptr=[%d]", ptr);
3913    }
3914    mon(f, "\n");
3915}
3916
3917#define MR_SIZE(size) (int128_nz(size) ? (hwaddr)int128_get64( \
3918                           int128_sub((size), int128_one())) : 0)
3919
3920void mtree_print_dispatch(fprintf_function mon, void *f,
3921                          AddressSpaceDispatch *d, MemoryRegion *root)
3922{
3923    int i;
3924
3925    mon(f, "  Dispatch\n");
3926    mon(f, "    Physical sections\n");
3927
3928    for (i = 0; i < d->map.sections_nb; ++i) {
3929        MemoryRegionSection *s = d->map.sections + i;
3930        const char *names[] = { " [unassigned]", " [not dirty]",
3931                                " [ROM]", " [watch]" };
3932
3933        mon(f, "      #%d @" TARGET_FMT_plx ".." TARGET_FMT_plx " %s%s%s%s%s",
3934            i,
3935            s->offset_within_address_space,
3936            s->offset_within_address_space + MR_SIZE(s->mr->size),
3937            s->mr->name ? s->mr->name : "(noname)",
3938            i < ARRAY_SIZE(names) ? names[i] : "",
3939            s->mr == root ? " [ROOT]" : "",
3940            s == d->mru_section ? " [MRU]" : "",
3941            s->mr->is_iommu ? " [iommu]" : "");
3942
3943        if (s->mr->alias) {
3944            mon(f, " alias=%s", s->mr->alias->name ?
3945                    s->mr->alias->name : "noname");
3946        }
3947        mon(f, "\n");
3948    }
3949
3950    mon(f, "    Nodes (%d bits per level, %d levels) ptr=[%d] skip=%d\n",
3951               P_L2_BITS, P_L2_LEVELS, d->phys_map.ptr, d->phys_map.skip);
3952    for (i = 0; i < d->map.nodes_nb; ++i) {
3953        int j, jprev;
3954        PhysPageEntry prev;
3955        Node *n = d->map.nodes + i;
3956
3957        mon(f, "      [%d]\n", i);
3958
3959        for (j = 0, jprev = 0, prev = *n[0]; j < ARRAY_SIZE(*n); ++j) {
3960            PhysPageEntry *pe = *n + j;
3961
3962            if (pe->ptr == prev.ptr && pe->skip == prev.skip) {
3963                continue;
3964            }
3965
3966            mtree_print_phys_entries(mon, f, jprev, j, prev.skip, prev.ptr);
3967
3968            jprev = j;
3969            prev = *pe;
3970        }
3971
3972        if (jprev != ARRAY_SIZE(*n)) {
3973            mtree_print_phys_entries(mon, f, jprev, j, prev.skip, prev.ptr);
3974        }
3975    }
3976}
3977
3978#endif
3979