qemu/exec.c
   1/*
   2 *  Virtual page mapping
   3 *
   4 *  Copyright (c) 2003 Fabrice Bellard
   5 *
   6 * This library is free software; you can redistribute it and/or
   7 * modify it under the terms of the GNU Lesser General Public
   8 * License as published by the Free Software Foundation; either
   9 * version 2 of the License, or (at your option) any later version.
  10 *
  11 * This library is distributed in the hope that it will be useful,
  12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14 * Lesser General Public License for more details.
  15 *
  16 * You should have received a copy of the GNU Lesser General Public
  17 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
  18 */
  19#include "qemu/osdep.h"
  20#include "qapi/error.h"
  21
  22#include "qemu/cutils.h"
  23#include "cpu.h"
  24#include "exec/exec-all.h"
  25#include "exec/target_page.h"
  26#include "tcg.h"
  27#include "hw/qdev-core.h"
  28#include "hw/qdev-properties.h"
  29#if !defined(CONFIG_USER_ONLY)
  30#include "hw/boards.h"
  31#include "hw/xen/xen.h"
  32#endif
  33#include "sysemu/kvm.h"
  34#include "sysemu/sysemu.h"
  35#include "qemu/timer.h"
  36#include "qemu/config-file.h"
  37#include "qemu/error-report.h"
  38#if defined(CONFIG_USER_ONLY)
  39#include "qemu.h"
  40#else /* !CONFIG_USER_ONLY */
  41#include "hw/hw.h"
  42#include "exec/memory.h"
  43#include "exec/ioport.h"
  44#include "sysemu/dma.h"
  45#include "sysemu/numa.h"
  46#include "sysemu/hw_accel.h"
  47#include "exec/address-spaces.h"
  48#include "sysemu/xen-mapcache.h"
  49#include "trace-root.h"
  50
  51#ifdef CONFIG_FALLOCATE_PUNCH_HOLE
  52#include <linux/falloc.h>
  53#endif
  54
  55#endif
  56#include "qemu/rcu_queue.h"
  57#include "qemu/main-loop.h"
  58#include "translate-all.h"
  59#include "sysemu/replay.h"
  60
  61#include "exec/memory-internal.h"
  62#include "exec/ram_addr.h"
  63#include "exec/log.h"
  64
  65#include "migration/vmstate.h"
  66
  67#include "qemu/range.h"
  68#ifndef _WIN32
  69#include "qemu/mmap-alloc.h"
  70#endif
  71
  72#include "monitor/monitor.h"
  73
  74//#define DEBUG_SUBPAGE
  75
  76#if !defined(CONFIG_USER_ONLY)
  77/* ram_list is read under rcu_read_lock()/rcu_read_unlock().  Writes
  78 * are protected by the ramlist lock.
  79 */
  80RAMList ram_list = { .blocks = QLIST_HEAD_INITIALIZER(ram_list.blocks) };
  81
  82static MemoryRegion *system_memory;
  83static MemoryRegion *system_io;
  84
  85AddressSpace address_space_io;
  86AddressSpace address_space_memory;
  87
  88MemoryRegion io_mem_rom, io_mem_notdirty;
  89static MemoryRegion io_mem_unassigned;
  90#endif
  91
  92#ifdef TARGET_PAGE_BITS_VARY
  93int target_page_bits;
  94bool target_page_bits_decided;
  95#endif
  96
  97CPUTailQ cpus = QTAILQ_HEAD_INITIALIZER(cpus);
  98
  99/* current CPU in the current thread. It is only valid inside
 100   cpu_exec() */
 101__thread CPUState *current_cpu;
 102/* 0 = Do not count executed instructions.
 103   1 = Precise instruction counting.
 104   2 = Adaptive rate instruction counting.  */
 105int use_icount;
 106
 107uintptr_t qemu_host_page_size;
 108intptr_t qemu_host_page_mask;
 109
 110bool set_preferred_target_page_bits(int bits)
 111{
 112    /* The target page size is the lowest common denominator for all
 113     * the CPUs in the system, so we can only make it smaller, never
 114     * larger. And we can't make it smaller once we've committed to
 115     * a particular size.
 116     */
 117#ifdef TARGET_PAGE_BITS_VARY
 118    assert(bits >= TARGET_PAGE_BITS_MIN);
 119    if (target_page_bits == 0 || target_page_bits > bits) {
 120        if (target_page_bits_decided) {
 121            return false;
 122        }
 123        target_page_bits = bits;
 124    }
 125#endif
 126    return true;
 127}
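/*
 * Worked example (editorial annotation, not part of the upstream file):
 * on a target built with TARGET_PAGE_BITS_VARY, a CPU that supports 4K
 * pages calls set_preferred_target_page_bits(12).  A later call with 14
 * (16K pages) leaves the value at 12, because the target page size is the
 * lowest common denominator and can only shrink.  Once
 * finalize_target_page_bits() has run, a request to shrink it further
 * (e.g. set_preferred_target_page_bits(10)) returns false.
 */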
 128
 129#if !defined(CONFIG_USER_ONLY)
 130
 131static void finalize_target_page_bits(void)
 132{
 133#ifdef TARGET_PAGE_BITS_VARY
 134    if (target_page_bits == 0) {
 135        target_page_bits = TARGET_PAGE_BITS_MIN;
 136    }
 137    target_page_bits_decided = true;
 138#endif
 139}
 140
 141typedef struct PhysPageEntry PhysPageEntry;
 142
 143struct PhysPageEntry {
  144    /* How many bits to skip to the next level (in units of L2_SIZE). 0 for a leaf. */
 145    uint32_t skip : 6;
 146     /* index into phys_sections (!skip) or phys_map_nodes (skip) */
 147    uint32_t ptr : 26;
 148};
 149
 150#define PHYS_MAP_NODE_NIL (((uint32_t)~0) >> 6)
 151
 152/* Size of the L2 (and L3, etc) page tables.  */
 153#define ADDR_SPACE_BITS 64
 154
 155#define P_L2_BITS 9
 156#define P_L2_SIZE (1 << P_L2_BITS)
 157
 158#define P_L2_LEVELS (((ADDR_SPACE_BITS - TARGET_PAGE_BITS - 1) / P_L2_BITS) + 1)
 159
 160typedef PhysPageEntry Node[P_L2_SIZE];
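/*
 * Worked example (editorial annotation, not part of the upstream file):
 * with 4K target pages (TARGET_PAGE_BITS == 12),
 * P_L2_LEVELS = ((64 - 12 - 1) / 9) + 1 = (51 / 9) + 1 = 5 + 1 = 6.
 * A physical address is resolved by shifting out the 12-bit page offset
 * and then consuming six 9-bit index fields, one Node per level; at
 * level i (counted from the leaves) the index is
 * (addr >> (12 + i * 9)) & (P_L2_SIZE - 1).  The topmost level only uses
 * 64 - 12 - 5 * 9 = 7 of its 9 index bits.
 */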
 161
 162typedef struct PhysPageMap {
 163    struct rcu_head rcu;
 164
 165    unsigned sections_nb;
 166    unsigned sections_nb_alloc;
 167    unsigned nodes_nb;
 168    unsigned nodes_nb_alloc;
 169    Node *nodes;
 170    MemoryRegionSection *sections;
 171} PhysPageMap;
 172
 173struct AddressSpaceDispatch {
 174    MemoryRegionSection *mru_section;
 175    /* This is a multi-level map on the physical address space.
 176     * The bottom level has pointers to MemoryRegionSections.
 177     */
 178    PhysPageEntry phys_map;
 179    PhysPageMap map;
 180};
 181
 182#define SUBPAGE_IDX(addr) ((addr) & ~TARGET_PAGE_MASK)
 183typedef struct subpage_t {
 184    MemoryRegion iomem;
 185    FlatView *fv;
 186    hwaddr base;
 187    uint16_t sub_section[];
 188} subpage_t;
 189
 190#define PHYS_SECTION_UNASSIGNED 0
 191#define PHYS_SECTION_NOTDIRTY 1
 192#define PHYS_SECTION_ROM 2
 193#define PHYS_SECTION_WATCH 3
 194
 195static void io_mem_init(void);
 196static void memory_map_init(void);
 197static void tcg_commit(MemoryListener *listener);
 198
 199static MemoryRegion io_mem_watch;
 200
 201/**
 202 * CPUAddressSpace: all the information a CPU needs about an AddressSpace
 203 * @cpu: the CPU whose AddressSpace this is
 204 * @as: the AddressSpace itself
 205 * @memory_dispatch: its dispatch pointer (cached, RCU protected)
 206 * @tcg_as_listener: listener for tracking changes to the AddressSpace
 207 */
 208struct CPUAddressSpace {
 209    CPUState *cpu;
 210    AddressSpace *as;
 211    struct AddressSpaceDispatch *memory_dispatch;
 212    MemoryListener tcg_as_listener;
 213};
 214
 215struct DirtyBitmapSnapshot {
 216    ram_addr_t start;
 217    ram_addr_t end;
 218    unsigned long dirty[];
 219};
 220
 221#endif
 222
 223#if !defined(CONFIG_USER_ONLY)
 224
 225static void phys_map_node_reserve(PhysPageMap *map, unsigned nodes)
 226{
 227    static unsigned alloc_hint = 16;
 228    if (map->nodes_nb + nodes > map->nodes_nb_alloc) {
 229        map->nodes_nb_alloc = MAX(map->nodes_nb_alloc, alloc_hint);
 230        map->nodes_nb_alloc = MAX(map->nodes_nb_alloc, map->nodes_nb + nodes);
 231        map->nodes = g_renew(Node, map->nodes, map->nodes_nb_alloc);
 232        alloc_hint = map->nodes_nb_alloc;
 233    }
 234}
 235
 236static uint32_t phys_map_node_alloc(PhysPageMap *map, bool leaf)
 237{
 238    unsigned i;
 239    uint32_t ret;
 240    PhysPageEntry e;
 241    PhysPageEntry *p;
 242
 243    ret = map->nodes_nb++;
 244    p = map->nodes[ret];
 245    assert(ret != PHYS_MAP_NODE_NIL);
 246    assert(ret != map->nodes_nb_alloc);
 247
 248    e.skip = leaf ? 0 : 1;
 249    e.ptr = leaf ? PHYS_SECTION_UNASSIGNED : PHYS_MAP_NODE_NIL;
 250    for (i = 0; i < P_L2_SIZE; ++i) {
 251        memcpy(&p[i], &e, sizeof(e));
 252    }
 253    return ret;
 254}
 255
 256static void phys_page_set_level(PhysPageMap *map, PhysPageEntry *lp,
 257                                hwaddr *index, hwaddr *nb, uint16_t leaf,
 258                                int level)
 259{
 260    PhysPageEntry *p;
 261    hwaddr step = (hwaddr)1 << (level * P_L2_BITS);
 262
 263    if (lp->skip && lp->ptr == PHYS_MAP_NODE_NIL) {
 264        lp->ptr = phys_map_node_alloc(map, level == 0);
 265    }
 266    p = map->nodes[lp->ptr];
 267    lp = &p[(*index >> (level * P_L2_BITS)) & (P_L2_SIZE - 1)];
 268
 269    while (*nb && lp < &p[P_L2_SIZE]) {
 270        if ((*index & (step - 1)) == 0 && *nb >= step) {
 271            lp->skip = 0;
 272            lp->ptr = leaf;
 273            *index += step;
 274            *nb -= step;
 275        } else {
 276            phys_page_set_level(map, lp, index, nb, leaf, level - 1);
 277        }
 278        ++lp;
 279    }
 280}
 281
 282static void phys_page_set(AddressSpaceDispatch *d,
 283                          hwaddr index, hwaddr nb,
 284                          uint16_t leaf)
 285{
 286    /* Wildly overreserve - it doesn't matter much. */
 287    phys_map_node_reserve(&d->map, 3 * P_L2_LEVELS);
 288
 289    phys_page_set_level(&d->map, &d->phys_map, &index, &nb, leaf, P_L2_LEVELS - 1);
 290}
 291
  292/* Compact a non-leaf page entry. Simply detect that the entry has a single child,
 293 * and update our entry so we can skip it and go directly to the destination.
 294 */
 295static void phys_page_compact(PhysPageEntry *lp, Node *nodes)
 296{
 297    unsigned valid_ptr = P_L2_SIZE;
 298    int valid = 0;
 299    PhysPageEntry *p;
 300    int i;
 301
 302    if (lp->ptr == PHYS_MAP_NODE_NIL) {
 303        return;
 304    }
 305
 306    p = nodes[lp->ptr];
 307    for (i = 0; i < P_L2_SIZE; i++) {
 308        if (p[i].ptr == PHYS_MAP_NODE_NIL) {
 309            continue;
 310        }
 311
 312        valid_ptr = i;
 313        valid++;
 314        if (p[i].skip) {
 315            phys_page_compact(&p[i], nodes);
 316        }
 317    }
 318
 319    /* We can only compress if there's only one child. */
 320    if (valid != 1) {
 321        return;
 322    }
 323
 324    assert(valid_ptr < P_L2_SIZE);
 325
 326    /* Don't compress if it won't fit in the # of bits we have. */
 327    if (lp->skip + p[valid_ptr].skip >= (1 << 3)) {
 328        return;
 329    }
 330
 331    lp->ptr = p[valid_ptr].ptr;
 332    if (!p[valid_ptr].skip) {
 333        /* If our only child is a leaf, make this a leaf. */
 334        /* By design, we should have made this node a leaf to begin with so we
 335         * should never reach here.
 336         * But since it's so simple to handle this, let's do it just in case we
 337         * change this rule.
 338         */
 339        lp->skip = 0;
 340    } else {
 341        lp->skip += p[valid_ptr].skip;
 342    }
 343}
 344
 345void address_space_dispatch_compact(AddressSpaceDispatch *d)
 346{
 347    if (d->phys_map.skip) {
 348        phys_page_compact(&d->phys_map, d->map.nodes);
 349    }
 350}
 351
 352static inline bool section_covers_addr(const MemoryRegionSection *section,
 353                                       hwaddr addr)
 354{
 355    /* Memory topology clips a memory region to [0, 2^64); size.hi > 0 means
 356     * the section must cover the entire address space.
 357     */
 358    return int128_gethi(section->size) ||
 359           range_covers_byte(section->offset_within_address_space,
 360                             int128_getlo(section->size), addr);
 361}
 362
 363static MemoryRegionSection *phys_page_find(AddressSpaceDispatch *d, hwaddr addr)
 364{
 365    PhysPageEntry lp = d->phys_map, *p;
 366    Node *nodes = d->map.nodes;
 367    MemoryRegionSection *sections = d->map.sections;
 368    hwaddr index = addr >> TARGET_PAGE_BITS;
 369    int i;
 370
 371    for (i = P_L2_LEVELS; lp.skip && (i -= lp.skip) >= 0;) {
 372        if (lp.ptr == PHYS_MAP_NODE_NIL) {
 373            return &sections[PHYS_SECTION_UNASSIGNED];
 374        }
 375        p = nodes[lp.ptr];
 376        lp = p[(index >> (i * P_L2_BITS)) & (P_L2_SIZE - 1)];
 377    }
 378
 379    if (section_covers_addr(&sections[lp.ptr], addr)) {
 380        return &sections[lp.ptr];
 381    } else {
 382        return &sections[PHYS_SECTION_UNASSIGNED];
 383    }
 384}
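/*
 * Illustrative usage sketch (editorial annotation, not part of the
 * upstream file); "example_dump_section" is a hypothetical helper showing
 * how a caller inside an RCU critical section might use phys_page_find().
 * Kept under #if 0 so it is never compiled.
 */
#if 0
static void example_dump_section(AddressSpaceDispatch *d, hwaddr addr)
{
    MemoryRegionSection *sec = phys_page_find(d, addr);

    if (sec == &d->map.sections[PHYS_SECTION_UNASSIGNED]) {
        qemu_log("0x%" HWADDR_PRIx ": unassigned\n", addr);
    } else {
        qemu_log("0x%" HWADDR_PRIx ": %s\n", addr,
                 memory_region_name(sec->mr));
    }
}
#endif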
 385
 386/* Called from RCU critical section */
 387static MemoryRegionSection *address_space_lookup_region(AddressSpaceDispatch *d,
 388                                                        hwaddr addr,
 389                                                        bool resolve_subpage)
 390{
 391    MemoryRegionSection *section = atomic_read(&d->mru_section);
 392    subpage_t *subpage;
 393
 394    if (!section || section == &d->map.sections[PHYS_SECTION_UNASSIGNED] ||
 395        !section_covers_addr(section, addr)) {
 396        section = phys_page_find(d, addr);
 397        atomic_set(&d->mru_section, section);
 398    }
 399    if (resolve_subpage && section->mr->subpage) {
 400        subpage = container_of(section->mr, subpage_t, iomem);
 401        section = &d->map.sections[subpage->sub_section[SUBPAGE_IDX(addr)]];
 402    }
 403    return section;
 404}
 405
 406/* Called from RCU critical section */
 407static MemoryRegionSection *
 408address_space_translate_internal(AddressSpaceDispatch *d, hwaddr addr, hwaddr *xlat,
 409                                 hwaddr *plen, bool resolve_subpage)
 410{
 411    MemoryRegionSection *section;
 412    MemoryRegion *mr;
 413    Int128 diff;
 414
 415    section = address_space_lookup_region(d, addr, resolve_subpage);
 416    /* Compute offset within MemoryRegionSection */
 417    addr -= section->offset_within_address_space;
 418
 419    /* Compute offset within MemoryRegion */
 420    *xlat = addr + section->offset_within_region;
 421
 422    mr = section->mr;
 423
 424    /* MMIO registers can be expected to perform full-width accesses based only
 425     * on their address, without considering adjacent registers that could
 426     * decode to completely different MemoryRegions.  When such registers
 427     * exist (e.g. I/O ports 0xcf8 and 0xcf9 on most PC chipsets), MMIO
 428     * regions overlap wildly.  For this reason we cannot clamp the accesses
 429     * here.
 430     *
 431     * If the length is small (as is the case for address_space_ldl/stl),
 432     * everything works fine.  If the incoming length is large, however,
 433     * the caller really has to do the clamping through memory_access_size.
 434     */
 435    if (memory_region_is_ram(mr)) {
 436        diff = int128_sub(section->size, int128_make64(addr));
 437        *plen = int128_get64(int128_min(diff, int128_make64(*plen)));
 438    }
 439    return section;
 440}
 441
 442/**
 443 * address_space_translate_iommu - translate an address through an IOMMU
 444 * memory region and then through the target address space.
 445 *
 446 * @iommu_mr: the IOMMU memory region that we start the translation from
 447 * @addr: the address to be translated through the MMU
 448 * @xlat: the translated address offset within the destination memory region.
 449 *        It cannot be %NULL.
 450 * @plen_out: valid read/write length of the translated address. It
 451 *            cannot be %NULL.
 452 * @page_mask_out: page mask for the translated address. This is
 453 *            only meaningful for IOMMU-translated addresses, since
 454 *            the IOMMU may map huge pages whose size this mask
 455 *            reflects. It can be %NULL if we don't care about it.
 456 * @is_write: whether the translation operation is for write
 457 * @is_mmio: whether this can be MMIO, set true if it can
 458 * @target_as: the address space targeted by the IOMMU
 459 * @attrs: transaction attributes
 460 *
 461 * This function is called from RCU critical section.  It is the common
 462 * part of flatview_do_translate and address_space_translate_cached.
 463 */
 464static MemoryRegionSection address_space_translate_iommu(IOMMUMemoryRegion *iommu_mr,
 465                                                         hwaddr *xlat,
 466                                                         hwaddr *plen_out,
 467                                                         hwaddr *page_mask_out,
 468                                                         bool is_write,
 469                                                         bool is_mmio,
 470                                                         AddressSpace **target_as,
 471                                                         MemTxAttrs attrs)
 472{
 473    MemoryRegionSection *section;
 474    hwaddr page_mask = (hwaddr)-1;
 475
 476    do {
 477        hwaddr addr = *xlat;
 478        IOMMUMemoryRegionClass *imrc = memory_region_get_iommu_class_nocheck(iommu_mr);
 479        int iommu_idx = 0;
 480        IOMMUTLBEntry iotlb;
 481
 482        if (imrc->attrs_to_index) {
 483            iommu_idx = imrc->attrs_to_index(iommu_mr, attrs);
 484        }
 485
 486        iotlb = imrc->translate(iommu_mr, addr, is_write ?
 487                                IOMMU_WO : IOMMU_RO, iommu_idx);
 488
 489        if (!(iotlb.perm & (1 << is_write))) {
 490            goto unassigned;
 491        }
 492
 493        addr = ((iotlb.translated_addr & ~iotlb.addr_mask)
 494                | (addr & iotlb.addr_mask));
 495        page_mask &= iotlb.addr_mask;
 496        *plen_out = MIN(*plen_out, (addr | iotlb.addr_mask) - addr + 1);
 497        *target_as = iotlb.target_as;
 498
 499        section = address_space_translate_internal(
 500                address_space_to_dispatch(iotlb.target_as), addr, xlat,
 501                plen_out, is_mmio);
 502
 503        iommu_mr = memory_region_get_iommu(section->mr);
 504    } while (unlikely(iommu_mr));
 505
 506    if (page_mask_out) {
 507        *page_mask_out = page_mask;
 508    }
 509    return *section;
 510
 511unassigned:
 512    return (MemoryRegionSection) { .mr = &io_mem_unassigned };
 513}
 514
 515/**
 516 * flatview_do_translate - translate an address in FlatView
 517 *
 518 * @fv: the flat view that we want to translate on
 519 * @addr: the address to be translated in above address space
 520 * @xlat: the translated address offset within memory region. It
 521 *        cannot be @NULL.
 522 * @plen_out: valid read/write length of the translated address. It
 523 *            can be @NULL when we don't care about it.
 524 * @page_mask_out: page mask for the translated address. This is
 525 *            only meaningful for IOMMU-translated addresses, since
 526 *            the IOMMU may map huge pages whose size this mask
 527 *            reflects. It can be @NULL if we don't care about it.
 528 * @is_write: whether the translation operation is for write
 529 * @is_mmio: whether this can be MMIO, set true if it can
 530 * @target_as: the address space targeted by the IOMMU
 531 * @attrs: memory transaction attributes
 532 *
 533 * This function is called from RCU critical section
 534 */
 535static MemoryRegionSection flatview_do_translate(FlatView *fv,
 536                                                 hwaddr addr,
 537                                                 hwaddr *xlat,
 538                                                 hwaddr *plen_out,
 539                                                 hwaddr *page_mask_out,
 540                                                 bool is_write,
 541                                                 bool is_mmio,
 542                                                 AddressSpace **target_as,
 543                                                 MemTxAttrs attrs)
 544{
 545    MemoryRegionSection *section;
 546    IOMMUMemoryRegion *iommu_mr;
 547    hwaddr plen = (hwaddr)(-1);
 548
 549    if (!plen_out) {
 550        plen_out = &plen;
 551    }
 552
 553    section = address_space_translate_internal(
 554            flatview_to_dispatch(fv), addr, xlat,
 555            plen_out, is_mmio);
 556
 557    iommu_mr = memory_region_get_iommu(section->mr);
 558    if (unlikely(iommu_mr)) {
 559        return address_space_translate_iommu(iommu_mr, xlat,
 560                                             plen_out, page_mask_out,
 561                                             is_write, is_mmio,
 562                                             target_as, attrs);
 563    }
 564    if (page_mask_out) {
 565        /* Not behind an IOMMU, use default page size. */
 566        *page_mask_out = ~TARGET_PAGE_MASK;
 567    }
 568
 569    return *section;
 570}
 571
 572/* Called from RCU critical section */
 573IOMMUTLBEntry address_space_get_iotlb_entry(AddressSpace *as, hwaddr addr,
 574                                            bool is_write, MemTxAttrs attrs)
 575{
 576    MemoryRegionSection section;
 577    hwaddr xlat, page_mask;
 578
 579    /*
 580     * This can never be MMIO, and we don't really care about plen,
 581     * only about the page mask.
 582     */
 583    section = flatview_do_translate(address_space_to_flatview(as), addr, &xlat,
 584                                    NULL, &page_mask, is_write, false, &as,
 585                                    attrs);
 586
 587    /* Illegal translation */
 588    if (section.mr == &io_mem_unassigned) {
 589        goto iotlb_fail;
 590    }
 591
 592    /* Convert memory region offset into address space offset */
 593    xlat += section.offset_within_address_space -
 594        section.offset_within_region;
 595
 596    return (IOMMUTLBEntry) {
 597        .target_as = as,
 598        .iova = addr & ~page_mask,
 599        .translated_addr = xlat & ~page_mask,
 600        .addr_mask = page_mask,
 601        /* IOTLBs are for DMAs, and DMA is only allowed on RAM. */
 602        .perm = IOMMU_RW,
 603    };
 604
 605iotlb_fail:
 606    return (IOMMUTLBEntry) {0};
 607}
 608
 609/* Called from RCU critical section */
 610MemoryRegion *flatview_translate(FlatView *fv, hwaddr addr, hwaddr *xlat,
 611                                 hwaddr *plen, bool is_write,
 612                                 MemTxAttrs attrs)
 613{
 614    MemoryRegion *mr;
 615    MemoryRegionSection section;
 616    AddressSpace *as = NULL;
 617
 618    /* This can be MMIO, so set up the MMIO bit. */
 619    section = flatview_do_translate(fv, addr, xlat, plen, NULL,
 620                                    is_write, true, &as, attrs);
 621    mr = section.mr;
 622
 623    if (xen_enabled() && memory_access_is_direct(mr, is_write)) {
 624        hwaddr page = ((addr & TARGET_PAGE_MASK) + TARGET_PAGE_SIZE) - addr;
 625        *plen = MIN(page, *plen);
 626    }
 627
 628    return mr;
 629}
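/*
 * Illustrative usage sketch (editorial annotation, not part of the
 * upstream file): translating a guest physical address to a MemoryRegion
 * from within an RCU critical section.  The caller and the constants are
 * hypothetical; address_space_translate() is the public wrapper around
 * this path.  Kept under #if 0 so it is never compiled.
 */
#if 0
static void example_translate(void)
{
    hwaddr addr = 0x1000, xlat, len = 4;
    MemoryRegion *mr;

    rcu_read_lock();
    mr = address_space_translate(&address_space_memory, addr, &xlat, &len,
                                 false, MEMTXATTRS_UNSPECIFIED);
    qemu_log("%s + 0x%" HWADDR_PRIx ", usable len 0x%" HWADDR_PRIx "\n",
             memory_region_name(mr), xlat, len);
    rcu_read_unlock();
}
#endif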
 630
 631typedef struct TCGIOMMUNotifier {
 632    IOMMUNotifier n;
 633    MemoryRegion *mr;
 634    CPUState *cpu;
 635    int iommu_idx;
 636    bool active;
 637} TCGIOMMUNotifier;
 638
 639static void tcg_iommu_unmap_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
 640{
 641    TCGIOMMUNotifier *notifier = container_of(n, TCGIOMMUNotifier, n);
 642
 643    if (!notifier->active) {
 644        return;
 645    }
 646    tlb_flush(notifier->cpu);
 647    notifier->active = false;
 648    /* We leave the notifier struct on the list to avoid reallocating it later.
 649     * Generally the number of IOMMUs a CPU deals with will be small.
 650     * In any case we can't unregister the iommu notifier from a notify
 651     * callback.
 652     */
 653}
 654
 655static void tcg_register_iommu_notifier(CPUState *cpu,
 656                                        IOMMUMemoryRegion *iommu_mr,
 657                                        int iommu_idx)
 658{
 659    /* Make sure this CPU has an IOMMU notifier registered for this
 660     * IOMMU/IOMMU index combination, so that we can flush its TLB
 661     * when the IOMMU tells us the mappings we've cached have changed.
 662     */
 663    MemoryRegion *mr = MEMORY_REGION(iommu_mr);
 664    TCGIOMMUNotifier *notifier;
 665    int i;
 666
 667    for (i = 0; i < cpu->iommu_notifiers->len; i++) {
 668        notifier = g_array_index(cpu->iommu_notifiers, TCGIOMMUNotifier *, i);
 669        if (notifier->mr == mr && notifier->iommu_idx == iommu_idx) {
 670            break;
 671        }
 672    }
 673    if (i == cpu->iommu_notifiers->len) {
 674        /* Not found, add a new entry at the end of the array */
 675        cpu->iommu_notifiers = g_array_set_size(cpu->iommu_notifiers, i + 1);
 676        notifier = g_new0(TCGIOMMUNotifier, 1);
 677        g_array_index(cpu->iommu_notifiers, TCGIOMMUNotifier *, i) = notifier;
 678
 679        notifier->mr = mr;
 680        notifier->iommu_idx = iommu_idx;
 681        notifier->cpu = cpu;
 682        /* Rather than trying to register interest in the specific part
 683         * of the iommu's address space that we've accessed and then
 684         * expand it later as subsequent accesses touch more of it, we
 685         * just register interest in the whole thing, on the assumption
 686         * that iommu reconfiguration will be rare.
 687         */
 688        iommu_notifier_init(&notifier->n,
 689                            tcg_iommu_unmap_notify,
 690                            IOMMU_NOTIFIER_UNMAP,
 691                            0,
 692                            HWADDR_MAX,
 693                            iommu_idx);
 694        memory_region_register_iommu_notifier(notifier->mr, &notifier->n);
 695    }
 696
 697    if (!notifier->active) {
 698        notifier->active = true;
 699    }
 700}
 701
 702static void tcg_iommu_free_notifier_list(CPUState *cpu)
 703{
 704    /* Destroy the CPU's notifier list */
 705    int i;
 706    TCGIOMMUNotifier *notifier;
 707
 708    for (i = 0; i < cpu->iommu_notifiers->len; i++) {
 709        notifier = g_array_index(cpu->iommu_notifiers, TCGIOMMUNotifier *, i);
 710        memory_region_unregister_iommu_notifier(notifier->mr, &notifier->n);
 711        g_free(notifier);
 712    }
 713    g_array_free(cpu->iommu_notifiers, true);
 714}
 715
 716/* Called from RCU critical section */
 717MemoryRegionSection *
 718address_space_translate_for_iotlb(CPUState *cpu, int asidx, hwaddr addr,
 719                                  hwaddr *xlat, hwaddr *plen,
 720                                  MemTxAttrs attrs, int *prot)
 721{
 722    MemoryRegionSection *section;
 723    IOMMUMemoryRegion *iommu_mr;
 724    IOMMUMemoryRegionClass *imrc;
 725    IOMMUTLBEntry iotlb;
 726    int iommu_idx;
 727    AddressSpaceDispatch *d = atomic_rcu_read(&cpu->cpu_ases[asidx].memory_dispatch);
 728
 729    for (;;) {
 730        section = address_space_translate_internal(d, addr, &addr, plen, false);
 731
 732        iommu_mr = memory_region_get_iommu(section->mr);
 733        if (!iommu_mr) {
 734            break;
 735        }
 736
 737        imrc = memory_region_get_iommu_class_nocheck(iommu_mr);
 738
 739        iommu_idx = imrc->attrs_to_index(iommu_mr, attrs);
 740        tcg_register_iommu_notifier(cpu, iommu_mr, iommu_idx);
 741        /* We need all the permissions, so pass IOMMU_NONE so the IOMMU
 742         * doesn't short-cut its translation table walk.
 743         */
 744        iotlb = imrc->translate(iommu_mr, addr, IOMMU_NONE, iommu_idx);
 745        addr = ((iotlb.translated_addr & ~iotlb.addr_mask)
 746                | (addr & iotlb.addr_mask));
 747        /* Update the caller's prot bits to remove permissions the IOMMU
 748         * is giving us a failure response for. If we get down to no
 749         * permissions left at all we can give up now.
 750         */
 751        if (!(iotlb.perm & IOMMU_RO)) {
 752            *prot &= ~(PAGE_READ | PAGE_EXEC);
 753        }
 754        if (!(iotlb.perm & IOMMU_WO)) {
 755            *prot &= ~PAGE_WRITE;
 756        }
 757
 758        if (!*prot) {
 759            goto translate_fail;
 760        }
 761
 762        d = flatview_to_dispatch(address_space_to_flatview(iotlb.target_as));
 763    }
 764
 765    assert(!memory_region_is_iommu(section->mr));
 766    *xlat = addr;
 767    return section;
 768
 769translate_fail:
 770    return &d->map.sections[PHYS_SECTION_UNASSIGNED];
 771}
 772#endif
 773
 774#if !defined(CONFIG_USER_ONLY)
 775
 776static int cpu_common_post_load(void *opaque, int version_id)
 777{
 778    CPUState *cpu = opaque;
 779
 780    /* 0x01 was CPU_INTERRUPT_EXIT. This line can be removed when the
 781       version_id is increased. */
 782    cpu->interrupt_request &= ~0x01;
 783    tlb_flush(cpu);
 784
 785    /* loadvm has just updated the content of RAM, bypassing the
 786     * usual mechanisms that ensure we flush TBs for writes to
 787     * memory we've translated code from. So we must flush all TBs,
 788     * which will now be stale.
 789     */
 790    tb_flush(cpu);
 791
 792    return 0;
 793}
 794
 795static int cpu_common_pre_load(void *opaque)
 796{
 797    CPUState *cpu = opaque;
 798
 799    cpu->exception_index = -1;
 800
 801    return 0;
 802}
 803
 804static bool cpu_common_exception_index_needed(void *opaque)
 805{
 806    CPUState *cpu = opaque;
 807
 808    return tcg_enabled() && cpu->exception_index != -1;
 809}
 810
 811static const VMStateDescription vmstate_cpu_common_exception_index = {
 812    .name = "cpu_common/exception_index",
 813    .version_id = 1,
 814    .minimum_version_id = 1,
 815    .needed = cpu_common_exception_index_needed,
 816    .fields = (VMStateField[]) {
 817        VMSTATE_INT32(exception_index, CPUState),
 818        VMSTATE_END_OF_LIST()
 819    }
 820};
 821
 822static bool cpu_common_crash_occurred_needed(void *opaque)
 823{
 824    CPUState *cpu = opaque;
 825
 826    return cpu->crash_occurred;
 827}
 828
 829static const VMStateDescription vmstate_cpu_common_crash_occurred = {
 830    .name = "cpu_common/crash_occurred",
 831    .version_id = 1,
 832    .minimum_version_id = 1,
 833    .needed = cpu_common_crash_occurred_needed,
 834    .fields = (VMStateField[]) {
 835        VMSTATE_BOOL(crash_occurred, CPUState),
 836        VMSTATE_END_OF_LIST()
 837    }
 838};
 839
 840const VMStateDescription vmstate_cpu_common = {
 841    .name = "cpu_common",
 842    .version_id = 1,
 843    .minimum_version_id = 1,
 844    .pre_load = cpu_common_pre_load,
 845    .post_load = cpu_common_post_load,
 846    .fields = (VMStateField[]) {
 847        VMSTATE_UINT32(halted, CPUState),
 848        VMSTATE_UINT32(interrupt_request, CPUState),
 849        VMSTATE_END_OF_LIST()
 850    },
 851    .subsections = (const VMStateDescription*[]) {
 852        &vmstate_cpu_common_exception_index,
 853        &vmstate_cpu_common_crash_occurred,
 854        NULL
 855    }
 856};
 857
 858#endif
 859
 860CPUState *qemu_get_cpu(int index)
 861{
 862    CPUState *cpu;
 863
 864    CPU_FOREACH(cpu) {
 865        if (cpu->cpu_index == index) {
 866            return cpu;
 867        }
 868    }
 869
 870    return NULL;
 871}
 872
 873#if !defined(CONFIG_USER_ONLY)
 874void cpu_address_space_init(CPUState *cpu, int asidx,
 875                            const char *prefix, MemoryRegion *mr)
 876{
 877    CPUAddressSpace *newas;
 878    AddressSpace *as = g_new0(AddressSpace, 1);
 879    char *as_name;
 880
 881    assert(mr);
 882    as_name = g_strdup_printf("%s-%d", prefix, cpu->cpu_index);
 883    address_space_init(as, mr, as_name);
 884    g_free(as_name);
 885
 886    /* Target code should have set num_ases before calling us */
 887    assert(asidx < cpu->num_ases);
 888
 889    if (asidx == 0) {
 890        /* address space 0 gets the convenience alias */
 891        cpu->as = as;
 892    }
 893
 894    /* KVM cannot currently support multiple address spaces. */
 895    assert(asidx == 0 || !kvm_enabled());
 896
 897    if (!cpu->cpu_ases) {
 898        cpu->cpu_ases = g_new0(CPUAddressSpace, cpu->num_ases);
 899    }
 900
 901    newas = &cpu->cpu_ases[asidx];
 902    newas->cpu = cpu;
 903    newas->as = as;
 904    if (tcg_enabled()) {
 905        newas->tcg_as_listener.commit = tcg_commit;
 906        memory_listener_register(&newas->tcg_as_listener, as);
 907    }
 908}
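/*
 * Usage note (editorial annotation, not part of the upstream file):
 * board/CPU realize code typically sets cpu->num_ases and then wires up
 * address space 0, e.g.:
 *
 *     cpu->num_ases = 1;
 *     cpu_address_space_init(cpu, 0, "cpu-memory", cpu->memory);
 *
 * Targets with more than one address space (such as Arm with TrustZone)
 * call this once per index; only index 0 becomes the cpu->as alias.
 */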
 909
 910AddressSpace *cpu_get_address_space(CPUState *cpu, int asidx)
 911{
 912    /* Return the AddressSpace corresponding to the specified index */
 913    return cpu->cpu_ases[asidx].as;
 914}
 915#endif
 916
 917void cpu_exec_unrealizefn(CPUState *cpu)
 918{
 919    CPUClass *cc = CPU_GET_CLASS(cpu);
 920
 921    cpu_list_remove(cpu);
 922
 923    if (cc->vmsd != NULL) {
 924        vmstate_unregister(NULL, cc->vmsd, cpu);
 925    }
 926    if (qdev_get_vmsd(DEVICE(cpu)) == NULL) {
 927        vmstate_unregister(NULL, &vmstate_cpu_common, cpu);
 928    }
 929#ifndef CONFIG_USER_ONLY
 930    tcg_iommu_free_notifier_list(cpu);
 931#endif
 932}
 933
 934Property cpu_common_props[] = {
 935#ifndef CONFIG_USER_ONLY
 936    /* Create a memory property for softmmu CPU object,
 937     * so users can wire up its memory. (This can't go in qom/cpu.c
 938     * because that file is compiled only once for both user-mode
 939     * and system builds.) The default if no link is set up is to use
 940     * the system address space.
 941     */
 942    DEFINE_PROP_LINK("memory", CPUState, memory, TYPE_MEMORY_REGION,
 943                     MemoryRegion *),
 944#endif
 945    DEFINE_PROP_END_OF_LIST(),
 946};
 947
 948void cpu_exec_initfn(CPUState *cpu)
 949{
 950    cpu->as = NULL;
 951    cpu->num_ases = 0;
 952
 953#ifndef CONFIG_USER_ONLY
 954    cpu->thread_id = qemu_get_thread_id();
 955    cpu->memory = system_memory;
 956    object_ref(OBJECT(cpu->memory));
 957#endif
 958}
 959
 960void cpu_exec_realizefn(CPUState *cpu, Error **errp)
 961{
 962    CPUClass *cc = CPU_GET_CLASS(cpu);
 963    static bool tcg_target_initialized;
 964
 965    cpu_list_add(cpu);
 966
 967    if (tcg_enabled() && !tcg_target_initialized) {
 968        tcg_target_initialized = true;
 969        cc->tcg_initialize();
 970    }
 971    tlb_init(cpu);
 972
 973#ifndef CONFIG_USER_ONLY
 974    if (qdev_get_vmsd(DEVICE(cpu)) == NULL) {
 975        vmstate_register(NULL, cpu->cpu_index, &vmstate_cpu_common, cpu);
 976    }
 977    if (cc->vmsd != NULL) {
 978        vmstate_register(NULL, cpu->cpu_index, cc->vmsd, cpu);
 979    }
 980
 981    cpu->iommu_notifiers = g_array_new(false, true, sizeof(TCGIOMMUNotifier *));
 982#endif
 983}
 984
 985const char *parse_cpu_model(const char *cpu_model)
 986{
 987    ObjectClass *oc;
 988    CPUClass *cc;
 989    gchar **model_pieces;
 990    const char *cpu_type;
 991
 992    model_pieces = g_strsplit(cpu_model, ",", 2);
 993
 994    oc = cpu_class_by_name(CPU_RESOLVING_TYPE, model_pieces[0]);
 995    if (oc == NULL) {
 996        error_report("unable to find CPU model '%s'", model_pieces[0]);
 997        g_strfreev(model_pieces);
 998        exit(EXIT_FAILURE);
 999    }
1000
1001    cpu_type = object_class_get_name(oc);
1002    cc = CPU_CLASS(oc);
1003    cc->parse_features(cpu_type, model_pieces[1], &error_fatal);
1004    g_strfreev(model_pieces);
1005    return cpu_type;
1006}
1007
1008#if defined(CONFIG_USER_ONLY)
1009void tb_invalidate_phys_addr(target_ulong addr)
1010{
1011    mmap_lock();
1012    tb_invalidate_phys_page_range(addr, addr + 1, 0);
1013    mmap_unlock();
1014}
1015
1016static void breakpoint_invalidate(CPUState *cpu, target_ulong pc)
1017{
1018    tb_invalidate_phys_addr(pc);
1019}
1020#else
1021void tb_invalidate_phys_addr(AddressSpace *as, hwaddr addr, MemTxAttrs attrs)
1022{
1023    ram_addr_t ram_addr;
1024    MemoryRegion *mr;
1025    hwaddr l = 1;
1026
1027    if (!tcg_enabled()) {
1028        return;
1029    }
1030
1031    rcu_read_lock();
1032    mr = address_space_translate(as, addr, &addr, &l, false, attrs);
1033    if (!(memory_region_is_ram(mr)
1034          || memory_region_is_romd(mr))) {
1035        rcu_read_unlock();
1036        return;
1037    }
1038    ram_addr = memory_region_get_ram_addr(mr) + addr;
1039    tb_invalidate_phys_page_range(ram_addr, ram_addr + 1, 0);
1040    rcu_read_unlock();
1041}
1042
1043static void breakpoint_invalidate(CPUState *cpu, target_ulong pc)
1044{
1045    MemTxAttrs attrs;
1046    hwaddr phys = cpu_get_phys_page_attrs_debug(cpu, pc, &attrs);
1047    int asidx = cpu_asidx_from_attrs(cpu, attrs);
1048    if (phys != -1) {
1049        /* Locks grabbed by tb_invalidate_phys_addr */
1050        tb_invalidate_phys_addr(cpu->cpu_ases[asidx].as,
1051                                phys | (pc & ~TARGET_PAGE_MASK), attrs);
1052    }
1053}
1054#endif
1055
1056#if defined(CONFIG_USER_ONLY)
1057void cpu_watchpoint_remove_all(CPUState *cpu, int mask)
1058{
1060}
1061
1062int cpu_watchpoint_remove(CPUState *cpu, vaddr addr, vaddr len,
1063                          int flags)
1064{
1065    return -ENOSYS;
1066}
1067
1068void cpu_watchpoint_remove_by_ref(CPUState *cpu, CPUWatchpoint *watchpoint)
1069{
1070}
1071
1072int cpu_watchpoint_insert(CPUState *cpu, vaddr addr, vaddr len,
1073                          int flags, CPUWatchpoint **watchpoint)
1074{
1075    return -ENOSYS;
1076}
1077#else
1078/* Add a watchpoint.  */
1079int cpu_watchpoint_insert(CPUState *cpu, vaddr addr, vaddr len,
1080                          int flags, CPUWatchpoint **watchpoint)
1081{
1082    CPUWatchpoint *wp;
1083
1084    /* forbid ranges which are empty or run off the end of the address space */
1085    if (len == 0 || (addr + len - 1) < addr) {
1086        error_report("tried to set invalid watchpoint at %"
1087                     VADDR_PRIx ", len=%" VADDR_PRIu, addr, len);
1088        return -EINVAL;
1089    }
1090    wp = g_malloc(sizeof(*wp));
1091
1092    wp->vaddr = addr;
1093    wp->len = len;
1094    wp->flags = flags;
1095
1096    /* keep all GDB-injected watchpoints in front */
1097    if (flags & BP_GDB) {
1098        QTAILQ_INSERT_HEAD(&cpu->watchpoints, wp, entry);
1099    } else {
1100        QTAILQ_INSERT_TAIL(&cpu->watchpoints, wp, entry);
1101    }
1102
1103    tlb_flush_page(cpu, addr);
1104
1105    if (watchpoint)
1106        *watchpoint = wp;
1107    return 0;
1108}
1109
1110/* Remove a specific watchpoint.  */
1111int cpu_watchpoint_remove(CPUState *cpu, vaddr addr, vaddr len,
1112                          int flags)
1113{
1114    CPUWatchpoint *wp;
1115
1116    QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) {
1117        if (addr == wp->vaddr && len == wp->len
1118                && flags == (wp->flags & ~BP_WATCHPOINT_HIT)) {
1119            cpu_watchpoint_remove_by_ref(cpu, wp);
1120            return 0;
1121        }
1122    }
1123    return -ENOENT;
1124}
1125
1126/* Remove a specific watchpoint by reference.  */
1127void cpu_watchpoint_remove_by_ref(CPUState *cpu, CPUWatchpoint *watchpoint)
1128{
1129    QTAILQ_REMOVE(&cpu->watchpoints, watchpoint, entry);
1130
1131    tlb_flush_page(cpu, watchpoint->vaddr);
1132
1133    g_free(watchpoint);
1134}
1135
1136/* Remove all matching watchpoints.  */
1137void cpu_watchpoint_remove_all(CPUState *cpu, int mask)
1138{
1139    CPUWatchpoint *wp, *next;
1140
1141    QTAILQ_FOREACH_SAFE(wp, &cpu->watchpoints, entry, next) {
1142        if (wp->flags & mask) {
1143            cpu_watchpoint_remove_by_ref(cpu, wp);
1144        }
1145    }
1146}
1147
1148/* Return true if this watchpoint address matches the specified
1149 * access (ie the address range covered by the watchpoint overlaps
1150 * partially or completely with the address range covered by the
1151 * access).
1152 */
1153static inline bool cpu_watchpoint_address_matches(CPUWatchpoint *wp,
1154                                                  vaddr addr,
1155                                                  vaddr len)
1156{
1157    /* We know the lengths are non-zero, but a little caution is
1158     * required to avoid errors in the case where the range ends
1159     * exactly at the top of the address space and so addr + len
1160     * wraps round to zero.
1161     */
1162    vaddr wpend = wp->vaddr + wp->len - 1;
1163    vaddr addrend = addr + len - 1;
1164
1165    return !(addr > wpend || wp->vaddr > addrend);
1166}
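/*
 * Worked example (editorial annotation, not part of the upstream file):
 * with a 64-bit vaddr, a watchpoint at wp->vaddr = 0xfffffffffffffff0 of
 * length 0x10 has wpend = 0xffffffffffffffff.  An access at
 * addr = 0xfffffffffffffff8 with len = 8 has addrend = 0xffffffffffffffff,
 * so neither "addr > wpend" nor "wp->vaddr > addrend" holds and the ranges
 * are correctly reported as overlapping.  Computing wp->vaddr + wp->len
 * directly would have wrapped to zero and missed the match.
 */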
1167
1168#endif
1169
1170/* Add a breakpoint.  */
1171int cpu_breakpoint_insert(CPUState *cpu, vaddr pc, int flags,
1172                          CPUBreakpoint **breakpoint)
1173{
1174    CPUBreakpoint *bp;
1175
1176    bp = g_malloc(sizeof(*bp));
1177
1178    bp->pc = pc;
1179    bp->flags = flags;
1180
1181    /* keep all GDB-injected breakpoints in front */
1182    if (flags & BP_GDB) {
1183        QTAILQ_INSERT_HEAD(&cpu->breakpoints, bp, entry);
1184    } else {
1185        QTAILQ_INSERT_TAIL(&cpu->breakpoints, bp, entry);
1186    }
1187
1188    breakpoint_invalidate(cpu, pc);
1189
1190    if (breakpoint) {
1191        *breakpoint = bp;
1192    }
1193    return 0;
1194}
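/*
 * Illustrative usage sketch (editorial annotation, not part of the
 * upstream file): how gdbstub-style code inserts and later removes a
 * breakpoint.  The PC value is hypothetical.  Kept under #if 0 so it is
 * never compiled.
 */
#if 0
static void example_breakpoint(CPUState *cpu)
{
    vaddr pc = 0x40001000;

    if (cpu_breakpoint_insert(cpu, pc, BP_GDB, NULL) == 0) {
        /* ... run until EXCP_DEBUG is raised ... */
        cpu_breakpoint_remove(cpu, pc, BP_GDB);
    }
}
#endif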
1195
1196/* Remove a specific breakpoint.  */
1197int cpu_breakpoint_remove(CPUState *cpu, vaddr pc, int flags)
1198{
1199    CPUBreakpoint *bp;
1200
1201    QTAILQ_FOREACH(bp, &cpu->breakpoints, entry) {
1202        if (bp->pc == pc && bp->flags == flags) {
1203            cpu_breakpoint_remove_by_ref(cpu, bp);
1204            return 0;
1205        }
1206    }
1207    return -ENOENT;
1208}
1209
1210/* Remove a specific breakpoint by reference.  */
1211void cpu_breakpoint_remove_by_ref(CPUState *cpu, CPUBreakpoint *breakpoint)
1212{
1213    QTAILQ_REMOVE(&cpu->breakpoints, breakpoint, entry);
1214
1215    breakpoint_invalidate(cpu, breakpoint->pc);
1216
1217    g_free(breakpoint);
1218}
1219
1220/* Remove all matching breakpoints. */
1221void cpu_breakpoint_remove_all(CPUState *cpu, int mask)
1222{
1223    CPUBreakpoint *bp, *next;
1224
1225    QTAILQ_FOREACH_SAFE(bp, &cpu->breakpoints, entry, next) {
1226        if (bp->flags & mask) {
1227            cpu_breakpoint_remove_by_ref(cpu, bp);
1228        }
1229    }
1230}
1231
1232/* enable or disable single step mode. EXCP_DEBUG is returned by the
1233   CPU loop after each instruction */
1234void cpu_single_step(CPUState *cpu, int enabled)
1235{
1236    if (cpu->singlestep_enabled != enabled) {
1237        cpu->singlestep_enabled = enabled;
1238        if (kvm_enabled()) {
1239            kvm_update_guest_debug(cpu, 0);
1240        } else {
1241            /* must flush all the translated code to avoid inconsistencies */
1242            /* XXX: only flush what is necessary */
1243            tb_flush(cpu);
1244        }
1245    }
1246}
1247
1248void cpu_abort(CPUState *cpu, const char *fmt, ...)
1249{
1250    va_list ap;
1251    va_list ap2;
1252
1253    va_start(ap, fmt);
1254    va_copy(ap2, ap);
1255    fprintf(stderr, "qemu: fatal: ");
1256    vfprintf(stderr, fmt, ap);
1257    fprintf(stderr, "\n");
1258    cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_FPU | CPU_DUMP_CCOP);
1259    if (qemu_log_separate()) {
1260        qemu_log_lock();
1261        qemu_log("qemu: fatal: ");
1262        qemu_log_vprintf(fmt, ap2);
1263        qemu_log("\n");
1264        log_cpu_state(cpu, CPU_DUMP_FPU | CPU_DUMP_CCOP);
1265        qemu_log_flush();
1266        qemu_log_unlock();
1267        qemu_log_close();
1268    }
1269    va_end(ap2);
1270    va_end(ap);
1271    replay_finish();
1272#if defined(CONFIG_USER_ONLY)
1273    {
1274        struct sigaction act;
1275        sigfillset(&act.sa_mask);
1276        act.sa_handler = SIG_DFL;
1277        act.sa_flags = 0;
1278        sigaction(SIGABRT, &act, NULL);
1279    }
1280#endif
1281    abort();
1282}
1283
1284#if !defined(CONFIG_USER_ONLY)
1285/* Called from RCU critical section */
1286static RAMBlock *qemu_get_ram_block(ram_addr_t addr)
1287{
1288    RAMBlock *block;
1289
1290    block = atomic_rcu_read(&ram_list.mru_block);
1291    if (block && addr - block->offset < block->max_length) {
1292        return block;
1293    }
1294    RAMBLOCK_FOREACH(block) {
1295        if (addr - block->offset < block->max_length) {
1296            goto found;
1297        }
1298    }
1299
1300    fprintf(stderr, "Bad ram offset %" PRIx64 "\n", (uint64_t)addr);
1301    abort();
1302
1303found:
1304    /* It is safe to write mru_block outside the iothread lock.  This
1305     * is what happens:
1306     *
1307     *     mru_block = xxx
1308     *     rcu_read_unlock()
1309     *                                        xxx removed from list
1310     *                  rcu_read_lock()
1311     *                  read mru_block
1312     *                                        mru_block = NULL;
1313     *                                        call_rcu(reclaim_ramblock, xxx);
1314     *                  rcu_read_unlock()
1315     *
1316     * atomic_rcu_set is not needed here.  The block was already published
1317     * when it was placed into the list.  Here we're just making an extra
1318     * copy of the pointer.
1319     */
1320    ram_list.mru_block = block;
1321    return block;
1322}
1323
1324static void tlb_reset_dirty_range_all(ram_addr_t start, ram_addr_t length)
1325{
1326    CPUState *cpu;
1327    ram_addr_t start1;
1328    RAMBlock *block;
1329    ram_addr_t end;
1330
1331    assert(tcg_enabled());
1332    end = TARGET_PAGE_ALIGN(start + length);
1333    start &= TARGET_PAGE_MASK;
1334
1335    rcu_read_lock();
1336    block = qemu_get_ram_block(start);
1337    assert(block == qemu_get_ram_block(end - 1));
1338    start1 = (uintptr_t)ramblock_ptr(block, start - block->offset);
1339    CPU_FOREACH(cpu) {
1340        tlb_reset_dirty(cpu, start1, length);
1341    }
1342    rcu_read_unlock();
1343}
1344
1345/* Note: start and end must be within the same ram block.  */
1346bool cpu_physical_memory_test_and_clear_dirty(ram_addr_t start,
1347                                              ram_addr_t length,
1348                                              unsigned client)
1349{
1350    DirtyMemoryBlocks *blocks;
1351    unsigned long end, page;
1352    bool dirty = false;
1353
1354    if (length == 0) {
1355        return false;
1356    }
1357
1358    end = TARGET_PAGE_ALIGN(start + length) >> TARGET_PAGE_BITS;
1359    page = start >> TARGET_PAGE_BITS;
1360
1361    rcu_read_lock();
1362
1363    blocks = atomic_rcu_read(&ram_list.dirty_memory[client]);
1364
1365    while (page < end) {
1366        unsigned long idx = page / DIRTY_MEMORY_BLOCK_SIZE;
1367        unsigned long offset = page % DIRTY_MEMORY_BLOCK_SIZE;
1368        unsigned long num = MIN(end - page, DIRTY_MEMORY_BLOCK_SIZE - offset);
1369
1370        dirty |= bitmap_test_and_clear_atomic(blocks->blocks[idx],
1371                                              offset, num);
1372        page += num;
1373    }
1374
1375    rcu_read_unlock();
1376
1377    if (dirty && tcg_enabled()) {
1378        tlb_reset_dirty_range_all(start, length);
1379    }
1380
1381    return dirty;
1382}
1383
1384DirtyBitmapSnapshot *cpu_physical_memory_snapshot_and_clear_dirty
1385     (ram_addr_t start, ram_addr_t length, unsigned client)
1386{
1387    DirtyMemoryBlocks *blocks;
1388    unsigned long align = 1UL << (TARGET_PAGE_BITS + BITS_PER_LEVEL);
1389    ram_addr_t first = QEMU_ALIGN_DOWN(start, align);
1390    ram_addr_t last  = QEMU_ALIGN_UP(start + length, align);
1391    DirtyBitmapSnapshot *snap;
1392    unsigned long page, end, dest;
1393
1394    snap = g_malloc0(sizeof(*snap) +
1395                     ((last - first) >> (TARGET_PAGE_BITS + 3)));
1396    snap->start = first;
1397    snap->end   = last;
1398
1399    page = first >> TARGET_PAGE_BITS;
1400    end  = last  >> TARGET_PAGE_BITS;
1401    dest = 0;
1402
1403    rcu_read_lock();
1404
1405    blocks = atomic_rcu_read(&ram_list.dirty_memory[client]);
1406
1407    while (page < end) {
1408        unsigned long idx = page / DIRTY_MEMORY_BLOCK_SIZE;
1409        unsigned long offset = page % DIRTY_MEMORY_BLOCK_SIZE;
1410        unsigned long num = MIN(end - page, DIRTY_MEMORY_BLOCK_SIZE - offset);
1411
1412        assert(QEMU_IS_ALIGNED(offset, (1 << BITS_PER_LEVEL)));
1413        assert(QEMU_IS_ALIGNED(num,    (1 << BITS_PER_LEVEL)));
1414        offset >>= BITS_PER_LEVEL;
1415
1416        bitmap_copy_and_clear_atomic(snap->dirty + dest,
1417                                     blocks->blocks[idx] + offset,
1418                                     num);
1419        page += num;
1420        dest += num >> BITS_PER_LEVEL;
1421    }
1422
1423    rcu_read_unlock();
1424
1425    if (tcg_enabled()) {
1426        tlb_reset_dirty_range_all(start, length);
1427    }
1428
1429    return snap;
1430}
1431
1432bool cpu_physical_memory_snapshot_get_dirty(DirtyBitmapSnapshot *snap,
1433                                            ram_addr_t start,
1434                                            ram_addr_t length)
1435{
1436    unsigned long page, end;
1437
1438    assert(start >= snap->start);
1439    assert(start + length <= snap->end);
1440
1441    end = TARGET_PAGE_ALIGN(start + length - snap->start) >> TARGET_PAGE_BITS;
1442    page = (start - snap->start) >> TARGET_PAGE_BITS;
1443
1444    while (page < end) {
1445        if (test_bit(page, snap->dirty)) {
1446            return true;
1447        }
1448        page++;
1449    }
1450    return false;
1451}
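/*
 * Illustrative usage sketch (editorial annotation, not part of the
 * upstream file): how a display device might consume the dirty bitmap
 * snapshot API; the scan loop is hypothetical.  Kept under #if 0 so it is
 * never compiled.
 */
#if 0
static void example_scan_dirty(ram_addr_t start, ram_addr_t length)
{
    DirtyBitmapSnapshot *snap;
    ram_addr_t page;

    snap = cpu_physical_memory_snapshot_and_clear_dirty(start, length,
                                                        DIRTY_MEMORY_VGA);
    for (page = start; page < start + length; page += TARGET_PAGE_SIZE) {
        if (cpu_physical_memory_snapshot_get_dirty(snap, page,
                                                   TARGET_PAGE_SIZE)) {
            /* ... redraw whatever is backed by this page ... */
        }
    }
    g_free(snap);
}
#endif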
1452
1453/* Called from RCU critical section */
1454hwaddr memory_region_section_get_iotlb(CPUState *cpu,
1455                                       MemoryRegionSection *section,
1456                                       target_ulong vaddr,
1457                                       hwaddr paddr, hwaddr xlat,
1458                                       int prot,
1459                                       target_ulong *address)
1460{
1461    hwaddr iotlb;
1462    CPUWatchpoint *wp;
1463
1464    if (memory_region_is_ram(section->mr)) {
1465        /* Normal RAM.  */
1466        iotlb = memory_region_get_ram_addr(section->mr) + xlat;
1467        if (!section->readonly) {
1468            iotlb |= PHYS_SECTION_NOTDIRTY;
1469        } else {
1470            iotlb |= PHYS_SECTION_ROM;
1471        }
1472    } else {
1473        AddressSpaceDispatch *d;
1474
1475        d = flatview_to_dispatch(section->fv);
1476        iotlb = section - d->map.sections;
1477        iotlb += xlat;
1478    }
1479
1480    /* Make accesses to pages with watchpoints go via the
1481       watchpoint trap routines.  */
1482    QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) {
1483        if (cpu_watchpoint_address_matches(wp, vaddr, TARGET_PAGE_SIZE)) {
1484            /* Avoid trapping reads of pages with a write breakpoint. */
1485            if ((prot & PAGE_WRITE) || (wp->flags & BP_MEM_READ)) {
1486                iotlb = PHYS_SECTION_WATCH + paddr;
1487                *address |= TLB_MMIO;
1488                break;
1489            }
1490        }
1491    }
1492
1493    return iotlb;
1494}
1495#endif /* defined(CONFIG_USER_ONLY) */
1496
1497#if !defined(CONFIG_USER_ONLY)
1498
1499static int subpage_register (subpage_t *mmio, uint32_t start, uint32_t end,
1500                             uint16_t section);
1501static subpage_t *subpage_init(FlatView *fv, hwaddr base);
1502
1503static void *(*phys_mem_alloc)(size_t size, uint64_t *align, bool shared) =
1504                               qemu_anon_ram_alloc;
1505
1506/*
1507 * Set a custom physical guest memory alloator.
1508 * Accelerators with unusual needs may need this.  Hopefully, we can
1509 * get rid of it eventually.
1510 */
1511void phys_mem_set_alloc(void *(*alloc)(size_t, uint64_t *align, bool shared))
1512{
1513    phys_mem_alloc = alloc;
1514}
1515
1516static uint16_t phys_section_add(PhysPageMap *map,
1517                                 MemoryRegionSection *section)
1518{
1519    /* The physical section number is ORed with a page-aligned
1520     * pointer to produce the iotlb entries.  Thus it should
1521     * never overflow into the page-aligned value.
1522     */
1523    assert(map->sections_nb < TARGET_PAGE_SIZE);
1524
1525    if (map->sections_nb == map->sections_nb_alloc) {
1526        map->sections_nb_alloc = MAX(map->sections_nb_alloc * 2, 16);
1527        map->sections = g_renew(MemoryRegionSection, map->sections,
1528                                map->sections_nb_alloc);
1529    }
1530    map->sections[map->sections_nb] = *section;
1531    memory_region_ref(section->mr);
1532    return map->sections_nb++;
1533}
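/*
 * Worked example (editorial annotation, not part of the upstream file):
 * with 4K target pages the assertion above allows at most
 * TARGET_PAGE_SIZE (4096) sections, so a section index always fits in the
 * low 12 bits of a page-aligned value.  memory_region_section_get_iotlb()
 * relies on this when it ORs indices such as PHYS_SECTION_NOTDIRTY or
 * PHYS_SECTION_ROM into a page-aligned ram_addr to build an iotlb entry.
 */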
1534
1535static void phys_section_destroy(MemoryRegion *mr)
1536{
1537    bool have_sub_page = mr->subpage;
1538
1539    memory_region_unref(mr);
1540
1541    if (have_sub_page) {
1542        subpage_t *subpage = container_of(mr, subpage_t, iomem);
1543        object_unref(OBJECT(&subpage->iomem));
1544        g_free(subpage);
1545    }
1546}
1547
1548static void phys_sections_free(PhysPageMap *map)
1549{
1550    while (map->sections_nb > 0) {
1551        MemoryRegionSection *section = &map->sections[--map->sections_nb];
1552        phys_section_destroy(section->mr);
1553    }
1554    g_free(map->sections);
1555    g_free(map->nodes);
1556}
1557
1558static void register_subpage(FlatView *fv, MemoryRegionSection *section)
1559{
1560    AddressSpaceDispatch *d = flatview_to_dispatch(fv);
1561    subpage_t *subpage;
1562    hwaddr base = section->offset_within_address_space
1563        & TARGET_PAGE_MASK;
1564    MemoryRegionSection *existing = phys_page_find(d, base);
1565    MemoryRegionSection subsection = {
1566        .offset_within_address_space = base,
1567        .size = int128_make64(TARGET_PAGE_SIZE),
1568    };
1569    hwaddr start, end;
1570
1571    assert(existing->mr->subpage || existing->mr == &io_mem_unassigned);
1572
1573    if (!(existing->mr->subpage)) {
1574        subpage = subpage_init(fv, base);
1575        subsection.fv = fv;
1576        subsection.mr = &subpage->iomem;
1577        phys_page_set(d, base >> TARGET_PAGE_BITS, 1,
1578                      phys_section_add(&d->map, &subsection));
1579    } else {
1580        subpage = container_of(existing->mr, subpage_t, iomem);
1581    }
1582    start = section->offset_within_address_space & ~TARGET_PAGE_MASK;
1583    end = start + int128_get64(section->size) - 1;
1584    subpage_register(subpage, start, end,
1585                     phys_section_add(&d->map, section));
1586}
1587
1588
1589static void register_multipage(FlatView *fv,
1590                               MemoryRegionSection *section)
1591{
1592    AddressSpaceDispatch *d = flatview_to_dispatch(fv);
1593    hwaddr start_addr = section->offset_within_address_space;
1594    uint16_t section_index = phys_section_add(&d->map, section);
1595    uint64_t num_pages = int128_get64(int128_rshift(section->size,
1596                                                    TARGET_PAGE_BITS));
1597
1598    assert(num_pages);
1599    phys_page_set(d, start_addr >> TARGET_PAGE_BITS, num_pages, section_index);
1600}
1601
1602/*
1603 * The range in *section* may look like this:
1604 *
1605 *      |s|PPPPPPP|s|
1606 *
1607 * where s stands for subpage and P for page.
1608 */
1609void flatview_add_to_dispatch(FlatView *fv, MemoryRegionSection *section)
1610{
1611    MemoryRegionSection remain = *section;
1612    Int128 page_size = int128_make64(TARGET_PAGE_SIZE);
1613
1614    /* register first subpage */
1615    if (remain.offset_within_address_space & ~TARGET_PAGE_MASK) {
1616        uint64_t left = TARGET_PAGE_ALIGN(remain.offset_within_address_space)
1617                        - remain.offset_within_address_space;
1618
1619        MemoryRegionSection now = remain;
1620        now.size = int128_min(int128_make64(left), now.size);
1621        register_subpage(fv, &now);
1622        if (int128_eq(remain.size, now.size)) {
1623            return;
1624        }
1625        remain.size = int128_sub(remain.size, now.size);
1626        remain.offset_within_address_space += int128_get64(now.size);
1627        remain.offset_within_region += int128_get64(now.size);
1628    }
1629
1630    /* register whole pages */
1631    if (int128_ge(remain.size, page_size)) {
1632        MemoryRegionSection now = remain;
1633        now.size = int128_and(now.size, int128_neg(page_size));
1634        register_multipage(fv, &now);
1635        if (int128_eq(remain.size, now.size)) {
1636            return;
1637        }
1638        remain.size = int128_sub(remain.size, now.size);
1639        remain.offset_within_address_space += int128_get64(now.size);
1640        remain.offset_within_region += int128_get64(now.size);
1641    }
1642
1643    /* register last subpage */
1644    register_subpage(fv, &remain);
1645}
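/*
 * Worked example (editorial annotation, not part of the upstream file):
 * with 4K target pages, a section covering [0x1800, 0x5400) is split as
 * |s|PPP|s|: a head subpage for [0x1800, 0x2000), three whole pages for
 * [0x2000, 0x5000) registered via register_multipage(), and a tail
 * subpage for [0x5000, 0x5400).  A section that is both page-aligned and
 * a whole number of pages long takes only the register_multipage() path.
 */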
1646
1647void qemu_flush_coalesced_mmio_buffer(void)
1648{
1649    if (kvm_enabled())
1650        kvm_flush_coalesced_mmio_buffer();
1651}
1652
1653void qemu_mutex_lock_ramlist(void)
1654{
1655    qemu_mutex_lock(&ram_list.mutex);
1656}
1657
1658void qemu_mutex_unlock_ramlist(void)
1659{
1660    qemu_mutex_unlock(&ram_list.mutex);
1661}
1662
1663void ram_block_dump(Monitor *mon)
1664{
1665    RAMBlock *block;
1666    char *psize;
1667
1668    rcu_read_lock();
1669    monitor_printf(mon, "%24s %8s  %18s %18s %18s\n",
1670                   "Block Name", "PSize", "Offset", "Used", "Total");
1671    RAMBLOCK_FOREACH(block) {
1672        psize = size_to_str(block->page_size);
1673        monitor_printf(mon, "%24s %8s  0x%016" PRIx64 " 0x%016" PRIx64
1674                       " 0x%016" PRIx64 "\n", block->idstr, psize,
1675                       (uint64_t)block->offset,
1676                       (uint64_t)block->used_length,
1677                       (uint64_t)block->max_length);
1678        g_free(psize);
1679    }
1680    rcu_read_unlock();
1681}
1682
1683#ifdef __linux__
1684/*
1685 * FIXME TOCTTOU: this iterates over memory backends' mem-path, which
1686 * may or may not name the same files / on the same filesystem now as
1687 * when we actually open and map them.  Iterate over the file
1688 * descriptors instead, and use qemu_fd_getpagesize().
1689 */
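/*
 * Editorial note: despite its name, this visitor records the *smallest* page
 * size seen across all mapped memory backends (note the hpsize < *hpsize_min
 * test below); qemu_getrampagesize() then uses that minimum as the page size
 * it can rely on for all of guest RAM.
 */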
1690static int find_max_supported_pagesize(Object *obj, void *opaque)
1691{
1692    long *hpsize_min = opaque;
1693
1694    if (object_dynamic_cast(obj, TYPE_MEMORY_BACKEND)) {
1695        HostMemoryBackend *backend = MEMORY_BACKEND(obj);
1696        long hpsize = host_memory_backend_pagesize(backend);
1697
1698        if (host_memory_backend_is_mapped(backend) && (hpsize < *hpsize_min)) {
1699            *hpsize_min = hpsize;
1700        }
1701    }
1702
1703    return 0;
1704}
1705
1706long qemu_getrampagesize(void)
1707{
1708    long hpsize = LONG_MAX;
1709    long mainrampagesize;
1710    Object *memdev_root;
1711
1712    mainrampagesize = qemu_mempath_getpagesize(mem_path);
1713
1714    /* It's possible we have memory-backend objects with
1715     * hugepage-backed RAM.  These may get mapped into the system
1716     * address space via -numa parameters or memory hotplug
1717     * hooks.  We want to take these into account, but we
1718     * also want to make sure the supported hugepage sizes
1719     * apply across the entire range of memory we may boot
1720     * from, so we take the minimum across all backends and
1721     * assume normal pages wherever a backend isn't backed by
1722     * hugepages.
1723     */
1724    memdev_root = object_resolve_path("/objects", NULL);
1725    if (memdev_root) {
1726        object_child_foreach(memdev_root, find_max_supported_pagesize, &hpsize);
1727    }
1728    if (hpsize == LONG_MAX) {
1729        /* No additional memory regions found ==> Report main RAM page size */
1730        return mainrampagesize;
1731    }
1732
1733    /* If NUMA is disabled or the NUMA nodes are not backed with a
1734     * memory-backend, then there is at least one node using "normal" RAM,
1735     * so if its page size is smaller we have to report that size instead.
1736     */
1737    if (hpsize > mainrampagesize &&
1738        (nb_numa_nodes == 0 || numa_info[0].node_memdev == NULL)) {
1739        static bool warned;
1740        if (!warned) {
1741            error_report("Huge page support disabled (n/a for main memory).");
1742            warned = true;
1743        }
1744        return mainrampagesize;
1745    }
1746
1747    return hpsize;
1748}
1749#else
1750long qemu_getrampagesize(void)
1751{
1752    return getpagesize();
1753}
1754#endif
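/*
 * Editorial note: callers can compare the result with the host page size to
 * tell whether guest RAM is hugepage-backed, e.g. (illustrative only):
 *
 *     long rampagesize = qemu_getrampagesize();
 *     bool using_hugepages = rampagesize > getpagesize();
 *
 * On Linux this takes the memory backends under /objects into account as
 * described above; on other hosts it simply reports the host page size.
 */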
1755
1756#ifdef CONFIG_POSIX
1757static int64_t get_file_size(int fd)
1758{
1759    int64_t size = lseek(fd, 0, SEEK_END);
1760    if (size < 0) {
1761        return -errno;
1762    }
1763    return size;
1764}
1765
1766static int file_ram_open(const char *path,
1767                         const char *region_name,
1768                         bool *created,
1769                         Error **errp)
1770{
1771    char *filename;
1772    char *sanitized_name;
1773    char *c;
1774    int fd = -1;
1775
1776    *created = false;
1777    for (;;) {
1778        fd = open(path, O_RDWR);
1779        if (fd >= 0) {
1780            /* @path names an existing file, use it */
1781            break;
1782        }
1783        if (errno == ENOENT) {
1784            /* @path names a file that doesn't exist, create it */
1785            fd = open(path, O_RDWR | O_CREAT | O_EXCL, 0644);
1786            if (fd >= 0) {
1787                *created = true;
1788                break;
1789            }
1790        } else if (errno == EISDIR) {
1791            /* @path names a directory, create a file there */
1792            /* Make name safe to use with mkstemp by replacing '/' with '_'. */
1793            sanitized_name = g_strdup(region_name);
1794            for (c = sanitized_name; *c != '\0'; c++) {
1795                if (*c == '/') {
1796                    *c = '_';
1797                }
1798            }
1799
1800            filename = g_strdup_printf("%s/qemu_back_mem.%s.XXXXXX", path,
1801                                       sanitized_name);
1802            g_free(sanitized_name);
1803
1804            fd = mkstemp(filename);
1805            if (fd >= 0) {
1806                unlink(filename);
1807                g_free(filename);
1808                break;
1809            }
1810            g_free(filename);
1811        }
1812        if (errno != EEXIST && errno != EINTR) {
1813            error_setg_errno(errp, errno,
1814                             "can't open backing store %s for guest RAM",
1815                             path);
1816            return -1;
1817        }
1818        /*
1819         * Try again on EINTR and EEXIST.  The latter happens when
1820         * something else creates the file between our two open().
1821         */
1822    }
1823
1824    return fd;
1825}
1826
1827static void *file_ram_alloc(RAMBlock *block,
1828                            ram_addr_t memory,
1829                            int fd,
1830                            bool truncate,
1831                            Error **errp)
1832{
1833    void *area;
1834
1835    block->page_size = qemu_fd_getpagesize(fd);
1836    if (block->mr->align % block->page_size) {
1837        error_setg(errp, "alignment 0x%" PRIx64
1838                   " must be a multiple of page size 0x%zx",
1839                   block->mr->align, block->page_size);
1840        return NULL;
1841    } else if (block->mr->align && !is_power_of_2(block->mr->align)) {
1842        error_setg(errp, "alignment 0x%" PRIx64
1843                   " must be a power of two", block->mr->align);
1844        return NULL;
1845    }
1846    block->mr->align = MAX(block->page_size, block->mr->align);
1847#if defined(__s390x__)
1848    if (kvm_enabled()) {
1849        block->mr->align = MAX(block->mr->align, QEMU_VMALLOC_ALIGN);
1850    }
1851#endif
1852
1853    if (memory < block->page_size) {
1854        error_setg(errp, "memory size 0x" RAM_ADDR_FMT " must be equal to "
1855                   "or larger than page size 0x%zx",
1856                   memory, block->page_size);
1857        return NULL;
1858    }
1859
1860    memory = ROUND_UP(memory, block->page_size);
1861
1862    /*
1863     * ftruncate is not supported by hugetlbfs in older
1864     * hosts, so don't bother bailing out on errors.
1865     * If anything goes wrong with it under other filesystems,
1866     * mmap will fail.
1867     *
1868     * Do not truncate the non-empty backend file to avoid corrupting
1869     * the existing data in the file. Disabling shrinking is not
1870     * enough. For example, the current vNVDIMM implementation stores
1871     * the guest NVDIMM labels at the end of the backend file. If the
1872     * backend file is later extended, QEMU will not be able to find
1873     * those labels. Therefore, extending the non-empty backend file
1874     * is disabled as well.
1875     */
1876    if (truncate && ftruncate(fd, memory)) {
1877        perror("ftruncate");
1878    }
1879
1880    area = qemu_ram_mmap(fd, memory, block->mr->align,
1881                         block->flags & RAM_SHARED);
1882    if (area == MAP_FAILED) {
1883        error_setg_errno(errp, errno,
1884                         "unable to map backing store for guest RAM");
1885        return NULL;
1886    }
1887
1888    if (mem_prealloc) {
1889        os_mem_prealloc(fd, area, memory, smp_cpus, errp);
1890        if (errp && *errp) {
1891            qemu_ram_munmap(fd, area, memory);
1892            return NULL;
1893        }
1894    }
1895
1896    block->fd = fd;
1897    return area;
1898}
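/*
 * Editorial sketch of how the two helpers above combine in the file-backed
 * allocation path further down (see qemu_ram_alloc_from_fd() and
 * qemu_ram_alloc_from_file()):
 *
 *     bool created;
 *     int fd = file_ram_open(mem_path, memory_region_name(mr), &created, errp);
 *     if (fd >= 0) {
 *         block->host = file_ram_alloc(block, size, fd, !file_size, errp);
 *     }
 *
 * file_ram_open() resolves @path to a usable file descriptor (an existing
 * file, a newly created one, or an unlinked temporary inside a directory
 * such as a hugetlbfs mount); file_ram_alloc() then rounds the size up to
 * the file's page size, optionally truncates, and mmaps it.
 */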
1899#endif
1900
1901/* Allocate space within the ram_addr_t space that governs the
1902 * dirty bitmaps.
1903 * Called with the ramlist lock held.
1904 */
1905static ram_addr_t find_ram_offset(ram_addr_t size)
1906{
1907    RAMBlock *block, *next_block;
1908    ram_addr_t offset = RAM_ADDR_MAX, mingap = RAM_ADDR_MAX;
1909
1910    assert(size != 0); /* it would hand out the same offset multiple times */
1911
1912    if (QLIST_EMPTY_RCU(&ram_list.blocks)) {
1913        return 0;
1914    }
1915
1916    RAMBLOCK_FOREACH(block) {
1917        ram_addr_t candidate, next = RAM_ADDR_MAX;
1918
1919        /* Align blocks to start on a 'long' in the bitmap
1920         * which makes the bitmap sync'ing take the fast path.
1921         */
1922        candidate = block->offset + block->max_length;
1923        candidate = ROUND_UP(candidate, BITS_PER_LONG << TARGET_PAGE_BITS);
1924
1925        /* Search for the closest following block
1926         * and find the gap.
1927         */
1928        RAMBLOCK_FOREACH(next_block) {
1929            if (next_block->offset >= candidate) {
1930                next = MIN(next, next_block->offset);
1931            }
1932        }
1933
1934        /* If it fits, remember our place and the size of the
1935         * gap, but keep going in case we find a smaller gap to
1936         * fill, so as to avoid fragmentation.
1937         */
1938        if (next - candidate >= size && next - candidate < mingap) {
1939            offset = candidate;
1940            mingap = next - candidate;
1941        }
1942
1943        trace_find_ram_offset_loop(size, candidate, offset, next, mingap);
1944    }
1945
1946    if (offset == RAM_ADDR_MAX) {
1947        fprintf(stderr, "Failed to find gap of requested size: %" PRIu64 "\n",
1948                (uint64_t)size);
1949        abort();
1950    }
1951
1952    trace_find_ram_offset(size, offset);
1953
1954    return offset;
1955}
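/*
 * Worked example (editorial, assuming 4 KiB target pages and 64-bit longs,
 * i.e. candidates rounded up to 256 KiB): with existing blocks covering
 * [0x00000000, 0x10000000) and [0x20000000, 0x30000000), the first
 * iteration yields candidate 0x10000000 with a 0x10000000-byte gap before
 * the next block, the second yields candidate 0x30000000 with an unbounded
 * gap.  A request that fits in the smaller gap is therefore placed at
 * 0x10000000, keeping the offset space compact.
 */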
1956
1957static unsigned long last_ram_page(void)
1958{
1959    RAMBlock *block;
1960    ram_addr_t last = 0;
1961
1962    rcu_read_lock();
1963    RAMBLOCK_FOREACH(block) {
1964        last = MAX(last, block->offset + block->max_length);
1965    }
1966    rcu_read_unlock();
1967    return last >> TARGET_PAGE_BITS;
1968}
1969
1970static void qemu_ram_setup_dump(void *addr, ram_addr_t size)
1971{
1972    int ret;
1973
1974    /* Use MADV_DONTDUMP if the user doesn't want the guest memory in the core dump */
1975    if (!machine_dump_guest_core(current_machine)) {
1976        ret = qemu_madvise(addr, size, QEMU_MADV_DONTDUMP);
1977        if (ret) {
1978            perror("qemu_madvise");
1979            fprintf(stderr, "madvise doesn't support MADV_DONTDUMP, "
1980                            "but dump_guest_core=off specified\n");
1981        }
1982    }
1983}
1984
1985const char *qemu_ram_get_idstr(RAMBlock *rb)
1986{
1987    return rb->idstr;
1988}
1989
1990void *qemu_ram_get_host_addr(RAMBlock *rb)
1991{
1992    return rb->host;
1993}
1994
1995ram_addr_t qemu_ram_get_offset(RAMBlock *rb)
1996{
1997    return rb->offset;
1998}
1999
2000ram_addr_t qemu_ram_get_used_length(RAMBlock *rb)
2001{
2002    return rb->used_length;
2003}
2004
2005bool qemu_ram_is_shared(RAMBlock *rb)
2006{
2007    return rb->flags & RAM_SHARED;
2008}
2009
2010/* Note: Only set at the start of postcopy */
2011bool qemu_ram_is_uf_zeroable(RAMBlock *rb)
2012{
2013    return rb->flags & RAM_UF_ZEROPAGE;
2014}
2015
2016void qemu_ram_set_uf_zeroable(RAMBlock *rb)
2017{
2018    rb->flags |= RAM_UF_ZEROPAGE;
2019}
2020
2021bool qemu_ram_is_migratable(RAMBlock *rb)
2022{
2023    return rb->flags & RAM_MIGRATABLE;
2024}
2025
2026void qemu_ram_set_migratable(RAMBlock *rb)
2027{
2028    rb->flags |= RAM_MIGRATABLE;
2029}
2030
2031void qemu_ram_unset_migratable(RAMBlock *rb)
2032{
2033    rb->flags &= ~RAM_MIGRATABLE;
2034}
2035
2036/* Called with iothread lock held.  */
2037void qemu_ram_set_idstr(RAMBlock *new_block, const char *name, DeviceState *dev)
2038{
2039    RAMBlock *block;
2040
2041    assert(new_block);
2042    assert(!new_block->idstr[0]);
2043
2044    if (dev) {
2045        char *id = qdev_get_dev_path(dev);
2046        if (id) {
2047            snprintf(new_block->idstr, sizeof(new_block->idstr), "%s/", id);
2048            g_free(id);
2049        }
2050    }
2051    pstrcat(new_block->idstr, sizeof(new_block->idstr), name);
2052
2053    rcu_read_lock();
2054    RAMBLOCK_FOREACH(block) {
2055        if (block != new_block &&
2056            !strcmp(block->idstr, new_block->idstr)) {
2057            fprintf(stderr, "RAMBlock \"%s\" already registered, abort!\n",
2058                    new_block->idstr);
2059            abort();
2060        }
2061    }
2062    rcu_read_unlock();
2063}
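/*
 * Editorial note: the resulting identifier is "<device path>/<name>" when a
 * device is supplied, or just "<name>" otherwise; e.g. a PCI VGA device's
 * video RAM might end up as "0000:00:02.0/vga.vram" (illustrative value
 * only).  Migration matches RAMBlocks by these names, hence the abort() on
 * duplicates above.
 */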
2064
2065/* Called with iothread lock held.  */
2066void qemu_ram_unset_idstr(RAMBlock *block)
2067{
2068    /* FIXME: arch_init.c assumes that this is not called throughout
2069     * migration.  Ignore the problem since hot-unplug during migration
2070     * does not work anyway.
2071     */
2072    if (block) {
2073        memset(block->idstr, 0, sizeof(block->idstr));
2074    }
2075}
2076
2077size_t qemu_ram_pagesize(RAMBlock *rb)
2078{
2079    return rb->page_size;
2080}
2081
2082/* Returns the largest size of page in use */
2083size_t qemu_ram_pagesize_largest(void)
2084{
2085    RAMBlock *block;
2086    size_t largest = 0;
2087
2088    RAMBLOCK_FOREACH(block) {
2089        largest = MAX(largest, qemu_ram_pagesize(block));
2090    }
2091
2092    return largest;
2093}
2094
2095static int memory_try_enable_merging(void *addr, size_t len)
2096{
2097    if (!machine_mem_merge(current_machine)) {
2098        /* disabled by the user */
2099        return 0;
2100    }
2101
2102    return qemu_madvise(addr, len, QEMU_MADV_MERGEABLE);
2103}
2104
2105/* Only legal before the guest may have detected the memory size: e.g. on
2106 * incoming migration, or right after reset.
2107 *
2108 * As the memory core doesn't know how the memory is accessed, it is up to
2109 * the resize callback to update device state and/or add assertions to
2110 * detect misuse, if necessary.
2111 */
2112int qemu_ram_resize(RAMBlock *block, ram_addr_t newsize, Error **errp)
2113{
2114    assert(block);
2115
2116    newsize = HOST_PAGE_ALIGN(newsize);
2117
2118    if (block->used_length == newsize) {
2119        return 0;
2120    }
2121
2122    if (!(block->flags & RAM_RESIZEABLE)) {
2123        error_setg_errno(errp, EINVAL,
2124                         "Length mismatch: %s: 0x" RAM_ADDR_FMT
2125                         " in != 0x" RAM_ADDR_FMT, block->idstr,
2126                         newsize, block->used_length);
2127        return -EINVAL;
2128    }
2129
2130    if (block->max_length < newsize) {
2131        error_setg_errno(errp, EINVAL,
2132                         "Length too large: %s: 0x" RAM_ADDR_FMT
2133                         " > 0x" RAM_ADDR_FMT, block->idstr,
2134                         newsize, block->max_length);
2135        return -EINVAL;
2136    }
2137
2138    cpu_physical_memory_clear_dirty_range(block->offset, block->used_length);
2139    block->used_length = newsize;
2140    cpu_physical_memory_set_dirty_range(block->offset, block->used_length,
2141                                        DIRTY_CLIENTS_ALL);
2142    memory_region_set_size(block->mr, newsize);
2143    if (block->resized) {
2144        block->resized(block->idstr, newsize, block->host);
2145    }
2146    return 0;
2147}
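/*
 * Minimal usage sketch (editorial): a device that created its block with
 * qemu_ram_alloc_resizeable() can grow it, e.g. on incoming migration:
 *
 *     Error *err = NULL;
 *     if (qemu_ram_resize(block, new_size, &err) < 0) {
 *         error_report_err(err);
 *     }
 *
 * The new size is rounded up to the host page size and must not exceed
 * max_length; the resized() callback registered at allocation time, if any,
 * is invoked so the owner can update state derived from used_length.
 */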
2148
2149/* Called with ram_list.mutex held */
2150static void dirty_memory_extend(ram_addr_t old_ram_size,
2151                                ram_addr_t new_ram_size)
2152{
2153    ram_addr_t old_num_blocks = DIV_ROUND_UP(old_ram_size,
2154                                             DIRTY_MEMORY_BLOCK_SIZE);
2155    ram_addr_t new_num_blocks = DIV_ROUND_UP(new_ram_size,
2156                                             DIRTY_MEMORY_BLOCK_SIZE);
2157    int i;
2158
2159    /* Only need to extend if block count increased */
2160    if (new_num_blocks <= old_num_blocks) {
2161        return;
2162    }
2163
2164    for (i = 0; i < DIRTY_MEMORY_NUM; i++) {
2165        DirtyMemoryBlocks *old_blocks;
2166        DirtyMemoryBlocks *new_blocks;
2167        int j;
2168
2169        old_blocks = atomic_rcu_read(&ram_list.dirty_memory[i]);
2170        new_blocks = g_malloc(sizeof(*new_blocks) +
2171                              sizeof(new_blocks->blocks[0]) * new_num_blocks);
2172
2173        if (old_num_blocks) {
2174            memcpy(new_blocks->blocks, old_blocks->blocks,
2175                   old_num_blocks * sizeof(old_blocks->blocks[0]));
2176        }
2177
2178        for (j = old_num_blocks; j < new_num_blocks; j++) {
2179            new_blocks->blocks[j] = bitmap_new(DIRTY_MEMORY_BLOCK_SIZE);
2180        }
2181
2182        atomic_rcu_set(&ram_list.dirty_memory[i], new_blocks);
2183
2184        if (old_blocks) {
2185            g_free_rcu(old_blocks, rcu);
2186        }
2187    }
2188}
2189
2190static void ram_block_add(RAMBlock *new_block, Error **errp, bool shared)
2191{
2192    RAMBlock *block;
2193    RAMBlock *last_block = NULL;
2194    ram_addr_t old_ram_size, new_ram_size;
2195    Error *err = NULL;
2196
2197    old_ram_size = last_ram_page();
2198
2199    qemu_mutex_lock_ramlist();
2200    new_block->offset = find_ram_offset(new_block->max_length);
2201
2202    if (!new_block->host) {
2203        if (xen_enabled()) {
2204            xen_ram_alloc(new_block->offset, new_block->max_length,
2205                          new_block->mr, &err);
2206            if (err) {
2207                error_propagate(errp, err);
2208                qemu_mutex_unlock_ramlist();
2209                return;
2210            }
2211        } else {
2212            new_block->host = phys_mem_alloc(new_block->max_length,
2213                                             &new_block->mr->align, shared);
2214            if (!new_block->host) {
2215                error_setg_errno(errp, errno,
2216                                 "cannot set up guest memory '%s'",
2217                                 memory_region_name(new_block->mr));
2218                qemu_mutex_unlock_ramlist();
2219                return;
2220            }
2221            memory_try_enable_merging(new_block->host, new_block->max_length);
2222        }
2223    }
2224
2225    new_ram_size = MAX(old_ram_size,
2226              (new_block->offset + new_block->max_length) >> TARGET_PAGE_BITS);
2227    if (new_ram_size > old_ram_size) {
2228        dirty_memory_extend(old_ram_size, new_ram_size);
2229    }
2230    /* Keep the list sorted from biggest to smallest block.  Unlike QTAILQ,
2231     * QLIST (which has an RCU-friendly variant) does not have insertion at
2232     * tail, so save the last element in last_block.
2233     */
2234    RAMBLOCK_FOREACH(block) {
2235        last_block = block;
2236        if (block->max_length < new_block->max_length) {
2237            break;
2238        }
2239    }
2240    if (block) {
2241        QLIST_INSERT_BEFORE_RCU(block, new_block, next);
2242    } else if (last_block) {
2243        QLIST_INSERT_AFTER_RCU(last_block, new_block, next);
2244    } else { /* list is empty */
2245        QLIST_INSERT_HEAD_RCU(&ram_list.blocks, new_block, next);
2246    }
2247    ram_list.mru_block = NULL;
2248
2249    /* Write list before version */
2250    smp_wmb();
2251    ram_list.version++;
2252    qemu_mutex_unlock_ramlist();
2253
2254    cpu_physical_memory_set_dirty_range(new_block->offset,
2255                                        new_block->used_length,
2256                                        DIRTY_CLIENTS_ALL);
2257
2258    if (new_block->host) {
2259        qemu_ram_setup_dump(new_block->host, new_block->max_length);
2260        qemu_madvise(new_block->host, new_block->max_length, QEMU_MADV_HUGEPAGE);
2261        /* MADV_DONTFORK is also needed by KVM in absence of synchronous MMU */
2262        qemu_madvise(new_block->host, new_block->max_length, QEMU_MADV_DONTFORK);
2263        ram_block_notify_add(new_block->host, new_block->max_length);
2264    }
2265}
2266
2267#ifdef CONFIG_POSIX
2268RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t size, MemoryRegion *mr,
2269                                 uint32_t ram_flags, int fd,
2270                                 Error **errp)
2271{
2272    RAMBlock *new_block;
2273    Error *local_err = NULL;
2274    int64_t file_size;
2275
2276    /* Only these ram flags are supported for now. */
2277    assert((ram_flags & ~(RAM_SHARED | RAM_PMEM)) == 0);
2278
2279    if (xen_enabled()) {
2280        error_setg(errp, "-mem-path not supported with Xen");
2281        return NULL;
2282    }
2283
2284    if (kvm_enabled() && !kvm_has_sync_mmu()) {
2285        error_setg(errp,
2286                   "host lacks kvm mmu notifiers, -mem-path unsupported");
2287        return NULL;
2288    }
2289
2290    if (phys_mem_alloc != qemu_anon_ram_alloc) {
2291        /*
2292         * file_ram_alloc() needs to allocate just like
2293         * phys_mem_alloc, but we haven't bothered to provide
2294         * a hook there.
2295         */
2296        error_setg(errp,
2297                   "-mem-path not supported with this accelerator");
2298        return NULL;
2299    }
2300
2301    size = HOST_PAGE_ALIGN(size);
2302    file_size = get_file_size(fd);
2303    if (file_size > 0 && file_size < size) {
2304        error_setg(errp, "backing store %s size 0x%" PRIx64
2305                   " does not match 'size' option 0x" RAM_ADDR_FMT,
2306                   mem_path, file_size, size);
2307        return NULL;
2308    }
2309
2310    new_block = g_malloc0(sizeof(*new_block));
2311    new_block->mr = mr;
2312    new_block->used_length = size;
2313    new_block->max_length = size;
2314    new_block->flags = ram_flags;
2315    new_block->host = file_ram_alloc(new_block, size, fd, !file_size, errp);
2316    if (!new_block->host) {
2317        g_free(new_block);
2318        return NULL;
2319    }
2320
2321    ram_block_add(new_block, &local_err, ram_flags & RAM_SHARED);
2322    if (local_err) {
2323        g_free(new_block);
2324        error_propagate(errp, local_err);
2325        return NULL;
2326    }
2327    return new_block;
2329}
2330
2331
2332RAMBlock *qemu_ram_alloc_from_file(ram_addr_t size, MemoryRegion *mr,
2333                                   uint32_t ram_flags, const char *mem_path,
2334                                   Error **errp)
2335{
2336    int fd;
2337    bool created;
2338    RAMBlock *block;
2339
2340    fd = file_ram_open(mem_path, memory_region_name(mr), &created, errp);
2341    if (fd < 0) {
2342        return NULL;
2343    }
2344
2345    block = qemu_ram_alloc_from_fd(size, mr, ram_flags, fd, errp);
2346    if (!block) {
2347        if (created) {
2348            unlink(mem_path);
2349        }
2350        close(fd);
2351        return NULL;
2352    }
2353
2354    return block;
2355}
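/*
 * Illustrative caller (editorial sketch): a memory backend mapping guest RAM
 * from a hugetlbfs mount might do, roughly:
 *
 *     RAMBlock *rb = qemu_ram_alloc_from_file(size, mr, RAM_SHARED,
 *                                             "/dev/hugepages/guest", &err);
 *
 * (the path and &err are placeholders).  On failure this helper already
 * unlinks any file it created and closes the descriptor, so the caller only
 * has to report the error.
 */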
2356#endif
2357
2358static
2359RAMBlock *qemu_ram_alloc_internal(ram_addr_t size, ram_addr_t max_size,
2360                                  void (*resized)(const char*,
2361                                                  uint64_t length,
2362                                                  void *host),
2363                                  void *host, bool resizeable, bool share,
2364                                  MemoryRegion *mr, Error **errp)
2365{
2366    RAMBlock *new_block;
2367    Error *local_err = NULL;
2368
2369    size = HOST_PAGE_ALIGN(size);
2370    max_size = HOST_PAGE_ALIGN(max_size);
2371    new_block = g_malloc0(sizeof(*new_block));
2372    new_block->mr = mr;
2373    new_block->resized = resized;
2374    new_block->used_length = size;
2375    new_block->max_length = max_size;
2376    assert(max_size >= size);
2377    new_block->fd = -1;
2378    new_block->page_size = getpagesize();
2379    new_block->host = host;
2380    if (host) {
2381        new_block->flags |= RAM_PREALLOC;
2382    }
2383    if (resizeable) {
2384        new_block->flags |= RAM_RESIZEABLE;
2385    }
2386    ram_block_add(new_block, &local_err, share);
2387    if (local_err) {
2388        g_free(new_block);
2389        error_propagate(errp, local_err);
2390        return NULL;
2391    }
2392    return new_block;
2393}
2394
2395RAMBlock *qemu_ram_alloc_from_ptr(ram_addr_t size, void *host,
2396                                   MemoryRegion *mr, Error **errp)
2397{
2398    return qemu_ram_alloc_internal(size, size, NULL, host, false,
2399                                   false, mr, errp);
2400}
2401
2402RAMBlock *qemu_ram_alloc(ram_addr_t size, bool share,
2403                         MemoryRegion *mr, Error **errp)
2404{
2405    return qemu_ram_alloc_internal(size, size, NULL, NULL, false,
2406                                   share, mr, errp);
2407}
2408
2409RAMBlock *qemu_ram_alloc_resizeable(ram_addr_t size, ram_addr_t maxsz,
2410                                     void (*resized)(const char*,
2411                                                     uint64_t length,
2412                                                     void *host),
2413                                     MemoryRegion *mr, Error **errp)
2414{
2415    return qemu_ram_alloc_internal(size, maxsz, resized, NULL, true,
2416                                   false, mr, errp);
2417}
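/*
 * Editorial summary: the three front-ends above all funnel into
 * qemu_ram_alloc_internal(), differing only in which knobs they fix:
 *
 *     qemu_ram_alloc()            - anonymous memory, fixed size
 *     qemu_ram_alloc_from_ptr()   - caller-provided memory (RAM_PREALLOC)
 *     qemu_ram_alloc_resizeable() - anonymous memory that may grow up to
 *                                   maxsz (RAM_RESIZEABLE), with a
 *                                   resized() notification callback
 */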
2418
2419static void reclaim_ramblock(RAMBlock *block)
2420{
2421    if (block->flags & RAM_PREALLOC) {
2422        ;
2423    } else if (xen_enabled()) {
2424        xen_invalidate_map_cache_entry(block->host);
2425#ifndef _WIN32
2426    } else if (block->fd >= 0) {
2427        qemu_ram_munmap(block->fd, block->host, block->max_length);
2428        close(block->fd);
2429#endif
2430    } else {
2431        qemu_anon_ram_free(block->host, block->max_length);
2432    }
2433    g_free(block);
2434}
2435
2436void qemu_ram_free(RAMBlock *block)
2437{
2438    if (!block) {
2439        return;
2440    }
2441
2442    if (block->host) {
2443        ram_block_notify_remove(block->host, block->max_length);
2444    }
2445
2446    qemu_mutex_lock_ramlist();
2447    QLIST_REMOVE_RCU(block, next);
2448    ram_list.mru_block = NULL;
2449    /* Write list before version */
2450    smp_wmb();
2451    ram_list.version++;
2452    call_rcu(block, reclaim_ramblock, rcu);
2453    qemu_mutex_unlock_ramlist();
2454}
2455
2456#ifndef _WIN32
2457void qemu_ram_remap(ram_addr_t addr, ram_addr_t length)
2458{
2459    RAMBlock *block;
2460    ram_addr_t offset;
2461    int flags;
2462    void *area, *vaddr;
2463
2464    RAMBLOCK_FOREACH(block) {
2465        offset = addr - block->offset;
2466        if (offset < block->max_length) {
2467            vaddr = ramblock_ptr(block, offset);
2468            if (block->flags & RAM_PREALLOC) {
2469                ;
2470            } else if (xen_enabled()) {
2471                abort();
2472            } else {
2473                flags = MAP_FIXED;
2474                if (block->fd >= 0) {
2475                    flags |= (block->flags & RAM_SHARED ?
2476                              MAP_SHARED : MAP_PRIVATE);
2477                    area = mmap(vaddr, length, PROT_READ | PROT_WRITE,
2478                                flags, block->fd, offset);
2479                } else {
2480                    /*
2481                     * Remap needs to match alloc.  Accelerators that
2482                     * set phys_mem_alloc never remap.  If they did,
2483                     * we'd need a remap hook here.
2484                     */
2485                    assert(phys_mem_alloc == qemu_anon_ram_alloc);
2486
2487                    flags |= MAP_PRIVATE | MAP_ANONYMOUS;
2488                    area = mmap(vaddr, length, PROT_READ | PROT_WRITE,
2489                                flags, -1, 0);
2490                }
2491                if (area != vaddr) {
2492                    error_report("Could not remap addr: "
2493                                 RAM_ADDR_FMT "@" RAM_ADDR_FMT "",
2494                                 length, addr);
2495                    exit(1);
2496                }
2497                memory_try_enable_merging(vaddr, length);
2498                qemu_ram_setup_dump(vaddr, length);
2499            }
2500        }
2501    }
2502}
2503#endif /* !_WIN32 */
2504
2505/* Return a host pointer to ram allocated with qemu_ram_alloc.
2506 * This should not be used for general purpose DMA.  Use address_space_map
2507 * or address_space_rw instead. For local memory (e.g. video ram) that the
2508 * device owns, use memory_region_get_ram_ptr.
2509 *
2510 * Called within RCU critical section.
2511 */
2512void *qemu_map_ram_ptr(RAMBlock *ram_block, ram_addr_t addr)
2513{
2514    RAMBlock *block = ram_block;
2515
2516    if (block == NULL) {
2517        block = qemu_get_ram_block(addr);
2518        addr -= block->offset;
2519    }
2520
2521    if (xen_enabled() && block->host == NULL) {
2522        /* We need to check if the requested address is in the RAM
2523         * because we don't want to map the entire memory in QEMU.
2524         * In that case just map until the end of the page.
2525         */
2526        if (block->offset == 0) {
2527            return xen_map_cache(addr, 0, 0, false);
2528        }
2529
2530        block->host = xen_map_cache(block->offset, block->max_length, 1, false);
2531    }
2532    return ramblock_ptr(block, addr);
2533}
2534
2535/* Return a host pointer to guest's ram. Similar to qemu_map_ram_ptr
2536 * but takes a size argument.
2537 *
2538 * Called within RCU critical section.
2539 */
2540static void *qemu_ram_ptr_length(RAMBlock *ram_block, ram_addr_t addr,
2541                                 hwaddr *size, bool lock)
2542{
2543    RAMBlock *block = ram_block;
2544    if (*size == 0) {
2545        return NULL;
2546    }
2547
2548    if (block == NULL) {
2549        block = qemu_get_ram_block(addr);
2550        addr -= block->offset;
2551    }
2552    *size = MIN(*size, block->max_length - addr);
2553
2554    if (xen_enabled() && block->host == NULL) {
2555        /* We need to check if the requested address is in the RAM
2556         * because we don't want to map the entire memory in QEMU.
2557         * In that case just map the requested area.
2558         */
2559        if (block->offset == 0) {
2560            return xen_map_cache(addr, *size, lock, lock);
2561        }
2562
2563        block->host = xen_map_cache(block->offset, block->max_length, 1, lock);
2564    }
2565
2566    return ramblock_ptr(block, addr);
2567}
2568
2569/* Return the offset of a host pointer within a RAMBlock */
2570ram_addr_t qemu_ram_block_host_offset(RAMBlock *rb, void *host)
2571{
2572    ram_addr_t res = (uint8_t *)host - (uint8_t *)rb->host;
2573    assert((uintptr_t)host >= (uintptr_t)rb->host);
2574    assert(res < rb->max_length);
2575
2576    return res;
2577}
2578
2579/*
2580 * Translates a host ptr back to a RAMBlock, a ram_addr and an offset
2581 * in that RAMBlock.
2582 *
2583 * ptr: Host pointer to look up
2584 * round_offset: If true round the result offset down to a page
2585 *               boundary
2586 * *offset: set to the resulting offset within the RAMBlock
2587 *
2588 * Returns: RAMBlock (or NULL if not found)
2589 *
2590 * By the time this function returns, the returned pointer is not protected
2591 * by RCU anymore.  If the caller is not within an RCU critical section and
2592 * does not hold the iothread lock, it must have other means of protecting the
2593 * pointer, such as a reference to the region that includes the incoming
2594 * ram_addr_t.
2595 */
2596RAMBlock *qemu_ram_block_from_host(void *ptr, bool round_offset,
2597                                   ram_addr_t *offset)
2598{
2599    RAMBlock *block;
2600    uint8_t *host = ptr;
2601
2602    if (xen_enabled()) {
2603        ram_addr_t ram_addr;
2604        rcu_read_lock();
2605        ram_addr = xen_ram_addr_from_mapcache(ptr);
2606        block = qemu_get_ram_block(ram_addr);
2607        if (block) {
2608            *offset = ram_addr - block->offset;
2609        }
2610        rcu_read_unlock();
2611        return block;
2612    }
2613
2614    rcu_read_lock();
2615    block = atomic_rcu_read(&ram_list.mru_block);
2616    if (block && block->host && host - block->host < block->max_length) {
2617        goto found;
2618    }
2619
2620    RAMBLOCK_FOREACH(block) {
2621        /* This case appears when the block is not mapped. */
2622        if (block->host == NULL) {
2623            continue;
2624        }
2625        if (host - block->host < block->max_length) {
2626            goto found;
2627        }
2628    }
2629
2630    rcu_read_unlock();
2631    return NULL;
2632
2633found:
2634    *offset = (host - block->host);
2635    if (round_offset) {
2636        *offset &= TARGET_PAGE_MASK;
2637    }
2638    rcu_read_unlock();
2639    return block;
2640}
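/*
 * Typical call pattern (editorial sketch): translating a host pointer that
 * is known to point into guest RAM back to a (block, offset) pair:
 *
 *     ram_addr_t offset;
 *     RAMBlock *rb = qemu_ram_block_from_host(host_ptr, false, &offset);
 *     if (rb) {
 *         ram_addr_t ram_addr = qemu_ram_get_offset(rb) + offset;
 *     }
 *
 * where host_ptr is a pointer previously obtained from guest RAM; this is
 * essentially what qemu_ram_addr_from_host() below does.
 */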
2641
2642/*
2643 * Finds the named RAMBlock
2644 *
2645 * name: The name of RAMBlock to find
2646 *
2647 * Returns: RAMBlock (or NULL if not found)
2648 */
2649RAMBlock *qemu_ram_block_by_name(const char *name)
2650{
2651    RAMBlock *block;
2652
2653    RAMBLOCK_FOREACH(block) {
2654        if (!strcmp(name, block->idstr)) {
2655            return block;
2656        }
2657    }
2658
2659    return NULL;
2660}
2661
2662/* Some of the softmmu routines need to translate from a host pointer
2663   (typically a TLB entry) back to a ram offset.  */
2664ram_addr_t qemu_ram_addr_from_host(void *ptr)
2665{
2666    RAMBlock *block;
2667    ram_addr_t offset;
2668
2669    block = qemu_ram_block_from_host(ptr, false, &offset);
2670    if (!block) {
2671        return RAM_ADDR_INVALID;
2672    }
2673
2674    return block->offset + offset;
2675}
2676
2677/* Called within RCU critical section. */
2678void memory_notdirty_write_prepare(NotDirtyInfo *ndi,
2679                          CPUState *cpu,
2680                          vaddr mem_vaddr,
2681                          ram_addr_t ram_addr,
2682                          unsigned size)
2683{
2684    ndi->cpu = cpu;
2685    ndi->ram_addr = ram_addr;
2686    ndi->mem_vaddr = mem_vaddr;
2687    ndi->size = size;
2688    ndi->pages = NULL;
2689
2690    assert(tcg_enabled());
2691    if (!cpu_physical_memory_get_dirty_flag(ram_addr, DIRTY_MEMORY_CODE)) {
2692        ndi->pages = page_collection_lock(ram_addr, ram_addr + size);
2693        tb_invalidate_phys_page_fast(ndi->pages, ram_addr, size);
2694    }
2695}
2696
2697/* Called within RCU critical section. */
2698void memory_notdirty_write_complete(NotDirtyInfo *ndi)
2699{
2700    if (ndi->pages) {
2701        assert(tcg_enabled());
2702        page_collection_unlock(ndi->pages);
2703        ndi->pages = NULL;
2704    }
2705
2706    /* Set both VGA and migration bits for simplicity and to remove
2707     * the notdirty callback faster.
2708     */
2709    cpu_physical_memory_set_dirty_range(ndi->ram_addr, ndi->size,
2710                                        DIRTY_CLIENTS_NOCODE);
2711    /* we remove the notdirty callback only if the code has been
2712       flushed */
2713    if (!cpu_physical_memory_is_clean(ndi->ram_addr)) {
2714        tlb_set_dirty(ndi->cpu, ndi->mem_vaddr);
2715    }
2716}
2717
2718/* Called within RCU critical section.  */
2719static void notdirty_mem_write(void *opaque, hwaddr ram_addr,
2720                               uint64_t val, unsigned size)
2721{
2722    NotDirtyInfo ndi;
2723
2724    memory_notdirty_write_prepare(&ndi, current_cpu, current_cpu->mem_io_vaddr,
2725                         ram_addr, size);
2726
2727    stn_p(qemu_map_ram_ptr(NULL, ram_addr), size, val);
2728    memory_notdirty_write_complete(&ndi);
2729}
2730
2731static bool notdirty_mem_accepts(void *opaque, hwaddr addr,
2732                                 unsigned size, bool is_write,
2733                                 MemTxAttrs attrs)
2734{
2735    return is_write;
2736}
2737
2738static const MemoryRegionOps notdirty_mem_ops = {
2739    .write = notdirty_mem_write,
2740    .valid.accepts = notdirty_mem_accepts,
2741    .endianness = DEVICE_NATIVE_ENDIAN,
2742    .valid = {
2743        .min_access_size = 1,
2744        .max_access_size = 8,
2745        .unaligned = false,
2746    },
2747    .impl = {
2748        .min_access_size = 1,
2749        .max_access_size = 8,
2750        .unaligned = false,
2751    },
2752};
2753
2754/* Generate a debug exception if a watchpoint has been hit.  */
2755static void check_watchpoint(int offset, int len, MemTxAttrs attrs, int flags)
2756{
2757    CPUState *cpu = current_cpu;
2758    CPUClass *cc = CPU_GET_CLASS(cpu);
2759    target_ulong vaddr;
2760    CPUWatchpoint *wp;
2761
2762    assert(tcg_enabled());
2763    if (cpu->watchpoint_hit) {
2764        /* We re-entered the check after replacing the TB. Now raise
2765         * the debug interrupt so that it will trigger after the
2766         * current instruction. */
2767        cpu_interrupt(cpu, CPU_INTERRUPT_DEBUG);
2768        return;
2769    }
2770    vaddr = (cpu->mem_io_vaddr & TARGET_PAGE_MASK) + offset;
2771    vaddr = cc->adjust_watchpoint_address(cpu, vaddr, len);
2772    QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) {
2773        if (cpu_watchpoint_address_matches(wp, vaddr, len)
2774            && (wp->flags & flags)) {
2775            if (flags == BP_MEM_READ) {
2776                wp->flags |= BP_WATCHPOINT_HIT_READ;
2777            } else {
2778                wp->flags |= BP_WATCHPOINT_HIT_WRITE;
2779            }
2780            wp->hitaddr = vaddr;
2781            wp->hitattrs = attrs;
2782            if (!cpu->watchpoint_hit) {
2783                if (wp->flags & BP_CPU &&
2784                    !cc->debug_check_watchpoint(cpu, wp)) {
2785                    wp->flags &= ~BP_WATCHPOINT_HIT;
2786                    continue;
2787                }
2788                cpu->watchpoint_hit = wp;
2789
2790                mmap_lock();
2791                tb_check_watchpoint(cpu);
2792                if (wp->flags & BP_STOP_BEFORE_ACCESS) {
2793                    cpu->exception_index = EXCP_DEBUG;
2794                    mmap_unlock();
2795                    cpu_loop_exit(cpu);
2796                } else {
2797                    /* Force execution of one insn next time.  */
2798                    cpu->cflags_next_tb = 1 | curr_cflags();
2799                    mmap_unlock();
2800                    cpu_loop_exit_noexc(cpu);
2801                }
2802            }
2803        } else {
2804            wp->flags &= ~BP_WATCHPOINT_HIT;
2805        }
2806    }
2807}
2808
2809/* Watchpoint access routines.  Watchpoints are inserted using TLB tricks,
2810   so these check for a hit then pass through to the normal out-of-line
2811   phys routines.  */
2812static MemTxResult watch_mem_read(void *opaque, hwaddr addr, uint64_t *pdata,
2813                                  unsigned size, MemTxAttrs attrs)
2814{
2815    MemTxResult res;
2816    uint64_t data;
2817    int asidx = cpu_asidx_from_attrs(current_cpu, attrs);
2818    AddressSpace *as = current_cpu->cpu_ases[asidx].as;
2819
2820    check_watchpoint(addr & ~TARGET_PAGE_MASK, size, attrs, BP_MEM_READ);
2821    switch (size) {
2822    case 1:
2823        data = address_space_ldub(as, addr, attrs, &res);
2824        break;
2825    case 2:
2826        data = address_space_lduw(as, addr, attrs, &res);
2827        break;
2828    case 4:
2829        data = address_space_ldl(as, addr, attrs, &res);
2830        break;
2831    case 8:
2832        data = address_space_ldq(as, addr, attrs, &res);
2833        break;
2834    default: abort();
2835    }
2836    *pdata = data;
2837    return res;
2838}
2839
2840static MemTxResult watch_mem_write(void *opaque, hwaddr addr,
2841                                   uint64_t val, unsigned size,
2842                                   MemTxAttrs attrs)
2843{
2844    MemTxResult res;
2845    int asidx = cpu_asidx_from_attrs(current_cpu, attrs);
2846    AddressSpace *as = current_cpu->cpu_ases[asidx].as;
2847
2848    check_watchpoint(addr & ~TARGET_PAGE_MASK, size, attrs, BP_MEM_WRITE);
2849    switch (size) {
2850    case 1:
2851        address_space_stb(as, addr, val, attrs, &res);
2852        break;
2853    case 2:
2854        address_space_stw(as, addr, val, attrs, &res);
2855        break;
2856    case 4:
2857        address_space_stl(as, addr, val, attrs, &res);
2858        break;
2859    case 8:
2860        address_space_stq(as, addr, val, attrs, &res);
2861        break;
2862    default: abort();
2863    }
2864    return res;
2865}
2866
2867static const MemoryRegionOps watch_mem_ops = {
2868    .read_with_attrs = watch_mem_read,
2869    .write_with_attrs = watch_mem_write,
2870    .endianness = DEVICE_NATIVE_ENDIAN,
2871    .valid = {
2872        .min_access_size = 1,
2873        .max_access_size = 8,
2874        .unaligned = false,
2875    },
2876    .impl = {
2877        .min_access_size = 1,
2878        .max_access_size = 8,
2879        .unaligned = false,
2880    },
2881};
2882
2883static MemTxResult flatview_read(FlatView *fv, hwaddr addr,
2884                                 MemTxAttrs attrs, uint8_t *buf, hwaddr len);
2885static MemTxResult flatview_write(FlatView *fv, hwaddr addr, MemTxAttrs attrs,
2886                                  const uint8_t *buf, hwaddr len);
2887static bool flatview_access_valid(FlatView *fv, hwaddr addr, hwaddr len,
2888                                  bool is_write, MemTxAttrs attrs);
2889
2890static MemTxResult subpage_read(void *opaque, hwaddr addr, uint64_t *data,
2891                                unsigned len, MemTxAttrs attrs)
2892{
2893    subpage_t *subpage = opaque;
2894    uint8_t buf[8];
2895    MemTxResult res;
2896
2897#if defined(DEBUG_SUBPAGE)
2898    printf("%s: subpage %p len %u addr " TARGET_FMT_plx "\n", __func__,
2899           subpage, len, addr);
2900#endif
2901    res = flatview_read(subpage->fv, addr + subpage->base, attrs, buf, len);
2902    if (res) {
2903        return res;
2904    }
2905    *data = ldn_p(buf, len);
2906    return MEMTX_OK;
2907}
2908
2909static MemTxResult subpage_write(void *opaque, hwaddr addr,
2910                                 uint64_t value, unsigned len, MemTxAttrs attrs)
2911{
2912    subpage_t *subpage = opaque;
2913    uint8_t buf[8];
2914
2915#if defined(DEBUG_SUBPAGE)
2916    printf("%s: subpage %p len %u addr " TARGET_FMT_plx
2917           " value %"PRIx64"\n",
2918           __func__, subpage, len, addr, value);
2919#endif
2920    stn_p(buf, len, value);
2921    return flatview_write(subpage->fv, addr + subpage->base, attrs, buf, len);
2922}
2923
2924static bool subpage_accepts(void *opaque, hwaddr addr,
2925                            unsigned len, bool is_write,
2926                            MemTxAttrs attrs)
2927{
2928    subpage_t *subpage = opaque;
2929#if defined(DEBUG_SUBPAGE)
2930    printf("%s: subpage %p %c len %u addr " TARGET_FMT_plx "\n",
2931           __func__, subpage, is_write ? 'w' : 'r', len, addr);
2932#endif
2933
2934    return flatview_access_valid(subpage->fv, addr + subpage->base,
2935                                 len, is_write, attrs);
2936}
2937
2938static const MemoryRegionOps subpage_ops = {
2939    .read_with_attrs = subpage_read,
2940    .write_with_attrs = subpage_write,
2941    .impl.min_access_size = 1,
2942    .impl.max_access_size = 8,
2943    .valid.min_access_size = 1,
2944    .valid.max_access_size = 8,
2945    .valid.accepts = subpage_accepts,
2946    .endianness = DEVICE_NATIVE_ENDIAN,
2947};
2948
2949static int subpage_register (subpage_t *mmio, uint32_t start, uint32_t end,
2950                             uint16_t section)
2951{
2952    int idx, eidx;
2953
2954    if (start >= TARGET_PAGE_SIZE || end >= TARGET_PAGE_SIZE)
2955        return -1;
2956    idx = SUBPAGE_IDX(start);
2957    eidx = SUBPAGE_IDX(end);
2958#if defined(DEBUG_SUBPAGE)
2959    printf("%s: %p start %08x end %08x idx %08x eidx %08x section %d\n",
2960           __func__, mmio, start, end, idx, eidx, section);
2961#endif
2962    for (; idx <= eidx; idx++) {
2963        mmio->sub_section[idx] = section;
2964    }
2965
2966    return 0;
2967}
2968
2969static subpage_t *subpage_init(FlatView *fv, hwaddr base)
2970{
2971    subpage_t *mmio;
2972
2973    mmio = g_malloc0(sizeof(subpage_t) + TARGET_PAGE_SIZE * sizeof(uint16_t));
2974    mmio->fv = fv;
2975    mmio->base = base;
2976    memory_region_init_io(&mmio->iomem, NULL, &subpage_ops, mmio,
2977                          NULL, TARGET_PAGE_SIZE);
2978    mmio->iomem.subpage = true;
2979#if defined(DEBUG_SUBPAGE)
2980    printf("%s: %p base " TARGET_FMT_plx " len %08x\n", __func__,
2981           mmio, base, TARGET_PAGE_SIZE);
2982#endif
2983    subpage_register(mmio, 0, TARGET_PAGE_SIZE-1, PHYS_SECTION_UNASSIGNED);
2984
2985    return mmio;
2986}
2987
2988static uint16_t dummy_section(PhysPageMap *map, FlatView *fv, MemoryRegion *mr)
2989{
2990    assert(fv);
2991    MemoryRegionSection section = {
2992        .fv = fv,
2993        .mr = mr,
2994        .offset_within_address_space = 0,
2995        .offset_within_region = 0,
2996        .size = int128_2_64(),
2997    };
2998
2999    return phys_section_add(map, &section);
3000}
3001
3002static void readonly_mem_write(void *opaque, hwaddr addr,
3003                               uint64_t val, unsigned size)
3004{
3005    /* Ignore any write to ROM. */
3006}
3007
3008static bool readonly_mem_accepts(void *opaque, hwaddr addr,
3009                                 unsigned size, bool is_write,
3010                                 MemTxAttrs attrs)
3011{
3012    return is_write;
3013}
3014
3015/* This will only be used for writes, because reads are special cased
3016 * to directly access the underlying host ram.
3017 */
3018static const MemoryRegionOps readonly_mem_ops = {
3019    .write = readonly_mem_write,
3020    .valid.accepts = readonly_mem_accepts,
3021    .endianness = DEVICE_NATIVE_ENDIAN,
3022    .valid = {
3023        .min_access_size = 1,
3024        .max_access_size = 8,
3025        .unaligned = false,
3026    },
3027    .impl = {
3028        .min_access_size = 1,
3029        .max_access_size = 8,
3030        .unaligned = false,
3031    },
3032};
3033
3034MemoryRegionSection *iotlb_to_section(CPUState *cpu,
3035                                      hwaddr index, MemTxAttrs attrs)
3036{
3037    int asidx = cpu_asidx_from_attrs(cpu, attrs);
3038    CPUAddressSpace *cpuas = &cpu->cpu_ases[asidx];
3039    AddressSpaceDispatch *d = atomic_rcu_read(&cpuas->memory_dispatch);
3040    MemoryRegionSection *sections = d->map.sections;
3041
3042    return &sections[index & ~TARGET_PAGE_MASK];
3043}
3044
3045static void io_mem_init(void)
3046{
3047    memory_region_init_io(&io_mem_rom, NULL, &readonly_mem_ops,
3048                          NULL, NULL, UINT64_MAX);
3049    memory_region_init_io(&io_mem_unassigned, NULL, &unassigned_mem_ops, NULL,
3050                          NULL, UINT64_MAX);
3051
3052    /* io_mem_notdirty calls tb_invalidate_phys_page_fast,
3053     * which can be called without the iothread mutex.
3054     */
3055    memory_region_init_io(&io_mem_notdirty, NULL, &notdirty_mem_ops, NULL,
3056                          NULL, UINT64_MAX);
3057    memory_region_clear_global_locking(&io_mem_notdirty);
3058
3059    memory_region_init_io(&io_mem_watch, NULL, &watch_mem_ops, NULL,
3060                          NULL, UINT64_MAX);
3061}
3062
3063AddressSpaceDispatch *address_space_dispatch_new(FlatView *fv)
3064{
3065    AddressSpaceDispatch *d = g_new0(AddressSpaceDispatch, 1);
3066    uint16_t n;
3067
3068    n = dummy_section(&d->map, fv, &io_mem_unassigned);
3069    assert(n == PHYS_SECTION_UNASSIGNED);
3070    n = dummy_section(&d->map, fv, &io_mem_notdirty);
3071    assert(n == PHYS_SECTION_NOTDIRTY);
3072    n = dummy_section(&d->map, fv, &io_mem_rom);
3073    assert(n == PHYS_SECTION_ROM);
3074    n = dummy_section(&d->map, fv, &io_mem_watch);
3075    assert(n == PHYS_SECTION_WATCH);
3076
3077    d->phys_map  = (PhysPageEntry) { .ptr = PHYS_MAP_NODE_NIL, .skip = 1 };
3078
3079    return d;
3080}
3081
3082void address_space_dispatch_free(AddressSpaceDispatch *d)
3083{
3084    phys_sections_free(&d->map);
3085    g_free(d);
3086}
3087
3088static void tcg_commit(MemoryListener *listener)
3089{
3090    CPUAddressSpace *cpuas;
3091    AddressSpaceDispatch *d;
3092
3093    assert(tcg_enabled());
3094    /* since each CPU stores ram addresses in its TLB cache, we must
3095       reset the modified entries */
3096    cpuas = container_of(listener, CPUAddressSpace, tcg_as_listener);
3097    cpu_reloading_memory_map();
3098    /* The CPU and TLB are protected by the iothread lock.
3099     * We reload the dispatch pointer now because cpu_reloading_memory_map()
3100     * may have split the RCU critical section.
3101     */
3102    d = address_space_to_dispatch(cpuas->as);
3103    atomic_rcu_set(&cpuas->memory_dispatch, d);
3104    tlb_flush(cpuas->cpu);
3105}
3106
3107static void memory_map_init(void)
3108{
3109    system_memory = g_malloc(sizeof(*system_memory));
3110
3111    memory_region_init(system_memory, NULL, "system", UINT64_MAX);
3112    address_space_init(&address_space_memory, system_memory, "memory");
3113
3114    system_io = g_malloc(sizeof(*system_io));
3115    memory_region_init_io(system_io, NULL, &unassigned_io_ops, NULL, "io",
3116                          65536);
3117    address_space_init(&address_space_io, system_io, "I/O");
3118}
3119
3120MemoryRegion *get_system_memory(void)
3121{
3122    return system_memory;
3123}
3124
3125MemoryRegion *get_system_io(void)
3126{
3127    return system_io;
3128}
3129
3130#endif /* !defined(CONFIG_USER_ONLY) */
3131
3132/* physical memory access (slow version, mainly for debug) */
3133#if defined(CONFIG_USER_ONLY)
3134int cpu_memory_rw_debug(CPUState *cpu, target_ulong addr,
3135                        uint8_t *buf, target_ulong len, int is_write)
3136{
3137    int flags;
3138    target_ulong l, page;
3139    void * p;
3140
3141    while (len > 0) {
3142        page = addr & TARGET_PAGE_MASK;
3143        l = (page + TARGET_PAGE_SIZE) - addr;
3144        if (l > len)
3145            l = len;
3146        flags = page_get_flags(page);
3147        if (!(flags & PAGE_VALID))
3148            return -1;
3149        if (is_write) {
3150            if (!(flags & PAGE_WRITE))
3151                return -1;
3152            /* XXX: this code should not depend on lock_user */
3153            if (!(p = lock_user(VERIFY_WRITE, addr, l, 0)))
3154                return -1;
3155            memcpy(p, buf, l);
3156            unlock_user(p, addr, l);
3157        } else {
3158            if (!(flags & PAGE_READ))
3159                return -1;
3160            /* XXX: this code should not depend on lock_user */
3161            if (!(p = lock_user(VERIFY_READ, addr, l, 1)))
3162                return -1;
3163            memcpy(buf, p, l);
3164            unlock_user(p, addr, 0);
3165        }
3166        len -= l;
3167        buf += l;
3168        addr += l;
3169    }
3170    return 0;
3171}
3172
3173#else
3174
3175static void invalidate_and_set_dirty(MemoryRegion *mr, hwaddr addr,
3176                                     hwaddr length)
3177{
3178    uint8_t dirty_log_mask = memory_region_get_dirty_log_mask(mr);
3179    addr += memory_region_get_ram_addr(mr);
3180
3181    /* No early return if dirty_log_mask is or becomes 0, because
3182     * cpu_physical_memory_set_dirty_range will still call
3183     * xen_modified_memory.
3184     */
3185    if (dirty_log_mask) {
3186        dirty_log_mask =
3187            cpu_physical_memory_range_includes_clean(addr, length, dirty_log_mask);
3188    }
3189    if (dirty_log_mask & (1 << DIRTY_MEMORY_CODE)) {
3190        assert(tcg_enabled());
3191        tb_invalidate_phys_range(addr, addr + length);
3192        dirty_log_mask &= ~(1 << DIRTY_MEMORY_CODE);
3193    }
3194    cpu_physical_memory_set_dirty_range(addr, length, dirty_log_mask);
3195}
3196
3197void memory_region_flush_rom_device(MemoryRegion *mr, hwaddr addr, hwaddr size)
3198{
3199    /*
3200     * In principle this function would work on other memory region types too,
3201     * but the ROM device use case is the only one where this operation is
3202     * necessary.  Other memory regions should use the
3203     * address_space_read/write() APIs.
3204     */
3205    assert(memory_region_is_romd(mr));
3206
3207    invalidate_and_set_dirty(mr, addr, size);
3208}
3209
3210static int memory_access_size(MemoryRegion *mr, unsigned l, hwaddr addr)
3211{
3212    unsigned access_size_max = mr->ops->valid.max_access_size;
3213
3214    /* Regions are assumed to support 1-4 byte accesses unless
3215       otherwise specified.  */
3216    if (access_size_max == 0) {
3217        access_size_max = 4;
3218    }
3219
3220    /* Bound the maximum access by the alignment of the address.  */
3221    if (!mr->ops->impl.unaligned) {
3222        unsigned align_size_max = addr & -addr;
3223        if (align_size_max != 0 && align_size_max < access_size_max) {
3224            access_size_max = align_size_max;
3225        }
3226    }
3227
3228    /* Don't attempt accesses larger than the maximum.  */
3229    if (l > access_size_max) {
3230        l = access_size_max;
3231    }
3232    l = pow2floor(l);
3233
3234    return l;
3235}
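/*
 * Worked example (editorial): for a region whose ops declare
 * valid.max_access_size == 4 and impl.unaligned == false, an 8-byte access
 * at address 0x1006 is clamped first to 4 (the region maximum) and then to
 * 2 (the largest power of two dividing the address); the callers below
 * issue the remaining bytes as further, narrower accesses.
 */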
3236
3237static bool prepare_mmio_access(MemoryRegion *mr)
3238{
3239    bool unlocked = !qemu_mutex_iothread_locked();
3240    bool release_lock = false;
3241
3242    if (unlocked && mr->global_locking) {
3243        qemu_mutex_lock_iothread();
3244        unlocked = false;
3245        release_lock = true;
3246    }
3247    if (mr->flush_coalesced_mmio) {
3248        if (unlocked) {
3249            qemu_mutex_lock_iothread();
3250        }
3251        qemu_flush_coalesced_mmio_buffer();
3252        if (unlocked) {
3253            qemu_mutex_unlock_iothread();
3254        }
3255    }
3256
3257    return release_lock;
3258}
3259
3260/* Called within RCU critical section.  */
3261static MemTxResult flatview_write_continue(FlatView *fv, hwaddr addr,
3262                                           MemTxAttrs attrs,
3263                                           const uint8_t *buf,
3264                                           hwaddr len, hwaddr addr1,
3265                                           hwaddr l, MemoryRegion *mr)
3266{
3267    uint8_t *ptr;
3268    uint64_t val;
3269    MemTxResult result = MEMTX_OK;
3270    bool release_lock = false;
3271
3272    for (;;) {
3273        if (!memory_access_is_direct(mr, true)) {
3274            release_lock |= prepare_mmio_access(mr);
3275            l = memory_access_size(mr, l, addr1);
3276            /* XXX: could force current_cpu to NULL to avoid
3277               potential bugs */
3278            val = ldn_p(buf, l);
3279            result |= memory_region_dispatch_write(mr, addr1, val, l, attrs);
3280        } else {
3281            /* RAM case */
3282            ptr = qemu_ram_ptr_length(mr->ram_block, addr1, &l, false);
3283            memcpy(ptr, buf, l);
3284            invalidate_and_set_dirty(mr, addr1, l);
3285        }
3286
3287        if (release_lock) {
3288            qemu_mutex_unlock_iothread();
3289            release_lock = false;
3290        }
3291
3292        len -= l;
3293        buf += l;
3294        addr += l;
3295
3296        if (!len) {
3297            break;
3298        }
3299
3300        l = len;
3301        mr = flatview_translate(fv, addr, &addr1, &l, true, attrs);
3302    }
3303
3304    return result;
3305}
3306
3307/* Called from RCU critical section.  */
3308static MemTxResult flatview_write(FlatView *fv, hwaddr addr, MemTxAttrs attrs,
3309                                  const uint8_t *buf, hwaddr len)
3310{
3311    hwaddr l;
3312    hwaddr addr1;
3313    MemoryRegion *mr;
3314    MemTxResult result = MEMTX_OK;
3315
3316    l = len;
3317    mr = flatview_translate(fv, addr, &addr1, &l, true, attrs);
3318    result = flatview_write_continue(fv, addr, attrs, buf, len,
3319                                     addr1, l, mr);
3320
3321    return result;
3322}
3323
3324/* Called within RCU critical section.  */
3325MemTxResult flatview_read_continue(FlatView *fv, hwaddr addr,
3326                                   MemTxAttrs attrs, uint8_t *buf,
3327                                   hwaddr len, hwaddr addr1, hwaddr l,
3328                                   MemoryRegion *mr)
3329{
3330    uint8_t *ptr;
3331    uint64_t val;
3332    MemTxResult result = MEMTX_OK;
3333    bool release_lock = false;
3334
3335    for (;;) {
3336        if (!memory_access_is_direct(mr, false)) {
3337            /* I/O case */
3338            release_lock |= prepare_mmio_access(mr);
3339            l = memory_access_size(mr, l, addr1);
3340            result |= memory_region_dispatch_read(mr, addr1, &val, l, attrs);
3341            stn_p(buf, l, val);
3342        } else {
3343            /* RAM case */
3344            ptr = qemu_ram_ptr_length(mr->ram_block, addr1, &l, false);
3345            memcpy(buf, ptr, l);
3346        }
3347
3348        if (release_lock) {
3349            qemu_mutex_unlock_iothread();
3350            release_lock = false;
3351        }
3352
3353        len -= l;
3354        buf += l;
3355        addr += l;
3356
3357        if (!len) {
3358            break;
3359        }
3360
3361        l = len;
3362        mr = flatview_translate(fv, addr, &addr1, &l, false, attrs);
3363    }
3364
3365    return result;
3366}
3367
3368/* Called from RCU critical section.  */
3369static MemTxResult flatview_read(FlatView *fv, hwaddr addr,
3370                                 MemTxAttrs attrs, uint8_t *buf, hwaddr len)
3371{
3372    hwaddr l;
3373    hwaddr addr1;
3374    MemoryRegion *mr;
3375
3376    l = len;
3377    mr = flatview_translate(fv, addr, &addr1, &l, false, attrs);
3378    return flatview_read_continue(fv, addr, attrs, buf, len,
3379                                  addr1, l, mr);
3380}
3381
3382MemTxResult address_space_read_full(AddressSpace *as, hwaddr addr,
3383                                    MemTxAttrs attrs, uint8_t *buf, hwaddr len)
3384{
3385    MemTxResult result = MEMTX_OK;
3386    FlatView *fv;
3387
3388    if (len > 0) {
3389        rcu_read_lock();
3390        fv = address_space_to_flatview(as);
3391        result = flatview_read(fv, addr, attrs, buf, len);
3392        rcu_read_unlock();
3393    }
3394
3395    return result;
3396}
3397
3398MemTxResult address_space_write(AddressSpace *as, hwaddr addr,
3399                                MemTxAttrs attrs,
3400                                const uint8_t *buf, hwaddr len)
3401{
3402    MemTxResult result = MEMTX_OK;
3403    FlatView *fv;
3404
3405    if (len > 0) {
3406        rcu_read_lock();
3407        fv = address_space_to_flatview(as);
3408        result = flatview_write(fv, addr, attrs, buf, len);
3409        rcu_read_unlock();
3410    }
3411
3412    return result;
3413}
3414
3415MemTxResult address_space_rw(AddressSpace *as, hwaddr addr, MemTxAttrs attrs,
3416                             uint8_t *buf, hwaddr len, bool is_write)
3417{
3418    if (is_write) {
3419        return address_space_write(as, addr, attrs, buf, len);
3420    } else {
3421        return address_space_read_full(as, addr, attrs, buf, len);
3422    }
3423}
3424
3425void cpu_physical_memory_rw(hwaddr addr, uint8_t *buf,
3426                            hwaddr len, int is_write)
3427{
3428    address_space_rw(&address_space_memory, addr, MEMTXATTRS_UNSPECIFIED,
3429                     buf, len, is_write);
3430}
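
/*
 * Illustrative sketch only (not compiled): how callers typically drive the
 * two entry points above.  The guest address 0x1000 and the example_* name
 * are arbitrary, hypothetical values.
 */
#if 0
static void example_phys_rw(void)
{
    uint8_t buf[4];
    MemTxResult res;

    /* Convenience wrapper: system address space, unspecified attributes,
     * transaction failures are ignored. */
    cpu_physical_memory_rw(0x1000, buf, sizeof(buf), 0);

    /* The attrs-aware variant reports bus errors back to the caller. */
    res = address_space_rw(&address_space_memory, 0x1000,
                           MEMTXATTRS_UNSPECIFIED, buf, sizeof(buf), true);
    if (res != MEMTX_OK) {
        /* handle the failed transaction */
    }
}
#endif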
3431
3432enum write_rom_type {
3433    WRITE_DATA,
3434    FLUSH_CACHE,
3435};
3436
3437static inline MemTxResult address_space_write_rom_internal(AddressSpace *as,
3438                                                           hwaddr addr,
3439                                                           MemTxAttrs attrs,
3440                                                           const uint8_t *buf,
3441                                                           hwaddr len,
3442                                                           enum write_rom_type type)
3443{
3444    hwaddr l;
3445    uint8_t *ptr;
3446    hwaddr addr1;
3447    MemoryRegion *mr;
3448
3449    rcu_read_lock();
3450    while (len > 0) {
3451        l = len;
3452        mr = address_space_translate(as, addr, &addr1, &l, true, attrs);
3453
3454        if (!(memory_region_is_ram(mr) ||
3455              memory_region_is_romd(mr))) {
3456            l = memory_access_size(mr, l, addr1);
3457        } else {
3458            /* ROM/RAM case */
3459            ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
3460            switch (type) {
3461            case WRITE_DATA:
3462                memcpy(ptr, buf, l);
3463                invalidate_and_set_dirty(mr, addr1, l);
3464                break;
3465            case FLUSH_CACHE:
3466                flush_icache_range((uintptr_t)ptr, (uintptr_t)ptr + l);
3467                break;
3468            }
3469        }
3470        len -= l;
3471        buf += l;
3472        addr += l;
3473    }
3474    rcu_read_unlock();
3475    return MEMTX_OK;
3476}
3477
3478/* Used for ROM loading: can write to both RAM and ROM. */
3479MemTxResult address_space_write_rom(AddressSpace *as, hwaddr addr,
3480                                    MemTxAttrs attrs,
3481                                    const uint8_t *buf, hwaddr len)
3482{
3483    return address_space_write_rom_internal(as, addr, attrs,
3484                                            buf, len, WRITE_DATA);
3485}
3486
3487void cpu_flush_icache_range(hwaddr start, hwaddr len)
3488{
3489    /*
3490     * This function should do the same thing as an icache flush that was
3491     * triggered from within the guest. For TCG we are always cache coherent,
3492     * so there is no need to flush anything. For KVM / Xen we need to flush
3493     * the host's instruction cache at least.
3494     */
3495    if (tcg_enabled()) {
3496        return;
3497    }
3498
3499    address_space_write_rom_internal(&address_space_memory,
3500                                     start, MEMTXATTRS_UNSPECIFIED,
3501                                     NULL, len, FLUSH_CACHE);
3502}
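
/*
 * Illustrative sketch only (not compiled): the usual ROM-loading pattern,
 * writing code through ROM/ROMD regions and then flushing the host icache
 * for accelerators that need it.  The destination address, blob and
 * example_* name are hypothetical.
 */
#if 0
static void example_install_boot_stub(hwaddr dest, const uint8_t *blob,
                                      hwaddr size)
{
    address_space_write_rom(&address_space_memory, dest,
                            MEMTXATTRS_UNSPECIFIED, blob, size);
    /* No-op under TCG; required for KVM/Xen as described above. */
    cpu_flush_icache_range(dest, size);
}
#endif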
3503
3504typedef struct {
3505    MemoryRegion *mr;
3506    void *buffer;
3507    hwaddr addr;
3508    hwaddr len;
3509    bool in_use;
3510} BounceBuffer;
3511
3512static BounceBuffer bounce;
3513
3514typedef struct MapClient {
3515    QEMUBH *bh;
3516    QLIST_ENTRY(MapClient) link;
3517} MapClient;
3518
3519QemuMutex map_client_list_lock;
3520static QLIST_HEAD(, MapClient) map_client_list
3521    = QLIST_HEAD_INITIALIZER(map_client_list);
3522
3523static void cpu_unregister_map_client_do(MapClient *client)
3524{
3525    QLIST_REMOVE(client, link);
3526    g_free(client);
3527}
3528
3529static void cpu_notify_map_clients_locked(void)
3530{
3531    MapClient *client;
3532
3533    while (!QLIST_EMPTY(&map_client_list)) {
3534        client = QLIST_FIRST(&map_client_list);
3535        qemu_bh_schedule(client->bh);
3536        cpu_unregister_map_client_do(client);
3537    }
3538}
3539
3540void cpu_register_map_client(QEMUBH *bh)
3541{
3542    MapClient *client = g_malloc(sizeof(*client));
3543
3544    qemu_mutex_lock(&map_client_list_lock);
3545    client->bh = bh;
3546    QLIST_INSERT_HEAD(&map_client_list, client, link);
3547    if (!atomic_read(&bounce.in_use)) {
3548        cpu_notify_map_clients_locked();
3549    }
3550    qemu_mutex_unlock(&map_client_list_lock);
3551}
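
/*
 * Illustrative sketch only (not compiled): a device that failed an
 * address_space_map() call can register a bottom half to be scheduled once
 * the bounce buffer is released and a retry is worth attempting.  The
 * example_* names are hypothetical.
 */
#if 0
static void example_retry_map_bh(void *opaque)
{
    /* ... retry address_space_map() from here ... */
}

static void example_wait_for_mapping(void)
{
    QEMUBH *bh = qemu_bh_new(example_retry_map_bh, NULL);

    cpu_register_map_client(bh);
}
#endif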
3552
3553void cpu_exec_init_all(void)
3554{
3555    qemu_mutex_init(&ram_list.mutex);
3556    /* The data structures we set up here depend on knowing the page size,
3557     * so no more changes can be made after this point.
3558     * In an ideal world, nothing we did before we had finished the
3559     * machine setup would care about the target page size, and we could
3560     * do this much later, rather than requiring board models to state
3561     * up front what their requirements are.
3562     */
3563    finalize_target_page_bits();
3564    io_mem_init();
3565    memory_map_init();
3566    qemu_mutex_init(&map_client_list_lock);
3567}
3568
3569void cpu_unregister_map_client(QEMUBH *bh)
3570{
3571    MapClient *client;
3572
3573    qemu_mutex_lock(&map_client_list_lock);
3574    QLIST_FOREACH(client, &map_client_list, link) {
3575        if (client->bh == bh) {
3576            cpu_unregister_map_client_do(client);
3577            break;
3578        }
3579    }
3580    qemu_mutex_unlock(&map_client_list_lock);
3581}
3582
3583static void cpu_notify_map_clients(void)
3584{
3585    qemu_mutex_lock(&map_client_list_lock);
3586    cpu_notify_map_clients_locked();
3587    qemu_mutex_unlock(&map_client_list_lock);
3588}
3589
3590static bool flatview_access_valid(FlatView *fv, hwaddr addr, hwaddr len,
3591                                  bool is_write, MemTxAttrs attrs)
3592{
3593    MemoryRegion *mr;
3594    hwaddr l, xlat;
3595
3596    while (len > 0) {
3597        l = len;
3598        mr = flatview_translate(fv, addr, &xlat, &l, is_write, attrs);
3599        if (!memory_access_is_direct(mr, is_write)) {
3600            l = memory_access_size(mr, l, addr);
3601            if (!memory_region_access_valid(mr, xlat, l, is_write, attrs)) {
3602                return false;
3603            }
3604        }
3605
3606        len -= l;
3607        addr += l;
3608    }
3609    return true;
3610}
3611
3612bool address_space_access_valid(AddressSpace *as, hwaddr addr,
3613                                hwaddr len, bool is_write,
3614                                MemTxAttrs attrs)
3615{
3616    FlatView *fv;
3617    bool result;
3618
3619    rcu_read_lock();
3620    fv = address_space_to_flatview(as);
3621    result = flatview_access_valid(fv, addr, len, is_write, attrs);
3622    rcu_read_unlock();
3623    return result;
3624}
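
/*
 * Illustrative sketch only (not compiled): validating a DMA window before
 * committing to a transfer, so unassigned or non-writable regions are
 * rejected up front rather than mid-copy.  The example_* name is
 * hypothetical.
 */
#if 0
static bool example_dma_window_ok(AddressSpace *as, hwaddr addr, hwaddr len)
{
    return address_space_access_valid(as, addr, len, true,
                                      MEMTXATTRS_UNSPECIFIED);
}
#endif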
3625
3626static hwaddr
3627flatview_extend_translation(FlatView *fv, hwaddr addr,
3628                            hwaddr target_len,
3629                            MemoryRegion *mr, hwaddr base, hwaddr len,
3630                            bool is_write, MemTxAttrs attrs)
3631{
3632    hwaddr done = 0;
3633    hwaddr xlat;
3634    MemoryRegion *this_mr;
3635
3636    for (;;) {
3637        target_len -= len;
3638        addr += len;
3639        done += len;
3640        if (target_len == 0) {
3641            return done;
3642        }
3643
3644        len = target_len;
3645        this_mr = flatview_translate(fv, addr, &xlat,
3646                                     &len, is_write, attrs);
3647        if (this_mr != mr || xlat != base + done) {
3648            return done;
3649        }
3650    }
3651}
3652
3653/* Map a physical memory region into a host virtual address.
3654 * May map a subset of the requested range, given by and returned in *plen.
3655 * May return NULL if resources needed to perform the mapping are exhausted.
3656 * Use only for reads OR writes - not for read-modify-write operations.
3657 * Use cpu_register_map_client() to know when retrying the map operation is
3658 * likely to succeed.
3659 */
3660void *address_space_map(AddressSpace *as,
3661                        hwaddr addr,
3662                        hwaddr *plen,
3663                        bool is_write,
3664                        MemTxAttrs attrs)
3665{
3666    hwaddr len = *plen;
3667    hwaddr l, xlat;
3668    MemoryRegion *mr;
3669    void *ptr;
3670    FlatView *fv;
3671
3672    if (len == 0) {
3673        return NULL;
3674    }
3675
3676    l = len;
3677    rcu_read_lock();
3678    fv = address_space_to_flatview(as);
3679    mr = flatview_translate(fv, addr, &xlat, &l, is_write, attrs);
3680
3681    if (!memory_access_is_direct(mr, is_write)) {
3682        if (atomic_xchg(&bounce.in_use, true)) {
3683            rcu_read_unlock();
3684            return NULL;
3685        }
3686        /* Avoid unbounded allocations */
3687        l = MIN(l, TARGET_PAGE_SIZE);
3688        bounce.buffer = qemu_memalign(TARGET_PAGE_SIZE, l);
3689        bounce.addr = addr;
3690        bounce.len = l;
3691
3692        memory_region_ref(mr);
3693        bounce.mr = mr;
3694        if (!is_write) {
3695            flatview_read(fv, addr, MEMTXATTRS_UNSPECIFIED,
3696                          bounce.buffer, l);
3697        }
3698
3699        rcu_read_unlock();
3700        *plen = l;
3701        return bounce.buffer;
3702    }
3703
3704
3705    memory_region_ref(mr);
3706    *plen = flatview_extend_translation(fv, addr, len, mr, xlat,
3707                                        l, is_write, attrs);
3708    ptr = qemu_ram_ptr_length(mr->ram_block, xlat, plen, true);
3709    rcu_read_unlock();
3710
3711    return ptr;
3712}
3713
3714/* Unmaps a memory region previously mapped by address_space_map().
3715 * Will also mark the memory as dirty if is_write == 1.  access_len gives
3716 * the amount of memory that was actually read or written by the caller.
3717 */
3718void address_space_unmap(AddressSpace *as, void *buffer, hwaddr len,
3719                         int is_write, hwaddr access_len)
3720{
3721    if (buffer != bounce.buffer) {
3722        MemoryRegion *mr;
3723        ram_addr_t addr1;
3724
3725        mr = memory_region_from_host(buffer, &addr1);
3726        assert(mr != NULL);
3727        if (is_write) {
3728            invalidate_and_set_dirty(mr, addr1, access_len);
3729        }
3730        if (xen_enabled()) {
3731            xen_invalidate_map_cache_entry(buffer);
3732        }
3733        memory_region_unref(mr);
3734        return;
3735    }
3736    if (is_write) {
3737        address_space_write(as, bounce.addr, MEMTXATTRS_UNSPECIFIED,
3738                            bounce.buffer, access_len);
3739    }
3740    qemu_vfree(bounce.buffer);
3741    bounce.buffer = NULL;
3742    memory_region_unref(bounce.mr);
3743    atomic_mb_set(&bounce.in_use, false);
3744    cpu_notify_map_clients();
3745}
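
/*
 * Illustrative sketch only (not compiled): the map/use/unmap pattern the two
 * functions above implement.  Only the first *plen bytes are guaranteed to
 * be mapped, and access_len reports how much was really touched so dirty
 * tracking stays accurate.  The example_* name is hypothetical.
 */
#if 0
static void example_zero_guest_buffer(AddressSpace *as, hwaddr addr,
                                      hwaddr size)
{
    hwaddr plen = size;
    void *host;

    host = address_space_map(as, addr, &plen, true, MEMTXATTRS_UNSPECIFIED);
    if (!host) {
        /* Bounce buffer busy: register a map client and retry later. */
        return;
    }

    memset(host, 0, plen);

    address_space_unmap(as, host, plen, true, plen);
}
#endif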
3746
3747void *cpu_physical_memory_map(hwaddr addr,
3748                              hwaddr *plen,
3749                              int is_write)
3750{
3751    return address_space_map(&address_space_memory, addr, plen, is_write,
3752                             MEMTXATTRS_UNSPECIFIED);
3753}
3754
3755void cpu_physical_memory_unmap(void *buffer, hwaddr len,
3756                               int is_write, hwaddr access_len)
3757{
3758    address_space_unmap(&address_space_memory, buffer, len, is_write, access_len);
3759}
3760
3761#define ARG1_DECL                AddressSpace *as
3762#define ARG1                     as
3763#define SUFFIX
3764#define TRANSLATE(...)           address_space_translate(as, __VA_ARGS__)
3765#define RCU_READ_LOCK(...)       rcu_read_lock()
3766#define RCU_READ_UNLOCK(...)     rcu_read_unlock()
3767#include "memory_ldst.inc.c"
3768
3769int64_t address_space_cache_init(MemoryRegionCache *cache,
3770                                 AddressSpace *as,
3771                                 hwaddr addr,
3772                                 hwaddr len,
3773                                 bool is_write)
3774{
3775    AddressSpaceDispatch *d;
3776    hwaddr l;
3777    MemoryRegion *mr;
3778
3779    assert(len > 0);
3780
3781    l = len;
3782    cache->fv = address_space_get_flatview(as);
3783    d = flatview_to_dispatch(cache->fv);
3784    cache->mrs = *address_space_translate_internal(d, addr, &cache->xlat, &l, true);
3785
3786    mr = cache->mrs.mr;
3787    memory_region_ref(mr);
3788    if (memory_access_is_direct(mr, is_write)) {
3789        /* We don't care about the memory attributes here as we're only
3790         * doing this if we found actual RAM, which behaves the same
3791         * regardless of attributes; so UNSPECIFIED is fine.
3792         */
3793        l = flatview_extend_translation(cache->fv, addr, len, mr,
3794                                        cache->xlat, l, is_write,
3795                                        MEMTXATTRS_UNSPECIFIED);
3796        cache->ptr = qemu_ram_ptr_length(mr->ram_block, cache->xlat, &l, true);
3797    } else {
3798        cache->ptr = NULL;
3799    }
3800
3801    cache->len = l;
3802    cache->is_write = is_write;
3803    return l;
3804}
3805
3806void address_space_cache_invalidate(MemoryRegionCache *cache,
3807                                    hwaddr addr,
3808                                    hwaddr access_len)
3809{
3810    assert(cache->is_write);
3811    if (likely(cache->ptr)) {
3812        invalidate_and_set_dirty(cache->mrs.mr, addr + cache->xlat, access_len);
3813    }
3814}
3815
3816void address_space_cache_destroy(MemoryRegionCache *cache)
3817{
3818    if (!cache->mrs.mr) {
3819        return;
3820    }
3821
3822    if (xen_enabled()) {
3823        xen_invalidate_map_cache_entry(cache->ptr);
3824    }
3825    memory_region_unref(cache->mrs.mr);
3826    flatview_unref(cache->fv);
3827    cache->mrs.mr = NULL;
3828    cache->fv = NULL;
3829}
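
/*
 * Illustrative sketch only (not compiled): caching a small, frequently
 * accessed guest region (e.g. a ring index) so repeated accesses skip the
 * FlatView walk.  The base address, the 0x1000 window size and the
 * example_* name are arbitrary; the *_cached accessors are the helpers
 * declared alongside this API in exec/memory.h.
 */
#if 0
static void example_bump_ring_index(AddressSpace *as, hwaddr ring_base)
{
    MemoryRegionCache cache = MEMORY_REGION_CACHE_INVALID;
    uint16_t idx;

    address_space_cache_init(&cache, as, ring_base, 0x1000, true);

    address_space_read_cached(&cache, 0, &idx, sizeof(idx));
    idx++;
    address_space_write_cached(&cache, 0, &idx, sizeof(idx));

    /* Dirty tracking is deferred until the caller reports the access size. */
    address_space_cache_invalidate(&cache, 0, sizeof(idx));
    address_space_cache_destroy(&cache);
}
#endif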
3830
3831/* Called from RCU critical section.  This function has the same
3832 * semantics as address_space_translate, but it only works on a
3833 * predefined range of a MemoryRegion that was mapped with
3834 * address_space_cache_init.
3835 */
3836static inline MemoryRegion *address_space_translate_cached(
3837    MemoryRegionCache *cache, hwaddr addr, hwaddr *xlat,
3838    hwaddr *plen, bool is_write, MemTxAttrs attrs)
3839{
3840    MemoryRegionSection section;
3841    MemoryRegion *mr;
3842    IOMMUMemoryRegion *iommu_mr;
3843    AddressSpace *target_as;
3844
3845    assert(!cache->ptr);
3846    *xlat = addr + cache->xlat;
3847
3848    mr = cache->mrs.mr;
3849    iommu_mr = memory_region_get_iommu(mr);
3850    if (!iommu_mr) {
3851        /* MMIO region.  */
3852        return mr;
3853    }
3854
3855    section = address_space_translate_iommu(iommu_mr, xlat, plen,
3856                                            NULL, is_write, true,
3857                                            &target_as, attrs);
3858    return section.mr;
3859}
3860
3861/* Called from RCU critical section. address_space_read_cached uses this
3862 * out-of-line function when the target is an MMIO or IOMMU region.
3863 */
3864void
3865address_space_read_cached_slow(MemoryRegionCache *cache, hwaddr addr,
3866                                   void *buf, hwaddr len)
3867{
3868    hwaddr addr1, l;
3869    MemoryRegion *mr;
3870
3871    l = len;
3872    mr = address_space_translate_cached(cache, addr, &addr1, &l, false,
3873                                        MEMTXATTRS_UNSPECIFIED);
3874    flatview_read_continue(cache->fv,
3875                           addr, MEMTXATTRS_UNSPECIFIED, buf, len,
3876                           addr1, l, mr);
3877}
3878
3879/* Called from RCU critical section. address_space_write_cached uses this
3880 * out-of-line function when the target is an MMIO or IOMMU region.
3881 */
3882void
3883address_space_write_cached_slow(MemoryRegionCache *cache, hwaddr addr,
3884                                    const void *buf, hwaddr len)
3885{
3886    hwaddr addr1, l;
3887    MemoryRegion *mr;
3888
3889    l = len;
3890    mr = address_space_translate_cached(cache, addr, &addr1, &l, true,
3891                                        MEMTXATTRS_UNSPECIFIED);
3892    flatview_write_continue(cache->fv,
3893                            addr, MEMTXATTRS_UNSPECIFIED, buf, len,
3894                            addr1, l, mr);
3895}
3896
3897#define ARG1_DECL                MemoryRegionCache *cache
3898#define ARG1                     cache
3899#define SUFFIX                   _cached_slow
3900#define TRANSLATE(...)           address_space_translate_cached(cache, __VA_ARGS__)
3901#define RCU_READ_LOCK()          ((void)0)
3902#define RCU_READ_UNLOCK()        ((void)0)
3903#include "memory_ldst.inc.c"
3904
3905/* virtual memory access for debug (includes writing to ROM) */
3906int cpu_memory_rw_debug(CPUState *cpu, target_ulong addr,
3907                        uint8_t *buf, target_ulong len, int is_write)
3908{
3909    hwaddr phys_addr;
3910    target_ulong l, page;
3911
3912    cpu_synchronize_state(cpu);
3913    while (len > 0) {
3914        int asidx;
3915        MemTxAttrs attrs;
3916
3917        page = addr & TARGET_PAGE_MASK;
3918        phys_addr = cpu_get_phys_page_attrs_debug(cpu, page, &attrs);
3919        asidx = cpu_asidx_from_attrs(cpu, attrs);
3920        /* if no physical page mapped, return an error */
3921        if (phys_addr == -1)
3922            return -1;
3923        l = (page + TARGET_PAGE_SIZE) - addr;
3924        if (l > len)
3925            l = len;
3926        phys_addr += (addr & ~TARGET_PAGE_MASK);
3927        if (is_write) {
3928            address_space_write_rom(cpu->cpu_ases[asidx].as, phys_addr,
3929                                    attrs, buf, l);
3930        } else {
3931            address_space_rw(cpu->cpu_ases[asidx].as, phys_addr,
3932                             attrs, buf, l, 0);
3933        }
3934        len -= l;
3935        buf += l;
3936        addr += l;
3937    }
3938    return 0;
3939}
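
/*
 * Illustrative sketch only (not compiled): how a debugger front end (e.g.
 * the gdbstub or monitor) reads guest virtual memory, detecting the case
 * where no physical page is mapped.  The example_* name is hypothetical.
 */
#if 0
static void example_debug_peek(CPUState *cpu, target_ulong vaddr)
{
    uint8_t bytes[4];

    if (cpu_memory_rw_debug(cpu, vaddr, bytes, sizeof(bytes), 0) < 0) {
        /* vaddr is not mapped in the guest's page tables */
    }
}
#endif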
3940
3941/*
3942 * Allows code that needs to deal with migration bitmaps etc. to still be
3943 * built target-independent.
3944 */
3945size_t qemu_target_page_size(void)
3946{
3947    return TARGET_PAGE_SIZE;
3948}
3949
3950int qemu_target_page_bits(void)
3951{
3952    return TARGET_PAGE_BITS;
3953}
3954
3955int qemu_target_page_bits_min(void)
3956{
3957    return TARGET_PAGE_BITS_MIN;
3958}
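
/*
 * Illustrative sketch only (not compiled): target-independent code such as
 * migration sizes its bitmaps in target pages via the helpers above instead
 * of using TARGET_PAGE_SIZE directly.  The example_* name is hypothetical.
 */
#if 0
static uint64_t example_ram_pages(uint64_t ram_bytes)
{
    return ram_bytes >> qemu_target_page_bits();
}
#endif
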
3959#endif
3960
3961bool target_words_bigendian(void)
3962{
3963#if defined(TARGET_WORDS_BIGENDIAN)
3964    return true;
3965#else
3966    return false;
3967#endif
3968}
3969
3970#ifndef CONFIG_USER_ONLY
3971bool cpu_physical_memory_is_io(hwaddr phys_addr)
3972{
3973    MemoryRegion *mr;
3974    hwaddr l = 1;
3975    bool res;
3976
3977    rcu_read_lock();
3978    mr = address_space_translate(&address_space_memory,
3979                                 phys_addr, &phys_addr, &l, false,
3980                                 MEMTXATTRS_UNSPECIFIED);
3981
3982    res = !(memory_region_is_ram(mr) || memory_region_is_romd(mr));
3983    rcu_read_unlock();
3984    return res;
3985}
3986
3987int qemu_ram_foreach_block(RAMBlockIterFunc func, void *opaque)
3988{
3989    RAMBlock *block;
3990    int ret = 0;
3991
3992    rcu_read_lock();
3993    RAMBLOCK_FOREACH(block) {
3994        ret = func(block, opaque);
3995        if (ret) {
3996            break;
3997        }
3998    }
3999    rcu_read_unlock();
4000    return ret;
4001}
4002
4003/*
4004 * Unmap pages of memory from start to start+length such that
4005 * they a) read as 0 and b) trigger whatever fault mechanism
4006 * the OS provides for postcopy.
4007 * The pages must be unmapped by the end of the function.
4008 * Returns: 0 on success, non-0 on failure.
4009 *
4010 */
4011int ram_block_discard_range(RAMBlock *rb, uint64_t start, size_t length)
4012{
4013    int ret = -1;
4014
4015    uint8_t *host_startaddr = rb->host + start;
4016
4017    if ((uintptr_t)host_startaddr & (rb->page_size - 1)) {
4018        error_report("ram_block_discard_range: Unaligned start address: %p",
4019                     host_startaddr);
4020        goto err;
4021    }
4022
4023    if ((start + length) <= rb->used_length) {
4024        bool need_madvise, need_fallocate;
4025        uint8_t *host_endaddr = host_startaddr + length;
4026        if ((uintptr_t)host_endaddr & (rb->page_size - 1)) {
4027            error_report("ram_block_discard_range: Unaligned end address: %p",
4028                         host_endaddr);
4029            goto err;
4030        }
4031
4032        errno = ENOTSUP; /* If we are missing MADVISE etc */
4033
4034        /* The logic here is messy:
4035         *    madvise DONTNEED fails for hugepages;
4036         *    fallocate works on hugepages and shmem.
4037         */
4038        need_madvise = (rb->page_size == qemu_host_page_size);
4039        need_fallocate = rb->fd != -1;
4040        if (need_fallocate) {
4041            /* For a file, this causes the area of the file to read as
4042             * zeroes, and for hugetlbfs it also causes the area to be
4043             * unmapped so a userfault will trigger.
4044             */
4045#ifdef CONFIG_FALLOCATE_PUNCH_HOLE
4046            ret = fallocate(rb->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
4047                            start, length);
4048            if (ret) {
4049                ret = -errno;
4050                error_report("ram_block_discard_range: Failed to fallocate "
4051                             "%s:%" PRIx64 " +%zx (%d)",
4052                             rb->idstr, start, length, ret);
4053                goto err;
4054            }
4055#else
4056            ret = -ENOSYS;
4057            error_report("ram_block_discard_range: fallocate not available/file"
4058                         "%s:%" PRIx64 " +%zx (%d)",
4059                         rb->idstr, start, length, ret);
4060            goto err;
4061#endif
4062        }
4063        if (need_madvise) {
4064            /* For normal RAM this causes it to be unmapped,
4065             * for shared memory it causes the local mapping to disappear
4066             * and to fall back on the file contents (which we just
4067             * fallocate'd away).
4068             */
4069#if defined(CONFIG_MADVISE)
4070            ret = madvise(host_startaddr, length, MADV_DONTNEED);
4071            if (ret) {
4072                ret = -errno;
4073                error_report("ram_block_discard_range: Failed to discard range "
4074                             "%s:%" PRIx64 " +%zx (%d)",
4075                             rb->idstr, start, length, ret);
4076                goto err;
4077            }
4078#else
4079            ret = -ENOSYS;
4080            error_report("ram_block_discard_range: MADVISE not available"
4081                         "%s:%" PRIx64 " +%zx (%d)",
4082                         rb->idstr, start, length, ret);
4083            goto err;
4084#endif
4085        }
4086        trace_ram_block_discard_range(rb->idstr, host_startaddr, length,
4087                                      need_madvise, need_fallocate, ret);
4088    } else {
4089        error_report("ram_block_discard_range: Overrun block '%s' (%" PRIu64
4090                     "/%zx/" RAM_ADDR_FMT")",
4091                     rb->idstr, start, length, rb->used_length);
4092    }
4093
4094err:
4095    return ret;
4096}
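
/*
 * Illustrative sketch only (not compiled): discarding the first host page of
 * a named RAM block, e.g. during postcopy.  It assumes the
 * qemu_ram_block_by_name() and qemu_ram_pagesize() helpers defined earlier
 * in this file; start and length must be aligned to the block's page size.
 * The example_* name is hypothetical.
 */
#if 0
static void example_discard_first_page(const char *blockname)
{
    RAMBlock *rb = qemu_ram_block_by_name(blockname);

    if (rb) {
        ram_block_discard_range(rb, 0, qemu_ram_pagesize(rb));
    }
}
#endif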
4097
4098bool ramblock_is_pmem(RAMBlock *rb)
4099{
4100    return rb->flags & RAM_PMEM;
4101}
4102
4103#endif
4104
4105void page_size_init(void)
4106{
4107    /* NOTE: we can always assume that qemu_host_page_size >=
4108       TARGET_PAGE_SIZE */
4109    if (qemu_host_page_size == 0) {
4110        qemu_host_page_size = qemu_real_host_page_size;
4111    }
4112    if (qemu_host_page_size < TARGET_PAGE_SIZE) {
4113        qemu_host_page_size = TARGET_PAGE_SIZE;
4114    }
4115    qemu_host_page_mask = -(intptr_t)qemu_host_page_size;
4116}
4117
4118#if !defined(CONFIG_USER_ONLY)
4119
4120static void mtree_print_phys_entries(fprintf_function mon, void *f,
4121                                     int start, int end, int skip, int ptr)
4122{
4123    if (start == end - 1) {
4124        mon(f, "\t%3d      ", start);
4125    } else {
4126        mon(f, "\t%3d..%-3d ", start, end - 1);
4127    }
4128    mon(f, " skip=%d ", skip);
4129    if (ptr == PHYS_MAP_NODE_NIL) {
4130        mon(f, " ptr=NIL");
4131    } else if (!skip) {
4132        mon(f, " ptr=#%d", ptr);
4133    } else {
4134        mon(f, " ptr=[%d]", ptr);
4135    }
4136    mon(f, "\n");
4137}
4138
4139#define MR_SIZE(size) (int128_nz(size) ? (hwaddr)int128_get64( \
4140                           int128_sub((size), int128_one())) : 0)
4141
4142void mtree_print_dispatch(fprintf_function mon, void *f,
4143                          AddressSpaceDispatch *d, MemoryRegion *root)
4144{
4145    int i;
4146
4147    mon(f, "  Dispatch\n");
4148    mon(f, "    Physical sections\n");
4149
4150    for (i = 0; i < d->map.sections_nb; ++i) {
4151        MemoryRegionSection *s = d->map.sections + i;
4152        const char *names[] = { " [unassigned]", " [not dirty]",
4153                                " [ROM]", " [watch]" };
4154
4155        mon(f, "      #%d @" TARGET_FMT_plx ".." TARGET_FMT_plx " %s%s%s%s%s",
4156            i,
4157            s->offset_within_address_space,
4158            s->offset_within_address_space + MR_SIZE(s->mr->size),
4159            s->mr->name ? s->mr->name : "(noname)",
4160            i < ARRAY_SIZE(names) ? names[i] : "",
4161            s->mr == root ? " [ROOT]" : "",
4162            s == d->mru_section ? " [MRU]" : "",
4163            s->mr->is_iommu ? " [iommu]" : "");
4164
4165        if (s->mr->alias) {
4166            mon(f, " alias=%s", s->mr->alias->name ?
4167                    s->mr->alias->name : "noname");
4168        }
4169        mon(f, "\n");
4170    }
4171
4172    mon(f, "    Nodes (%d bits per level, %d levels) ptr=[%d] skip=%d\n",
4173               P_L2_BITS, P_L2_LEVELS, d->phys_map.ptr, d->phys_map.skip);
4174    for (i = 0; i < d->map.nodes_nb; ++i) {
4175        int j, jprev;
4176        PhysPageEntry prev;
4177        Node *n = d->map.nodes + i;
4178
4179        mon(f, "      [%d]\n", i);
4180
4181        for (j = 0, jprev = 0, prev = *n[0]; j < ARRAY_SIZE(*n); ++j) {
4182            PhysPageEntry *pe = *n + j;
4183
4184            if (pe->ptr == prev.ptr && pe->skip == prev.skip) {
4185                continue;
4186            }
4187
4188            mtree_print_phys_entries(mon, f, jprev, j, prev.skip, prev.ptr);
4189
4190            jprev = j;
4191            prev = *pe;
4192        }
4193
4194        if (jprev != ARRAY_SIZE(*n)) {
4195            mtree_print_phys_entries(mon, f, jprev, j, prev.skip, prev.ptr);
4196        }
4197    }
4198}
4199
4200#endif
4201