qemu/block.c
<<
>>
Prefs
   1/*
   2 * QEMU System Emulator block driver
   3 *
   4 * Copyright (c) 2003 Fabrice Bellard
   5 * Copyright (c) 2020 Virtuozzo International GmbH.
   6 *
   7 * Permission is hereby granted, free of charge, to any person obtaining a copy
   8 * of this software and associated documentation files (the "Software"), to deal
   9 * in the Software without restriction, including without limitation the rights
  10 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  11 * copies of the Software, and to permit persons to whom the Software is
  12 * furnished to do so, subject to the following conditions:
  13 *
  14 * The above copyright notice and this permission notice shall be included in
  15 * all copies or substantial portions of the Software.
  16 *
  17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  20 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  22 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  23 * THE SOFTWARE.
  24 */
  25
  26#include "qemu/osdep.h"
  27#include "block/trace.h"
  28#include "block/block_int.h"
  29#include "block/blockjob.h"
  30#include "block/dirty-bitmap.h"
  31#include "block/fuse.h"
  32#include "block/nbd.h"
  33#include "block/qdict.h"
  34#include "qemu/error-report.h"
  35#include "block/module_block.h"
  36#include "qemu/main-loop.h"
  37#include "qemu/module.h"
  38#include "qapi/error.h"
  39#include "qapi/qmp/qdict.h"
  40#include "qapi/qmp/qjson.h"
  41#include "qapi/qmp/qnull.h"
  42#include "qapi/qmp/qstring.h"
  43#include "qapi/qobject-output-visitor.h"
  44#include "qapi/qapi-visit-block-core.h"
  45#include "sysemu/block-backend.h"
  46#include "qemu/notify.h"
  47#include "qemu/option.h"
  48#include "qemu/coroutine.h"
  49#include "block/qapi.h"
  50#include "qemu/timer.h"
  51#include "qemu/cutils.h"
  52#include "qemu/id.h"
  53#include "qemu/range.h"
  54#include "qemu/rcu.h"
  55#include "block/coroutines.h"
  56
  57#ifdef CONFIG_BSD
  58#include <sys/ioctl.h>
  59#include <sys/queue.h>
  60#if defined(HAVE_SYS_DISK_H)
  61#include <sys/disk.h>
  62#endif
  63#endif
  64
  65#ifdef _WIN32
  66#include <windows.h>
  67#endif
  68
  69#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
  70
/*
 * NOTE(review): nothing in this part of the file adds to or removes from
 * this list; confirm its membership rule at its users.
 * Protected by BQL
 */
static QTAILQ_HEAD(, BlockDriverState) graph_bdrv_states =
    QTAILQ_HEAD_INITIALIZER(graph_bdrv_states);

/*
 * bdrv_new() appends every BlockDriverState it allocates to this list.
 * Protected by BQL
 */
static QTAILQ_HEAD(, BlockDriverState) all_bdrv_states =
    QTAILQ_HEAD_INITIALIZER(all_bdrv_states);

/*
 * Registered block drivers; bdrv_register() inserts new entries at the head.
 * Protected by BQL
 */
static QLIST_HEAD(, BlockDriver) bdrv_drivers =
    QLIST_HEAD_INITIALIZER(bdrv_drivers);
  82
  83static BlockDriverState *bdrv_open_inherit(const char *filename,
  84                                           const char *reference,
  85                                           QDict *options, int flags,
  86                                           BlockDriverState *parent,
  87                                           const BdrvChildClass *child_class,
  88                                           BdrvChildRole child_role,
  89                                           Error **errp);
  90
  91static bool bdrv_recurse_has_child(BlockDriverState *bs,
  92                                   BlockDriverState *child);
  93
  94static void bdrv_replace_child_noperm(BdrvChild *child,
  95                                      BlockDriverState *new_bs);
  96static void bdrv_remove_child(BdrvChild *child, Transaction *tran);
  97
  98static int bdrv_reopen_prepare(BDRVReopenState *reopen_state,
  99                               BlockReopenQueue *queue,
 100                               Transaction *change_child_tran, Error **errp);
 101static void bdrv_reopen_commit(BDRVReopenState *reopen_state);
 102static void bdrv_reopen_abort(BDRVReopenState *reopen_state);
 103
 104static bool bdrv_backing_overridden(BlockDriverState *bs);
 105
 106static bool bdrv_change_aio_context(BlockDriverState *bs, AioContext *ctx,
 107                                    GHashTable *visited, Transaction *tran,
 108                                    Error **errp);
 109
 110/* If non-zero, use only whitelisted block drivers */
 111static int use_bdrv_whitelist;
 112
 113#ifdef _WIN32
 114static int is_windows_drive_prefix(const char *filename)
 115{
 116    return (((filename[0] >= 'a' && filename[0] <= 'z') ||
 117             (filename[0] >= 'A' && filename[0] <= 'Z')) &&
 118            filename[1] == ':');
 119}
 120
 121int is_windows_drive(const char *filename)
 122{
 123    if (is_windows_drive_prefix(filename) &&
 124        filename[2] == '\0')
 125        return 1;
 126    if (strstart(filename, "\\\\.\\", NULL) ||
 127        strstart(filename, "//./", NULL))
 128        return 1;
 129    return 0;
 130}
 131#endif
 132
 133size_t bdrv_opt_mem_align(BlockDriverState *bs)
 134{
 135    if (!bs || !bs->drv) {
 136        /* page size or 4k (hdd sector size) should be on the safe side */
 137        return MAX(4096, qemu_real_host_page_size());
 138    }
 139    IO_CODE();
 140
 141    return bs->bl.opt_mem_alignment;
 142}
 143
 144size_t bdrv_min_mem_align(BlockDriverState *bs)
 145{
 146    if (!bs || !bs->drv) {
 147        /* page size or 4k (hdd sector size) should be on the safe side */
 148        return MAX(4096, qemu_real_host_page_size());
 149    }
 150    IO_CODE();
 151
 152    return bs->bl.min_mem_alignment;
 153}
 154
 155/* check if the path starts with "<protocol>:" */
 156int path_has_protocol(const char *path)
 157{
 158    const char *p;
 159
 160#ifdef _WIN32
 161    if (is_windows_drive(path) ||
 162        is_windows_drive_prefix(path)) {
 163        return 0;
 164    }
 165    p = path + strcspn(path, ":/\\");
 166#else
 167    p = path + strcspn(path, ":/");
 168#endif
 169
 170    return *p == ':';
 171}
 172
 173int path_is_absolute(const char *path)
 174{
 175#ifdef _WIN32
 176    /* specific case for names like: "\\.\d:" */
 177    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
 178        return 1;
 179    }
 180    return (*path == '/' || *path == '\\');
 181#else
 182    return (*path == '/');
 183#endif
 184}
 185
 186/* if filename is absolute, just return its duplicate. Otherwise, build a
 187   path to it by considering it is relative to base_path. URL are
 188   supported. */
 189char *path_combine(const char *base_path, const char *filename)
 190{
 191    const char *protocol_stripped = NULL;
 192    const char *p, *p1;
 193    char *result;
 194    int len;
 195
 196    if (path_is_absolute(filename)) {
 197        return g_strdup(filename);
 198    }
 199
 200    if (path_has_protocol(base_path)) {
 201        protocol_stripped = strchr(base_path, ':');
 202        if (protocol_stripped) {
 203            protocol_stripped++;
 204        }
 205    }
 206    p = protocol_stripped ?: base_path;
 207
 208    p1 = strrchr(base_path, '/');
 209#ifdef _WIN32
 210    {
 211        const char *p2;
 212        p2 = strrchr(base_path, '\\');
 213        if (!p1 || p2 > p1) {
 214            p1 = p2;
 215        }
 216    }
 217#endif
 218    if (p1) {
 219        p1++;
 220    } else {
 221        p1 = base_path;
 222    }
 223    if (p1 > p) {
 224        p = p1;
 225    }
 226    len = p - base_path;
 227
 228    result = g_malloc(len + strlen(filename) + 1);
 229    memcpy(result, base_path, len);
 230    strcpy(result + len, filename);
 231
 232    return result;
 233}
 234
 235/*
 236 * Helper function for bdrv_parse_filename() implementations to remove optional
 237 * protocol prefixes (especially "file:") from a filename and for putting the
 238 * stripped filename into the options QDict if there is such a prefix.
 239 */
 240void bdrv_parse_filename_strip_prefix(const char *filename, const char *prefix,
 241                                      QDict *options)
 242{
 243    if (strstart(filename, prefix, &filename)) {
 244        /* Stripping the explicit protocol prefix may result in a protocol
 245         * prefix being (wrongly) detected (if the filename contains a colon) */
 246        if (path_has_protocol(filename)) {
 247            GString *fat_filename;
 248
 249            /* This means there is some colon before the first slash; therefore,
 250             * this cannot be an absolute path */
 251            assert(!path_is_absolute(filename));
 252
 253            /* And we can thus fix the protocol detection issue by prefixing it
 254             * by "./" */
 255            fat_filename = g_string_new("./");
 256            g_string_append(fat_filename, filename);
 257
 258            assert(!path_has_protocol(fat_filename->str));
 259
 260            qdict_put(options, "filename",
 261                      qstring_from_gstring(fat_filename));
 262        } else {
 263            /* If no protocol prefix was detected, we can use the shortened
 264             * filename as-is */
 265            qdict_put_str(options, "filename", filename);
 266        }
 267    }
 268}
 269
 270
 271/* Returns whether the image file is opened as read-only. Note that this can
 272 * return false and writing to the image file is still not possible because the
 273 * image is inactivated. */
 274bool bdrv_is_read_only(BlockDriverState *bs)
 275{
 276    IO_CODE();
 277    return !(bs->open_flags & BDRV_O_RDWR);
 278}
 279
 280static int bdrv_can_set_read_only(BlockDriverState *bs, bool read_only,
 281                                  bool ignore_allow_rdw, Error **errp)
 282{
 283    IO_CODE();
 284
 285    /* Do not set read_only if copy_on_read is enabled */
 286    if (bs->copy_on_read && read_only) {
 287        error_setg(errp, "Can't set node '%s' to r/o with copy-on-read enabled",
 288                   bdrv_get_device_or_node_name(bs));
 289        return -EINVAL;
 290    }
 291
 292    /* Do not clear read_only if it is prohibited */
 293    if (!read_only && !(bs->open_flags & BDRV_O_ALLOW_RDWR) &&
 294        !ignore_allow_rdw)
 295    {
 296        error_setg(errp, "Node '%s' is read only",
 297                   bdrv_get_device_or_node_name(bs));
 298        return -EPERM;
 299    }
 300
 301    return 0;
 302}
 303
 304/*
 305 * Called by a driver that can only provide a read-only image.
 306 *
 307 * Returns 0 if the node is already read-only or it could switch the node to
 308 * read-only because BDRV_O_AUTO_RDONLY is set.
 309 *
 310 * Returns -EACCES if the node is read-write and BDRV_O_AUTO_RDONLY is not set
 311 * or bdrv_can_set_read_only() forbids making the node read-only. If @errmsg
 312 * is not NULL, it is used as the error message for the Error object.
 313 */
 314int bdrv_apply_auto_read_only(BlockDriverState *bs, const char *errmsg,
 315                              Error **errp)
 316{
 317    int ret = 0;
 318    IO_CODE();
 319
 320    if (!(bs->open_flags & BDRV_O_RDWR)) {
 321        return 0;
 322    }
 323    if (!(bs->open_flags & BDRV_O_AUTO_RDONLY)) {
 324        goto fail;
 325    }
 326
 327    ret = bdrv_can_set_read_only(bs, true, false, NULL);
 328    if (ret < 0) {
 329        goto fail;
 330    }
 331
 332    bs->open_flags &= ~BDRV_O_RDWR;
 333
 334    return 0;
 335
 336fail:
 337    error_setg(errp, "%s", errmsg ?: "Image is read-only");
 338    return -EACCES;
 339}
 340
 341/*
 342 * If @backing is empty, this function returns NULL without setting
 343 * @errp.  In all other cases, NULL will only be returned with @errp
 344 * set.
 345 *
 346 * Therefore, a return value of NULL without @errp set means that
 347 * there is no backing file; if @errp is set, there is one but its
 348 * absolute filename cannot be generated.
 349 */
 350char *bdrv_get_full_backing_filename_from_filename(const char *backed,
 351                                                   const char *backing,
 352                                                   Error **errp)
 353{
 354    if (backing[0] == '\0') {
 355        return NULL;
 356    } else if (path_has_protocol(backing) || path_is_absolute(backing)) {
 357        return g_strdup(backing);
 358    } else if (backed[0] == '\0' || strstart(backed, "json:", NULL)) {
 359        error_setg(errp, "Cannot use relative backing file names for '%s'",
 360                   backed);
 361        return NULL;
 362    } else {
 363        return path_combine(backed, backing);
 364    }
 365}
 366
 367/*
 368 * If @filename is empty or NULL, this function returns NULL without
 369 * setting @errp.  In all other cases, NULL will only be returned with
 370 * @errp set.
 371 */
 372static char *bdrv_make_absolute_filename(BlockDriverState *relative_to,
 373                                         const char *filename, Error **errp)
 374{
 375    char *dir, *full_name;
 376
 377    if (!filename || filename[0] == '\0') {
 378        return NULL;
 379    } else if (path_has_protocol(filename) || path_is_absolute(filename)) {
 380        return g_strdup(filename);
 381    }
 382
 383    dir = bdrv_dirname(relative_to, errp);
 384    if (!dir) {
 385        return NULL;
 386    }
 387
 388    full_name = g_strconcat(dir, filename, NULL);
 389    g_free(dir);
 390    return full_name;
 391}
 392
 393char *bdrv_get_full_backing_filename(BlockDriverState *bs, Error **errp)
 394{
 395    GLOBAL_STATE_CODE();
 396    return bdrv_make_absolute_filename(bs, bs->backing_file, errp);
 397}
 398
 399void bdrv_register(BlockDriver *bdrv)
 400{
 401    assert(bdrv->format_name);
 402    GLOBAL_STATE_CODE();
 403    QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
 404}
 405
 406BlockDriverState *bdrv_new(void)
 407{
 408    BlockDriverState *bs;
 409    int i;
 410
 411    GLOBAL_STATE_CODE();
 412
 413    bs = g_new0(BlockDriverState, 1);
 414    QLIST_INIT(&bs->dirty_bitmaps);
 415    for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
 416        QLIST_INIT(&bs->op_blockers[i]);
 417    }
 418    qemu_co_mutex_init(&bs->reqs_lock);
 419    qemu_mutex_init(&bs->dirty_bitmap_mutex);
 420    bs->refcnt = 1;
 421    bs->aio_context = qemu_get_aio_context();
 422
 423    qemu_co_queue_init(&bs->flush_queue);
 424
 425    qemu_co_mutex_init(&bs->bsc_modify_lock);
 426    bs->block_status_cache = g_new0(BdrvBlockStatusCache, 1);
 427
 428    for (i = 0; i < bdrv_drain_all_count; i++) {
 429        bdrv_drained_begin(bs);
 430    }
 431
 432    QTAILQ_INSERT_TAIL(&all_bdrv_states, bs, bs_list);
 433
 434    return bs;
 435}
 436
 437static BlockDriver *bdrv_do_find_format(const char *format_name)
 438{
 439    BlockDriver *drv1;
 440    GLOBAL_STATE_CODE();
 441
 442    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
 443        if (!strcmp(drv1->format_name, format_name)) {
 444            return drv1;
 445        }
 446    }
 447
 448    return NULL;
 449}
 450
 451BlockDriver *bdrv_find_format(const char *format_name)
 452{
 453    BlockDriver *drv1;
 454    int i;
 455
 456    GLOBAL_STATE_CODE();
 457
 458    drv1 = bdrv_do_find_format(format_name);
 459    if (drv1) {
 460        return drv1;
 461    }
 462
 463    /* The driver isn't registered, maybe we need to load a module */
 464    for (i = 0; i < (int)ARRAY_SIZE(block_driver_modules); ++i) {
 465        if (!strcmp(block_driver_modules[i].format_name, format_name)) {
 466            Error *local_err = NULL;
 467            int rv = block_module_load(block_driver_modules[i].library_name,
 468                                       &local_err);
 469            if (rv > 0) {
 470                return bdrv_do_find_format(format_name);
 471            } else if (rv < 0) {
 472                error_report_err(local_err);
 473            }
 474            break;
 475        }
 476    }
 477    return NULL;
 478}
 479
 480static int bdrv_format_is_whitelisted(const char *format_name, bool read_only)
 481{
 482    static const char *whitelist_rw[] = {
 483        CONFIG_BDRV_RW_WHITELIST
 484        NULL
 485    };
 486    static const char *whitelist_ro[] = {
 487        CONFIG_BDRV_RO_WHITELIST
 488        NULL
 489    };
 490    const char **p;
 491
 492    if (!whitelist_rw[0] && !whitelist_ro[0]) {
 493        return 1;               /* no whitelist, anything goes */
 494    }
 495
 496    for (p = whitelist_rw; *p; p++) {
 497        if (!strcmp(format_name, *p)) {
 498            return 1;
 499        }
 500    }
 501    if (read_only) {
 502        for (p = whitelist_ro; *p; p++) {
 503            if (!strcmp(format_name, *p)) {
 504                return 1;
 505            }
 506        }
 507    }
 508    return 0;
 509}
 510
 511int bdrv_is_whitelisted(BlockDriver *drv, bool read_only)
 512{
 513    GLOBAL_STATE_CODE();
 514    return bdrv_format_is_whitelisted(drv->format_name, read_only);
 515}
 516
 517bool bdrv_uses_whitelist(void)
 518{
 519    return use_bdrv_whitelist;
 520}
 521
/*
 * Bundle of arguments and results for an image creation request.
 *
 * NOTE(review): no user of this struct is visible in this part of the
 * file.  The per-field notes are inferred from names and types only and
 * need confirming against the code that uses it.
 */
typedef struct CreateCo {
    BlockDriver *drv;   /* presumably the driver asked to create the image */
    char *filename;     /* presumably the name of the image to create */
    QemuOpts *opts;     /* presumably the creation options */
    int ret;            /* presumably the result code of the request */
    Error *err;         /* presumably the error reported by the request */
} CreateCo;
 529
 530int coroutine_fn bdrv_co_create(BlockDriver *drv, const char *filename,
 531                                QemuOpts *opts, Error **errp)
 532{
 533    int ret;
 534    GLOBAL_STATE_CODE();
 535    ERRP_GUARD();
 536
 537    if (!drv->bdrv_co_create_opts) {
 538        error_setg(errp, "Driver '%s' does not support image creation",
 539                   drv->format_name);
 540        return -ENOTSUP;
 541    }
 542
 543    ret = drv->bdrv_co_create_opts(drv, filename, opts, errp);
 544    if (ret < 0 && !*errp) {
 545        error_setg_errno(errp, -ret, "Could not create image");
 546    }
 547
 548    return ret;
 549}
 550
 551/**
 552 * Helper function for bdrv_create_file_fallback(): Resize @blk to at
 553 * least the given @minimum_size.
 554 *
 555 * On success, return @blk's actual length.
 556 * Otherwise, return -errno.
 557 */
 558static int64_t coroutine_fn GRAPH_UNLOCKED
 559create_file_fallback_truncate(BlockBackend *blk, int64_t minimum_size,
 560                              Error **errp)
 561{
 562    Error *local_err = NULL;
 563    int64_t size;
 564    int ret;
 565
 566    GLOBAL_STATE_CODE();
 567
 568    ret = blk_co_truncate(blk, minimum_size, false, PREALLOC_MODE_OFF, 0,
 569                          &local_err);
 570    if (ret < 0 && ret != -ENOTSUP) {
 571        error_propagate(errp, local_err);
 572        return ret;
 573    }
 574
 575    size = blk_co_getlength(blk);
 576    if (size < 0) {
 577        error_free(local_err);
 578        error_setg_errno(errp, -size,
 579                         "Failed to inquire the new image file's length");
 580        return size;
 581    }
 582
 583    if (size < minimum_size) {
 584        /* Need to grow the image, but we failed to do that */
 585        error_propagate(errp, local_err);
 586        return -ENOTSUP;
 587    }
 588
 589    error_free(local_err);
 590    local_err = NULL;
 591
 592    return size;
 593}
 594
 595/**
 596 * Helper function for bdrv_create_file_fallback(): Zero the first
 597 * sector to remove any potentially pre-existing image header.
 598 */
 599static int coroutine_fn
 600create_file_fallback_zero_first_sector(BlockBackend *blk,
 601                                       int64_t current_size,
 602                                       Error **errp)
 603{
 604    int64_t bytes_to_clear;
 605    int ret;
 606
 607    GLOBAL_STATE_CODE();
 608
 609    bytes_to_clear = MIN(current_size, BDRV_SECTOR_SIZE);
 610    if (bytes_to_clear) {
 611        ret = blk_co_pwrite_zeroes(blk, 0, bytes_to_clear, BDRV_REQ_MAY_UNMAP);
 612        if (ret < 0) {
 613            error_setg_errno(errp, -ret,
 614                             "Failed to clear the new image's first sector");
 615            return ret;
 616        }
 617    }
 618
 619    return 0;
 620}
 621
 622/**
 623 * Simple implementation of bdrv_co_create_opts for protocol drivers
 624 * which only support creation via opening a file
 625 * (usually existing raw storage device)
 626 */
 627int coroutine_fn bdrv_co_create_opts_simple(BlockDriver *drv,
 628                                            const char *filename,
 629                                            QemuOpts *opts,
 630                                            Error **errp)
 631{
 632    BlockBackend *blk;
 633    QDict *options;
 634    int64_t size = 0;
 635    char *buf = NULL;
 636    PreallocMode prealloc;
 637    Error *local_err = NULL;
 638    int ret;
 639
 640    GLOBAL_STATE_CODE();
 641
 642    size = qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0);
 643    buf = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC);
 644    prealloc = qapi_enum_parse(&PreallocMode_lookup, buf,
 645                               PREALLOC_MODE_OFF, &local_err);
 646    g_free(buf);
 647    if (local_err) {
 648        error_propagate(errp, local_err);
 649        return -EINVAL;
 650    }
 651
 652    if (prealloc != PREALLOC_MODE_OFF) {
 653        error_setg(errp, "Unsupported preallocation mode '%s'",
 654                   PreallocMode_str(prealloc));
 655        return -ENOTSUP;
 656    }
 657
 658    options = qdict_new();
 659    qdict_put_str(options, "driver", drv->format_name);
 660
 661    blk = blk_co_new_open(filename, NULL, options,
 662                          BDRV_O_RDWR | BDRV_O_RESIZE, errp);
 663    if (!blk) {
 664        error_prepend(errp, "Protocol driver '%s' does not support image "
 665                      "creation, and opening the image failed: ",
 666                      drv->format_name);
 667        return -EINVAL;
 668    }
 669
 670    size = create_file_fallback_truncate(blk, size, errp);
 671    if (size < 0) {
 672        ret = size;
 673        goto out;
 674    }
 675
 676    ret = create_file_fallback_zero_first_sector(blk, size, errp);
 677    if (ret < 0) {
 678        goto out;
 679    }
 680
 681    ret = 0;
 682out:
 683    blk_co_unref(blk);
 684    return ret;
 685}
 686
 687int coroutine_fn bdrv_co_create_file(const char *filename, QemuOpts *opts,
 688                                     Error **errp)
 689{
 690    QemuOpts *protocol_opts;
 691    BlockDriver *drv;
 692    QDict *qdict;
 693    int ret;
 694
 695    GLOBAL_STATE_CODE();
 696
 697    drv = bdrv_find_protocol(filename, true, errp);
 698    if (drv == NULL) {
 699        return -ENOENT;
 700    }
 701
 702    if (!drv->create_opts) {
 703        error_setg(errp, "Driver '%s' does not support image creation",
 704                   drv->format_name);
 705        return -ENOTSUP;
 706    }
 707
 708    /*
 709     * 'opts' contains a QemuOptsList with a combination of format and protocol
 710     * default values.
 711     *
 712     * The format properly removes its options, but the default values remain
 713     * in 'opts->list'.  So if the protocol has options with the same name
 714     * (e.g. rbd has 'cluster_size' as qcow2), it will see the default values
 715     * of the format, since for overlapping options, the format wins.
 716     *
 717     * To avoid this issue, lets convert QemuOpts to QDict, in this way we take
 718     * only the set options, and then convert it back to QemuOpts, using the
 719     * create_opts of the protocol. So the new QemuOpts, will contain only the
 720     * protocol defaults.
 721     */
 722    qdict = qemu_opts_to_qdict(opts, NULL);
 723    protocol_opts = qemu_opts_from_qdict(drv->create_opts, qdict, errp);
 724    if (protocol_opts == NULL) {
 725        ret = -EINVAL;
 726        goto out;
 727    }
 728
 729    ret = bdrv_co_create(drv, filename, protocol_opts, errp);
 730out:
 731    qemu_opts_del(protocol_opts);
 732    qobject_unref(qdict);
 733    return ret;
 734}
 735
 736int coroutine_fn bdrv_co_delete_file(BlockDriverState *bs, Error **errp)
 737{
 738    Error *local_err = NULL;
 739    int ret;
 740
 741    IO_CODE();
 742    assert(bs != NULL);
 743    assert_bdrv_graph_readable();
 744
 745    if (!bs->drv) {
 746        error_setg(errp, "Block node '%s' is not opened", bs->filename);
 747        return -ENOMEDIUM;
 748    }
 749
 750    if (!bs->drv->bdrv_co_delete_file) {
 751        error_setg(errp, "Driver '%s' does not support image deletion",
 752                   bs->drv->format_name);
 753        return -ENOTSUP;
 754    }
 755
 756    ret = bs->drv->bdrv_co_delete_file(bs, &local_err);
 757    if (ret < 0) {
 758        error_propagate(errp, local_err);
 759    }
 760
 761    return ret;
 762}
 763
 764void coroutine_fn bdrv_co_delete_file_noerr(BlockDriverState *bs)
 765{
 766    Error *local_err = NULL;
 767    int ret;
 768    IO_CODE();
 769
 770    if (!bs) {
 771        return;
 772    }
 773
 774    ret = bdrv_co_delete_file(bs, &local_err);
 775    /*
 776     * ENOTSUP will happen if the block driver doesn't support
 777     * the 'bdrv_co_delete_file' interface. This is a predictable
 778     * scenario and shouldn't be reported back to the user.
 779     */
 780    if (ret == -ENOTSUP) {
 781        error_free(local_err);
 782    } else if (ret < 0) {
 783        error_report_err(local_err);
 784    }
 785}
 786
 787/**
 788 * Try to get @bs's logical and physical block size.
 789 * On success, store them in @bsz struct and return 0.
 790 * On failure return -errno.
 791 * @bs must not be empty.
 792 */
 793int bdrv_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz)
 794{
 795    BlockDriver *drv = bs->drv;
 796    BlockDriverState *filtered = bdrv_filter_bs(bs);
 797    GLOBAL_STATE_CODE();
 798
 799    if (drv && drv->bdrv_probe_blocksizes) {
 800        return drv->bdrv_probe_blocksizes(bs, bsz);
 801    } else if (filtered) {
 802        return bdrv_probe_blocksizes(filtered, bsz);
 803    }
 804
 805    return -ENOTSUP;
 806}
 807
 808/**
 809 * Try to get @bs's geometry (cyls, heads, sectors).
 810 * On success, store them in @geo struct and return 0.
 811 * On failure return -errno.
 812 * @bs must not be empty.
 813 */
 814int bdrv_probe_geometry(BlockDriverState *bs, HDGeometry *geo)
 815{
 816    BlockDriver *drv = bs->drv;
 817    BlockDriverState *filtered = bdrv_filter_bs(bs);
 818    GLOBAL_STATE_CODE();
 819
 820    if (drv && drv->bdrv_probe_geometry) {
 821        return drv->bdrv_probe_geometry(bs, geo);
 822    } else if (filtered) {
 823        return bdrv_probe_geometry(filtered, geo);
 824    }
 825
 826    return -ENOTSUP;
 827}
 828
 829/*
 830 * Create a uniquely-named empty temporary file.
 831 * Return the actual file name used upon success, otherwise NULL.
 832 * This string should be freed with g_free() when not needed any longer.
 833 *
 834 * Note: creating a temporary file for the caller to (re)open is
 835 * inherently racy. Use g_file_open_tmp() instead whenever practical.
 836 */
 837char *create_tmp_file(Error **errp)
 838{
 839    int fd;
 840    const char *tmpdir;
 841    g_autofree char *filename = NULL;
 842
 843    tmpdir = g_get_tmp_dir();
 844#ifndef _WIN32
 845    /*
 846     * See commit 69bef79 ("block: use /var/tmp instead of /tmp for -snapshot")
 847     *
 848     * This function is used to create temporary disk images (like -snapshot),
 849     * so the files can become very large. /tmp is often a tmpfs where as
 850     * /var/tmp is usually on a disk, so more appropriate for disk images.
 851     */
 852    if (!g_strcmp0(tmpdir, "/tmp")) {
 853        tmpdir = "/var/tmp";
 854    }
 855#endif
 856
 857    filename = g_strdup_printf("%s/vl.XXXXXX", tmpdir);
 858    fd = g_mkstemp(filename);
 859    if (fd < 0) {
 860        error_setg_errno(errp, errno, "Could not open temporary file '%s'",
 861                         filename);
 862        return NULL;
 863    }
 864    close(fd);
 865
 866    return g_steal_pointer(&filename);
 867}
 868
 869/*
 870 * Detect host devices. By convention, /dev/cdrom[N] is always
 871 * recognized as a host CDROM.
 872 */
 873static BlockDriver *find_hdev_driver(const char *filename)
 874{
 875    int score_max = 0, score;
 876    BlockDriver *drv = NULL, *d;
 877    GLOBAL_STATE_CODE();
 878
 879    QLIST_FOREACH(d, &bdrv_drivers, list) {
 880        if (d->bdrv_probe_device) {
 881            score = d->bdrv_probe_device(filename);
 882            if (score > score_max) {
 883                score_max = score;
 884                drv = d;
 885            }
 886        }
 887    }
 888
 889    return drv;
 890}
 891
 892static BlockDriver *bdrv_do_find_protocol(const char *protocol)
 893{
 894    BlockDriver *drv1;
 895    GLOBAL_STATE_CODE();
 896
 897    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
 898        if (drv1->protocol_name && !strcmp(drv1->protocol_name, protocol)) {
 899            return drv1;
 900        }
 901    }
 902
 903    return NULL;
 904}
 905
 906BlockDriver *bdrv_find_protocol(const char *filename,
 907                                bool allow_protocol_prefix,
 908                                Error **errp)
 909{
 910    BlockDriver *drv1;
 911    char protocol[128];
 912    int len;
 913    const char *p;
 914    int i;
 915
 916    GLOBAL_STATE_CODE();
 917    /* TODO Drivers without bdrv_file_open must be specified explicitly */
 918
 919    /*
 920     * XXX(hch): we really should not let host device detection
 921     * override an explicit protocol specification, but moving this
 922     * later breaks access to device names with colons in them.
 923     * Thanks to the brain-dead persistent naming schemes on udev-
 924     * based Linux systems those actually are quite common.
 925     */
 926    drv1 = find_hdev_driver(filename);
 927    if (drv1) {
 928        return drv1;
 929    }
 930
 931    if (!path_has_protocol(filename) || !allow_protocol_prefix) {
 932        return &bdrv_file;
 933    }
 934
 935    p = strchr(filename, ':');
 936    assert(p != NULL);
 937    len = p - filename;
 938    if (len > sizeof(protocol) - 1)
 939        len = sizeof(protocol) - 1;
 940    memcpy(protocol, filename, len);
 941    protocol[len] = '\0';
 942
 943    drv1 = bdrv_do_find_protocol(protocol);
 944    if (drv1) {
 945        return drv1;
 946    }
 947
 948    for (i = 0; i < (int)ARRAY_SIZE(block_driver_modules); ++i) {
 949        if (block_driver_modules[i].protocol_name &&
 950            !strcmp(block_driver_modules[i].protocol_name, protocol)) {
 951            int rv = block_module_load(block_driver_modules[i].library_name, errp);
 952            if (rv > 0) {
 953                drv1 = bdrv_do_find_protocol(protocol);
 954            } else if (rv < 0) {
 955                return NULL;
 956            }
 957            break;
 958        }
 959    }
 960
 961    if (!drv1) {
 962        error_setg(errp, "Unknown protocol '%s'", protocol);
 963    }
 964    return drv1;
 965}
 966
 967/*
 968 * Guess image format by probing its contents.
 969 * This is not a good idea when your image is raw (CVE-2008-2004), but
 970 * we do it anyway for backward compatibility.
 971 *
 972 * @buf         contains the image's first @buf_size bytes.
 973 * @buf_size    is the buffer size in bytes (generally BLOCK_PROBE_BUF_SIZE,
 974 *              but can be smaller if the image file is smaller)
 975 * @filename    is its filename.
 976 *
 977 * For all block drivers, call the bdrv_probe() method to get its
 978 * probing score.
 979 * Return the first block driver with the highest probing score.
 980 */
 981BlockDriver *bdrv_probe_all(const uint8_t *buf, int buf_size,
 982                            const char *filename)
 983{
 984    int score_max = 0, score;
 985    BlockDriver *drv = NULL, *d;
 986    IO_CODE();
 987
 988    QLIST_FOREACH(d, &bdrv_drivers, list) {
 989        if (d->bdrv_probe) {
 990            score = d->bdrv_probe(buf, buf_size, filename);
 991            if (score > score_max) {
 992                score_max = score;
 993                drv = d;
 994            }
 995        }
 996    }
 997
 998    return drv;
 999}
1000
1001static int find_image_format(BlockBackend *file, const char *filename,
1002                             BlockDriver **pdrv, Error **errp)
1003{
1004    BlockDriver *drv;
1005    uint8_t buf[BLOCK_PROBE_BUF_SIZE];
1006    int ret = 0;
1007
1008    GLOBAL_STATE_CODE();
1009
1010    /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
1011    if (blk_is_sg(file) || !blk_is_inserted(file) || blk_getlength(file) == 0) {
1012        *pdrv = &bdrv_raw;
1013        return ret;
1014    }
1015
1016    ret = blk_pread(file, 0, sizeof(buf), buf, 0);
1017    if (ret < 0) {
1018        error_setg_errno(errp, -ret, "Could not read image for determining its "
1019                         "format");
1020        *pdrv = NULL;
1021        return ret;
1022    }
1023
1024    drv = bdrv_probe_all(buf, sizeof(buf), filename);
1025    if (!drv) {
1026        error_setg(errp, "Could not determine image format: No compatible "
1027                   "driver found");
1028        *pdrv = NULL;
1029        return -ENOENT;
1030    }
1031
1032    *pdrv = drv;
1033    return 0;
1034}
1035
1036/**
1037 * Set the current 'total_sectors' value
1038 * Return 0 on success, -errno on error.
1039 */
1040int coroutine_fn bdrv_co_refresh_total_sectors(BlockDriverState *bs,
1041                                               int64_t hint)
1042{
1043    BlockDriver *drv = bs->drv;
1044    IO_CODE();
1045    assert_bdrv_graph_readable();
1046
1047    if (!drv) {
1048        return -ENOMEDIUM;
1049    }
1050
1051    /* Do not attempt drv->bdrv_co_getlength() on scsi-generic devices */
1052    if (bdrv_is_sg(bs))
1053        return 0;
1054
1055    /* query actual device if possible, otherwise just trust the hint */
1056    if (drv->bdrv_co_getlength) {
1057        int64_t length = drv->bdrv_co_getlength(bs);
1058        if (length < 0) {
1059            return length;
1060        }
1061        hint = DIV_ROUND_UP(length, BDRV_SECTOR_SIZE);
1062    }
1063
1064    bs->total_sectors = hint;
1065
1066    if (bs->total_sectors * BDRV_SECTOR_SIZE > BDRV_MAX_LENGTH) {
1067        return -EFBIG;
1068    }
1069
1070    return 0;
1071}
1072
1073/**
1074 * Combines a QDict of new block driver @options with any missing options taken
1075 * from @old_options, so that leaving out an option defaults to its old value.
1076 */
1077static void bdrv_join_options(BlockDriverState *bs, QDict *options,
1078                              QDict *old_options)
1079{
1080    GLOBAL_STATE_CODE();
1081    if (bs->drv && bs->drv->bdrv_join_options) {
1082        bs->drv->bdrv_join_options(options, old_options);
1083    } else {
1084        qdict_join(options, old_options, false);
1085    }
1086}
1087
1088static BlockdevDetectZeroesOptions bdrv_parse_detect_zeroes(QemuOpts *opts,
1089                                                            int open_flags,
1090                                                            Error **errp)
1091{
1092    Error *local_err = NULL;
1093    char *value = qemu_opt_get_del(opts, "detect-zeroes");
1094    BlockdevDetectZeroesOptions detect_zeroes =
1095        qapi_enum_parse(&BlockdevDetectZeroesOptions_lookup, value,
1096                        BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF, &local_err);
1097    GLOBAL_STATE_CODE();
1098    g_free(value);
1099    if (local_err) {
1100        error_propagate(errp, local_err);
1101        return detect_zeroes;
1102    }
1103
1104    if (detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP &&
1105        !(open_flags & BDRV_O_UNMAP))
1106    {
1107        error_setg(errp, "setting detect-zeroes to unmap is not allowed "
1108                   "without setting discard operation to unmap");
1109    }
1110
1111    return detect_zeroes;
1112}
1113
1114/**
1115 * Set open flags for aio engine
1116 *
1117 * Return 0 on success, -1 if the engine specified is invalid
1118 */
1119int bdrv_parse_aio(const char *mode, int *flags)
1120{
1121    if (!strcmp(mode, "threads")) {
1122        /* do nothing, default */
1123    } else if (!strcmp(mode, "native")) {
1124        *flags |= BDRV_O_NATIVE_AIO;
1125#ifdef CONFIG_LINUX_IO_URING
1126    } else if (!strcmp(mode, "io_uring")) {
1127        *flags |= BDRV_O_IO_URING;
1128#endif
1129    } else {
1130        return -1;
1131    }
1132
1133    return 0;
1134}
1135
1136/**
1137 * Set open flags for a given discard mode
1138 *
1139 * Return 0 on success, -1 if the discard mode was invalid.
1140 */
1141int bdrv_parse_discard_flags(const char *mode, int *flags)
1142{
1143    *flags &= ~BDRV_O_UNMAP;
1144
1145    if (!strcmp(mode, "off") || !strcmp(mode, "ignore")) {
1146        /* do nothing */
1147    } else if (!strcmp(mode, "on") || !strcmp(mode, "unmap")) {
1148        *flags |= BDRV_O_UNMAP;
1149    } else {
1150        return -1;
1151    }
1152
1153    return 0;
1154}
1155
1156/**
1157 * Set open flags for a given cache mode
1158 *
1159 * Return 0 on success, -1 if the cache mode was invalid.
1160 */
1161int bdrv_parse_cache_mode(const char *mode, int *flags, bool *writethrough)
1162{
1163    *flags &= ~BDRV_O_CACHE_MASK;
1164
1165    if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
1166        *writethrough = false;
1167        *flags |= BDRV_O_NOCACHE;
1168    } else if (!strcmp(mode, "directsync")) {
1169        *writethrough = true;
1170        *flags |= BDRV_O_NOCACHE;
1171    } else if (!strcmp(mode, "writeback")) {
1172        *writethrough = false;
1173    } else if (!strcmp(mode, "unsafe")) {
1174        *writethrough = false;
1175        *flags |= BDRV_O_NO_FLUSH;
1176    } else if (!strcmp(mode, "writethrough")) {
1177        *writethrough = true;
1178    } else {
1179        return -1;
1180    }
1181
1182    return 0;
1183}
1184
1185static char *bdrv_child_get_parent_desc(BdrvChild *c)
1186{
1187    BlockDriverState *parent = c->opaque;
1188    return g_strdup_printf("node '%s'", bdrv_get_node_name(parent));
1189}
1190
1191static void bdrv_child_cb_drained_begin(BdrvChild *child)
1192{
1193    BlockDriverState *bs = child->opaque;
1194    bdrv_do_drained_begin_quiesce(bs, NULL);
1195}
1196
1197static bool bdrv_child_cb_drained_poll(BdrvChild *child)
1198{
1199    BlockDriverState *bs = child->opaque;
1200    return bdrv_drain_poll(bs, NULL, false);
1201}
1202
1203static void bdrv_child_cb_drained_end(BdrvChild *child)
1204{
1205    BlockDriverState *bs = child->opaque;
1206    bdrv_drained_end(bs);
1207}
1208
1209static int bdrv_child_cb_inactivate(BdrvChild *child)
1210{
1211    BlockDriverState *bs = child->opaque;
1212    GLOBAL_STATE_CODE();
1213    assert(bs->open_flags & BDRV_O_INACTIVE);
1214    return 0;
1215}
1216
1217static bool bdrv_child_cb_change_aio_ctx(BdrvChild *child, AioContext *ctx,
1218                                         GHashTable *visited, Transaction *tran,
1219                                         Error **errp)
1220{
1221    BlockDriverState *bs = child->opaque;
1222    return bdrv_change_aio_context(bs, ctx, visited, tran, errp);
1223}
1224
1225/*
1226 * Returns the options and flags that a temporary snapshot should get, based on
1227 * the originally requested flags (the originally requested image will have
1228 * flags like a backing file)
1229 */
1230static void bdrv_temp_snapshot_options(int *child_flags, QDict *child_options,
1231                                       int parent_flags, QDict *parent_options)
1232{
1233    GLOBAL_STATE_CODE();
1234    *child_flags = (parent_flags & ~BDRV_O_SNAPSHOT) | BDRV_O_TEMPORARY;
1235
1236    /* For temporary files, unconditional cache=unsafe is fine */
1237    qdict_set_default_str(child_options, BDRV_OPT_CACHE_DIRECT, "off");
1238    qdict_set_default_str(child_options, BDRV_OPT_CACHE_NO_FLUSH, "on");
1239
1240    /* Copy the read-only and discard options from the parent */
1241    qdict_copy_default(child_options, parent_options, BDRV_OPT_READ_ONLY);
1242    qdict_copy_default(child_options, parent_options, BDRV_OPT_DISCARD);
1243
1244    /* aio=native doesn't work for cache.direct=off, so disable it for the
1245     * temporary snapshot */
1246    *child_flags &= ~BDRV_O_NATIVE_AIO;
1247}
1248
1249static void bdrv_backing_attach(BdrvChild *c)
1250{
1251    BlockDriverState *parent = c->opaque;
1252    BlockDriverState *backing_hd = c->bs;
1253
1254    GLOBAL_STATE_CODE();
1255    assert(!parent->backing_blocker);
1256    error_setg(&parent->backing_blocker,
1257               "node is used as backing hd of '%s'",
1258               bdrv_get_device_or_node_name(parent));
1259
1260    bdrv_refresh_filename(backing_hd);
1261
1262    parent->open_flags &= ~BDRV_O_NO_BACKING;
1263
1264    bdrv_op_block_all(backing_hd, parent->backing_blocker);
1265    /* Otherwise we won't be able to commit or stream */
1266    bdrv_op_unblock(backing_hd, BLOCK_OP_TYPE_COMMIT_TARGET,
1267                    parent->backing_blocker);
1268    bdrv_op_unblock(backing_hd, BLOCK_OP_TYPE_STREAM,
1269                    parent->backing_blocker);
1270    /*
1271     * We do backup in 3 ways:
1272     * 1. drive backup
1273     *    The target bs is new opened, and the source is top BDS
1274     * 2. blockdev backup
1275     *    Both the source and the target are top BDSes.
1276     * 3. internal backup(used for block replication)
1277     *    Both the source and the target are backing file
1278     *
1279     * In case 1 and 2, neither the source nor the target is the backing file.
1280     * In case 3, we will block the top BDS, so there is only one block job
1281     * for the top BDS and its backing chain.
1282     */
1283    bdrv_op_unblock(backing_hd, BLOCK_OP_TYPE_BACKUP_SOURCE,
1284                    parent->backing_blocker);
1285    bdrv_op_unblock(backing_hd, BLOCK_OP_TYPE_BACKUP_TARGET,
1286                    parent->backing_blocker);
1287}
1288
1289static void bdrv_backing_detach(BdrvChild *c)
1290{
1291    BlockDriverState *parent = c->opaque;
1292
1293    GLOBAL_STATE_CODE();
1294    assert(parent->backing_blocker);
1295    bdrv_op_unblock_all(c->bs, parent->backing_blocker);
1296    error_free(parent->backing_blocker);
1297    parent->backing_blocker = NULL;
1298}
1299
1300static int bdrv_backing_update_filename(BdrvChild *c, BlockDriverState *base,
1301                                        const char *filename, Error **errp)
1302{
1303    BlockDriverState *parent = c->opaque;
1304    bool read_only = bdrv_is_read_only(parent);
1305    int ret;
1306    GLOBAL_STATE_CODE();
1307
1308    if (read_only) {
1309        ret = bdrv_reopen_set_read_only(parent, false, errp);
1310        if (ret < 0) {
1311            return ret;
1312        }
1313    }
1314
1315    ret = bdrv_change_backing_file(parent, filename,
1316                                   base->drv ? base->drv->format_name : "",
1317                                   false);
1318    if (ret < 0) {
1319        error_setg_errno(errp, -ret, "Could not update backing file link");
1320    }
1321
1322    if (read_only) {
1323        bdrv_reopen_set_read_only(parent, true, NULL);
1324    }
1325
1326    return ret;
1327}
1328
1329/*
1330 * Returns the options and flags that a generic child of a BDS should
1331 * get, based on the given options and flags for the parent BDS.
1332 */
1333static void bdrv_inherited_options(BdrvChildRole role, bool parent_is_format,
1334                                   int *child_flags, QDict *child_options,
1335                                   int parent_flags, QDict *parent_options)
1336{
1337    int flags = parent_flags;
1338    GLOBAL_STATE_CODE();
1339
1340    /*
1341     * First, decide whether to set, clear, or leave BDRV_O_PROTOCOL.
1342     * Generally, the question to answer is: Should this child be
1343     * format-probed by default?
1344     */
1345
1346    /*
1347     * Pure and non-filtered data children of non-format nodes should
1348     * be probed by default (even when the node itself has BDRV_O_PROTOCOL
1349     * set).  This only affects a very limited set of drivers (namely
1350     * quorum and blkverify when this comment was written).
1351     * Force-clear BDRV_O_PROTOCOL then.
1352     */
1353    if (!parent_is_format &&
1354        (role & BDRV_CHILD_DATA) &&
1355        !(role & (BDRV_CHILD_METADATA | BDRV_CHILD_FILTERED)))
1356    {
1357        flags &= ~BDRV_O_PROTOCOL;
1358    }
1359
1360    /*
1361     * All children of format nodes (except for COW children) and all
1362     * metadata children in general should never be format-probed.
1363     * Force-set BDRV_O_PROTOCOL then.
1364     */
1365    if ((parent_is_format && !(role & BDRV_CHILD_COW)) ||
1366        (role & BDRV_CHILD_METADATA))
1367    {
1368        flags |= BDRV_O_PROTOCOL;
1369    }
1370
1371    /*
1372     * If the cache mode isn't explicitly set, inherit direct and no-flush from
1373     * the parent.
1374     */
1375    qdict_copy_default(child_options, parent_options, BDRV_OPT_CACHE_DIRECT);
1376    qdict_copy_default(child_options, parent_options, BDRV_OPT_CACHE_NO_FLUSH);
1377    qdict_copy_default(child_options, parent_options, BDRV_OPT_FORCE_SHARE);
1378
1379    if (role & BDRV_CHILD_COW) {
1380        /* backing files are opened read-only by default */
1381        qdict_set_default_str(child_options, BDRV_OPT_READ_ONLY, "on");
1382        qdict_set_default_str(child_options, BDRV_OPT_AUTO_READ_ONLY, "off");
1383    } else {
1384        /* Inherit the read-only option from the parent if it's not set */
1385        qdict_copy_default(child_options, parent_options, BDRV_OPT_READ_ONLY);
1386        qdict_copy_default(child_options, parent_options,
1387                           BDRV_OPT_AUTO_READ_ONLY);
1388    }
1389
1390    /*
1391     * bdrv_co_pdiscard() respects unmap policy for the parent, so we
1392     * can default to enable it on lower layers regardless of the
1393     * parent option.
1394     */
1395    qdict_set_default_str(child_options, BDRV_OPT_DISCARD, "unmap");
1396
1397    /* Clear flags that only apply to the top layer */
1398    flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_COPY_ON_READ);
1399
1400    if (role & BDRV_CHILD_METADATA) {
1401        flags &= ~BDRV_O_NO_IO;
1402    }
1403    if (role & BDRV_CHILD_COW) {
1404        flags &= ~BDRV_O_TEMPORARY;
1405    }
1406
1407    *child_flags = flags;
1408}
1409
1410static void GRAPH_WRLOCK bdrv_child_cb_attach(BdrvChild *child)
1411{
1412    BlockDriverState *bs = child->opaque;
1413
1414    assert_bdrv_graph_writable();
1415    QLIST_INSERT_HEAD(&bs->children, child, next);
1416    if (bs->drv->is_filter || (child->role & BDRV_CHILD_FILTERED)) {
1417        /*
1418         * Here we handle filters and block/raw-format.c when it behave like
1419         * filter. They generally have a single PRIMARY child, which is also the
1420         * FILTERED child, and that they may have multiple more children, which
1421         * are neither PRIMARY nor FILTERED. And never we have a COW child here.
1422         * So bs->file will be the PRIMARY child, unless the PRIMARY child goes
1423         * into bs->backing on exceptional cases; and bs->backing will be
1424         * nothing else.
1425         */
1426        assert(!(child->role & BDRV_CHILD_COW));
1427        if (child->role & BDRV_CHILD_PRIMARY) {
1428            assert(child->role & BDRV_CHILD_FILTERED);
1429            assert(!bs->backing);
1430            assert(!bs->file);
1431
1432            if (bs->drv->filtered_child_is_backing) {
1433                bs->backing = child;
1434            } else {
1435                bs->file = child;
1436            }
1437        } else {
1438            assert(!(child->role & BDRV_CHILD_FILTERED));
1439        }
1440    } else if (child->role & BDRV_CHILD_COW) {
1441        assert(bs->drv->supports_backing);
1442        assert(!(child->role & BDRV_CHILD_PRIMARY));
1443        assert(!bs->backing);
1444        bs->backing = child;
1445        bdrv_backing_attach(child);
1446    } else if (child->role & BDRV_CHILD_PRIMARY) {
1447        assert(!bs->file);
1448        bs->file = child;
1449    }
1450}
1451
1452static void GRAPH_WRLOCK bdrv_child_cb_detach(BdrvChild *child)
1453{
1454    BlockDriverState *bs = child->opaque;
1455
1456    if (child->role & BDRV_CHILD_COW) {
1457        bdrv_backing_detach(child);
1458    }
1459
1460    assert_bdrv_graph_writable();
1461    QLIST_REMOVE(child, next);
1462    if (child == bs->backing) {
1463        assert(child != bs->file);
1464        bs->backing = NULL;
1465    } else if (child == bs->file) {
1466        bs->file = NULL;
1467    }
1468}
1469
1470static int bdrv_child_cb_update_filename(BdrvChild *c, BlockDriverState *base,
1471                                         const char *filename, Error **errp)
1472{
1473    if (c->role & BDRV_CHILD_COW) {
1474        return bdrv_backing_update_filename(c, base, filename, errp);
1475    }
1476    return 0;
1477}
1478
1479AioContext *child_of_bds_get_parent_aio_context(BdrvChild *c)
1480{
1481    BlockDriverState *bs = c->opaque;
1482    IO_CODE();
1483
1484    return bdrv_get_aio_context(bs);
1485}
1486
1487const BdrvChildClass child_of_bds = {
1488    .parent_is_bds   = true,
1489    .get_parent_desc = bdrv_child_get_parent_desc,
1490    .inherit_options = bdrv_inherited_options,
1491    .drained_begin   = bdrv_child_cb_drained_begin,
1492    .drained_poll    = bdrv_child_cb_drained_poll,
1493    .drained_end     = bdrv_child_cb_drained_end,
1494    .attach          = bdrv_child_cb_attach,
1495    .detach          = bdrv_child_cb_detach,
1496    .inactivate      = bdrv_child_cb_inactivate,
1497    .change_aio_ctx  = bdrv_child_cb_change_aio_ctx,
1498    .update_filename = bdrv_child_cb_update_filename,
1499    .get_parent_aio_context = child_of_bds_get_parent_aio_context,
1500};
1501
1502AioContext *bdrv_child_get_parent_aio_context(BdrvChild *c)
1503{
1504    IO_CODE();
1505    return c->klass->get_parent_aio_context(c);
1506}
1507
1508static int bdrv_open_flags(BlockDriverState *bs, int flags)
1509{
1510    int open_flags = flags;
1511    GLOBAL_STATE_CODE();
1512
1513    /*
1514     * Clear flags that are internal to the block layer before opening the
1515     * image.
1516     */
1517    open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_PROTOCOL);
1518
1519    return open_flags;
1520}
1521
1522static void update_flags_from_options(int *flags, QemuOpts *opts)
1523{
1524    GLOBAL_STATE_CODE();
1525
1526    *flags &= ~(BDRV_O_CACHE_MASK | BDRV_O_RDWR | BDRV_O_AUTO_RDONLY);
1527
1528    if (qemu_opt_get_bool_del(opts, BDRV_OPT_CACHE_NO_FLUSH, false)) {
1529        *flags |= BDRV_O_NO_FLUSH;
1530    }
1531
1532    if (qemu_opt_get_bool_del(opts, BDRV_OPT_CACHE_DIRECT, false)) {
1533        *flags |= BDRV_O_NOCACHE;
1534    }
1535
1536    if (!qemu_opt_get_bool_del(opts, BDRV_OPT_READ_ONLY, false)) {
1537        *flags |= BDRV_O_RDWR;
1538    }
1539
1540    if (qemu_opt_get_bool_del(opts, BDRV_OPT_AUTO_READ_ONLY, false)) {
1541        *flags |= BDRV_O_AUTO_RDONLY;
1542    }
1543}
1544
1545static void update_options_from_flags(QDict *options, int flags)
1546{
1547    GLOBAL_STATE_CODE();
1548    if (!qdict_haskey(options, BDRV_OPT_CACHE_DIRECT)) {
1549        qdict_put_bool(options, BDRV_OPT_CACHE_DIRECT, flags & BDRV_O_NOCACHE);
1550    }
1551    if (!qdict_haskey(options, BDRV_OPT_CACHE_NO_FLUSH)) {
1552        qdict_put_bool(options, BDRV_OPT_CACHE_NO_FLUSH,
1553                       flags & BDRV_O_NO_FLUSH);
1554    }
1555    if (!qdict_haskey(options, BDRV_OPT_READ_ONLY)) {
1556        qdict_put_bool(options, BDRV_OPT_READ_ONLY, !(flags & BDRV_O_RDWR));
1557    }
1558    if (!qdict_haskey(options, BDRV_OPT_AUTO_READ_ONLY)) {
1559        qdict_put_bool(options, BDRV_OPT_AUTO_READ_ONLY,
1560                       flags & BDRV_O_AUTO_RDONLY);
1561    }
1562}
1563
1564static void bdrv_assign_node_name(BlockDriverState *bs,
1565                                  const char *node_name,
1566                                  Error **errp)
1567{
1568    char *gen_node_name = NULL;
1569    GLOBAL_STATE_CODE();
1570
1571    if (!node_name) {
1572        node_name = gen_node_name = id_generate(ID_BLOCK);
1573    } else if (!id_wellformed(node_name)) {
1574        /*
1575         * Check for empty string or invalid characters, but not if it is
1576         * generated (generated names use characters not available to the user)
1577         */
1578        error_setg(errp, "Invalid node-name: '%s'", node_name);
1579        return;
1580    }
1581
1582    /* takes care of avoiding namespaces collisions */
1583    if (blk_by_name(node_name)) {
1584        error_setg(errp, "node-name=%s is conflicting with a device id",
1585                   node_name);
1586        goto out;
1587    }
1588
1589    /* takes care of avoiding duplicates node names */
1590    if (bdrv_find_node(node_name)) {
1591        error_setg(errp, "Duplicate nodes with node-name='%s'", node_name);
1592        goto out;
1593    }
1594
1595    /* Make sure that the node name isn't truncated */
1596    if (strlen(node_name) >= sizeof(bs->node_name)) {
1597        error_setg(errp, "Node name too long");
1598        goto out;
1599    }
1600
1601    /* copy node name into the bs and insert it into the graph list */
1602    pstrcpy(bs->node_name, sizeof(bs->node_name), node_name);
1603    QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs, node_list);
1604out:
1605    g_free(gen_node_name);
1606}
1607
/*
 * Assign a node name to @bs, attach driver @drv to it and call the driver's
 * open callback (bdrv_file_open if present, else bdrv_open, else nothing).
 * Afterwards the total sector count and the block limits are refreshed.
 *
 * The caller must always hold @bs AioContext lock, because this function calls
 * bdrv_refresh_total_sectors() which polls when called from non-coroutine
 * context.
 *
 * Returns 0 on success and a negative errno value with @errp set on
 * failure.  If the driver's open callback fails, bs->drv, bs->file and
 * bs->opaque are reset before returning.  Failures detected after a
 * successful open callback return without this cleanup.
 * NOTE(review): presumably the caller is expected to close @bs in that
 * case - confirm against the callers.
 */
static int no_coroutine_fn GRAPH_UNLOCKED
bdrv_open_driver(BlockDriverState *bs, BlockDriver *drv, const char *node_name,
                 QDict *options, int open_flags, Error **errp)
{
    AioContext *ctx;
    Error *local_err = NULL;
    int i, ret;
    GLOBAL_STATE_CODE();

    /* Nothing has been set up yet, so a plain return is enough on failure */
    bdrv_assign_node_name(bs, node_name, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        return -EINVAL;
    }

    bs->drv = drv;
    /* Zero-initialised driver state of drv->instance_size bytes */
    bs->opaque = g_malloc0(drv->instance_size);

    if (drv->bdrv_file_open) {
        assert(!drv->bdrv_needs_filename || bs->filename[0]);
        ret = drv->bdrv_file_open(bs, options, open_flags, &local_err);
    } else if (drv->bdrv_open) {
        ret = drv->bdrv_open(bs, options, open_flags, &local_err);
    } else {
        ret = 0;
    }

    if (ret < 0) {
        /* Prefer the driver's own error; otherwise build a generic one */
        if (local_err) {
            error_propagate(errp, local_err);
        } else if (bs->filename[0]) {
            error_setg_errno(errp, -ret, "Could not open '%s'", bs->filename);
        } else {
            error_setg_errno(errp, -ret, "Could not open image");
        }
        goto open_failed;
    }

    /* The driver may only advertise request flags within BDRV_REQ_MASK */
    assert(!(bs->supported_read_flags & ~BDRV_REQ_MASK));
    assert(!(bs->supported_write_flags & ~BDRV_REQ_MASK));

    /*
     * Always allow the BDRV_REQ_REGISTERED_BUF optimization hint. This saves
     * drivers that pass read/write requests through to a child the trouble of
     * declaring support explicitly.
     *
     * Drivers must not propagate this flag accidentally when they initiate I/O
     * to a bounce buffer. That case should be rare though.
     */
    bs->supported_read_flags |= BDRV_REQ_REGISTERED_BUF;
    bs->supported_write_flags |= BDRV_REQ_REGISTERED_BUF;

    /* Get the context after .bdrv_open, it can change the context */
    ctx = bdrv_get_aio_context(bs);
    aio_context_acquire(ctx);

    ret = bdrv_refresh_total_sectors(bs, bs->total_sectors);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not refresh total sector count");
        /* Direct return: the open_failed cleanup is skipped on this path */
        aio_context_release(ctx);
        return ret;
    }

    /* bdrv_refresh_limits() is called with the graph read lock held */
    bdrv_graph_rdlock_main_loop();
    bdrv_refresh_limits(bs, NULL, &local_err);
    bdrv_graph_rdunlock_main_loop();
    aio_context_release(ctx);

    if (local_err) {
        /* Direct return here as well, without the open_failed cleanup */
        error_propagate(errp, local_err);
        return -EINVAL;
    }

    assert(bdrv_opt_mem_align(bs) != 0);
    assert(bdrv_min_mem_align(bs) != 0);
    assert(is_power_of_2(bs->bl.request_alignment));

    /*
     * Call the driver's bdrv_drain_begin() once for every level of
     * bs->quiesce_counter.
     */
    for (i = 0; i < bs->quiesce_counter; i++) {
        if (drv->bdrv_drain_begin) {
            drv->bdrv_drain_begin(bs);
        }
    }

    return 0;
open_failed:
    /* Undo the driver attachment done above */
    bs->drv = NULL;
    if (bs->file != NULL) {
        bdrv_unref_child(bs, bs->file);
        assert(!bs->file);
    }
    g_free(bs->opaque);
    bs->opaque = NULL;
    return ret;
}
1707
1708/*
1709 * Create and open a block node.
1710 *
1711 * @options is a QDict of options to pass to the block drivers, or NULL for an
1712 * empty set of options. The reference to the QDict belongs to the block layer
1713 * after the call (even on failure), so if the caller intends to reuse the
1714 * dictionary, it needs to use qobject_ref() before calling bdrv_open.
1715 */
1716BlockDriverState *bdrv_new_open_driver_opts(BlockDriver *drv,
1717                                            const char *node_name,
1718                                            QDict *options, int flags,
1719                                            Error **errp)
1720{
1721    BlockDriverState *bs;
1722    int ret;
1723
1724    GLOBAL_STATE_CODE();
1725
1726    bs = bdrv_new();
1727    bs->open_flags = flags;
1728    bs->options = options ?: qdict_new();
1729    bs->explicit_options = qdict_clone_shallow(bs->options);
1730    bs->opaque = NULL;
1731
1732    update_options_from_flags(bs->options, flags);
1733
1734    ret = bdrv_open_driver(bs, drv, node_name, bs->options, flags, errp);
1735    if (ret < 0) {
1736        qobject_unref(bs->explicit_options);
1737        bs->explicit_options = NULL;
1738        qobject_unref(bs->options);
1739        bs->options = NULL;
1740        bdrv_unref(bs);
1741        return NULL;
1742    }
1743
1744    return bs;
1745}
1746
1747/* Create and open a block node. */
1748BlockDriverState *bdrv_new_open_driver(BlockDriver *drv, const char *node_name,
1749                                       int flags, Error **errp)
1750{
1751    GLOBAL_STATE_CODE();
1752    return bdrv_new_open_driver_opts(drv, node_name, NULL, flags, errp);
1753}
1754
/*
 * Generic runtime options handled by the block layer itself.
 * bdrv_open_common() creates a QemuOpts from this list and absorbs the
 * matching keys from the options QDict.
 */
QemuOptsList bdrv_runtime_opts = {
    .name = "bdrv_common",
    .head = QTAILQ_HEAD_INITIALIZER(bdrv_runtime_opts.head),
    .desc = {
        /* Node identity and driver selection */
        {
            .name = "node-name",
            .type = QEMU_OPT_STRING,
            .help = "Node name of the block device node",
        },
        {
            .name = "driver",
            .type = QEMU_OPT_STRING,
            .help = "Block driver to use for the node",
        },
        /* Cache and read-only options, see update_flags_from_options() */
        {
            .name = BDRV_OPT_CACHE_DIRECT,
            .type = QEMU_OPT_BOOL,
            .help = "Bypass software writeback cache on the host",
        },
        {
            .name = BDRV_OPT_CACHE_NO_FLUSH,
            .type = QEMU_OPT_BOOL,
            .help = "Ignore flush requests",
        },
        {
            .name = BDRV_OPT_READ_ONLY,
            .type = QEMU_OPT_BOOL,
            .help = "Node is opened in read-only mode",
        },
        {
            .name = BDRV_OPT_AUTO_READ_ONLY,
            .type = QEMU_OPT_BOOL,
            .help = "Node can become read-only if opening read-write fails",
        },
        /* Parsed by bdrv_parse_detect_zeroes() */
        {
            .name = "detect-zeroes",
            .type = QEMU_OPT_STRING,
            .help = "try to optimize zero writes (off, on, unmap)",
        },
        /* Parsed by bdrv_parse_discard_flags() */
        {
            .name = BDRV_OPT_DISCARD,
            .type = QEMU_OPT_STRING,
            .help = "discard operation (ignore/off, unmap/on)",
        },
        {
            .name = BDRV_OPT_FORCE_SHARE,
            .type = QEMU_OPT_BOOL,
            .help = "always accept other writers (default: off)",
        },
        { /* end of list */ }
    },
};
1807
/*
 * Minimal set of image creation options: the virtual disk size and a
 * preallocation mode (only "off" is listed as an allowed value).
 */
QemuOptsList bdrv_create_opts_simple = {
    .name = "simple-create-opts",
    .head = QTAILQ_HEAD_INITIALIZER(bdrv_create_opts_simple.head),
    .desc = {
        {
            .name = BLOCK_OPT_SIZE,
            .type = QEMU_OPT_SIZE,
            .help = "Virtual disk size"
        },
        {
            .name = BLOCK_OPT_PREALLOC,
            .type = QEMU_OPT_STRING,
            .help = "Preallocation mode (allowed values: off)"
        },
        { /* end of list */ }
    }
};
1825
/*
 * Common part for opening disk images and files
 *
 * Removes all processed options from *options.
 *
 * @bs       node to open; it must not have a bs->file child yet.
 * @file     optional BlockBackend whose node supplies the filename; if NULL
 *           the "filename" entry of @options is used instead.
 * @options  options QDict; must be non-NULL and distinct from bs->options.
 *           The "driver" entry must name an existing format driver (this is
 *           asserted, not reported as an error).
 *
 * Returns 0 on success, a negative errno value with @errp set on failure.
 */
static int bdrv_open_common(BlockDriverState *bs, BlockBackend *file,
                            QDict *options, Error **errp)
{
    int ret, open_flags;
    const char *filename;
    const char *driver_name = NULL;
    const char *node_name = NULL;
    const char *discard;
    QemuOpts *opts;
    BlockDriver *drv;
    Error *local_err = NULL;
    bool ro;

    assert(bs->file == NULL);
    assert(options != NULL && bs->options != options);
    GLOBAL_STATE_CODE();

    /* Move the generic runtime options from @options into @opts */
    opts = qemu_opts_create(&bdrv_runtime_opts, NULL, 0, &error_abort);
    if (!qemu_opts_absorb_qdict(opts, options, errp)) {
        ret = -EINVAL;
        goto fail_opts;
    }

    /* Derive cache and read-only flags from the absorbed options */
    update_flags_from_options(&bs->open_flags, opts);

    driver_name = qemu_opt_get(opts, "driver");
    drv = bdrv_find_format(driver_name);
    assert(drv != NULL);

    bs->force_share = qemu_opt_get_bool(opts, BDRV_OPT_FORCE_SHARE, false);

    if (bs->force_share && (bs->open_flags & BDRV_O_RDWR)) {
        error_setg(errp,
                   BDRV_OPT_FORCE_SHARE
                   "=on can only be used with read-only images");
        ret = -EINVAL;
        goto fail_opts;
    }

    if (file != NULL) {
        bdrv_refresh_filename(blk_bs(file));
        filename = blk_bs(file)->filename;
    } else {
        /*
         * Caution: while qdict_get_try_str() is fine, getting
         * non-string types would require more care.  When @options
         * come from -blockdev or blockdev_add, its members are typed
         * according to the QAPI schema, but when they come from
         * -drive, they're all QString.
         */
        filename = qdict_get_try_str(options, "filename");
    }

    if (drv->bdrv_needs_filename && (!filename || !filename[0])) {
        error_setg(errp, "The '%s' block driver requires a file name",
                   drv->format_name);
        ret = -EINVAL;
        goto fail_opts;
    }

    trace_bdrv_open_common(bs, filename ?: "", bs->open_flags,
                           drv->format_name);

    ro = bdrv_is_read_only(bs);

    /*
     * Whitelist check: if the driver is not whitelisted for the requested
     * access mode but is whitelisted for read-only use, try
     * bdrv_apply_auto_read_only() on a read-write node; otherwise fail
     * with -ENOTSUP.
     */
    if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, ro)) {
        if (!ro && bdrv_is_whitelisted(drv, true)) {
            ret = bdrv_apply_auto_read_only(bs, NULL, NULL);
        } else {
            ret = -ENOTSUP;
        }
        if (ret < 0) {
            error_setg(errp,
                       !ro && bdrv_is_whitelisted(drv, true)
                       ? "Driver '%s' can only be used for read-only devices"
                       : "Driver '%s' is not whitelisted",
                       drv->format_name);
            goto fail_opts;
        }
    }

    /* bdrv_new() and bdrv_close() make it so */
    assert(qatomic_read(&bs->copy_on_read) == 0);

    /* The check uses the value of @ro sampled before the whitelist handling */
    if (bs->open_flags & BDRV_O_COPY_ON_READ) {
        if (!ro) {
            bdrv_enable_copy_on_read(bs);
        } else {
            error_setg(errp, "Can't use copy-on-read on read-only device");
            ret = -EINVAL;
            goto fail_opts;
        }
    }

    /* An absent discard option leaves BDRV_O_UNMAP in open_flags untouched */
    discard = qemu_opt_get(opts, BDRV_OPT_DISCARD);
    if (discard != NULL) {
        if (bdrv_parse_discard_flags(discard, &bs->open_flags) != 0) {
            error_setg(errp, "Invalid discard option");
            ret = -EINVAL;
            goto fail_opts;
        }
    }

    /* Must come after the discard parsing: "unmap" requires BDRV_O_UNMAP */
    bs->detect_zeroes =
        bdrv_parse_detect_zeroes(opts, bs->open_flags, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        ret = -EINVAL;
        goto fail_opts;
    }

    if (filename != NULL) {
        pstrcpy(bs->filename, sizeof(bs->filename), filename);
    } else {
        bs->filename[0] = '\0';
    }
    pstrcpy(bs->exact_filename, sizeof(bs->exact_filename), bs->filename);

    /* Open the image, either directly or using a protocol */
    open_flags = bdrv_open_flags(bs, bs->open_flags);
    node_name = qemu_opt_get(opts, "node-name");

    /* A driver with bdrv_file_open must not be given a @file */
    assert(!drv->bdrv_file_open || file == NULL);
    ret = bdrv_open_driver(bs, drv, node_name, options, open_flags, errp);
    if (ret < 0) {
        goto fail_opts;
    }

    qemu_opts_del(opts);
    return 0;

fail_opts:
    /* @opts is released on every exit path */
    qemu_opts_del(opts);
    return ret;
}
1966
1967static QDict *parse_json_filename(const char *filename, Error **errp)
1968{
1969    QObject *options_obj;
1970    QDict *options;
1971    int ret;
1972    GLOBAL_STATE_CODE();
1973
1974    ret = strstart(filename, "json:", &filename);
1975    assert(ret);
1976
1977    options_obj = qobject_from_json(filename, errp);
1978    if (!options_obj) {
1979        error_prepend(errp, "Could not parse the JSON options: ");
1980        return NULL;
1981    }
1982
1983    options = qobject_to(QDict, options_obj);
1984    if (!options) {
1985        qobject_unref(options_obj);
1986        error_setg(errp, "Invalid JSON object given");
1987        return NULL;
1988    }
1989
1990    qdict_flatten(options);
1991
1992    return options;
1993}
1994
1995static void parse_json_protocol(QDict *options, const char **pfilename,
1996                                Error **errp)
1997{
1998    QDict *json_options;
1999    Error *local_err = NULL;
2000    GLOBAL_STATE_CODE();
2001
2002    /* Parse json: pseudo-protocol */
2003    if (!*pfilename || !g_str_has_prefix(*pfilename, "json:")) {
2004        return;
2005    }
2006
2007    json_options = parse_json_filename(*pfilename, &local_err);
2008    if (local_err) {
2009        error_propagate(errp, local_err);
2010        return;
2011    }
2012
2013    /* Options given in the filename have lower priority than options
2014     * specified directly */
2015    qdict_join(options, json_options, false);
2016    qobject_unref(json_options);
2017    *pfilename = NULL;
2018}
2019
2020/*
2021 * Fills in default options for opening images and converts the legacy
2022 * filename/flags pair to option QDict entries.
2023 * The BDRV_O_PROTOCOL flag in *flags will be set or cleared accordingly if a
2024 * block driver has been specified explicitly.
2025 */
2026static int bdrv_fill_options(QDict **options, const char *filename,
2027                             int *flags, Error **errp)
2028{
2029    const char *drvname;
2030    bool protocol = *flags & BDRV_O_PROTOCOL;
2031    bool parse_filename = false;
2032    BlockDriver *drv = NULL;
2033    Error *local_err = NULL;
2034
2035    GLOBAL_STATE_CODE();
2036
2037    /*
2038     * Caution: while qdict_get_try_str() is fine, getting non-string
2039     * types would require more care.  When @options come from
2040     * -blockdev or blockdev_add, its members are typed according to
2041     * the QAPI schema, but when they come from -drive, they're all
2042     * QString.
2043     */
2044    drvname = qdict_get_try_str(*options, "driver");
2045    if (drvname) {
2046        drv = bdrv_find_format(drvname);
2047        if (!drv) {
2048            error_setg(errp, "Unknown driver '%s'", drvname);
2049            return -ENOENT;
2050        }
2051        /* If the user has explicitly specified the driver, this choice should
2052         * override the BDRV_O_PROTOCOL flag */
2053        protocol = drv->bdrv_file_open;
2054    }
2055
2056    if (protocol) {
2057        *flags |= BDRV_O_PROTOCOL;
2058    } else {
2059        *flags &= ~BDRV_O_PROTOCOL;
2060    }
2061
2062    /* Translate cache options from flags into options */
2063    update_options_from_flags(*options, *flags);
2064
2065    /* Fetch the file name from the options QDict if necessary */
2066    if (protocol && filename) {
2067        if (!qdict_haskey(*options, "filename")) {
2068            qdict_put_str(*options, "filename", filename);
2069            parse_filename = true;
2070        } else {
2071            error_setg(errp, "Can't specify 'file' and 'filename' options at "
2072                             "the same time");
2073            return -EINVAL;
2074        }
2075    }
2076
2077    /* Find the right block driver */
2078    /* See cautionary note on accessing @options above */
2079    filename = qdict_get_try_str(*options, "filename");
2080
2081    if (!drvname && protocol) {
2082        if (filename) {
2083            drv = bdrv_find_protocol(filename, parse_filename, errp);
2084            if (!drv) {
2085                return -EINVAL;
2086            }
2087
2088            drvname = drv->format_name;
2089            qdict_put_str(*options, "driver", drvname);
2090        } else {
2091            error_setg(errp, "Must specify either driver or file");
2092            return -EINVAL;
2093        }
2094    }
2095
2096    assert(drv || !protocol);
2097
2098    /* Driver-specific filename parsing */
2099    if (drv && drv->bdrv_parse_filename && parse_filename) {
2100        drv->bdrv_parse_filename(filename, *options, &local_err);
2101        if (local_err) {
2102            error_propagate(errp, local_err);
2103            return -EINVAL;
2104        }
2105
2106        if (!drv->bdrv_needs_filename) {
2107            qdict_del(*options, "filename");
2108        }
2109    }
2110
2111    return 0;
2112}
2113
/*
 * Element of a BlockReopenQueue. The queue is walked with QTAILQ_FOREACH()
 * through the @entry link (see bdrv_reopen_get_flags()).
 */
typedef struct BlockReopenQueueEntry {
     /* NOTE(review): presumably set once this entry was prepared - confirm */
     bool prepared;
     /* NOTE(review): presumably set once permissions were checked - confirm */
     bool perms_checked;
     /* state.bs is the node; state.flags are its flags after the reopen */
     BDRVReopenState state;
     /* Queue linkage */
     QTAILQ_ENTRY(BlockReopenQueueEntry) entry;
} BlockReopenQueueEntry;
2120
2121/*
2122 * Return the flags that @bs will have after the reopens in @q have
2123 * successfully completed. If @q is NULL (or @bs is not contained in @q),
2124 * return the current flags.
2125 */
2126static int bdrv_reopen_get_flags(BlockReopenQueue *q, BlockDriverState *bs)
2127{
2128    BlockReopenQueueEntry *entry;
2129
2130    if (q != NULL) {
2131        QTAILQ_FOREACH(entry, q, entry) {
2132            if (entry->state.bs == bs) {
2133                return entry->state.flags;
2134            }
2135        }
2136    }
2137
2138    return bs->open_flags;
2139}
2140
2141/* Returns whether the image file can be written to after the reopen queue @q
2142 * has been successfully applied, or right now if @q is NULL. */
2143static bool bdrv_is_writable_after_reopen(BlockDriverState *bs,
2144                                          BlockReopenQueue *q)
2145{
2146    int flags = bdrv_reopen_get_flags(q, bs);
2147
2148    return (flags & (BDRV_O_RDWR | BDRV_O_INACTIVE)) == BDRV_O_RDWR;
2149}
2150
2151/*
2152 * Return whether the BDS can be written to.  This is not necessarily
2153 * the same as !bdrv_is_read_only(bs), as inactivated images may not
2154 * be written to but do not count as read-only images.
2155 */
2156bool bdrv_is_writable(BlockDriverState *bs)
2157{
2158    IO_CODE();
2159    return bdrv_is_writable_after_reopen(bs, NULL);
2160}
2161
2162static char *bdrv_child_user_desc(BdrvChild *c)
2163{
2164    GLOBAL_STATE_CODE();
2165    return c->klass->get_parent_desc(c);
2166}
2167
2168/*
2169 * Check that @a allows everything that @b needs. @a and @b must reference same
2170 * child node.
2171 */
2172static bool bdrv_a_allow_b(BdrvChild *a, BdrvChild *b, Error **errp)
2173{
2174    const char *child_bs_name;
2175    g_autofree char *a_user = NULL;
2176    g_autofree char *b_user = NULL;
2177    g_autofree char *perms = NULL;
2178
2179    assert(a->bs);
2180    assert(a->bs == b->bs);
2181    GLOBAL_STATE_CODE();
2182
2183    if ((b->perm & a->shared_perm) == b->perm) {
2184        return true;
2185    }
2186
2187    child_bs_name = bdrv_get_node_name(b->bs);
2188    a_user = bdrv_child_user_desc(a);
2189    b_user = bdrv_child_user_desc(b);
2190    perms = bdrv_perm_names(b->perm & ~a->shared_perm);
2191
2192    error_setg(errp, "Permission conflict on node '%s': permissions '%s' are "
2193               "both required by %s (uses node '%s' as '%s' child) and "
2194               "unshared by %s (uses node '%s' as '%s' child).",
2195               child_bs_name, perms,
2196               b_user, child_bs_name, b->name,
2197               a_user, child_bs_name, a->name);
2198
2199    return false;
2200}
2201
2202static bool bdrv_parent_perms_conflict(BlockDriverState *bs, Error **errp)
2203{
2204    BdrvChild *a, *b;
2205    GLOBAL_STATE_CODE();
2206
2207    /*
2208     * During the loop we'll look at each pair twice. That's correct because
2209     * bdrv_a_allow_b() is asymmetric and we should check each pair in both
2210     * directions.
2211     */
2212    QLIST_FOREACH(a, &bs->parents, next_parent) {
2213        QLIST_FOREACH(b, &bs->parents, next_parent) {
2214            if (a == b) {
2215                continue;
2216            }
2217
2218            if (!bdrv_a_allow_b(a, b, errp)) {
2219                return true;
2220            }
2221        }
2222    }
2223
2224    return false;
2225}
2226
2227static void bdrv_child_perm(BlockDriverState *bs, BlockDriverState *child_bs,
2228                            BdrvChild *c, BdrvChildRole role,
2229                            BlockReopenQueue *reopen_queue,
2230                            uint64_t parent_perm, uint64_t parent_shared,
2231                            uint64_t *nperm, uint64_t *nshared)
2232{
2233    assert(bs->drv && bs->drv->bdrv_child_perm);
2234    GLOBAL_STATE_CODE();
2235    bs->drv->bdrv_child_perm(bs, c, role, reopen_queue,
2236                             parent_perm, parent_shared,
2237                             nperm, nshared);
2238    /* TODO Take force_share from reopen_queue */
2239    if (child_bs && child_bs->force_share) {
2240        *nshared = BLK_PERM_ALL;
2241    }
2242}
2243
2244/*
2245 * Adds the whole subtree of @bs (including @bs itself) to the @list (except for
2246 * nodes that are already in the @list, of course) so that final list is
2247 * topologically sorted. Return the result (GSList @list object is updated, so
2248 * don't use old reference after function call).
2249 *
2250 * On function start @list must be already topologically sorted and for any node
2251 * in the @list the whole subtree of the node must be in the @list as well. The
2252 * simplest way to satisfy this criteria: use only result of
2253 * bdrv_topological_dfs() or NULL as @list parameter.
2254 */
2255static GSList *bdrv_topological_dfs(GSList *list, GHashTable *found,
2256                                    BlockDriverState *bs)
2257{
2258    BdrvChild *child;
2259    g_autoptr(GHashTable) local_found = NULL;
2260
2261    GLOBAL_STATE_CODE();
2262
2263    if (!found) {
2264        assert(!list);
2265        found = local_found = g_hash_table_new(NULL, NULL);
2266    }
2267
2268    if (g_hash_table_contains(found, bs)) {
2269        return list;
2270    }
2271    g_hash_table_add(found, bs);
2272
2273    QLIST_FOREACH(child, &bs->children, next) {
2274        list = bdrv_topological_dfs(list, found, child->bs);
2275    }
2276
2277    return g_slist_prepend(list, bs);
2278}
2279
/*
 * Undo record created by bdrv_child_set_perm(): keeps the permissions a
 * child link had before the update so that bdrv_child_set_perm_abort() can
 * put them back.
 */
typedef struct BdrvChildSetPermState {
    /* Child link whose permissions were changed */
    BdrvChild *child;
    /* Value of child->perm before the change */
    uint64_t old_perm;
    /* Value of child->shared_perm before the change */
    uint64_t old_shared_perm;
} BdrvChildSetPermState;
2285
2286static void bdrv_child_set_perm_abort(void *opaque)
2287{
2288    BdrvChildSetPermState *s = opaque;
2289
2290    GLOBAL_STATE_CODE();
2291
2292    s->child->perm = s->old_perm;
2293    s->child->shared_perm = s->old_shared_perm;
2294}
2295
/*
 * Transaction action registered by bdrv_child_set_perm(). On abort the
 * previous permissions are restored; .clean is g_free(), matching the
 * g_new() allocation of the state object in bdrv_child_set_perm().
 *
 * NOTE(review): "pem" in the identifier looks like a typo for "perm".
 */
static TransactionActionDrv bdrv_child_set_pem_drv = {
    .abort = bdrv_child_set_perm_abort,
    .clean = g_free,
};
2300
2301static void bdrv_child_set_perm(BdrvChild *c, uint64_t perm,
2302                                uint64_t shared, Transaction *tran)
2303{
2304    BdrvChildSetPermState *s = g_new(BdrvChildSetPermState, 1);
2305    GLOBAL_STATE_CODE();
2306
2307    *s = (BdrvChildSetPermState) {
2308        .child = c,
2309        .old_perm = c->perm,
2310        .old_shared_perm = c->shared_perm,
2311    };
2312
2313    c->perm = perm;
2314    c->shared_perm = shared;
2315
2316    tran_add(tran, &bdrv_child_set_pem_drv, s);
2317}
2318
2319static void bdrv_drv_set_perm_commit(void *opaque)
2320{
2321    BlockDriverState *bs = opaque;
2322    uint64_t cumulative_perms, cumulative_shared_perms;
2323    GLOBAL_STATE_CODE();
2324
2325    if (bs->drv->bdrv_set_perm) {
2326        bdrv_get_cumulative_perm(bs, &cumulative_perms,
2327                                 &cumulative_shared_perms);
2328        bs->drv->bdrv_set_perm(bs, cumulative_perms, cumulative_shared_perms);
2329    }
2330}
2331
2332static void bdrv_drv_set_perm_abort(void *opaque)
2333{
2334    BlockDriverState *bs = opaque;
2335    GLOBAL_STATE_CODE();
2336
2337    if (bs->drv->bdrv_abort_perm_update) {
2338        bs->drv->bdrv_abort_perm_update(bs);
2339    }
2340}
2341
/*
 * Transaction action added by bdrv_drv_set_perm(). The opaque pointer is the
 * BlockDriverState itself; nothing is allocated for the action and no .clean
 * callback is set.
 */
TransactionActionDrv bdrv_drv_set_perm_drv = {
    .abort = bdrv_drv_set_perm_abort,
    .commit = bdrv_drv_set_perm_commit,
};
2346
2347static int bdrv_drv_set_perm(BlockDriverState *bs, uint64_t perm,
2348                             uint64_t shared_perm, Transaction *tran,
2349                             Error **errp)
2350{
2351    GLOBAL_STATE_CODE();
2352    if (!bs->drv) {
2353        return 0;
2354    }
2355
2356    if (bs->drv->bdrv_check_perm) {
2357        int ret = bs->drv->bdrv_check_perm(bs, perm, shared_perm, errp);
2358        if (ret < 0) {
2359            return ret;
2360        }
2361    }
2362
2363    if (tran) {
2364        tran_add(tran, &bdrv_drv_set_perm_drv, bs);
2365    }
2366
2367    return 0;
2368}
2369
/*
 * Undo record created by bdrv_replace_child_tran().
 */
typedef struct BdrvReplaceChildState {
    /* Child link that was pointed at a different node */
    BdrvChild *child;
    /*
     * Node @child pointed at before the replacement. The reference @child
     * held on it is kept here until the transaction commits or aborts.
     */
    BlockDriverState *old_bs;
} BdrvReplaceChildState;
2374
2375static void bdrv_replace_child_commit(void *opaque)
2376{
2377    BdrvReplaceChildState *s = opaque;
2378    GLOBAL_STATE_CODE();
2379
2380    bdrv_unref(s->old_bs);
2381}
2382
/*
 * Transaction abort callback for bdrv_replace_child_tran(): point the child
 * link back at the node it referenced before the replacement and drop the
 * reference that was taken on the replacement node.
 */
static void bdrv_replace_child_abort(void *opaque)
{
    BdrvReplaceChildState *s = opaque;
    /* Remember the replacement node; s->child->bs is changed further down */
    BlockDriverState *new_bs = s->child->bs;

    GLOBAL_STATE_CODE();
    /* old_bs reference is transparently moved from @s to @s->child */
    if (!s->child->bs) {
        /*
         * The parents were undrained when removing old_bs from the child. New
         * requests can't have been made, though, because the child was empty.
         *
         * TODO Make bdrv_replace_child_noperm() transactionable to avoid
         * undraining the parent in the first place. Once this is done, having
         * new_bs drained when calling bdrv_replace_child_tran() is not a
         * requirement any more.
         */
        bdrv_parent_drained_begin_single(s->child);
        assert(!bdrv_parent_drained_poll_single(s->child));
    }
    /* The parent must be quiesced through this child before re-attaching */
    assert(s->child->quiesced_parent);
    bdrv_replace_child_noperm(s->child, s->old_bs);
    /* Pairs with the bdrv_ref(new_bs) done in bdrv_replace_child_tran() */
    bdrv_unref(new_bs);
}
2407
/*
 * Transaction action registered by bdrv_replace_child_tran(). Commit drops
 * the reference to the old node, abort re-attaches the old node; .clean is
 * g_free(), matching the g_new() allocation of the state object.
 */
static TransactionActionDrv bdrv_replace_child_drv = {
    .commit = bdrv_replace_child_commit,
    .abort = bdrv_replace_child_abort,
    .clean = g_free,
};
2413
2414/*
2415 * bdrv_replace_child_tran
2416 *
2417 * Note: real unref of old_bs is done only on commit.
2418 *
2419 * Both @child->bs and @new_bs (if non-NULL) must be drained. @new_bs must be
2420 * kept drained until the transaction is completed.
2421 *
2422 * The function doesn't update permissions, caller is responsible for this.
2423 */
2424static void bdrv_replace_child_tran(BdrvChild *child, BlockDriverState *new_bs,
2425                                    Transaction *tran)
2426{
2427    BdrvReplaceChildState *s = g_new(BdrvReplaceChildState, 1);
2428
2429    assert(child->quiesced_parent);
2430    assert(!new_bs || new_bs->quiesce_counter);
2431
2432    *s = (BdrvReplaceChildState) {
2433        .child = child,
2434        .old_bs = child->bs,
2435    };
2436    tran_add(tran, &bdrv_replace_child_drv, s);
2437
2438    if (new_bs) {
2439        bdrv_ref(new_bs);
2440    }
2441    bdrv_replace_child_noperm(child, new_bs);
2442    /* old_bs reference is transparently moved from @child to @s */
2443}
2444
2445/*
2446 * Refresh permissions in @bs subtree. The function is intended to be called
2447 * after some graph modification that was done without permission update.
2448 */
2449static int bdrv_node_refresh_perm(BlockDriverState *bs, BlockReopenQueue *q,
2450                                  Transaction *tran, Error **errp)
2451{
2452    BlockDriver *drv = bs->drv;
2453    BdrvChild *c;
2454    int ret;
2455    uint64_t cumulative_perms, cumulative_shared_perms;
2456    GLOBAL_STATE_CODE();
2457
2458    bdrv_get_cumulative_perm(bs, &cumulative_perms, &cumulative_shared_perms);
2459
2460    /* Write permissions never work with read-only images */
2461    if ((cumulative_perms & (BLK_PERM_WRITE | BLK_PERM_WRITE_UNCHANGED)) &&
2462        !bdrv_is_writable_after_reopen(bs, q))
2463    {
2464        if (!bdrv_is_writable_after_reopen(bs, NULL)) {
2465            error_setg(errp, "Block node is read-only");
2466        } else {
2467            error_setg(errp, "Read-only block node '%s' cannot support "
2468                       "read-write users", bdrv_get_node_name(bs));
2469        }
2470
2471        return -EPERM;
2472    }
2473
2474    /*
2475     * Unaligned requests will automatically be aligned to bl.request_alignment
2476     * and without RESIZE we can't extend requests to write to space beyond the
2477     * end of the image, so it's required that the image size is aligned.
2478     */
2479    if ((cumulative_perms & (BLK_PERM_WRITE | BLK_PERM_WRITE_UNCHANGED)) &&
2480        !(cumulative_perms & BLK_PERM_RESIZE))
2481    {
2482        if ((bs->total_sectors * BDRV_SECTOR_SIZE) % bs->bl.request_alignment) {
2483            error_setg(errp, "Cannot get 'write' permission without 'resize': "
2484                             "Image size is not a multiple of request "
2485                             "alignment");
2486            return -EPERM;
2487        }
2488    }
2489
2490    /* Check this node */
2491    if (!drv) {
2492        return 0;
2493    }
2494
2495    ret = bdrv_drv_set_perm(bs, cumulative_perms, cumulative_shared_perms, tran,
2496                            errp);
2497    if (ret < 0) {
2498        return ret;
2499    }
2500
2501    /* Drivers that never have children can omit .bdrv_child_perm() */
2502    if (!drv->bdrv_child_perm) {
2503        assert(QLIST_EMPTY(&bs->children));
2504        return 0;
2505    }
2506
2507    /* Check all children */
2508    QLIST_FOREACH(c, &bs->children, next) {
2509        uint64_t cur_perm, cur_shared;
2510
2511        bdrv_child_perm(bs, c->bs, c, c->role, q,
2512                        cumulative_perms, cumulative_shared_perms,
2513                        &cur_perm, &cur_shared);
2514        bdrv_child_set_perm(c, cur_perm, cur_shared, tran);
2515    }
2516
2517    return 0;
2518}
2519
2520/*
2521 * @list is a product of bdrv_topological_dfs() (may be called several times) -
2522 * a topologically sorted subgraph.
2523 */
2524static int bdrv_do_refresh_perms(GSList *list, BlockReopenQueue *q,
2525                                 Transaction *tran, Error **errp)
2526{
2527    int ret;
2528    BlockDriverState *bs;
2529    GLOBAL_STATE_CODE();
2530
2531    for ( ; list; list = list->next) {
2532        bs = list->data;
2533
2534        if (bdrv_parent_perms_conflict(bs, errp)) {
2535            return -EINVAL;
2536        }
2537
2538        ret = bdrv_node_refresh_perm(bs, q, tran, errp);
2539        if (ret < 0) {
2540            return ret;
2541        }
2542    }
2543
2544    return 0;
2545}
2546
2547/*
2548 * @list is any list of nodes. List is completed by all subtrees and
2549 * topologically sorted. It's not a problem if some node occurs in the @list
2550 * several times.
2551 */
2552static int bdrv_list_refresh_perms(GSList *list, BlockReopenQueue *q,
2553                                   Transaction *tran, Error **errp)
2554{
2555    g_autoptr(GHashTable) found = g_hash_table_new(NULL, NULL);
2556    g_autoptr(GSList) refresh_list = NULL;
2557
2558    for ( ; list; list = list->next) {
2559        refresh_list = bdrv_topological_dfs(refresh_list, found, list->data);
2560    }
2561
2562    return bdrv_do_refresh_perms(refresh_list, q, tran, errp);
2563}
2564
2565void bdrv_get_cumulative_perm(BlockDriverState *bs, uint64_t *perm,
2566                              uint64_t *shared_perm)
2567{
2568    BdrvChild *c;
2569    uint64_t cumulative_perms = 0;
2570    uint64_t cumulative_shared_perms = BLK_PERM_ALL;
2571
2572    GLOBAL_STATE_CODE();
2573
2574    QLIST_FOREACH(c, &bs->parents, next_parent) {
2575        cumulative_perms |= c->perm;
2576        cumulative_shared_perms &= c->shared_perm;
2577    }
2578
2579    *perm = cumulative_perms;
2580    *shared_perm = cumulative_shared_perms;
2581}
2582
2583char *bdrv_perm_names(uint64_t perm)
2584{
2585    struct perm_name {
2586        uint64_t perm;
2587        const char *name;
2588    } permissions[] = {
2589        { BLK_PERM_CONSISTENT_READ, "consistent read" },
2590        { BLK_PERM_WRITE,           "write" },
2591        { BLK_PERM_WRITE_UNCHANGED, "write unchanged" },
2592        { BLK_PERM_RESIZE,          "resize" },
2593        { 0, NULL }
2594    };
2595
2596    GString *result = g_string_sized_new(30);
2597    struct perm_name *p;
2598
2599    for (p = permissions; p->name; p++) {
2600        if (perm & p->perm) {
2601            if (result->len > 0) {
2602                g_string_append(result, ", ");
2603            }
2604            g_string_append(result, p->name);
2605        }
2606    }
2607
2608    return g_string_free(result, FALSE);
2609}
2610
2611
2612/* @tran is allowed to be NULL. In this case no rollback is possible */
2613static int bdrv_refresh_perms(BlockDriverState *bs, Transaction *tran,
2614                              Error **errp)
2615{
2616    int ret;
2617    Transaction *local_tran = NULL;
2618    g_autoptr(GSList) list = bdrv_topological_dfs(NULL, NULL, bs);
2619    GLOBAL_STATE_CODE();
2620
2621    if (!tran) {
2622        tran = local_tran = tran_new();
2623    }
2624
2625    ret = bdrv_do_refresh_perms(list, NULL, tran, errp);
2626
2627    if (local_tran) {
2628        tran_finalize(local_tran, ret);
2629    }
2630
2631    return ret;
2632}
2633
2634int bdrv_child_try_set_perm(BdrvChild *c, uint64_t perm, uint64_t shared,
2635                            Error **errp)
2636{
2637    Error *local_err = NULL;
2638    Transaction *tran = tran_new();
2639    int ret;
2640
2641    GLOBAL_STATE_CODE();
2642
2643    bdrv_child_set_perm(c, perm, shared, tran);
2644
2645    ret = bdrv_refresh_perms(c->bs, tran, &local_err);
2646
2647    tran_finalize(tran, ret);
2648
2649    if (ret < 0) {
2650        if ((perm & ~c->perm) || (c->shared_perm & ~shared)) {
2651            /* tighten permissions */
2652            error_propagate(errp, local_err);
2653        } else {
2654            /*
2655             * Our caller may intend to only loosen restrictions and
2656             * does not expect this function to fail.  Errors are not
2657             * fatal in such a case, so we can just hide them from our
2658             * caller.
2659             */
2660            error_free(local_err);
2661            ret = 0;
2662        }
2663    }
2664
2665    return ret;
2666}
2667
2668int bdrv_child_refresh_perms(BlockDriverState *bs, BdrvChild *c, Error **errp)
2669{
2670    uint64_t parent_perms, parent_shared;
2671    uint64_t perms, shared;
2672
2673    GLOBAL_STATE_CODE();
2674
2675    bdrv_get_cumulative_perm(bs, &parent_perms, &parent_shared);
2676    bdrv_child_perm(bs, c->bs, c, c->role, NULL,
2677                    parent_perms, parent_shared, &perms, &shared);
2678
2679    return bdrv_child_try_set_perm(c, perms, shared, errp);
2680}
2681
2682/*
2683 * Default implementation for .bdrv_child_perm() for block filters:
2684 * Forward CONSISTENT_READ, WRITE, WRITE_UNCHANGED, and RESIZE to the
2685 * filtered child.
2686 */
2687static void bdrv_filter_default_perms(BlockDriverState *bs, BdrvChild *c,
2688                                      BdrvChildRole role,
2689                                      BlockReopenQueue *reopen_queue,
2690                                      uint64_t perm, uint64_t shared,
2691                                      uint64_t *nperm, uint64_t *nshared)
2692{
2693    GLOBAL_STATE_CODE();
2694    *nperm = perm & DEFAULT_PERM_PASSTHROUGH;
2695    *nshared = (shared & DEFAULT_PERM_PASSTHROUGH) | DEFAULT_PERM_UNCHANGED;
2696}
2697
2698static void bdrv_default_perms_for_cow(BlockDriverState *bs, BdrvChild *c,
2699                                       BdrvChildRole role,
2700                                       BlockReopenQueue *reopen_queue,
2701                                       uint64_t perm, uint64_t shared,
2702                                       uint64_t *nperm, uint64_t *nshared)
2703{
2704    assert(role & BDRV_CHILD_COW);
2705    GLOBAL_STATE_CODE();
2706
2707    /*
2708     * We want consistent read from backing files if the parent needs it.
2709     * No other operations are performed on backing files.
2710     */
2711    perm &= BLK_PERM_CONSISTENT_READ;
2712
2713    /*
2714     * If the parent can deal with changing data, we're okay with a
2715     * writable and resizable backing file.
2716     * TODO Require !(perm & BLK_PERM_CONSISTENT_READ), too?
2717     */
2718    if (shared & BLK_PERM_WRITE) {
2719        shared = BLK_PERM_WRITE | BLK_PERM_RESIZE;
2720    } else {
2721        shared = 0;
2722    }
2723
2724    shared |= BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE_UNCHANGED;
2725
2726    if (bs->open_flags & BDRV_O_INACTIVE) {
2727        shared |= BLK_PERM_WRITE | BLK_PERM_RESIZE;
2728    }
2729
2730    *nperm = perm;
2731    *nshared = shared;
2732}
2733
2734static void bdrv_default_perms_for_storage(BlockDriverState *bs, BdrvChild *c,
2735                                           BdrvChildRole role,
2736                                           BlockReopenQueue *reopen_queue,
2737                                           uint64_t perm, uint64_t shared,
2738                                           uint64_t *nperm, uint64_t *nshared)
2739{
2740    int flags;
2741
2742    GLOBAL_STATE_CODE();
2743    assert(role & (BDRV_CHILD_METADATA | BDRV_CHILD_DATA));
2744
2745    flags = bdrv_reopen_get_flags(reopen_queue, bs);
2746
2747    /*
2748     * Apart from the modifications below, the same permissions are
2749     * forwarded and left alone as for filters
2750     */
2751    bdrv_filter_default_perms(bs, c, role, reopen_queue,
2752                              perm, shared, &perm, &shared);
2753
2754    if (role & BDRV_CHILD_METADATA) {
2755        /* Format drivers may touch metadata even if the guest doesn't write */
2756        if (bdrv_is_writable_after_reopen(bs, reopen_queue)) {
2757            perm |= BLK_PERM_WRITE | BLK_PERM_RESIZE;
2758        }
2759
2760        /*
2761         * bs->file always needs to be consistent because of the
2762         * metadata. We can never allow other users to resize or write
2763         * to it.
2764         */
2765        if (!(flags & BDRV_O_NO_IO)) {
2766            perm |= BLK_PERM_CONSISTENT_READ;
2767        }
2768        shared &= ~(BLK_PERM_WRITE | BLK_PERM_RESIZE);
2769    }
2770
2771    if (role & BDRV_CHILD_DATA) {
2772        /*
2773         * Technically, everything in this block is a subset of the
2774         * BDRV_CHILD_METADATA path taken above, and so this could
2775         * be an "else if" branch.  However, that is not obvious, and
2776         * this function is not performance critical, therefore we let
2777         * this be an independent "if".
2778         */
2779
2780        /*
2781         * We cannot allow other users to resize the file because the
2782         * format driver might have some assumptions about the size
2783         * (e.g. because it is stored in metadata, or because the file
2784         * is split into fixed-size data files).
2785         */
2786        shared &= ~BLK_PERM_RESIZE;
2787
2788        /*
2789         * WRITE_UNCHANGED often cannot be performed as such on the
2790         * data file.  For example, the qcow2 driver may still need to
2791         * write copied clusters on copy-on-read.
2792         */
2793        if (perm & BLK_PERM_WRITE_UNCHANGED) {
2794            perm |= BLK_PERM_WRITE;
2795        }
2796
2797        /*
2798         * If the data file is written to, the format driver may
2799         * expect to be able to resize it by writing beyond the EOF.
2800         */
2801        if (perm & BLK_PERM_WRITE) {
2802            perm |= BLK_PERM_RESIZE;
2803        }
2804    }
2805
2806    if (bs->open_flags & BDRV_O_INACTIVE) {
2807        shared |= BLK_PERM_WRITE | BLK_PERM_RESIZE;
2808    }
2809
2810    *nperm = perm;
2811    *nshared = shared;
2812}
2813
2814void bdrv_default_perms(BlockDriverState *bs, BdrvChild *c,
2815                        BdrvChildRole role, BlockReopenQueue *reopen_queue,
2816                        uint64_t perm, uint64_t shared,
2817                        uint64_t *nperm, uint64_t *nshared)
2818{
2819    GLOBAL_STATE_CODE();
2820    if (role & BDRV_CHILD_FILTERED) {
2821        assert(!(role & (BDRV_CHILD_DATA | BDRV_CHILD_METADATA |
2822                         BDRV_CHILD_COW)));
2823        bdrv_filter_default_perms(bs, c, role, reopen_queue,
2824                                  perm, shared, nperm, nshared);
2825    } else if (role & BDRV_CHILD_COW) {
2826        assert(!(role & (BDRV_CHILD_DATA | BDRV_CHILD_METADATA)));
2827        bdrv_default_perms_for_cow(bs, c, role, reopen_queue,
2828                                   perm, shared, nperm, nshared);
2829    } else if (role & (BDRV_CHILD_METADATA | BDRV_CHILD_DATA)) {
2830        bdrv_default_perms_for_storage(bs, c, role, reopen_queue,
2831                                       perm, shared, nperm, nshared);
2832    } else {
2833        g_assert_not_reached();
2834    }
2835}
2836
2837uint64_t bdrv_qapi_perm_to_blk_perm(BlockPermission qapi_perm)
2838{
2839    static const uint64_t permissions[] = {
2840        [BLOCK_PERMISSION_CONSISTENT_READ]  = BLK_PERM_CONSISTENT_READ,
2841        [BLOCK_PERMISSION_WRITE]            = BLK_PERM_WRITE,
2842        [BLOCK_PERMISSION_WRITE_UNCHANGED]  = BLK_PERM_WRITE_UNCHANGED,
2843        [BLOCK_PERMISSION_RESIZE]           = BLK_PERM_RESIZE,
2844    };
2845
2846    QEMU_BUILD_BUG_ON(ARRAY_SIZE(permissions) != BLOCK_PERMISSION__MAX);
2847    QEMU_BUILD_BUG_ON(1UL << ARRAY_SIZE(permissions) != BLK_PERM_ALL + 1);
2848
2849    assert(qapi_perm < BLOCK_PERMISSION__MAX);
2850
2851    return permissions[qapi_perm];
2852}
2853
/*
 * Replaces the node that a BdrvChild points to without updating permissions.
 *
 * @child is unlinked from the parents list of its current node (if any) and
 * linked into the parents list of @new_bs (if non-NULL). The child class's
 * .detach() and .attach() callbacks are invoked when they are set. The graph
 * change happens between bdrv_graph_wrlock() and bdrv_graph_wrunlock().
 *
 * @child must not be frozen and @new_bs must differ from the current
 * child->bs. If both the old and the new node are non-NULL, they must be in
 * the same AioContext.
 *
 * If @new_bs is non-NULL, the parent of @child must already be drained through
 * @child and the caller must hold the AioContext lock for @new_bs.
 */
static void bdrv_replace_child_noperm(BdrvChild *child,
                                      BlockDriverState *new_bs)
{
    BlockDriverState *old_bs = child->bs;
    int new_bs_quiesce_counter;

    assert(!child->frozen);

    /*
     * If we want to change the BdrvChild to point to a drained node as its new
     * child->bs, we need to make sure that its new parent is drained, too. In
     * other words, either child->quiesced_parent must already be true or we
     * must be able to set it and keep the parent's quiesce_counter consistent
     * with that, but without polling or starting new requests (this function
     * guarantees that it doesn't poll, and starting new requests would be
     * against the invariants of drain sections).
     *
     * To keep things simple, we pick the first option (child->quiesced_parent
     * must already be true). We also generalise the rule a bit to make it
     * easier to verify in callers and more likely to be covered in test cases:
     * The parent must be quiesced through this child even if new_bs isn't
     * currently drained.
     *
     * The only exception is for callers that always pass new_bs == NULL. In
     * this case, we obviously never need to consider the case of a drained
     * new_bs, so we can keep the callers simpler by allowing them not to drain
     * the parent.
     */
    assert(!new_bs || child->quiesced_parent);
    assert(old_bs != new_bs);
    GLOBAL_STATE_CODE();

    if (old_bs && new_bs) {
        assert(bdrv_get_aio_context(old_bs) == bdrv_get_aio_context(new_bs));
    }

    /* TODO Pull this up into the callers to avoid polling here */
    bdrv_graph_wrlock(new_bs);
    if (old_bs) {
        /* Notify the child class before unlinking from the old node */
        if (child->klass->detach) {
            child->klass->detach(child);
        }
        QLIST_REMOVE(child, next_parent);
    }

    child->bs = new_bs;

    if (new_bs) {
        /* Link into the new node first, then notify the child class */
        QLIST_INSERT_HEAD(&new_bs->parents, child, next_parent);
        if (child->klass->attach) {
            child->klass->attach(child);
        }
    }
    bdrv_graph_wrunlock();

    /*
     * If the parent was drained through this BdrvChild previously, but new_bs
     * is not drained, allow requests to come in only after the new node has
     * been attached. A NULL new_bs counts as "not drained" here.
     */
    new_bs_quiesce_counter = (new_bs ? new_bs->quiesce_counter : 0);
    if (!new_bs_quiesce_counter && child->quiesced_parent) {
        bdrv_parent_drained_end_single(child);
    }
}
2925
2926/**
2927 * Free the given @child.
2928 *
2929 * The child must be empty (i.e. `child->bs == NULL`) and it must be
2930 * unused (i.e. not in a children list).
2931 */
2932static void bdrv_child_free(BdrvChild *child)
2933{
2934    assert(!child->bs);
2935    GLOBAL_STATE_CODE();
2936    assert(!child->next.le_prev); /* not in children list */
2937
2938    g_free(child->name);
2939    g_free(child);
2940}
2941
/*
 * Rollback state recorded by bdrv_attach_child_common() and consumed by
 * bdrv_attach_child_common_abort().
 */
typedef struct BdrvAttachChildCommonState {
    /* The BdrvChild that was created and attached */
    BdrvChild *child;
    /* AioContext of the parent as queried before any context change */
    AioContext *old_parent_ctx;
    /* AioContext of the child node on entry to the attach function */
    AioContext *old_child_ctx;
} BdrvAttachChildCommonState;
2947
/*
 * Transaction .abort handler for bdrv_attach_child_common().
 *
 * @opaque is the BdrvAttachChildCommonState recorded at attach time. The
 * handler detaches the child from its node, moves the node and the parent
 * back to their recorded AioContexts if they changed, drops one reference on
 * the node and frees the BdrvChild. The state struct itself is not freed here
 * (the driver's .clean callback is g_free).
 */
static void bdrv_attach_child_common_abort(void *opaque)
{
    BdrvAttachChildCommonState *s = opaque;
    /* Remember the node now; s->child->bs becomes NULL after the detach */
    BlockDriverState *bs = s->child->bs;

    GLOBAL_STATE_CODE();
    bdrv_replace_child_noperm(s->child, NULL);

    /* Restore the child node's AioContext; failure here aborts */
    if (bdrv_get_aio_context(bs) != s->old_child_ctx) {
        bdrv_try_change_aio_context(bs, s->old_child_ctx, NULL, &error_abort);
    }

    /* Restore the parent's AioContext through the child class callback */
    if (bdrv_child_get_parent_aio_context(s->child) != s->old_parent_ctx) {
        Transaction *tran;
        GHashTable *visited;
        bool ret;

        tran = tran_new();

        /* No need to visit `child`, because it has been detached already */
        visited = g_hash_table_new(NULL, NULL);
        ret = s->child->klass->change_aio_ctx(s->child, s->old_parent_ctx,
                                              visited, tran, &error_abort);
        g_hash_table_destroy(visited);

        /* transaction is supposed to always succeed */
        assert(ret == true);
        tran_commit(tran);
    }

    bdrv_unref(bs);
    bdrv_child_free(s->child);
}
2981
/*
 * Transaction action for an attached child: on abort the attachment is undone
 * by bdrv_attach_child_common_abort(); the state struct is released with
 * g_free() in the clean step. No commit callback is set.
 */
static TransactionActionDrv bdrv_attach_child_common_drv = {
    .abort = bdrv_attach_child_common_abort,
    .clean = g_free,
};
2986
2987/*
2988 * Common part of attaching bdrv child to bs or to blk or to job
2989 *
2990 * Function doesn't update permissions, caller is responsible for this.
2991 *
2992 * Returns new created child.
2993 *
2994 * The caller must hold the AioContext lock for @child_bs. Both @parent_bs and
2995 * @child_bs can move to a different AioContext in this function. Callers must
2996 * make sure that their AioContext locking is still correct after this.
2997 */
2998static BdrvChild *bdrv_attach_child_common(BlockDriverState *child_bs,
2999                                           const char *child_name,
3000                                           const BdrvChildClass *child_class,
3001                                           BdrvChildRole child_role,
3002                                           uint64_t perm, uint64_t shared_perm,
3003                                           void *opaque,
3004                                           Transaction *tran, Error **errp)
3005{
3006    BdrvChild *new_child;
3007    AioContext *parent_ctx, *new_child_ctx;
3008    AioContext *child_ctx = bdrv_get_aio_context(child_bs);
3009
3010    assert(child_class->get_parent_desc);
3011    GLOBAL_STATE_CODE();
3012
3013    new_child = g_new(BdrvChild, 1);
3014    *new_child = (BdrvChild) {
3015        .bs             = NULL,
3016        .name           = g_strdup(child_name),
3017        .klass          = child_class,
3018        .role           = child_role,
3019        .perm           = perm,
3020        .shared_perm    = shared_perm,
3021        .opaque         = opaque,
3022    };
3023
3024    /*
3025     * If the AioContexts don't match, first try to move the subtree of
3026     * child_bs into the AioContext of the new parent. If this doesn't work,
3027     * try moving the parent into the AioContext of child_bs instead.
3028     */
3029    parent_ctx = bdrv_child_get_parent_aio_context(new_child);
3030    if (child_ctx != parent_ctx) {
3031        Error *local_err = NULL;
3032        int ret = bdrv_try_change_aio_context(child_bs, parent_ctx, NULL,
3033                                              &local_err);
3034
3035        if (ret < 0 && child_class->change_aio_ctx) {
3036            Transaction *tran = tran_new();
3037            GHashTable *visited = g_hash_table_new(NULL, NULL);
3038            bool ret_child;
3039
3040            g_hash_table_add(visited, new_child);
3041            ret_child = child_class->change_aio_ctx(new_child, child_ctx,
3042                                                    visited, tran, NULL);
3043            if (ret_child == true) {
3044                error_free(local_err);
3045                ret = 0;
3046            }
3047            tran_finalize(tran, ret_child == true ? 0 : -1);
3048            g_hash_table_destroy(visited);
3049        }
3050
3051        if (ret < 0) {
3052            error_propagate(errp, local_err);
3053            bdrv_child_free(new_child);
3054            return NULL;
3055        }
3056    }
3057
3058    new_child_ctx = bdrv_get_aio_context(child_bs);
3059    if (new_child_ctx != child_ctx) {
3060        aio_context_release(child_ctx);
3061        aio_context_acquire(new_child_ctx);
3062    }
3063
3064    bdrv_ref(child_bs);
3065    /*
3066     * Let every new BdrvChild start with a drained parent. Inserting the child
3067     * in the graph with bdrv_replace_child_noperm() will undrain it if
3068     * @child_bs is not drained.
3069     *
3070     * The child was only just created and is not yet visible in global state
3071     * until bdrv_replace_child_noperm() inserts it into the graph, so nobody
3072     * could have sent requests and polling is not necessary.
3073     *
3074     * Note that this means that the parent isn't fully drained yet, we only
3075     * stop new requests from coming in. This is fine, we don't care about the
3076     * old requests here, they are not for this child. If another place enters a
3077     * drain section for the same parent, but wants it to be fully quiesced, it
3078     * will not run most of the the code in .drained_begin() again (which is not
3079     * a problem, we already did this), but it will still poll until the parent
3080     * is fully quiesced, so it will not be negatively affected either.
3081     */
3082    bdrv_parent_drained_begin_single(new_child);
3083    bdrv_replace_child_noperm(new_child, child_bs);
3084
3085    BdrvAttachChildCommonState *s = g_new(BdrvAttachChildCommonState, 1);
3086    *s = (BdrvAttachChildCommonState) {
3087        .child = new_child,
3088        .old_parent_ctx = parent_ctx,
3089        .old_child_ctx = child_ctx,
3090    };
3091    tran_add(tran, &bdrv_attach_child_common_drv, s);
3092
3093    if (new_child_ctx != child_ctx) {
3094        aio_context_release(new_child_ctx);
3095        aio_context_acquire(child_ctx);
3096    }
3097
3098    return new_child;
3099}
3100
3101/*
3102 * Function doesn't update permissions, caller is responsible for this.
3103 *
3104 * The caller must hold the AioContext lock for @child_bs. Both @parent_bs and
3105 * @child_bs can move to a different AioContext in this function. Callers must
3106 * make sure that their AioContext locking is still correct after this.
3107 */
3108static BdrvChild *bdrv_attach_child_noperm(BlockDriverState *parent_bs,
3109                                           BlockDriverState *child_bs,
3110                                           const char *child_name,
3111                                           const BdrvChildClass *child_class,
3112                                           BdrvChildRole child_role,
3113                                           Transaction *tran,
3114                                           Error **errp)
3115{
3116    uint64_t perm, shared_perm;
3117
3118    assert(parent_bs->drv);
3119    GLOBAL_STATE_CODE();
3120
3121    if (bdrv_recurse_has_child(child_bs, parent_bs)) {
3122        error_setg(errp, "Making '%s' a %s child of '%s' would create a cycle",
3123                   child_bs->node_name, child_name, parent_bs->node_name);
3124        return NULL;
3125    }
3126
3127    bdrv_get_cumulative_perm(parent_bs, &perm, &shared_perm);
3128    bdrv_child_perm(parent_bs, child_bs, NULL, child_role, NULL,
3129                    perm, shared_perm, &perm, &shared_perm);
3130
3131    return bdrv_attach_child_common(child_bs, child_name, child_class,
3132                                    child_role, perm, shared_perm, parent_bs,
3133                                    tran, errp);
3134}
3135
3136/*
3137 * This function steals the reference to child_bs from the caller.
3138 * That reference is later dropped by bdrv_root_unref_child().
3139 *
3140 * On failure NULL is returned, errp is set and the reference to
3141 * child_bs is also dropped.
3142 *
3143 * The caller must hold the AioContext lock @child_bs, but not that of @ctx
3144 * (unless @child_bs is already in @ctx).
3145 */
3146BdrvChild *bdrv_root_attach_child(BlockDriverState *child_bs,
3147                                  const char *child_name,
3148                                  const BdrvChildClass *child_class,
3149                                  BdrvChildRole child_role,
3150                                  uint64_t perm, uint64_t shared_perm,
3151                                  void *opaque, Error **errp)
3152{
3153    int ret;
3154    BdrvChild *child;
3155    Transaction *tran = tran_new();
3156
3157    GLOBAL_STATE_CODE();
3158
3159    child = bdrv_attach_child_common(child_bs, child_name, child_class,
3160                                   child_role, perm, shared_perm, opaque,
3161                                   tran, errp);
3162    if (!child) {
3163        ret = -EINVAL;
3164        goto out;
3165    }
3166
3167    ret = bdrv_refresh_perms(child_bs, tran, errp);
3168
3169out:
3170    tran_finalize(tran, ret);
3171
3172    bdrv_unref(child_bs);
3173
3174    return ret < 0 ? NULL : child;
3175}
3176
3177/*
3178 * This function transfers the reference to child_bs from the caller
3179 * to parent_bs. That reference is later dropped by parent_bs on
3180 * bdrv_close() or if someone calls bdrv_unref_child().
3181 *
3182 * On failure NULL is returned, errp is set and the reference to
3183 * child_bs is also dropped.
3184 *
3185 * If @parent_bs and @child_bs are in different AioContexts, the caller must
3186 * hold the AioContext lock for @child_bs, but not for @parent_bs.
3187 */
3188BdrvChild *bdrv_attach_child(BlockDriverState *parent_bs,
3189                             BlockDriverState *child_bs,
3190                             const char *child_name,
3191                             const BdrvChildClass *child_class,
3192                             BdrvChildRole child_role,
3193                             Error **errp)
3194{
3195    int ret;
3196    BdrvChild *child;
3197    Transaction *tran = tran_new();
3198
3199    GLOBAL_STATE_CODE();
3200
3201    child = bdrv_attach_child_noperm(parent_bs, child_bs, child_name,
3202                                     child_class, child_role, tran, errp);
3203    if (!child) {
3204        ret = -EINVAL;
3205        goto out;
3206    }
3207
3208    ret = bdrv_refresh_perms(parent_bs, tran, errp);
3209    if (ret < 0) {
3210        goto out;
3211    }
3212
3213out:
3214    tran_finalize(tran, ret);
3215
3216    bdrv_unref(child_bs);
3217
3218    return ret < 0 ? NULL : child;
3219}
3220
3221/* Callers must ensure that child->frozen is false. */
3222void bdrv_root_unref_child(BdrvChild *child)
3223{
3224    BlockDriverState *child_bs = child->bs;
3225
3226    GLOBAL_STATE_CODE();
3227    bdrv_replace_child_noperm(child, NULL);
3228    bdrv_child_free(child);
3229
3230    if (child_bs) {
3231        /*
3232         * Update permissions for old node. We're just taking a parent away, so
3233         * we're loosening restrictions. Errors of permission update are not
3234         * fatal in this case, ignore them.
3235         */
3236        bdrv_refresh_perms(child_bs, NULL, NULL);
3237
3238        /*
3239         * When the parent requiring a non-default AioContext is removed, the
3240         * node moves back to the main AioContext
3241         */
3242        bdrv_try_change_aio_context(child_bs, qemu_get_aio_context(), NULL,
3243                                    NULL);
3244    }
3245
3246    bdrv_unref(child_bs);
3247}
3248
/* Rollback state for a change of bs->inherits_from done within a transaction */
typedef struct BdrvSetInheritsFrom {
    /* Node whose inherits_from pointer was changed */
    BlockDriverState *bs;
    /* Value of bs->inherits_from before the change, restored on abort */
    BlockDriverState *old_inherits_from;
} BdrvSetInheritsFrom;
3253
3254static void bdrv_set_inherits_from_abort(void *opaque)
3255{
3256    BdrvSetInheritsFrom *s = opaque;
3257
3258    s->bs->inherits_from = s->old_inherits_from;
3259}
3260
/*
 * Transaction action for an inherits_from change: abort restores the old
 * pointer, clean releases the state struct with g_free(). No commit callback
 * is set.
 */
static TransactionActionDrv bdrv_set_inherits_from_drv = {
    .abort = bdrv_set_inherits_from_abort,
    .clean = g_free,
};
3265
3266/* @tran is allowed to be NULL. In this case no rollback is possible */
3267static void bdrv_set_inherits_from(BlockDriverState *bs,
3268                                   BlockDriverState *new_inherits_from,
3269                                   Transaction *tran)
3270{
3271    if (tran) {
3272        BdrvSetInheritsFrom *s = g_new(BdrvSetInheritsFrom, 1);
3273
3274        *s = (BdrvSetInheritsFrom) {
3275            .bs = bs,
3276            .old_inherits_from = bs->inherits_from,
3277        };
3278
3279        tran_add(tran, &bdrv_set_inherits_from_drv, s);
3280    }
3281
3282    bs->inherits_from = new_inherits_from;
3283}
3284
3285/**
3286 * Clear all inherits_from pointers from children and grandchildren of
3287 * @root that point to @root, where necessary.
3288 * @tran is allowed to be NULL. In this case no rollback is possible
3289 */
3290static void bdrv_unset_inherits_from(BlockDriverState *root, BdrvChild *child,
3291                                     Transaction *tran)
3292{
3293    BdrvChild *c;
3294
3295    if (child->bs->inherits_from == root) {
3296        /*
3297         * Remove inherits_from only when the last reference between root and
3298         * child->bs goes away.
3299         */
3300        QLIST_FOREACH(c, &root->children, next) {
3301            if (c != child && c->bs == child->bs) {
3302                break;
3303            }
3304        }
3305        if (c == NULL) {
3306            bdrv_set_inherits_from(child->bs, NULL, tran);
3307        }
3308    }
3309
3310    QLIST_FOREACH(c, &child->bs->children, next) {
3311        bdrv_unset_inherits_from(root, c, tran);
3312    }
3313}
3314
3315/* Callers must ensure that child->frozen is false. */
3316void bdrv_unref_child(BlockDriverState *parent, BdrvChild *child)
3317{
3318    GLOBAL_STATE_CODE();
3319    if (child == NULL) {
3320        return;
3321    }
3322
3323    bdrv_unset_inherits_from(parent, child, NULL);
3324    bdrv_root_unref_child(child);
3325}
3326
3327
3328static void bdrv_parent_cb_change_media(BlockDriverState *bs, bool load)
3329{
3330    BdrvChild *c;
3331    GLOBAL_STATE_CODE();
3332    QLIST_FOREACH(c, &bs->parents, next_parent) {
3333        if (c->klass->change_media) {
3334            c->klass->change_media(c, load);
3335        }
3336    }
3337}
3338
3339/* Return true if you can reach parent going through child->inherits_from
3340 * recursively. If parent or child are NULL, return false */
3341static bool bdrv_inherits_from_recursive(BlockDriverState *child,
3342                                         BlockDriverState *parent)
3343{
3344    while (child && child != parent) {
3345        child = child->inherits_from;
3346    }
3347
3348    return child != NULL;
3349}
3350
3351/*
3352 * Return the BdrvChildRole for @bs's backing child.  bs->backing is
3353 * mostly used for COW backing children (role = COW), but also for
3354 * filtered children (role = FILTERED | PRIMARY).
3355 */
3356static BdrvChildRole bdrv_backing_role(BlockDriverState *bs)
3357{
3358    if (bs->drv && bs->drv->is_filter) {
3359        return BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY;
3360    } else {
3361        return BDRV_CHILD_COW;
3362    }
3363}
3364
/*
 * Sets the bs->backing or bs->file link of a BDS. A new reference is created;
 * callers which don't need their own reference any more must call bdrv_unref().
 *
 * @is_backing selects the link: true for bs->backing, false for bs->file.
 * An existing child on that link is removed first; if @child_bs is NULL the
 * link is only removed. All changes are recorded in @tran.
 *
 * Function doesn't update permissions, caller is responsible for this.
 *
 * Returns 0 on success. Returns -EINVAL (with @errp set) if the parent has no
 * driver, if the driver does not support backing files, if a file child is
 * requested on a non-filter node that has none yet, or if attaching fails.
 * Returns -EPERM (with @errp set) if the existing link is frozen.
 *
 * The caller must hold the AioContext lock for @child_bs. Both @parent_bs and
 * @child_bs can move to a different AioContext in this function. Callers must
 * make sure that their AioContext locking is still correct after this.
 */
static int bdrv_set_file_or_backing_noperm(BlockDriverState *parent_bs,
                                           BlockDriverState *child_bs,
                                           bool is_backing,
                                           Transaction *tran, Error **errp)
{
    /* Evaluated up front, before the old child is removed below */
    bool update_inherits_from =
        bdrv_inherits_from_recursive(child_bs, parent_bs);
    BdrvChild *child = is_backing ? parent_bs->backing : parent_bs->file;
    BdrvChildRole role;

    GLOBAL_STATE_CODE();

    if (!parent_bs->drv) {
        /*
         * Node without drv is an object without a class :/. TODO: finally fix
         * qcow2 driver to never clear bs->drv and implement format corruption
         * handling in other way.
         */
        error_setg(errp, "Node corrupted");
        return -EINVAL;
    }

    if (child && child->frozen) {
        error_setg(errp, "Cannot change frozen '%s' link from '%s' to '%s'",
                   child->name, parent_bs->node_name, child->bs->node_name);
        return -EPERM;
    }

    if (is_backing && !parent_bs->drv->is_filter &&
        !parent_bs->drv->supports_backing)
    {
        error_setg(errp, "Driver '%s' of node '%s' does not support backing "
                   "files", parent_bs->drv->format_name, parent_bs->node_name);
        return -EINVAL;
    }

    /* Pick the role of the new child */
    if (parent_bs->drv->is_filter) {
        role = BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY;
    } else if (is_backing) {
        role = BDRV_CHILD_COW;
    } else {
        /*
         * We only can use same role as it is in existing child. We don't have
         * infrastructure to determine role of file child in generic way
         */
        if (!child) {
            error_setg(errp, "Cannot set file child to format node without "
                       "file child");
            return -EINVAL;
        }
        role = child->role;
    }

    /* Drop the current child of this link, if any */
    if (child) {
        bdrv_unset_inherits_from(parent_bs, child, tran);
        bdrv_remove_child(child, tran);
    }

    /* Removal only: nothing to attach */
    if (!child_bs) {
        goto out;
    }

    child = bdrv_attach_child_noperm(parent_bs, child_bs,
                                     is_backing ? "backing" : "file",
                                     &child_of_bds, role,
                                     tran, errp);
    if (!child) {
        return -EINVAL;
    }


    /*
     * If inherits_from pointed recursively to bs then let's update it to
     * point directly to bs (else it will become NULL).
     */
    if (update_inherits_from) {
        bdrv_set_inherits_from(child_bs, parent_bs, tran);
    }

out:
    /* No errp is passed, so a failure to refresh the limits is not reported */
    bdrv_graph_rdlock_main_loop();
    bdrv_refresh_limits(parent_bs, tran, NULL);
    bdrv_graph_rdunlock_main_loop();

    return 0;
}
3461
3462/*
3463 * The caller must hold the AioContext lock for @backing_hd. Both @bs and
3464 * @backing_hd can move to a different AioContext in this function. Callers must
3465 * make sure that their AioContext locking is still correct after this.
3466 */
3467static int bdrv_set_backing_noperm(BlockDriverState *bs,
3468                                   BlockDriverState *backing_hd,
3469                                   Transaction *tran, Error **errp)
3470{
3471    GLOBAL_STATE_CODE();
3472    return bdrv_set_file_or_backing_noperm(bs, backing_hd, true, tran, errp);
3473}
3474
3475int bdrv_set_backing_hd_drained(BlockDriverState *bs,
3476                                BlockDriverState *backing_hd,
3477                                Error **errp)
3478{
3479    int ret;
3480    Transaction *tran = tran_new();
3481
3482    GLOBAL_STATE_CODE();
3483    assert(bs->quiesce_counter > 0);
3484
3485    ret = bdrv_set_backing_noperm(bs, backing_hd, tran, errp);
3486    if (ret < 0) {
3487        goto out;
3488    }
3489
3490    ret = bdrv_refresh_perms(bs, tran, errp);
3491out:
3492    tran_finalize(tran, ret);
3493    return ret;
3494}
3495
3496int bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd,
3497                        Error **errp)
3498{
3499    int ret;
3500    GLOBAL_STATE_CODE();
3501
3502    bdrv_drained_begin(bs);
3503    ret = bdrv_set_backing_hd_drained(bs, backing_hd, errp);
3504    bdrv_drained_end(bs);
3505
3506    return ret;
3507}
3508
/*
 * Opens the backing file for a BlockDriverState if not yet open
 *
 * bdref_key specifies the key for the image's BlockdevRef in the options QDict.
 * That QDict has to be flattened; therefore, if the BlockdevRef is a QDict
 * itself, all options starting with "${bdref_key}." are considered part of the
 * BlockdevRef.
 *
 * @parent_options may be NULL, which is treated as an empty set of options.
 * On success the @bdref_key entry is removed from @parent_options.
 *
 * Returns 0 if the backing file was attached, if @bs already has a backing
 * child, or if there is no backing file to open. Returns a negative errno
 * with @errp set on failure.
 *
 * The caller must hold the main AioContext lock.
 *
 * TODO Can this be unified with bdrv_open_image()?
 */
int bdrv_open_backing_file(BlockDriverState *bs, QDict *parent_options,
                           const char *bdref_key, Error **errp)
{
    char *backing_filename = NULL;
    char *bdref_key_dot;
    const char *reference = NULL;
    int ret = 0;
    bool implicit_backing = false;
    BlockDriverState *backing_hd;
    AioContext *backing_hd_ctx;
    QDict *options;
    QDict *tmp_parent_options = NULL;
    Error *local_err = NULL;

    GLOBAL_STATE_CODE();

    /* Nothing to do if a backing child is already attached */
    if (bs->backing != NULL) {
        goto free_exit;
    }

    /* NULL means an empty set of options */
    if (parent_options == NULL) {
        tmp_parent_options = qdict_new();
        parent_options = tmp_parent_options;
    }

    bs->open_flags &= ~BDRV_O_NO_BACKING;

    /* Move all "${bdref_key}."-prefixed entries into their own QDict */
    bdref_key_dot = g_strdup_printf("%s.", bdref_key);
    qdict_extract_subqdict(parent_options, &options, bdref_key_dot);
    g_free(bdref_key_dot);

    /*
     * Caution: while qdict_get_try_str() is fine, getting non-string
     * types would require more care.  When @parent_options come from
     * -blockdev or blockdev_add, its members are typed according to
     * the QAPI schema, but when they come from -drive, they're all
     * QString.
     */
    reference = qdict_get_try_str(parent_options, bdref_key);
    if (reference || qdict_haskey(options, "file.filename")) {
        /* keep backing_filename NULL */
    } else if (bs->backing_file[0] == '\0' && qdict_size(options) == 0) {
        /* No backing file recorded and no options given: not an error */
        qobject_unref(options);
        goto free_exit;
    } else {
        if (qdict_size(options) == 0) {
            /* If the user specifies options that do not modify the
             * backing file's behavior, we might still consider it the
             * implicit backing file.  But it's easier this way, and
             * just specifying some of the backing BDS's options is
             * only possible with -drive anyway (otherwise the QAPI
             * schema forces the user to specify everything). */
            implicit_backing = !strcmp(bs->auto_backing_file, bs->backing_file);
        }

        backing_filename = bdrv_get_full_backing_filename(bs, &local_err);
        if (local_err) {
            ret = -EINVAL;
            error_propagate(errp, local_err);
            qobject_unref(options);
            goto free_exit;
        }
    }

    if (!bs->drv || !bs->drv->supports_backing) {
        ret = -EINVAL;
        error_setg(errp, "Driver doesn't support backing files");
        qobject_unref(options);
        goto free_exit;
    }

    /* Default the driver to the recorded backing format, unless overridden */
    if (!reference &&
        bs->backing_format[0] != '\0' && !qdict_haskey(options, "driver")) {
        qdict_put_str(options, "driver", bs->backing_format);
    }

    /*
     * NOTE(review): options is not unreferenced here after this call on
     * either path; presumably bdrv_open_inherit() takes ownership - confirm.
     */
    backing_hd = bdrv_open_inherit(backing_filename, reference, options, 0, bs,
                                   &child_of_bds, bdrv_backing_role(bs), errp);
    if (!backing_hd) {
        bs->open_flags |= BDRV_O_NO_BACKING;
        error_prepend(errp, "Could not open backing file: ");
        ret = -EINVAL;
        goto free_exit;
    }

    if (implicit_backing) {
        bdrv_refresh_filename(backing_hd);
        pstrcpy(bs->auto_backing_file, sizeof(bs->auto_backing_file),
                backing_hd->filename);
    }

    /* Hook up the backing file link; drop our reference, bs owns the
     * backing_hd reference now */
    backing_hd_ctx = bdrv_get_aio_context(backing_hd);
    aio_context_acquire(backing_hd_ctx);
    ret = bdrv_set_backing_hd(bs, backing_hd, errp);
    bdrv_unref(backing_hd);
    aio_context_release(backing_hd_ctx);

    if (ret < 0) {
        goto free_exit;
    }

    qdict_del(parent_options, bdref_key);

free_exit:
    g_free(backing_filename);
    qobject_unref(tmp_parent_options);
    return ret;
}
3632
3633static BlockDriverState *
3634bdrv_open_child_bs(const char *filename, QDict *options, const char *bdref_key,
3635                   BlockDriverState *parent, const BdrvChildClass *child_class,
3636                   BdrvChildRole child_role, bool allow_none, Error **errp)
3637{
3638    BlockDriverState *bs = NULL;
3639    QDict *image_options;
3640    char *bdref_key_dot;
3641    const char *reference;
3642
3643    assert(child_class != NULL);
3644
3645    bdref_key_dot = g_strdup_printf("%s.", bdref_key);
3646    qdict_extract_subqdict(options, &image_options, bdref_key_dot);
3647    g_free(bdref_key_dot);
3648
3649    /*
3650     * Caution: while qdict_get_try_str() is fine, getting non-string
3651     * types would require more care.  When @options come from
3652     * -blockdev or blockdev_add, its members are typed according to
3653     * the QAPI schema, but when they come from -drive, they're all
3654     * QString.
3655     */
3656    reference = qdict_get_try_str(options, bdref_key);
3657    if (!filename && !reference && !qdict_size(image_options)) {
3658        if (!allow_none) {
3659            error_setg(errp, "A block device must be specified for \"%s\"",
3660                       bdref_key);
3661        }
3662        qobject_unref(image_options);
3663        goto done;
3664    }
3665
3666    bs = bdrv_open_inherit(filename, reference, image_options, 0,
3667                           parent, child_class, child_role, errp);
3668    if (!bs) {
3669        goto done;
3670    }
3671
3672done:
3673    qdict_del(options, bdref_key);
3674    return bs;
3675}
3676
3677/*
3678 * Opens a disk image whose options are given as BlockdevRef in another block
3679 * device's options.
3680 *
3681 * If allow_none is true, no image will be opened if filename is false and no
3682 * BlockdevRef is given. NULL will be returned, but errp remains unset.
3683 *
3684 * bdrev_key specifies the key for the image's BlockdevRef in the options QDict.
3685 * That QDict has to be flattened; therefore, if the BlockdevRef is a QDict
3686 * itself, all options starting with "${bdref_key}." are considered part of the
3687 * BlockdevRef.
3688 *
3689 * The BlockdevRef will be removed from the options QDict.
3690 *
3691 * The caller must hold the lock of the main AioContext and no other AioContext.
3692 * @parent can move to a different AioContext in this function. Callers must
3693 * make sure that their AioContext locking is still correct after this.
3694 */
3695BdrvChild *bdrv_open_child(const char *filename,
3696                           QDict *options, const char *bdref_key,
3697                           BlockDriverState *parent,
3698                           const BdrvChildClass *child_class,
3699                           BdrvChildRole child_role,
3700                           bool allow_none, Error **errp)
3701{
3702    BlockDriverState *bs;
3703    BdrvChild *child;
3704    AioContext *ctx;
3705
3706    GLOBAL_STATE_CODE();
3707
3708    bs = bdrv_open_child_bs(filename, options, bdref_key, parent, child_class,
3709                            child_role, allow_none, errp);
3710    if (bs == NULL) {
3711        return NULL;
3712    }
3713
3714    ctx = bdrv_get_aio_context(bs);
3715    aio_context_acquire(ctx);
3716    child = bdrv_attach_child(parent, bs, bdref_key, child_class, child_role,
3717                              errp);
3718    aio_context_release(ctx);
3719
3720    return child;
3721}
3722
3723/*
3724 * Wrapper on bdrv_open_child() for most popular case: open primary child of bs.
3725 *
3726 * The caller must hold the lock of the main AioContext and no other AioContext.
3727 * @parent can move to a different AioContext in this function. Callers must
3728 * make sure that their AioContext locking is still correct after this.
3729 */
3730int bdrv_open_file_child(const char *filename,
3731                         QDict *options, const char *bdref_key,
3732                         BlockDriverState *parent, Error **errp)
3733{
3734    BdrvChildRole role;
3735
3736    /* commit_top and mirror_top don't use this function */
3737    assert(!parent->drv->filtered_child_is_backing);
3738    role = parent->drv->is_filter ?
3739        (BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY) : BDRV_CHILD_IMAGE;
3740
3741    if (!bdrv_open_child(filename, options, bdref_key, parent,
3742                         &child_of_bds, role, false, errp))
3743    {
3744        return -EINVAL;
3745    }
3746
3747    return 0;
3748}
3749
3750/*
3751 * TODO Future callers may need to specify parent/child_class in order for
3752 * option inheritance to work. Existing callers use it for the root node.
3753 */
3754BlockDriverState *bdrv_open_blockdev_ref(BlockdevRef *ref, Error **errp)
3755{
3756    BlockDriverState *bs = NULL;
3757    QObject *obj = NULL;
3758    QDict *qdict = NULL;
3759    const char *reference = NULL;
3760    Visitor *v = NULL;
3761
3762    GLOBAL_STATE_CODE();
3763
3764    if (ref->type == QTYPE_QSTRING) {
3765        reference = ref->u.reference;
3766    } else {
3767        BlockdevOptions *options = &ref->u.definition;
3768        assert(ref->type == QTYPE_QDICT);
3769
3770        v = qobject_output_visitor_new(&obj);
3771        visit_type_BlockdevOptions(v, NULL, &options, &error_abort);
3772        visit_complete(v, &obj);
3773
3774        qdict = qobject_to(QDict, obj);
3775        qdict_flatten(qdict);
3776
3777        /* bdrv_open_inherit() defaults to the values in bdrv_flags (for
3778         * compatibility with other callers) rather than what we want as the
3779         * real defaults. Apply the defaults here instead. */
3780        qdict_set_default_str(qdict, BDRV_OPT_CACHE_DIRECT, "off");
3781        qdict_set_default_str(qdict, BDRV_OPT_CACHE_NO_FLUSH, "off");
3782        qdict_set_default_str(qdict, BDRV_OPT_READ_ONLY, "off");
3783        qdict_set_default_str(qdict, BDRV_OPT_AUTO_READ_ONLY, "off");
3784
3785    }
3786
3787    bs = bdrv_open_inherit(NULL, reference, qdict, 0, NULL, NULL, 0, errp);
3788    obj = NULL;
3789    qobject_unref(obj);
3790    visit_free(v);
3791    return bs;
3792}
3793
3794static BlockDriverState *bdrv_append_temp_snapshot(BlockDriverState *bs,
3795                                                   int flags,
3796                                                   QDict *snapshot_options,
3797                                                   Error **errp)
3798{
3799    g_autofree char *tmp_filename = NULL;
3800    int64_t total_size;
3801    QemuOpts *opts = NULL;
3802    BlockDriverState *bs_snapshot = NULL;
3803    AioContext *ctx = bdrv_get_aio_context(bs);
3804    int ret;
3805
3806    GLOBAL_STATE_CODE();
3807
3808    /* if snapshot, we create a temporary backing file and open it
3809       instead of opening 'filename' directly */
3810
3811    /* Get the required size from the image */
3812    aio_context_acquire(ctx);
3813    total_size = bdrv_getlength(bs);
3814    aio_context_release(ctx);
3815
3816    if (total_size < 0) {
3817        error_setg_errno(errp, -total_size, "Could not get image size");
3818        goto out;
3819    }
3820
3821    /* Create the temporary image */
3822    tmp_filename = create_tmp_file(errp);
3823    if (!tmp_filename) {
3824        goto out;
3825    }
3826
3827    opts = qemu_opts_create(bdrv_qcow2.create_opts, NULL, 0,
3828                            &error_abort);
3829    qemu_opt_set_number(opts, BLOCK_OPT_SIZE, total_size, &error_abort);
3830    ret = bdrv_create(&bdrv_qcow2, tmp_filename, opts, errp);
3831    qemu_opts_del(opts);
3832    if (ret < 0) {
3833        error_prepend(errp, "Could not create temporary overlay '%s': ",
3834                      tmp_filename);
3835        goto out;
3836    }
3837
3838    /* Prepare options QDict for the temporary file */
3839    qdict_put_str(snapshot_options, "file.driver", "file");
3840    qdict_put_str(snapshot_options, "file.filename", tmp_filename);
3841    qdict_put_str(snapshot_options, "driver", "qcow2");
3842
3843    bs_snapshot = bdrv_open(NULL, NULL, snapshot_options, flags, errp);
3844    snapshot_options = NULL;
3845    if (!bs_snapshot) {
3846        goto out;
3847    }
3848
3849    aio_context_acquire(ctx);
3850    ret = bdrv_append(bs_snapshot, bs, errp);
3851    aio_context_release(ctx);
3852
3853    if (ret < 0) {
3854        bs_snapshot = NULL;
3855        goto out;
3856    }
3857
3858out:
3859    qobject_unref(snapshot_options);
3860    return bs_snapshot;
3861}
3862
3863/*
3864 * Opens a disk image (raw, qcow2, vmdk, ...)
3865 *
3866 * options is a QDict of options to pass to the block drivers, or NULL for an
3867 * empty set of options. The reference to the QDict belongs to the block layer
3868 * after the call (even on failure), so if the caller intends to reuse the
3869 * dictionary, it needs to use qobject_ref() before calling bdrv_open.
3870 *
3871 * If *pbs is NULL, a new BDS will be created with a pointer to it stored there.
3872 * If it is not NULL, the referenced BDS will be reused.
3873 *
3874 * The reference parameter may be used to specify an existing block device which
3875 * should be opened. If specified, neither options nor a filename may be given,
3876 * nor can an existing BDS be reused (that is, *pbs has to be NULL).
3877 *
3878 * The caller must always hold the main AioContext lock.
3879 */
3880static BlockDriverState * no_coroutine_fn
3881bdrv_open_inherit(const char *filename, const char *reference, QDict *options,
3882                  int flags, BlockDriverState *parent,
3883                  const BdrvChildClass *child_class, BdrvChildRole child_role,
3884                  Error **errp)
3885{
3886    int ret;
3887    BlockBackend *file = NULL;
3888    BlockDriverState *bs;
3889    BlockDriver *drv = NULL;
3890    BdrvChild *child;
3891    const char *drvname;
3892    const char *backing;
3893    Error *local_err = NULL;
3894    QDict *snapshot_options = NULL;
3895    int snapshot_flags = 0;
3896    AioContext *ctx = qemu_get_aio_context();
3897
3898    assert(!child_class || !flags);
3899    assert(!child_class == !parent);
3900    GLOBAL_STATE_CODE();
3901    assert(!qemu_in_coroutine());
3902
3903    if (reference) {
3904        bool options_non_empty = options ? qdict_size(options) : false;
3905        qobject_unref(options);
3906
3907        if (filename || options_non_empty) {
3908            error_setg(errp, "Cannot reference an existing block device with "
3909                       "additional options or a new filename");
3910            return NULL;
3911        }
3912
3913        bs = bdrv_lookup_bs(reference, reference, errp);
3914        if (!bs) {
3915            return NULL;
3916        }
3917
3918        bdrv_ref(bs);
3919        return bs;
3920    }
3921
3922    bs = bdrv_new();
3923
3924    /* NULL means an empty set of options */
3925    if (options == NULL) {
3926        options = qdict_new();
3927    }
3928
3929    /* json: syntax counts as explicit options, as if in the QDict */
3930    parse_json_protocol(options, &filename, &local_err);
3931    if (local_err) {
3932        goto fail;
3933    }
3934
3935    bs->explicit_options = qdict_clone_shallow(options);
3936
3937    if (child_class) {
3938        bool parent_is_format;
3939
3940        if (parent->drv) {
3941            parent_is_format = parent->drv->is_format;
3942        } else {
3943            /*
3944             * parent->drv is not set yet because this node is opened for
3945             * (potential) format probing.  That means that @parent is going
3946             * to be a format node.
3947             */
3948            parent_is_format = true;
3949        }
3950
3951        bs->inherits_from = parent;
3952        child_class->inherit_options(child_role, parent_is_format,
3953                                     &flags, options,
3954                                     parent->open_flags, parent->options);
3955    }
3956
3957    ret = bdrv_fill_options(&options, filename, &flags, &local_err);
3958    if (ret < 0) {
3959        goto fail;
3960    }
3961
3962    /*
3963     * Set the BDRV_O_RDWR and BDRV_O_ALLOW_RDWR flags.
3964     * Caution: getting a boolean member of @options requires care.
3965     * When @options come from -blockdev or blockdev_add, members are
3966     * typed according to the QAPI schema, but when they come from
3967     * -drive, they're all QString.
3968     */
3969    if (g_strcmp0(qdict_get_try_str(options, BDRV_OPT_READ_ONLY), "on") &&
3970        !qdict_get_try_bool(options, BDRV_OPT_READ_ONLY, false)) {
3971        flags |= (BDRV_O_RDWR | BDRV_O_ALLOW_RDWR);
3972    } else {
3973        flags &= ~BDRV_O_RDWR;
3974    }
3975
3976    if (flags & BDRV_O_SNAPSHOT) {
3977        snapshot_options = qdict_new();
3978        bdrv_temp_snapshot_options(&snapshot_flags, snapshot_options,
3979                                   flags, options);
3980        /* Let bdrv_backing_options() override "read-only" */
3981        qdict_del(options, BDRV_OPT_READ_ONLY);
3982        bdrv_inherited_options(BDRV_CHILD_COW, true,
3983                               &flags, options, flags, options);
3984    }
3985
3986    bs->open_flags = flags;
3987    bs->options = options;
3988    options = qdict_clone_shallow(options);
3989
3990    /* Find the right image format driver */
3991    /* See cautionary note on accessing @options above */
3992    drvname = qdict_get_try_str(options, "driver");
3993    if (drvname) {
3994        drv = bdrv_find_format(drvname);
3995        if (!drv) {
3996            error_setg(errp, "Unknown driver: '%s'", drvname);
3997            goto fail;
3998        }
3999    }
4000
4001    assert(drvname || !(flags & BDRV_O_PROTOCOL));
4002
4003    /* See cautionary note on accessing @options above */
4004    backing = qdict_get_try_str(options, "backing");
4005    if (qobject_to(QNull, qdict_get(options, "backing")) != NULL ||
4006        (backing && *backing == '\0'))
4007    {
4008        if (backing) {
4009            warn_report("Use of \"backing\": \"\" is deprecated; "
4010                        "use \"backing\": null instead");
4011        }
4012        flags |= BDRV_O_NO_BACKING;
4013        qdict_del(bs->explicit_options, "backing");
4014        qdict_del(bs->options, "backing");
4015        qdict_del(options, "backing");
4016    }
4017
4018    /* Open image file without format layer. This BlockBackend is only used for
4019     * probing, the block drivers will do their own bdrv_open_child() for the
4020     * same BDS, which is why we put the node name back into options. */
4021    if ((flags & BDRV_O_PROTOCOL) == 0) {
4022        BlockDriverState *file_bs;
4023
4024        file_bs = bdrv_open_child_bs(filename, options, "file", bs,
4025                                     &child_of_bds, BDRV_CHILD_IMAGE,
4026                                     true, &local_err);
4027        if (local_err) {
4028            goto fail;
4029        }
4030        if (file_bs != NULL) {
4031            /* Not requesting BLK_PERM_CONSISTENT_READ because we're only
4032             * looking at the header to guess the image format. This works even
4033             * in cases where a guest would not see a consistent state. */
4034            ctx = bdrv_get_aio_context(file_bs);
4035            aio_context_acquire(ctx);
4036            file = blk_new(ctx, 0, BLK_PERM_ALL);
4037            blk_insert_bs(file, file_bs, &local_err);
4038            bdrv_unref(file_bs);
4039            aio_context_release(ctx);
4040
4041            if (local_err) {
4042                goto fail;
4043            }
4044
4045            qdict_put_str(options, "file", bdrv_get_node_name(file_bs));
4046        }
4047    }
4048
4049    /* Image format probing */
4050    bs->probed = !drv;
4051    if (!drv && file) {
4052        ret = find_image_format(file, filename, &drv, &local_err);
4053        if (ret < 0) {
4054            goto fail;
4055        }
4056        /*
4057         * This option update would logically belong in bdrv_fill_options(),
4058         * but we first need to open bs->file for the probing to work, while
4059         * opening bs->file already requires the (mostly) final set of options
4060         * so that cache mode etc. can be inherited.
4061         *
4062         * Adding the driver later is somewhat ugly, but it's not an option
4063         * that would ever be inherited, so it's correct. We just need to make
4064         * sure to update both bs->options (which has the full effective
4065         * options for bs) and options (which has file.* already removed).
4066         */
4067        qdict_put_str(bs->options, "driver", drv->format_name);
4068        qdict_put_str(options, "driver", drv->format_name);
4069    } else if (!drv) {
4070        error_setg(errp, "Must specify either driver or file");
4071        goto fail;
4072    }
4073
4074    /* BDRV_O_PROTOCOL must be set iff a protocol BDS is about to be created */
4075    assert(!!(flags & BDRV_O_PROTOCOL) == !!drv->bdrv_file_open);
4076    /* file must be NULL if a protocol BDS is about to be created
4077     * (the inverse results in an error message from bdrv_open_common()) */
4078    assert(!(flags & BDRV_O_PROTOCOL) || !file);
4079
4080    /* Open the image */
4081    ret = bdrv_open_common(bs, file, options, &local_err);
4082    if (ret < 0) {
4083        goto fail;
4084    }
4085
4086    /* The AioContext could have changed during bdrv_open_common() */
4087    ctx = bdrv_get_aio_context(bs);
4088
4089    if (file) {
4090        aio_context_acquire(ctx);
4091        blk_unref(file);
4092        aio_context_release(ctx);
4093        file = NULL;
4094    }
4095
4096    /* If there is a backing file, use it */
4097    if ((flags & BDRV_O_NO_BACKING) == 0) {
4098        ret = bdrv_open_backing_file(bs, options, "backing", &local_err);
4099        if (ret < 0) {
4100            goto close_and_fail;
4101        }
4102    }
4103
4104    /* Remove all children options and references
4105     * from bs->options and bs->explicit_options */
4106    QLIST_FOREACH(child, &bs->children, next) {
4107        char *child_key_dot;
4108        child_key_dot = g_strdup_printf("%s.", child->name);
4109        qdict_extract_subqdict(bs->explicit_options, NULL, child_key_dot);
4110        qdict_extract_subqdict(bs->options, NULL, child_key_dot);
4111        qdict_del(bs->explicit_options, child->name);
4112        qdict_del(bs->options, child->name);
4113        g_free(child_key_dot);
4114    }
4115
4116    /* Check if any unknown options were used */
4117    if (qdict_size(options) != 0) {
4118        const QDictEntry *entry = qdict_first(options);
4119        if (flags & BDRV_O_PROTOCOL) {
4120            error_setg(errp, "Block protocol '%s' doesn't support the option "
4121                       "'%s'", drv->format_name, entry->key);
4122        } else {
4123            error_setg(errp,
4124                       "Block format '%s' does not support the option '%s'",
4125                       drv->format_name, entry->key);
4126        }
4127
4128        goto close_and_fail;
4129    }
4130
4131    bdrv_parent_cb_change_media(bs, true);
4132
4133    qobject_unref(options);
4134    options = NULL;
4135
4136    /* For snapshot=on, create a temporary qcow2 overlay. bs points to the
4137     * temporary snapshot afterwards. */
4138    if (snapshot_flags) {
4139        BlockDriverState *snapshot_bs;
4140        snapshot_bs = bdrv_append_temp_snapshot(bs, snapshot_flags,
4141                                                snapshot_options, &local_err);
4142        snapshot_options = NULL;
4143        if (local_err) {
4144            goto close_and_fail;
4145        }
4146        /* We are not going to return bs but the overlay on top of it
4147         * (snapshot_bs); thus, we have to drop the strong reference to bs
4148         * (which we obtained by calling bdrv_new()). bs will not be deleted,
4149         * though, because the overlay still has a reference to it. */
4150        aio_context_acquire(ctx);
4151        bdrv_unref(bs);
4152        aio_context_release(ctx);
4153        bs = snapshot_bs;
4154    }
4155
4156    return bs;
4157
4158fail:
4159    aio_context_acquire(ctx);
4160    blk_unref(file);
4161    qobject_unref(snapshot_options);
4162    qobject_unref(bs->explicit_options);
4163    qobject_unref(bs->options);
4164    qobject_unref(options);
4165    bs->options = NULL;
4166    bs->explicit_options = NULL;
4167    bdrv_unref(bs);
4168    aio_context_release(ctx);
4169    error_propagate(errp, local_err);
4170    return NULL;
4171
4172close_and_fail:
4173    aio_context_acquire(ctx);
4174    bdrv_unref(bs);
4175    aio_context_release(ctx);
4176    qobject_unref(snapshot_options);
4177    qobject_unref(options);
4178    error_propagate(errp, local_err);
4179    return NULL;
4180}
4181
4182/* The caller must always hold the main AioContext lock. */
4183BlockDriverState *bdrv_open(const char *filename, const char *reference,
4184                            QDict *options, int flags, Error **errp)
4185{
4186    GLOBAL_STATE_CODE();
4187
4188    return bdrv_open_inherit(filename, reference, options, flags, NULL,
4189                             NULL, 0, errp);
4190}
4191
/*
 * Return true if the NULL-terminated @list contains @str.
 * A NULL @str or a NULL @list never matches.
 */
static bool is_str_in_list(const char *str, const char *const *list)
{
    const char *const *item;

    if (!str || !list) {
        return false;
    }

    for (item = list; *item; item++) {
        if (strcmp(str, *item) == 0) {
            return true;
        }
    }

    return false;
}
4205
4206/*
4207 * Check that every option set in @bs->options is also set in
4208 * @new_opts.
4209 *
4210 * Options listed in the common_options list and in
4211 * @bs->drv->mutable_opts are skipped.
4212 *
4213 * Return 0 on success, otherwise return -EINVAL and set @errp.
4214 */
4215static int bdrv_reset_options_allowed(BlockDriverState *bs,
4216                                      const QDict *new_opts, Error **errp)
4217{
4218    const QDictEntry *e;
4219    /* These options are common to all block drivers and are handled
4220     * in bdrv_reopen_prepare() so they can be left out of @new_opts */
4221    const char *const common_options[] = {
4222        "node-name", "discard", "cache.direct", "cache.no-flush",
4223        "read-only", "auto-read-only", "detect-zeroes", NULL
4224    };
4225
4226    for (e = qdict_first(bs->options); e; e = qdict_next(bs->options, e)) {
4227        if (!qdict_haskey(new_opts, e->key) &&
4228            !is_str_in_list(e->key, common_options) &&
4229            !is_str_in_list(e->key, bs->drv->mutable_opts)) {
4230            error_setg(errp, "Option '%s' cannot be reset "
4231                       "to its default value", e->key);
4232            return -EINVAL;
4233        }
4234    }
4235
4236    return 0;
4237}
4238
4239/*
4240 * Returns true if @child can be reached recursively from @bs
4241 */
4242static bool bdrv_recurse_has_child(BlockDriverState *bs,
4243                                   BlockDriverState *child)
4244{
4245    BdrvChild *c;
4246
4247    if (bs == child) {
4248        return true;
4249    }
4250
4251    QLIST_FOREACH(c, &bs->children, next) {
4252        if (bdrv_recurse_has_child(c->bs, child)) {
4253            return true;
4254        }
4255    }
4256
4257    return false;
4258}
4259
4260/*
4261 * Adds a BlockDriverState to a simple queue for an atomic, transactional
4262 * reopen of multiple devices.
4263 *
4264 * bs_queue can either be an existing BlockReopenQueue that has had QTAILQ_INIT
4265 * already performed, or alternatively may be NULL a new BlockReopenQueue will
4266 * be created and initialized. This newly created BlockReopenQueue should be
4267 * passed back in for subsequent calls that are intended to be of the same
4268 * atomic 'set'.
4269 *
4270 * bs is the BlockDriverState to add to the reopen queue.
4271 *
4272 * options contains the changed options for the associated bs
4273 * (the BlockReopenQueue takes ownership)
4274 *
4275 * flags contains the open flags for the associated bs
4276 *
4277 * returns a pointer to bs_queue, which is either the newly allocated
4278 * bs_queue, or the existing bs_queue being used.
4279 *
4280 * bs is drained here and undrained by bdrv_reopen_queue_free().
4281 *
4282 * To be called with bs->aio_context locked.
4283 */
4284static BlockReopenQueue *bdrv_reopen_queue_child(BlockReopenQueue *bs_queue,
4285                                                 BlockDriverState *bs,
4286                                                 QDict *options,
4287                                                 const BdrvChildClass *klass,
4288                                                 BdrvChildRole role,
4289                                                 bool parent_is_format,
4290                                                 QDict *parent_options,
4291                                                 int parent_flags,
4292                                                 bool keep_old_opts)
4293{
4294    assert(bs != NULL);
4295
4296    BlockReopenQueueEntry *bs_entry;
4297    BdrvChild *child;
4298    QDict *old_options, *explicit_options, *options_copy;
4299    int flags;
4300    QemuOpts *opts;
4301
4302    GLOBAL_STATE_CODE();
4303
4304    bdrv_drained_begin(bs);
4305
4306    if (bs_queue == NULL) {
4307        bs_queue = g_new0(BlockReopenQueue, 1);
4308        QTAILQ_INIT(bs_queue);
4309    }
4310
4311    if (!options) {
4312        options = qdict_new();
4313    }
4314
4315    /* Check if this BlockDriverState is already in the queue */
4316    QTAILQ_FOREACH(bs_entry, bs_queue, entry) {
4317        if (bs == bs_entry->state.bs) {
4318            break;
4319        }
4320    }
4321
4322    /*
4323     * Precedence of options:
4324     * 1. Explicitly passed in options (highest)
4325     * 2. Retained from explicitly set options of bs
4326     * 3. Inherited from parent node
4327     * 4. Retained from effective options of bs
4328     */
4329
4330    /* Old explicitly set values (don't overwrite by inherited value) */
4331    if (bs_entry || keep_old_opts) {
4332        old_options = qdict_clone_shallow(bs_entry ?
4333                                          bs_entry->state.explicit_options :
4334                                          bs->explicit_options);
4335        bdrv_join_options(bs, options, old_options);
4336        qobject_unref(old_options);
4337    }
4338
4339    explicit_options = qdict_clone_shallow(options);
4340
4341    /* Inherit from parent node */
4342    if (parent_options) {
4343        flags = 0;
4344        klass->inherit_options(role, parent_is_format, &flags, options,
4345                               parent_flags, parent_options);
4346    } else {
4347        flags = bdrv_get_flags(bs);
4348    }
4349
4350    if (keep_old_opts) {
4351        /* Old values are used for options that aren't set yet */
4352        old_options = qdict_clone_shallow(bs->options);
4353        bdrv_join_options(bs, options, old_options);
4354        qobject_unref(old_options);
4355    }
4356
4357    /* We have the final set of options so let's update the flags */
4358    options_copy = qdict_clone_shallow(options);
4359    opts = qemu_opts_create(&bdrv_runtime_opts, NULL, 0, &error_abort);
4360    qemu_opts_absorb_qdict(opts, options_copy, NULL);
4361    update_flags_from_options(&flags, opts);
4362    qemu_opts_del(opts);
4363    qobject_unref(options_copy);
4364
4365    /* bdrv_open_inherit() sets and clears some additional flags internally */
4366    flags &= ~BDRV_O_PROTOCOL;
4367    if (flags & BDRV_O_RDWR) {
4368        flags |= BDRV_O_ALLOW_RDWR;
4369    }
4370
4371    if (!bs_entry) {
4372        bs_entry = g_new0(BlockReopenQueueEntry, 1);
4373        QTAILQ_INSERT_TAIL(bs_queue, bs_entry, entry);
4374    } else {
4375        qobject_unref(bs_entry->state.options);
4376        qobject_unref(bs_entry->state.explicit_options);
4377    }
4378
4379    bs_entry->state.bs = bs;
4380    bs_entry->state.options = options;
4381    bs_entry->state.explicit_options = explicit_options;
4382    bs_entry->state.flags = flags;
4383
4384    /*
4385     * If keep_old_opts is false then it means that unspecified
4386     * options must be reset to their original value. We don't allow
4387     * resetting 'backing' but we need to know if the option is
4388     * missing in order to decide if we have to return an error.
4389     */
4390    if (!keep_old_opts) {
4391        bs_entry->state.backing_missing =
4392            !qdict_haskey(options, "backing") &&
4393            !qdict_haskey(options, "backing.driver");
4394    }
4395
4396    QLIST_FOREACH(child, &bs->children, next) {
4397        QDict *new_child_options = NULL;
4398        bool child_keep_old = keep_old_opts;
4399
4400        /* reopen can only change the options of block devices that were
4401         * implicitly created and inherited options. For other (referenced)
4402         * block devices, a syntax like "backing.foo" results in an error. */
4403        if (child->bs->inherits_from != bs) {
4404            continue;
4405        }
4406
4407        /* Check if the options contain a child reference */
4408        if (qdict_haskey(options, child->name)) {
4409            const char *childref = qdict_get_try_str(options, child->name);
4410            /*
4411             * The current child must not be reopened if the child
4412             * reference is null or points to a different node.
4413             */
4414            if (g_strcmp0(childref, child->bs->node_name)) {
4415                continue;
4416            }
4417            /*
4418             * If the child reference points to the current child then
4419             * reopen it with its existing set of options (note that
4420             * it can still inherit new options from the parent).
4421             */
4422            child_keep_old = true;
4423        } else {
4424            /* Extract child options ("child-name.*") */
4425            char *child_key_dot = g_strdup_printf("%s.", child->name);
4426            qdict_extract_subqdict(explicit_options, NULL, child_key_dot);
4427            qdict_extract_subqdict(options, &new_child_options, child_key_dot);
4428            g_free(child_key_dot);
4429        }
4430
4431        bdrv_reopen_queue_child(bs_queue, child->bs, new_child_options,
4432                                child->klass, child->role, bs->drv->is_format,
4433                                options, flags, child_keep_old);
4434    }
4435
4436    return bs_queue;
4437}
4438
4439/* To be called with bs->aio_context locked */
4440BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
4441                                    BlockDriverState *bs,
4442                                    QDict *options, bool keep_old_opts)
4443{
4444    GLOBAL_STATE_CODE();
4445
4446    return bdrv_reopen_queue_child(bs_queue, bs, options, NULL, 0, false,
4447                                   NULL, 0, keep_old_opts);
4448}
4449
4450void bdrv_reopen_queue_free(BlockReopenQueue *bs_queue)
4451{
4452    GLOBAL_STATE_CODE();
4453    if (bs_queue) {
4454        BlockReopenQueueEntry *bs_entry, *next;
4455        QTAILQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
4456            AioContext *ctx = bdrv_get_aio_context(bs_entry->state.bs);
4457
4458            aio_context_acquire(ctx);
4459            bdrv_drained_end(bs_entry->state.bs);
4460            aio_context_release(ctx);
4461
4462            qobject_unref(bs_entry->state.explicit_options);
4463            qobject_unref(bs_entry->state.options);
4464            g_free(bs_entry);
4465        }
4466        g_free(bs_queue);
4467    }
4468}
4469
4470/*
4471 * Reopen multiple BlockDriverStates atomically & transactionally.
4472 *
4473 * The queue passed in (bs_queue) must have been built up previous
4474 * via bdrv_reopen_queue().
4475 *
4476 * Reopens all BDS specified in the queue, with the appropriate
4477 * flags.  All devices are prepared for reopen, and failure of any
4478 * device will cause all device changes to be abandoned, and intermediate
4479 * data cleaned up.
4480 *
4481 * If all devices prepare successfully, then the changes are committed
4482 * to all devices.
4483 *
4484 * All affected nodes must be drained between bdrv_reopen_queue() and
4485 * bdrv_reopen_multiple().
4486 *
4487 * To be called from the main thread, with all other AioContexts unlocked.
4488 */
4489int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
4490{
4491    int ret = -1;
4492    BlockReopenQueueEntry *bs_entry, *next;
4493    AioContext *ctx;
4494    Transaction *tran = tran_new();
4495    g_autoptr(GSList) refresh_list = NULL;
4496
4497    assert(qemu_get_current_aio_context() == qemu_get_aio_context());
4498    assert(bs_queue != NULL);
4499    GLOBAL_STATE_CODE();
4500
4501    QTAILQ_FOREACH(bs_entry, bs_queue, entry) {
4502        ctx = bdrv_get_aio_context(bs_entry->state.bs);
4503        aio_context_acquire(ctx);
4504        ret = bdrv_flush(bs_entry->state.bs);
4505        aio_context_release(ctx);
4506        if (ret < 0) {
4507            error_setg_errno(errp, -ret, "Error flushing drive");
4508            goto abort;
4509        }
4510    }
4511
4512    QTAILQ_FOREACH(bs_entry, bs_queue, entry) {
4513        assert(bs_entry->state.bs->quiesce_counter > 0);
4514        ctx = bdrv_get_aio_context(bs_entry->state.bs);
4515        aio_context_acquire(ctx);
4516        ret = bdrv_reopen_prepare(&bs_entry->state, bs_queue, tran, errp);
4517        aio_context_release(ctx);
4518        if (ret < 0) {
4519            goto abort;
4520        }
4521        bs_entry->prepared = true;
4522    }
4523
4524    QTAILQ_FOREACH(bs_entry, bs_queue, entry) {
4525        BDRVReopenState *state = &bs_entry->state;
4526
4527        refresh_list = g_slist_prepend(refresh_list, state->bs);
4528        if (state->old_backing_bs) {
4529            refresh_list = g_slist_prepend(refresh_list, state->old_backing_bs);
4530        }
4531        if (state->old_file_bs) {
4532            refresh_list = g_slist_prepend(refresh_list, state->old_file_bs);
4533        }
4534    }
4535
4536    /*
4537     * Note that file-posix driver rely on permission update done during reopen
4538     * (even if no permission changed), because it wants "new" permissions for
4539     * reconfiguring the fd and that's why it does it in raw_check_perm(), not
4540     * in raw_reopen_prepare() which is called with "old" permissions.
4541     */
4542    ret = bdrv_list_refresh_perms(refresh_list, bs_queue, tran, errp);
4543    if (ret < 0) {
4544        goto abort;
4545    }
4546
4547    /*
4548     * If we reach this point, we have success and just need to apply the
4549     * changes.
4550     *
4551     * Reverse order is used to comfort qcow2 driver: on commit it need to write
4552     * IN_USE flag to the image, to mark bitmaps in the image as invalid. But
4553     * children are usually goes after parents in reopen-queue, so go from last
4554     * to first element.
4555     */
4556    QTAILQ_FOREACH_REVERSE(bs_entry, bs_queue, entry) {
4557        ctx = bdrv_get_aio_context(bs_entry->state.bs);
4558        aio_context_acquire(ctx);
4559        bdrv_reopen_commit(&bs_entry->state);
4560        aio_context_release(ctx);
4561    }
4562
4563    tran_commit(tran);
4564
4565    QTAILQ_FOREACH_REVERSE(bs_entry, bs_queue, entry) {
4566        BlockDriverState *bs = bs_entry->state.bs;
4567
4568        if (bs->drv->bdrv_reopen_commit_post) {
4569            ctx = bdrv_get_aio_context(bs);
4570            aio_context_acquire(ctx);
4571            bs->drv->bdrv_reopen_commit_post(&bs_entry->state);
4572            aio_context_release(ctx);
4573        }
4574    }
4575
4576    ret = 0;
4577    goto cleanup;
4578
4579abort:
4580    tran_abort(tran);
4581    QTAILQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
4582        if (bs_entry->prepared) {
4583            ctx = bdrv_get_aio_context(bs_entry->state.bs);
4584            aio_context_acquire(ctx);
4585            bdrv_reopen_abort(&bs_entry->state);
4586            aio_context_release(ctx);
4587        }
4588    }
4589
4590cleanup:
4591    bdrv_reopen_queue_free(bs_queue);
4592
4593    return ret;
4594}
4595
4596int bdrv_reopen(BlockDriverState *bs, QDict *opts, bool keep_old_opts,
4597                Error **errp)
4598{
4599    AioContext *ctx = bdrv_get_aio_context(bs);
4600    BlockReopenQueue *queue;
4601    int ret;
4602
4603    GLOBAL_STATE_CODE();
4604
4605    queue = bdrv_reopen_queue(NULL, bs, opts, keep_old_opts);
4606
4607    if (ctx != qemu_get_aio_context()) {
4608        aio_context_release(ctx);
4609    }
4610    ret = bdrv_reopen_multiple(queue, errp);
4611
4612    if (ctx != qemu_get_aio_context()) {
4613        aio_context_acquire(ctx);
4614    }
4615
4616    return ret;
4617}
4618
4619int bdrv_reopen_set_read_only(BlockDriverState *bs, bool read_only,
4620                              Error **errp)
4621{
4622    QDict *opts = qdict_new();
4623
4624    GLOBAL_STATE_CODE();
4625
4626    qdict_put_bool(opts, BDRV_OPT_READ_ONLY, read_only);
4627
4628    return bdrv_reopen(bs, opts, true, errp);
4629}
4630
4631/*
4632 * Take a BDRVReopenState and check if the value of 'backing' (or of
4633 * 'file', if @is_backing is false) in reopen_state->options is valid.
4634 *
4635 * If the option is missing from the QDict then return 0.
4636 *
4637 * If the option contains the node name of the current backing/file
4638 * child of reopen_state->bs then return 0.
4639 *
4640 * If it contains a different node name (or, for 'backing' only, is null)
4641 * then check whether the current child can be replaced with the new one.
4642 * If that's the case then the old child node is stored in
4643 * reopen_state->old_backing_bs or reopen_state->old_file_bs and the
4644 * child is changed via bdrv_set_file_or_backing_noperm() using @tran.
4645 *
4646 * Return 0 on success, otherwise return < 0 and set @errp.
4647 *
4648 * The caller must hold the AioContext lock of @reopen_state->bs.
4649 * @reopen_state->bs can move to a different AioContext in this function.
4650 * Callers must make sure that their AioContext locking is still correct after
4651 * this.
4652 */
4653static int bdrv_reopen_parse_file_or_backing(BDRVReopenState *reopen_state,
4654                                             bool is_backing, Transaction *tran,
4655                                             Error **errp)
4656{
4657    BlockDriverState *bs = reopen_state->bs;
4658    BlockDriverState *new_child_bs;
4659    BlockDriverState *old_child_bs = is_backing ? child_bs(bs->backing) :
4660                                                  child_bs(bs->file);
4661    const char *child_name = is_backing ? "backing" : "file";
4662    QObject *value;
4663    const char *str;
4664    AioContext *ctx, *old_ctx;
4665    int ret;
4666
4667    GLOBAL_STATE_CODE();
4668
    /* Option not given at all: nothing to check or change */
4669    value = qdict_get(reopen_state->options, child_name);
4670    if (value == NULL) {
4671        return 0;
4672    }
4673
4674    switch (qobject_type(value)) {
4675    case QTYPE_QNULL:
4676        assert(is_backing); /* The 'file' option does not allow a null value */
4677        new_child_bs = NULL;
4678        break;
4679    case QTYPE_QSTRING:
4680        str = qstring_get_str(qobject_to(QString, value));
4681        new_child_bs = bdrv_lookup_bs(NULL, str, errp);
4682        if (new_child_bs == NULL) {
4683            return -EINVAL;
4684        } else if (bdrv_recurse_has_child(new_child_bs, bs)) {
4685            error_setg(errp, "Making '%s' a %s child of '%s' would create a "
4686                       "cycle", str, child_name, bs->node_name);
4687            return -EINVAL;
4688        }
4689        break;
4690    default:
4691        /*
4692         * The options QDict has been flattened, so 'backing' and 'file'
4693         * do not allow any other data type here.
4694         */
4695        g_assert_not_reached();
4696    }
4697
    /* The option refers to the node that already is the child */
4698    if (old_child_bs == new_child_bs) {
4699        return 0;
4700    }
4701
4702    if (old_child_bs) {
        /*
         * The requested node is what bdrv_skip_implicit_filters() yields
         * for the current child: treat this as "unchanged" as well.
         */
4703        if (bdrv_skip_implicit_filters(old_child_bs) == new_child_bs) {
4704            return 0;
4705        }
4706
4707        if (old_child_bs->implicit) {
4708            error_setg(errp, "Cannot replace implicit %s child of %s",
4709                       child_name, bs->node_name);
4710            return -EPERM;
4711        }
4712    }
4713
4714    if (bs->drv->is_filter && !old_child_bs) {
4715        /*
4716         * Filters always have a file or a backing child, so we are trying to
4717         * change wrong child
4718         */
4719        error_setg(errp, "'%s' is a %s filter node that does not support a "
4720                   "%s child", bs->node_name, bs->drv->format_name, child_name);
4721        return -EINVAL;
4722    }
4723
    /* Remember the node that is being replaced in the reopen state */
4724    if (is_backing) {
4725        reopen_state->old_backing_bs = old_child_bs;
4726    } else {
4727        reopen_state->old_file_bs = old_child_bs;
4728    }
4729
    /*
     * Hold the lock of the new child's AioContext instead of the one of
     * @bs while the child is changed, and switch back afterwards.
     * NOTE(review): new_child_bs is NULL for a null 'backing'; presumably
     * bdrv_get_aio_context() copes with NULL - confirm.
     */
4730    old_ctx = bdrv_get_aio_context(bs);
4731    ctx = bdrv_get_aio_context(new_child_bs);
4732    if (old_ctx != ctx) {
4733        aio_context_release(old_ctx);
4734        aio_context_acquire(ctx);
4735    }
4736
4737    ret = bdrv_set_file_or_backing_noperm(bs, new_child_bs, is_backing,
4738                                          tran, errp);
4739
4740    if (old_ctx != ctx) {
4741        aio_context_release(ctx);
4742        aio_context_acquire(old_ctx);
4743    }
4744
4745    return ret;
4746}
4747
4748/*
4749 * Prepares a BlockDriverState for reopen. All changes are staged in the
4750 * 'opaque' field of the BDRVReopenState, which is used and allocated by
4751 * the block driver layer .bdrv_reopen_prepare()
4752 *
4753 * @reopen_state holds the node to reopen (->bs) and its new flags/options
4754 * @queue is the reopen queue, passed on to the driver's prepare handler
4755 * @change_child_tran is used for the 'backing'/'file' child changes
4756 *
4757 * Returns 0 on success, non-zero on error.  On error errp will be set
4758 * as well.
4759 *
4760 * If this fails after the driver's .bdrv_reopen_prepare() succeeded, the
4761 * driver's .bdrv_reopen_abort() is called here. It is the responsibility of
4762 * the caller to abort() or commit() any other BDS left in a prepare() state.
4763 *
4764 * The caller must hold the AioContext lock of @reopen_state->bs.
4765 */
4766static int bdrv_reopen_prepare(BDRVReopenState *reopen_state,
4767                               BlockReopenQueue *queue,
4768                               Transaction *change_child_tran, Error **errp)
4769{
4770    int ret = -1;
4771    int old_flags;
4772    Error *local_err = NULL;
4773    BlockDriver *drv;
4774    QemuOpts *opts;
4775    QDict *orig_reopen_opts;
4776    char *discard = NULL;
4777    bool read_only;
4778    bool drv_prepared = false;
4779
4780    assert(reopen_state != NULL);
4781    assert(reopen_state->bs->drv != NULL);
4782    GLOBAL_STATE_CODE();
4783    drv = reopen_state->bs->drv;
4784
4785    /* This function and each driver's bdrv_reopen_prepare() remove
4786     * entries from reopen_state->options as they are processed, so
4787     * we need to make a copy of the original QDict. */
4788    orig_reopen_opts = qdict_clone_shallow(reopen_state->options);
4789
4790    /* Process generic block layer options */
4791    opts = qemu_opts_create(&bdrv_runtime_opts, NULL, 0, &error_abort);
4792    if (!qemu_opts_absorb_qdict(opts, reopen_state->options, errp)) {
4793        ret = -EINVAL;
4794        goto error;
4795    }
4796
4797    /* This was already called in bdrv_reopen_queue_child() so the flags
4798     * are up-to-date. This time we simply want to remove the options from
4799     * QemuOpts in order to indicate that they have been processed. */
4800    old_flags = reopen_state->flags;
4801    update_flags_from_options(&reopen_state->flags, opts);
4802    assert(old_flags == reopen_state->flags);
4803
4804    discard = qemu_opt_get_del(opts, BDRV_OPT_DISCARD);
4805    if (discard != NULL) {
4806        if (bdrv_parse_discard_flags(discard, &reopen_state->flags) != 0) {
4807            error_setg(errp, "Invalid discard option");
4808            ret = -EINVAL;
4809            goto error;
4810        }
4811    }
4812
4813    reopen_state->detect_zeroes =
4814        bdrv_parse_detect_zeroes(opts, reopen_state->flags, &local_err);
4815    if (local_err) {
4816        error_propagate(errp, local_err);
4817        ret = -EINVAL;
4818        goto error;
4819    }
4820
4821    /* All other options (including node-name and driver) must be unchanged.
4822     * Put them back into the QDict, so that they are checked at the end
4823     * of this function. */
4824    qemu_opts_to_qdict(opts, reopen_state->options);
4825
4826    /* If we are to stay read-only, do not allow permission change
4827     * to r/w. Attempting to set to r/w may fail if either BDRV_O_ALLOW_RDWR is
4828     * not set, or if the BDS still has copy_on_read enabled */
4829    read_only = !(reopen_state->flags & BDRV_O_RDWR);
4830    ret = bdrv_can_set_read_only(reopen_state->bs, read_only, true, &local_err);
    /* Failure is detected through local_err; ret keeps the helper's result */
4831    if (local_err) {
4832        error_propagate(errp, local_err);
4833        goto error;
4834    }
4835
4836    if (drv->bdrv_reopen_prepare) {
4837        /*
4838         * If a driver-specific option is missing, it means that we
4839         * should reset it to its default value.
4840         * But not all options allow that, so we need to check it first.
4841         */
4842        ret = bdrv_reset_options_allowed(reopen_state->bs,
4843                                         reopen_state->options, errp);
4844        if (ret) {
4845            goto error;
4846        }
4847
4848        ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err);
4849        if (ret) {
            /* Make sure errp is set even if the driver gave no error */
4850            if (local_err != NULL) {
4851                error_propagate(errp, local_err);
4852            } else {
4853                bdrv_refresh_filename(reopen_state->bs);
4854                error_setg(errp, "failed while preparing to reopen image '%s'",
4855                           reopen_state->bs->filename);
4856            }
4857            goto error;
4858        }
4859    } else {
4860        /* It is currently mandatory to have a bdrv_reopen_prepare()
4861         * handler for each supported drv. */
4862        error_setg(errp, "Block format '%s' used by node '%s' "
4863                   "does not support reopening files", drv->format_name,
4864                   bdrv_get_device_or_node_name(reopen_state->bs));
4865        ret = -1;
4866        goto error;
4867    }
4868
    /* From here on, a failure also has to undo the driver's prepare step */
4869    drv_prepared = true;
4870
4871    /*
4872     * We must provide the 'backing' option if the BDS has a backing
4873     * file or if the image file has a backing file name as part of
4874     * its metadata. Otherwise the 'backing' option can be omitted.
4875     */
4876    if (drv->supports_backing && reopen_state->backing_missing &&
4877        (reopen_state->bs->backing || reopen_state->bs->backing_file[0])) {
4878        error_setg(errp, "backing is missing for '%s'",
4879                   reopen_state->bs->node_name);
4880        ret = -EINVAL;
4881        goto error;
4882    }
4883
4884    /*
4885     * Allow changing the 'backing' option. The new value can be
4886     * either a reference to an existing node (using its node name)
4887     * or NULL to simply detach the current backing file.
4888     */
4889    ret = bdrv_reopen_parse_file_or_backing(reopen_state, true,
4890                                            change_child_tran, errp);
4891    if (ret < 0) {
4892        goto error;
4893    }
4894    qdict_del(reopen_state->options, "backing");
4895
4896    /* Allow changing the 'file' option. In this case NULL is not allowed */
4897    ret = bdrv_reopen_parse_file_or_backing(reopen_state, false,
4898                                            change_child_tran, errp);
4899    if (ret < 0) {
4900        goto error;
4901    }
4902    qdict_del(reopen_state->options, "file");
4903
4904    /* Options that are not handled are only okay if they are unchanged
4905     * compared to the old state. It is expected that some options are only
4906     * used for the initial open, but not reopen (e.g. filename) */
4907    if (qdict_size(reopen_state->options)) {
4908        const QDictEntry *entry = qdict_first(reopen_state->options);
4909
4910        do {
4911            QObject *new = entry->value;
4912            QObject *old = qdict_get(reopen_state->bs->options, entry->key);
4913
4914            /* Allow child references (child_name=node_name) as long as they
4915             * point to the current child (i.e. everything stays the same). */
4916            if (qobject_type(new) == QTYPE_QSTRING) {
4917                BdrvChild *child;
4918                QLIST_FOREACH(child, &reopen_state->bs->children, next) {
4919                    if (!strcmp(child->name, entry->key)) {
4920                        break;
4921                    }
4922                }
4923
                /*
                 * 'continue' in a do/while loop jumps to the loop condition,
                 * so the next entry is still fetched via qdict_next().
                 */
4924                if (child) {
4925                    if (!strcmp(child->bs->node_name,
4926                                qstring_get_str(qobject_to(QString, new)))) {
4927                        continue; /* Found child with this name, skip option */
4928                    }
4929                }
4930            }
4931
4932            /*
4933             * TODO: When using -drive to specify blockdev options, all values
4934             * will be strings; however, when using -blockdev, blockdev-add or
4935             * filenames using the json:{} pseudo-protocol, they will be
4936             * correctly typed.
4937             * In contrast, reopening options are (currently) always strings
4938             * (because you can only specify them through qemu-io; all other
4939             * callers do not specify any options).
4940             * Therefore, when using anything other than -drive to create a BDS,
4941             * this cannot detect non-string options as unchanged, because
4942             * qobject_is_equal() always returns false for objects of different
4943             * type.  In the future, this should be remedied by correctly typing
4944             * all options.  For now, this is not too big of an issue because
4945             * the user can simply omit options which cannot be changed anyway,
4946             * so they will stay unchanged.
4947             */
4948            if (!qobject_is_equal(new, old)) {
4949                error_setg(errp, "Cannot change the option '%s'", entry->key);
4950                ret = -EINVAL;
4951                goto error;
4952            }
4953        } while ((entry = qdict_next(reopen_state->options, entry)));
4954    }
4955
4956    ret = 0;
4957
4958    /* Restore the original reopen_state->options QDict */
4959    qobject_unref(reopen_state->options);
4960    reopen_state->options = qobject_ref(orig_reopen_opts);
4961
    /* Common exit path, also reached on success (ret == 0) */
4962error:
4963    if (ret < 0 && drv_prepared) {
4964        /* drv->bdrv_reopen_prepare() has succeeded, so we need to
4965         * call drv->bdrv_reopen_abort() before signaling an error
4966         * (bdrv_reopen_multiple() will not call bdrv_reopen_abort()
4967         * when the respective bdrv_reopen_prepare() has failed) */
4968        if (drv->bdrv_reopen_abort) {
4969            drv->bdrv_reopen_abort(reopen_state);
4970        }
4971    }
4972    qemu_opts_del(opts);
4973    qobject_unref(orig_reopen_opts);
4974    g_free(discard);
4975    return ret;
4976}
4977
4978/*
4979 * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and
4980 * makes them final by swapping the staging BlockDriverState contents into
4981 * the active BlockDriverState contents.
4982 */
4983static void bdrv_reopen_commit(BDRVReopenState *reopen_state)
4984{
4985    BlockDriver *drv;
4986    BlockDriverState *bs;
4987    BdrvChild *child;
4988
4989    assert(reopen_state != NULL);
4990    bs = reopen_state->bs;
4991    drv = bs->drv;
4992    assert(drv != NULL);
4993    GLOBAL_STATE_CODE();
4994
4995    /* If there are any driver level actions to take */
4996    if (drv->bdrv_reopen_commit) {
4997        drv->bdrv_reopen_commit(reopen_state);
4998    }
4999
5000    /* set BDS specific flags now */
5001    qobject_unref(bs->explicit_options);
5002    qobject_unref(bs->options);
5003    qobject_ref(reopen_state->explicit_options);
5004    qobject_ref(reopen_state->options);
5005
5006    bs->explicit_options   = reopen_state->explicit_options;
5007    bs->options            = reopen_state->options;
5008    bs->open_flags         = reopen_state->flags;
5009    bs->detect_zeroes      = reopen_state->detect_zeroes;
5010
5011    /* Remove child references from bs->options and bs->explicit_options.
5012     * Child options were already removed in bdrv_reopen_queue_child() */
5013    QLIST_FOREACH(child, &bs->children, next) {
5014        qdict_del(bs->explicit_options, child->name);
5015        qdict_del(bs->options, child->name);
5016    }
5017    /* backing is probably removed, so it's not handled by previous loop */
5018    qdict_del(bs->explicit_options, "backing");
5019    qdict_del(bs->options, "backing");
5020
5021    bdrv_graph_rdlock_main_loop();
5022    bdrv_refresh_limits(bs, NULL, NULL);
5023    bdrv_graph_rdunlock_main_loop();
5024    bdrv_refresh_total_sectors(bs, bs->total_sectors);
5025}
5026
5027/*
5028 * Abort the reopen, and delete and free the staged changes in
5029 * reopen_state
5030 */
5031static void bdrv_reopen_abort(BDRVReopenState *reopen_state)
5032{
5033    BlockDriver *drv;
5034
5035    assert(reopen_state != NULL);
5036    drv = reopen_state->bs->drv;
5037    assert(drv != NULL);
5038    GLOBAL_STATE_CODE();
5039
5040    if (drv->bdrv_reopen_abort) {
5041        drv->bdrv_reopen_abort(reopen_state);
5042    }
5043}
5044
5045
5046static void bdrv_close(BlockDriverState *bs)
5047{
5048    BdrvAioNotifier *ban, *ban_next;
5049    BdrvChild *child, *next;
5050
5051    GLOBAL_STATE_CODE();
5052    assert(!bs->refcnt);
5053
5054    bdrv_drained_begin(bs); /* complete I/O */
5055    bdrv_flush(bs);
5056    bdrv_drain(bs); /* in case flush left pending I/O */
5057
5058    if (bs->drv) {
5059        if (bs->drv->bdrv_close) {
5060            /* Must unfreeze all children, so bdrv_unref_child() works */
5061            bs->drv->bdrv_close(bs);
5062        }
5063        bs->drv = NULL;
5064    }
5065
5066    QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
5067        bdrv_unref_child(bs, child);
5068    }
5069
5070    assert(!bs->backing);
5071    assert(!bs->file);
5072    g_free(bs->opaque);
5073    bs->opaque = NULL;
5074    qatomic_set(&bs->copy_on_read, 0);
5075    bs->backing_file[0] = '\0';
5076    bs->backing_format[0] = '\0';
5077    bs->total_sectors = 0;
5078    bs->encrypted = false;
5079    bs->sg = false;
5080    qobject_unref(bs->options);
5081    qobject_unref(bs->explicit_options);
5082    bs->options = NULL;
5083    bs->explicit_options = NULL;
5084    qobject_unref(bs->full_open_options);
5085    bs->full_open_options = NULL;
5086    g_free(bs->block_status_cache);
5087    bs->block_status_cache = NULL;
5088
5089    bdrv_release_named_dirty_bitmaps(bs);
5090    assert(QLIST_EMPTY(&bs->dirty_bitmaps));
5091
5092    QLIST_FOREACH_SAFE(ban, &bs->aio_notifiers, list, ban_next) {
5093        g_free(ban);
5094    }
5095    QLIST_INIT(&bs->aio_notifiers);
5096    bdrv_drained_end(bs);
5097
5098    /*
5099     * If we're still inside some bdrv_drain_all_begin()/end() sections, end
5100     * them now since this BDS won't exist anymore when bdrv_drain_all_end()
5101     * gets called.
5102     */
5103    if (bs->quiesce_counter) {
5104        bdrv_drain_all_end_quiesce(bs);
5105    }
5106}
5107
5108void bdrv_close_all(void)
5109{
5110    GLOBAL_STATE_CODE();
5111    assert(job_next(NULL) == NULL);
5112
5113    /* Drop references from requests still in flight, such as canceled block
5114     * jobs whose AIO context has not been polled yet */
5115    bdrv_drain_all();
5116
5117    blk_remove_all_bs();
5118    blockdev_close_all_bdrv_states();
5119
5120    assert(QTAILQ_EMPTY(&all_bdrv_states));
5121}
5122
5123static bool should_update_child(BdrvChild *c, BlockDriverState *to)
5124{
5125    GQueue *queue;
5126    GHashTable *found;
5127    bool ret;
5128
5129    if (c->klass->stay_at_node) {
5130        return false;
5131    }
5132
5133    /* If the child @c belongs to the BDS @to, replacing the current
5134     * c->bs by @to would mean to create a loop.
5135     *
5136     * Such a case occurs when appending a BDS to a backing chain.
5137     * For instance, imagine the following chain:
5138     *
5139     *   guest device -> node A -> further backing chain...
5140     *
5141     * Now we create a new BDS B which we want to put on top of this
5142     * chain, so we first attach A as its backing node:
5143     *
5144     *                   node B
5145     *                     |
5146     *                     v
5147     *   guest device -> node A -> further backing chain...
5148     *
5149     * Finally we want to replace A by B.  When doing that, we want to
5150     * replace all pointers to A by pointers to B -- except for the
5151     * pointer from B because (1) that would create a loop, and (2)
5152     * that pointer should simply stay intact:
5153     *
5154     *   guest device -> node B
5155     *                     |
5156     *                     v
5157     *                   node A -> further backing chain...
5158     *
5159     * In general, when replacing a node A (c->bs) by a node B (@to),
5160     * if A is a child of B, that means we cannot replace A by B there
5161     * because that would create a loop.  Silently detaching A from B
5162     * is also not really an option.  So overall just leaving A in
5163     * place there is the most sensible choice.
5164     *
5165     * We would also create a loop in any cases where @c is only
5166     * indirectly referenced by @to. Prevent this by returning false
5167     * if @c is found (by breadth-first search) anywhere in the whole
5168     * subtree of @to.
5169     */
5170
5171    ret = true;
5172    found = g_hash_table_new(NULL, NULL);
5173    g_hash_table_add(found, to);
5174    queue = g_queue_new();
5175    g_queue_push_tail(queue, to);
5176
5177    while (!g_queue_is_empty(queue)) {
5178        BlockDriverState *v = g_queue_pop_head(queue);
5179        BdrvChild *c2;
5180
5181        QLIST_FOREACH(c2, &v->children, next) {
5182            if (c2 == c) {
5183                ret = false;
5184                break;
5185            }
5186
5187            if (g_hash_table_contains(found, c2->bs)) {
5188                continue;
5189            }
5190
5191            g_queue_push_tail(queue, c2->bs);
5192            g_hash_table_add(found, c2->bs);
5193        }
5194    }
5195
5196    g_queue_free(queue);
5197    g_hash_table_destroy(found);
5198
5199    return ret;
5200}
5201
5202static void bdrv_remove_child_commit(void *opaque)
5203{
5204    GLOBAL_STATE_CODE();
5205    bdrv_child_free(opaque);
5206}
5207
5208static TransactionActionDrv bdrv_remove_child_drv = {
5209    .commit = bdrv_remove_child_commit,
5210};
5211
5212/* Function doesn't update permissions, caller is responsible for this. */
5213static void bdrv_remove_child(BdrvChild *child, Transaction *tran)
5214{
5215    if (!child) {
5216        return;
5217    }
5218
5219    if (child->bs) {
5220        BlockDriverState *bs = child->bs;
5221        bdrv_drained_begin(bs);
5222        bdrv_replace_child_tran(child, NULL, tran);
5223        bdrv_drained_end(bs);
5224    }
5225
5226    tran_add(tran, &bdrv_remove_child_drv, child);
5227}
5228
5229static void undrain_on_clean_cb(void *opaque)
5230{
5231    bdrv_drained_end(opaque);
5232}
5233
5234static TransactionActionDrv undrain_on_clean = {
5235    .clean = undrain_on_clean_cb,
5236};
5237
5238static int bdrv_replace_node_noperm(BlockDriverState *from,
5239                                    BlockDriverState *to,
5240                                    bool auto_skip, Transaction *tran,
5241                                    Error **errp)
5242{
5243    BdrvChild *c, *next;
5244
5245    GLOBAL_STATE_CODE();
5246
5247    bdrv_drained_begin(from);
5248    bdrv_drained_begin(to);
5249    tran_add(tran, &undrain_on_clean, from);
5250    tran_add(tran, &undrain_on_clean, to);
5251
5252    QLIST_FOREACH_SAFE(c, &from->parents, next_parent, next) {
5253        assert(c->bs == from);
5254        if (!should_update_child(c, to)) {
5255            if (auto_skip) {
5256                continue;
5257            }
5258            error_setg(errp, "Should not change '%s' link to '%s'",
5259                       c->name, from->node_name);
5260            return -EINVAL;
5261        }
5262        if (c->frozen) {
5263            error_setg(errp, "Cannot change '%s' link to '%s'",
5264                       c->name, from->node_name);
5265            return -EPERM;
5266        }
5267        bdrv_replace_child_tran(c, to, tran);
5268    }
5269
5270    return 0;
5271}
5272
5273/*
5274 * With auto_skip=true bdrv_replace_node_common skips updating from parents
5275 * if it creates a parent-child relation loop or if parent is block-job.
5276 *
5277 * With auto_skip=false the error is returned if from has a parent which should
5278 * not be updated.
5279 *
5280 * With @detach_subchain=true @to must be in a backing chain of @from. In this
5281 * case backing link of the cow-parent of @to is removed.
5282 */
5283static int bdrv_replace_node_common(BlockDriverState *from,
5284                                    BlockDriverState *to,
5285                                    bool auto_skip, bool detach_subchain,
5286                                    Error **errp)
5287{
5288    Transaction *tran = tran_new();
5289    g_autoptr(GSList) refresh_list = NULL;
5290    BlockDriverState *to_cow_parent = NULL;
5291    int ret;
5292
5293    GLOBAL_STATE_CODE();
5294
5295    if (detach_subchain) {
5296        assert(bdrv_chain_contains(from, to));
5297        assert(from != to);
5298        for (to_cow_parent = from;
5299             bdrv_filter_or_cow_bs(to_cow_parent) != to;
5300             to_cow_parent = bdrv_filter_or_cow_bs(to_cow_parent))
5301        {
5302            ;
5303        }
5304    }
5305
5306    /* Make sure that @from doesn't go away until we have successfully attached
5307     * all of its parents to @to. */
5308    bdrv_ref(from);
5309
5310    assert(qemu_get_current_aio_context() == qemu_get_aio_context());
5311    assert(bdrv_get_aio_context(from) == bdrv_get_aio_context(to));
5312    bdrv_drained_begin(from);
5313
5314    /*
5315     * Do the replacement without permission update.
5316     * Replacement may influence the permissions, we should calculate new
5317     * permissions based on new graph. If we fail, we'll roll-back the
5318     * replacement.
5319     */
5320    ret = bdrv_replace_node_noperm(from, to, auto_skip, tran, errp);
5321    if (ret < 0) {
5322        goto out;
5323    }
5324
5325    if (detach_subchain) {
5326        bdrv_remove_child(bdrv_filter_or_cow_child(to_cow_parent), tran);
5327    }
5328
5329    refresh_list = g_slist_prepend(refresh_list, to);
5330    refresh_list = g_slist_prepend(refresh_list, from);
5331
5332    ret = bdrv_list_refresh_perms(refresh_list, NULL, tran, errp);
5333    if (ret < 0) {
5334        goto out;
5335    }
5336
5337    ret = 0;
5338
5339out:
5340    tran_finalize(tran, ret);
5341
5342    bdrv_drained_end(from);
5343    bdrv_unref(from);
5344
5345    return ret;
5346}
5347
5348int bdrv_replace_node(BlockDriverState *from, BlockDriverState *to,
5349                      Error **errp)
5350{
5351    GLOBAL_STATE_CODE();
5352
5353    return bdrv_replace_node_common(from, to, true, false, errp);
5354}
5355
5356int bdrv_drop_filter(BlockDriverState *bs, Error **errp)
5357{
5358    GLOBAL_STATE_CODE();
5359
5360    return bdrv_replace_node_common(bs, bdrv_filter_or_cow_bs(bs), true, true,
5361                                    errp);
5362}
5363
5364/*
5365 * Add new bs contents at the top of an image chain while the chain is
5366 * live, while keeping required fields on the top layer.
5367 *
5368 * This will modify the BlockDriverState fields, and swap contents
5369 * between bs_new and bs_top. Both bs_new and bs_top are modified.
5370 *
5371 * bs_new must not be attached to a BlockBackend and must not have backing
5372 * child.
5373 *
5374 * This function does not create any image files.
5375 *
5376 * The caller must hold the AioContext lock for @bs_top.
5377 */
5378int bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top,
5379                Error **errp)
5380{
5381    int ret;
5382    BdrvChild *child;
5383    Transaction *tran = tran_new();
5384    AioContext *old_context, *new_context = NULL;
5385
5386    GLOBAL_STATE_CODE();
5387
5388    assert(!bs_new->backing);
5389
5390    old_context = bdrv_get_aio_context(bs_top);
5391
5392    child = bdrv_attach_child_noperm(bs_new, bs_top, "backing",
5393                                     &child_of_bds, bdrv_backing_role(bs_new),
5394                                     tran, errp);
5395    if (!child) {
5396        ret = -EINVAL;
5397        goto out;
5398    }
5399
5400    /*
5401     * bdrv_attach_child_noperm could change the AioContext of bs_top.
5402     * bdrv_replace_node_noperm calls bdrv_drained_begin, so let's temporarily
5403     * hold the new AioContext, since bdrv_drained_begin calls BDRV_POLL_WHILE
5404     * that assumes the new lock is taken.
5405     */
5406    new_context = bdrv_get_aio_context(bs_top);
5407
5408    if (old_context != new_context) {
5409        aio_context_release(old_context);
5410        aio_context_acquire(new_context);
5411    }
5412
5413    ret = bdrv_replace_node_noperm(bs_top, bs_new, true, tran, errp);
5414    if (ret < 0) {
5415        goto out;
5416    }
5417
5418    ret = bdrv_refresh_perms(bs_new, tran, errp);
5419out:
5420    tran_finalize(tran, ret);
5421
5422    bdrv_graph_rdlock_main_loop();
5423    bdrv_refresh_limits(bs_top, NULL, NULL);
5424    bdrv_graph_rdunlock_main_loop();
5425
5426    if (new_context && old_context != new_context) {
5427        aio_context_release(new_context);
5428        aio_context_acquire(old_context);
5429    }
5430
5431    return ret;
5432}
5433
5434/* Not for empty child */
5435int bdrv_replace_child_bs(BdrvChild *child, BlockDriverState *new_bs,
5436                          Error **errp)
5437{
5438    int ret;
5439    Transaction *tran = tran_new();
5440    g_autoptr(GSList) refresh_list = NULL;
5441    BlockDriverState *old_bs = child->bs;
5442
5443    GLOBAL_STATE_CODE();
5444
5445    bdrv_ref(old_bs);
5446    bdrv_drained_begin(old_bs);
5447    bdrv_drained_begin(new_bs);
5448
5449    bdrv_replace_child_tran(child, new_bs, tran);
5450
5451    refresh_list = g_slist_prepend(refresh_list, old_bs);
5452    refresh_list = g_slist_prepend(refresh_list, new_bs);
5453
5454    ret = bdrv_list_refresh_perms(refresh_list, NULL, tran, errp);
5455
5456    tran_finalize(tran, ret);
5457
5458    bdrv_drained_end(old_bs);
5459    bdrv_drained_end(new_bs);
5460    bdrv_unref(old_bs);
5461
5462    return ret;
5463}
5464
5465static void bdrv_delete(BlockDriverState *bs)
5466{
5467    assert(bdrv_op_blocker_is_empty(bs));
5468    assert(!bs->refcnt);
5469    GLOBAL_STATE_CODE();
5470
5471    /* remove from list, if necessary */
5472    if (bs->node_name[0] != '\0') {
5473        QTAILQ_REMOVE(&graph_bdrv_states, bs, node_list);
5474    }
5475    QTAILQ_REMOVE(&all_bdrv_states, bs, bs_list);
5476
5477    bdrv_close(bs);
5478
5479    g_free(bs);
5480}
5481
5482
5483/*
5484 * Replace @bs by newly created block node.
5485 *
5486 * @options is a QDict of options to pass to the block drivers, or NULL for an
5487 * empty set of options. The reference to the QDict belongs to the block layer
5488 * after the call (even on failure), so if the caller intends to reuse the
5489 * dictionary, it needs to use qobject_ref() before calling bdrv_open.
5490 *
5491 * The caller holds the AioContext lock for @bs. It must make sure that @bs
5492 * stays in the same AioContext, i.e. @options must not refer to nodes in a
5493 * different AioContext.
5494 */
5495BlockDriverState *bdrv_insert_node(BlockDriverState *bs, QDict *options,
5496                                   int flags, Error **errp)
5497{
5498    ERRP_GUARD();
5499    int ret;
5500    AioContext *ctx = bdrv_get_aio_context(bs);
5501    BlockDriverState *new_node_bs = NULL;
5502    const char *drvname, *node_name;
5503    BlockDriver *drv;
5504
5505    drvname = qdict_get_try_str(options, "driver");
5506    if (!drvname) {
5507        error_setg(errp, "driver is not specified");
5508        goto fail;
5509    }
5510
5511    drv = bdrv_find_format(drvname);
5512    if (!drv) {
5513        error_setg(errp, "Unknown driver: '%s'", drvname);
5514        goto fail;
5515    }
5516
5517    node_name = qdict_get_try_str(options, "node-name");
5518
5519    GLOBAL_STATE_CODE();
5520
5521    aio_context_release(ctx);
5522    aio_context_acquire(qemu_get_aio_context());
5523    new_node_bs = bdrv_new_open_driver_opts(drv, node_name, options, flags,
5524                                            errp);
5525    aio_context_release(qemu_get_aio_context());
5526    aio_context_acquire(ctx);
5527    assert(bdrv_get_aio_context(bs) == ctx);
5528
5529    options = NULL; /* bdrv_new_open_driver() eats options */
5530    if (!new_node_bs) {
5531        error_prepend(errp, "Could not create node: ");
5532        goto fail;
5533    }
5534
5535    bdrv_drained_begin(bs);
5536    ret = bdrv_replace_node(bs, new_node_bs, errp);
5537    bdrv_drained_end(bs);
5538
5539    if (ret < 0) {
5540        error_prepend(errp, "Could not replace node: ");
5541        goto fail;
5542    }
5543
5544    return new_node_bs;
5545
5546fail:
5547    qobject_unref(options);
5548    bdrv_unref(new_node_bs);
5549    return NULL;
5550}
5551
5552/*
5553 * Run consistency checks on an image
5554 *
5555 * Returns 0 if the check could be completed (it doesn't mean that the image is
5556 * free of errors) or -errno when an internal error occurred. The results of the
5557 * check are stored in res.
5558 */
5559int coroutine_fn bdrv_co_check(BlockDriverState *bs,
5560                               BdrvCheckResult *res, BdrvCheckMode fix)
5561{
5562    IO_CODE();
5563    assert_bdrv_graph_readable();
5564    if (bs->drv == NULL) {
5565        return -ENOMEDIUM;
5566    }
5567    if (bs->drv->bdrv_co_check == NULL) {
5568        return -ENOTSUP;
5569    }
5570
5571    memset(res, 0, sizeof(*res));
5572    return bs->drv->bdrv_co_check(bs, res, fix);
5573}
5574
5575/*
5576 * Return values:
5577 * 0        - success
5578 * -EINVAL  - backing format specified, but no file
5579 * -ENOSPC  - can't update the backing file because no space is left in the
5580 *            image file header
5581 * -ENOTSUP - format driver doesn't support changing the backing file
5582 */
5583int bdrv_change_backing_file(BlockDriverState *bs, const char *backing_file,
5584                             const char *backing_fmt, bool require)
5585{
5586    BlockDriver *drv = bs->drv;
5587    int ret;
5588
5589    GLOBAL_STATE_CODE();
5590
5591    if (!drv) {
5592        return -ENOMEDIUM;
5593    }
5594
5595    /* Backing file format doesn't make sense without a backing file */
5596    if (backing_fmt && !backing_file) {
5597        return -EINVAL;
5598    }
5599
5600    if (require && backing_file && !backing_fmt) {
5601        return -EINVAL;
5602    }
5603
5604    if (drv->bdrv_change_backing_file != NULL) {
5605        ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
5606    } else {
5607        ret = -ENOTSUP;
5608    }
5609
5610    if (ret == 0) {
5611        pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
5612        pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
5613        pstrcpy(bs->auto_backing_file, sizeof(bs->auto_backing_file),
5614                backing_file ?: "");
5615    }
5616    return ret;
5617}
5618
5619/*
5620 * Finds the first non-filter node above bs in the chain between
5621 * active and bs.  The returned node is either an immediate parent of
5622 * bs, or there are only filter nodes between the two.
5623 *
5624 * Returns NULL if bs is not found in active's image chain,
5625 * or if active == bs.
5626 *
5627 * Returns the bottommost base image if bs == NULL.
5628 */
5629BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
5630                                    BlockDriverState *bs)
5631{
5632
5633    GLOBAL_STATE_CODE();
5634
5635    bs = bdrv_skip_filters(bs);
5636    active = bdrv_skip_filters(active);
5637
5638    while (active) {
5639        BlockDriverState *next = bdrv_backing_chain_next(active);
5640        if (bs == next) {
5641            return active;
5642        }
5643        active = next;
5644    }
5645
5646    return NULL;
5647}
5648
5649/* Given a BDS, searches for the base layer. */
5650BlockDriverState *bdrv_find_base(BlockDriverState *bs)
5651{
5652    GLOBAL_STATE_CODE();
5653
5654    return bdrv_find_overlay(bs, NULL);
5655}
5656
5657/*
5658 * Return true if at least one of the COW (backing) and filter links
5659 * between @bs and @base is frozen. @errp is set if that's the case.
5660 * @base must be reachable from @bs, or NULL.
5661 */
5662bool bdrv_is_backing_chain_frozen(BlockDriverState *bs, BlockDriverState *base,
5663                                  Error **errp)
5664{
5665    BlockDriverState *i;
5666    BdrvChild *child;
5667
5668    GLOBAL_STATE_CODE();
5669
5670    for (i = bs; i != base; i = child_bs(child)) {
5671        child = bdrv_filter_or_cow_child(i);
5672
5673        if (child && child->frozen) {
5674            error_setg(errp, "Cannot change '%s' link from '%s' to '%s'",
5675                       child->name, i->node_name, child->bs->node_name);
5676            return true;
5677        }
5678    }
5679
5680    return false;
5681}
5682
5683/*
5684 * Freeze all COW (backing) and filter links between @bs and @base.
5685 * If any of the links is already frozen the operation is aborted and
5686 * none of the links are modified.
5687 * @base must be reachable from @bs, or NULL.
5688 * Returns 0 on success. On failure returns < 0 and sets @errp.
5689 */
5690int bdrv_freeze_backing_chain(BlockDriverState *bs, BlockDriverState *base,
5691                              Error **errp)
5692{
5693    BlockDriverState *i;
5694    BdrvChild *child;
5695
5696    GLOBAL_STATE_CODE();
5697
5698    if (bdrv_is_backing_chain_frozen(bs, base, errp)) {
5699        return -EPERM;
5700    }
5701
5702    for (i = bs; i != base; i = child_bs(child)) {
5703        child = bdrv_filter_or_cow_child(i);
5704        if (child && child->bs->never_freeze) {
5705            error_setg(errp, "Cannot freeze '%s' link to '%s'",
5706                       child->name, child->bs->node_name);
5707            return -EPERM;
5708        }
5709    }
5710
5711    for (i = bs; i != base; i = child_bs(child)) {
5712        child = bdrv_filter_or_cow_child(i);
5713        if (child) {
5714            child->frozen = true;
5715        }
5716    }
5717
5718    return 0;
5719}
5720
5721/*
5722 * Unfreeze all COW (backing) and filter links between @bs and @base.
5723 * The caller must ensure that all links are frozen before using this
5724 * function.
5725 * @base must be reachable from @bs, or NULL.
5726 */
5727void bdrv_unfreeze_backing_chain(BlockDriverState *bs, BlockDriverState *base)
5728{
5729    BlockDriverState *i;
5730    BdrvChild *child;
5731
5732    GLOBAL_STATE_CODE();
5733
5734    for (i = bs; i != base; i = child_bs(child)) {
5735        child = bdrv_filter_or_cow_child(i);
5736        if (child) {
5737            assert(child->frozen);
5738            child->frozen = false;
5739        }
5740    }
5741}
5742
/*
 * Drops images above 'base' up to and including 'top', and sets the image
 * above 'top' to have base as its backing file.
 *
 * Requires that the overlay to 'top' is opened r/w, so that the backing file
 * information in 'bs' can be properly updated.
 *
 * E.g., this will convert the following chain:
 * bottom <- base <- intermediate <- top <- active
 *
 * to
 *
 * bottom <- base <- active
 *
 * It is allowed for bottom==base, in which case it converts:
 *
 * base <- intermediate <- top <- active
 *
 * to
 *
 * base <- active
 *
 * If backing_file_str is non-NULL, it will be used when modifying top's
 * overlay image metadata. Otherwise base's refreshed filename is used.
 *
 * Error conditions:
 *  if active == top, that is considered an error
 *
 * Returns 0 on success. Returns -EIO if 'top' or 'base' has no driver, if
 * 'base' is not in the chain of 'top', or if replacing the node fails.
 * If a parent's update_filename callback fails, its negative return value
 * is returned. Errors are reported with error_report_err().
 */
int bdrv_drop_intermediate(BlockDriverState *top, BlockDriverState *base,
                           const char *backing_file_str)
{
    BlockDriverState *explicit_top = top;
    bool update_inherits_from;
    BdrvChild *c;
    Error *local_err = NULL;
    int ret = -EIO;
    g_autoptr(GSList) updated_children = NULL;
    GSList *p;

    GLOBAL_STATE_CODE();

    /* Keep 'top' referenced until the exit path; 'base' stays drained */
    bdrv_ref(top);
    bdrv_drained_begin(base);

    if (!top->drv || !base->drv) {
        goto exit;
    }

    /* Make sure that base is in the backing chain of top */
    if (!bdrv_chain_contains(top, base)) {
        goto exit;
    }

    /* If 'base' recursively inherits from 'top' then we should set
     * base->inherits_from to top->inherits_from after 'top' and all
     * other intermediate nodes have been dropped.
     * If 'top' is an implicit node (e.g. "commit_top") we should skip
     * it because no one inherits from it. We use explicit_top for that. */
    explicit_top = bdrv_skip_implicit_filters(explicit_top);
    update_inherits_from = bdrv_inherits_from_recursive(base, explicit_top);

    /* success - we can delete the intermediate states, and link top->base */
    if (!backing_file_str) {
        bdrv_refresh_filename(base);
        backing_file_str = base->filename;
    }

    /* Remember the current parents of 'top' before the graph is changed */
    QLIST_FOREACH(c, &top->parents, next_parent) {
        updated_children = g_slist_prepend(updated_children, c);
    }

    /*
     * It seems correct to pass detach_subchain=true here, but it triggers
     * one more yet not fixed bug, when due to nested aio_poll loop we switch to
     * another drained section, which modify the graph (for example, removing
     * the child, which we keep in updated_children list). So, it's a TODO.
     *
     * Note, bug triggered if pass detach_subchain=true here and run
     * test-bdrv-drain. test_drop_intermediate_poll() test-case will crash.
     * That's a FIXME.
     */
    bdrv_replace_node_common(top, base, false, false, &local_err);
    if (local_err) {
        error_report_err(local_err);
        goto exit;
    }

    /* Let every remembered parent that has the callback update its filename */
    for (p = updated_children; p; p = p->next) {
        c = p->data;

        if (c->klass->update_filename) {
            ret = c->klass->update_filename(c, base, backing_file_str,
                                            &local_err);
            if (ret < 0) {
                /*
                 * TODO: Actually, we want to rollback all previous iterations
                 * of this loop, and (which is almost impossible) previous
                 * bdrv_replace_node()...
                 *
                 * Note, that c->klass->update_filename may lead to permission
                 * update, so it's a bad idea to call it inside permission
                 * update transaction of bdrv_replace_node.
                 */
                error_report_err(local_err);
                goto exit;
            }
        }
    }

    if (update_inherits_from) {
        base->inherits_from = explicit_top->inherits_from;
    }

    ret = 0;
exit:
    bdrv_drained_end(base);
    bdrv_unref(top);
    return ret;
}
5863
5864/**
5865 * Implementation of BlockDriver.bdrv_co_get_allocated_file_size() that
5866 * sums the size of all data-bearing children.  (This excludes backing
5867 * children.)
5868 */
5869static int64_t coroutine_fn GRAPH_RDLOCK
5870bdrv_sum_allocated_file_size(BlockDriverState *bs)
5871{
5872    BdrvChild *child;
5873    int64_t child_size, sum = 0;
5874
5875    QLIST_FOREACH(child, &bs->children, next) {
5876        if (child->role & (BDRV_CHILD_DATA | BDRV_CHILD_METADATA |
5877                           BDRV_CHILD_FILTERED))
5878        {
5879            child_size = bdrv_co_get_allocated_file_size(child->bs);
5880            if (child_size < 0) {
5881                return child_size;
5882            }
5883            sum += child_size;
5884        }
5885    }
5886
5887    return sum;
5888}
5889
5890/**
5891 * Length of a allocated file in bytes. Sparse files are counted by actual
5892 * allocated space. Return < 0 if error or unknown.
5893 */
5894int64_t coroutine_fn bdrv_co_get_allocated_file_size(BlockDriverState *bs)
5895{
5896    BlockDriver *drv = bs->drv;
5897    IO_CODE();
5898    assert_bdrv_graph_readable();
5899
5900    if (!drv) {
5901        return -ENOMEDIUM;
5902    }
5903    if (drv->bdrv_co_get_allocated_file_size) {
5904        return drv->bdrv_co_get_allocated_file_size(bs);
5905    }
5906
5907    if (drv->bdrv_file_open) {
5908        /*
5909         * Protocol drivers default to -ENOTSUP (most of their data is
5910         * not stored in any of their children (if they even have any),
5911         * so there is no generic way to figure it out).
5912         */
5913        return -ENOTSUP;
5914    } else if (drv->is_filter) {
5915        /* Filter drivers default to the size of their filtered child */
5916        return bdrv_co_get_allocated_file_size(bdrv_filter_bs(bs));
5917    } else {
5918        /* Other drivers default to summing their children's sizes */
5919        return bdrv_sum_allocated_file_size(bs);
5920    }
5921}
5922
5923/*
5924 * bdrv_measure:
5925 * @drv: Format driver
5926 * @opts: Creation options for new image
5927 * @in_bs: Existing image containing data for new image (may be NULL)
5928 * @errp: Error object
5929 * Returns: A #BlockMeasureInfo (free using qapi_free_BlockMeasureInfo())
5930 *          or NULL on error
5931 *
5932 * Calculate file size required to create a new image.
5933 *
5934 * If @in_bs is given then space for allocated clusters and zero clusters
5935 * from that image are included in the calculation.  If @opts contains a
5936 * backing file that is shared by @in_bs then backing clusters may be omitted
5937 * from the calculation.
5938 *
5939 * If @in_bs is NULL then the calculation includes no allocated clusters
5940 * unless a preallocation option is given in @opts.
5941 *
5942 * Note that @in_bs may use a different BlockDriver from @drv.
5943 *
5944 * If an error occurs the @errp pointer is set.
5945 */
5946BlockMeasureInfo *bdrv_measure(BlockDriver *drv, QemuOpts *opts,
5947                               BlockDriverState *in_bs, Error **errp)
5948{
5949    IO_CODE();
5950    if (!drv->bdrv_measure) {
5951        error_setg(errp, "Block driver '%s' does not support size measurement",
5952                   drv->format_name);
5953        return NULL;
5954    }
5955
5956    return drv->bdrv_measure(opts, in_bs, errp);
5957}
5958
5959/**
5960 * Return number of sectors on success, -errno on error.
5961 */
5962int64_t coroutine_fn bdrv_co_nb_sectors(BlockDriverState *bs)
5963{
5964    BlockDriver *drv = bs->drv;
5965    IO_CODE();
5966    assert_bdrv_graph_readable();
5967
5968    if (!drv)
5969        return -ENOMEDIUM;
5970
5971    if (bs->bl.has_variable_length) {
5972        int ret = bdrv_co_refresh_total_sectors(bs, bs->total_sectors);
5973        if (ret < 0) {
5974            return ret;
5975        }
5976    }
5977    return bs->total_sectors;
5978}
5979
5980/*
5981 * This wrapper is written by hand because this function is in the hot I/O path,
5982 * via blk_get_geometry.
5983 */
5984int64_t coroutine_mixed_fn bdrv_nb_sectors(BlockDriverState *bs)
5985{
5986    BlockDriver *drv = bs->drv;
5987    IO_CODE();
5988
5989    if (!drv)
5990        return -ENOMEDIUM;
5991
5992    if (bs->bl.has_variable_length) {
5993        int ret = bdrv_refresh_total_sectors(bs, bs->total_sectors);
5994        if (ret < 0) {
5995            return ret;
5996        }
5997    }
5998
5999    return bs->total_sectors;
6000}
6001
6002/**
6003 * Return length in bytes on success, -errno on error.
6004 * The length is always a multiple of BDRV_SECTOR_SIZE.
6005 */
6006int64_t coroutine_fn bdrv_co_getlength(BlockDriverState *bs)
6007{
6008    int64_t ret;
6009    IO_CODE();
6010    assert_bdrv_graph_readable();
6011
6012    ret = bdrv_co_nb_sectors(bs);
6013    if (ret < 0) {
6014        return ret;
6015    }
6016    if (ret > INT64_MAX / BDRV_SECTOR_SIZE) {
6017        return -EFBIG;
6018    }
6019    return ret * BDRV_SECTOR_SIZE;
6020}
6021
6022bool bdrv_is_sg(BlockDriverState *bs)
6023{
6024    IO_CODE();
6025    return bs->sg;
6026}
6027
6028/**
6029 * Return whether the given node supports compressed writes.
6030 */
6031bool bdrv_supports_compressed_writes(BlockDriverState *bs)
6032{
6033    BlockDriverState *filtered;
6034    IO_CODE();
6035
6036    if (!bs->drv || !block_driver_can_compress(bs->drv)) {
6037        return false;
6038    }
6039
6040    filtered = bdrv_filter_bs(bs);
6041    if (filtered) {
6042        /*
6043         * Filters can only forward compressed writes, so we have to
6044         * check the child.
6045         */
6046        return bdrv_supports_compressed_writes(filtered);
6047    }
6048
6049    return true;
6050}
6051
6052const char *bdrv_get_format_name(BlockDriverState *bs)
6053{
6054    IO_CODE();
6055    return bs->drv ? bs->drv->format_name : NULL;
6056}
6057
/*
 * qsort() comparison callback for an array of C strings: @a and @b point
 * to the array elements (i.e. to string pointers), which are compared
 * with strcmp().
 */
static int qsort_strcmp(const void *a, const void *b)
{
    const char *const *lhs = a;
    const char *const *rhs = b;

    return strcmp(*lhs, *rhs);
}
6062
6063void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
6064                         void *opaque, bool read_only)
6065{
6066    BlockDriver *drv;
6067    int count = 0;
6068    int i;
6069    const char **formats = NULL;
6070
6071    GLOBAL_STATE_CODE();
6072
6073    QLIST_FOREACH(drv, &bdrv_drivers, list) {
6074        if (drv->format_name) {
6075            bool found = false;
6076            int i = count;
6077
6078            if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, read_only)) {
6079                continue;
6080            }
6081
6082            while (formats && i && !found) {
6083                found = !strcmp(formats[--i], drv->format_name);
6084            }
6085
6086            if (!found) {
6087                formats = g_renew(const char *, formats, count + 1);
6088                formats[count++] = drv->format_name;
6089            }
6090        }
6091    }
6092
6093    for (i = 0; i < (int)ARRAY_SIZE(block_driver_modules); i++) {
6094        const char *format_name = block_driver_modules[i].format_name;
6095
6096        if (format_name) {
6097            bool found = false;
6098            int j = count;
6099
6100            if (use_bdrv_whitelist &&
6101                !bdrv_format_is_whitelisted(format_name, read_only)) {
6102                continue;
6103            }
6104
6105            while (formats && j && !found) {
6106                found = !strcmp(formats[--j], format_name);
6107            }
6108
6109            if (!found) {
6110                formats = g_renew(const char *, formats, count + 1);
6111                formats[count++] = format_name;
6112            }
6113        }
6114    }
6115
6116    qsort(formats, count, sizeof(formats[0]), qsort_strcmp);
6117
6118    for (i = 0; i < count; i++) {
6119        it(opaque, formats[i]);
6120    }
6121
6122    g_free(formats);
6123}
6124
6125/* This function is to find a node in the bs graph */
6126BlockDriverState *bdrv_find_node(const char *node_name)
6127{
6128    BlockDriverState *bs;
6129
6130    assert(node_name);
6131    GLOBAL_STATE_CODE();
6132
6133    QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
6134        if (!strcmp(node_name, bs->node_name)) {
6135            return bs;
6136        }
6137    }
6138    return NULL;
6139}
6140
6141/* Put this QMP function here so it can access the static graph_bdrv_states. */
6142BlockDeviceInfoList *bdrv_named_nodes_list(bool flat,
6143                                           Error **errp)
6144{
6145    BlockDeviceInfoList *list;
6146    BlockDriverState *bs;
6147
6148    GLOBAL_STATE_CODE();
6149
6150    list = NULL;
6151    QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
6152        BlockDeviceInfo *info = bdrv_block_device_info(NULL, bs, flat, errp);
6153        if (!info) {
6154            qapi_free_BlockDeviceInfoList(list);
6155            return NULL;
6156        }
6157        QAPI_LIST_PREPEND(list, info);
6158    }
6159
6160    return list;
6161}
6162
/* Working state used while an XDbgBlockGraph is being assembled. */
typedef struct XDbgBlockGraphConstructor {
    /* Graph under construction; handed out by xdbg_graph_finalize() */
    XDbgBlockGraph *graph;
    /* Maps object pointers to the ids assigned by xdbg_graph_node_num() */
    GHashTable *graph_nodes;
} XDbgBlockGraphConstructor;
6167
6168static XDbgBlockGraphConstructor *xdbg_graph_new(void)
6169{
6170    XDbgBlockGraphConstructor *gr = g_new(XDbgBlockGraphConstructor, 1);
6171
6172    gr->graph = g_new0(XDbgBlockGraph, 1);
6173    gr->graph_nodes = g_hash_table_new(NULL, NULL);
6174
6175    return gr;
6176}
6177
6178static XDbgBlockGraph *xdbg_graph_finalize(XDbgBlockGraphConstructor *gr)
6179{
6180    XDbgBlockGraph *graph = gr->graph;
6181
6182    g_hash_table_destroy(gr->graph_nodes);
6183    g_free(gr);
6184
6185    return graph;
6186}
6187
6188static uintptr_t xdbg_graph_node_num(XDbgBlockGraphConstructor *gr, void *node)
6189{
6190    uintptr_t ret = (uintptr_t)g_hash_table_lookup(gr->graph_nodes, node);
6191
6192    if (ret != 0) {
6193        return ret;
6194    }
6195
6196    /*
6197     * Start counting from 1, not 0, because 0 interferes with not-found (NULL)
6198     * answer of g_hash_table_lookup.
6199     */
6200    ret = g_hash_table_size(gr->graph_nodes) + 1;
6201    g_hash_table_insert(gr->graph_nodes, node, (void *)ret);
6202
6203    return ret;
6204}
6205
6206static void xdbg_graph_add_node(XDbgBlockGraphConstructor *gr, void *node,
6207                                XDbgBlockGraphNodeType type, const char *name)
6208{
6209    XDbgBlockGraphNode *n;
6210
6211    n = g_new0(XDbgBlockGraphNode, 1);
6212
6213    n->id = xdbg_graph_node_num(gr, node);
6214    n->type = type;
6215    n->name = g_strdup(name);
6216
6217    QAPI_LIST_PREPEND(gr->graph->nodes, n);
6218}
6219
6220static void xdbg_graph_add_edge(XDbgBlockGraphConstructor *gr, void *parent,
6221                                const BdrvChild *child)
6222{
6223    BlockPermission qapi_perm;
6224    XDbgBlockGraphEdge *edge;
6225    GLOBAL_STATE_CODE();
6226
6227    edge = g_new0(XDbgBlockGraphEdge, 1);
6228
6229    edge->parent = xdbg_graph_node_num(gr, parent);
6230    edge->child = xdbg_graph_node_num(gr, child->bs);
6231    edge->name = g_strdup(child->name);
6232
6233    for (qapi_perm = 0; qapi_perm < BLOCK_PERMISSION__MAX; qapi_perm++) {
6234        uint64_t flag = bdrv_qapi_perm_to_blk_perm(qapi_perm);
6235
6236        if (flag & child->perm) {
6237            QAPI_LIST_PREPEND(edge->perm, qapi_perm);
6238        }
6239        if (flag & child->shared_perm) {
6240            QAPI_LIST_PREPEND(edge->shared_perm, qapi_perm);
6241        }
6242    }
6243
6244    QAPI_LIST_PREPEND(gr->graph->edges, edge);
6245}
6246
6247
6248XDbgBlockGraph *bdrv_get_xdbg_block_graph(Error **errp)
6249{
6250    BlockBackend *blk;
6251    BlockJob *job;
6252    BlockDriverState *bs;
6253    BdrvChild *child;
6254    XDbgBlockGraphConstructor *gr = xdbg_graph_new();
6255
6256    GLOBAL_STATE_CODE();
6257
6258    for (blk = blk_all_next(NULL); blk; blk = blk_all_next(blk)) {
6259        char *allocated_name = NULL;
6260        const char *name = blk_name(blk);
6261
6262        if (!*name) {
6263            name = allocated_name = blk_get_attached_dev_id(blk);
6264        }
6265        xdbg_graph_add_node(gr, blk, X_DBG_BLOCK_GRAPH_NODE_TYPE_BLOCK_BACKEND,
6266                           name);
6267        g_free(allocated_name);
6268        if (blk_root(blk)) {
6269            xdbg_graph_add_edge(gr, blk, blk_root(blk));
6270        }
6271    }
6272
6273    WITH_JOB_LOCK_GUARD() {
6274        for (job = block_job_next_locked(NULL); job;
6275             job = block_job_next_locked(job)) {
6276            GSList *el;
6277
6278            xdbg_graph_add_node(gr, job, X_DBG_BLOCK_GRAPH_NODE_TYPE_BLOCK_JOB,
6279                                job->job.id);
6280            for (el = job->nodes; el; el = el->next) {
6281                xdbg_graph_add_edge(gr, job, (BdrvChild *)el->data);
6282            }
6283        }
6284    }
6285
6286    QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
6287        xdbg_graph_add_node(gr, bs, X_DBG_BLOCK_GRAPH_NODE_TYPE_BLOCK_DRIVER,
6288                           bs->node_name);
6289        QLIST_FOREACH(child, &bs->children, next) {
6290            xdbg_graph_add_edge(gr, bs, child);
6291        }
6292    }
6293
6294    return xdbg_graph_finalize(gr);
6295}
6296
6297BlockDriverState *bdrv_lookup_bs(const char *device,
6298                                 const char *node_name,
6299                                 Error **errp)
6300{
6301    BlockBackend *blk;
6302    BlockDriverState *bs;
6303
6304    GLOBAL_STATE_CODE();
6305
6306    if (device) {
6307        blk = blk_by_name(device);
6308
6309        if (blk) {
6310            bs = blk_bs(blk);
6311            if (!bs) {
6312                error_setg(errp, "Device '%s' has no medium", device);
6313            }
6314
6315            return bs;
6316        }
6317    }
6318
6319    if (node_name) {
6320        bs = bdrv_find_node(node_name);
6321
6322        if (bs) {
6323            return bs;
6324        }
6325    }
6326
6327    error_setg(errp, "Cannot find device=\'%s\' nor node-name=\'%s\'",
6328                     device ? device : "",
6329                     node_name ? node_name : "");
6330    return NULL;
6331}
6332
6333/* If 'base' is in the same chain as 'top', return true. Otherwise,
6334 * return false.  If either argument is NULL, return false. */
6335bool bdrv_chain_contains(BlockDriverState *top, BlockDriverState *base)
6336{
6337
6338    GLOBAL_STATE_CODE();
6339
6340    while (top && top != base) {
6341        top = bdrv_filter_or_cow_bs(top);
6342    }
6343
6344    return top != NULL;
6345}
6346
6347BlockDriverState *bdrv_next_node(BlockDriverState *bs)
6348{
6349    GLOBAL_STATE_CODE();
6350    if (!bs) {
6351        return QTAILQ_FIRST(&graph_bdrv_states);
6352    }
6353    return QTAILQ_NEXT(bs, node_list);
6354}
6355
6356BlockDriverState *bdrv_next_all_states(BlockDriverState *bs)
6357{
6358    GLOBAL_STATE_CODE();
6359    if (!bs) {
6360        return QTAILQ_FIRST(&all_bdrv_states);
6361    }
6362    return QTAILQ_NEXT(bs, bs_list);
6363}
6364
6365const char *bdrv_get_node_name(const BlockDriverState *bs)
6366{
6367    IO_CODE();
6368    return bs->node_name;
6369}
6370
6371const char *bdrv_get_parent_name(const BlockDriverState *bs)
6372{
6373    BdrvChild *c;
6374    const char *name;
6375    IO_CODE();
6376
6377    /* If multiple parents have a name, just pick the first one. */
6378    QLIST_FOREACH(c, &bs->parents, next_parent) {
6379        if (c->klass->get_name) {
6380            name = c->klass->get_name(c);
6381            if (name && *name) {
6382                return name;
6383            }
6384        }
6385    }
6386
6387    return NULL;
6388}
6389
6390/* TODO check what callers really want: bs->node_name or blk_name() */
6391const char *bdrv_get_device_name(const BlockDriverState *bs)
6392{
6393    IO_CODE();
6394    return bdrv_get_parent_name(bs) ?: "";
6395}
6396
6397/* This can be used to identify nodes that might not have a device
6398 * name associated. Since node and device names live in the same
6399 * namespace, the result is unambiguous. The exception is if both are
6400 * absent, then this returns an empty (non-null) string. */
6401const char *bdrv_get_device_or_node_name(const BlockDriverState *bs)
6402{
6403    IO_CODE();
6404    return bdrv_get_parent_name(bs) ?: bs->node_name;
6405}
6406
6407int bdrv_get_flags(BlockDriverState *bs)
6408{
6409    IO_CODE();
6410    return bs->open_flags;
6411}
6412
6413int bdrv_has_zero_init_1(BlockDriverState *bs)
6414{
6415    GLOBAL_STATE_CODE();
6416    return 1;
6417}
6418
6419int bdrv_has_zero_init(BlockDriverState *bs)
6420{
6421    BlockDriverState *filtered;
6422    GLOBAL_STATE_CODE();
6423
6424    if (!bs->drv) {
6425        return 0;
6426    }
6427
6428    /* If BS is a copy on write image, it is initialized to
6429       the contents of the base image, which may not be zeroes.  */
6430    if (bdrv_cow_child(bs)) {
6431        return 0;
6432    }
6433    if (bs->drv->bdrv_has_zero_init) {
6434        return bs->drv->bdrv_has_zero_init(bs);
6435    }
6436
6437    filtered = bdrv_filter_bs(bs);
6438    if (filtered) {
6439        return bdrv_has_zero_init(filtered);
6440    }
6441
6442    /* safe default */
6443    return 0;
6444}
6445
6446bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs)
6447{
6448    IO_CODE();
6449    if (!(bs->open_flags & BDRV_O_UNMAP)) {
6450        return false;
6451    }
6452
6453    return bs->supported_zero_flags & BDRV_REQ_MAY_UNMAP;
6454}
6455
6456void bdrv_get_backing_filename(BlockDriverState *bs,
6457                               char *filename, int filename_size)
6458{
6459    IO_CODE();
6460    pstrcpy(filename, filename_size, bs->backing_file);
6461}
6462
6463int coroutine_fn bdrv_co_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
6464{
6465    int ret;
6466    BlockDriver *drv = bs->drv;
6467    IO_CODE();
6468    assert_bdrv_graph_readable();
6469
6470    /* if bs->drv == NULL, bs is closed, so there's nothing to do here */
6471    if (!drv) {
6472        return -ENOMEDIUM;
6473    }
6474    if (!drv->bdrv_co_get_info) {
6475        BlockDriverState *filtered = bdrv_filter_bs(bs);
6476        if (filtered) {
6477            return bdrv_co_get_info(filtered, bdi);
6478        }
6479        return -ENOTSUP;
6480    }
6481    memset(bdi, 0, sizeof(*bdi));
6482    ret = drv->bdrv_co_get_info(bs, bdi);
6483    if (ret < 0) {
6484        return ret;
6485    }
6486
6487    if (bdi->cluster_size > BDRV_MAX_ALIGNMENT) {
6488        return -EINVAL;
6489    }
6490
6491    return 0;
6492}
6493
6494ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs,
6495                                          Error **errp)
6496{
6497    BlockDriver *drv = bs->drv;
6498    IO_CODE();
6499    if (drv && drv->bdrv_get_specific_info) {
6500        return drv->bdrv_get_specific_info(bs, errp);
6501    }
6502    return NULL;
6503}
6504
6505BlockStatsSpecific *bdrv_get_specific_stats(BlockDriverState *bs)
6506{
6507    BlockDriver *drv = bs->drv;
6508    IO_CODE();
6509    if (!drv || !drv->bdrv_get_specific_stats) {
6510        return NULL;
6511    }
6512    return drv->bdrv_get_specific_stats(bs);
6513}
6514
6515void coroutine_fn bdrv_co_debug_event(BlockDriverState *bs, BlkdebugEvent event)
6516{
6517    IO_CODE();
6518    assert_bdrv_graph_readable();
6519
6520    if (!bs || !bs->drv || !bs->drv->bdrv_co_debug_event) {
6521        return;
6522    }
6523
6524    bs->drv->bdrv_co_debug_event(bs, event);
6525}
6526
6527static BlockDriverState *bdrv_find_debug_node(BlockDriverState *bs)
6528{
6529    GLOBAL_STATE_CODE();
6530    while (bs && bs->drv && !bs->drv->bdrv_debug_breakpoint) {
6531        bs = bdrv_primary_bs(bs);
6532    }
6533
6534    if (bs && bs->drv && bs->drv->bdrv_debug_breakpoint) {
6535        assert(bs->drv->bdrv_debug_remove_breakpoint);
6536        return bs;
6537    }
6538
6539    return NULL;
6540}
6541
6542int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event,
6543                          const char *tag)
6544{
6545    GLOBAL_STATE_CODE();
6546    bs = bdrv_find_debug_node(bs);
6547    if (bs) {
6548        return bs->drv->bdrv_debug_breakpoint(bs, event, tag);
6549    }
6550
6551    return -ENOTSUP;
6552}
6553
6554int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag)
6555{
6556    GLOBAL_STATE_CODE();
6557    bs = bdrv_find_debug_node(bs);
6558    if (bs) {
6559        return bs->drv->bdrv_debug_remove_breakpoint(bs, tag);
6560    }
6561
6562    return -ENOTSUP;
6563}
6564
6565int bdrv_debug_resume(BlockDriverState *bs, const char *tag)
6566{
6567    GLOBAL_STATE_CODE();
6568    while (bs && (!bs->drv || !bs->drv->bdrv_debug_resume)) {
6569        bs = bdrv_primary_bs(bs);
6570    }
6571
6572    if (bs && bs->drv && bs->drv->bdrv_debug_resume) {
6573        return bs->drv->bdrv_debug_resume(bs, tag);
6574    }
6575
6576    return -ENOTSUP;
6577}
6578
6579bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag)
6580{
6581    GLOBAL_STATE_CODE();
6582    while (bs && bs->drv && !bs->drv->bdrv_debug_is_suspended) {
6583        bs = bdrv_primary_bs(bs);
6584    }
6585
6586    if (bs && bs->drv && bs->drv->bdrv_debug_is_suspended) {
6587        return bs->drv->bdrv_debug_is_suspended(bs, tag);
6588    }
6589
6590    return false;
6591}
6592
6593/* backing_file can either be relative, or absolute, or a protocol.  If it is
6594 * relative, it must be relative to the chain.  So, passing in bs->filename
6595 * from a BDS as backing_file should not be done, as that may be relative to
6596 * the CWD rather than the chain. */
6597BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
6598        const char *backing_file)
6599{
6600    char *filename_full = NULL;
6601    char *backing_file_full = NULL;
6602    char *filename_tmp = NULL;
6603    int is_protocol = 0;
6604    bool filenames_refreshed = false;
6605    BlockDriverState *curr_bs = NULL;
6606    BlockDriverState *retval = NULL;
6607    BlockDriverState *bs_below;
6608
6609    GLOBAL_STATE_CODE();
6610
6611    if (!bs || !bs->drv || !backing_file) {
6612        return NULL;
6613    }
6614
6615    filename_full     = g_malloc(PATH_MAX);
6616    backing_file_full = g_malloc(PATH_MAX);
6617
6618    is_protocol = path_has_protocol(backing_file);
6619
6620    /*
6621     * Being largely a legacy function, skip any filters here
6622     * (because filters do not have normal filenames, so they cannot
6623     * match anyway; and allowing json:{} filenames is a bit out of
6624     * scope).
6625     */
6626    for (curr_bs = bdrv_skip_filters(bs);
6627         bdrv_cow_child(curr_bs) != NULL;
6628         curr_bs = bs_below)
6629    {
6630        bs_below = bdrv_backing_chain_next(curr_bs);
6631
6632        if (bdrv_backing_overridden(curr_bs)) {
6633            /*
6634             * If the backing file was overridden, we can only compare
6635             * directly against the backing node's filename.
6636             */
6637
6638            if (!filenames_refreshed) {
6639                /*
6640                 * This will automatically refresh all of the
6641                 * filenames in the rest of the backing chain, so we
6642                 * only need to do this once.
6643                 */
6644                bdrv_refresh_filename(bs_below);
6645                filenames_refreshed = true;
6646            }
6647
6648            if (strcmp(backing_file, bs_below->filename) == 0) {
6649                retval = bs_below;
6650                break;
6651            }
6652        } else if (is_protocol || path_has_protocol(curr_bs->backing_file)) {
6653            /*
6654             * If either of the filename paths is actually a protocol, then
6655             * compare unmodified paths; otherwise make paths relative.
6656             */
6657            char *backing_file_full_ret;
6658
6659            if (strcmp(backing_file, curr_bs->backing_file) == 0) {
6660                retval = bs_below;
6661                break;
6662            }
6663            /* Also check against the full backing filename for the image */
6664            backing_file_full_ret = bdrv_get_full_backing_filename(curr_bs,
6665                                                                   NULL);
6666            if (backing_file_full_ret) {
6667                bool equal = strcmp(backing_file, backing_file_full_ret) == 0;
6668                g_free(backing_file_full_ret);
6669                if (equal) {
6670                    retval = bs_below;
6671                    break;
6672                }
6673            }
6674        } else {
6675            /* If not an absolute filename path, make it relative to the current
6676             * image's filename path */
6677            filename_tmp = bdrv_make_absolute_filename(curr_bs, backing_file,
6678                                                       NULL);
6679            /* We are going to compare canonicalized absolute pathnames */
6680            if (!filename_tmp || !realpath(filename_tmp, filename_full)) {
6681                g_free(filename_tmp);
6682                continue;
6683            }
6684            g_free(filename_tmp);
6685
6686            /* We need to make sure the backing filename we are comparing against
6687             * is relative to the current image filename (or absolute) */
6688            filename_tmp = bdrv_get_full_backing_filename(curr_bs, NULL);
6689            if (!filename_tmp || !realpath(filename_tmp, backing_file_full)) {
6690                g_free(filename_tmp);
6691                continue;
6692            }
6693            g_free(filename_tmp);
6694
6695            if (strcmp(backing_file_full, filename_full) == 0) {
6696                retval = bs_below;
6697                break;
6698            }
6699        }
6700    }
6701
6702    g_free(filename_full);
6703    g_free(backing_file_full);
6704    return retval;
6705}
6706
6707void bdrv_init(void)
6708{
6709#ifdef CONFIG_BDRV_WHITELIST_TOOLS
6710    use_bdrv_whitelist = 1;
6711#endif
6712    module_call_init(MODULE_INIT_BLOCK);
6713}
6714
6715void bdrv_init_with_whitelist(void)
6716{
6717    use_bdrv_whitelist = 1;
6718    bdrv_init();
6719}
6720
6721int bdrv_activate(BlockDriverState *bs, Error **errp)
6722{
6723    BdrvChild *child, *parent;
6724    Error *local_err = NULL;
6725    int ret;
6726    BdrvDirtyBitmap *bm;
6727
6728    GLOBAL_STATE_CODE();
6729
6730    if (!bs->drv)  {
6731        return -ENOMEDIUM;
6732    }
6733
6734    QLIST_FOREACH(child, &bs->children, next) {
6735        bdrv_activate(child->bs, &local_err);
6736        if (local_err) {
6737            error_propagate(errp, local_err);
6738            return -EINVAL;
6739        }
6740    }
6741
6742    /*
6743     * Update permissions, they may differ for inactive nodes.
6744     *
6745     * Note that the required permissions of inactive images are always a
6746     * subset of the permissions required after activating the image. This
6747     * allows us to just get the permissions upfront without restricting
6748     * bdrv_co_invalidate_cache().
6749     *
6750     * It also means that in error cases, we don't have to try and revert to
6751     * the old permissions (which is an operation that could fail, too). We can
6752     * just keep the extended permissions for the next time that an activation
6753     * of the image is tried.
6754     */
6755    if (bs->open_flags & BDRV_O_INACTIVE) {
6756        bs->open_flags &= ~BDRV_O_INACTIVE;
6757        ret = bdrv_refresh_perms(bs, NULL, errp);
6758        if (ret < 0) {
6759            bs->open_flags |= BDRV_O_INACTIVE;
6760            return ret;
6761        }
6762
6763        ret = bdrv_invalidate_cache(bs, errp);
6764        if (ret < 0) {
6765            bs->open_flags |= BDRV_O_INACTIVE;
6766            return ret;
6767        }
6768
6769        FOR_EACH_DIRTY_BITMAP(bs, bm) {
6770            bdrv_dirty_bitmap_skip_store(bm, false);
6771        }
6772
6773        ret = bdrv_refresh_total_sectors(bs, bs->total_sectors);
6774        if (ret < 0) {
6775            bs->open_flags |= BDRV_O_INACTIVE;
6776            error_setg_errno(errp, -ret, "Could not refresh total sector count");
6777            return ret;
6778        }
6779    }
6780
6781    QLIST_FOREACH(parent, &bs->parents, next_parent) {
6782        if (parent->klass->activate) {
6783            parent->klass->activate(parent, &local_err);
6784            if (local_err) {
6785                bs->open_flags |= BDRV_O_INACTIVE;
6786                error_propagate(errp, local_err);
6787                return -EINVAL;
6788            }
6789        }
6790    }
6791
6792    return 0;
6793}
6794
6795int coroutine_fn bdrv_co_invalidate_cache(BlockDriverState *bs, Error **errp)
6796{
6797    Error *local_err = NULL;
6798    IO_CODE();
6799
6800    assert(!(bs->open_flags & BDRV_O_INACTIVE));
6801    assert_bdrv_graph_readable();
6802
6803    if (bs->drv->bdrv_co_invalidate_cache) {
6804        bs->drv->bdrv_co_invalidate_cache(bs, &local_err);
6805        if (local_err) {
6806            error_propagate(errp, local_err);
6807            return -EINVAL;
6808        }
6809    }
6810
6811    return 0;
6812}
6813
6814void bdrv_activate_all(Error **errp)
6815{
6816    BlockDriverState *bs;
6817    BdrvNextIterator it;
6818
6819    GLOBAL_STATE_CODE();
6820
6821    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
6822        AioContext *aio_context = bdrv_get_aio_context(bs);
6823        int ret;
6824
6825        aio_context_acquire(aio_context);
6826        ret = bdrv_activate(bs, errp);
6827        aio_context_release(aio_context);
6828        if (ret < 0) {
6829            bdrv_next_cleanup(&it);
6830            return;
6831        }
6832    }
6833}
6834
6835static bool bdrv_has_bds_parent(BlockDriverState *bs, bool only_active)
6836{
6837    BdrvChild *parent;
6838    GLOBAL_STATE_CODE();
6839
6840    QLIST_FOREACH(parent, &bs->parents, next_parent) {
6841        if (parent->klass->parent_is_bds) {
6842            BlockDriverState *parent_bs = parent->opaque;
6843            if (!only_active || !(parent_bs->open_flags & BDRV_O_INACTIVE)) {
6844                return true;
6845            }
6846        }
6847    }
6848
6849    return false;
6850}
6851
6852static int bdrv_inactivate_recurse(BlockDriverState *bs)
6853{
6854    BdrvChild *child, *parent;
6855    int ret;
6856    uint64_t cumulative_perms, cumulative_shared_perms;
6857
6858    GLOBAL_STATE_CODE();
6859
6860    if (!bs->drv) {
6861        return -ENOMEDIUM;
6862    }
6863
6864    /* Make sure that we don't inactivate a child before its parent.
6865     * It will be covered by recursion from the yet active parent. */
6866    if (bdrv_has_bds_parent(bs, true)) {
6867        return 0;
6868    }
6869
6870    assert(!(bs->open_flags & BDRV_O_INACTIVE));
6871
6872    /* Inactivate this node */
6873    if (bs->drv->bdrv_inactivate) {
6874        ret = bs->drv->bdrv_inactivate(bs);
6875        if (ret < 0) {
6876            return ret;
6877        }
6878    }
6879
6880    QLIST_FOREACH(parent, &bs->parents, next_parent) {
6881        if (parent->klass->inactivate) {
6882            ret = parent->klass->inactivate(parent);
6883            if (ret < 0) {
6884                return ret;
6885            }
6886        }
6887    }
6888
6889    bdrv_get_cumulative_perm(bs, &cumulative_perms,
6890                             &cumulative_shared_perms);
6891    if (cumulative_perms & (BLK_PERM_WRITE | BLK_PERM_WRITE_UNCHANGED)) {
6892        /* Our inactive parents still need write access. Inactivation failed. */
6893        return -EPERM;
6894    }
6895
6896    bs->open_flags |= BDRV_O_INACTIVE;
6897
6898    /*
6899     * Update permissions, they may differ for inactive nodes.
6900     * We only tried to loosen restrictions, so errors are not fatal, ignore
6901     * them.
6902     */
6903    bdrv_refresh_perms(bs, NULL, NULL);
6904
6905    /* Recursively inactivate children */
6906    QLIST_FOREACH(child, &bs->children, next) {
6907        ret = bdrv_inactivate_recurse(child->bs);
6908        if (ret < 0) {
6909            return ret;
6910        }
6911    }
6912
6913    return 0;
6914}
6915
6916int bdrv_inactivate_all(void)
6917{
6918    BlockDriverState *bs = NULL;
6919    BdrvNextIterator it;
6920    int ret = 0;
6921    GSList *aio_ctxs = NULL, *ctx;
6922
6923    GLOBAL_STATE_CODE();
6924
6925    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
6926        AioContext *aio_context = bdrv_get_aio_context(bs);
6927
6928        if (!g_slist_find(aio_ctxs, aio_context)) {
6929            aio_ctxs = g_slist_prepend(aio_ctxs, aio_context);
6930            aio_context_acquire(aio_context);
6931        }
6932    }
6933
6934    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
6935        /* Nodes with BDS parents are covered by recursion from the last
6936         * parent that gets inactivated. Don't inactivate them a second
6937         * time if that has already happened. */
6938        if (bdrv_has_bds_parent(bs, false)) {
6939            continue;
6940        }
6941        ret = bdrv_inactivate_recurse(bs);
6942        if (ret < 0) {
6943            bdrv_next_cleanup(&it);
6944            goto out;
6945        }
6946    }
6947
6948out:
6949    for (ctx = aio_ctxs; ctx != NULL; ctx = ctx->next) {
6950        AioContext *aio_context = ctx->data;
6951        aio_context_release(aio_context);
6952    }
6953    g_slist_free(aio_ctxs);
6954
6955    return ret;
6956}
6957
6958/**************************************************************/
6959/* removable device support */
6960
6961/**
6962 * Return TRUE if the media is present
6963 */
6964bool coroutine_fn bdrv_co_is_inserted(BlockDriverState *bs)
6965{
6966    BlockDriver *drv = bs->drv;
6967    BdrvChild *child;
6968    IO_CODE();
6969    assert_bdrv_graph_readable();
6970
6971    if (!drv) {
6972        return false;
6973    }
6974    if (drv->bdrv_co_is_inserted) {
6975        return drv->bdrv_co_is_inserted(bs);
6976    }
6977    QLIST_FOREACH(child, &bs->children, next) {
6978        if (!bdrv_co_is_inserted(child->bs)) {
6979            return false;
6980        }
6981    }
6982    return true;
6983}
6984
6985/**
6986 * If eject_flag is TRUE, eject the media. Otherwise, close the tray
6987 */
6988void coroutine_fn bdrv_co_eject(BlockDriverState *bs, bool eject_flag)
6989{
6990    BlockDriver *drv = bs->drv;
6991    IO_CODE();
6992    assert_bdrv_graph_readable();
6993
6994    if (drv && drv->bdrv_co_eject) {
6995        drv->bdrv_co_eject(bs, eject_flag);
6996    }
6997}
6998
6999/**
7000 * Lock or unlock the media (if it is locked, the user won't be able
7001 * to eject it manually).
7002 */
7003void coroutine_fn bdrv_co_lock_medium(BlockDriverState *bs, bool locked)
7004{
7005    BlockDriver *drv = bs->drv;
7006    IO_CODE();
7007    assert_bdrv_graph_readable();
7008    trace_bdrv_lock_medium(bs, locked);
7009
7010    if (drv && drv->bdrv_co_lock_medium) {
7011        drv->bdrv_co_lock_medium(bs, locked);
7012    }
7013}
7014
7015/* Get a reference to bs */
7016void bdrv_ref(BlockDriverState *bs)
7017{
7018    GLOBAL_STATE_CODE();
7019    bs->refcnt++;
7020}
7021
7022/* Release a previously grabbed reference to bs.
7023 * If after releasing, reference count is zero, the BlockDriverState is
7024 * deleted. */
7025void bdrv_unref(BlockDriverState *bs)
7026{
7027    GLOBAL_STATE_CODE();
7028    if (!bs) {
7029        return;
7030    }
7031    assert(bs->refcnt > 0);
7032    if (--bs->refcnt == 0) {
7033        bdrv_delete(bs);
7034    }
7035}
7036
7037struct BdrvOpBlocker {
7038    Error *reason;
7039    QLIST_ENTRY(BdrvOpBlocker) list;
7040};
7041
7042bool bdrv_op_is_blocked(BlockDriverState *bs, BlockOpType op, Error **errp)
7043{
7044    BdrvOpBlocker *blocker;
7045    GLOBAL_STATE_CODE();
7046    assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
7047    if (!QLIST_EMPTY(&bs->op_blockers[op])) {
7048        blocker = QLIST_FIRST(&bs->op_blockers[op]);
7049        error_propagate_prepend(errp, error_copy(blocker->reason),
7050                                "Node '%s' is busy: ",
7051                                bdrv_get_device_or_node_name(bs));
7052        return true;
7053    }
7054    return false;
7055}
7056
7057void bdrv_op_block(BlockDriverState *bs, BlockOpType op, Error *reason)
7058{
7059    BdrvOpBlocker *blocker;
7060    GLOBAL_STATE_CODE();
7061    assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
7062
7063    blocker = g_new0(BdrvOpBlocker, 1);
7064    blocker->reason = reason;
7065    QLIST_INSERT_HEAD(&bs->op_blockers[op], blocker, list);
7066}
7067
7068void bdrv_op_unblock(BlockDriverState *bs, BlockOpType op, Error *reason)
7069{
7070    BdrvOpBlocker *blocker, *next;
7071    GLOBAL_STATE_CODE();
7072    assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
7073    QLIST_FOREACH_SAFE(blocker, &bs->op_blockers[op], list, next) {
7074        if (blocker->reason == reason) {
7075            QLIST_REMOVE(blocker, list);
7076            g_free(blocker);
7077        }
7078    }
7079}
7080
7081void bdrv_op_block_all(BlockDriverState *bs, Error *reason)
7082{
7083    int i;
7084    GLOBAL_STATE_CODE();
7085    for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
7086        bdrv_op_block(bs, i, reason);
7087    }
7088}
7089
7090void bdrv_op_unblock_all(BlockDriverState *bs, Error *reason)
7091{
7092    int i;
7093    GLOBAL_STATE_CODE();
7094    for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
7095        bdrv_op_unblock(bs, i, reason);
7096    }
7097}
7098
7099bool bdrv_op_blocker_is_empty(BlockDriverState *bs)
7100{
7101    int i;
7102    GLOBAL_STATE_CODE();
7103    for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
7104        if (!QLIST_EMPTY(&bs->op_blockers[i])) {
7105            return false;
7106        }
7107    }
7108    return true;
7109}
7110
7111/*
7112 * Must not be called while holding the lock of an AioContext other than the
7113 * current one.
7114 */
7115void bdrv_img_create(const char *filename, const char *fmt,
7116                     const char *base_filename, const char *base_fmt,
7117                     char *options, uint64_t img_size, int flags, bool quiet,
7118                     Error **errp)
7119{
7120    QemuOptsList *create_opts = NULL;
7121    QemuOpts *opts = NULL;
7122    const char *backing_fmt, *backing_file;
7123    int64_t size;
7124    BlockDriver *drv, *proto_drv;
7125    Error *local_err = NULL;
7126    int ret = 0;
7127
7128    GLOBAL_STATE_CODE();
7129
7130    /* Find driver and parse its options */
7131    drv = bdrv_find_format(fmt);
7132    if (!drv) {
7133        error_setg(errp, "Unknown file format '%s'", fmt);
7134        return;
7135    }
7136
7137    proto_drv = bdrv_find_protocol(filename, true, errp);
7138    if (!proto_drv) {
7139        return;
7140    }
7141
7142    if (!drv->create_opts) {
7143        error_setg(errp, "Format driver '%s' does not support image creation",
7144                   drv->format_name);
7145        return;
7146    }
7147
7148    if (!proto_drv->create_opts) {
7149        error_setg(errp, "Protocol driver '%s' does not support image creation",
7150                   proto_drv->format_name);
7151        return;
7152    }
7153
7154    aio_context_acquire(qemu_get_aio_context());
7155
7156    /* Create parameter list */
7157    create_opts = qemu_opts_append(create_opts, drv->create_opts);
7158    create_opts = qemu_opts_append(create_opts, proto_drv->create_opts);
7159
7160    opts = qemu_opts_create(create_opts, NULL, 0, &error_abort);
7161
7162    /* Parse -o options */
7163    if (options) {
7164        if (!qemu_opts_do_parse(opts, options, NULL, errp)) {
7165            goto out;
7166        }
7167    }
7168
7169    if (!qemu_opt_get(opts, BLOCK_OPT_SIZE)) {
7170        qemu_opt_set_number(opts, BLOCK_OPT_SIZE, img_size, &error_abort);
7171    } else if (img_size != UINT64_C(-1)) {
7172        error_setg(errp, "The image size must be specified only once");
7173        goto out;
7174    }
7175
7176    if (base_filename) {
7177        if (!qemu_opt_set(opts, BLOCK_OPT_BACKING_FILE, base_filename,
7178                          NULL)) {
7179            error_setg(errp, "Backing file not supported for file format '%s'",
7180                       fmt);
7181            goto out;
7182        }
7183    }
7184
7185    if (base_fmt) {
7186        if (!qemu_opt_set(opts, BLOCK_OPT_BACKING_FMT, base_fmt, NULL)) {
7187            error_setg(errp, "Backing file format not supported for file "
7188                             "format '%s'", fmt);
7189            goto out;
7190        }
7191    }
7192
7193    backing_file = qemu_opt_get(opts, BLOCK_OPT_BACKING_FILE);
7194    if (backing_file) {
7195        if (!strcmp(filename, backing_file)) {
7196            error_setg(errp, "Error: Trying to create an image with the "
7197                             "same filename as the backing file");
7198            goto out;
7199        }
7200        if (backing_file[0] == '\0') {
7201            error_setg(errp, "Expected backing file name, got empty string");
7202            goto out;
7203        }
7204    }
7205
7206    backing_fmt = qemu_opt_get(opts, BLOCK_OPT_BACKING_FMT);
7207
7208    /* The size for the image must always be specified, unless we have a backing
7209     * file and we have not been forbidden from opening it. */
7210    size = qemu_opt_get_size(opts, BLOCK_OPT_SIZE, img_size);
7211    if (backing_file && !(flags & BDRV_O_NO_BACKING)) {
7212        BlockDriverState *bs;
7213        char *full_backing;
7214        int back_flags;
7215        QDict *backing_options = NULL;
7216
7217        full_backing =
7218            bdrv_get_full_backing_filename_from_filename(filename, backing_file,
7219                                                         &local_err);
7220        if (local_err) {
7221            goto out;
7222        }
7223        assert(full_backing);
7224
7225        /*
7226         * No need to do I/O here, which allows us to open encrypted
7227         * backing images without needing the secret
7228         */
7229        back_flags = flags;
7230        back_flags &= ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
7231        back_flags |= BDRV_O_NO_IO;
7232
7233        backing_options = qdict_new();
7234        if (backing_fmt) {
7235            qdict_put_str(backing_options, "driver", backing_fmt);
7236        }
7237        qdict_put_bool(backing_options, BDRV_OPT_FORCE_SHARE, true);
7238
7239        bs = bdrv_open(full_backing, NULL, backing_options, back_flags,
7240                       &local_err);
7241        g_free(full_backing);
7242        if (!bs) {
7243            error_append_hint(&local_err, "Could not open backing image.\n");
7244            goto out;
7245        } else {
7246            if (!backing_fmt) {
7247                error_setg(&local_err,
7248                           "Backing file specified without backing format");
7249                error_append_hint(&local_err, "Detected format of %s.\n",
7250                                  bs->drv->format_name);
7251                goto out;
7252            }
7253            if (size == -1) {
7254                /* Opened BS, have no size */
7255                size = bdrv_getlength(bs);
7256                if (size < 0) {
7257                    error_setg_errno(errp, -size, "Could not get size of '%s'",
7258                                     backing_file);
7259                    bdrv_unref(bs);
7260                    goto out;
7261                }
7262                qemu_opt_set_number(opts, BLOCK_OPT_SIZE, size, &error_abort);
7263            }
7264            bdrv_unref(bs);
7265        }
7266        /* (backing_file && !(flags & BDRV_O_NO_BACKING)) */
7267    } else if (backing_file && !backing_fmt) {
7268        error_setg(&local_err,
7269                   "Backing file specified without backing format");
7270        goto out;
7271    }
7272
7273    if (size == -1) {
7274        error_setg(errp, "Image creation needs a size parameter");
7275        goto out;
7276    }
7277
7278    if (!quiet) {
7279        printf("Formatting '%s', fmt=%s ", filename, fmt);
7280        qemu_opts_print(opts, " ");
7281        puts("");
7282        fflush(stdout);
7283    }
7284
7285    ret = bdrv_create(drv, filename, opts, &local_err);
7286
7287    if (ret == -EFBIG) {
7288        /* This is generally a better message than whatever the driver would
7289         * deliver (especially because of the cluster_size_hint), since that
7290         * is most probably not much different from "image too large". */
7291        const char *cluster_size_hint = "";
7292        if (qemu_opt_get_size(opts, BLOCK_OPT_CLUSTER_SIZE, 0)) {
7293            cluster_size_hint = " (try using a larger cluster size)";
7294        }
7295        error_setg(errp, "The image size is too large for file format '%s'"
7296                   "%s", fmt, cluster_size_hint);
7297        error_free(local_err);
7298        local_err = NULL;
7299    }
7300
7301out:
7302    qemu_opts_del(opts);
7303    qemu_opts_free(create_opts);
7304    error_propagate(errp, local_err);
7305    aio_context_release(qemu_get_aio_context());
7306}
7307
7308AioContext *bdrv_get_aio_context(BlockDriverState *bs)
7309{
7310    IO_CODE();
7311    return bs ? bs->aio_context : qemu_get_aio_context();
7312}
7313
7314AioContext *coroutine_fn bdrv_co_enter(BlockDriverState *bs)
7315{
7316    Coroutine *self = qemu_coroutine_self();
7317    AioContext *old_ctx = qemu_coroutine_get_aio_context(self);
7318    AioContext *new_ctx;
7319    IO_CODE();
7320
7321    /*
7322     * Increase bs->in_flight to ensure that this operation is completed before
7323     * moving the node to a different AioContext. Read new_ctx only afterwards.
7324     */
7325    bdrv_inc_in_flight(bs);
7326
7327    new_ctx = bdrv_get_aio_context(bs);
7328    aio_co_reschedule_self(new_ctx);
7329    return old_ctx;
7330}
7331
7332void coroutine_fn bdrv_co_leave(BlockDriverState *bs, AioContext *old_ctx)
7333{
7334    IO_CODE();
7335    aio_co_reschedule_self(old_ctx);
7336    bdrv_dec_in_flight(bs);
7337}
7338
7339void coroutine_fn bdrv_co_lock(BlockDriverState *bs)
7340{
7341    AioContext *ctx = bdrv_get_aio_context(bs);
7342
7343    /* In the main thread, bs->aio_context won't change concurrently */
7344    assert(qemu_get_current_aio_context() == qemu_get_aio_context());
7345
7346    /*
7347     * We're in coroutine context, so we already hold the lock of the main
7348     * loop AioContext. Don't lock it twice to avoid deadlocks.
7349     */
7350    assert(qemu_in_coroutine());
7351    if (ctx != qemu_get_aio_context()) {
7352        aio_context_acquire(ctx);
7353    }
7354}
7355
7356void coroutine_fn bdrv_co_unlock(BlockDriverState *bs)
7357{
7358    AioContext *ctx = bdrv_get_aio_context(bs);
7359
7360    assert(qemu_in_coroutine());
7361    if (ctx != qemu_get_aio_context()) {
7362        aio_context_release(ctx);
7363    }
7364}
7365
7366static void bdrv_do_remove_aio_context_notifier(BdrvAioNotifier *ban)
7367{
7368    GLOBAL_STATE_CODE();
7369    QLIST_REMOVE(ban, list);
7370    g_free(ban);
7371}
7372
7373static void bdrv_detach_aio_context(BlockDriverState *bs)
7374{
7375    BdrvAioNotifier *baf, *baf_tmp;
7376
7377    assert(!bs->walking_aio_notifiers);
7378    GLOBAL_STATE_CODE();
7379    bs->walking_aio_notifiers = true;
7380    QLIST_FOREACH_SAFE(baf, &bs->aio_notifiers, list, baf_tmp) {
7381        if (baf->deleted) {
7382            bdrv_do_remove_aio_context_notifier(baf);
7383        } else {
7384            baf->detach_aio_context(baf->opaque);
7385        }
7386    }
7387    /* Never mind iterating again to check for ->deleted.  bdrv_close() will
7388     * remove remaining aio notifiers if we aren't called again.
7389     */
7390    bs->walking_aio_notifiers = false;
7391
7392    if (bs->drv && bs->drv->bdrv_detach_aio_context) {
7393        bs->drv->bdrv_detach_aio_context(bs);
7394    }
7395
7396    bs->aio_context = NULL;
7397}
7398
7399static void bdrv_attach_aio_context(BlockDriverState *bs,
7400                                    AioContext *new_context)
7401{
7402    BdrvAioNotifier *ban, *ban_tmp;
7403    GLOBAL_STATE_CODE();
7404
7405    bs->aio_context = new_context;
7406
7407    if (bs->drv && bs->drv->bdrv_attach_aio_context) {
7408        bs->drv->bdrv_attach_aio_context(bs, new_context);
7409    }
7410
7411    assert(!bs->walking_aio_notifiers);
7412    bs->walking_aio_notifiers = true;
7413    QLIST_FOREACH_SAFE(ban, &bs->aio_notifiers, list, ban_tmp) {
7414        if (ban->deleted) {
7415            bdrv_do_remove_aio_context_notifier(ban);
7416        } else {
7417            ban->attached_aio_context(new_context, ban->opaque);
7418        }
7419    }
7420    bs->walking_aio_notifiers = false;
7421}
7422
7423typedef struct BdrvStateSetAioContext {
7424    AioContext *new_ctx;
7425    BlockDriverState *bs;
7426} BdrvStateSetAioContext;
7427
7428static bool bdrv_parent_change_aio_context(BdrvChild *c, AioContext *ctx,
7429                                           GHashTable *visited,
7430                                           Transaction *tran,
7431                                           Error **errp)
7432{
7433    GLOBAL_STATE_CODE();
7434    if (g_hash_table_contains(visited, c)) {
7435        return true;
7436    }
7437    g_hash_table_add(visited, c);
7438
7439    /*
7440     * A BdrvChildClass that doesn't handle AioContext changes cannot
7441     * tolerate any AioContext changes
7442     */
7443    if (!c->klass->change_aio_ctx) {
7444        char *user = bdrv_child_user_desc(c);
7445        error_setg(errp, "Changing iothreads is not supported by %s", user);
7446        g_free(user);
7447        return false;
7448    }
7449    if (!c->klass->change_aio_ctx(c, ctx, visited, tran, errp)) {
7450        assert(!errp || *errp);
7451        return false;
7452    }
7453    return true;
7454}
7455
7456bool bdrv_child_change_aio_context(BdrvChild *c, AioContext *ctx,
7457                                   GHashTable *visited, Transaction *tran,
7458                                   Error **errp)
7459{
7460    GLOBAL_STATE_CODE();
7461    if (g_hash_table_contains(visited, c)) {
7462        return true;
7463    }
7464    g_hash_table_add(visited, c);
7465    return bdrv_change_aio_context(c->bs, ctx, visited, tran, errp);
7466}
7467
7468static void bdrv_set_aio_context_clean(void *opaque)
7469{
7470    BdrvStateSetAioContext *state = (BdrvStateSetAioContext *) opaque;
7471    BlockDriverState *bs = (BlockDriverState *) state->bs;
7472
7473    /* Paired with bdrv_drained_begin in bdrv_change_aio_context() */
7474    bdrv_drained_end(bs);
7475
7476    g_free(state);
7477}
7478
7479static void bdrv_set_aio_context_commit(void *opaque)
7480{
7481    BdrvStateSetAioContext *state = (BdrvStateSetAioContext *) opaque;
7482    BlockDriverState *bs = (BlockDriverState *) state->bs;
7483    AioContext *new_context = state->new_ctx;
7484    AioContext *old_context = bdrv_get_aio_context(bs);
7485
7486    /*
7487     * Take the old AioContex when detaching it from bs.
7488     * At this point, new_context lock is already acquired, and we are now
7489     * also taking old_context. This is safe as long as bdrv_detach_aio_context
7490     * does not call AIO_POLL_WHILE().
7491     */
7492    if (old_context != qemu_get_aio_context()) {
7493        aio_context_acquire(old_context);
7494    }
7495    bdrv_detach_aio_context(bs);
7496    if (old_context != qemu_get_aio_context()) {
7497        aio_context_release(old_context);
7498    }
7499    bdrv_attach_aio_context(bs, new_context);
7500}
7501
7502static TransactionActionDrv set_aio_context = {
7503    .commit = bdrv_set_aio_context_commit,
7504    .clean = bdrv_set_aio_context_clean,
7505};
7506
7507/*
7508 * Changes the AioContext used for fd handlers, timers, and BHs by this
7509 * BlockDriverState and all its children and parents.
7510 *
7511 * Must be called from the main AioContext.
7512 *
7513 * The caller must own the AioContext lock for the old AioContext of bs, but it
7514 * must not own the AioContext lock for new_context (unless new_context is the
7515 * same as the current context of bs).
7516 *
7517 * @visited will accumulate all visited BdrvChild objects. The caller is
7518 * responsible for freeing the list afterwards.
7519 */
7520static bool bdrv_change_aio_context(BlockDriverState *bs, AioContext *ctx,
7521                                    GHashTable *visited, Transaction *tran,
7522                                    Error **errp)
7523{
7524    BdrvChild *c;
7525    BdrvStateSetAioContext *state;
7526
7527    GLOBAL_STATE_CODE();
7528
7529    if (bdrv_get_aio_context(bs) == ctx) {
7530        return true;
7531    }
7532
7533    QLIST_FOREACH(c, &bs->parents, next_parent) {
7534        if (!bdrv_parent_change_aio_context(c, ctx, visited, tran, errp)) {
7535            return false;
7536        }
7537    }
7538
7539    QLIST_FOREACH(c, &bs->children, next) {
7540        if (!bdrv_child_change_aio_context(c, ctx, visited, tran, errp)) {
7541            return false;
7542        }
7543    }
7544
7545    state = g_new(BdrvStateSetAioContext, 1);
7546    *state = (BdrvStateSetAioContext) {
7547        .new_ctx = ctx,
7548        .bs = bs,
7549    };
7550
7551    /* Paired with bdrv_drained_end in bdrv_set_aio_context_clean() */
7552    bdrv_drained_begin(bs);
7553
7554    tran_add(tran, &set_aio_context, state);
7555
7556    return true;
7557}
7558
7559/*
7560 * Change bs's and recursively all of its parents' and children's AioContext
7561 * to the given new context, returning an error if that isn't possible.
7562 *
7563 * If ignore_child is not NULL, that child (and its subgraph) will not
7564 * be touched.
7565 *
7566 * This function still requires the caller to take the bs current
7567 * AioContext lock, otherwise draining will fail since AIO_WAIT_WHILE
7568 * assumes the lock is always held if bs is in another AioContext.
7569 * For the same reason, it temporarily also holds the new AioContext, since
7570 * bdrv_drained_end calls BDRV_POLL_WHILE that assumes the lock is taken too.
7571 * Therefore the new AioContext lock must not be taken by the caller.
7572 */
7573int bdrv_try_change_aio_context(BlockDriverState *bs, AioContext *ctx,
7574                                BdrvChild *ignore_child, Error **errp)
7575{
7576    Transaction *tran;
7577    GHashTable *visited;
7578    int ret;
7579    AioContext *old_context = bdrv_get_aio_context(bs);
7580    GLOBAL_STATE_CODE();
7581
7582    /*
7583     * Recursion phase: go through all nodes of the graph.
7584     * Take care of checking that all nodes support changing AioContext
7585     * and drain them, builing a linear list of callbacks to run if everything
7586     * is successful (the transaction itself).
7587     */
7588    tran = tran_new();
7589    visited = g_hash_table_new(NULL, NULL);
7590    if (ignore_child) {
7591        g_hash_table_add(visited, ignore_child);
7592    }
7593    ret = bdrv_change_aio_context(bs, ctx, visited, tran, errp);
7594    g_hash_table_destroy(visited);
7595
7596    /*
7597     * Linear phase: go through all callbacks collected in the transaction.
7598     * Run all callbacks collected in the recursion to switch all nodes
7599     * AioContext lock (transaction commit), or undo all changes done in the
7600     * recursion (transaction abort).
7601     */
7602
7603    if (!ret) {
7604        /* Just run clean() callbacks. No AioContext changed. */
7605        tran_abort(tran);
7606        return -EPERM;
7607    }
7608
7609    /*
7610     * Release old AioContext, it won't be needed anymore, as all
7611     * bdrv_drained_begin() have been called already.
7612     */
7613    if (qemu_get_aio_context() != old_context) {
7614        aio_context_release(old_context);
7615    }
7616
7617    /*
7618     * Acquire new AioContext since bdrv_drained_end() is going to be called
7619     * after we switched all nodes in the new AioContext, and the function
7620     * assumes that the lock of the bs is always taken.
7621     */
7622    if (qemu_get_aio_context() != ctx) {
7623        aio_context_acquire(ctx);
7624    }
7625
7626    tran_commit(tran);
7627
7628    if (qemu_get_aio_context() != ctx) {
7629        aio_context_release(ctx);
7630    }
7631
7632    /* Re-acquire the old AioContext, since the caller takes and releases it. */
7633    if (qemu_get_aio_context() != old_context) {
7634        aio_context_acquire(old_context);
7635    }
7636
7637    return 0;
7638}
7639
7640void bdrv_add_aio_context_notifier(BlockDriverState *bs,
7641        void (*attached_aio_context)(AioContext *new_context, void *opaque),
7642        void (*detach_aio_context)(void *opaque), void *opaque)
7643{
7644    BdrvAioNotifier *ban = g_new(BdrvAioNotifier, 1);
7645    *ban = (BdrvAioNotifier){
7646        .attached_aio_context = attached_aio_context,
7647        .detach_aio_context   = detach_aio_context,
7648        .opaque               = opaque
7649    };
7650    GLOBAL_STATE_CODE();
7651
7652    QLIST_INSERT_HEAD(&bs->aio_notifiers, ban, list);
7653}
7654
7655void bdrv_remove_aio_context_notifier(BlockDriverState *bs,
7656                                      void (*attached_aio_context)(AioContext *,
7657                                                                   void *),
7658                                      void (*detach_aio_context)(void *),
7659                                      void *opaque)
7660{
7661    BdrvAioNotifier *ban, *ban_next;
7662    GLOBAL_STATE_CODE();
7663
7664    QLIST_FOREACH_SAFE(ban, &bs->aio_notifiers, list, ban_next) {
7665        if (ban->attached_aio_context == attached_aio_context &&
7666            ban->detach_aio_context   == detach_aio_context   &&
7667            ban->opaque               == opaque               &&
7668            ban->deleted              == false)
7669        {
7670            if (bs->walking_aio_notifiers) {
7671                ban->deleted = true;
7672            } else {
7673                bdrv_do_remove_aio_context_notifier(ban);
7674            }
7675            return;
7676        }
7677    }
7678
7679    abort();
7680}
7681
7682int bdrv_amend_options(BlockDriverState *bs, QemuOpts *opts,
7683                       BlockDriverAmendStatusCB *status_cb, void *cb_opaque,
7684                       bool force,
7685                       Error **errp)
7686{
7687    GLOBAL_STATE_CODE();
7688    if (!bs->drv) {
7689        error_setg(errp, "Node is ejected");
7690        return -ENOMEDIUM;
7691    }
7692    if (!bs->drv->bdrv_amend_options) {
7693        error_setg(errp, "Block driver '%s' does not support option amendment",
7694                   bs->drv->format_name);
7695        return -ENOTSUP;
7696    }
7697    return bs->drv->bdrv_amend_options(bs, opts, status_cb,
7698                                       cb_opaque, force, errp);
7699}
7700
7701/*
7702 * This function checks whether the given @to_replace is allowed to be
7703 * replaced by a node that always shows the same data as @bs.  This is
7704 * used for example to verify whether the mirror job can replace
7705 * @to_replace by the target mirrored from @bs.
7706 * To be replaceable, @bs and @to_replace may either be guaranteed to
7707 * always show the same data (because they are only connected through
7708 * filters), or some driver may allow replacing one of its children
7709 * because it can guarantee that this child's data is not visible at
7710 * all (for example, for dissenting quorum children that have no other
7711 * parents).
7712 */
7713bool bdrv_recurse_can_replace(BlockDriverState *bs,
7714                              BlockDriverState *to_replace)
7715{
7716    BlockDriverState *filtered;
7717
7718    GLOBAL_STATE_CODE();
7719
7720    if (!bs || !bs->drv) {
7721        return false;
7722    }
7723
7724    if (bs == to_replace) {
7725        return true;
7726    }
7727
7728    /* See what the driver can do */
7729    if (bs->drv->bdrv_recurse_can_replace) {
7730        return bs->drv->bdrv_recurse_can_replace(bs, to_replace);
7731    }
7732
7733    /* For filters without an own implementation, we can recurse on our own */
7734    filtered = bdrv_filter_bs(bs);
7735    if (filtered) {
7736        return bdrv_recurse_can_replace(filtered, to_replace);
7737    }
7738
7739    /* Safe default */
7740    return false;
7741}
7742
7743/*
7744 * Check whether the given @node_name can be replaced by a node that
7745 * has the same data as @parent_bs.  If so, return @node_name's BDS;
7746 * NULL otherwise.
7747 *
7748 * @node_name must be a (recursive) *child of @parent_bs (or this
7749 * function will return NULL).
7750 *
7751 * The result (whether the node can be replaced or not) is only valid
7752 * for as long as no graph or permission changes occur.
7753 */
7754BlockDriverState *check_to_replace_node(BlockDriverState *parent_bs,
7755                                        const char *node_name, Error **errp)
7756{
7757    BlockDriverState *to_replace_bs = bdrv_find_node(node_name);
7758    AioContext *aio_context;
7759
7760    GLOBAL_STATE_CODE();
7761
7762    if (!to_replace_bs) {
7763        error_setg(errp, "Failed to find node with node-name='%s'", node_name);
7764        return NULL;
7765    }
7766
7767    aio_context = bdrv_get_aio_context(to_replace_bs);
7768    aio_context_acquire(aio_context);
7769
7770    if (bdrv_op_is_blocked(to_replace_bs, BLOCK_OP_TYPE_REPLACE, errp)) {
7771        to_replace_bs = NULL;
7772        goto out;
7773    }
7774
7775    /* We don't want arbitrary node of the BDS chain to be replaced only the top
7776     * most non filter in order to prevent data corruption.
7777     * Another benefit is that this tests exclude backing files which are
7778     * blocked by the backing blockers.
7779     */
7780    if (!bdrv_recurse_can_replace(parent_bs, to_replace_bs)) {
7781        error_setg(errp, "Cannot replace '%s' by a node mirrored from '%s', "
7782                   "because it cannot be guaranteed that doing so would not "
7783                   "lead to an abrupt change of visible data",
7784                   node_name, parent_bs->node_name);
7785        to_replace_bs = NULL;
7786        goto out;
7787    }
7788
7789out:
7790    aio_context_release(aio_context);
7791    return to_replace_bs;
7792}
7793
7794/**
7795 * Iterates through the list of runtime option keys that are said to
7796 * be "strong" for a BDS.  An option is called "strong" if it changes
7797 * a BDS's data.  For example, the null block driver's "size" and
7798 * "read-zeroes" options are strong, but its "latency-ns" option is
7799 * not.
7800 *
7801 * If a key returned by this function ends with a dot, all options
7802 * starting with that prefix are strong.
7803 */
7804static const char *const *strong_options(BlockDriverState *bs,
7805                                         const char *const *curopt)
7806{
7807    static const char *const global_options[] = {
7808        "driver", "filename", NULL
7809    };
7810
7811    if (!curopt) {
7812        return &global_options[0];
7813    }
7814
7815    curopt++;
7816    if (curopt == &global_options[ARRAY_SIZE(global_options) - 1] && bs->drv) {
7817        curopt = bs->drv->strong_runtime_opts;
7818    }
7819
7820    return (curopt && *curopt) ? curopt : NULL;
7821}
7822
7823/**
7824 * Copies all strong runtime options from bs->options to the given
7825 * QDict.  The set of strong option keys is determined by invoking
7826 * strong_options().
7827 *
7828 * Returns true iff any strong option was present in bs->options (and
7829 * thus copied to the target QDict) with the exception of "filename"
7830 * and "driver".  The caller is expected to use this value to decide
7831 * whether the existence of strong options prevents the generation of
7832 * a plain filename.
7833 */
7834static bool append_strong_runtime_options(QDict *d, BlockDriverState *bs)
7835{
7836    bool found_any = false;
7837    const char *const *option_name = NULL;
7838
7839    if (!bs->drv) {
7840        return false;
7841    }
7842
7843    while ((option_name = strong_options(bs, option_name))) {
7844        bool option_given = false;
7845
7846        assert(strlen(*option_name) > 0);
7847        if ((*option_name)[strlen(*option_name) - 1] != '.') {
7848            QObject *entry = qdict_get(bs->options, *option_name);
7849            if (!entry) {
7850                continue;
7851            }
7852
7853            qdict_put_obj(d, *option_name, qobject_ref(entry));
7854            option_given = true;
7855        } else {
7856            const QDictEntry *entry;
7857            for (entry = qdict_first(bs->options); entry;
7858                 entry = qdict_next(bs->options, entry))
7859            {
7860                if (strstart(qdict_entry_key(entry), *option_name, NULL)) {
7861                    qdict_put_obj(d, qdict_entry_key(entry),
7862                                  qobject_ref(qdict_entry_value(entry)));
7863                    option_given = true;
7864                }
7865            }
7866        }
7867
7868        /* While "driver" and "filename" need to be included in a JSON filename,
7869         * their existence does not prohibit generation of a plain filename. */
7870        if (!found_any && option_given &&
7871            strcmp(*option_name, "driver") && strcmp(*option_name, "filename"))
7872        {
7873            found_any = true;
7874        }
7875    }
7876
7877    if (!qdict_haskey(d, "driver")) {
7878        /* Drivers created with bdrv_new_open_driver() may not have a
7879         * @driver option.  Add it here. */
7880        qdict_put_str(d, "driver", bs->drv->format_name);
7881    }
7882
7883    return found_any;
7884}
7885
7886/* Note: This function may return false positives; it may return true
7887 * even if opening the backing file specified by bs's image header
7888 * would result in exactly bs->backing. */
7889static bool bdrv_backing_overridden(BlockDriverState *bs)
7890{
7891    GLOBAL_STATE_CODE();
7892    if (bs->backing) {
7893        return strcmp(bs->auto_backing_file,
7894                      bs->backing->bs->filename);
7895    } else {
7896        /* No backing BDS, so if the image header reports any backing
7897         * file, it must have been suppressed */
7898        return bs->auto_backing_file[0] != '\0';
7899    }
7900}
7901
7902/* Updates the following BDS fields:
7903 *  - exact_filename: A filename which may be used for opening a block device
7904 *                    which (mostly) equals the given BDS (even without any
7905 *                    other options; so reading and writing must return the same
7906 *                    results, but caching etc. may be different)
7907 *  - full_open_options: Options which, when given when opening a block device
7908 *                       (without a filename), result in a BDS (mostly)
7909 *                       equalling the given one
7910 *  - filename: If exact_filename is set, it is copied here. Otherwise,
7911 *              full_open_options is converted to a JSON object, prefixed with
7912 *              "json:" (for use through the JSON pseudo protocol) and put here.
7913 */
7914void bdrv_refresh_filename(BlockDriverState *bs)
7915{
7916    BlockDriver *drv = bs->drv;
7917    BdrvChild *child;
7918    BlockDriverState *primary_child_bs;
7919    QDict *opts;
7920    bool backing_overridden;
7921    bool generate_json_filename; /* Whether our default implementation should
7922                                    fill exact_filename (false) or not (true) */
7923
7924    GLOBAL_STATE_CODE();
7925
7926    if (!drv) {
7927        return;
7928    }
7929
7930    /* This BDS's file name may depend on any of its children's file names, so
7931     * refresh those first */
7932    QLIST_FOREACH(child, &bs->children, next) {
7933        bdrv_refresh_filename(child->bs);
7934    }
7935
7936    if (bs->implicit) {
7937        /* For implicit nodes, just copy everything from the single child */
7938        child = QLIST_FIRST(&bs->children);
7939        assert(QLIST_NEXT(child, next) == NULL);
7940
7941        pstrcpy(bs->exact_filename, sizeof(bs->exact_filename),
7942                child->bs->exact_filename);
7943        pstrcpy(bs->filename, sizeof(bs->filename), child->bs->filename);
7944
7945        qobject_unref(bs->full_open_options);
7946        bs->full_open_options = qobject_ref(child->bs->full_open_options);
7947
7948        return;
7949    }
7950
7951    backing_overridden = bdrv_backing_overridden(bs);
7952
7953    if (bs->open_flags & BDRV_O_NO_IO) {
7954        /* Without I/O, the backing file does not change anything.
7955         * Therefore, in such a case (primarily qemu-img), we can
7956         * pretend the backing file has not been overridden even if
7957         * it technically has been. */
7958        backing_overridden = false;
7959    }
7960
7961    /* Gather the options QDict */
7962    opts = qdict_new();
7963    generate_json_filename = append_strong_runtime_options(opts, bs);
7964    generate_json_filename |= backing_overridden;
7965
7966    if (drv->bdrv_gather_child_options) {
7967        /* Some block drivers may not want to present all of their children's
7968         * options, or name them differently from BdrvChild.name */
7969        drv->bdrv_gather_child_options(bs, opts, backing_overridden);
7970    } else {
7971        QLIST_FOREACH(child, &bs->children, next) {
7972            if (child == bs->backing && !backing_overridden) {
7973                /* We can skip the backing BDS if it has not been overridden */
7974                continue;
7975            }
7976
7977            qdict_put(opts, child->name,
7978                      qobject_ref(child->bs->full_open_options));
7979        }
7980
7981        if (backing_overridden && !bs->backing) {
7982            /* Force no backing file */
7983            qdict_put_null(opts, "backing");
7984        }
7985    }
7986
7987    qobject_unref(bs->full_open_options);
7988    bs->full_open_options = opts;
7989
7990    primary_child_bs = bdrv_primary_bs(bs);
7991
7992    if (drv->bdrv_refresh_filename) {
7993        /* Obsolete information is of no use here, so drop the old file name
7994         * information before refreshing it */
7995        bs->exact_filename[0] = '\0';
7996
7997        drv->bdrv_refresh_filename(bs);
7998    } else if (primary_child_bs) {
7999        /*
8000         * Try to reconstruct valid information from the underlying
8001         * file -- this only works for format nodes (filter nodes
8002         * cannot be probed and as such must be selected by the user
8003         * either through an options dict, or through a special
8004         * filename which the filter driver must construct in its
8005         * .bdrv_refresh_filename() implementation).
8006         */
8007
8008        bs->exact_filename[0] = '\0';
8009
8010        /*
8011         * We can use the underlying file's filename if:
8012         * - it has a filename,
8013         * - the current BDS is not a filter,
8014         * - the file is a protocol BDS, and
8015         * - opening that file (as this BDS's format) will automatically create
8016         *   the BDS tree we have right now, that is:
8017         *   - the user did not significantly change this BDS's behavior with
8018         *     some explicit (strong) options
8019         *   - no non-file child of this BDS has been overridden by the user
8020         *   Both of these conditions are represented by generate_json_filename.
8021         */
8022        if (primary_child_bs->exact_filename[0] &&
8023            primary_child_bs->drv->bdrv_file_open &&
8024            !drv->is_filter && !generate_json_filename)
8025        {
8026            strcpy(bs->exact_filename, primary_child_bs->exact_filename);
8027        }
8028    }
8029
8030    if (bs->exact_filename[0]) {
8031        pstrcpy(bs->filename, sizeof(bs->filename), bs->exact_filename);
8032    } else {
8033        GString *json = qobject_to_json(QOBJECT(bs->full_open_options));
8034        if (snprintf(bs->filename, sizeof(bs->filename), "json:%s",
8035                     json->str) >= sizeof(bs->filename)) {
8036            /* Give user a hint if we truncated things. */
8037            strcpy(bs->filename + sizeof(bs->filename) - 4, "...");
8038        }
8039        g_string_free(json, true);
8040    }
8041}
8042
8043char *bdrv_dirname(BlockDriverState *bs, Error **errp)
8044{
8045    BlockDriver *drv = bs->drv;
8046    BlockDriverState *child_bs;
8047
8048    GLOBAL_STATE_CODE();
8049
8050    if (!drv) {
8051        error_setg(errp, "Node '%s' is ejected", bs->node_name);
8052        return NULL;
8053    }
8054
8055    if (drv->bdrv_dirname) {
8056        return drv->bdrv_dirname(bs, errp);
8057    }
8058
8059    child_bs = bdrv_primary_bs(bs);
8060    if (child_bs) {
8061        return bdrv_dirname(child_bs, errp);
8062    }
8063
8064    bdrv_refresh_filename(bs);
8065    if (bs->exact_filename[0] != '\0') {
8066        return path_combine(bs->exact_filename, "");
8067    }
8068
8069    error_setg(errp, "Cannot generate a base directory for %s nodes",
8070               drv->format_name);
8071    return NULL;
8072}
8073
8074/*
8075 * Hot add/remove a BDS's child. So the user can take a child offline when
8076 * it is broken and take a new child online
8077 */
8078void bdrv_add_child(BlockDriverState *parent_bs, BlockDriverState *child_bs,
8079                    Error **errp)
8080{
8081    GLOBAL_STATE_CODE();
8082    if (!parent_bs->drv || !parent_bs->drv->bdrv_add_child) {
8083        error_setg(errp, "The node %s does not support adding a child",
8084                   bdrv_get_device_or_node_name(parent_bs));
8085        return;
8086    }
8087
8088    /*
8089     * Non-zoned block drivers do not follow zoned storage constraints
8090     * (i.e. sequential writes to zones). Refuse mixing zoned and non-zoned
8091     * drivers in a graph.
8092     */
8093    if (!parent_bs->drv->supports_zoned_children &&
8094        child_bs->bl.zoned == BLK_Z_HM) {
8095        /*
8096         * The host-aware model allows zoned storage constraints and random
8097         * write. Allow mixing host-aware and non-zoned drivers. Using
8098         * host-aware device as a regular device.
8099         */
8100        error_setg(errp, "Cannot add a %s child to a %s parent",
8101                   child_bs->bl.zoned == BLK_Z_HM ? "zoned" : "non-zoned",
8102                   parent_bs->drv->supports_zoned_children ?
8103                   "support zoned children" : "not support zoned children");
8104        return;
8105    }
8106
8107    if (!QLIST_EMPTY(&child_bs->parents)) {
8108        error_setg(errp, "The node %s already has a parent",
8109                   child_bs->node_name);
8110        return;
8111    }
8112
8113    parent_bs->drv->bdrv_add_child(parent_bs, child_bs, errp);
8114}
8115
8116void bdrv_del_child(BlockDriverState *parent_bs, BdrvChild *child, Error **errp)
8117{
8118    BdrvChild *tmp;
8119
8120    GLOBAL_STATE_CODE();
8121    if (!parent_bs->drv || !parent_bs->drv->bdrv_del_child) {
8122        error_setg(errp, "The node %s does not support removing a child",
8123                   bdrv_get_device_or_node_name(parent_bs));
8124        return;
8125    }
8126
8127    QLIST_FOREACH(tmp, &parent_bs->children, next) {
8128        if (tmp == child) {
8129            break;
8130        }
8131    }
8132
8133    if (!tmp) {
8134        error_setg(errp, "The node %s does not have a child named %s",
8135                   bdrv_get_device_or_node_name(parent_bs),
8136                   bdrv_get_device_or_node_name(child->bs));
8137        return;
8138    }
8139
8140    parent_bs->drv->bdrv_del_child(parent_bs, child, errp);
8141}
8142
8143int bdrv_make_empty(BdrvChild *c, Error **errp)
8144{
8145    BlockDriver *drv = c->bs->drv;
8146    int ret;
8147
8148    GLOBAL_STATE_CODE();
8149    assert(c->perm & (BLK_PERM_WRITE | BLK_PERM_WRITE_UNCHANGED));
8150
8151    if (!drv->bdrv_make_empty) {
8152        error_setg(errp, "%s does not support emptying nodes",
8153                   drv->format_name);
8154        return -ENOTSUP;
8155    }
8156
8157    ret = drv->bdrv_make_empty(c->bs);
8158    if (ret < 0) {
8159        error_setg_errno(errp, -ret, "Failed to empty %s",
8160                         c->bs->filename);
8161        return ret;
8162    }
8163
8164    return 0;
8165}
8166
8167/*
8168 * Return the child that @bs acts as an overlay for, and from which data may be
8169 * copied in COW or COR operations.  Usually this is the backing file.
8170 */
8171BdrvChild *bdrv_cow_child(BlockDriverState *bs)
8172{
8173    IO_CODE();
8174
8175    if (!bs || !bs->drv) {
8176        return NULL;
8177    }
8178
8179    if (bs->drv->is_filter) {
8180        return NULL;
8181    }
8182
8183    if (!bs->backing) {
8184        return NULL;
8185    }
8186
8187    assert(bs->backing->role & BDRV_CHILD_COW);
8188    return bs->backing;
8189}
8190
8191/*
8192 * If @bs acts as a filter for exactly one of its children, return
8193 * that child.
8194 */
8195BdrvChild *bdrv_filter_child(BlockDriverState *bs)
8196{
8197    BdrvChild *c;
8198    IO_CODE();
8199
8200    if (!bs || !bs->drv) {
8201        return NULL;
8202    }
8203
8204    if (!bs->drv->is_filter) {
8205        return NULL;
8206    }
8207
8208    /* Only one of @backing or @file may be used */
8209    assert(!(bs->backing && bs->file));
8210
8211    c = bs->backing ?: bs->file;
8212    if (!c) {
8213        return NULL;
8214    }
8215
8216    assert(c->role & BDRV_CHILD_FILTERED);
8217    return c;
8218}
8219
8220/*
8221 * Return either the result of bdrv_cow_child() or bdrv_filter_child(),
8222 * whichever is non-NULL.
8223 *
8224 * Return NULL if both are NULL.
8225 */
8226BdrvChild *bdrv_filter_or_cow_child(BlockDriverState *bs)
8227{
8228    BdrvChild *cow_child = bdrv_cow_child(bs);
8229    BdrvChild *filter_child = bdrv_filter_child(bs);
8230    IO_CODE();
8231
8232    /* Filter nodes cannot have COW backing files */
8233    assert(!(cow_child && filter_child));
8234
8235    return cow_child ?: filter_child;
8236}
8237
8238/*
8239 * Return the primary child of this node: For filters, that is the
8240 * filtered child.  For other nodes, that is usually the child storing
8241 * metadata.
8242 * (A generally more helpful description is that this is (usually) the
8243 * child that has the same filename as @bs.)
8244 *
8245 * Drivers do not necessarily have a primary child; for example quorum
8246 * does not.
8247 */
8248BdrvChild *bdrv_primary_child(BlockDriverState *bs)
8249{
8250    BdrvChild *c, *found = NULL;
8251    IO_CODE();
8252
8253    QLIST_FOREACH(c, &bs->children, next) {
8254        if (c->role & BDRV_CHILD_PRIMARY) {
8255            assert(!found);
8256            found = c;
8257        }
8258    }
8259
8260    return found;
8261}
8262
8263static BlockDriverState *bdrv_do_skip_filters(BlockDriverState *bs,
8264                                              bool stop_on_explicit_filter)
8265{
8266    BdrvChild *c;
8267
8268    if (!bs) {
8269        return NULL;
8270    }
8271
8272    while (!(stop_on_explicit_filter && !bs->implicit)) {
8273        c = bdrv_filter_child(bs);
8274        if (!c) {
8275            /*
8276             * A filter that is embedded in a working block graph must
8277             * have a child.  Assert this here so this function does
8278             * not return a filter node that is not expected by the
8279             * caller.
8280             */
8281            assert(!bs->drv || !bs->drv->is_filter);
8282            break;
8283        }
8284        bs = c->bs;
8285    }
8286    /*
8287     * Note that this treats nodes with bs->drv == NULL as not being
8288     * filters (bs->drv == NULL should be replaced by something else
8289     * anyway).
8290     * The advantage of this behavior is that this function will thus
8291     * always return a non-NULL value (given a non-NULL @bs).
8292     */
8293
8294    return bs;
8295}
8296
8297/*
8298 * Return the first BDS that has not been added implicitly or that
8299 * does not have a filtered child down the chain starting from @bs
8300 * (including @bs itself).
8301 */
8302BlockDriverState *bdrv_skip_implicit_filters(BlockDriverState *bs)
8303{
8304    GLOBAL_STATE_CODE();
8305    return bdrv_do_skip_filters(bs, true);
8306}
8307
8308/*
8309 * Return the first BDS that does not have a filtered child down the
8310 * chain starting from @bs (including @bs itself).
8311 */
8312BlockDriverState *bdrv_skip_filters(BlockDriverState *bs)
8313{
8314    IO_CODE();
8315    return bdrv_do_skip_filters(bs, false);
8316}
8317
8318/*
8319 * For a backing chain, return the first non-filter backing image of
8320 * the first non-filter image.
8321 */
8322BlockDriverState *bdrv_backing_chain_next(BlockDriverState *bs)
8323{
8324    IO_CODE();
8325    return bdrv_skip_filters(bdrv_cow_bs(bdrv_skip_filters(bs)));
8326}
8327
8328/**
8329 * Check whether [offset, offset + bytes) overlaps with the cached
8330 * block-status data region.
8331 *
8332 * If so, and @pnum is not NULL, set *pnum to `bsc.data_end - offset`,
8333 * which is what bdrv_bsc_is_data()'s interface needs.
8334 * Otherwise, *pnum is not touched.
8335 */
8336static bool bdrv_bsc_range_overlaps_locked(BlockDriverState *bs,
8337                                           int64_t offset, int64_t bytes,
8338                                           int64_t *pnum)
8339{
8340    BdrvBlockStatusCache *bsc = qatomic_rcu_read(&bs->block_status_cache);
8341    bool overlaps;
8342
8343    overlaps =
8344        qatomic_read(&bsc->valid) &&
8345        ranges_overlap(offset, bytes, bsc->data_start,
8346                       bsc->data_end - bsc->data_start);
8347
8348    if (overlaps && pnum) {
8349        *pnum = bsc->data_end - offset;
8350    }
8351
8352    return overlaps;
8353}
8354
8355/**
8356 * See block_int.h for this function's documentation.
8357 */
8358bool bdrv_bsc_is_data(BlockDriverState *bs, int64_t offset, int64_t *pnum)
8359{
8360    IO_CODE();
8361    RCU_READ_LOCK_GUARD();
8362    return bdrv_bsc_range_overlaps_locked(bs, offset, 1, pnum);
8363}
8364
8365/**
8366 * See block_int.h for this function's documentation.
8367 */
8368void bdrv_bsc_invalidate_range(BlockDriverState *bs,
8369                               int64_t offset, int64_t bytes)
8370{
8371    IO_CODE();
8372    RCU_READ_LOCK_GUARD();
8373
8374    if (bdrv_bsc_range_overlaps_locked(bs, offset, bytes, NULL)) {
8375        qatomic_set(&bs->block_status_cache->valid, false);
8376    }
8377}
8378
8379/**
8380 * See block_int.h for this function's documentation.
8381 */
8382void bdrv_bsc_fill(BlockDriverState *bs, int64_t offset, int64_t bytes)
8383{
8384    BdrvBlockStatusCache *new_bsc = g_new(BdrvBlockStatusCache, 1);
8385    BdrvBlockStatusCache *old_bsc;
8386    IO_CODE();
8387
8388    *new_bsc = (BdrvBlockStatusCache) {
8389        .valid = true,
8390        .data_start = offset,
8391        .data_end = offset + bytes,
8392    };
8393
8394    QEMU_LOCK_GUARD(&bs->bsc_modify_lock);
8395
8396    old_bsc = qatomic_rcu_read(&bs->block_status_cache);
8397    qatomic_rcu_set(&bs->block_status_cache, new_bsc);
8398    if (old_bsc) {
8399        g_free_rcu(old_bsc, rcu);
8400    }
8401}
8402