qemu/block/preallocate.c
<<
>>
Prefs
   1/*
   2 * preallocate filter driver
   3 *
   4 * The driver performs preallocate operation: it is injected above
   5 * some node, and before each write over EOF it does additional preallocating
   6 * write-zeroes request.
   7 *
   8 * Copyright (c) 2020 Virtuozzo International GmbH.
   9 *
  10 * Author:
  11 *  Sementsov-Ogievskiy Vladimir <vsementsov@virtuozzo.com>
  12 *
  13 * This program is free software; you can redistribute it and/or modify
  14 * it under the terms of the GNU General Public License as published by
  15 * the Free Software Foundation; either version 2 of the License, or
  16 * (at your option) any later version.
  17 *
  18 * This program is distributed in the hope that it will be useful,
  19 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  21 * GNU General Public License for more details.
  22 *
  23 * You should have received a copy of the GNU General Public License
  24 * along with this program. If not, see <http://www.gnu.org/licenses/>.
  25 */
  26
  27#include "qemu/osdep.h"
  28
  29#include "qapi/error.h"
  30#include "qemu/module.h"
  31#include "qemu/option.h"
  32#include "qemu/units.h"
  33#include "block/block-io.h"
  34#include "block/block_int.h"
  35
  36
  37typedef struct PreallocateOpts {
  38    int64_t prealloc_size;
  39    int64_t prealloc_align;
  40} PreallocateOpts;
  41
  42typedef struct BDRVPreallocateState {
  43    PreallocateOpts opts;
  44
  45    /*
  46     * Track real data end, to crop preallocation on close. If < 0 the status is
  47     * unknown.
  48     *
  49     * @data_end is a maximum of file size on open (or when we get write/resize
  50     * permissions) and all write request ends after it. So it's safe to
  51     * truncate to data_end if it is valid.
  52     */
  53    int64_t data_end;
  54
  55    /*
  56     * Start of trailing preallocated area which reads as zero. May be smaller
  57     * than data_end, if user does over-EOF write zero operation. If < 0 the
  58     * status is unknown.
  59     *
  60     * If both @zero_start and @file_end are valid, the region
  61     * [@zero_start, @file_end) is known to be preallocated zeroes. If @file_end
  62     * is not valid, @zero_start doesn't make much sense.
  63     */
  64    int64_t zero_start;
  65
  66    /*
  67     * Real end of file. Actually the cache for bdrv_getlength(bs->file->bs),
  68     * to avoid extra lseek() calls on each write operation. If < 0 the status
  69     * is unknown.
  70     */
  71    int64_t file_end;
  72
  73    /*
  74     * All three states @data_end, @zero_start and @file_end are guaranteed to
  75     * be invalid (< 0) when we don't have both exclusive BLK_PERM_RESIZE and
  76     * BLK_PERM_WRITE permissions on file child.
  77     */
  78} BDRVPreallocateState;
  79
  80#define PREALLOCATE_OPT_PREALLOC_ALIGN "prealloc-align"
  81#define PREALLOCATE_OPT_PREALLOC_SIZE "prealloc-size"
  82static QemuOptsList runtime_opts = {
  83    .name = "preallocate",
  84    .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
  85    .desc = {
  86        {
  87            .name = PREALLOCATE_OPT_PREALLOC_ALIGN,
  88            .type = QEMU_OPT_SIZE,
  89            .help = "on preallocation, align file length to this number, "
  90                "default 1M",
  91        },
  92        {
  93            .name = PREALLOCATE_OPT_PREALLOC_SIZE,
  94            .type = QEMU_OPT_SIZE,
  95            .help = "how much to preallocate, default 128M",
  96        },
  97        { /* end of list */ }
  98    },
  99};
 100
 101static bool preallocate_absorb_opts(PreallocateOpts *dest, QDict *options,
 102                                    BlockDriverState *child_bs, Error **errp)
 103{
 104    QemuOpts *opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
 105
 106    if (!qemu_opts_absorb_qdict(opts, options, errp)) {
 107        return false;
 108    }
 109
 110    dest->prealloc_align =
 111        qemu_opt_get_size(opts, PREALLOCATE_OPT_PREALLOC_ALIGN, 1 * MiB);
 112    dest->prealloc_size =
 113        qemu_opt_get_size(opts, PREALLOCATE_OPT_PREALLOC_SIZE, 128 * MiB);
 114
 115    qemu_opts_del(opts);
 116
 117    if (!QEMU_IS_ALIGNED(dest->prealloc_align, BDRV_SECTOR_SIZE)) {
 118        error_setg(errp, "prealloc-align parameter of preallocate filter "
 119                   "is not aligned to %llu", BDRV_SECTOR_SIZE);
 120        return false;
 121    }
 122
 123    if (!QEMU_IS_ALIGNED(dest->prealloc_align,
 124                         child_bs->bl.request_alignment)) {
 125        error_setg(errp, "prealloc-align parameter of preallocate filter "
 126                   "is not aligned to underlying node request alignment "
 127                   "(%" PRIi32 ")", child_bs->bl.request_alignment);
 128        return false;
 129    }
 130
 131    return true;
 132}
 133
 134static int preallocate_open(BlockDriverState *bs, QDict *options, int flags,
 135                            Error **errp)
 136{
 137    BDRVPreallocateState *s = bs->opaque;
 138    int ret;
 139
 140    /*
 141     * s->data_end and friends should be initialized on permission update.
 142     * For this to work, mark them invalid.
 143     */
 144    s->file_end = s->zero_start = s->data_end = -EINVAL;
 145
 146    ret = bdrv_open_file_child(NULL, options, "file", bs, errp);
 147    if (ret < 0) {
 148        return ret;
 149    }
 150
 151    if (!preallocate_absorb_opts(&s->opts, options, bs->file->bs, errp)) {
 152        return -EINVAL;
 153    }
 154
 155    bs->supported_write_flags = BDRV_REQ_WRITE_UNCHANGED |
 156        (BDRV_REQ_FUA & bs->file->bs->supported_write_flags);
 157
 158    bs->supported_zero_flags = BDRV_REQ_WRITE_UNCHANGED |
 159        ((BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK) &
 160            bs->file->bs->supported_zero_flags);
 161
 162    return 0;
 163}
 164
 165static void preallocate_close(BlockDriverState *bs)
 166{
 167    int ret;
 168    BDRVPreallocateState *s = bs->opaque;
 169
 170    if (s->data_end < 0) {
 171        return;
 172    }
 173
 174    if (s->file_end < 0) {
 175        s->file_end = bdrv_getlength(bs->file->bs);
 176        if (s->file_end < 0) {
 177            return;
 178        }
 179    }
 180
 181    if (s->data_end < s->file_end) {
 182        ret = bdrv_truncate(bs->file, s->data_end, true, PREALLOC_MODE_OFF, 0,
 183                            NULL);
 184        s->file_end = ret < 0 ? ret : s->data_end;
 185    }
 186}
 187
 188
/*
 * Handle reopen.
 *
 * We must implement reopen handlers, otherwise reopen simply doesn't work.
 * Handle new options and don't care about preallocation state, as it is
 * handled in set/check permission handlers.
 */
 196
 197static int preallocate_reopen_prepare(BDRVReopenState *reopen_state,
 198                                      BlockReopenQueue *queue, Error **errp)
 199{
 200    PreallocateOpts *opts = g_new0(PreallocateOpts, 1);
 201
 202    if (!preallocate_absorb_opts(opts, reopen_state->options,
 203                                 reopen_state->bs->file->bs, errp)) {
 204        g_free(opts);
 205        return -EINVAL;
 206    }
 207
 208    reopen_state->opaque = opts;
 209
 210    return 0;
 211}
 212
 213static void preallocate_reopen_commit(BDRVReopenState *state)
 214{
 215    BDRVPreallocateState *s = state->bs->opaque;
 216
 217    s->opts = *(PreallocateOpts *)state->opaque;
 218
 219    g_free(state->opaque);
 220    state->opaque = NULL;
 221}
 222
 223static void preallocate_reopen_abort(BDRVReopenState *state)
 224{
 225    g_free(state->opaque);
 226    state->opaque = NULL;
 227}
 228
 229static int coroutine_fn GRAPH_RDLOCK
 230preallocate_co_preadv_part(BlockDriverState *bs, int64_t offset, int64_t bytes,
 231                           QEMUIOVector *qiov, size_t qiov_offset,
 232                           BdrvRequestFlags flags)
 233{
 234    return bdrv_co_preadv_part(bs->file, offset, bytes, qiov, qiov_offset,
 235                               flags);
 236}
 237
 238static int coroutine_fn GRAPH_RDLOCK
 239preallocate_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes)
 240{
 241    return bdrv_co_pdiscard(bs->file, offset, bytes);
 242}
 243
 244static bool can_write_resize(uint64_t perm)
 245{
 246    return (perm & BLK_PERM_WRITE) && (perm & BLK_PERM_RESIZE);
 247}
 248
 249static bool has_prealloc_perms(BlockDriverState *bs)
 250{
 251    BDRVPreallocateState *s = bs->opaque;
 252
 253    if (can_write_resize(bs->file->perm)) {
 254        assert(!(bs->file->shared_perm & BLK_PERM_WRITE));
 255        assert(!(bs->file->shared_perm & BLK_PERM_RESIZE));
 256        return true;
 257    }
 258
 259    assert(s->data_end < 0);
 260    assert(s->zero_start < 0);
 261    assert(s->file_end < 0);
 262    return false;
 263}
 264
 265/*
 266 * Call on each write. Returns true if @want_merge_zero is true and the region
 267 * [offset, offset + bytes) is zeroed (as a result of this call or earlier
 268 * preallocation).
 269 *
 270 * want_merge_zero is used to merge write-zero request with preallocation in
 271 * one bdrv_co_pwrite_zeroes() call.
 272 */
 273static bool coroutine_fn GRAPH_RDLOCK
 274handle_write(BlockDriverState *bs, int64_t offset, int64_t bytes,
 275             bool want_merge_zero)
 276{
 277    BDRVPreallocateState *s = bs->opaque;
 278    int64_t end = offset + bytes;
 279    int64_t prealloc_start, prealloc_end;
 280    int ret;
 281    uint32_t file_align = bs->file->bs->bl.request_alignment;
 282    uint32_t prealloc_align = MAX(s->opts.prealloc_align, file_align);
 283
 284    assert(QEMU_IS_ALIGNED(prealloc_align, file_align));
 285
 286    if (!has_prealloc_perms(bs)) {
 287        /* We don't have state neither should try to recover it */
 288        return false;
 289    }
 290
 291    if (s->data_end < 0) {
 292        s->data_end = bdrv_co_getlength(bs->file->bs);
 293        if (s->data_end < 0) {
 294            return false;
 295        }
 296
 297        if (s->file_end < 0) {
 298            s->file_end = s->data_end;
 299        }
 300    }
 301
 302    if (end <= s->data_end) {
 303        return false;
 304    }
 305
 306    /* We have valid s->data_end, and request writes beyond it. */
 307
 308    s->data_end = end;
 309    if (s->zero_start < 0 || !want_merge_zero) {
 310        s->zero_start = end;
 311    }
 312
 313    if (s->file_end < 0) {
 314        s->file_end = bdrv_co_getlength(bs->file->bs);
 315        if (s->file_end < 0) {
 316            return false;
 317        }
 318    }
 319
 320    /* Now s->data_end, s->zero_start and s->file_end are valid. */
 321
 322    if (end <= s->file_end) {
 323        /* No preallocation needed. */
 324        return want_merge_zero && offset >= s->zero_start;
 325    }
 326
 327    /* Now we want new preallocation, as request writes beyond s->file_end. */
 328
 329    prealloc_start = QEMU_ALIGN_UP(
 330            want_merge_zero ? MIN(offset, s->file_end) : s->file_end,
 331            file_align);
 332    prealloc_end = QEMU_ALIGN_UP(
 333            MAX(prealloc_start, end) + s->opts.prealloc_size,
 334            prealloc_align);
 335
 336    want_merge_zero = want_merge_zero && (prealloc_start <= offset);
 337
 338    ret = bdrv_co_pwrite_zeroes(
 339            bs->file, prealloc_start, prealloc_end - prealloc_start,
 340            BDRV_REQ_NO_FALLBACK | BDRV_REQ_SERIALISING | BDRV_REQ_NO_WAIT);
 341    if (ret < 0) {
 342        s->file_end = ret;
 343        return false;
 344    }
 345
 346    s->file_end = prealloc_end;
 347    return want_merge_zero;
 348}
 349
 350static int coroutine_fn GRAPH_RDLOCK
 351preallocate_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset,
 352                             int64_t bytes, BdrvRequestFlags flags)
 353{
 354    bool want_merge_zero =
 355        !(flags & ~(BDRV_REQ_ZERO_WRITE | BDRV_REQ_NO_FALLBACK));
 356    if (handle_write(bs, offset, bytes, want_merge_zero)) {
 357        return 0;
 358    }
 359
 360    return bdrv_co_pwrite_zeroes(bs->file, offset, bytes, flags);
 361}
 362
 363static int coroutine_fn GRAPH_RDLOCK
 364preallocate_co_pwritev_part(BlockDriverState *bs, int64_t offset, int64_t bytes,
 365                            QEMUIOVector *qiov, size_t qiov_offset,
 366                            BdrvRequestFlags flags)
 367{
 368    handle_write(bs, offset, bytes, false);
 369
 370    return bdrv_co_pwritev_part(bs->file, offset, bytes, qiov, qiov_offset,
 371                                flags);
 372}
 373
 374static int coroutine_fn GRAPH_RDLOCK
 375preallocate_co_truncate(BlockDriverState *bs, int64_t offset,
 376                        bool exact, PreallocMode prealloc,
 377                        BdrvRequestFlags flags, Error **errp)
 378{
 379    ERRP_GUARD();
 380    BDRVPreallocateState *s = bs->opaque;
 381    int ret;
 382
 383    if (s->data_end >= 0 && offset > s->data_end) {
 384        if (s->file_end < 0) {
 385            s->file_end = bdrv_co_getlength(bs->file->bs);
 386            if (s->file_end < 0) {
 387                error_setg(errp, "failed to get file length");
 388                return s->file_end;
 389            }
 390        }
 391
 392        if (prealloc == PREALLOC_MODE_FALLOC) {
 393            /*
 394             * If offset <= s->file_end, the task is already done, just
 395             * update s->data_end, to move part of "filter preallocation"
 396             * to "preallocation requested by user".
 397             * Otherwise just proceed to preallocate missing part.
 398             */
 399            if (offset <= s->file_end) {
 400                s->data_end = offset;
 401                return 0;
 402            }
 403        } else {
 404            /*
 405             * We have to drop our preallocation, to
 406             * - avoid "Cannot use preallocation for shrinking files" in
 407             *   case of offset < file_end
 408             * - give PREALLOC_MODE_OFF a chance to keep small disk
 409             *   usage
 410             * - give PREALLOC_MODE_FULL a chance to actually write the
 411             *   whole region as user expects
 412             */
 413            if (s->file_end > s->data_end) {
 414                ret = bdrv_co_truncate(bs->file, s->data_end, true,
 415                                       PREALLOC_MODE_OFF, 0, errp);
 416                if (ret < 0) {
 417                    s->file_end = ret;
 418                    error_prepend(errp, "preallocate-filter: failed to drop "
 419                                  "write-zero preallocation: ");
 420                    return ret;
 421                }
 422                s->file_end = s->data_end;
 423            }
 424        }
 425
 426        s->data_end = offset;
 427    }
 428
 429    ret = bdrv_co_truncate(bs->file, offset, exact, prealloc, flags, errp);
 430    if (ret < 0) {
 431        s->file_end = s->zero_start = s->data_end = ret;
 432        return ret;
 433    }
 434
 435    if (has_prealloc_perms(bs)) {
 436        s->file_end = s->zero_start = s->data_end = offset;
 437    }
 438    return 0;
 439}
 440
 441static int coroutine_fn GRAPH_RDLOCK preallocate_co_flush(BlockDriverState *bs)
 442{
 443    return bdrv_co_flush(bs->file->bs);
 444}
 445
 446static int64_t coroutine_fn GRAPH_RDLOCK
 447preallocate_co_getlength(BlockDriverState *bs)
 448{
 449    int64_t ret;
 450    BDRVPreallocateState *s = bs->opaque;
 451
 452    if (s->data_end >= 0) {
 453        return s->data_end;
 454    }
 455
 456    ret = bdrv_co_getlength(bs->file->bs);
 457
 458    if (has_prealloc_perms(bs)) {
 459        s->file_end = s->zero_start = s->data_end = ret;
 460    }
 461
 462    return ret;
 463}
 464
 465static int preallocate_check_perm(BlockDriverState *bs,
 466                                  uint64_t perm, uint64_t shared, Error **errp)
 467{
 468    BDRVPreallocateState *s = bs->opaque;
 469
 470    if (s->data_end >= 0 && !can_write_resize(perm)) {
 471        /*
 472         * Lose permissions.
 473         * We should truncate in check_perm, as in set_perm bs->file->perm will
 474         * be already changed, and we should not violate it.
 475         */
 476        if (s->file_end < 0) {
 477            s->file_end = bdrv_getlength(bs->file->bs);
 478            if (s->file_end < 0) {
 479                error_setg(errp, "Failed to get file length");
 480                return s->file_end;
 481            }
 482        }
 483
 484        if (s->data_end < s->file_end) {
 485            int ret = bdrv_truncate(bs->file, s->data_end, true,
 486                                    PREALLOC_MODE_OFF, 0, NULL);
 487            if (ret < 0) {
 488                error_setg(errp, "Failed to drop preallocation");
 489                s->file_end = ret;
 490                return ret;
 491            }
 492            s->file_end = s->data_end;
 493        }
 494    }
 495
 496    return 0;
 497}
 498
 499static void preallocate_set_perm(BlockDriverState *bs,
 500                                 uint64_t perm, uint64_t shared)
 501{
 502    BDRVPreallocateState *s = bs->opaque;
 503
 504    if (can_write_resize(perm)) {
 505        if (s->data_end < 0) {
 506            s->data_end = s->file_end = s->zero_start =
 507                bdrv_getlength(bs->file->bs);
 508        }
 509    } else {
 510        /*
 511         * We drop our permissions, as well as allow shared
 512         * permissions (see preallocate_child_perm), anyone will be able to
 513         * change the child, so mark all states invalid. We'll regain control if
 514         * get good permissions back.
 515         */
 516        s->data_end = s->file_end = s->zero_start = -EINVAL;
 517    }
 518}
 519
 520static void preallocate_child_perm(BlockDriverState *bs, BdrvChild *c,
 521    BdrvChildRole role, BlockReopenQueue *reopen_queue,
 522    uint64_t perm, uint64_t shared, uint64_t *nperm, uint64_t *nshared)
 523{
 524    bdrv_default_perms(bs, c, role, reopen_queue, perm, shared, nperm, nshared);
 525
 526    if (can_write_resize(perm)) {
 527        /* This should come by default, but let's enforce: */
 528        *nperm |= BLK_PERM_WRITE | BLK_PERM_RESIZE;
 529
 530        /*
 531         * Don't share, to keep our states s->file_end, s->data_end and
 532         * s->zero_start valid.
 533         */
 534        *nshared &= ~(BLK_PERM_WRITE | BLK_PERM_RESIZE);
 535    }
 536}
 537
 538BlockDriver bdrv_preallocate_filter = {
 539    .format_name = "preallocate",
 540    .instance_size = sizeof(BDRVPreallocateState),
 541
 542    .bdrv_co_getlength    = preallocate_co_getlength,
 543    .bdrv_open            = preallocate_open,
 544    .bdrv_close           = preallocate_close,
 545
 546    .bdrv_reopen_prepare  = preallocate_reopen_prepare,
 547    .bdrv_reopen_commit   = preallocate_reopen_commit,
 548    .bdrv_reopen_abort    = preallocate_reopen_abort,
 549
 550    .bdrv_co_preadv_part = preallocate_co_preadv_part,
 551    .bdrv_co_pwritev_part = preallocate_co_pwritev_part,
 552    .bdrv_co_pwrite_zeroes = preallocate_co_pwrite_zeroes,
 553    .bdrv_co_pdiscard = preallocate_co_pdiscard,
 554    .bdrv_co_flush = preallocate_co_flush,
 555    .bdrv_co_truncate = preallocate_co_truncate,
 556
 557    .bdrv_check_perm = preallocate_check_perm,
 558    .bdrv_set_perm = preallocate_set_perm,
 559    .bdrv_child_perm = preallocate_child_perm,
 560
 561    .is_filter = true,
 562};
 563
 564static void bdrv_preallocate_init(void)
 565{
 566    bdrv_register(&bdrv_preallocate_filter);
 567}
 568
 569block_init(bdrv_preallocate_init);
 570