qemu/block/preallocate.c
<<
>>
Prefs
   1/*
   2 * preallocate filter driver
   3 *
   4 * The driver performs preallocate operation: it is injected above
   5 * some node, and before each write over EOF it does additional preallocating
   6 * write-zeroes request.
   7 *
   8 * Copyright (c) 2020 Virtuozzo International GmbH.
   9 *
  10 * Author:
  11 *  Sementsov-Ogievskiy Vladimir <vsementsov@virtuozzo.com>
  12 *
  13 * This program is free software; you can redistribute it and/or modify
  14 * it under the terms of the GNU General Public License as published by
  15 * the Free Software Foundation; either version 2 of the License, or
  16 * (at your option) any later version.
  17 *
  18 * This program is distributed in the hope that it will be useful,
  19 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  21 * GNU General Public License for more details.
  22 *
  23 * You should have received a copy of the GNU General Public License
  24 * along with this program. If not, see <http://www.gnu.org/licenses/>.
  25 */
  26
  27#include "qemu/osdep.h"
  28
  29#include "qapi/error.h"
  30#include "qemu/module.h"
  31#include "qemu/option.h"
  32#include "qemu/units.h"
  33#include "block/block_int.h"
  34
  35
  36typedef struct PreallocateOpts {
  37    int64_t prealloc_size;
  38    int64_t prealloc_align;
  39} PreallocateOpts;
  40
  41typedef struct BDRVPreallocateState {
  42    PreallocateOpts opts;
  43
  44    /*
  45     * Track real data end, to crop preallocation on close. If < 0 the status is
  46     * unknown.
  47     *
  48     * @data_end is a maximum of file size on open (or when we get write/resize
  49     * permissions) and all write request ends after it. So it's safe to
  50     * truncate to data_end if it is valid.
  51     */
  52    int64_t data_end;
  53
  54    /*
  55     * Start of trailing preallocated area which reads as zero. May be smaller
  56     * than data_end, if user does over-EOF write zero operation. If < 0 the
  57     * status is unknown.
  58     *
  59     * If both @zero_start and @file_end are valid, the region
  60     * [@zero_start, @file_end) is known to be preallocated zeroes. If @file_end
  61     * is not valid, @zero_start doesn't make much sense.
  62     */
  63    int64_t zero_start;
  64
  65    /*
  66     * Real end of file. Actually the cache for bdrv_getlength(bs->file->bs),
  67     * to avoid extra lseek() calls on each write operation. If < 0 the status
  68     * is unknown.
  69     */
  70    int64_t file_end;
  71
  72    /*
  73     * All three states @data_end, @zero_start and @file_end are guaranteed to
  74     * be invalid (< 0) when we don't have both exclusive BLK_PERM_RESIZE and
  75     * BLK_PERM_WRITE permissions on file child.
  76     */
  77} BDRVPreallocateState;
  78
  79#define PREALLOCATE_OPT_PREALLOC_ALIGN "prealloc-align"
  80#define PREALLOCATE_OPT_PREALLOC_SIZE "prealloc-size"
  81static QemuOptsList runtime_opts = {
  82    .name = "preallocate",
  83    .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
  84    .desc = {
  85        {
  86            .name = PREALLOCATE_OPT_PREALLOC_ALIGN,
  87            .type = QEMU_OPT_SIZE,
  88            .help = "on preallocation, align file length to this number, "
  89                "default 1M",
  90        },
  91        {
  92            .name = PREALLOCATE_OPT_PREALLOC_SIZE,
  93            .type = QEMU_OPT_SIZE,
  94            .help = "how much to preallocate, default 128M",
  95        },
  96        { /* end of list */ }
  97    },
  98};
  99
 100static bool preallocate_absorb_opts(PreallocateOpts *dest, QDict *options,
 101                                    BlockDriverState *child_bs, Error **errp)
 102{
 103    QemuOpts *opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
 104
 105    if (!qemu_opts_absorb_qdict(opts, options, errp)) {
 106        return false;
 107    }
 108
 109    dest->prealloc_align =
 110        qemu_opt_get_size(opts, PREALLOCATE_OPT_PREALLOC_ALIGN, 1 * MiB);
 111    dest->prealloc_size =
 112        qemu_opt_get_size(opts, PREALLOCATE_OPT_PREALLOC_SIZE, 128 * MiB);
 113
 114    qemu_opts_del(opts);
 115
 116    if (!QEMU_IS_ALIGNED(dest->prealloc_align, BDRV_SECTOR_SIZE)) {
 117        error_setg(errp, "prealloc-align parameter of preallocate filter "
 118                   "is not aligned to %llu", BDRV_SECTOR_SIZE);
 119        return false;
 120    }
 121
 122    if (!QEMU_IS_ALIGNED(dest->prealloc_align,
 123                         child_bs->bl.request_alignment)) {
 124        error_setg(errp, "prealloc-align parameter of preallocate filter "
 125                   "is not aligned to underlying node request alignment "
 126                   "(%" PRIi32 ")", child_bs->bl.request_alignment);
 127        return false;
 128    }
 129
 130    return true;
 131}
 132
 133static int preallocate_open(BlockDriverState *bs, QDict *options, int flags,
 134                            Error **errp)
 135{
 136    BDRVPreallocateState *s = bs->opaque;
 137
 138    /*
 139     * s->data_end and friends should be initialized on permission update.
 140     * For this to work, mark them invalid.
 141     */
 142    s->file_end = s->zero_start = s->data_end = -EINVAL;
 143
 144    bs->file = bdrv_open_child(NULL, options, "file", bs, &child_of_bds,
 145                               BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY,
 146                               false, errp);
 147    if (!bs->file) {
 148        return -EINVAL;
 149    }
 150
 151    if (!preallocate_absorb_opts(&s->opts, options, bs->file->bs, errp)) {
 152        return -EINVAL;
 153    }
 154
 155    bs->supported_write_flags = BDRV_REQ_WRITE_UNCHANGED |
 156        (BDRV_REQ_FUA & bs->file->bs->supported_write_flags);
 157
 158    bs->supported_zero_flags = BDRV_REQ_WRITE_UNCHANGED |
 159        ((BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK) &
 160            bs->file->bs->supported_zero_flags);
 161
 162    return 0;
 163}
 164
 165static void preallocate_close(BlockDriverState *bs)
 166{
 167    int ret;
 168    BDRVPreallocateState *s = bs->opaque;
 169
 170    if (s->data_end < 0) {
 171        return;
 172    }
 173
 174    if (s->file_end < 0) {
 175        s->file_end = bdrv_getlength(bs->file->bs);
 176        if (s->file_end < 0) {
 177            return;
 178        }
 179    }
 180
 181    if (s->data_end < s->file_end) {
 182        ret = bdrv_truncate(bs->file, s->data_end, true, PREALLOC_MODE_OFF, 0,
 183                            NULL);
 184        s->file_end = ret < 0 ? ret : s->data_end;
 185    }
 186}
 187
 188
/*
 * Handle reopen.
 *
 * We must implement the reopen handlers, otherwise reopen simply doesn't
 * work. They only handle the new options and don't care about the
 * preallocation state, as that is handled in the set/check permission
 * handlers.
 */
 196
 197static int preallocate_reopen_prepare(BDRVReopenState *reopen_state,
 198                                      BlockReopenQueue *queue, Error **errp)
 199{
 200    PreallocateOpts *opts = g_new0(PreallocateOpts, 1);
 201
 202    if (!preallocate_absorb_opts(opts, reopen_state->options,
 203                                 reopen_state->bs->file->bs, errp)) {
 204        g_free(opts);
 205        return -EINVAL;
 206    }
 207
 208    reopen_state->opaque = opts;
 209
 210    return 0;
 211}
 212
 213static void preallocate_reopen_commit(BDRVReopenState *state)
 214{
 215    BDRVPreallocateState *s = state->bs->opaque;
 216
 217    s->opts = *(PreallocateOpts *)state->opaque;
 218
 219    g_free(state->opaque);
 220    state->opaque = NULL;
 221}
 222
 223static void preallocate_reopen_abort(BDRVReopenState *state)
 224{
 225    g_free(state->opaque);
 226    state->opaque = NULL;
 227}
 228
 229static coroutine_fn int preallocate_co_preadv_part(
 230        BlockDriverState *bs, int64_t offset, int64_t bytes,
 231        QEMUIOVector *qiov, size_t qiov_offset, BdrvRequestFlags flags)
 232{
 233    return bdrv_co_preadv_part(bs->file, offset, bytes, qiov, qiov_offset,
 234                               flags);
 235}
 236
 237static int coroutine_fn preallocate_co_pdiscard(BlockDriverState *bs,
 238                                               int64_t offset, int64_t bytes)
 239{
 240    return bdrv_co_pdiscard(bs->file, offset, bytes);
 241}
 242
 243static bool can_write_resize(uint64_t perm)
 244{
 245    return (perm & BLK_PERM_WRITE) && (perm & BLK_PERM_RESIZE);
 246}
 247
 248static bool has_prealloc_perms(BlockDriverState *bs)
 249{
 250    BDRVPreallocateState *s = bs->opaque;
 251
 252    if (can_write_resize(bs->file->perm)) {
 253        assert(!(bs->file->shared_perm & BLK_PERM_WRITE));
 254        assert(!(bs->file->shared_perm & BLK_PERM_RESIZE));
 255        return true;
 256    }
 257
 258    assert(s->data_end < 0);
 259    assert(s->zero_start < 0);
 260    assert(s->file_end < 0);
 261    return false;
 262}
 263
 264/*
 265 * Call on each write. Returns true if @want_merge_zero is true and the region
 266 * [offset, offset + bytes) is zeroed (as a result of this call or earlier
 267 * preallocation).
 268 *
 269 * want_merge_zero is used to merge write-zero request with preallocation in
 270 * one bdrv_co_pwrite_zeroes() call.
 271 */
 272static bool coroutine_fn handle_write(BlockDriverState *bs, int64_t offset,
 273                                      int64_t bytes, bool want_merge_zero)
 274{
 275    BDRVPreallocateState *s = bs->opaque;
 276    int64_t end = offset + bytes;
 277    int64_t prealloc_start, prealloc_end;
 278    int ret;
 279    uint32_t file_align = bs->file->bs->bl.request_alignment;
 280    uint32_t prealloc_align = MAX(s->opts.prealloc_align, file_align);
 281
 282    assert(QEMU_IS_ALIGNED(prealloc_align, file_align));
 283
 284    if (!has_prealloc_perms(bs)) {
 285        /* We don't have state neither should try to recover it */
 286        return false;
 287    }
 288
 289    if (s->data_end < 0) {
 290        s->data_end = bdrv_getlength(bs->file->bs);
 291        if (s->data_end < 0) {
 292            return false;
 293        }
 294
 295        if (s->file_end < 0) {
 296            s->file_end = s->data_end;
 297        }
 298    }
 299
 300    if (end <= s->data_end) {
 301        return false;
 302    }
 303
 304    /* We have valid s->data_end, and request writes beyond it. */
 305
 306    s->data_end = end;
 307    if (s->zero_start < 0 || !want_merge_zero) {
 308        s->zero_start = end;
 309    }
 310
 311    if (s->file_end < 0) {
 312        s->file_end = bdrv_getlength(bs->file->bs);
 313        if (s->file_end < 0) {
 314            return false;
 315        }
 316    }
 317
 318    /* Now s->data_end, s->zero_start and s->file_end are valid. */
 319
 320    if (end <= s->file_end) {
 321        /* No preallocation needed. */
 322        return want_merge_zero && offset >= s->zero_start;
 323    }
 324
 325    /* Now we want new preallocation, as request writes beyond s->file_end. */
 326
 327    prealloc_start = QEMU_ALIGN_UP(
 328            want_merge_zero ? MIN(offset, s->file_end) : s->file_end,
 329            file_align);
 330    prealloc_end = QEMU_ALIGN_UP(
 331            MAX(prealloc_start, end) + s->opts.prealloc_size,
 332            prealloc_align);
 333
 334    want_merge_zero = want_merge_zero && (prealloc_start <= offset);
 335
 336    ret = bdrv_co_pwrite_zeroes(
 337            bs->file, prealloc_start, prealloc_end - prealloc_start,
 338            BDRV_REQ_NO_FALLBACK | BDRV_REQ_SERIALISING | BDRV_REQ_NO_WAIT);
 339    if (ret < 0) {
 340        s->file_end = ret;
 341        return false;
 342    }
 343
 344    s->file_end = prealloc_end;
 345    return want_merge_zero;
 346}
 347
 348static int coroutine_fn preallocate_co_pwrite_zeroes(BlockDriverState *bs,
 349        int64_t offset, int64_t bytes, BdrvRequestFlags flags)
 350{
 351    bool want_merge_zero =
 352        !(flags & ~(BDRV_REQ_ZERO_WRITE | BDRV_REQ_NO_FALLBACK));
 353    if (handle_write(bs, offset, bytes, want_merge_zero)) {
 354        return 0;
 355    }
 356
 357    return bdrv_co_pwrite_zeroes(bs->file, offset, bytes, flags);
 358}
 359
 360static coroutine_fn int preallocate_co_pwritev_part(BlockDriverState *bs,
 361                                                    int64_t offset,
 362                                                    int64_t bytes,
 363                                                    QEMUIOVector *qiov,
 364                                                    size_t qiov_offset,
 365                                                    BdrvRequestFlags flags)
 366{
 367    handle_write(bs, offset, bytes, false);
 368
 369    return bdrv_co_pwritev_part(bs->file, offset, bytes, qiov, qiov_offset,
 370                                flags);
 371}
 372
 373static int coroutine_fn
 374preallocate_co_truncate(BlockDriverState *bs, int64_t offset,
 375                        bool exact, PreallocMode prealloc,
 376                        BdrvRequestFlags flags, Error **errp)
 377{
 378    ERRP_GUARD();
 379    BDRVPreallocateState *s = bs->opaque;
 380    int ret;
 381
 382    if (s->data_end >= 0 && offset > s->data_end) {
 383        if (s->file_end < 0) {
 384            s->file_end = bdrv_getlength(bs->file->bs);
 385            if (s->file_end < 0) {
 386                error_setg(errp, "failed to get file length");
 387                return s->file_end;
 388            }
 389        }
 390
 391        if (prealloc == PREALLOC_MODE_FALLOC) {
 392            /*
 393             * If offset <= s->file_end, the task is already done, just
 394             * update s->data_end, to move part of "filter preallocation"
 395             * to "preallocation requested by user".
 396             * Otherwise just proceed to preallocate missing part.
 397             */
 398            if (offset <= s->file_end) {
 399                s->data_end = offset;
 400                return 0;
 401            }
 402        } else {
 403            /*
 404             * We have to drop our preallocation, to
 405             * - avoid "Cannot use preallocation for shrinking files" in
 406             *   case of offset < file_end
 407             * - give PREALLOC_MODE_OFF a chance to keep small disk
 408             *   usage
 409             * - give PREALLOC_MODE_FULL a chance to actually write the
 410             *   whole region as user expects
 411             */
 412            if (s->file_end > s->data_end) {
 413                ret = bdrv_co_truncate(bs->file, s->data_end, true,
 414                                       PREALLOC_MODE_OFF, 0, errp);
 415                if (ret < 0) {
 416                    s->file_end = ret;
 417                    error_prepend(errp, "preallocate-filter: failed to drop "
 418                                  "write-zero preallocation: ");
 419                    return ret;
 420                }
 421                s->file_end = s->data_end;
 422            }
 423        }
 424
 425        s->data_end = offset;
 426    }
 427
 428    ret = bdrv_co_truncate(bs->file, offset, exact, prealloc, flags, errp);
 429    if (ret < 0) {
 430        s->file_end = s->zero_start = s->data_end = ret;
 431        return ret;
 432    }
 433
 434    if (has_prealloc_perms(bs)) {
 435        s->file_end = s->zero_start = s->data_end = offset;
 436    }
 437    return 0;
 438}
 439
 440static int coroutine_fn preallocate_co_flush(BlockDriverState *bs)
 441{
 442    return bdrv_co_flush(bs->file->bs);
 443}
 444
 445static int64_t preallocate_getlength(BlockDriverState *bs)
 446{
 447    int64_t ret;
 448    BDRVPreallocateState *s = bs->opaque;
 449
 450    if (s->data_end >= 0) {
 451        return s->data_end;
 452    }
 453
 454    ret = bdrv_getlength(bs->file->bs);
 455
 456    if (has_prealloc_perms(bs)) {
 457        s->file_end = s->zero_start = s->data_end = ret;
 458    }
 459
 460    return ret;
 461}
 462
 463static int preallocate_check_perm(BlockDriverState *bs,
 464                                  uint64_t perm, uint64_t shared, Error **errp)
 465{
 466    BDRVPreallocateState *s = bs->opaque;
 467
 468    if (s->data_end >= 0 && !can_write_resize(perm)) {
 469        /*
 470         * Lose permissions.
 471         * We should truncate in check_perm, as in set_perm bs->file->perm will
 472         * be already changed, and we should not violate it.
 473         */
 474        if (s->file_end < 0) {
 475            s->file_end = bdrv_getlength(bs->file->bs);
 476            if (s->file_end < 0) {
 477                error_setg(errp, "Failed to get file length");
 478                return s->file_end;
 479            }
 480        }
 481
 482        if (s->data_end < s->file_end) {
 483            int ret = bdrv_truncate(bs->file, s->data_end, true,
 484                                    PREALLOC_MODE_OFF, 0, NULL);
 485            if (ret < 0) {
 486                error_setg(errp, "Failed to drop preallocation");
 487                s->file_end = ret;
 488                return ret;
 489            }
 490            s->file_end = s->data_end;
 491        }
 492    }
 493
 494    return 0;
 495}
 496
 497static void preallocate_set_perm(BlockDriverState *bs,
 498                                 uint64_t perm, uint64_t shared)
 499{
 500    BDRVPreallocateState *s = bs->opaque;
 501
 502    if (can_write_resize(perm)) {
 503        if (s->data_end < 0) {
 504            s->data_end = s->file_end = s->zero_start =
 505                bdrv_getlength(bs->file->bs);
 506        }
 507    } else {
 508        /*
 509         * We drop our permissions, as well as allow shared
 510         * permissions (see preallocate_child_perm), anyone will be able to
 511         * change the child, so mark all states invalid. We'll regain control if
 512         * get good permissions back.
 513         */
 514        s->data_end = s->file_end = s->zero_start = -EINVAL;
 515    }
 516}
 517
 518static void preallocate_child_perm(BlockDriverState *bs, BdrvChild *c,
 519    BdrvChildRole role, BlockReopenQueue *reopen_queue,
 520    uint64_t perm, uint64_t shared, uint64_t *nperm, uint64_t *nshared)
 521{
 522    bdrv_default_perms(bs, c, role, reopen_queue, perm, shared, nperm, nshared);
 523
 524    if (can_write_resize(perm)) {
 525        /* This should come by default, but let's enforce: */
 526        *nperm |= BLK_PERM_WRITE | BLK_PERM_RESIZE;
 527
 528        /*
 529         * Don't share, to keep our states s->file_end, s->data_end and
 530         * s->zero_start valid.
 531         */
 532        *nshared &= ~(BLK_PERM_WRITE | BLK_PERM_RESIZE);
 533    }
 534}
 535
 536BlockDriver bdrv_preallocate_filter = {
 537    .format_name = "preallocate",
 538    .instance_size = sizeof(BDRVPreallocateState),
 539
 540    .bdrv_getlength = preallocate_getlength,
 541    .bdrv_open = preallocate_open,
 542    .bdrv_close = preallocate_close,
 543
 544    .bdrv_reopen_prepare  = preallocate_reopen_prepare,
 545    .bdrv_reopen_commit   = preallocate_reopen_commit,
 546    .bdrv_reopen_abort    = preallocate_reopen_abort,
 547
 548    .bdrv_co_preadv_part = preallocate_co_preadv_part,
 549    .bdrv_co_pwritev_part = preallocate_co_pwritev_part,
 550    .bdrv_co_pwrite_zeroes = preallocate_co_pwrite_zeroes,
 551    .bdrv_co_pdiscard = preallocate_co_pdiscard,
 552    .bdrv_co_flush = preallocate_co_flush,
 553    .bdrv_co_truncate = preallocate_co_truncate,
 554
 555    .bdrv_check_perm = preallocate_check_perm,
 556    .bdrv_set_perm = preallocate_set_perm,
 557    .bdrv_child_perm = preallocate_child_perm,
 558
 559    .has_variable_length = true,
 560    .is_filter = true,
 561};
 562
 563static void bdrv_preallocate_init(void)
 564{
 565    bdrv_register(&bdrv_preallocate_filter);
 566}
 567
 568block_init(bdrv_preallocate_init);
 569