linux/fs/nfs/blocklayout/extent_tree.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * Copyright (c) 2014-2016 Christoph Hellwig.
   4 */
   5
   6#include <linux/vmalloc.h>
   7
   8#include "blocklayout.h"
   9
  10#define NFSDBG_FACILITY         NFSDBG_PNFS_LD
  11
  12static inline struct pnfs_block_extent *
  13ext_node(struct rb_node *node)
  14{
  15        return rb_entry(node, struct pnfs_block_extent, be_node);
  16}
  17
  18static struct pnfs_block_extent *
  19ext_tree_first(struct rb_root *root)
  20{
  21        struct rb_node *node = rb_first(root);
  22        return node ? ext_node(node) : NULL;
  23}
  24
  25static struct pnfs_block_extent *
  26ext_tree_prev(struct pnfs_block_extent *be)
  27{
  28        struct rb_node *node = rb_prev(&be->be_node);
  29        return node ? ext_node(node) : NULL;
  30}
  31
  32static struct pnfs_block_extent *
  33ext_tree_next(struct pnfs_block_extent *be)
  34{
  35        struct rb_node *node = rb_next(&be->be_node);
  36        return node ? ext_node(node) : NULL;
  37}
  38
  39static inline sector_t
  40ext_f_end(struct pnfs_block_extent *be)
  41{
  42        return be->be_f_offset + be->be_length;
  43}
  44
  45static struct pnfs_block_extent *
  46__ext_tree_search(struct rb_root *root, sector_t start)
  47{
  48        struct rb_node *node = root->rb_node;
  49        struct pnfs_block_extent *be = NULL;
  50
  51        while (node) {
  52                be = ext_node(node);
  53                if (start < be->be_f_offset)
  54                        node = node->rb_left;
  55                else if (start >= ext_f_end(be))
  56                        node = node->rb_right;
  57                else
  58                        return be;
  59        }
  60
  61        if (be) {
  62                if (start < be->be_f_offset)
  63                        return be;
  64
  65                if (start >= ext_f_end(be))
  66                        return ext_tree_next(be);
  67        }
  68
  69        return NULL;
  70}
  71
  72static bool
  73ext_can_merge(struct pnfs_block_extent *be1, struct pnfs_block_extent *be2)
  74{
  75        if (be1->be_state != be2->be_state)
  76                return false;
  77        if (be1->be_device != be2->be_device)
  78                return false;
  79
  80        if (be1->be_f_offset + be1->be_length != be2->be_f_offset)
  81                return false;
  82
  83        if (be1->be_state != PNFS_BLOCK_NONE_DATA &&
  84            (be1->be_v_offset + be1->be_length != be2->be_v_offset))
  85                return false;
  86
  87        if (be1->be_state == PNFS_BLOCK_INVALID_DATA &&
  88            be1->be_tag != be2->be_tag)
  89                return false;
  90
  91        return true;
  92}
  93
  94static struct pnfs_block_extent *
  95ext_try_to_merge_left(struct rb_root *root, struct pnfs_block_extent *be)
  96{
  97        struct pnfs_block_extent *left = ext_tree_prev(be);
  98
  99        if (left && ext_can_merge(left, be)) {
 100                left->be_length += be->be_length;
 101                rb_erase(&be->be_node, root);
 102                nfs4_put_deviceid_node(be->be_device);
 103                kfree(be);
 104                return left;
 105        }
 106
 107        return be;
 108}
 109
 110static struct pnfs_block_extent *
 111ext_try_to_merge_right(struct rb_root *root, struct pnfs_block_extent *be)
 112{
 113        struct pnfs_block_extent *right = ext_tree_next(be);
 114
 115        if (right && ext_can_merge(be, right)) {
 116                be->be_length += right->be_length;
 117                rb_erase(&right->be_node, root);
 118                nfs4_put_deviceid_node(right->be_device);
 119                kfree(right);
 120        }
 121
 122        return be;
 123}
 124
 125static void __ext_put_deviceids(struct list_head *head)
 126{
 127        struct pnfs_block_extent *be, *tmp;
 128
 129        list_for_each_entry_safe(be, tmp, head, be_list) {
 130                nfs4_put_deviceid_node(be->be_device);
 131                kfree(be);
 132        }
 133}
 134
 135static void
 136__ext_tree_insert(struct rb_root *root,
 137                struct pnfs_block_extent *new, bool merge_ok)
 138{
 139        struct rb_node **p = &root->rb_node, *parent = NULL;
 140        struct pnfs_block_extent *be;
 141
 142        while (*p) {
 143                parent = *p;
 144                be = ext_node(parent);
 145
 146                if (new->be_f_offset < be->be_f_offset) {
 147                        if (merge_ok && ext_can_merge(new, be)) {
 148                                be->be_f_offset = new->be_f_offset;
 149                                if (be->be_state != PNFS_BLOCK_NONE_DATA)
 150                                        be->be_v_offset = new->be_v_offset;
 151                                be->be_length += new->be_length;
 152                                be = ext_try_to_merge_left(root, be);
 153                                goto free_new;
 154                        }
 155                        p = &(*p)->rb_left;
 156                } else if (new->be_f_offset >= ext_f_end(be)) {
 157                        if (merge_ok && ext_can_merge(be, new)) {
 158                                be->be_length += new->be_length;
 159                                be = ext_try_to_merge_right(root, be);
 160                                goto free_new;
 161                        }
 162                        p = &(*p)->rb_right;
 163                } else {
 164                        BUG();
 165                }
 166        }
 167
 168        rb_link_node(&new->be_node, parent, p);
 169        rb_insert_color(&new->be_node, root);
 170        return;
 171free_new:
 172        nfs4_put_deviceid_node(new->be_device);
 173        kfree(new);
 174}
 175
 176static int
 177__ext_tree_remove(struct rb_root *root,
 178                sector_t start, sector_t end, struct list_head *tmp)
 179{
 180        struct pnfs_block_extent *be;
 181        sector_t len1 = 0, len2 = 0;
 182        sector_t orig_v_offset;
 183        sector_t orig_len;
 184
 185        be = __ext_tree_search(root, start);
 186        if (!be)
 187                return 0;
 188        if (be->be_f_offset >= end)
 189                return 0;
 190
 191        orig_v_offset = be->be_v_offset;
 192        orig_len = be->be_length;
 193
 194        if (start > be->be_f_offset)
 195                len1 = start - be->be_f_offset;
 196        if (ext_f_end(be) > end)
 197                len2 = ext_f_end(be) - end;
 198
 199        if (len2 > 0) {
 200                if (len1 > 0) {
 201                        struct pnfs_block_extent *new;
 202
 203                        new = kzalloc(sizeof(*new), GFP_ATOMIC);
 204                        if (!new)
 205                                return -ENOMEM;
 206
 207                        be->be_length = len1;
 208
 209                        new->be_f_offset = end;
 210                        if (be->be_state != PNFS_BLOCK_NONE_DATA) {
 211                                new->be_v_offset =
 212                                        orig_v_offset + orig_len - len2;
 213                        }
 214                        new->be_length = len2;
 215                        new->be_state = be->be_state;
 216                        new->be_tag = be->be_tag;
 217                        new->be_device = nfs4_get_deviceid(be->be_device);
 218
 219                        __ext_tree_insert(root, new, true);
 220                } else {
 221                        be->be_f_offset = end;
 222                        if (be->be_state != PNFS_BLOCK_NONE_DATA) {
 223                                be->be_v_offset =
 224                                        orig_v_offset + orig_len - len2;
 225                        }
 226                        be->be_length = len2;
 227                }
 228        } else {
 229                if (len1 > 0) {
 230                        be->be_length = len1;
 231                        be = ext_tree_next(be);
 232                }
 233
 234                while (be && ext_f_end(be) <= end) {
 235                        struct pnfs_block_extent *next = ext_tree_next(be);
 236
 237                        rb_erase(&be->be_node, root);
 238                        list_add_tail(&be->be_list, tmp);
 239                        be = next;
 240                }
 241
 242                if (be && be->be_f_offset < end) {
 243                        len1 = ext_f_end(be) - end;
 244                        be->be_f_offset = end;
 245                        if (be->be_state != PNFS_BLOCK_NONE_DATA)
 246                                be->be_v_offset += be->be_length - len1;
 247                        be->be_length = len1;
 248                }
 249        }
 250
 251        return 0;
 252}
 253
 254int
 255ext_tree_insert(struct pnfs_block_layout *bl, struct pnfs_block_extent *new)
 256{
 257        struct pnfs_block_extent *be;
 258        struct rb_root *root;
 259        int err = 0;
 260
 261        switch (new->be_state) {
 262        case PNFS_BLOCK_READWRITE_DATA:
 263        case PNFS_BLOCK_INVALID_DATA:
 264                root = &bl->bl_ext_rw;
 265                break;
 266        case PNFS_BLOCK_READ_DATA:
 267        case PNFS_BLOCK_NONE_DATA:
 268                root = &bl->bl_ext_ro;
 269                break;
 270        default:
 271                dprintk("invalid extent type\n");
 272                return -EINVAL;
 273        }
 274
 275        spin_lock(&bl->bl_ext_lock);
 276retry:
 277        be = __ext_tree_search(root, new->be_f_offset);
 278        if (!be || be->be_f_offset >= ext_f_end(new)) {
 279                __ext_tree_insert(root, new, true);
 280        } else if (new->be_f_offset >= be->be_f_offset) {
 281                if (ext_f_end(new) <= ext_f_end(be)) {
 282                        nfs4_put_deviceid_node(new->be_device);
 283                        kfree(new);
 284                } else {
 285                        sector_t new_len = ext_f_end(new) - ext_f_end(be);
 286                        sector_t diff = new->be_length - new_len;
 287
 288                        new->be_f_offset += diff;
 289                        new->be_v_offset += diff;
 290                        new->be_length = new_len;
 291                        goto retry;
 292                }
 293        } else if (ext_f_end(new) <= ext_f_end(be)) {
 294                new->be_length = be->be_f_offset - new->be_f_offset;
 295                __ext_tree_insert(root, new, true);
 296        } else {
 297                struct pnfs_block_extent *split;
 298                sector_t new_len = ext_f_end(new) - ext_f_end(be);
 299                sector_t diff = new->be_length - new_len;
 300
 301                split = kmemdup(new, sizeof(*new), GFP_ATOMIC);
 302                if (!split) {
 303                        err = -EINVAL;
 304                        goto out;
 305                }
 306
 307                split->be_length = be->be_f_offset - split->be_f_offset;
 308                split->be_device = nfs4_get_deviceid(new->be_device);
 309                __ext_tree_insert(root, split, true);
 310
 311                new->be_f_offset += diff;
 312                new->be_v_offset += diff;
 313                new->be_length = new_len;
 314                goto retry;
 315        }
 316out:
 317        spin_unlock(&bl->bl_ext_lock);
 318        return err;
 319}
 320
 321static bool
 322__ext_tree_lookup(struct rb_root *root, sector_t isect,
 323                struct pnfs_block_extent *ret)
 324{
 325        struct rb_node *node;
 326        struct pnfs_block_extent *be;
 327
 328        node = root->rb_node;
 329        while (node) {
 330                be = ext_node(node);
 331                if (isect < be->be_f_offset)
 332                        node = node->rb_left;
 333                else if (isect >= ext_f_end(be))
 334                        node = node->rb_right;
 335                else {
 336                        *ret = *be;
 337                        return true;
 338                }
 339        }
 340
 341        return false;
 342}
 343
 344bool
 345ext_tree_lookup(struct pnfs_block_layout *bl, sector_t isect,
 346            struct pnfs_block_extent *ret, bool rw)
 347{
 348        bool found = false;
 349
 350        spin_lock(&bl->bl_ext_lock);
 351        if (!rw)
 352                found = __ext_tree_lookup(&bl->bl_ext_ro, isect, ret);
 353        if (!found)
 354                found = __ext_tree_lookup(&bl->bl_ext_rw, isect, ret);
 355        spin_unlock(&bl->bl_ext_lock);
 356
 357        return found;
 358}
 359
 360int ext_tree_remove(struct pnfs_block_layout *bl, bool rw,
 361                sector_t start, sector_t end)
 362{
 363        int err, err2;
 364        LIST_HEAD(tmp);
 365
 366        spin_lock(&bl->bl_ext_lock);
 367        err = __ext_tree_remove(&bl->bl_ext_ro, start, end, &tmp);
 368        if (rw) {
 369                err2 = __ext_tree_remove(&bl->bl_ext_rw, start, end, &tmp);
 370                if (!err)
 371                        err = err2;
 372        }
 373        spin_unlock(&bl->bl_ext_lock);
 374
 375        __ext_put_deviceids(&tmp);
 376        return err;
 377}
 378
 379static int
 380ext_tree_split(struct rb_root *root, struct pnfs_block_extent *be,
 381                sector_t split)
 382{
 383        struct pnfs_block_extent *new;
 384        sector_t orig_len = be->be_length;
 385
 386        new = kzalloc(sizeof(*new), GFP_ATOMIC);
 387        if (!new)
 388                return -ENOMEM;
 389
 390        be->be_length = split - be->be_f_offset;
 391
 392        new->be_f_offset = split;
 393        if (be->be_state != PNFS_BLOCK_NONE_DATA)
 394                new->be_v_offset = be->be_v_offset + be->be_length;
 395        new->be_length = orig_len - be->be_length;
 396        new->be_state = be->be_state;
 397        new->be_tag = be->be_tag;
 398        new->be_device = nfs4_get_deviceid(be->be_device);
 399
 400        __ext_tree_insert(root, new, false);
 401        return 0;
 402}
 403
 404int
 405ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start,
 406                sector_t len, u64 lwb)
 407{
 408        struct rb_root *root = &bl->bl_ext_rw;
 409        sector_t end = start + len;
 410        struct pnfs_block_extent *be;
 411        int err = 0;
 412        LIST_HEAD(tmp);
 413
 414        spin_lock(&bl->bl_ext_lock);
 415        /*
 416         * First remove all COW extents or holes from written to range.
 417         */
 418        err = __ext_tree_remove(&bl->bl_ext_ro, start, end, &tmp);
 419        if (err)
 420                goto out;
 421
 422        /*
 423         * Then mark all invalid extents in the range as written to.
 424         */
 425        for (be = __ext_tree_search(root, start); be; be = ext_tree_next(be)) {
 426                if (be->be_f_offset >= end)
 427                        break;
 428
 429                if (be->be_state != PNFS_BLOCK_INVALID_DATA || be->be_tag)
 430                        continue;
 431
 432                if (be->be_f_offset < start) {
 433                        struct pnfs_block_extent *left = ext_tree_prev(be);
 434
 435                        if (left && ext_can_merge(left, be)) {
 436                                sector_t diff = start - be->be_f_offset;
 437
 438                                left->be_length += diff;
 439
 440                                be->be_f_offset += diff;
 441                                be->be_v_offset += diff;
 442                                be->be_length -= diff;
 443                        } else {
 444                                err = ext_tree_split(root, be, start);
 445                                if (err)
 446                                        goto out;
 447                        }
 448                }
 449
 450                if (ext_f_end(be) > end) {
 451                        struct pnfs_block_extent *right = ext_tree_next(be);
 452
 453                        if (right && ext_can_merge(be, right)) {
 454                                sector_t diff = end - be->be_f_offset;
 455
 456                                be->be_length -= diff;
 457
 458                                right->be_f_offset -= diff;
 459                                right->be_v_offset -= diff;
 460                                right->be_length += diff;
 461                        } else {
 462                                err = ext_tree_split(root, be, end);
 463                                if (err)
 464                                        goto out;
 465                        }
 466                }
 467
 468                if (be->be_f_offset >= start && ext_f_end(be) <= end) {
 469                        be->be_tag = EXTENT_WRITTEN;
 470                        be = ext_try_to_merge_left(root, be);
 471                        be = ext_try_to_merge_right(root, be);
 472                }
 473        }
 474out:
 475        if (bl->bl_lwb < lwb)
 476                bl->bl_lwb = lwb;
 477        spin_unlock(&bl->bl_ext_lock);
 478
 479        __ext_put_deviceids(&tmp);
 480        return err;
 481}
 482
 483static size_t ext_tree_layoutupdate_size(struct pnfs_block_layout *bl, size_t count)
 484{
 485        if (bl->bl_scsi_layout)
 486                return sizeof(__be32) + PNFS_SCSI_RANGE_SIZE * count;
 487        else
 488                return sizeof(__be32) + PNFS_BLOCK_EXTENT_SIZE * count;
 489}
 490
 491static void ext_tree_free_commitdata(struct nfs4_layoutcommit_args *arg,
 492                size_t buffer_size)
 493{
 494        if (arg->layoutupdate_pages != &arg->layoutupdate_page) {
 495                int nr_pages = DIV_ROUND_UP(buffer_size, PAGE_SIZE), i;
 496
 497                for (i = 0; i < nr_pages; i++)
 498                        put_page(arg->layoutupdate_pages[i]);
 499                vfree(arg->start_p);
 500                kfree(arg->layoutupdate_pages);
 501        } else {
 502                put_page(arg->layoutupdate_page);
 503        }
 504}
 505
 506static __be32 *encode_block_extent(struct pnfs_block_extent *be, __be32 *p)
 507{
 508        p = xdr_encode_opaque_fixed(p, be->be_device->deviceid.data,
 509                        NFS4_DEVICEID4_SIZE);
 510        p = xdr_encode_hyper(p, be->be_f_offset << SECTOR_SHIFT);
 511        p = xdr_encode_hyper(p, be->be_length << SECTOR_SHIFT);
 512        p = xdr_encode_hyper(p, 0LL);
 513        *p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA);
 514        return p;
 515}
 516
 517static __be32 *encode_scsi_range(struct pnfs_block_extent *be, __be32 *p)
 518{
 519        p = xdr_encode_hyper(p, be->be_f_offset << SECTOR_SHIFT);
 520        return xdr_encode_hyper(p, be->be_length << SECTOR_SHIFT);
 521}
 522
 523static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p,
 524                size_t buffer_size, size_t *count, __u64 *lastbyte)
 525{
 526        struct pnfs_block_extent *be;
 527        int ret = 0;
 528
 529        spin_lock(&bl->bl_ext_lock);
 530        for (be = ext_tree_first(&bl->bl_ext_rw); be; be = ext_tree_next(be)) {
 531                if (be->be_state != PNFS_BLOCK_INVALID_DATA ||
 532                    be->be_tag != EXTENT_WRITTEN)
 533                        continue;
 534
 535                (*count)++;
 536                if (ext_tree_layoutupdate_size(bl, *count) > buffer_size) {
 537                        /* keep counting.. */
 538                        ret = -ENOSPC;
 539                        continue;
 540                }
 541
 542                if (bl->bl_scsi_layout)
 543                        p = encode_scsi_range(be, p);
 544                else
 545                        p = encode_block_extent(be, p);
 546                be->be_tag = EXTENT_COMMITTING;
 547        }
 548        *lastbyte = bl->bl_lwb - 1;
 549        bl->bl_lwb = 0;
 550        spin_unlock(&bl->bl_ext_lock);
 551
 552        return ret;
 553}
 554
 555int
 556ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg)
 557{
 558        struct pnfs_block_layout *bl = BLK_LO2EXT(NFS_I(arg->inode)->layout);
 559        size_t count = 0, buffer_size = PAGE_SIZE;
 560        __be32 *start_p;
 561        int ret;
 562
 563        dprintk("%s enter\n", __func__);
 564
 565        arg->layoutupdate_page = alloc_page(GFP_NOFS);
 566        if (!arg->layoutupdate_page)
 567                return -ENOMEM;
 568        start_p = page_address(arg->layoutupdate_page);
 569        arg->layoutupdate_pages = &arg->layoutupdate_page;
 570
 571retry:
 572        ret = ext_tree_encode_commit(bl, start_p + 1, buffer_size, &count, &arg->lastbytewritten);
 573        if (unlikely(ret)) {
 574                ext_tree_free_commitdata(arg, buffer_size);
 575
 576                buffer_size = ext_tree_layoutupdate_size(bl, count);
 577                count = 0;
 578
 579                arg->layoutupdate_pages =
 580                        kcalloc(DIV_ROUND_UP(buffer_size, PAGE_SIZE),
 581                                sizeof(struct page *), GFP_NOFS);
 582                if (!arg->layoutupdate_pages)
 583                        return -ENOMEM;
 584
 585                start_p = __vmalloc(buffer_size, GFP_NOFS);
 586                if (!start_p) {
 587                        kfree(arg->layoutupdate_pages);
 588                        return -ENOMEM;
 589                }
 590
 591                goto retry;
 592        }
 593
 594        *start_p = cpu_to_be32(count);
 595        arg->layoutupdate_len = ext_tree_layoutupdate_size(bl, count);
 596
 597        if (unlikely(arg->layoutupdate_pages != &arg->layoutupdate_page)) {
 598                void *p = start_p, *end = p + arg->layoutupdate_len;
 599                struct page *page = NULL;
 600                int i = 0;
 601
 602                arg->start_p = start_p;
 603                for ( ; p < end; p += PAGE_SIZE) {
 604                        page = vmalloc_to_page(p);
 605                        arg->layoutupdate_pages[i++] = page;
 606                        get_page(page);
 607                }
 608        }
 609
 610        dprintk("%s found %zu ranges\n", __func__, count);
 611        return 0;
 612}
 613
 614void
 615ext_tree_mark_committed(struct nfs4_layoutcommit_args *arg, int status)
 616{
 617        struct pnfs_block_layout *bl = BLK_LO2EXT(NFS_I(arg->inode)->layout);
 618        struct rb_root *root = &bl->bl_ext_rw;
 619        struct pnfs_block_extent *be;
 620
 621        dprintk("%s status %d\n", __func__, status);
 622
 623        ext_tree_free_commitdata(arg, arg->layoutupdate_len);
 624
 625        spin_lock(&bl->bl_ext_lock);
 626        for (be = ext_tree_first(root); be; be = ext_tree_next(be)) {
 627                if (be->be_state != PNFS_BLOCK_INVALID_DATA ||
 628                    be->be_tag != EXTENT_COMMITTING)
 629                        continue;
 630
 631                if (status) {
 632                        /*
 633                         * Mark as written and try again.
 634                         *
 635                         * XXX: some real error handling here wouldn't hurt..
 636                         */
 637                        be->be_tag = EXTENT_WRITTEN;
 638                } else {
 639                        be->be_state = PNFS_BLOCK_READWRITE_DATA;
 640                        be->be_tag = 0;
 641                }
 642
 643                be = ext_try_to_merge_left(root, be);
 644                be = ext_try_to_merge_right(root, be);
 645        }
 646        spin_unlock(&bl->bl_ext_lock);
 647}
 648