/* linux/fs/nfs/blocklayout/extent_tree.c */
/*
 * Copyright (c) 2014-2016 Christoph Hellwig.
 */
   4
   5#include <linux/vmalloc.h>
   6
   7#include "blocklayout.h"
   8
   9#define NFSDBG_FACILITY         NFSDBG_PNFS_LD
  10
  11static inline struct pnfs_block_extent *
  12ext_node(struct rb_node *node)
  13{
  14        return rb_entry(node, struct pnfs_block_extent, be_node);
  15}
  16
  17static struct pnfs_block_extent *
  18ext_tree_first(struct rb_root *root)
  19{
  20        struct rb_node *node = rb_first(root);
  21        return node ? ext_node(node) : NULL;
  22}
  23
  24static struct pnfs_block_extent *
  25ext_tree_prev(struct pnfs_block_extent *be)
  26{
  27        struct rb_node *node = rb_prev(&be->be_node);
  28        return node ? ext_node(node) : NULL;
  29}
  30
  31static struct pnfs_block_extent *
  32ext_tree_next(struct pnfs_block_extent *be)
  33{
  34        struct rb_node *node = rb_next(&be->be_node);
  35        return node ? ext_node(node) : NULL;
  36}
  37
  38static inline sector_t
  39ext_f_end(struct pnfs_block_extent *be)
  40{
  41        return be->be_f_offset + be->be_length;
  42}
  43
  44static struct pnfs_block_extent *
  45__ext_tree_search(struct rb_root *root, sector_t start)
  46{
  47        struct rb_node *node = root->rb_node;
  48        struct pnfs_block_extent *be = NULL;
  49
  50        while (node) {
  51                be = ext_node(node);
  52                if (start < be->be_f_offset)
  53                        node = node->rb_left;
  54                else if (start >= ext_f_end(be))
  55                        node = node->rb_right;
  56                else
  57                        return be;
  58        }
  59
  60        if (be) {
  61                if (start < be->be_f_offset)
  62                        return be;
  63
  64                if (start >= ext_f_end(be))
  65                        return ext_tree_next(be);
  66        }
  67
  68        return NULL;
  69}
  70
  71static bool
  72ext_can_merge(struct pnfs_block_extent *be1, struct pnfs_block_extent *be2)
  73{
  74        if (be1->be_state != be2->be_state)
  75                return false;
  76        if (be1->be_device != be2->be_device)
  77                return false;
  78
  79        if (be1->be_f_offset + be1->be_length != be2->be_f_offset)
  80                return false;
  81
  82        if (be1->be_state != PNFS_BLOCK_NONE_DATA &&
  83            (be1->be_v_offset + be1->be_length != be2->be_v_offset))
  84                return false;
  85
  86        if (be1->be_state == PNFS_BLOCK_INVALID_DATA &&
  87            be1->be_tag != be2->be_tag)
  88                return false;
  89
  90        return true;
  91}
  92
  93static struct pnfs_block_extent *
  94ext_try_to_merge_left(struct rb_root *root, struct pnfs_block_extent *be)
  95{
  96        struct pnfs_block_extent *left = ext_tree_prev(be);
  97
  98        if (left && ext_can_merge(left, be)) {
  99                left->be_length += be->be_length;
 100                rb_erase(&be->be_node, root);
 101                nfs4_put_deviceid_node(be->be_device);
 102                kfree(be);
 103                return left;
 104        }
 105
 106        return be;
 107}
 108
 109static struct pnfs_block_extent *
 110ext_try_to_merge_right(struct rb_root *root, struct pnfs_block_extent *be)
 111{
 112        struct pnfs_block_extent *right = ext_tree_next(be);
 113
 114        if (right && ext_can_merge(be, right)) {
 115                be->be_length += right->be_length;
 116                rb_erase(&right->be_node, root);
 117                nfs4_put_deviceid_node(right->be_device);
 118                kfree(right);
 119        }
 120
 121        return be;
 122}
 123
 124static void __ext_put_deviceids(struct list_head *head)
 125{
 126        struct pnfs_block_extent *be, *tmp;
 127
 128        list_for_each_entry_safe(be, tmp, head, be_list) {
 129                nfs4_put_deviceid_node(be->be_device);
 130                kfree(be);
 131        }
 132}
 133
/*
 * Insert @new into @root, keyed by file offset.
 *
 * The caller guarantees @new does not overlap any existing extent
 * (an overlap trips the BUG() below).  When @merge_ok is set, @new
 * may instead be folded into an adjacent compatible extent, in which
 * case @new itself is freed and its device reference dropped.
 */
static void
__ext_tree_insert(struct rb_root *root,
		struct pnfs_block_extent *new, bool merge_ok)
{
	struct rb_node **p = &root->rb_node, *parent = NULL;
	struct pnfs_block_extent *be;

	while (*p) {
		parent = *p;
		be = ext_node(parent);

		if (new->be_f_offset < be->be_f_offset) {
			if (merge_ok && ext_can_merge(new, be)) {
				/* Extend @be downward to absorb @new. */
				be->be_f_offset = new->be_f_offset;
				if (be->be_state != PNFS_BLOCK_NONE_DATA)
					be->be_v_offset = new->be_v_offset;
				be->be_length += new->be_length;
				be = ext_try_to_merge_left(root, be);
				goto free_new;
			}
			p = &(*p)->rb_left;
		} else if (new->be_f_offset >= ext_f_end(be)) {
			if (merge_ok && ext_can_merge(be, new)) {
				/* Extend @be upward to absorb @new. */
				be->be_length += new->be_length;
				be = ext_try_to_merge_right(root, be);
				goto free_new;
			}
			p = &(*p)->rb_right;
		} else {
			/* Overlapping extent: caller bug. */
			BUG();
		}
	}

	rb_link_node(&new->be_node, parent, p);
	rb_insert_color(&new->be_node, root);
	return;
free_new:
	/* @new was merged away; release its reference and memory. */
	nfs4_put_deviceid_node(new->be_device);
	kfree(new);
}
 174
/*
 * Remove the range [start, end) from @root.
 *
 * Extents straddling a boundary are trimmed (or, if a single extent
 * spans the whole range, split in two).  Extents falling entirely
 * inside the range are unlinked and collected on @tmp so the caller
 * can free them after dropping the tree lock.
 *
 * Returns 0, or -ENOMEM if a split needs an allocation that fails.
 */
static int
__ext_tree_remove(struct rb_root *root,
		sector_t start, sector_t end, struct list_head *tmp)
{
	struct pnfs_block_extent *be;
	sector_t len1 = 0, len2 = 0;
	sector_t orig_v_offset;
	sector_t orig_len;

	be = __ext_tree_search(root, start);
	if (!be)
		return 0;	/* nothing at or after @start */
	if (be->be_f_offset >= end)
		return 0;	/* first candidate starts beyond the range */

	orig_v_offset = be->be_v_offset;
	orig_len = be->be_length;

	/* len1/len2: surviving head/tail lengths of the first extent. */
	if (start > be->be_f_offset)
		len1 = start - be->be_f_offset;
	if (ext_f_end(be) > end)
		len2 = ext_f_end(be) - end;

	if (len2 > 0) {
		if (len1 > 0) {
			/*
			 * @be covers the whole range: keep the head in
			 * @be and insert a fresh extent for the tail.
			 */
			struct pnfs_block_extent *new;

			new = kzalloc(sizeof(*new), GFP_ATOMIC);
			if (!new)
				return -ENOMEM;

			be->be_length = len1;

			new->be_f_offset = end;
			if (be->be_state != PNFS_BLOCK_NONE_DATA) {
				new->be_v_offset =
					orig_v_offset + orig_len - len2;
			}
			new->be_length = len2;
			new->be_state = be->be_state;
			new->be_tag = be->be_tag;
			/* Tail keeps its own reference on the device. */
			new->be_device = nfs4_get_deviceid(be->be_device);

			__ext_tree_insert(root, new, true);
		} else {
			/* Only the tail survives: shift @be past @end. */
			be->be_f_offset = end;
			if (be->be_state != PNFS_BLOCK_NONE_DATA) {
				be->be_v_offset =
					orig_v_offset + orig_len - len2;
			}
			be->be_length = len2;
		}
	} else {
		if (len1 > 0) {
			/* Trim the tail of the first extent, move on. */
			be->be_length = len1;
			be = ext_tree_next(be);
		}

		/* Unlink every extent fully contained in the range. */
		while (be && ext_f_end(be) <= end) {
			struct pnfs_block_extent *next = ext_tree_next(be);

			rb_erase(&be->be_node, root);
			list_add_tail(&be->be_list, tmp);
			be = next;
		}

		/* Trim the head of an extent straddling @end. */
		if (be && be->be_f_offset < end) {
			len1 = ext_f_end(be) - end;
			be->be_f_offset = end;
			if (be->be_state != PNFS_BLOCK_NONE_DATA)
				be->be_v_offset += be->be_length - len1;
			be->be_length = len1;
		}
	}

	return 0;
}
 252
 253int
 254ext_tree_insert(struct pnfs_block_layout *bl, struct pnfs_block_extent *new)
 255{
 256        struct pnfs_block_extent *be;
 257        struct rb_root *root;
 258        int err = 0;
 259
 260        switch (new->be_state) {
 261        case PNFS_BLOCK_READWRITE_DATA:
 262        case PNFS_BLOCK_INVALID_DATA:
 263                root = &bl->bl_ext_rw;
 264                break;
 265        case PNFS_BLOCK_READ_DATA:
 266        case PNFS_BLOCK_NONE_DATA:
 267                root = &bl->bl_ext_ro;
 268                break;
 269        default:
 270                dprintk("invalid extent type\n");
 271                return -EINVAL;
 272        }
 273
 274        spin_lock(&bl->bl_ext_lock);
 275retry:
 276        be = __ext_tree_search(root, new->be_f_offset);
 277        if (!be || be->be_f_offset >= ext_f_end(new)) {
 278                __ext_tree_insert(root, new, true);
 279        } else if (new->be_f_offset >= be->be_f_offset) {
 280                if (ext_f_end(new) <= ext_f_end(be)) {
 281                        nfs4_put_deviceid_node(new->be_device);
 282                        kfree(new);
 283                } else {
 284                        sector_t new_len = ext_f_end(new) - ext_f_end(be);
 285                        sector_t diff = new->be_length - new_len;
 286
 287                        new->be_f_offset += diff;
 288                        new->be_v_offset += diff;
 289                        new->be_length = new_len;
 290                        goto retry;
 291                }
 292        } else if (ext_f_end(new) <= ext_f_end(be)) {
 293                new->be_length = be->be_f_offset - new->be_f_offset;
 294                __ext_tree_insert(root, new, true);
 295        } else {
 296                struct pnfs_block_extent *split;
 297                sector_t new_len = ext_f_end(new) - ext_f_end(be);
 298                sector_t diff = new->be_length - new_len;
 299
 300                split = kmemdup(new, sizeof(*new), GFP_ATOMIC);
 301                if (!split) {
 302                        err = -EINVAL;
 303                        goto out;
 304                }
 305
 306                split->be_length = be->be_f_offset - split->be_f_offset;
 307                split->be_device = nfs4_get_deviceid(new->be_device);
 308                __ext_tree_insert(root, split, true);
 309
 310                new->be_f_offset += diff;
 311                new->be_v_offset += diff;
 312                new->be_length = new_len;
 313                goto retry;
 314        }
 315out:
 316        spin_unlock(&bl->bl_ext_lock);
 317        return err;
 318}
 319
 320static bool
 321__ext_tree_lookup(struct rb_root *root, sector_t isect,
 322                struct pnfs_block_extent *ret)
 323{
 324        struct rb_node *node;
 325        struct pnfs_block_extent *be;
 326
 327        node = root->rb_node;
 328        while (node) {
 329                be = ext_node(node);
 330                if (isect < be->be_f_offset)
 331                        node = node->rb_left;
 332                else if (isect >= ext_f_end(be))
 333                        node = node->rb_right;
 334                else {
 335                        *ret = *be;
 336                        return true;
 337                }
 338        }
 339
 340        return false;
 341}
 342
 343bool
 344ext_tree_lookup(struct pnfs_block_layout *bl, sector_t isect,
 345            struct pnfs_block_extent *ret, bool rw)
 346{
 347        bool found = false;
 348
 349        spin_lock(&bl->bl_ext_lock);
 350        if (!rw)
 351                found = __ext_tree_lookup(&bl->bl_ext_ro, isect, ret);
 352        if (!found)
 353                found = __ext_tree_lookup(&bl->bl_ext_rw, isect, ret);
 354        spin_unlock(&bl->bl_ext_lock);
 355
 356        return found;
 357}
 358
 359int ext_tree_remove(struct pnfs_block_layout *bl, bool rw,
 360                sector_t start, sector_t end)
 361{
 362        int err, err2;
 363        LIST_HEAD(tmp);
 364
 365        spin_lock(&bl->bl_ext_lock);
 366        err = __ext_tree_remove(&bl->bl_ext_ro, start, end, &tmp);
 367        if (rw) {
 368                err2 = __ext_tree_remove(&bl->bl_ext_rw, start, end, &tmp);
 369                if (!err)
 370                        err = err2;
 371        }
 372        spin_unlock(&bl->bl_ext_lock);
 373
 374        __ext_put_deviceids(&tmp);
 375        return err;
 376}
 377
/*
 * Split @be at sector @split.  @be keeps the head; a newly allocated
 * extent inheriting state, tag and its own device reference is
 * inserted for the tail.
 *
 * Note: be_length must be updated before the tail's v_offset and
 * length are derived from it.  The insert is done with merge_ok =
 * false so the halves stay separate.  Returns 0 or -ENOMEM.
 */
static int
ext_tree_split(struct rb_root *root, struct pnfs_block_extent *be,
		sector_t split)
{
	struct pnfs_block_extent *new;
	sector_t orig_len = be->be_length;

	new = kzalloc(sizeof(*new), GFP_ATOMIC);
	if (!new)
		return -ENOMEM;

	/* Shrink the head first; the tail is computed from it below. */
	be->be_length = split - be->be_f_offset;

	new->be_f_offset = split;
	if (be->be_state != PNFS_BLOCK_NONE_DATA)
		new->be_v_offset = be->be_v_offset + be->be_length;
	new->be_length = orig_len - be->be_length;
	new->be_state = be->be_state;
	new->be_tag = be->be_tag;
	new->be_device = nfs4_get_deviceid(be->be_device);

	__ext_tree_insert(root, new, false);
	return 0;
}
 402
 403int
 404ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start,
 405                sector_t len, u64 lwb)
 406{
 407        struct rb_root *root = &bl->bl_ext_rw;
 408        sector_t end = start + len;
 409        struct pnfs_block_extent *be;
 410        int err = 0;
 411        LIST_HEAD(tmp);
 412
 413        spin_lock(&bl->bl_ext_lock);
 414        /*
 415         * First remove all COW extents or holes from written to range.
 416         */
 417        err = __ext_tree_remove(&bl->bl_ext_ro, start, end, &tmp);
 418        if (err)
 419                goto out;
 420
 421        /*
 422         * Then mark all invalid extents in the range as written to.
 423         */
 424        for (be = __ext_tree_search(root, start); be; be = ext_tree_next(be)) {
 425                if (be->be_f_offset >= end)
 426                        break;
 427
 428                if (be->be_state != PNFS_BLOCK_INVALID_DATA || be->be_tag)
 429                        continue;
 430
 431                if (be->be_f_offset < start) {
 432                        struct pnfs_block_extent *left = ext_tree_prev(be);
 433
 434                        if (left && ext_can_merge(left, be)) {
 435                                sector_t diff = start - be->be_f_offset;
 436
 437                                left->be_length += diff;
 438
 439                                be->be_f_offset += diff;
 440                                be->be_v_offset += diff;
 441                                be->be_length -= diff;
 442                        } else {
 443                                err = ext_tree_split(root, be, start);
 444                                if (err)
 445                                        goto out;
 446                        }
 447                }
 448
 449                if (ext_f_end(be) > end) {
 450                        struct pnfs_block_extent *right = ext_tree_next(be);
 451
 452                        if (right && ext_can_merge(be, right)) {
 453                                sector_t diff = end - be->be_f_offset;
 454
 455                                be->be_length -= diff;
 456
 457                                right->be_f_offset -= diff;
 458                                right->be_v_offset -= diff;
 459                                right->be_length += diff;
 460                        } else {
 461                                err = ext_tree_split(root, be, end);
 462                                if (err)
 463                                        goto out;
 464                        }
 465                }
 466
 467                if (be->be_f_offset >= start && ext_f_end(be) <= end) {
 468                        be->be_tag = EXTENT_WRITTEN;
 469                        be = ext_try_to_merge_left(root, be);
 470                        be = ext_try_to_merge_right(root, be);
 471                }
 472        }
 473out:
 474        if (bl->bl_lwb < lwb)
 475                bl->bl_lwb = lwb;
 476        spin_unlock(&bl->bl_ext_lock);
 477
 478        __ext_put_deviceids(&tmp);
 479        return err;
 480}
 481
 482static size_t ext_tree_layoutupdate_size(struct pnfs_block_layout *bl, size_t count)
 483{
 484        if (bl->bl_scsi_layout)
 485                return sizeof(__be32) + PNFS_SCSI_RANGE_SIZE * count;
 486        else
 487                return sizeof(__be32) + PNFS_BLOCK_EXTENT_SIZE * count;
 488}
 489
 490static void ext_tree_free_commitdata(struct nfs4_layoutcommit_args *arg,
 491                size_t buffer_size)
 492{
 493        if (arg->layoutupdate_pages != &arg->layoutupdate_page) {
 494                int nr_pages = DIV_ROUND_UP(buffer_size, PAGE_SIZE), i;
 495
 496                for (i = 0; i < nr_pages; i++)
 497                        put_page(arg->layoutupdate_pages[i]);
 498                vfree(arg->start_p);
 499                kfree(arg->layoutupdate_pages);
 500        } else {
 501                put_page(arg->layoutupdate_page);
 502        }
 503}
 504
/*
 * Encode one extent as a block-layout layoutupdate entry: deviceid,
 * file offset and length in bytes, storage offset (always 0 here),
 * and the extent state — always READWRITE_DATA, since only
 * written-to INVALID extents are committed.  Returns the advanced
 * XDR position.
 */
static __be32 *encode_block_extent(struct pnfs_block_extent *be, __be32 *p)
{
	p = xdr_encode_opaque_fixed(p, be->be_device->deviceid.data,
			NFS4_DEVICEID4_SIZE);
	p = xdr_encode_hyper(p, be->be_f_offset << SECTOR_SHIFT);
	p = xdr_encode_hyper(p, be->be_length << SECTOR_SHIFT);
	p = xdr_encode_hyper(p, 0LL);
	*p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA);
	return p;
}
 515
 516static __be32 *encode_scsi_range(struct pnfs_block_extent *be, __be32 *p)
 517{
 518        p = xdr_encode_hyper(p, be->be_f_offset << SECTOR_SHIFT);
 519        return xdr_encode_hyper(p, be->be_length << SECTOR_SHIFT);
 520}
 521
/*
 * Encode every written-but-uncommitted extent (INVALID_DATA tagged
 * EXTENT_WRITTEN) of the read-write tree into @p, advancing each one
 * to EXTENT_COMMITTING.
 *
 * *count is incremented for every matching extent even after the
 * buffer overflows, so the caller can size a retry buffer; in that
 * case -ENOSPC is returned and no further extents are encoded.
 *
 * *lastbyte is derived from bl_lwb, which is consumed (reset to 0).
 * NOTE(review): bl_lwb is consumed on the -ENOSPC path too, so a
 * retry recomputes *lastbyte from 0 — confirm this interaction with
 * ext_tree_prepare_commit is intended.
 */
static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p,
		size_t buffer_size, size_t *count, __u64 *lastbyte)
{
	struct pnfs_block_extent *be;
	int ret = 0;

	spin_lock(&bl->bl_ext_lock);
	for (be = ext_tree_first(&bl->bl_ext_rw); be; be = ext_tree_next(be)) {
		if (be->be_state != PNFS_BLOCK_INVALID_DATA ||
		    be->be_tag != EXTENT_WRITTEN)
			continue;

		(*count)++;
		if (ext_tree_layoutupdate_size(bl, *count) > buffer_size) {
			/* keep counting.. */
			ret = -ENOSPC;
			continue;
		}

		/* SCSI and block layouts use different wire formats. */
		if (bl->bl_scsi_layout)
			p = encode_scsi_range(be, p);
		else
			p = encode_block_extent(be, p);
		be->be_tag = EXTENT_COMMITTING;
	}
	*lastbyte = bl->bl_lwb - 1;
	bl->bl_lwb = 0;
	spin_unlock(&bl->bl_ext_lock);

	return ret;
}
 553
/*
 * Build the layoutupdate blob for a LAYOUTCOMMIT.
 *
 * A single page is tried first; if the extents do not fit, the
 * buffer is reallocated at the exact size counted by the failed
 * pass (vmalloc'ed, exposed through a page array for the RPC layer)
 * and encoding is retried.  The first __be32 of the buffer holds
 * the extent count.  Returns 0 or -ENOMEM.
 */
int
ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg)
{
	struct pnfs_block_layout *bl = BLK_LO2EXT(NFS_I(arg->inode)->layout);
	size_t count = 0, buffer_size = PAGE_SIZE;
	__be32 *start_p;
	int ret;

	dprintk("%s enter\n", __func__);

	arg->layoutupdate_page = alloc_page(GFP_NOFS);
	if (!arg->layoutupdate_page)
		return -ENOMEM;
	start_p = page_address(arg->layoutupdate_page);
	arg->layoutupdate_pages = &arg->layoutupdate_page;

retry:
	/* Encode after the leading count word (start_p + 1). */
	ret = ext_tree_encode_commit(bl, start_p + 1, buffer_size, &count, &arg->lastbytewritten);
	if (unlikely(ret)) {
		ext_tree_free_commitdata(arg, buffer_size);

		/* Resize to exactly what the failed pass counted. */
		buffer_size = ext_tree_layoutupdate_size(bl, count);
		count = 0;

		arg->layoutupdate_pages =
			kcalloc(DIV_ROUND_UP(buffer_size, PAGE_SIZE),
				sizeof(struct page *), GFP_NOFS);
		if (!arg->layoutupdate_pages)
			return -ENOMEM;

		start_p = __vmalloc(buffer_size, GFP_NOFS, PAGE_KERNEL);
		if (!start_p) {
			kfree(arg->layoutupdate_pages);
			return -ENOMEM;
		}

		goto retry;
	}

	*start_p = cpu_to_be32(count);
	arg->layoutupdate_len = ext_tree_layoutupdate_size(bl, count);

	if (unlikely(arg->layoutupdate_pages != &arg->layoutupdate_page)) {
		/* Populate the page array covering the vmalloc buffer. */
		void *p = start_p, *end = p + arg->layoutupdate_len;
		struct page *page = NULL;
		int i = 0;

		arg->start_p = start_p;
		for ( ; p < end; p += PAGE_SIZE) {
			page = vmalloc_to_page(p);
			arg->layoutupdate_pages[i++] = page;
			get_page(page);
		}
	}

	dprintk("%s found %zu ranges\n", __func__, count);
	return 0;
}
 612
/*
 * Post-LAYOUTCOMMIT processing, called with the RPC @status.
 *
 * Frees the layoutupdate buffer, then walks the read-write tree:
 * extents in EXTENT_COMMITTING become plain READWRITE_DATA on
 * success, or fall back to EXTENT_WRITTEN on failure so the next
 * commit retries them.  Converted extents are merged with
 * compatible neighbours.
 */
void
ext_tree_mark_committed(struct nfs4_layoutcommit_args *arg, int status)
{
	struct pnfs_block_layout *bl = BLK_LO2EXT(NFS_I(arg->inode)->layout);
	struct rb_root *root = &bl->bl_ext_rw;
	struct pnfs_block_extent *be;

	dprintk("%s status %d\n", __func__, status);

	ext_tree_free_commitdata(arg, arg->layoutupdate_len);

	spin_lock(&bl->bl_ext_lock);
	for (be = ext_tree_first(root); be; be = ext_tree_next(be)) {
		if (be->be_state != PNFS_BLOCK_INVALID_DATA ||
		    be->be_tag != EXTENT_COMMITTING)
			continue;

		if (status) {
			/*
			 * Mark as written and try again.
			 *
			 * XXX: some real error handling here wouldn't hurt..
			 */
			be->be_tag = EXTENT_WRITTEN;
		} else {
			be->be_state = PNFS_BLOCK_READWRITE_DATA;
			be->be_tag = 0;
		}

		be = ext_try_to_merge_left(root, be);
		be = ext_try_to_merge_right(root, be);
	}
	spin_unlock(&bl->bl_ext_lock);
}
 647