linux/net/ipv4/inet_fragment.c
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * inet fragments management
 *
 *              Authors:        Pavel Emelyanov <xemul@openvz.org>
 *                              Started as consolidation of ipv4/ip_fragment.c,
 *                              ipv6/reassembly.c and ipv6 nf conntrack reassembly
 */

#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/module.h>
#include <linux/timer.h>
#include <linux/mm.h>
#include <linux/random.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/slab.h>
#include <linux/rhashtable.h>

#include <net/sock.h>
#include <net/inet_frag.h>
#include <net/inet_ecn.h>
#include <net/ip.h>
#include <net/ipv6.h>

/* Use skb->cb to track consecutive/adjacent fragments coming at
 * the end of the queue. Nodes in the rb-tree queue will
 * contain "runs" of one or more adjacent fragments.
 *
 * Invariants:
 * - next_frag is NULL at the tail of a "run";
 * - the head of a "run" has the sum of all fragment lengths in frag_run_len.
 */
struct ipfrag_skb_cb {
        union {
                struct inet_skb_parm    h4;
                struct inet6_skb_parm   h6;
        };
        struct sk_buff          *next_frag;
        int                     frag_run_len;
};

#define FRAG_CB(skb)            ((struct ipfrag_skb_cb *)((skb)->cb))

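/* Reset the per-skb reassembly state before (re)linking into the tree:
 * the skb starts out as a "run" of its own, covering just its own length.
 */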
static void fragcb_clear(struct sk_buff *skb)
{
        RB_CLEAR_NODE(&skb->rbnode);
        FRAG_CB(skb)->next_frag = NULL;
        FRAG_CB(skb)->frag_run_len = skb->len;
}

/* Append skb to the last "run". */
static void fragrun_append_to_last(struct inet_frag_queue *q,
                                   struct sk_buff *skb)
{
        fragcb_clear(skb);

        FRAG_CB(q->last_run_head)->frag_run_len += skb->len;
        FRAG_CB(q->fragments_tail)->next_frag = skb;
        q->fragments_tail = skb;
}

/* Create a new "run" with the skb. */
static void fragrun_create(struct inet_frag_queue *q, struct sk_buff *skb)
{
        BUILD_BUG_ON(sizeof(struct ipfrag_skb_cb) > sizeof(skb->cb));
        fragcb_clear(skb);

        if (q->last_run_head)
                rb_link_node(&skb->rbnode, &q->last_run_head->rbnode,
                             &q->last_run_head->rbnode.rb_right);
        else
                rb_link_node(&skb->rbnode, NULL, &q->rb_fragments.rb_node);
        rb_insert_color(&skb->rbnode, &q->rb_fragments);

        q->fragments_tail = skb;
        q->last_run_head = skb;
}

/* Given the OR values of all fragments, apply RFC 3168 5.3 requirements.
 * Value : 0xff if frame should be dropped.
 *         0 or INET_ECN_CE value, to be ORed into the final iph->tos field
 */
const u8 ip_frag_ecn_table[16] = {
        /* at least one fragment had CE, and others ECT_0 or ECT_1 */
        [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0]                      = INET_ECN_CE,
        [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1]                      = INET_ECN_CE,
        [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1]   = INET_ECN_CE,

        /* invalid combinations : drop frame */
        [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE] = 0xff,
        [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0] = 0xff,
        [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_1] = 0xff,
        [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff,
        [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0] = 0xff,
        [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1] = 0xff,
        [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff,
};
EXPORT_SYMBOL(ip_frag_ecn_table);

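/* Register a fragment-queue flavour: create the slab cache from which
 * struct inet_frag_queue objects of size f->qsize are allocated.
 */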
int inet_frags_init(struct inet_frags *f)
{
        f->frags_cachep = kmem_cache_create(f->frags_cache_name, f->qsize, 0, 0,
                                            NULL);
        if (!f->frags_cachep)
                return -ENOMEM;

        refcount_set(&f->refcnt, 1);
        init_completion(&f->completion);
        return 0;
}
EXPORT_SYMBOL(inet_frags_init);

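/* Undo inet_frags_init(): wait until the last fqdir holding a reference
 * on @f has been freed, then destroy the slab cache.
 */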
void inet_frags_fini(struct inet_frags *f)
{
        if (refcount_dec_and_test(&f->refcnt))
                complete(&f->completion);

        wait_for_completion(&f->completion);

        kmem_cache_destroy(f->frags_cachep);
        f->frags_cachep = NULL;
}
EXPORT_SYMBOL(inet_frags_fini);

/* called from rhashtable_free_and_destroy() at netns_frags dismantle */
static void inet_frags_free_cb(void *ptr, void *arg)
{
        struct inet_frag_queue *fq = ptr;
        int count;

        count = del_timer_sync(&fq->timer) ? 1 : 0;

        spin_lock_bh(&fq->lock);
        if (!(fq->flags & INET_FRAG_COMPLETE)) {
                fq->flags |= INET_FRAG_COMPLETE;
                count++;
        } else if (fq->flags & INET_FRAG_HASH_DEAD) {
                count++;
        }
        spin_unlock_bh(&fq->lock);

        if (refcount_sub_and_test(count, &fq->refcnt))
                inet_frag_destroy(fq);
}

static LLIST_HEAD(fqdir_free_list);

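/* Batched, deferred freeing of fqdir structures queued on fqdir_free_list.
 * By the time this runs, fqdir_work_fn() has already dismantled the hash
 * table of every fqdir on the list.
 */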
static void fqdir_free_fn(struct work_struct *work)
{
        struct llist_node *kill_list;
        struct fqdir *fqdir, *tmp;
        struct inet_frags *f;

        /* Atomically snapshot the list of fqdirs to free */
        kill_list = llist_del_all(&fqdir_free_list);

        /* We need to make sure all ongoing call_rcu(..., inet_frag_destroy_rcu)
         * have completed, since they need to dereference fqdir.
         * Would it not be nice to have kfree_rcu_barrier() ? :)
         */
        rcu_barrier();

        llist_for_each_entry_safe(fqdir, tmp, kill_list, free_list) {
                f = fqdir->f;
                if (refcount_dec_and_test(&f->refcnt))
                        complete(&f->completion);

                kfree(fqdir);
        }
}

static DECLARE_WORK(fqdir_free_work, fqdir_free_fn);

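/* Per-fqdir teardown work: free every queue still sitting in the hash
 * table, then queue the fqdir itself for batched freeing above.
 */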
static void fqdir_work_fn(struct work_struct *work)
{
        struct fqdir *fqdir = container_of(work, struct fqdir, destroy_work);

        rhashtable_free_and_destroy(&fqdir->rhashtable, inet_frags_free_cb, NULL);

        if (llist_add(&fqdir->free_list, &fqdir_free_list))
                queue_work(system_wq, &fqdir_free_work);
}

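/* Allocate and initialise one fqdir (per-netns fragment directory) and
 * take a reference on the owning struct inet_frags.
 */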
int fqdir_init(struct fqdir **fqdirp, struct inet_frags *f, struct net *net)
{
        struct fqdir *fqdir = kzalloc(sizeof(*fqdir), GFP_KERNEL);
        int res;

        if (!fqdir)
                return -ENOMEM;
        fqdir->f = f;
        fqdir->net = net;
        res = rhashtable_init(&fqdir->rhashtable, &fqdir->f->rhash_params);
        if (res < 0) {
                kfree(fqdir);
                return res;
        }
        refcount_inc(&f->refcnt);
        *fqdirp = fqdir;
        return 0;
}
EXPORT_SYMBOL(fqdir_init);

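/* Dedicated workqueue used to run fqdir destruction work items. */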
static struct workqueue_struct *inet_frag_wq;

static int __init inet_frag_wq_init(void)
{
        inet_frag_wq = create_workqueue("inet_frag_wq");
        if (!inet_frag_wq)
                panic("Could not create inet frag workq");
        return 0;
}

pure_initcall(inet_frag_wq_init);

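/* Called when a netns is dismantled: hand the rhashtable teardown off to
 * inet_frag_wq; the fqdir itself is freed later by fqdir_free_fn().
 */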
void fqdir_exit(struct fqdir *fqdir)
{
        INIT_WORK(&fqdir->destroy_work, fqdir_work_fn);
        queue_work(inet_frag_wq, &fqdir->destroy_work);
}
EXPORT_SYMBOL(fqdir_exit);

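/* Mark a queue complete and remove it from the hash table (unless the
 * table is already being torn down), dropping the references held by the
 * pending timer and by the hash table entry.
 */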
void inet_frag_kill(struct inet_frag_queue *fq)
{
        if (del_timer(&fq->timer))
                refcount_dec(&fq->refcnt);

        if (!(fq->flags & INET_FRAG_COMPLETE)) {
                struct fqdir *fqdir = fq->fqdir;

                fq->flags |= INET_FRAG_COMPLETE;
                rcu_read_lock();
                /* The RCU read lock provides a memory barrier
                 * guaranteeing that if fqdir->dead is false then
                 * the hash table destruction will not start until
                 * after we unlock.  Paired with fqdir_pre_exit().
                 */
                if (!READ_ONCE(fqdir->dead)) {
                        rhashtable_remove_fast(&fqdir->rhashtable, &fq->node,
                                               fqdir->f->rhash_params);
                        refcount_dec(&fq->refcnt);
                } else {
                        fq->flags |= INET_FRAG_HASH_DEAD;
                }
                rcu_read_unlock();
        }
}
EXPORT_SYMBOL(inet_frag_kill);

static void inet_frag_destroy_rcu(struct rcu_head *head)
{
        struct inet_frag_queue *q = container_of(head, struct inet_frag_queue,
                                                 rcu);
        struct inet_frags *f = q->fqdir->f;

        if (f->destructor)
                f->destructor(q);
        kmem_cache_free(f->frags_cachep, q);
}

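/* Free every skb held in the reassembly rb-tree, walking each run via
 * FRAG_CB()->next_frag, and return the total truesize released.
 */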
unsigned int inet_frag_rbtree_purge(struct rb_root *root)
{
        struct rb_node *p = rb_first(root);
        unsigned int sum = 0;

        while (p) {
                struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode);

                p = rb_next(p);
                rb_erase(&skb->rbnode, root);
                while (skb) {
                        struct sk_buff *next = FRAG_CB(skb)->next_frag;

                        sum += skb->truesize;
                        kfree_skb(skb);
                        skb = next;
                }
        }
        return sum;
}
EXPORT_SYMBOL(inet_frag_rbtree_purge);

void inet_frag_destroy(struct inet_frag_queue *q)
{
        struct fqdir *fqdir;
        unsigned int sum, sum_truesize = 0;
        struct inet_frags *f;

        WARN_ON(!(q->flags & INET_FRAG_COMPLETE));
        WARN_ON(del_timer(&q->timer) != 0);

        /* Release all fragment data. */
        fqdir = q->fqdir;
        f = fqdir->f;
        sum_truesize = inet_frag_rbtree_purge(&q->rb_fragments);
        sum = sum_truesize + f->qsize;

        call_rcu(&q->rcu, inet_frag_destroy_rcu);

        sub_frag_mem_limit(fqdir, sum);
}
EXPORT_SYMBOL(inet_frag_destroy);

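/* Allocate a queue from the per-protocol slab cache.  The initial refcount
 * of 3 covers the timer, the hash table entry and the reference handed
 * back to the caller.
 */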
static struct inet_frag_queue *inet_frag_alloc(struct fqdir *fqdir,
                                               struct inet_frags *f,
                                               void *arg)
{
        struct inet_frag_queue *q;

        q = kmem_cache_zalloc(f->frags_cachep, GFP_ATOMIC);
        if (!q)
                return NULL;

        q->fqdir = fqdir;
        f->constructor(q, arg);
        add_frag_mem_limit(fqdir, f->qsize);

        timer_setup(&q->timer, f->frag_expire, 0);
        spin_lock_init(&q->lock);
        refcount_set(&q->refcnt, 3);

        return q;
}

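/* Allocate a new queue, arm its expiry timer and insert it into the hash
 * table.  If another CPU inserted the same key first, the new queue is
 * destroyed and *prev points at the existing entry.
 */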
static struct inet_frag_queue *inet_frag_create(struct fqdir *fqdir,
                                                void *arg,
                                                struct inet_frag_queue **prev)
{
        struct inet_frags *f = fqdir->f;
        struct inet_frag_queue *q;

        q = inet_frag_alloc(fqdir, f, arg);
        if (!q) {
                *prev = ERR_PTR(-ENOMEM);
                return NULL;
        }
        mod_timer(&q->timer, jiffies + fqdir->timeout);

        *prev = rhashtable_lookup_get_insert_key(&fqdir->rhashtable, &q->key,
                                                 &q->node, f->rhash_params);
        if (*prev) {
                q->flags |= INET_FRAG_COMPLETE;
                inet_frag_kill(q);
                inet_frag_destroy(q);
                return NULL;
        }
        return q;
}

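/* Look up the queue matching @key, creating it on demand.  On success the
 * returned queue carries a reference that the caller must drop when done.
 *
 * Illustrative caller sketch (not lifted from any particular protocol;
 * the real callers are the ipv4/ipv6/conntrack reassembly code):
 *
 *      q = inet_frag_find(fqdir, &key);
 *      if (!q)
 *              return -ENOMEM;
 *      spin_lock(&q->lock);
 *      ... insert the fragment, e.g. via inet_frag_queue_insert() ...
 *      spin_unlock(&q->lock);
 *      if (refcount_dec_and_test(&q->refcnt))
 *              inet_frag_destroy(q);
 */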
/* TODO : call from rcu_read_lock() and no longer use refcount_inc_not_zero() */
struct inet_frag_queue *inet_frag_find(struct fqdir *fqdir, void *key)
{
        /* This pairs with WRITE_ONCE() in fqdir_pre_exit(). */
        long high_thresh = READ_ONCE(fqdir->high_thresh);
        struct inet_frag_queue *fq = NULL, *prev;

        if (!high_thresh || frag_mem_limit(fqdir) > high_thresh)
                return NULL;

        rcu_read_lock();

        prev = rhashtable_lookup(&fqdir->rhashtable, key, fqdir->f->rhash_params);
        if (!prev)
                fq = inet_frag_create(fqdir, key, &prev);
        if (!IS_ERR_OR_NULL(prev)) {
                fq = prev;
                if (!refcount_inc_not_zero(&fq->refcnt))
                        fq = NULL;
        }
        rcu_read_unlock();
        return fq;
}
EXPORT_SYMBOL(inet_frag_find);

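/* Insert @skb, covering bytes [@offset, @end), into the queue's rb-tree of
 * fragment runs.  Returns IPFRAG_OK on success, IPFRAG_DUP if the fragment
 * falls entirely within an existing run, or IPFRAG_OVERLAP on a partial
 * overlap (in which case the caller is expected to discard the datagram).
 */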
int inet_frag_queue_insert(struct inet_frag_queue *q, struct sk_buff *skb,
                           int offset, int end)
{
        struct sk_buff *last = q->fragments_tail;

        /* RFC5722, Section 4, amended by Errata ID : 3089
         *                          When reassembling an IPv6 datagram, if
         *   one or more of its constituent fragments is determined to be an
         *   overlapping fragment, the entire datagram (and any constituent
         *   fragments) MUST be silently discarded.
         *
         * Duplicates, however, should be ignored (i.e. skb dropped, but the
         * queue/fragments kept for later reassembly).
         */
        if (!last)
                fragrun_create(q, skb);  /* First fragment. */
        else if (last->ip_defrag_offset + last->len < end) {
                /* This is the common case: skb goes to the end. */
                /* Detect and discard overlaps. */
                if (offset < last->ip_defrag_offset + last->len)
                        return IPFRAG_OVERLAP;
                if (offset == last->ip_defrag_offset + last->len)
                        fragrun_append_to_last(q, skb);
                else
                        fragrun_create(q, skb);
        } else {
                /* Binary search. Note that skb can become the first fragment,
                 * but not the last (covered above).
                 */
                struct rb_node **rbn, *parent;

                rbn = &q->rb_fragments.rb_node;
                do {
                        struct sk_buff *curr;
                        int curr_run_end;

                        parent = *rbn;
                        curr = rb_to_skb(parent);
                        curr_run_end = curr->ip_defrag_offset +
                                        FRAG_CB(curr)->frag_run_len;
                        if (end <= curr->ip_defrag_offset)
                                rbn = &parent->rb_left;
                        else if (offset >= curr_run_end)
                                rbn = &parent->rb_right;
                        else if (offset >= curr->ip_defrag_offset &&
                                 end <= curr_run_end)
                                return IPFRAG_DUP;
                        else
                                return IPFRAG_OVERLAP;
                } while (*rbn);
                /* Here we have parent properly set, and rbn pointing to
                 * one of its NULL left/right children. Insert skb.
                 */
                fragcb_clear(skb);
                rb_link_node(&skb->rbnode, parent, rbn);
                rb_insert_color(&skb->rbnode, &q->rb_fragments);
        }

        skb->ip_defrag_offset = offset;

        return IPFRAG_OK;
}
EXPORT_SYMBOL(inet_frag_queue_insert);

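/* Prepare for reassembly: turn @skb into the queue's head (replacing the
 * current first fragment if needed), ensure the head is not cloned, and
 * return the location where the remaining fragments should be chained
 * onto its frag_list.  Returns NULL on allocation failure.
 */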
void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb,
                              struct sk_buff *parent)
{
        struct sk_buff *fp, *head = skb_rb_first(&q->rb_fragments);
        struct sk_buff **nextp;
        int delta;

        if (head != skb) {
                fp = skb_clone(skb, GFP_ATOMIC);
                if (!fp)
                        return NULL;
                FRAG_CB(fp)->next_frag = FRAG_CB(skb)->next_frag;
                if (RB_EMPTY_NODE(&skb->rbnode))
                        FRAG_CB(parent)->next_frag = fp;
                else
                        rb_replace_node(&skb->rbnode, &fp->rbnode,
                                        &q->rb_fragments);
                if (q->fragments_tail == skb)
                        q->fragments_tail = fp;
                skb_morph(skb, head);
                FRAG_CB(skb)->next_frag = FRAG_CB(head)->next_frag;
                rb_replace_node(&head->rbnode, &skb->rbnode,
                                &q->rb_fragments);
                consume_skb(head);
                head = skb;
        }
        WARN_ON(head->ip_defrag_offset != 0);

        delta = -head->truesize;

        /* Head of list must not be cloned. */
        if (skb_unclone(head, GFP_ATOMIC))
                return NULL;

        delta += head->truesize;
        if (delta)
                add_frag_mem_limit(q->fqdir, delta);

        /* If the first fragment is fragmented itself, we split
         * it into two chunks: the first with data and paged part
         * and the second, holding only fragments.
         */
        if (skb_has_frag_list(head)) {
                struct sk_buff *clone;
                int i, plen = 0;

                clone = alloc_skb(0, GFP_ATOMIC);
                if (!clone)
                        return NULL;
                skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list;
                skb_frag_list_init(head);
                for (i = 0; i < skb_shinfo(head)->nr_frags; i++)
                        plen += skb_frag_size(&skb_shinfo(head)->frags[i]);
                clone->data_len = head->data_len - plen;
                clone->len = clone->data_len;
                head->truesize += clone->truesize;
                clone->csum = 0;
                clone->ip_summed = head->ip_summed;
                add_frag_mem_limit(q->fqdir, clone->truesize);
                skb_shinfo(head)->frag_list = clone;
                nextp = &clone->next;
        } else {
                nextp = &skb_shinfo(head)->frag_list;
        }

        return nextp;
}
EXPORT_SYMBOL(inet_frag_reasm_prepare);

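/* Build the final frag_list: walk the rb-tree in order, chaining (or
 * coalescing) every fragment onto the head prepared above, fix up length,
 * checksum and memory accounting, and stamp the head with the queue's
 * timestamp.
 */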
void inet_frag_reasm_finish(struct inet_frag_queue *q, struct sk_buff *head,
                            void *reasm_data, bool try_coalesce)
{
        struct sk_buff **nextp = reasm_data;
        struct rb_node *rbn;
        struct sk_buff *fp;
        int sum_truesize;

        skb_push(head, head->data - skb_network_header(head));

        /* Traverse the tree in order, to build frag_list. */
        fp = FRAG_CB(head)->next_frag;
        rbn = rb_next(&head->rbnode);
        rb_erase(&head->rbnode, &q->rb_fragments);

        sum_truesize = head->truesize;
        while (rbn || fp) {
                /* fp points to the next sk_buff in the current run;
                 * rbn points to the next run.
                 */
                /* Go through the current run. */
                while (fp) {
                        struct sk_buff *next_frag = FRAG_CB(fp)->next_frag;
                        bool stolen;
                        int delta;

                        sum_truesize += fp->truesize;
                        if (head->ip_summed != fp->ip_summed)
                                head->ip_summed = CHECKSUM_NONE;
                        else if (head->ip_summed == CHECKSUM_COMPLETE)
                                head->csum = csum_add(head->csum, fp->csum);

                        if (try_coalesce && skb_try_coalesce(head, fp, &stolen,
                                                             &delta)) {
                                kfree_skb_partial(fp, stolen);
                        } else {
                                fp->prev = NULL;
                                memset(&fp->rbnode, 0, sizeof(fp->rbnode));
                                fp->sk = NULL;

                                head->data_len += fp->len;
                                head->len += fp->len;
                                head->truesize += fp->truesize;

                                *nextp = fp;
                                nextp = &fp->next;
                        }

                        fp = next_frag;
                }
                /* Move to the next run. */
                if (rbn) {
                        struct rb_node *rbnext = rb_next(rbn);

                        fp = rb_to_skb(rbn);
                        rb_erase(rbn, &q->rb_fragments);
                        rbn = rbnext;
                }
        }
        sub_frag_mem_limit(q->fqdir, sum_truesize);

        *nextp = NULL;
        skb_mark_not_on_list(head);
        head->prev = NULL;
        head->tstamp = q->stamp;
        head->mono_delivery_time = q->mono_delivery_time;
}
EXPORT_SYMBOL(inet_frag_reasm_finish);

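/* Detach and return the first fragment of the queue, promoting the next
 * skb of its run (if any) into the rb-tree slot it occupied, and release
 * its memory accounting.
 */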
struct sk_buff *inet_frag_pull_head(struct inet_frag_queue *q)
{
        struct sk_buff *head, *skb;

        head = skb_rb_first(&q->rb_fragments);
        if (!head)
                return NULL;
        skb = FRAG_CB(head)->next_frag;
        if (skb)
                rb_replace_node(&head->rbnode, &skb->rbnode,
                                &q->rb_fragments);
        else
                rb_erase(&head->rbnode, &q->rb_fragments);
        memset(&head->rbnode, 0, sizeof(head->rbnode));
        barrier();

        if (head == q->fragments_tail)
                q->fragments_tail = NULL;

        sub_frag_mem_limit(q->fqdir, head->truesize);

        return head;
}
EXPORT_SYMBOL(inet_frag_pull_head);
