linux/net/ipv4/inet_fragment.c
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * inet fragments management
 *
 *              Authors:        Pavel Emelyanov <xemul@openvz.org>
 *                              Started as consolidation of ipv4/ip_fragment.c,
 *                              ipv6/reassembly.c and ipv6 nf conntrack reassembly
 */

#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/module.h>
#include <linux/timer.h>
#include <linux/mm.h>
#include <linux/random.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/slab.h>
#include <linux/rhashtable.h>

#include <net/sock.h>
#include <net/inet_frag.h>
#include <net/inet_ecn.h>
#include <net/ip.h>
#include <net/ipv6.h>

/* Use skb->cb to track consecutive/adjacent fragments coming at
 * the end of the queue. Nodes in the rb-tree queue will
 * contain "runs" of one or more adjacent fragments.
 *
 * Invariants:
 * - next_frag is NULL at the tail of a "run";
 * - the head of a "run" has the sum of all fragment lengths in frag_run_len.
 */
struct ipfrag_skb_cb {
        union {
                struct inet_skb_parm    h4;
                struct inet6_skb_parm   h6;
        };
        struct sk_buff          *next_frag;
        int                     frag_run_len;
};

#define FRAG_CB(skb)            ((struct ipfrag_skb_cb *)((skb)->cb))

static void fragcb_clear(struct sk_buff *skb)
{
        RB_CLEAR_NODE(&skb->rbnode);
        FRAG_CB(skb)->next_frag = NULL;
        FRAG_CB(skb)->frag_run_len = skb->len;
}

/* Append skb to the last "run". */
static void fragrun_append_to_last(struct inet_frag_queue *q,
                                   struct sk_buff *skb)
{
        fragcb_clear(skb);

        FRAG_CB(q->last_run_head)->frag_run_len += skb->len;
        FRAG_CB(q->fragments_tail)->next_frag = skb;
        q->fragments_tail = skb;
}

/* Create a new "run" with the skb. */
static void fragrun_create(struct inet_frag_queue *q, struct sk_buff *skb)
{
        BUILD_BUG_ON(sizeof(struct ipfrag_skb_cb) > sizeof(skb->cb));
        fragcb_clear(skb);

        if (q->last_run_head)
                rb_link_node(&skb->rbnode, &q->last_run_head->rbnode,
                             &q->last_run_head->rbnode.rb_right);
        else
                rb_link_node(&skb->rbnode, NULL, &q->rb_fragments.rb_node);
        rb_insert_color(&skb->rbnode, &q->rb_fragments);

        q->fragments_tail = skb;
        q->last_run_head = skb;
}

/* Given the OR values of all fragments, apply RFC 3168 5.3 requirements
 * Value : 0xff if frame should be dropped.
 *         0 or INET_ECN_CE value, to be ORed in to final iph->tos field
 */
const u8 ip_frag_ecn_table[16] = {
        /* at least one fragment had CE, and others ECT_0 or ECT_1 */
        [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0]                      = INET_ECN_CE,
        [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1]                      = INET_ECN_CE,
        [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1]   = INET_ECN_CE,

        /* invalid combinations : drop frame */
        [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE] = 0xff,
        [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0] = 0xff,
        [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_1] = 0xff,
        [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff,
        [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0] = 0xff,
        [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1] = 0xff,
        [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff,
};
EXPORT_SYMBOL(ip_frag_ecn_table);

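/* Register a fragment protocol: create the kmem cache backing its
 * inet_frag_queue allocations.  The refcount/completion pair lets
 * inet_frags_fini() wait until every fqdir using @f has been freed.
 */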
int inet_frags_init(struct inet_frags *f)
{
        f->frags_cachep = kmem_cache_create(f->frags_cache_name, f->qsize, 0, 0,
                                            NULL);
        if (!f->frags_cachep)
                return -ENOMEM;

        refcount_set(&f->refcnt, 1);
        init_completion(&f->completion);
        return 0;
}
EXPORT_SYMBOL(inet_frags_init);

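/* Unregister a fragment protocol: drop the initial reference, wait for
 * all remaining fqdir users to disappear, then destroy the kmem cache.
 */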
void inet_frags_fini(struct inet_frags *f)
{
        if (refcount_dec_and_test(&f->refcnt))
                complete(&f->completion);

        wait_for_completion(&f->completion);

        kmem_cache_destroy(f->frags_cachep);
        f->frags_cachep = NULL;
}
EXPORT_SYMBOL(inet_frags_fini);

/* called from rhashtable_free_and_destroy() at netns_frags dismantle */
static void inet_frags_free_cb(void *ptr, void *arg)
{
        struct inet_frag_queue *fq = ptr;
        int count;

        count = del_timer_sync(&fq->timer) ? 1 : 0;

        spin_lock_bh(&fq->lock);
        if (!(fq->flags & INET_FRAG_COMPLETE)) {
                fq->flags |= INET_FRAG_COMPLETE;
                count++;
        } else if (fq->flags & INET_FRAG_HASH_DEAD) {
                count++;
        }
        spin_unlock_bh(&fq->lock);

        if (refcount_sub_and_test(count, &fq->refcnt))
                inet_frag_destroy(fq);
}

static LLIST_HEAD(fqdir_free_list);

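/* Free the fqdirs batched on fqdir_free_list.  The rcu_barrier() makes
 * sure all pending inet_frag_destroy_rcu() callbacks, which still
 * dereference their fqdir, have run before the structures are freed.
 */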
static void fqdir_free_fn(struct work_struct *work)
{
        struct llist_node *kill_list;
        struct fqdir *fqdir, *tmp;
        struct inet_frags *f;

        /* Atomically snapshot the list of fqdirs to free */
        kill_list = llist_del_all(&fqdir_free_list);

        /* We need to make sure all ongoing call_rcu(..., inet_frag_destroy_rcu)
         * have completed, since they need to dereference fqdir.
         * Would it not be nice to have kfree_rcu_barrier() ? :)
         */
        rcu_barrier();

        llist_for_each_entry_safe(fqdir, tmp, kill_list, free_list) {
                f = fqdir->f;
                if (refcount_dec_and_test(&f->refcnt))
                        complete(&f->completion);

                kfree(fqdir);
        }
}

static DECLARE_WORK(fqdir_free_work, fqdir_free_fn);

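/* Per-fqdir destruction work: tear down the hash table (killing any
 * remaining queues through inet_frags_free_cb()), then queue the fqdir
 * on the batched free list handled by fqdir_free_fn().
 */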
static void fqdir_work_fn(struct work_struct *work)
{
        struct fqdir *fqdir = container_of(work, struct fqdir, destroy_work);

        rhashtable_free_and_destroy(&fqdir->rhashtable, inet_frags_free_cb, NULL);

        if (llist_add(&fqdir->free_list, &fqdir_free_list))
                queue_work(system_wq, &fqdir_free_work);
}

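/* Allocate and initialize a per-netns, per-protocol fragment directory
 * (hash table plus accounting).  Takes a reference on @f that is dropped
 * once the fqdir is finally freed.
 */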
int fqdir_init(struct fqdir **fqdirp, struct inet_frags *f, struct net *net)
{
        struct fqdir *fqdir = kzalloc(sizeof(*fqdir), GFP_KERNEL);
        int res;

        if (!fqdir)
                return -ENOMEM;
        fqdir->f = f;
        fqdir->net = net;
        res = rhashtable_init(&fqdir->rhashtable, &fqdir->f->rhash_params);
        if (res < 0) {
                kfree(fqdir);
                return res;
        }
        refcount_inc(&f->refcnt);
        *fqdirp = fqdir;
        return 0;
}
EXPORT_SYMBOL(fqdir_init);

static struct workqueue_struct *inet_frag_wq;

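/* Create the dedicated workqueue used to run fqdir destruction work;
 * done from a pure_initcall so it exists before any netns teardown.
 */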
static int __init inet_frag_wq_init(void)
{
        inet_frag_wq = create_workqueue("inet_frag_wq");
        if (!inet_frag_wq)
                panic("Could not create inet frag workq");
        return 0;
}

pure_initcall(inet_frag_wq_init);

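/* Start destroying a fragment directory at netns dismantle.  The hash
 * table teardown and final freeing are deferred to inet_frag_wq.
 */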
void fqdir_exit(struct fqdir *fqdir)
{
        INIT_WORK(&fqdir->destroy_work, fqdir_work_fn);
        queue_work(inet_frag_wq, &fqdir->destroy_work);
}
EXPORT_SYMBOL(fqdir_exit);

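/* Mark the queue complete and drop the references held by its timer and
 * by the hash table.  If the fqdir is already being dismantled
 * (fqdir->dead), the hash-side reference is instead released later by
 * inet_frags_free_cb(), signalled via INET_FRAG_HASH_DEAD.
 */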
void inet_frag_kill(struct inet_frag_queue *fq)
{
        if (del_timer(&fq->timer))
                refcount_dec(&fq->refcnt);

        if (!(fq->flags & INET_FRAG_COMPLETE)) {
                struct fqdir *fqdir = fq->fqdir;

                fq->flags |= INET_FRAG_COMPLETE;
                rcu_read_lock();
                /* The RCU read lock provides a memory barrier
                 * guaranteeing that if fqdir->dead is false then
                 * the hash table destruction will not start until
                 * after we unlock.  Paired with fqdir_pre_exit().
                 */
                if (!fqdir->dead) {
                        rhashtable_remove_fast(&fqdir->rhashtable, &fq->node,
                                               fqdir->f->rhash_params);
                        refcount_dec(&fq->refcnt);
                } else {
                        fq->flags |= INET_FRAG_HASH_DEAD;
                }
                rcu_read_unlock();
        }
}
EXPORT_SYMBOL(inet_frag_kill);

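/* RCU callback performing the final free: run the per-protocol destructor
 * and return the queue to its kmem cache.  fqdir freeing is deferred
 * behind rcu_barrier() (see fqdir_free_fn()), so q->fqdir is still valid
 * here.
 */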
static void inet_frag_destroy_rcu(struct rcu_head *head)
{
        struct inet_frag_queue *q = container_of(head, struct inet_frag_queue,
                                                 rcu);
        struct inet_frags *f = q->fqdir->f;

        if (f->destructor)
                f->destructor(q);
        kmem_cache_free(f->frags_cachep, q);
}

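/* Free every skb in the reassembly rb-tree, including run members chained
 * via next_frag, and return the total truesize released.
 */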
unsigned int inet_frag_rbtree_purge(struct rb_root *root)
{
        struct rb_node *p = rb_first(root);
        unsigned int sum = 0;

        while (p) {
                struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode);

                p = rb_next(p);
                rb_erase(&skb->rbnode, root);
                while (skb) {
                        struct sk_buff *next = FRAG_CB(skb)->next_frag;

                        sum += skb->truesize;
                        kfree_skb(skb);
                        skb = next;
                }
        }
        return sum;
}
EXPORT_SYMBOL(inet_frag_rbtree_purge);

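/* Final teardown of a killed queue: purge its fragments, schedule the
 * queue itself for freeing after a grace period and return the accounted
 * memory to the fqdir.
 */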
void inet_frag_destroy(struct inet_frag_queue *q)
{
        struct fqdir *fqdir;
        unsigned int sum, sum_truesize = 0;
        struct inet_frags *f;

        WARN_ON(!(q->flags & INET_FRAG_COMPLETE));
        WARN_ON(del_timer(&q->timer) != 0);

        /* Release all fragment data. */
        fqdir = q->fqdir;
        f = fqdir->f;
        sum_truesize = inet_frag_rbtree_purge(&q->rb_fragments);
        sum = sum_truesize + f->qsize;

        call_rcu(&q->rcu, inet_frag_destroy_rcu);

        sub_frag_mem_limit(fqdir, sum);
}
EXPORT_SYMBOL(inet_frag_destroy);

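/* Allocate and set up a new queue.  The initial refcount of 3 accounts
 * for the expiry timer, the hash table entry and the reference returned
 * to the caller (see inet_frag_create()/inet_frag_find()).
 */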
static struct inet_frag_queue *inet_frag_alloc(struct fqdir *fqdir,
                                               struct inet_frags *f,
                                               void *arg)
{
        struct inet_frag_queue *q;

        q = kmem_cache_zalloc(f->frags_cachep, GFP_ATOMIC);
        if (!q)
                return NULL;

        q->fqdir = fqdir;
        f->constructor(q, arg);
        add_frag_mem_limit(fqdir, f->qsize);

        timer_setup(&q->timer, f->frag_expire, 0);
        spin_lock_init(&q->lock);
        refcount_set(&q->refcnt, 3);

        return q;
}

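/* Allocate a queue, arm its timer and insert it into the hash table.
 * If another CPU inserted a queue with the same key first, free the new
 * one and return the winner through *prev.
 */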
static struct inet_frag_queue *inet_frag_create(struct fqdir *fqdir,
                                                void *arg,
                                                struct inet_frag_queue **prev)
{
        struct inet_frags *f = fqdir->f;
        struct inet_frag_queue *q;

        q = inet_frag_alloc(fqdir, f, arg);
        if (!q) {
                *prev = ERR_PTR(-ENOMEM);
                return NULL;
        }
        mod_timer(&q->timer, jiffies + fqdir->timeout);

        *prev = rhashtable_lookup_get_insert_key(&fqdir->rhashtable, &q->key,
                                                 &q->node, f->rhash_params);
        if (*prev) {
                q->flags |= INET_FRAG_COMPLETE;
                inet_frag_kill(q);
                inet_frag_destroy(q);
                return NULL;
        }
        return q;
}

/* TODO : call from rcu_read_lock() and no longer use refcount_inc_not_zero() */
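/* Look up the queue matching @key, creating it if it does not exist.
 * Returns NULL when the memory limit is exceeded or the queue is going
 * away; on success the caller owns one reference on the returned queue.
 */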
struct inet_frag_queue *inet_frag_find(struct fqdir *fqdir, void *key)
{
        struct inet_frag_queue *fq = NULL, *prev;

        if (!fqdir->high_thresh || frag_mem_limit(fqdir) > fqdir->high_thresh)
                return NULL;

        rcu_read_lock();

        prev = rhashtable_lookup(&fqdir->rhashtable, key, fqdir->f->rhash_params);
        if (!prev)
                fq = inet_frag_create(fqdir, key, &prev);
        if (!IS_ERR_OR_NULL(prev)) {
                fq = prev;
                if (!refcount_inc_not_zero(&fq->refcnt))
                        fq = NULL;
        }
        rcu_read_unlock();
        return fq;
}
EXPORT_SYMBOL(inet_frag_find);

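/* Insert @skb, covering bytes [offset, end), into the queue's rb-tree of
 * runs.  Returns IPFRAG_OK on success, IPFRAG_DUP if the fragment is
 * already fully covered by queued data (caller drops the skb), or
 * IPFRAG_OVERLAP if it partially overlaps (caller drops the whole queue,
 * per RFC 5722).
 */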
int inet_frag_queue_insert(struct inet_frag_queue *q, struct sk_buff *skb,
                           int offset, int end)
{
        struct sk_buff *last = q->fragments_tail;

        /* RFC5722, Section 4, amended by Errata ID : 3089
         *                          When reassembling an IPv6 datagram, if
         *   one or more its constituent fragments is determined to be an
         *   overlapping fragment, the entire datagram (and any constituent
         *   fragments) MUST be silently discarded.
         *
         * Duplicates, however, should be ignored (i.e. skb dropped, but the
         * queue/fragments kept for later reassembly).
         */
        if (!last)
                fragrun_create(q, skb);  /* First fragment. */
        else if (last->ip_defrag_offset + last->len < end) {
                /* This is the common case: skb goes to the end. */
                /* Detect and discard overlaps. */
                if (offset < last->ip_defrag_offset + last->len)
                        return IPFRAG_OVERLAP;
                if (offset == last->ip_defrag_offset + last->len)
                        fragrun_append_to_last(q, skb);
                else
                        fragrun_create(q, skb);
        } else {
                /* Binary search. Note that skb can become the first fragment,
                 * but not the last (covered above).
                 */
                struct rb_node **rbn, *parent;

                rbn = &q->rb_fragments.rb_node;
                do {
                        struct sk_buff *curr;
                        int curr_run_end;

                        parent = *rbn;
                        curr = rb_to_skb(parent);
                        curr_run_end = curr->ip_defrag_offset +
                                        FRAG_CB(curr)->frag_run_len;
                        if (end <= curr->ip_defrag_offset)
                                rbn = &parent->rb_left;
                        else if (offset >= curr_run_end)
                                rbn = &parent->rb_right;
                        else if (offset >= curr->ip_defrag_offset &&
                                 end <= curr_run_end)
                                return IPFRAG_DUP;
                        else
                                return IPFRAG_OVERLAP;
                } while (*rbn);
                /* Here we have parent properly set, and rbn pointing to
                 * one of its NULL left/right children. Insert skb.
                 */
                fragcb_clear(skb);
                rb_link_node(&skb->rbnode, parent, rbn);
                rb_insert_color(&skb->rbnode, &q->rb_fragments);
        }

        skb->ip_defrag_offset = offset;

        return IPFRAG_OK;
}
EXPORT_SYMBOL(inet_frag_queue_insert);

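/* Reshuffle the tree so that @skb (the fragment completing the datagram)
 * becomes its head, make sure the head is not cloned and, if it carries a
 * frag_list of its own, split that off into a separate skb.  Returns the
 * location where the remaining fragments should be chained (passed to
 * inet_frag_reasm_finish() as reasm_data), or NULL on allocation failure.
 */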
void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb,
                              struct sk_buff *parent)
{
        struct sk_buff *fp, *head = skb_rb_first(&q->rb_fragments);
        struct sk_buff **nextp;
        int delta;

        if (head != skb) {
                fp = skb_clone(skb, GFP_ATOMIC);
                if (!fp)
                        return NULL;
                FRAG_CB(fp)->next_frag = FRAG_CB(skb)->next_frag;
                if (RB_EMPTY_NODE(&skb->rbnode))
                        FRAG_CB(parent)->next_frag = fp;
                else
                        rb_replace_node(&skb->rbnode, &fp->rbnode,
                                        &q->rb_fragments);
                if (q->fragments_tail == skb)
                        q->fragments_tail = fp;
                skb_morph(skb, head);
                FRAG_CB(skb)->next_frag = FRAG_CB(head)->next_frag;
                rb_replace_node(&head->rbnode, &skb->rbnode,
                                &q->rb_fragments);
                consume_skb(head);
                head = skb;
        }
        WARN_ON(head->ip_defrag_offset != 0);

        delta = -head->truesize;

        /* Head of list must not be cloned. */
        if (skb_unclone(head, GFP_ATOMIC))
                return NULL;

        delta += head->truesize;
        if (delta)
                add_frag_mem_limit(q->fqdir, delta);

        /* If the first fragment is fragmented itself, we split
         * it into two chunks: the first with data and paged part
         * and the second, holding only fragments.
         */
        if (skb_has_frag_list(head)) {
                struct sk_buff *clone;
                int i, plen = 0;

                clone = alloc_skb(0, GFP_ATOMIC);
                if (!clone)
                        return NULL;
                skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list;
                skb_frag_list_init(head);
                for (i = 0; i < skb_shinfo(head)->nr_frags; i++)
                        plen += skb_frag_size(&skb_shinfo(head)->frags[i]);
                clone->data_len = head->data_len - plen;
                clone->len = clone->data_len;
                head->truesize += clone->truesize;
                clone->csum = 0;
                clone->ip_summed = head->ip_summed;
                add_frag_mem_limit(q->fqdir, clone->truesize);
                skb_shinfo(head)->frag_list = clone;
                nextp = &clone->next;
        } else {
                nextp = &skb_shinfo(head)->frag_list;
        }

        return nextp;
}
EXPORT_SYMBOL(inet_frag_reasm_prepare);

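/* Walk the rb-tree in order, splicing every remaining fragment onto the
 * head's frag_list (or coalescing it into the head when possible), update
 * length and checksum bookkeeping, and uncharge the reassembled memory
 * from the fqdir.
 */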
void inet_frag_reasm_finish(struct inet_frag_queue *q, struct sk_buff *head,
                            void *reasm_data, bool try_coalesce)
{
        struct sk_buff **nextp = (struct sk_buff **)reasm_data;
        struct rb_node *rbn;
        struct sk_buff *fp;
        int sum_truesize;

        skb_push(head, head->data - skb_network_header(head));

        /* Traverse the tree in order, to build frag_list. */
        fp = FRAG_CB(head)->next_frag;
        rbn = rb_next(&head->rbnode);
        rb_erase(&head->rbnode, &q->rb_fragments);

        sum_truesize = head->truesize;
        while (rbn || fp) {
                /* fp points to the next sk_buff in the current run;
                 * rbn points to the next run.
                 */
                /* Go through the current run. */
                while (fp) {
                        struct sk_buff *next_frag = FRAG_CB(fp)->next_frag;
                        bool stolen;
                        int delta;

                        sum_truesize += fp->truesize;
                        if (head->ip_summed != fp->ip_summed)
                                head->ip_summed = CHECKSUM_NONE;
                        else if (head->ip_summed == CHECKSUM_COMPLETE)
                                head->csum = csum_add(head->csum, fp->csum);

                        if (try_coalesce && skb_try_coalesce(head, fp, &stolen,
                                                             &delta)) {
                                kfree_skb_partial(fp, stolen);
                        } else {
                                fp->prev = NULL;
                                memset(&fp->rbnode, 0, sizeof(fp->rbnode));
                                fp->sk = NULL;

                                head->data_len += fp->len;
                                head->len += fp->len;
                                head->truesize += fp->truesize;

                                *nextp = fp;
                                nextp = &fp->next;
                        }

                        fp = next_frag;
                }
                /* Move to the next run. */
                if (rbn) {
                        struct rb_node *rbnext = rb_next(rbn);

                        fp = rb_to_skb(rbn);
                        rb_erase(rbn, &q->rb_fragments);
                        rbn = rbnext;
                }
        }
        sub_frag_mem_limit(q->fqdir, sum_truesize);

        *nextp = NULL;
        skb_mark_not_on_list(head);
        head->prev = NULL;
        head->tstamp = q->stamp;
}
EXPORT_SYMBOL(inet_frag_reasm_finish);

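/* Detach and return the first fragment of the queue, promoting the next
 * skb of its run (if any) into its place in the rb-tree.  Typically used
 * on the timeout path so the caller can report the incomplete datagram,
 * e.g. in an ICMP time exceeded error.
 */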
struct sk_buff *inet_frag_pull_head(struct inet_frag_queue *q)
{
        struct sk_buff *head, *skb;

        head = skb_rb_first(&q->rb_fragments);
        if (!head)
                return NULL;
        skb = FRAG_CB(head)->next_frag;
        if (skb)
                rb_replace_node(&head->rbnode, &skb->rbnode,
                                &q->rb_fragments);
        else
                rb_erase(&head->rbnode, &q->rb_fragments);
        memset(&head->rbnode, 0, sizeof(head->rbnode));
        barrier();

        if (head == q->fragments_tail)
                q->fragments_tail = NULL;

        sub_frag_mem_limit(q->fqdir, head->truesize);

        return head;
}
EXPORT_SYMBOL(inet_frag_pull_head);