linux/net/ipv4/inet_fragment.c
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * inet fragments management
 *
 *              Authors:        Pavel Emelyanov <xemul@openvz.org>
 *                              Started as consolidation of ipv4/ip_fragment.c,
 *                              ipv6/reassembly.c and ipv6 nf conntrack reassembly
 */

#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/module.h>
#include <linux/timer.h>
#include <linux/mm.h>
#include <linux/random.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/slab.h>
#include <linux/rhashtable.h>

#include <net/sock.h>
#include <net/inet_frag.h>
#include <net/inet_ecn.h>
#include <net/ip.h>
#include <net/ipv6.h>

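/* This file implements the protocol-independent reassembly engine shared by
 * the ipv4, ipv6 and ipv6 nf conntrack reassemblers.  Roughly, a caller
 * registers its inet_frags with inet_frags_init() and allocates a per-netns
 * fqdir with fqdir_init(); for each arriving fragment it looks up (or
 * creates) a queue with inet_frag_find() and inserts the fragment with
 * inet_frag_queue_insert(); once the datagram is complete it rebuilds it
 * with inet_frag_reasm_prepare()/inet_frag_reasm_finish() and kills the
 * queue with inet_frag_kill().
 *
 * Illustrative sketch of the per-fragment path (the key type, locking and
 * error handling are the caller's own and are omitted here):
 *
 *      struct inet_frag_queue *q = inet_frag_find(fqdir, &key);
 *
 *      if (q) {
 *              spin_lock(&q->lock);
 *              err = inet_frag_queue_insert(q, skb, offset, end);
 *              ...
 *              spin_unlock(&q->lock);
 *              inet_frag_put(q);
 *      }
 */
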
/* Use skb->cb to track consecutive/adjacent fragments coming at
 * the end of the queue. Nodes in the rb-tree queue will
 * contain "runs" of one or more adjacent fragments.
 *
 * Invariants:
 * - next_frag is NULL at the tail of a "run";
 * - the head of a "run" has the sum of all fragment lengths in frag_run_len.
 */
struct ipfrag_skb_cb {
        union {
                struct inet_skb_parm    h4;
                struct inet6_skb_parm   h6;
        };
        struct sk_buff          *next_frag;
        int                     frag_run_len;
};

#define FRAG_CB(skb)            ((struct ipfrag_skb_cb *)((skb)->cb))

static void fragcb_clear(struct sk_buff *skb)
{
        RB_CLEAR_NODE(&skb->rbnode);
        FRAG_CB(skb)->next_frag = NULL;
        FRAG_CB(skb)->frag_run_len = skb->len;
}

/* Append skb to the last "run". */
static void fragrun_append_to_last(struct inet_frag_queue *q,
                                   struct sk_buff *skb)
{
        fragcb_clear(skb);

        FRAG_CB(q->last_run_head)->frag_run_len += skb->len;
        FRAG_CB(q->fragments_tail)->next_frag = skb;
        q->fragments_tail = skb;
}

/* Create a new "run" with the skb. */
static void fragrun_create(struct inet_frag_queue *q, struct sk_buff *skb)
{
        BUILD_BUG_ON(sizeof(struct ipfrag_skb_cb) > sizeof(skb->cb));
        fragcb_clear(skb);

        if (q->last_run_head)
                rb_link_node(&skb->rbnode, &q->last_run_head->rbnode,
                             &q->last_run_head->rbnode.rb_right);
        else
                rb_link_node(&skb->rbnode, NULL, &q->rb_fragments.rb_node);
        rb_insert_color(&skb->rbnode, &q->rb_fragments);

        q->fragments_tail = skb;
        q->last_run_head = skb;
}

/* Given the OR values of all fragments, apply RFC 3168 5.3 requirements.
 * Value : 0xff if the frame should be dropped.
 *         0 or INET_ECN_CE value, to be ORed into the final iph->tos field.
 */
const u8 ip_frag_ecn_table[16] = {
        /* at least one fragment had CE, and others ECT_0 or ECT_1 */
        [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0]                      = INET_ECN_CE,
        [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1]                      = INET_ECN_CE,
        [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1]   = INET_ECN_CE,

        /* invalid combinations : drop frame */
        [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE] = 0xff,
        [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0] = 0xff,
        [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_1] = 0xff,
        [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff,
        [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0] = 0xff,
        [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1] = 0xff,
        [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff,
};
EXPORT_SYMBOL(ip_frag_ecn_table);

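/* Register a reassembly protocol: create the kmem_cache backing its queue
 * objects (f->qsize bytes each) and initialize the refcount/completion pair
 * that inet_frags_fini() uses to wait for the last fqdir user.
 */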
int inet_frags_init(struct inet_frags *f)
{
        f->frags_cachep = kmem_cache_create(f->frags_cache_name, f->qsize, 0, 0,
                                            NULL);
        if (!f->frags_cachep)
                return -ENOMEM;

        refcount_set(&f->refcnt, 1);
        init_completion(&f->completion);
        return 0;
}
EXPORT_SYMBOL(inet_frags_init);

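/* Unregister a reassembly protocol: drop the initial reference taken in
 * inet_frags_init(), wait until every outstanding fqdir dismantle has
 * released its reference, then destroy the queue kmem_cache.
 */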
void inet_frags_fini(struct inet_frags *f)
{
        if (refcount_dec_and_test(&f->refcnt))
                complete(&f->completion);

        wait_for_completion(&f->completion);

        kmem_cache_destroy(f->frags_cachep);
        f->frags_cachep = NULL;
}
EXPORT_SYMBOL(inet_frags_fini);

/* called from rhashtable_free_and_destroy() at netns_frags dismantle */
static void inet_frags_free_cb(void *ptr, void *arg)
{
        struct inet_frag_queue *fq = ptr;
        int count;

        count = del_timer_sync(&fq->timer) ? 1 : 0;

        spin_lock_bh(&fq->lock);
        if (!(fq->flags & INET_FRAG_COMPLETE)) {
                fq->flags |= INET_FRAG_COMPLETE;
                count++;
        } else if (fq->flags & INET_FRAG_HASH_DEAD) {
                count++;
        }
        spin_unlock_bh(&fq->lock);

        if (refcount_sub_and_test(count, &fq->refcnt))
                inet_frag_destroy(fq);
}

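/* Deferred fqdir dismantle: free every queue still in the hash table via
 * inet_frags_free_cb(), wait for in-flight inet_frag_destroy_rcu() callbacks
 * that may still dereference the fqdir, then drop our reference on the
 * protocol descriptor and free the fqdir itself.
 */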
static void fqdir_work_fn(struct work_struct *work)
{
        struct fqdir *fqdir = container_of(work, struct fqdir, destroy_work);
        struct inet_frags *f = fqdir->f;

        rhashtable_free_and_destroy(&fqdir->rhashtable, inet_frags_free_cb, NULL);

        /* We need to make sure all ongoing call_rcu(..., inet_frag_destroy_rcu)
         * have completed, since they need to dereference fqdir.
         * Would it not be nice to have kfree_rcu_barrier() ? :)
         */
        rcu_barrier();

        if (refcount_dec_and_test(&f->refcnt))
                complete(&f->completion);

        kfree(fqdir);
}

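/* Allocate the per-netns, per-protocol fqdir and initialize its hash table.
 * A reference is taken on @f; it is dropped once fqdir_work_fn() has
 * finished dismantling the table.
 */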
int fqdir_init(struct fqdir **fqdirp, struct inet_frags *f, struct net *net)
{
        struct fqdir *fqdir = kzalloc(sizeof(*fqdir), GFP_KERNEL);
        int res;

        if (!fqdir)
                return -ENOMEM;
        fqdir->f = f;
        fqdir->net = net;
        res = rhashtable_init(&fqdir->rhashtable, &fqdir->f->rhash_params);
        if (res < 0) {
                kfree(fqdir);
                return res;
        }
        refcount_inc(&f->refcnt);
        *fqdirp = fqdir;
        return 0;
}
EXPORT_SYMBOL(fqdir_init);

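/* Schedule the dismantle of a fqdir; the heavy lifting (rhashtable teardown
 * and rcu_barrier()) runs later from fqdir_work_fn() on the system
 * workqueue.
 */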
void fqdir_exit(struct fqdir *fqdir)
{
        INIT_WORK(&fqdir->destroy_work, fqdir_work_fn);
        queue_work(system_wq, &fqdir->destroy_work);
}
EXPORT_SYMBOL(fqdir_exit);

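/* Mark a queue as dead: stop its timer, set INET_FRAG_COMPLETE and, unless
 * the whole fqdir is already being dismantled, unlink it from the hash
 * table, dropping the references held by the timer and by the table.
 */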
void inet_frag_kill(struct inet_frag_queue *fq)
{
        if (del_timer(&fq->timer))
                refcount_dec(&fq->refcnt);

        if (!(fq->flags & INET_FRAG_COMPLETE)) {
                struct fqdir *fqdir = fq->fqdir;

                fq->flags |= INET_FRAG_COMPLETE;
                rcu_read_lock();
                /* The RCU read lock provides a memory barrier
                 * guaranteeing that if fqdir->dead is false then
                 * the hash table destruction will not start until
                 * after we unlock.  Paired with fqdir_pre_exit().
                 */
                if (!fqdir->dead) {
                        rhashtable_remove_fast(&fqdir->rhashtable, &fq->node,
                                               fqdir->f->rhash_params);
                        refcount_dec(&fq->refcnt);
                } else {
                        fq->flags |= INET_FRAG_HASH_DEAD;
                }
                rcu_read_unlock();
        }
}
EXPORT_SYMBOL(inet_frag_kill);

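/* Final free of a queue, run after an RCU grace period so that lockless
 * hash table readers can no longer hold a pointer to it.
 */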
static void inet_frag_destroy_rcu(struct rcu_head *head)
{
        struct inet_frag_queue *q = container_of(head, struct inet_frag_queue,
                                                 rcu);
        struct inet_frags *f = q->fqdir->f;

        if (f->destructor)
                f->destructor(q);
        kmem_cache_free(f->frags_cachep, q);
}

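/* Free every skb queued in the rb-tree, following each "run" through
 * FRAG_CB()->next_frag, and return the total truesize released.
 */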
unsigned int inet_frag_rbtree_purge(struct rb_root *root)
{
        struct rb_node *p = rb_first(root);
        unsigned int sum = 0;

        while (p) {
                struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode);

                p = rb_next(p);
                rb_erase(&skb->rbnode, root);
                while (skb) {
                        struct sk_buff *next = FRAG_CB(skb)->next_frag;

                        sum += skb->truesize;
                        kfree_skb(skb);
                        skb = next;
                }
        }
        return sum;
}
EXPORT_SYMBOL(inet_frag_rbtree_purge);

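/* Release a dead queue: purge its fragments, return the accounted memory to
 * the fqdir and hand the queue object to RCU for the final kmem_cache_free()
 * in inet_frag_destroy_rcu().
 */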
void inet_frag_destroy(struct inet_frag_queue *q)
{
        struct fqdir *fqdir;
        unsigned int sum, sum_truesize = 0;
        struct inet_frags *f;

        WARN_ON(!(q->flags & INET_FRAG_COMPLETE));
        WARN_ON(del_timer(&q->timer) != 0);

        /* Release all fragment data. */
        fqdir = q->fqdir;
        f = fqdir->f;
        sum_truesize = inet_frag_rbtree_purge(&q->rb_fragments);
        sum = sum_truesize + f->qsize;

        call_rcu(&q->rcu, inet_frag_destroy_rcu);

        sub_frag_mem_limit(fqdir, sum);
}
EXPORT_SYMBOL(inet_frag_destroy);

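/* Allocate a fresh queue from the protocol kmem_cache.  The initial
 * refcount of 3 covers the pending timer, the hash table entry and the
 * reference handed back to the caller through inet_frag_find().
 */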
static struct inet_frag_queue *inet_frag_alloc(struct fqdir *fqdir,
                                               struct inet_frags *f,
                                               void *arg)
{
        struct inet_frag_queue *q;

        q = kmem_cache_zalloc(f->frags_cachep, GFP_ATOMIC);
        if (!q)
                return NULL;

        q->fqdir = fqdir;
        f->constructor(q, arg);
        add_frag_mem_limit(fqdir, f->qsize);

        timer_setup(&q->timer, f->frag_expire, 0);
        spin_lock_init(&q->lock);
        refcount_set(&q->refcnt, 3);

        return q;
}

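/* Allocate a queue, arm its expiry timer and try to insert it into the hash
 * table.  If another CPU won the race, *prev is set to the already-existing
 * queue (or an ERR_PTR on failure) and the new queue is torn down again.
 */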
static struct inet_frag_queue *inet_frag_create(struct fqdir *fqdir,
                                                void *arg,
                                                struct inet_frag_queue **prev)
{
        struct inet_frags *f = fqdir->f;
        struct inet_frag_queue *q;

        q = inet_frag_alloc(fqdir, f, arg);
        if (!q) {
                *prev = ERR_PTR(-ENOMEM);
                return NULL;
        }
        mod_timer(&q->timer, jiffies + fqdir->timeout);

        *prev = rhashtable_lookup_get_insert_key(&fqdir->rhashtable, &q->key,
                                                 &q->node, f->rhash_params);
        if (*prev) {
                q->flags |= INET_FRAG_COMPLETE;
                inet_frag_kill(q);
                inet_frag_destroy(q);
                return NULL;
        }
        return q;
}

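/* Look up the queue matching @key, creating it if it does not exist and the
 * memory limit allows.  On success a referenced queue is returned; the
 * caller must drop that reference (typically via inet_frag_put()).
 */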
/* TODO : call from rcu_read_lock() and no longer use refcount_inc_not_zero() */
struct inet_frag_queue *inet_frag_find(struct fqdir *fqdir, void *key)
{
        struct inet_frag_queue *fq = NULL, *prev;

        if (!fqdir->high_thresh || frag_mem_limit(fqdir) > fqdir->high_thresh)
                return NULL;

        rcu_read_lock();

        prev = rhashtable_lookup(&fqdir->rhashtable, key, fqdir->f->rhash_params);
        if (!prev)
                fq = inet_frag_create(fqdir, key, &prev);
        if (!IS_ERR_OR_NULL(prev)) {
                fq = prev;
                if (!refcount_inc_not_zero(&fq->refcnt))
                        fq = NULL;
        }
        rcu_read_unlock();
        return fq;
}
EXPORT_SYMBOL(inet_frag_find);

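/* Insert one fragment covering [offset, end) into the queue's rb-tree of
 * runs.  Returns IPFRAG_OK on success, IPFRAG_DUP for an exact duplicate the
 * caller should simply drop, or IPFRAG_OVERLAP when the fragment overlaps
 * already-queued data and the whole datagram must be discarded.
 *
 * Illustrative caller pattern (locking and the surrounding drop/kill paths
 * belong to the protocol-specific reassembler):
 *
 *      err = inet_frag_queue_insert(q, skb, offset, end);
 *      if (err == IPFRAG_DUP) {
 *              kfree_skb(skb);
 *      } else if (err == IPFRAG_OVERLAP) {
 *              inet_frag_kill(q);
 *              kfree_skb(skb);
 *      }
 */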
int inet_frag_queue_insert(struct inet_frag_queue *q, struct sk_buff *skb,
                           int offset, int end)
{
        struct sk_buff *last = q->fragments_tail;

        /* RFC5722, Section 4, amended by Errata ID : 3089
         *                          When reassembling an IPv6 datagram, if
         *   one or more of its constituent fragments is determined to be an
         *   overlapping fragment, the entire datagram (and any constituent
         *   fragments) MUST be silently discarded.
         *
         * Duplicates, however, should be ignored (i.e. skb dropped, but the
         * queue/fragments kept for later reassembly).
         */
        if (!last)
                fragrun_create(q, skb);  /* First fragment. */
        else if (last->ip_defrag_offset + last->len < end) {
                /* This is the common case: skb goes to the end. */
                /* Detect and discard overlaps. */
                if (offset < last->ip_defrag_offset + last->len)
                        return IPFRAG_OVERLAP;
                if (offset == last->ip_defrag_offset + last->len)
                        fragrun_append_to_last(q, skb);
                else
                        fragrun_create(q, skb);
        } else {
                /* Binary search. Note that skb can become the first fragment,
                 * but not the last (covered above).
                 */
                struct rb_node **rbn, *parent;

                rbn = &q->rb_fragments.rb_node;
                do {
                        struct sk_buff *curr;
                        int curr_run_end;

                        parent = *rbn;
                        curr = rb_to_skb(parent);
                        curr_run_end = curr->ip_defrag_offset +
                                        FRAG_CB(curr)->frag_run_len;
                        if (end <= curr->ip_defrag_offset)
                                rbn = &parent->rb_left;
                        else if (offset >= curr_run_end)
                                rbn = &parent->rb_right;
                        else if (offset >= curr->ip_defrag_offset &&
                                 end <= curr_run_end)
                                return IPFRAG_DUP;
                        else
                                return IPFRAG_OVERLAP;
                } while (*rbn);
                /* Here we have parent properly set, and rbn pointing to
                 * one of its NULL left/right children. Insert skb.
                 */
                fragcb_clear(skb);
                rb_link_node(&skb->rbnode, parent, rbn);
                rb_insert_color(&skb->rbnode, &q->rb_fragments);
        }

        skb->ip_defrag_offset = offset;

        return IPFRAG_OK;
}
EXPORT_SYMBOL(inet_frag_queue_insert);

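/* First step of reassembly: make @skb take over as the head of the queue
 * (morphing it into the old head and leaving a clone in @skb's former place
 * if it is not already first), unclone the head and, if the head carries a
 * frag list of its own, split that out into a separate skb.  Returns the
 * location where inet_frag_reasm_finish() should chain the remaining
 * fragments, or NULL on allocation failure.
 */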
void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb,
                              struct sk_buff *parent)
{
        struct sk_buff *fp, *head = skb_rb_first(&q->rb_fragments);
        struct sk_buff **nextp;
        int delta;

        if (head != skb) {
                fp = skb_clone(skb, GFP_ATOMIC);
                if (!fp)
                        return NULL;
                FRAG_CB(fp)->next_frag = FRAG_CB(skb)->next_frag;
                if (RB_EMPTY_NODE(&skb->rbnode))
                        FRAG_CB(parent)->next_frag = fp;
                else
                        rb_replace_node(&skb->rbnode, &fp->rbnode,
                                        &q->rb_fragments);
                if (q->fragments_tail == skb)
                        q->fragments_tail = fp;
                skb_morph(skb, head);
                FRAG_CB(skb)->next_frag = FRAG_CB(head)->next_frag;
                rb_replace_node(&head->rbnode, &skb->rbnode,
                                &q->rb_fragments);
                consume_skb(head);
                head = skb;
        }
        WARN_ON(head->ip_defrag_offset != 0);

        delta = -head->truesize;

        /* Head of list must not be cloned. */
        if (skb_unclone(head, GFP_ATOMIC))
                return NULL;

        delta += head->truesize;
        if (delta)
                add_frag_mem_limit(q->fqdir, delta);

        /* If the first fragment is fragmented itself, split it into two
         * chunks: the first with the data and paged part, and the second
         * holding only the frag_list fragments.
         */
        if (skb_has_frag_list(head)) {
                struct sk_buff *clone;
                int i, plen = 0;

                clone = alloc_skb(0, GFP_ATOMIC);
                if (!clone)
                        return NULL;
                skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list;
                skb_frag_list_init(head);
                for (i = 0; i < skb_shinfo(head)->nr_frags; i++)
                        plen += skb_frag_size(&skb_shinfo(head)->frags[i]);
                clone->data_len = head->data_len - plen;
                clone->len = clone->data_len;
                head->truesize += clone->truesize;
                clone->csum = 0;
                clone->ip_summed = head->ip_summed;
                add_frag_mem_limit(q->fqdir, clone->truesize);
                skb_shinfo(head)->frag_list = clone;
                nextp = &clone->next;
        } else {
                nextp = &skb_shinfo(head)->frag_list;
        }

        return nextp;
}
EXPORT_SYMBOL(inet_frag_reasm_prepare);

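/* Second step of reassembly: walk the remaining runs in offset order,
 * coalescing each fragment into @head or chaining it onto the frag_list
 * cursor returned by inet_frag_reasm_prepare(), and give the accounted
 * memory back to the fqdir.  On return @head holds the complete datagram
 * and the queue's rb-tree is empty.
 */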
void inet_frag_reasm_finish(struct inet_frag_queue *q, struct sk_buff *head,
                            void *reasm_data, bool try_coalesce)
{
        struct sk_buff **nextp = (struct sk_buff **)reasm_data;
        struct rb_node *rbn;
        struct sk_buff *fp;
        int sum_truesize;

        skb_push(head, head->data - skb_network_header(head));

        /* Traverse the tree in order, to build frag_list. */
        fp = FRAG_CB(head)->next_frag;
        rbn = rb_next(&head->rbnode);
        rb_erase(&head->rbnode, &q->rb_fragments);

        sum_truesize = head->truesize;
        while (rbn || fp) {
                /* fp points to the next sk_buff in the current run;
                 * rbn points to the next run.
                 */
                /* Go through the current run. */
                while (fp) {
                        struct sk_buff *next_frag = FRAG_CB(fp)->next_frag;
                        bool stolen;
                        int delta;

                        sum_truesize += fp->truesize;
                        if (head->ip_summed != fp->ip_summed)
                                head->ip_summed = CHECKSUM_NONE;
                        else if (head->ip_summed == CHECKSUM_COMPLETE)
                                head->csum = csum_add(head->csum, fp->csum);

                        if (try_coalesce && skb_try_coalesce(head, fp, &stolen,
                                                             &delta)) {
                                kfree_skb_partial(fp, stolen);
                        } else {
                                fp->prev = NULL;
                                memset(&fp->rbnode, 0, sizeof(fp->rbnode));
                                fp->sk = NULL;

                                head->data_len += fp->len;
                                head->len += fp->len;
                                head->truesize += fp->truesize;

                                *nextp = fp;
                                nextp = &fp->next;
                        }

                        fp = next_frag;
                }
                /* Move to the next run. */
                if (rbn) {
                        struct rb_node *rbnext = rb_next(rbn);

                        fp = rb_to_skb(rbn);
                        rb_erase(rbn, &q->rb_fragments);
                        rbn = rbnext;
                }
        }
        sub_frag_mem_limit(q->fqdir, sum_truesize);

        *nextp = NULL;
        skb_mark_not_on_list(head);
        head->prev = NULL;
        head->tstamp = q->stamp;
}
EXPORT_SYMBOL(inet_frag_reasm_finish);

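/* Detach and return the first fragment of the queue, uncharging its
 * truesize from the fqdir.  Its slot in the rb-tree is taken over by the
 * next skb of its run, if any.  Callers such as the expiry handlers use
 * this to get at the partial datagram's head (e.g. for an ICMP time
 * exceeded report).
 */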
struct sk_buff *inet_frag_pull_head(struct inet_frag_queue *q)
{
        struct sk_buff *head, *skb;

        head = skb_rb_first(&q->rb_fragments);
        if (!head)
                return NULL;
        skb = FRAG_CB(head)->next_frag;
        if (skb)
                rb_replace_node(&head->rbnode, &skb->rbnode,
                                &q->rb_fragments);
        else
                rb_erase(&head->rbnode, &q->rb_fragments);
        memset(&head->rbnode, 0, sizeof(head->rbnode));
        barrier();

        if (head == q->fragments_tail)
                q->fragments_tail = NULL;

        sub_frag_mem_limit(q->fqdir, head->truesize);

        return head;
}
EXPORT_SYMBOL(inet_frag_pull_head);