linux/net/ipv4/inet_fragment.c
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * inet fragments management
 *
 *              Authors:        Pavel Emelyanov <xemul@openvz.org>
 *                              Started as consolidation of ipv4/ip_fragment.c,
 *                              ipv6/reassembly.c and ipv6 nf conntrack reassembly
 */

#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/module.h>
#include <linux/timer.h>
#include <linux/mm.h>
#include <linux/random.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/slab.h>
#include <linux/rhashtable.h>

#include <net/sock.h>
#include <net/inet_frag.h>
#include <net/inet_ecn.h>
#include <net/ip.h>
#include <net/ipv6.h>

/* Use skb->cb to track consecutive/adjacent fragments coming at
 * the end of the queue. Nodes in the rb-tree queue will
 * contain "runs" of one or more adjacent fragments.
 *
 * Invariants:
 * - next_frag is NULL at the tail of a "run";
 * - the head of a "run" has the sum of all fragment lengths in frag_run_len.
 */
struct ipfrag_skb_cb {
        union {
                struct inet_skb_parm    h4;
                struct inet6_skb_parm   h6;
        };
        struct sk_buff          *next_frag;
        int                     frag_run_len;
};

#define FRAG_CB(skb)            ((struct ipfrag_skb_cb *)((skb)->cb))
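
/* Illustrative sketch of the invariants above: after receiving fragments
 * covering bytes 0-99, 100-199 and 400-499, the queue holds two rb-tree
 * nodes, the first of which is a two-skb run:
 *
 *      rb_fragments:  [ skb 0-99 ]                  [ skb 400-499 ]
 *                       frag_run_len = 200            frag_run_len = 100
 *                       next_frag -> [ skb 100-199 ]  next_frag = NULL
 *
 * Only run heads sit in the tree; in-sequence followers hang off next_frag,
 * so the common in-order arrival is appended at fragments_tail without
 * walking the tree.
 */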

static void fragcb_clear(struct sk_buff *skb)
{
        RB_CLEAR_NODE(&skb->rbnode);
        FRAG_CB(skb)->next_frag = NULL;
        FRAG_CB(skb)->frag_run_len = skb->len;
}

/* Append skb to the last "run". */
static void fragrun_append_to_last(struct inet_frag_queue *q,
                                   struct sk_buff *skb)
{
        fragcb_clear(skb);

        FRAG_CB(q->last_run_head)->frag_run_len += skb->len;
        FRAG_CB(q->fragments_tail)->next_frag = skb;
        q->fragments_tail = skb;
}

/* Create a new "run" with the skb. */
static void fragrun_create(struct inet_frag_queue *q, struct sk_buff *skb)
{
        BUILD_BUG_ON(sizeof(struct ipfrag_skb_cb) > sizeof(skb->cb));
        fragcb_clear(skb);

        if (q->last_run_head)
                rb_link_node(&skb->rbnode, &q->last_run_head->rbnode,
                             &q->last_run_head->rbnode.rb_right);
        else
                rb_link_node(&skb->rbnode, NULL, &q->rb_fragments.rb_node);
        rb_insert_color(&skb->rbnode, &q->rb_fragments);

        q->fragments_tail = skb;
        q->last_run_head = skb;
}

/* Given the OR values of all fragments, apply RFC 3168 5.3 requirements
 * Value : 0xff if frame should be dropped.
 *         0 or INET_ECN_CE value, to be ORed in to final iph->tos field
 */
const u8 ip_frag_ecn_table[16] = {
        /* at least one fragment had CE, and others ECT_0 or ECT_1 */
        [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0]                      = INET_ECN_CE,
        [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1]                      = INET_ECN_CE,
        [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1]   = INET_ECN_CE,

        /* invalid combinations : drop frame */
        [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE] = 0xff,
        [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0] = 0xff,
        [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_1] = 0xff,
        [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff,
        [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0] = 0xff,
        [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1] = 0xff,
        [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff,
};
EXPORT_SYMBOL(ip_frag_ecn_table);
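
/* Usage sketch: each protocol ORs the per-fragment IPFRAG_ECN_* bit into its
 * queue (ipv4 does roughly qp->ecn |= ip4_frag_ecn(iph->tos)), then indexes
 * this table with the accumulated mask at reassembly time, along the lines
 * of:
 *
 *      ecn = ip_frag_ecn_table[qp->ecn];
 *      if (unlikely(ecn == 0xff))
 *              goto out_fail;          // invalid ECN combination, drop
 *      iph->tos |= ecn;                // propagate CE if any fragment had it
 */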

int inet_frags_init(struct inet_frags *f)
{
        f->frags_cachep = kmem_cache_create(f->frags_cache_name, f->qsize, 0, 0,
                                            NULL);
        if (!f->frags_cachep)
                return -ENOMEM;

        return 0;
}
EXPORT_SYMBOL(inet_frags_init);

void inet_frags_fini(struct inet_frags *f)
{
        /* We must wait for all inet_frag_destroy_rcu() calls to complete. */
        rcu_barrier();

        kmem_cache_destroy(f->frags_cachep);
        f->frags_cachep = NULL;
}
EXPORT_SYMBOL(inet_frags_fini);
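
/* Usage sketch, following roughly what ipv4 does in ipfrag_init(): a protocol
 * fills in its struct inet_frags and registers the slab cache once at boot:
 *
 *      ip4_frags.constructor = ip4_frag_init;
 *      ip4_frags.destructor  = ip4_frag_free;
 *      ip4_frags.qsize       = sizeof(struct ipq);
 *      ip4_frags.frag_expire = ip_expire;
 *      ip4_frags.frags_cache_name = ip_frag_cache_name;
 *      ip4_frags.rhash_params = ip4_rhash_params;
 *      if (inet_frags_init(&ip4_frags))
 *              panic("IP: failed to allocate ip4_frags cache\n");
 *
 * inet_frags_fini() is the matching teardown on module unload.
 */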

static void inet_frags_free_cb(void *ptr, void *arg)
{
        struct inet_frag_queue *fq = ptr;

        /* If we cannot cancel the timer, this frag_queue is already
         * disappearing and there is nothing to do.
         * Otherwise, we own a refcount until the end of this function.
         */
        if (!del_timer(&fq->timer))
                return;

        spin_lock_bh(&fq->lock);
        if (!(fq->flags & INET_FRAG_COMPLETE)) {
                fq->flags |= INET_FRAG_COMPLETE;
                refcount_dec(&fq->refcnt);
        }
        spin_unlock_bh(&fq->lock);

        inet_frag_put(fq);
}

void inet_frags_exit_net(struct netns_frags *nf)
{
        nf->high_thresh = 0; /* prevent creation of new frags */

        rhashtable_free_and_destroy(&nf->rhashtable, inet_frags_free_cb, NULL);
}
EXPORT_SYMBOL(inet_frags_exit_net);
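
/* Called on the per-namespace exit path (ipv4 does this from
 * ipv4_frags_exit_net()). Zeroing nf->high_thresh first makes
 * inet_frag_find() below refuse to create new queues, so the rhashtable walk
 * can hand every remaining queue to inet_frags_free_cb() without racing
 * against new insertions.
 */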

void inet_frag_kill(struct inet_frag_queue *fq)
{
        if (del_timer(&fq->timer))
                refcount_dec(&fq->refcnt);

        if (!(fq->flags & INET_FRAG_COMPLETE)) {
                struct netns_frags *nf = fq->net;

                fq->flags |= INET_FRAG_COMPLETE;
                rhashtable_remove_fast(&nf->rhashtable, &fq->node, nf->f->rhash_params);
                refcount_dec(&fq->refcnt);
        }
}
EXPORT_SYMBOL(inet_frag_kill);

static void inet_frag_destroy_rcu(struct rcu_head *head)
{
        struct inet_frag_queue *q = container_of(head, struct inet_frag_queue,
                                                 rcu);
        struct inet_frags *f = q->net->f;

        if (f->destructor)
                f->destructor(q);
        kmem_cache_free(f->frags_cachep, q);
}

unsigned int inet_frag_rbtree_purge(struct rb_root *root)
{
        struct rb_node *p = rb_first(root);
        unsigned int sum = 0;

        while (p) {
                struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode);

                p = rb_next(p);
                rb_erase(&skb->rbnode, root);
                while (skb) {
                        struct sk_buff *next = FRAG_CB(skb)->next_frag;

                        sum += skb->truesize;
                        kfree_skb(skb);
                        skb = next;
                }
        }
        return sum;
}
EXPORT_SYMBOL(inet_frag_rbtree_purge);

void inet_frag_destroy(struct inet_frag_queue *q)
{
        struct netns_frags *nf;
        unsigned int sum, sum_truesize = 0;
        struct inet_frags *f;

        WARN_ON(!(q->flags & INET_FRAG_COMPLETE));
        WARN_ON(del_timer(&q->timer) != 0);

        /* Release all fragment data. */
        nf = q->net;
        f = nf->f;
        sum_truesize = inet_frag_rbtree_purge(&q->rb_fragments);
        sum = sum_truesize + f->qsize;

        call_rcu(&q->rcu, inet_frag_destroy_rcu);

        sub_frag_mem_limit(nf, sum);
}
EXPORT_SYMBOL(inet_frag_destroy);

static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
                                               struct inet_frags *f,
                                               void *arg)
{
        struct inet_frag_queue *q;

        q = kmem_cache_zalloc(f->frags_cachep, GFP_ATOMIC);
        if (!q)
                return NULL;

        q->net = nf;
        f->constructor(q, arg);
        add_frag_mem_limit(nf, f->qsize);

        timer_setup(&q->timer, f->frag_expire, 0);
        spin_lock_init(&q->lock);
        refcount_set(&q->refcnt, 3);

        return q;
}
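
/* The initial reference count of three set above matches the three holders of
 * a live queue: the expiration timer (dropped by del_timer() in
 * inet_frag_kill()), the rhashtable entry (dropped when inet_frag_kill()
 * removes it), and the reference returned to the caller of inet_frag_find(),
 * which is released with inet_frag_put().
 */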

static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf,
                                                void *arg,
                                                struct inet_frag_queue **prev)
{
        struct inet_frags *f = nf->f;
        struct inet_frag_queue *q;

        q = inet_frag_alloc(nf, f, arg);
        if (!q) {
                *prev = ERR_PTR(-ENOMEM);
                return NULL;
        }
        mod_timer(&q->timer, jiffies + nf->timeout);

        *prev = rhashtable_lookup_get_insert_key(&nf->rhashtable, &q->key,
                                                 &q->node, f->rhash_params);
        if (*prev) {
                q->flags |= INET_FRAG_COMPLETE;
                inet_frag_kill(q);
                inet_frag_destroy(q);
                return NULL;
        }
        return q;
}
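
/* rhashtable_lookup_get_insert_key() either inserts the new queue and returns
 * NULL, or returns the entry that another CPU inserted first (or an ERR_PTR
 * on failure). When *prev comes back non-NULL, the freshly allocated queue is
 * marked complete and torn down above, and the caller falls back to *prev if
 * it is not an ERR_PTR.
 */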

/* TODO : call from rcu_read_lock() and no longer use refcount_inc_not_zero() */
struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, void *key)
{
        struct inet_frag_queue *fq = NULL, *prev;

        if (!nf->high_thresh || frag_mem_limit(nf) > nf->high_thresh)
                return NULL;

        rcu_read_lock();

        prev = rhashtable_lookup(&nf->rhashtable, key, nf->f->rhash_params);
        if (!prev)
                fq = inet_frag_create(nf, key, &prev);
        if (prev && !IS_ERR(prev)) {
                fq = prev;
                if (!refcount_inc_not_zero(&fq->refcnt))
                        fq = NULL;
        }
        rcu_read_unlock();
        return fq;
}
EXPORT_SYMBOL(inet_frag_find);
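
/* Illustrative sketch, modelled on ipv4's ip_find() (the key and ipq types
 * are that caller's, not this file's): a protocol builds its rhashtable key
 * and embeds the generic queue in its own per-protocol structure:
 *
 *      struct frag_v4_compare_key key = { .saddr = iph->saddr, ... };
 *      struct inet_frag_queue *q;
 *
 *      q = inet_frag_find(&net->ipv4.frags, &key);
 *      if (!q)
 *              return NULL;
 *      return container_of(q, struct ipq, q);
 */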

int inet_frag_queue_insert(struct inet_frag_queue *q, struct sk_buff *skb,
                           int offset, int end)
{
        struct sk_buff *last = q->fragments_tail;

        /* RFC5722, Section 4, amended by Errata ID : 3089
         *                          When reassembling an IPv6 datagram, if
         *   one or more of its constituent fragments is determined to be an
         *   overlapping fragment, the entire datagram (and any constituent
         *   fragments) MUST be silently discarded.
         *
         * Duplicates, however, should be ignored (i.e. skb dropped, but the
         * queue/fragments kept for later reassembly).
         */
        if (!last)
                fragrun_create(q, skb);  /* First fragment. */
        else if (last->ip_defrag_offset + last->len < end) {
                /* This is the common case: skb goes to the end. */
                /* Detect and discard overlaps. */
                if (offset < last->ip_defrag_offset + last->len)
                        return IPFRAG_OVERLAP;
                if (offset == last->ip_defrag_offset + last->len)
                        fragrun_append_to_last(q, skb);
                else
                        fragrun_create(q, skb);
        } else {
                /* Binary search. Note that skb can become the first fragment,
                 * but not the last (covered above).
                 */
                struct rb_node **rbn, *parent;

                rbn = &q->rb_fragments.rb_node;
                do {
                        struct sk_buff *curr;
                        int curr_run_end;

                        parent = *rbn;
                        curr = rb_to_skb(parent);
                        curr_run_end = curr->ip_defrag_offset +
                                        FRAG_CB(curr)->frag_run_len;
                        if (end <= curr->ip_defrag_offset)
                                rbn = &parent->rb_left;
                        else if (offset >= curr_run_end)
                                rbn = &parent->rb_right;
                        else if (offset >= curr->ip_defrag_offset &&
                                 end <= curr_run_end)
                                return IPFRAG_DUP;
                        else
                                return IPFRAG_OVERLAP;
                } while (*rbn);
                /* Here we have parent properly set, and rbn pointing to
                 * one of its NULL left/right children. Insert skb.
                 */
                fragcb_clear(skb);
                rb_link_node(&skb->rbnode, parent, rbn);
                rb_insert_color(&skb->rbnode, &q->rb_fragments);
        }

        skb->ip_defrag_offset = offset;

        return IPFRAG_OK;
}
EXPORT_SYMBOL(inet_frag_queue_insert);
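
/* Illustrative sketch of how a caller reacts to the return value (roughly
 * what ipv4's ip_frag_queue() does; qp is that caller's queue wrapper):
 *
 *      err = inet_frag_queue_insert(&qp->q, skb, offset, end);
 *      if (err == IPFRAG_DUP) {
 *              kfree_skb(skb);         // duplicate: drop only this skb
 *      } else if (err == IPFRAG_OVERLAP) {
 *              inet_frag_kill(&qp->q); // RFC 5722: discard the whole queue
 *              kfree_skb(skb);
 *      }
 */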

void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb,
                              struct sk_buff *parent)
{
        struct sk_buff *fp, *head = skb_rb_first(&q->rb_fragments);
        struct sk_buff **nextp;
        int delta;

        if (head != skb) {
                fp = skb_clone(skb, GFP_ATOMIC);
                if (!fp)
                        return NULL;
                FRAG_CB(fp)->next_frag = FRAG_CB(skb)->next_frag;
                if (RB_EMPTY_NODE(&skb->rbnode))
                        FRAG_CB(parent)->next_frag = fp;
                else
                        rb_replace_node(&skb->rbnode, &fp->rbnode,
                                        &q->rb_fragments);
                if (q->fragments_tail == skb)
                        q->fragments_tail = fp;
                skb_morph(skb, head);
                FRAG_CB(skb)->next_frag = FRAG_CB(head)->next_frag;
                rb_replace_node(&head->rbnode, &skb->rbnode,
                                &q->rb_fragments);
                consume_skb(head);
                head = skb;
        }
        WARN_ON(head->ip_defrag_offset != 0);

        delta = -head->truesize;

        /* Head of list must not be cloned. */
        if (skb_unclone(head, GFP_ATOMIC))
                return NULL;

        delta += head->truesize;
        if (delta)
                add_frag_mem_limit(q->net, delta);

        /* If the first fragment is fragmented itself, we split
         * it into two chunks: the first with data and paged part
         * and the second, holding only fragments.
         */
        if (skb_has_frag_list(head)) {
                struct sk_buff *clone;
                int i, plen = 0;

                clone = alloc_skb(0, GFP_ATOMIC);
                if (!clone)
                        return NULL;
                skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list;
                skb_frag_list_init(head);
                for (i = 0; i < skb_shinfo(head)->nr_frags; i++)
                        plen += skb_frag_size(&skb_shinfo(head)->frags[i]);
                clone->data_len = head->data_len - plen;
                clone->len = clone->data_len;
                head->truesize += clone->truesize;
                clone->csum = 0;
                clone->ip_summed = head->ip_summed;
                add_frag_mem_limit(q->net, clone->truesize);
                skb_shinfo(head)->frag_list = clone;
                nextp = &clone->next;
        } else {
                nextp = &skb_shinfo(head)->frag_list;
        }

        return nextp;
}
EXPORT_SYMBOL(inet_frag_reasm_prepare);
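
/* The value returned above is a cursor, not a packet: it points at the place
 * where the head skb's frag_list chain continues (&skb_shinfo(head)->frag_list,
 * or the split clone's next pointer when the head carried a frag_list of its
 * own). Callers hand it back unchanged to inet_frag_reasm_finish() as
 * reasm_data.
 */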

void inet_frag_reasm_finish(struct inet_frag_queue *q, struct sk_buff *head,
                            void *reasm_data)
{
        struct sk_buff **nextp = (struct sk_buff **)reasm_data;
        struct rb_node *rbn;
        struct sk_buff *fp;

        skb_push(head, head->data - skb_network_header(head));

        /* Traverse the tree in order, to build frag_list. */
        fp = FRAG_CB(head)->next_frag;
        rbn = rb_next(&head->rbnode);
        rb_erase(&head->rbnode, &q->rb_fragments);
        while (rbn || fp) {
                /* fp points to the next sk_buff in the current run;
                 * rbn points to the next run.
                 */
                /* Go through the current run. */
                while (fp) {
                        *nextp = fp;
                        nextp = &fp->next;
                        fp->prev = NULL;
                        memset(&fp->rbnode, 0, sizeof(fp->rbnode));
                        fp->sk = NULL;
                        head->data_len += fp->len;
                        head->len += fp->len;
                        if (head->ip_summed != fp->ip_summed)
                                head->ip_summed = CHECKSUM_NONE;
                        else if (head->ip_summed == CHECKSUM_COMPLETE)
                                head->csum = csum_add(head->csum, fp->csum);
                        head->truesize += fp->truesize;
                        fp = FRAG_CB(fp)->next_frag;
                }
                /* Move to the next run. */
                if (rbn) {
                        struct rb_node *rbnext = rb_next(rbn);

                        fp = rb_to_skb(rbn);
                        rb_erase(rbn, &q->rb_fragments);
                        rbn = rbnext;
                }
        }
        sub_frag_mem_limit(q->net, head->truesize);

        *nextp = NULL;
        skb_mark_not_on_list(head);
        head->prev = NULL;
        head->tstamp = q->stamp;
}
EXPORT_SYMBOL(inet_frag_reasm_finish);
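
/* Illustrative sketch of the two-step reassembly sequence (roughly what
 * ipv4's ip_frag_reasm() does; qp and prev_tail are that caller's names):
 *
 *      void *reasm_data = inet_frag_reasm_prepare(&qp->q, skb, prev_tail);
 *
 *      if (!reasm_data)
 *              goto out_oom;
 *      // ...fix up the protocol header and total length on skb here...
 *      inet_frag_reasm_finish(&qp->q, skb, reasm_data);
 *      // skb now heads the fully reassembled datagram
 */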

struct sk_buff *inet_frag_pull_head(struct inet_frag_queue *q)
{
        struct sk_buff *head, *skb;

        head = skb_rb_first(&q->rb_fragments);
        if (!head)
                return NULL;
        skb = FRAG_CB(head)->next_frag;
        if (skb)
                rb_replace_node(&head->rbnode, &skb->rbnode,
                                &q->rb_fragments);
        else
                rb_erase(&head->rbnode, &q->rb_fragments);
        memset(&head->rbnode, 0, sizeof(head->rbnode));
        barrier();

        if (head == q->fragments_tail)
                q->fragments_tail = NULL;

        sub_frag_mem_limit(q->net, head->truesize);

        return head;
}
EXPORT_SYMBOL(inet_frag_pull_head);
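
/* Pulling the first fragment out of a queue is mainly for the timeout path:
 * ipv4's ip_expire(), for example, pulls the head so it can still source an
 * ICMP_TIME_EXCEEDED/ICMP_EXC_FRAGTIME error from that fragment before the
 * rest of the queue is dropped.
 */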