linux/net/ipv4/ip_fragment.c
   1/*
   2 * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3 *              operating system.  INET is implemented using the  BSD Socket
   4 *              interface as the means of communication with the user level.
   5 *
   6 *              The IP fragmentation functionality.
   7 *
   8 * Authors:     Fred N. van Kempen <waltje@uWalt.NL.Mugnet.ORG>
   9 *              Alan Cox <alan@lxorguk.ukuu.org.uk>
  10 *
  11 * Fixes:
  12 *              Alan Cox        :       Split from ip.c , see ip_input.c for history.
  13 *              David S. Miller :       Begin massive cleanup...
  14 *              Andi Kleen      :       Add sysctls.
  15 *              xxxx            :       Overlapfrag bug.
  16 *              Ultima          :       ip_expire() kernel panic.
  17 *              Bill Hawes      :       Frag accounting and evictor fixes.
  18 *              John McDonald   :       0 length frag bug.
  19 *              Alexey Kuznetsov:       SMP races, threading, cleanup.
  20 *              Patrick McHardy :       LRU queue of frag heads for evictor.
  21 */
  22
  23#define pr_fmt(fmt) "IPv4: " fmt
  24
  25#include <linux/compiler.h>
  26#include <linux/module.h>
  27#include <linux/types.h>
  28#include <linux/mm.h>
  29#include <linux/jiffies.h>
  30#include <linux/skbuff.h>
  31#include <linux/list.h>
  32#include <linux/ip.h>
  33#include <linux/icmp.h>
  34#include <linux/netdevice.h>
  35#include <linux/jhash.h>
  36#include <linux/random.h>
  37#include <linux/slab.h>
  38#include <net/route.h>
  39#include <net/dst.h>
  40#include <net/sock.h>
  41#include <net/ip.h>
  42#include <net/icmp.h>
  43#include <net/checksum.h>
  44#include <net/inetpeer.h>
  45#include <net/inet_frag.h>
  46#include <linux/tcp.h>
  47#include <linux/udp.h>
  48#include <linux/inet.h>
  49#include <linux/netfilter_ipv4.h>
  50#include <net/inet_ecn.h>
  51
   52/* NOTE. The logic of IP defragmentation parallels the corresponding IPv6
   53 * code. If you change something here, _PLEASE_ update ipv6/reassembly.c
  54 * as well. Or notify me, at least. --ANK
  55 */
  56
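     /* Maximum "disorder" tolerated for fragments sharing a source address:
      * ip_frag_too_far() drops a queue when more than this many fragments
      * from the same peer arrive between two consecutive fragments of the
      * datagram being reassembled. Exposed as the ipfrag_max_dist sysctl
      * below; 0 disables the check.
      */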
  57static int sysctl_ipfrag_max_dist __read_mostly = 64;
  58
  59/* Use skb->cb to track consecutive/adjacent fragments coming at
  60 * the end of the queue. Nodes in the rb-tree queue will
  61 * contain "runs" of one or more adjacent fragments.
  62 *
  63 * Invariants:
  64 * - next_frag is NULL at the tail of a "run";
  65 * - the head of a "run" has the sum of all fragment lengths in frag_run_len.
  66 */
  67struct ipfrag_skb_cb
  68{
  69        struct inet_skb_parm    h;
  70        int                     offset;
  71        struct sk_buff          *next_frag;
  72        int                     frag_run_len;
  73};
  74
  75#define FRAG_CB(skb)    ((struct ipfrag_skb_cb *)((skb)->cb))
  76
  77static void ip4_frag_init_run(struct sk_buff *skb)
  78{
  79        BUILD_BUG_ON(sizeof(struct ipfrag_skb_cb) > sizeof(skb->cb));
  80
  81        FRAG_CB(skb)->next_frag = NULL;
  82        FRAG_CB(skb)->frag_run_len = skb->len;
  83}
  84
  85/* Append skb to the last "run". */
  86static void ip4_frag_append_to_last_run(struct inet_frag_queue *q,
  87                                        struct sk_buff *skb)
  88{
  89        RB_CLEAR_NODE(&skb->rbnode);
  90        FRAG_CB(skb)->next_frag = NULL;
  91
  92        FRAG_CB(q->last_run_head)->frag_run_len += skb->len;
  93        FRAG_CB(q->fragments_tail)->next_frag = skb;
  94        q->fragments_tail = skb;
  95}
  96
  97/* Create a new "run" with the skb. */
  98static void ip4_frag_create_run(struct inet_frag_queue *q, struct sk_buff *skb)
  99{
 100        if (q->last_run_head)
 101                rb_link_node(&skb->rbnode, &q->last_run_head->rbnode,
 102                             &q->last_run_head->rbnode.rb_right);
 103        else
 104                rb_link_node(&skb->rbnode, NULL, &q->rb_fragments.rb_node);
 105        rb_insert_color(&skb->rbnode, &q->rb_fragments);
 106
 107        ip4_frag_init_run(skb);
 108        q->fragments_tail = skb;
 109        q->last_run_head = skb;
 110}
 111
 112/* Describe an entry in the "incomplete datagrams" queue. */
 113struct ipq {
 114        struct inet_frag_queue q;
 115
 116        u32             user;
 117        __be32          saddr;
 118        __be32          daddr;
 119        __be16          id;
 120        u8              protocol;
 121        u8              ecn; /* RFC3168 support */
 122        u16             max_df_size; /* largest frag with DF set seen */
 123        int             iif;
 124        unsigned int    rid;
 125        struct inet_peer *peer;
 126};
 127
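     /* Map the two ECN bits of the TOS byte to a single bit of a 4-bit set
      * (Not-ECT / ECT(1) / ECT(0) / CE). qp->ecn accumulates these bits over
      * all fragments so that ip_frag_reasm() can reject datagrams whose
      * fragments carry an invalid ECN mix (see ip_frag_ecn_table).
      */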
 128static inline u8 ip4_frag_ecn(u8 tos)
 129{
 130        return 1 << (tos & INET_ECN_MASK);
 131}
 132
 133static struct inet_frags ip4_frags;
 134
 135int ip_frag_nqueues(struct net *net)
 136{
 137        return net->ipv4.frags.nqueues;
 138}
 139
 140int ip_frag_mem(struct net *net)
 141{
 142        return sum_frag_mem_limit(&net->ipv4.frags);
 143}
 144
 145static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
 146                         struct sk_buff *prev_tail, struct net_device *dev);
 147
 148struct ip4_create_arg {
 149        struct iphdr *iph;
 150        u32 user;
 151};
 152
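     /* Hash the (id, saddr, daddr, protocol) tuple into one of the
      * INETFRAGS_HASHSZ buckets, mixing in a random seed that is generated
      * once on first use.
      */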
 153static unsigned int ipqhashfn(__be16 id, __be32 saddr, __be32 daddr, u8 prot)
 154{
 155        net_get_random_once(&ip4_frags.rnd, sizeof(ip4_frags.rnd));
 156        return jhash_3words((__force u32)id << 16 | prot,
 157                            (__force u32)saddr, (__force u32)daddr,
 158                            ip4_frags.rnd) & (INETFRAGS_HASHSZ - 1);
 159}
 160
 161static unsigned int ip4_hashfn(struct inet_frag_queue *q)
 162{
 163        struct ipq *ipq;
 164
 165        ipq = container_of(q, struct ipq, q);
 166        return ipqhashfn(ipq->id, ipq->saddr, ipq->daddr, ipq->protocol);
 167}
 168
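     /* Lookup callback for the inet_frag core: a queue matches when its
      * IPv4 id/saddr/daddr/protocol tuple and its defrag "user"
      * (IP_DEFRAG_*) all agree with the candidate's.
      */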
 169static bool ip4_frag_match(struct inet_frag_queue *q, void *a)
 170{
 171        struct ipq *qp;
 172        struct ip4_create_arg *arg = a;
 173
 174        qp = container_of(q, struct ipq, q);
 175        return  qp->id == arg->iph->id &&
 176                qp->saddr == arg->iph->saddr &&
 177                qp->daddr == arg->iph->daddr &&
 178                qp->protocol == arg->iph->protocol &&
 179                qp->user == arg->user;
 180}
 181
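     /* Constructor callback: fill a freshly allocated queue from the IP
      * header of the first fragment seen, taking a reference on the
      * source's inet_peer entry when the ipfrag_max_dist check is enabled.
      */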
 182static void ip4_frag_init(struct inet_frag_queue *q, void *a)
 183{
 184        struct ipq *qp = container_of(q, struct ipq, q);
 185        struct netns_ipv4 *ipv4 = container_of(q->net, struct netns_ipv4,
 186                                               frags);
 187        struct net *net = container_of(ipv4, struct net, ipv4);
 188
 189        struct ip4_create_arg *arg = a;
 190
 191        qp->protocol = arg->iph->protocol;
 192        qp->id = arg->iph->id;
 193        qp->ecn = ip4_frag_ecn(arg->iph->tos);
 194        qp->saddr = arg->iph->saddr;
 195        qp->daddr = arg->iph->daddr;
 196        qp->user = arg->user;
 197        qp->peer = sysctl_ipfrag_max_dist ?
 198                inet_getpeer_v4(net->ipv4.peers, arg->iph->saddr, 1) : NULL;
 199}
 200
 201static __inline__ void ip4_frag_free(struct inet_frag_queue *q)
 202{
 203        struct ipq *qp;
 204
 205        qp = container_of(q, struct ipq, q);
 206        if (qp->peer)
 207                inet_putpeer(qp->peer);
 208}
 209
 210
 211/* Destruction primitives. */
 212
 213static __inline__ void ipq_put(struct ipq *ipq)
 214{
 215        inet_frag_put(&ipq->q, &ip4_frags);
 216}
 217
 218/* Kill ipq entry. It is not destroyed immediately,
  219 * because the caller (and possibly others) still holds a reference.
 220 */
 221static void ipq_kill(struct ipq *ipq)
 222{
 223        inet_frag_kill(&ipq->q, &ip4_frags);
 224}
 225
 226/* Memory limiting on fragments.  Evictor trashes the oldest
 227 * fragment queue until we are back under the threshold.
 228 */
 229static void ip_evictor(struct net *net)
 230{
 231        int evicted;
 232
 233        evicted = inet_frag_evictor(&net->ipv4.frags, &ip4_frags, false);
 234        if (evicted)
 235                IP_ADD_STATS_BH(net, IPSTATS_MIB_REASMFAILS, evicted);
 236}
 237
 238/*
 239 * Oops, a fragment queue timed out.  Kill it and send an ICMP reply.
 240 */
 241static void ip_expire(unsigned long arg)
 242{
 243        struct sk_buff *head = NULL;
 244        const struct iphdr *iph;
 245        struct ipq *qp;
 246        struct net *net;
 247        int err;
 248
 249        qp = container_of((struct inet_frag_queue *) arg, struct ipq, q);
 250        net = container_of(qp->q.net, struct net, ipv4.frags);
 251
 252        spin_lock(&qp->q.lock);
 253
 254        if (qp->q.last_in & INET_FRAG_COMPLETE)
 255                goto out;
 256
 257        ipq_kill(qp);
 258
 259        IP_INC_STATS_BH(net, IPSTATS_MIB_REASMTIMEOUT);
 260        IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS);
 261
 262        if (!(qp->q.last_in & INET_FRAG_FIRST_IN))
 263                goto out;
 264
  265        /* sk_buff::dev and sk_buff::rbnode share a union, so we
  266         * must pull the head out of the tree before we can safely
  267         * touch head->dev.
  268         */
 269        if (qp->q.fragments) {
 270                head = qp->q.fragments;
 271                qp->q.fragments = head->next;
 272        } else {
 273                head = skb_rb_first(&qp->q.rb_fragments);
 274                if (!head)
 275                        goto out;
 276                if (FRAG_CB(head)->next_frag)
 277                        rb_replace_node(&head->rbnode,
 278                                        &FRAG_CB(head)->next_frag->rbnode,
 279                                        &qp->q.rb_fragments);
 280                else
 281                        rb_erase(&head->rbnode, &qp->q.rb_fragments);
 282                memset(&head->rbnode, 0, sizeof(head->rbnode));
 283                barrier();
 284        }
 285        if (head == qp->q.fragments_tail)
 286                qp->q.fragments_tail = NULL;
 287
 288        sub_frag_mem_limit(&qp->q, head->truesize);
 289
 290        rcu_read_lock();
 291        head->dev = dev_get_by_index_rcu(net, qp->iif);
 292        if (!head->dev)
 293                goto out_rcu_unlock;
 294
 295        /* skb has no dst, perform route lookup again */
 296        iph = ip_hdr(head);
 297        err = ip_route_input_noref(head, iph->daddr, iph->saddr,
 298                                   iph->tos, head->dev);
 299        if (err)
 300                goto out_rcu_unlock;
 301
 302        /*
 303         * Only an end host needs to send an ICMP
 304         * "Fragment Reassembly Timeout" message, per RFC792.
 305         */
 306        if (qp->user == IP_DEFRAG_AF_PACKET ||
 307            (qp->user == IP_DEFRAG_CONNTRACK_IN &&
 308             skb_rtable(head)->rt_type != RTN_LOCAL))
 309                goto out_rcu_unlock;
 310
 311        /* Send an ICMP "Fragment Reassembly Timeout" message. */
 312        icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0);
 313out_rcu_unlock:
 314        rcu_read_unlock();
 315out:
 316        spin_unlock(&qp->q.lock);
 317        if (head)
 318                kfree_skb(head);
 319        ipq_put(qp);
 320}
 321
 322/* Find the correct entry in the "incomplete datagrams" queue for
  323 * this IP datagram, and create a new one if nothing is found.
 324 */
 325static inline struct ipq *ip_find(struct net *net, struct iphdr *iph, u32 user)
 326{
 327        struct inet_frag_queue *q;
 328        struct ip4_create_arg arg;
 329        unsigned int hash;
 330
 331        arg.iph = iph;
 332        arg.user = user;
 333
 334        read_lock(&ip4_frags.lock);
 335        hash = ipqhashfn(iph->id, iph->saddr, iph->daddr, iph->protocol);
 336
 337        q = inet_frag_find(&net->ipv4.frags, &ip4_frags, &arg, hash);
 338        if (IS_ERR_OR_NULL(q)) {
 339                inet_frag_maybe_warn_overflow(q, pr_fmt());
 340                return NULL;
 341        }
 342        return container_of(q, struct ipq, q);
 343}
 344
 345/* Is the fragment too far ahead to be part of ipq? */
 346static inline int ip_frag_too_far(struct ipq *qp)
 347{
 348        struct inet_peer *peer = qp->peer;
 349        unsigned int max = sysctl_ipfrag_max_dist;
 350        unsigned int start, end;
 351
 352        int rc;
 353
 354        if (!peer || !max)
 355                return 0;
 356
 357        start = qp->rid;
 358        end = atomic_inc_return(&peer->rid);
 359        qp->rid = end;
 360
 361        rc = qp->q.fragments_tail && (end - start) > max;
 362
 363        if (rc) {
 364                struct net *net;
 365
 366                net = container_of(qp->q.net, struct net, ipv4.frags);
 367                IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS);
 368        }
 369
 370        return rc;
 371}
 372
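     /* Drop everything collected so far and reuse the queue, re-arming its
      * timer; called when ip_frag_too_far() decides too many unrelated
      * fragments from this peer arrived in between. Returns -ETIMEDOUT if
      * the timer could not be re-armed because it has already expired.
      */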
 373static int ip_frag_reinit(struct ipq *qp)
 374{
 375        unsigned int sum_truesize = 0;
 376
 377        if (!mod_timer(&qp->q.timer, jiffies + qp->q.net->timeout)) {
 378                atomic_inc(&qp->q.refcnt);
 379                return -ETIMEDOUT;
 380        }
 381
 382        sum_truesize = inet_frag_rbtree_purge(&qp->q.rb_fragments);
 383        sub_frag_mem_limit(&qp->q, sum_truesize);
 384
 385        qp->q.last_in = 0;
 386        qp->q.len = 0;
 387        qp->q.meat = 0;
 388        qp->q.fragments = NULL;
 389        qp->q.rb_fragments = RB_ROOT;
 390        qp->q.fragments_tail = NULL;
 391        qp->q.last_run_head = NULL;
 392        qp->iif = 0;
 393        qp->ecn = 0;
 394
 395        return 0;
 396}
 397
 398/* Add new segment to existing queue. */
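     /* Runs with qp->q.lock held (see ip_defrag()). Returns -EINPROGRESS
      * while more fragments are expected, 0 once the datagram is complete
      * (skb then carries the reassembled packet), or a negative error; the
      * fragment is dropped on error, including on detected overlaps.
      */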
 399static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
 400{
 401        struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
 402        struct rb_node **rbn, *parent;
 403        struct sk_buff *skb1, *prev_tail;
 404        struct net_device *dev;
 405        unsigned int fragsize;
 406        int flags, offset;
 407        int ihl, end;
 408        int err = -ENOENT;
 409        u8 ecn;
 410
 411        if (qp->q.last_in & INET_FRAG_COMPLETE)
 412                goto err;
 413
 414        if (!(IPCB(skb)->flags & IPSKB_FRAG_COMPLETE) &&
 415            unlikely(ip_frag_too_far(qp)) &&
 416            unlikely(err = ip_frag_reinit(qp))) {
 417                ipq_kill(qp);
 418                goto err;
 419        }
 420
 421        ecn = ip4_frag_ecn(ip_hdr(skb)->tos);
 422        offset = ntohs(ip_hdr(skb)->frag_off);
 423        flags = offset & ~IP_OFFSET;
 424        offset &= IP_OFFSET;
 425        offset <<= 3;           /* offset is in 8-byte chunks */
 426        ihl = ip_hdrlen(skb);
 427
 428        /* Determine the position of this fragment. */
 429        end = offset + skb->len - skb_network_offset(skb) - ihl;
 430        err = -EINVAL;
 431
 432        /* Is this the final fragment? */
 433        if ((flags & IP_MF) == 0) {
  434                /* If we already have some bits beyond this end, or the
  435                 * end differs from the one seen earlier, the segment is corrupted.
  436                 */
 437                if (end < qp->q.len ||
 438                    ((qp->q.last_in & INET_FRAG_LAST_IN) && end != qp->q.len))
 439                        goto err;
 440                qp->q.last_in |= INET_FRAG_LAST_IN;
 441                qp->q.len = end;
 442        } else {
 443                if (end&7) {
 444                        end &= ~7;
 445                        if (skb->ip_summed != CHECKSUM_UNNECESSARY)
 446                                skb->ip_summed = CHECKSUM_NONE;
 447                }
 448                if (end > qp->q.len) {
 449                        /* Some bits beyond end -> corruption. */
 450                        if (qp->q.last_in & INET_FRAG_LAST_IN)
 451                                goto err;
 452                        qp->q.len = end;
 453                }
 454        }
 455        if (end == offset)
 456                goto err;
 457
 458        err = -ENOMEM;
 459        if (!pskb_pull(skb, skb_network_offset(skb) + ihl))
 460                goto err;
 461
 462        err = pskb_trim_rcsum(skb, end - offset);
 463        if (err)
 464                goto err;
 465
 466        /* Note : skb->rbnode and skb->dev share the same location. */
 467        dev = skb->dev;
  468        /* Make sure the compiler won't do silly aliasing games */
 469        barrier();
 470
 471        /* RFC5722, Section 4, amended by Errata ID : 3089
 472         *                          When reassembling an IPv6 datagram, if
  473         *   one or more of its constituent fragments is determined to be an
 474         *   overlapping fragment, the entire datagram (and any constituent
 475         *   fragments) MUST be silently discarded.
 476         *
  477         * We do the same here for IPv4 (and increment an SNMP counter).
 478         */
 479
 480        /* Find out where to put this fragment.  */
 481        prev_tail = qp->q.fragments_tail;
 482        if (!prev_tail)
 483                ip4_frag_create_run(&qp->q, skb);  /* First fragment. */
 484        else if (FRAG_CB(prev_tail)->offset + prev_tail->len < end) {
 485                /* This is the common case: skb goes to the end. */
 486                /* Detect and discard overlaps. */
 487                if (offset < FRAG_CB(prev_tail)->offset + prev_tail->len)
 488                        goto discard_qp;
 489                if (offset == FRAG_CB(prev_tail)->offset + prev_tail->len)
 490                        ip4_frag_append_to_last_run(&qp->q, skb);
 491                else
 492                        ip4_frag_create_run(&qp->q, skb);
 493        } else {
 494                /* Binary search. Note that skb can become the first fragment,
 495                 * but not the last (covered above).
 496                 */
 497                rbn = &qp->q.rb_fragments.rb_node;
 498                do {
 499                        parent = *rbn;
 500                        skb1 = rb_to_skb(parent);
 501                        if (end <= FRAG_CB(skb1)->offset)
 502                                rbn = &parent->rb_left;
 503                        else if (offset >= FRAG_CB(skb1)->offset +
 504                                                FRAG_CB(skb1)->frag_run_len)
 505                                rbn = &parent->rb_right;
 506                        else /* Found an overlap with skb1. */
 507                                goto discard_qp;
 508                } while (*rbn);
 509                /* Here we have parent properly set, and rbn pointing to
 510                 * one of its NULL left/right children. Insert skb.
 511                 */
 512                ip4_frag_init_run(skb);
 513                rb_link_node(&skb->rbnode, parent, rbn);
 514                rb_insert_color(&skb->rbnode, &qp->q.rb_fragments);
 515        }
 516
 517        if (dev) {
 518                qp->iif = dev->ifindex;
 519                skb->dev = NULL;
 520        }
 521        FRAG_CB(skb)->offset = offset;
 522
 523        qp->q.stamp = skb->tstamp;
 524        qp->q.meat += skb->len;
 525        qp->ecn |= ecn;
 526        add_frag_mem_limit(&qp->q, skb->truesize);
 527        if (offset == 0)
 528                qp->q.last_in |= INET_FRAG_FIRST_IN;
 529
 530        fragsize = skb->len + ihl;
 531
 532        if (fragsize > qp->q.max_size)
 533                qp->q.max_size = fragsize;
 534
 535        if (ip_hdr(skb)->frag_off & htons(IP_DF) &&
 536            fragsize > qp->max_df_size)
 537                qp->max_df_size = fragsize;
 538
 539        if (qp->q.last_in == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) &&
 540            qp->q.meat == qp->q.len) {
 541                unsigned long orefdst = skb->_skb_refdst;
 542
 543                skb->_skb_refdst = 0UL;
 544                err = ip_frag_reasm(qp, skb, prev_tail, dev);
 545                skb->_skb_refdst = orefdst;
 546                return err;
 547        }
 548
 549        skb_dst_drop(skb);
 550        inet_frag_lru_move(&qp->q);
 551        return -EINPROGRESS;
 552
 553discard_qp:
 554        ipq_kill(qp);
 555        err = -EINVAL;
 556        IP_INC_STATS_BH(net, IPSTATS_MIB_REASM_OVERLAPS);
 557err:
 558        kfree_skb(skb);
 559        return err;
 560}
 561
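     /* Coalescing fragment data into the head skb discards the original
      * fragment boundaries, so it is restricted to packets delivered
      * locally; forwarded traffic keeps one skb per fragment on the
      * frag_list so that output can re-fragment along the same boundaries.
      */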
 562static bool ip_frag_coalesce_ok(const struct ipq *qp)
 563{
 564        return qp->user == IP_DEFRAG_LOCAL_DELIVER;
 565}
 566
 567/* Build a new IP datagram from all its fragments. */
 568static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
 569                         struct sk_buff *prev_tail, struct net_device *dev)
 570{
 571        struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
 572        struct iphdr *iph;
 573        struct sk_buff *fp, *head = skb_rb_first(&qp->q.rb_fragments);
 574        struct sk_buff **nextp; /* To build frag_list. */
 575        struct rb_node *rbn;
 576        int sum_truesize;
 577        int len;
 578        int ihlen;
 579        int err;
 580        u8 ecn;
 581
 582        ipq_kill(qp);
 583
 584        ecn = ip_frag_ecn_table[qp->ecn];
 585        if (unlikely(ecn == 0xff)) {
 586                err = -EINVAL;
 587                goto out_fail;
 588        }
 589        /* Make the one we just received the head. */
 590        if (head != skb) {
 591                fp = skb_clone(skb, GFP_ATOMIC);
 592                if (!fp)
 593                        goto out_nomem;
 594                FRAG_CB(fp)->next_frag = FRAG_CB(skb)->next_frag;
 595                if (RB_EMPTY_NODE(&skb->rbnode))
 596                        FRAG_CB(prev_tail)->next_frag = fp;
 597                else
 598                        rb_replace_node(&skb->rbnode, &fp->rbnode,
 599                                        &qp->q.rb_fragments);
 600                if (qp->q.fragments_tail == skb)
 601                        qp->q.fragments_tail = fp;
 602                skb_morph(skb, head);
 603                FRAG_CB(skb)->next_frag = FRAG_CB(head)->next_frag;
 604                rb_replace_node(&head->rbnode, &skb->rbnode,
 605                                &qp->q.rb_fragments);
 606                consume_skb(head);
 607                head = skb;
 608        }
 609
 610        WARN_ON(FRAG_CB(head)->offset != 0);
 611
  612        /* Compute the size of the reassembled datagram. */
 613        ihlen = ip_hdrlen(head);
 614        len = ihlen + qp->q.len;
 615
 616        err = -E2BIG;
 617        if (len > 65535)
 618                goto out_oversize;
 619
 620        /* Head of list must not be cloned. */
 621        if (skb_unclone(head, GFP_ATOMIC))
 622                goto out_nomem;
 623
  624        /* If the first fragment is fragmented itself, we split it into
  625         * two chunks: the first holding the linear data and paged part,
  626         * and the second holding only the frag_list. */
 627        if (skb_has_frag_list(head)) {
 628                struct sk_buff *clone;
 629                int i, plen = 0;
 630
 631                if ((clone = alloc_skb(0, GFP_ATOMIC)) == NULL)
 632                        goto out_nomem;
 633                skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list;
 634                skb_frag_list_init(head);
 635                for (i = 0; i < skb_shinfo(head)->nr_frags; i++)
 636                        plen += skb_frag_size(&skb_shinfo(head)->frags[i]);
 637                clone->len = clone->data_len = head->data_len - plen;
 638                head->truesize += clone->truesize;
 639                clone->csum = 0;
 640                clone->ip_summed = head->ip_summed;
 641                add_frag_mem_limit(&qp->q, clone->truesize);
 642                skb_shinfo(head)->frag_list = clone;
 643                nextp = &clone->next;
 644        } else {
 645                nextp = &skb_shinfo(head)->frag_list;
 646        }
 647
 648        skb_push(head, head->data - skb_network_header(head));
 649
 650        /* Traverse the tree in order, to build frag_list. */
 651        fp = FRAG_CB(head)->next_frag;
 652        rbn = rb_next(&head->rbnode);
 653        rb_erase(&head->rbnode, &qp->q.rb_fragments);
 654
 655        sum_truesize = head->truesize;
 656        while (rbn || fp) {
 657                /* fp points to the next sk_buff in the current run;
 658                 * rbn points to the next run.
 659                 */
 660                /* Go through the current run. */
 661                while (fp) {
 662                        struct sk_buff *next_frag = FRAG_CB(fp)->next_frag;
 663                        bool stolen;
 664                        int delta;
 665
 666                        sum_truesize += fp->truesize;
 667                        if (head->ip_summed != fp->ip_summed)
 668                                head->ip_summed = CHECKSUM_NONE;
 669                        else if (head->ip_summed == CHECKSUM_COMPLETE)
 670                                head->csum = csum_add(head->csum, fp->csum);
 671
 672                        if (ip_frag_coalesce_ok(qp) &&
 673                            skb_try_coalesce(head, fp, &stolen, &delta)) {
 674                                kfree_skb_partial(fp, stolen);
 675                        } else {
 676                                fp->prev = NULL;
 677                                memset(&fp->rbnode, 0, sizeof(fp->rbnode));
 678
 679                                head->data_len += fp->len;
 680                                head->len += fp->len;
 681                                head->truesize += fp->truesize;
 682
 683                                *nextp = fp;
 684                                nextp = &fp->next;
 685                        }
 686
 687                        fp = next_frag;
 688                }
 689                /* Move to the next run. */
 690                if (rbn) {
 691                        struct rb_node *rbnext = rb_next(rbn);
 692
 693                        fp = rb_to_skb(rbn);
 694                        rb_erase(rbn, &qp->q.rb_fragments);
 695                        rbn = rbnext;
 696                }
 697        }
 698        sub_frag_mem_limit(&qp->q, sum_truesize);
 699
 700        *nextp = NULL;
 701        head->next = NULL;
 702        head->prev = NULL;
 703        head->dev = dev;
 704        head->tstamp = qp->q.stamp;
 705        IPCB(head)->frag_max_size = max(qp->max_df_size, qp->q.max_size);
 706
 707        iph = ip_hdr(head);
 708        iph->tot_len = htons(len);
 709        iph->tos |= ecn;
 710
 711        /* When we set IP_DF on a refragmented skb we must also force a
 712         * call to ip_fragment to avoid forwarding a DF-skb of size s while
 713         * original sender only sent fragments of size f (where f < s).
 714         *
 715         * We only set DF/IPSKB_FRAG_PMTU if such DF fragment was the largest
 716         * frag seen to avoid sending tiny DF-fragments in case skb was built
 717         * from one very small df-fragment and one large non-df frag.
 718         */
 719        if (qp->max_df_size == qp->q.max_size) {
 720                IPCB(head)->flags |= IPSKB_FRAG_PMTU;
 721                iph->frag_off = htons(IP_DF);
 722        } else {
 723                iph->frag_off = 0;
 724        }
 725
 726        ip_send_check(iph);
 727
 728        IP_INC_STATS_BH(net, IPSTATS_MIB_REASMOKS);
 729        qp->q.fragments = NULL;
 730        qp->q.rb_fragments = RB_ROOT;
 731        qp->q.fragments_tail = NULL;
 732        qp->q.last_run_head = NULL;
 733        return 0;
 734
 735out_nomem:
 736        LIMIT_NETDEBUG(KERN_ERR pr_fmt("queue_glue: no memory for gluing queue %p\n"),
 737                       qp);
 738        err = -ENOMEM;
 739        goto out_fail;
 740out_oversize:
 741        net_info_ratelimited("Oversized IP packet from %pI4\n", &qp->saddr);
 742out_fail:
 743        IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS);
 744        return err;
 745}
 746
 747/* Process an incoming IP datagram fragment. */
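     /* Returns 0 when @skb has been turned into the complete datagram and
      * ownership stays with the caller; any non-zero return (-EINPROGRESS
      * or an error) means the fragment was queued or freed and the caller
      * must not touch @skb again.
      */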
 748int ip_defrag(struct sk_buff *skb, u32 user)
 749{
 750        struct ipq *qp;
 751        struct net *net;
 752
 753        net = skb->dev ? dev_net(skb->dev) : dev_net(skb_dst(skb)->dev);
 754        IP_INC_STATS_BH(net, IPSTATS_MIB_REASMREQDS);
 755        skb_orphan(skb);
 756
 757        /* Start by cleaning up the memory. */
 758        ip_evictor(net);
 759
 760        /* Lookup (or create) queue header */
 761        if ((qp = ip_find(net, ip_hdr(skb), user)) != NULL) {
 762                int ret;
 763
 764                spin_lock(&qp->q.lock);
 765
 766                ret = ip_frag_queue(qp, skb);
 767
 768                spin_unlock(&qp->q.lock);
 769                ipq_put(qp);
 770                return ret;
 771        }
 772
 773        IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS);
 774        kfree_skb(skb);
 775        return -ENOMEM;
 776}
 777EXPORT_SYMBOL(ip_defrag);
 778
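     /* Defragment helper for callers that may hold shared skbs which do not
      * necessarily start at the IP header (e.g. packet sockets): validate a
      * copied header first, then unshare, trim and defragment. Returns NULL
      * when the skb has been consumed (datagram not yet complete, or an
      * error), otherwise the original or reassembled skb.
      */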
 779struct sk_buff *ip_check_defrag(struct sk_buff *skb, u32 user)
 780{
 781        struct iphdr iph;
 782        int netoff;
 783        u32 len;
 784
 785        if (skb->protocol != htons(ETH_P_IP))
 786                return skb;
 787
 788        netoff = skb_network_offset(skb);
 789
 790        if (skb_copy_bits(skb, netoff, &iph, sizeof(iph)) < 0)
 791                return skb;
 792
 793        if (iph.ihl < 5 || iph.version != 4)
 794                return skb;
 795
 796        len = ntohs(iph.tot_len);
 797        if (skb->len < netoff + len || len < (iph.ihl * 4))
 798                return skb;
 799
 800        if (ip_is_fragment(&iph)) {
 801                skb = skb_share_check(skb, GFP_ATOMIC);
 802                if (skb) {
 803                        if (!pskb_may_pull(skb, netoff + iph.ihl * 4))
 804                                return skb;
 805                        if (pskb_trim_rcsum(skb, netoff + len))
 806                                return skb;
 807                        memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
 808                        if (ip_defrag(skb, user))
 809                                return NULL;
 810                        skb_clear_hash(skb);
 811                }
 812        }
 813        return skb;
 814}
 815EXPORT_SYMBOL(ip_check_defrag);
 816
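     /* Release every skb held in an rb-tree fragment queue, following each
      * run's ->next_frag chain, and return the total truesize freed so the
      * caller can fix up the fragment memory accounting.
      */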
 817unsigned int inet_frag_rbtree_purge(struct rb_root *root)
 818{
 819        struct rb_node *p = rb_first(root);
 820        unsigned int sum = 0;
 821
 822        while (p) {
 823                struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode);
 824
 825                p = rb_next(p);
 826                rb_erase(&skb->rbnode, root);
 827                while (skb) {
 828                        struct sk_buff *next = FRAG_CB(skb)->next_frag;
 829
 830                        sum += skb->truesize;
 831                        kfree_skb(skb);
 832                        skb = next;
 833                }
 834        }
 835        return sum;
 836}
 837EXPORT_SYMBOL(inet_frag_rbtree_purge);
 838
 839#ifdef CONFIG_SYSCTL
 840static int zero;
 841
 842static struct ctl_table ip4_frags_ns_ctl_table[] = {
 843        {
 844                .procname       = "ipfrag_high_thresh",
 845                .data           = &init_net.ipv4.frags.high_thresh,
 846                .maxlen         = sizeof(int),
 847                .mode           = 0644,
 848                .proc_handler   = proc_dointvec
 849        },
 850        {
 851                .procname       = "ipfrag_low_thresh",
 852                .data           = &init_net.ipv4.frags.low_thresh,
 853                .maxlen         = sizeof(int),
 854                .mode           = 0644,
 855                .proc_handler   = proc_dointvec
 856        },
 857        {
 858                .procname       = "ipfrag_time",
 859                .data           = &init_net.ipv4.frags.timeout,
 860                .maxlen         = sizeof(int),
 861                .mode           = 0644,
 862                .proc_handler   = proc_dointvec_jiffies,
 863        },
 864        { }
 865};
 866
 867static struct ctl_table ip4_frags_ctl_table[] = {
 868        {
 869                .procname       = "ipfrag_secret_interval",
 870                .data           = &ip4_frags.secret_interval,
 871                .maxlen         = sizeof(int),
 872                .mode           = 0644,
 873                .proc_handler   = proc_dointvec_jiffies,
 874        },
 875        {
 876                .procname       = "ipfrag_max_dist",
 877                .data           = &sysctl_ipfrag_max_dist,
 878                .maxlen         = sizeof(int),
 879                .mode           = 0644,
 880                .proc_handler   = proc_dointvec_minmax,
 881                .extra1         = &zero
 882        },
 883        { }
 884};
 885
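     /* Register the per-namespace sysctls. Namespaces other than init_net
      * get their own copy of the table, pointed at their own counters;
      * clearing the first procname hides the whole table from namespaces
      * not owned by the initial user namespace.
      */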
 886static int __net_init ip4_frags_ns_ctl_register(struct net *net)
 887{
 888        struct ctl_table *table;
 889        struct ctl_table_header *hdr;
 890
 891        table = ip4_frags_ns_ctl_table;
 892        if (!net_eq(net, &init_net)) {
 893                table = kmemdup(table, sizeof(ip4_frags_ns_ctl_table), GFP_KERNEL);
 894                if (table == NULL)
 895                        goto err_alloc;
 896
 897                table[0].data = &net->ipv4.frags.high_thresh;
 898                table[1].data = &net->ipv4.frags.low_thresh;
 899                table[2].data = &net->ipv4.frags.timeout;
 900
 901                /* Don't export sysctls to unprivileged users */
 902                if (net->user_ns != &init_user_ns)
 903                        table[0].procname = NULL;
 904        }
 905
 906        hdr = register_net_sysctl(net, "net/ipv4", table);
 907        if (hdr == NULL)
 908                goto err_reg;
 909
 910        net->ipv4.frags_hdr = hdr;
 911        return 0;
 912
 913err_reg:
 914        if (!net_eq(net, &init_net))
 915                kfree(table);
 916err_alloc:
 917        return -ENOMEM;
 918}
 919
 920static void __net_exit ip4_frags_ns_ctl_unregister(struct net *net)
 921{
 922        struct ctl_table *table;
 923
 924        table = net->ipv4.frags_hdr->ctl_table_arg;
 925        unregister_net_sysctl_table(net->ipv4.frags_hdr);
 926        kfree(table);
 927}
 928
 929static void ip4_frags_ctl_register(void)
 930{
 931        register_net_sysctl(&init_net, "net/ipv4", ip4_frags_ctl_table);
 932}
 933#else
 934static inline int ip4_frags_ns_ctl_register(struct net *net)
 935{
 936        return 0;
 937}
 938
 939static inline void ip4_frags_ns_ctl_unregister(struct net *net)
 940{
 941}
 942
 943static inline void ip4_frags_ctl_register(void)
 944{
 945}
 946#endif
 947
 948static int __net_init ipv4_frags_init_net(struct net *net)
 949{
 950        /* Fragment cache limits.
 951         *
  952         * The fragment memory accounting code (tries to) account for
  953         * the real memory usage by measuring both the size of the frag
 954         * queue struct (inet_frag_queue (ipv4:ipq/ipv6:frag_queue))
 955         * and the SKB's truesize.
 956         *
 957         * A 64K fragment consumes 129736 bytes (44*2944)+200
 958         * (1500 truesize == 2944, sizeof(struct ipq) == 200)
 959         *
 960         * We will commit 4MB at one time. Should we cross that limit
  961         * we will prune down to 3MB, making room for approx. 8 big 64K
  962         * fragments (8 x ~128k).
 963         */
 964        net->ipv4.frags.high_thresh = 4 * 1024 * 1024;
 965        net->ipv4.frags.low_thresh  = 3 * 1024 * 1024;
 966        /*
  967         * Important NOTE! A fragment queue must be destroyed before the MSL expires.
  968         * RFC 791 is wrong in proposing to prolong the timer by the TTL
  969         * on each fragment arrival.
 970         */
 971        net->ipv4.frags.timeout = IP_FRAG_TIME;
 972
 973        inet_frags_init_net(&net->ipv4.frags);
 974
 975        return ip4_frags_ns_ctl_register(net);
 976}
 977
 978static void __net_exit ipv4_frags_exit_net(struct net *net)
 979{
 980        ip4_frags_ns_ctl_unregister(net);
 981        inet_frags_exit_net(&net->ipv4.frags, &ip4_frags);
 982}
 983
 984static struct pernet_operations ip4_frags_ops = {
 985        .init = ipv4_frags_init_net,
 986        .exit = ipv4_frags_exit_net,
 987};
 988
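     /* Boot-time initialization: register the sysctls and the per-netns
      * operations, then wire the IPv4-specific callbacks into the generic
      * inet_frag engine.
      */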
 989void __init ipfrag_init(void)
 990{
 991        ip4_frags_ctl_register();
 992        register_pernet_subsys(&ip4_frags_ops);
 993        ip4_frags.hashfn = ip4_hashfn;
 994        ip4_frags.constructor = ip4_frag_init;
 995        ip4_frags.destructor = ip4_frag_free;
 996        ip4_frags.skb_free = NULL;
 997        ip4_frags.qsize = sizeof(struct ipq);
 998        ip4_frags.match = ip4_frag_match;
 999        ip4_frags.frag_expire = ip_expire;
1000        ip4_frags.secret_interval = 10 * 60 * HZ;
1001        inet_frags_init(&ip4_frags);
1002}
1003