linux/net/ipv4/inet_fragment.c
/*
 * inet fragments management
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 *
 *              Authors:        Pavel Emelyanov <xemul@openvz.org>
 *                              Started as consolidation of ipv4/ip_fragment.c,
 *                              ipv6/reassembly, and ipv6 nf conntrack reassembly
 */

#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/module.h>
#include <linux/timer.h>
#include <linux/mm.h>
#include <linux/random.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/slab.h>

#include <net/sock.h>
#include <net/inet_frag.h>
#include <net/inet_ecn.h>

#define INETFRAGS_EVICT_BUCKETS   128
#define INETFRAGS_EVICT_MAX       512

/* don't rebuild inetfrag table with new secret more often than this */
#define INETFRAGS_MIN_REBUILD_INTERVAL (5 * HZ)

/* Given the OR values of all fragments, apply RFC 3168 5.3 requirements
 * Value : 0xff if frame should be dropped.
 *         0 or INET_ECN_CE value, to be ORed in to final iph->tos field
 */
const u8 ip_frag_ecn_table[16] = {
        /* at least one fragment had CE, and others ECT_0 or ECT_1 */
        [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0]                      = INET_ECN_CE,
        [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1]                      = INET_ECN_CE,
        [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1]   = INET_ECN_CE,

        /* invalid combinations : drop frame */
        [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE] = 0xff,
        [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0] = 0xff,
        [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_1] = 0xff,
        [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff,
        [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0] = 0xff,
        [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1] = 0xff,
        [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff,
};
EXPORT_SYMBOL(ip_frag_ecn_table);

static unsigned int
inet_frag_hashfn(const struct inet_frags *f, const struct inet_frag_queue *q)
{
        return f->hashfn(q) & (INETFRAGS_HASHSZ - 1);
}

static bool inet_frag_may_rebuild(struct inet_frags *f)
{
        return time_after(jiffies,
               f->last_rebuild_jiffies + INETFRAGS_MIN_REBUILD_INTERVAL);
}

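/* Pick a new hash secret and move every queue to the bucket it now
 * hashes to.  Runs under the rnd_seqlock write lock so that lockless
 * readers of the hash seed retry their lookups.
 */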
static void inet_frag_secret_rebuild(struct inet_frags *f)
{
        int i;

        write_seqlock_bh(&f->rnd_seqlock);

        if (!inet_frag_may_rebuild(f))
                goto out;

        get_random_bytes(&f->rnd, sizeof(u32));

        for (i = 0; i < INETFRAGS_HASHSZ; i++) {
                struct inet_frag_bucket *hb;
                struct inet_frag_queue *q;
                struct hlist_node *n;

                hb = &f->hash[i];
                spin_lock(&hb->chain_lock);

                hlist_for_each_entry_safe(q, n, &hb->chain, list) {
                        unsigned int hval = inet_frag_hashfn(f, q);

                        if (hval != i) {
                                struct inet_frag_bucket *hb_dest;

                                hlist_del(&q->list);

                                /* Relink to new hash chain. */
                                hb_dest = &f->hash[hval];

                                /* This is the only place where we take
                                 * another chain_lock while already holding
                                 * one.  As this will not run concurrently,
                                 * we cannot deadlock on the hb_dest lock below;
                                 * if it is already locked, it will be released
                                 * soon, since the other caller cannot be waiting
                                 * for the hb lock that we've taken above.
                                 */
                                spin_lock_nested(&hb_dest->chain_lock,
                                                 SINGLE_DEPTH_NESTING);
                                hlist_add_head(&q->list, &hb_dest->chain);
                                spin_unlock(&hb_dest->chain_lock);
                        }
                }
                spin_unlock(&hb->chain_lock);
        }

        f->rebuild = false;
        f->last_rebuild_jiffies = jiffies;
out:
        write_sequnlock_bh(&f->rnd_seqlock);
}

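/* A queue may be evicted once its netns is at or above the low memory
 * threshold; low_thresh == 0 means the netns is being dismantled and
 * everything is evictable.
 */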
static bool inet_fragq_should_evict(const struct inet_frag_queue *q)
{
        return q->net->low_thresh == 0 ||
               frag_mem_limit(q->net) >= q->net->low_thresh;
}

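/* Collect the evictable queues of one bucket (those whose timer we manage
 * to stop) on a private list under the chain lock, then expire them with
 * the lock dropped.  Returns the number of queues handed to frag_expire.
 */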
static unsigned int
inet_evict_bucket(struct inet_frags *f, struct inet_frag_bucket *hb)
{
        struct inet_frag_queue *fq;
        struct hlist_node *n;
        unsigned int evicted = 0;
        HLIST_HEAD(expired);

        spin_lock(&hb->chain_lock);

        hlist_for_each_entry_safe(fq, n, &hb->chain, list) {
                if (!inet_fragq_should_evict(fq))
                        continue;

                if (!del_timer(&fq->timer))
                        continue;

                hlist_add_head(&fq->list_evictor, &expired);
                ++evicted;
        }

        spin_unlock(&hb->chain_lock);

        hlist_for_each_entry_safe(fq, n, &expired, list_evictor)
                f->frag_expire((unsigned long) fq);

        return evicted;
}

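/* Deferred eviction work: starting at next_bucket, scan up to
 * INETFRAGS_EVICT_BUCKETS buckets per run and stop early once more than
 * INETFRAGS_EVICT_MAX queues have been evicted.  Finally rebuild the
 * hash secret if a rebuild was requested and the rate limit allows it.
 */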
static void inet_frag_worker(struct work_struct *work)
{
        unsigned int budget = INETFRAGS_EVICT_BUCKETS;
        unsigned int i, evicted = 0;
        struct inet_frags *f;

        f = container_of(work, struct inet_frags, frags_work);

        BUILD_BUG_ON(INETFRAGS_EVICT_BUCKETS >= INETFRAGS_HASHSZ);

        local_bh_disable();

        for (i = ACCESS_ONCE(f->next_bucket); budget; --budget) {
                evicted += inet_evict_bucket(f, &f->hash[i]);
                i = (i + 1) & (INETFRAGS_HASHSZ - 1);
                if (evicted > INETFRAGS_EVICT_MAX)
                        break;
        }

        f->next_bucket = i;

        local_bh_enable();

        if (f->rebuild && inet_frag_may_rebuild(f))
                inet_frag_secret_rebuild(f);
}

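/* Kick the eviction worker unless a run is already pending. */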
static void inet_frag_schedule_worker(struct inet_frags *f)
{
        if (unlikely(!work_pending(&f->frags_work)))
                schedule_work(&f->frags_work);
}

int inet_frags_init(struct inet_frags *f)
{
        int i;

        INIT_WORK(&f->frags_work, inet_frag_worker);

        for (i = 0; i < INETFRAGS_HASHSZ; i++) {
                struct inet_frag_bucket *hb = &f->hash[i];

                spin_lock_init(&hb->chain_lock);
                INIT_HLIST_HEAD(&hb->chain);
        }

        seqlock_init(&f->rnd_seqlock);
        f->last_rebuild_jiffies = 0;
        f->frags_cachep = kmem_cache_create(f->frags_cache_name, f->qsize, 0, 0,
                                            NULL);
        if (!f->frags_cachep)
                return -ENOMEM;

        return 0;
}
EXPORT_SYMBOL(inet_frags_init);

void inet_frags_fini(struct inet_frags *f)
{
        cancel_work_sync(&f->frags_work);
        kmem_cache_destroy(f->frags_cachep);
}
EXPORT_SYMBOL(inet_frags_fini);

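/* Tear down a dying netns: setting low_thresh to 0 makes every queue
 * evictable, then all buckets are evicted repeatedly until no secret
 * rebuild raced with us and no fragment memory remains accounted.
 */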
void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f)
{
        unsigned int seq;
        int i;

        nf->low_thresh = 0;

evict_again:
        local_bh_disable();
        seq = read_seqbegin(&f->rnd_seqlock);

        for (i = 0; i < INETFRAGS_HASHSZ; i++)
                inet_evict_bucket(f, &f->hash[i]);

        local_bh_enable();
        cond_resched();

        if (read_seqretry(&f->rnd_seqlock, seq) ||
            percpu_counter_sum(&nf->mem))
                goto evict_again;

        percpu_counter_destroy(&nf->mem);
}
EXPORT_SYMBOL(inet_frags_exit_net);

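/* Return the bucket that fq currently hashes to, with its chain lock
 * held.  The seqlock read section detects a concurrent secret rebuild,
 * in which case the lookup is retried against the new seed.
 */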
static struct inet_frag_bucket *
get_frag_bucket_locked(struct inet_frag_queue *fq, struct inet_frags *f)
__acquires(hb->chain_lock)
{
        struct inet_frag_bucket *hb;
        unsigned int seq, hash;

 restart:
        seq = read_seqbegin(&f->rnd_seqlock);

        hash = inet_frag_hashfn(f, fq);
        hb = &f->hash[hash];

        spin_lock(&hb->chain_lock);
        if (read_seqretry(&f->rnd_seqlock, seq)) {
                spin_unlock(&hb->chain_lock);
                goto restart;
        }

        return hb;
}

static inline void fq_unlink(struct inet_frag_queue *fq, struct inet_frags *f)
{
        struct inet_frag_bucket *hb;

        hb = get_frag_bucket_locked(fq, f);
        hlist_del(&fq->list);
        fq->flags |= INET_FRAG_COMPLETE;
        spin_unlock(&hb->chain_lock);
}

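/* Stop the expiry timer and unlink the queue from its hash bucket,
 * dropping the reference each of them held.
 */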
void inet_frag_kill(struct inet_frag_queue *fq, struct inet_frags *f)
{
        if (del_timer(&fq->timer))
                atomic_dec(&fq->refcnt);

        if (!(fq->flags & INET_FRAG_COMPLETE)) {
                fq_unlink(fq, f);
                atomic_dec(&fq->refcnt);
        }
}
EXPORT_SYMBOL(inet_frag_kill);

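/* Final teardown once the last reference is gone: free every fragment
 * skb and the queue object itself, then return the accounted memory
 * (fragment truesizes plus the queue size) to the netns.
 */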
void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f)
{
        struct sk_buff *fp;
        struct netns_frags *nf;
        unsigned int sum, sum_truesize = 0;

        WARN_ON(!(q->flags & INET_FRAG_COMPLETE));
        WARN_ON(del_timer(&q->timer) != 0);

        /* Release all fragment data. */
        fp = q->fragments;
        nf = q->net;
        while (fp) {
                struct sk_buff *xp = fp->next;

                sum_truesize += fp->truesize;
                kfree_skb(fp);
                fp = xp;
        }
        sum = sum_truesize + f->qsize;

        if (f->destructor)
                f->destructor(q);
        kmem_cache_free(f->frags_cachep, q);

        sub_frag_mem_limit(nf, sum);
}
EXPORT_SYMBOL(inet_frag_destroy);

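/* Insert a freshly allocated queue into its hash bucket.  If another CPU
 * raced us and already created a matching entry, drop ours and return
 * the existing queue instead.
 */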
static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf,
                                                struct inet_frag_queue *qp_in,
                                                struct inet_frags *f,
                                                void *arg)
{
        struct inet_frag_bucket *hb = get_frag_bucket_locked(qp_in, f);
        struct inet_frag_queue *qp;

#ifdef CONFIG_SMP
        /* On SMP we have to recheck the hash table, because such an
         * entry could have been created on another cpu before we
         * acquired the hash bucket lock.
         */
        hlist_for_each_entry(qp, &hb->chain, list) {
                if (qp->net == nf && f->match(qp, arg)) {
                        atomic_inc(&qp->refcnt);
                        spin_unlock(&hb->chain_lock);
                        qp_in->flags |= INET_FRAG_COMPLETE;
                        inet_frag_put(qp_in, f);
                        return qp;
                }
        }
#endif
        qp = qp_in;
        if (!mod_timer(&qp->timer, jiffies + nf->timeout))
                atomic_inc(&qp->refcnt);

        atomic_inc(&qp->refcnt);
        hlist_add_head(&qp->list, &hb->chain);

        spin_unlock(&hb->chain_lock);

        return qp;
}

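/* Allocate and initialise a new queue, unless this netns is already over
 * its high memory threshold, in which case the eviction worker is kicked
 * and NULL is returned.
 */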
static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
                                               struct inet_frags *f,
                                               void *arg)
{
        struct inet_frag_queue *q;

        if (!nf->high_thresh || frag_mem_limit(nf) > nf->high_thresh) {
                inet_frag_schedule_worker(f);
                return NULL;
        }

        q = kmem_cache_zalloc(f->frags_cachep, GFP_ATOMIC);
        if (!q)
                return NULL;

        q->net = nf;
        f->constructor(q, arg);
        add_frag_mem_limit(nf, f->qsize);

        setup_timer(&q->timer, f->frag_expire, (unsigned long)q);
        spin_lock_init(&q->lock);
        atomic_set(&q->refcnt, 1);

        return q;
}

static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf,
                                                struct inet_frags *f,
                                                void *arg)
{
        struct inet_frag_queue *q;

        q = inet_frag_alloc(nf, f, arg);
        if (!q)
                return NULL;

        return inet_frag_intern(nf, q, f, arg);
}

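/* Look up the queue matching @key in bucket @hash, taking a reference,
 * or create one if none exists.  If the chain has grown past
 * INETFRAGS_MAXDEPTH (a sign of hash collisions or an attack), return
 * ERR_PTR(-ENOBUFS) and request a secret rebuild instead.
 */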
struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
                                       struct inet_frags *f, void *key,
                                       unsigned int hash)
{
        struct inet_frag_bucket *hb;
        struct inet_frag_queue *q;
        int depth = 0;

        if (frag_mem_limit(nf) > nf->low_thresh)
                inet_frag_schedule_worker(f);

        hash &= (INETFRAGS_HASHSZ - 1);
        hb = &f->hash[hash];

        spin_lock(&hb->chain_lock);
        hlist_for_each_entry(q, &hb->chain, list) {
                if (q->net == nf && f->match(q, key)) {
                        atomic_inc(&q->refcnt);
                        spin_unlock(&hb->chain_lock);
                        return q;
                }
                depth++;
        }
        spin_unlock(&hb->chain_lock);

        if (depth <= INETFRAGS_MAXDEPTH)
                return inet_frag_create(nf, f, key);

        if (inet_frag_may_rebuild(f)) {
                if (!f->rebuild)
                        f->rebuild = true;
                inet_frag_schedule_worker(f);
        }

        return ERR_PTR(-ENOBUFS);
}
EXPORT_SYMBOL(inet_frag_find);

void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q,
                                   const char *prefix)
{
        static const char msg[] = "inet_frag_find: Fragment hash bucket"
                " list length grew over limit " __stringify(INETFRAGS_MAXDEPTH)
                ". Dropping fragment.\n";

        if (PTR_ERR(q) == -ENOBUFS)
                net_dbg_ratelimited("%s%s", prefix, msg);
}
EXPORT_SYMBOL(inet_frag_maybe_warn_overflow);