linux/net/core/sock_reuseport.c
/*
 * To speed up listener socket lookup, create an array to store all sockets
 * listening on the same port.  This allows a decision to be made after finding
 * the first socket.  An optional BPF program can also be configured for
 * selecting the socket index from the array of available sockets.
 */
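
/*
 * Example (illustrative userspace sketch, not part of this file): a reuseport
 * group comes into being when several sockets set SO_REUSEPORT and bind to
 * the same address and port, e.g. one listener per worker process.  Each such
 * socket ends up in the socks[] array managed below, and
 * reuseport_select_sock() picks one of them per incoming packet or connection.
 * The port number is an arbitrary example value.
 *
 *  #include <arpa/inet.h>
 *  #include <netinet/in.h>
 *  #include <sys/socket.h>
 *
 *  static int make_listener(void)
 *  {
 *          struct sockaddr_in addr = {
 *                  .sin_family = AF_INET,
 *                  .sin_port = htons(8080),
 *                  .sin_addr.s_addr = htonl(INADDR_ANY),
 *          };
 *          int one = 1;
 *          int fd = socket(AF_INET, SOCK_STREAM, 0);
 *
 *          if (fd < 0)
 *                  return -1;
 *          // Must be set on every socket before bind() for the kernel to
 *          // place them in the same reuseport group.
 *          setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(one));
 *          if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) ||
 *              listen(fd, SOMAXCONN))
 *                  return -1;
 *          return fd;
 *  }
 */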

#include <net/sock_reuseport.h>
#include <linux/bpf.h>
#include <linux/rcupdate.h>

#define INIT_SOCKS 128

static DEFINE_SPINLOCK(reuseport_lock);

static struct sock_reuseport *__reuseport_alloc(u16 max_socks)
{
        size_t size = sizeof(struct sock_reuseport) +
                      sizeof(struct sock *) * max_socks;
        struct sock_reuseport *reuse = kzalloc(size, GFP_ATOMIC);

        if (!reuse)
                return NULL;

        reuse->max_socks = max_socks;

        RCU_INIT_POINTER(reuse->prog, NULL);
        return reuse;
}

int reuseport_alloc(struct sock *sk)
{
        struct sock_reuseport *reuse;

        /* bh lock used since this function call may precede hlist lock in
         * soft irq of receive path or setsockopt from process context
         */
        spin_lock_bh(&reuseport_lock);
        WARN_ONCE(rcu_dereference_protected(sk->sk_reuseport_cb,
                                            lockdep_is_held(&reuseport_lock)),
                  "multiple allocations for the same socket");
        reuse = __reuseport_alloc(INIT_SOCKS);
        if (!reuse) {
                spin_unlock_bh(&reuseport_lock);
                return -ENOMEM;
        }

        reuse->socks[0] = sk;
        reuse->num_socks = 1;
        rcu_assign_pointer(sk->sk_reuseport_cb, reuse);

        spin_unlock_bh(&reuseport_lock);

        return 0;
}
EXPORT_SYMBOL(reuseport_alloc);

static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse)
{
        struct sock_reuseport *more_reuse;
        u32 more_socks_size, i;

        more_socks_size = reuse->max_socks * 2U;
        if (more_socks_size > U16_MAX)
                return NULL;

        more_reuse = __reuseport_alloc(more_socks_size);
        if (!more_reuse)
                return NULL;

        more_reuse->max_socks = more_socks_size;
        more_reuse->num_socks = reuse->num_socks;
        more_reuse->prog = reuse->prog;

        memcpy(more_reuse->socks, reuse->socks,
               reuse->num_socks * sizeof(struct sock *));

        for (i = 0; i < reuse->num_socks; ++i)
                rcu_assign_pointer(reuse->socks[i]->sk_reuseport_cb,
                                   more_reuse);

        /* Note: we use kfree_rcu here instead of reuseport_free_rcu so
         * that reuse and more_reuse can temporarily share a reference
         * to prog.
         */
        kfree_rcu(reuse, rcu);
        return more_reuse;
}

/**
 *  reuseport_add_sock - Add a socket to the reuseport group of another.
 *  @sk:  New socket to add to the group.
 *  @sk2: Socket belonging to the existing reuseport group.
 *  May return ENOMEM and not add socket to group under memory pressure.
 */
int reuseport_add_sock(struct sock *sk, struct sock *sk2)
{
        struct sock_reuseport *reuse;

        if (!rcu_access_pointer(sk2->sk_reuseport_cb)) {
                int err = reuseport_alloc(sk2);

                if (err)
                        return err;
        }

        spin_lock_bh(&reuseport_lock);
        reuse = rcu_dereference_protected(sk2->sk_reuseport_cb,
                                          lockdep_is_held(&reuseport_lock));
        WARN_ONCE(rcu_dereference_protected(sk->sk_reuseport_cb,
                                            lockdep_is_held(&reuseport_lock)),
                  "socket already in reuseport group");

        if (reuse->num_socks == reuse->max_socks) {
                reuse = reuseport_grow(reuse);
                if (!reuse) {
                        spin_unlock_bh(&reuseport_lock);
                        return -ENOMEM;
                }
        }

        reuse->socks[reuse->num_socks] = sk;
        /* paired with smp_rmb() in reuseport_select_sock() */
        smp_wmb();
        reuse->num_socks++;
        rcu_assign_pointer(sk->sk_reuseport_cb, reuse);

        spin_unlock_bh(&reuseport_lock);

        return 0;
}
EXPORT_SYMBOL(reuseport_add_sock);
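
/*
 * Example (hypothetical kernel-side caller, for illustration only; the real
 * call sites live in the protocol bind/hash code): when a new SO_REUSEPORT
 * socket binds to a port already owned by a compatible socket sk2, it joins
 * (and, on first use, implicitly creates) sk2's group:
 *
 *  static int example_join_group(struct sock *sk, struct sock *sk2)
 *  {
 *          // reuseport_add_sock() allocates sk2's group if needed and then
 *          // appends sk; on -ENOMEM the new socket is simply left ungrouped.
 *          return reuseport_add_sock(sk, sk2);
 *  }
 */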

static void reuseport_free_rcu(struct rcu_head *head)
{
        struct sock_reuseport *reuse;

        reuse = container_of(head, struct sock_reuseport, rcu);
        if (reuse->prog)
                bpf_prog_destroy(reuse->prog);
        kfree(reuse);
}

void reuseport_detach_sock(struct sock *sk)
{
        struct sock_reuseport *reuse;
        int i;

        spin_lock_bh(&reuseport_lock);
        reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
                                          lockdep_is_held(&reuseport_lock));
        rcu_assign_pointer(sk->sk_reuseport_cb, NULL);

        for (i = 0; i < reuse->num_socks; i++) {
                if (reuse->socks[i] == sk) {
                        reuse->socks[i] = reuse->socks[reuse->num_socks - 1];
                        reuse->num_socks--;
                        if (reuse->num_socks == 0)
                                call_rcu(&reuse->rcu, reuseport_free_rcu);
                        break;
                }
        }
        spin_unlock_bh(&reuseport_lock);
}
EXPORT_SYMBOL(reuseport_detach_sock);

static struct sock *run_bpf(struct sock_reuseport *reuse, u16 socks,
                            struct bpf_prog *prog, struct sk_buff *skb,
                            int hdr_len)
{
        struct sk_buff *nskb = NULL;
        u32 index;

        if (skb_shared(skb)) {
                nskb = skb_clone(skb, GFP_ATOMIC);
                if (!nskb)
                        return NULL;
                skb = nskb;
        }

        /* temporarily advance data past protocol header */
        if (!pskb_pull(skb, hdr_len)) {
                kfree_skb(nskb);
                return NULL;
        }
        index = bpf_prog_run_save_cb(prog, skb);
        __skb_push(skb, hdr_len);

        consume_skb(nskb);

        if (index >= socks)
                return NULL;

        return reuse->socks[index];
}

/**
 *  reuseport_select_sock - Select a socket from an SO_REUSEPORT group.
 *  @sk: First socket in the group.
 *  @hash: When no BPF filter is available, use this hash to select.
 *  @skb: skb to run through BPF filter.
 *  @hdr_len: BPF filter expects skb data pointer at payload data.  If
 *    the skb does not yet point at the payload, this parameter represents
 *    how far the pointer needs to advance to reach the payload.
 *  Returns a socket that should receive the packet (or NULL on error).
 */
struct sock *reuseport_select_sock(struct sock *sk,
                                   u32 hash,
                                   struct sk_buff *skb,
                                   int hdr_len)
{
        struct sock_reuseport *reuse;
        struct bpf_prog *prog;
        struct sock *sk2 = NULL;
        u16 socks;

        rcu_read_lock();
        reuse = rcu_dereference(sk->sk_reuseport_cb);

        /* if memory allocation failed or add call is not yet complete */
        if (!reuse)
                goto out;

        prog = rcu_dereference(reuse->prog);
        socks = READ_ONCE(reuse->num_socks);
        if (likely(socks)) {
                /* paired with smp_wmb() in reuseport_add_sock() */
                smp_rmb();

                if (prog && skb)
                        sk2 = run_bpf(reuse, socks, prog, skb, hdr_len);
                else
                        sk2 = reuse->socks[reciprocal_scale(hash, socks)];
        }

out:
        rcu_read_unlock();
        return sk2;
}
EXPORT_SYMBOL(reuseport_select_sock);
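
/*
 * Example (hypothetical receive-path caller, for illustration only; the real
 * call sites are the protocol lookup routines): once the first matching
 * listener has been found, the group answers which socket should actually
 * receive the packet.  A UDP-style caller would pass the transport header
 * length as hdr_len so an attached BPF program sees the payload:
 *
 *  static struct sock *example_lookup(struct sock *first_match, u32 flow_hash,
 *                                     struct sk_buff *skb)
 *  {
 *          struct sock *sk;
 *
 *          // Falls back to reciprocal_scale(flow_hash, num_socks) when no
 *          // BPF program is attached; returns NULL if the group is empty,
 *          // still being set up, or the program picked an invalid index.
 *          sk = reuseport_select_sock(first_match, flow_hash, skb,
 *                                     sizeof(struct udphdr));
 *          return sk ? sk : first_match;
 *  }
 */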

struct bpf_prog *
reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog)
{
        struct sock_reuseport *reuse;
        struct bpf_prog *old_prog;

        spin_lock_bh(&reuseport_lock);
        reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
                                          lockdep_is_held(&reuseport_lock));
        old_prog = rcu_dereference_protected(reuse->prog,
                                             lockdep_is_held(&reuseport_lock));
        rcu_assign_pointer(reuse->prog, prog);
        spin_unlock_bh(&reuseport_lock);

        return old_prog;
}
EXPORT_SYMBOL(reuseport_attach_prog);
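
/*
 * Example (illustrative userspace sketch, not part of this file; assumes
 * kernel headers that define SO_ATTACH_REUSEPORT_CBPF): reuseport_attach_prog()
 * is typically reached via setsockopt() on a member of the group.  The classic
 * BPF program's return value is used by run_bpf() above as the index into the
 * group's socks[] array; out-of-range indices make reuseport_select_sock()
 * return NULL.  This program spreads packets across sockets by the CPU that
 * received them:
 *
 *  #include <linux/filter.h>
 *  #include <sys/socket.h>
 *
 *  static int attach_cpu_balancer(int fd)
 *  {
 *          struct sock_filter code[] = {
 *                  // A = number of the CPU handling the packet
 *                  BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
 *                           SKF_AD_OFF + SKF_AD_CPU),
 *                  // return A as the socket index
 *                  BPF_STMT(BPF_RET | BPF_A, 0),
 *          };
 *          struct sock_fprog prog = {
 *                  .len = sizeof(code) / sizeof(code[0]),
 *                  .filter = code,
 *          };
 *
 *          return setsockopt(fd, SOL_SOCKET, SO_ATTACH_REUSEPORT_CBPF,
 *                            &prog, sizeof(prog));
 *  }
 */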