linux/net/core/sock_reuseport.c
// SPDX-License-Identifier: GPL-2.0
/*
 * To speed up listener socket lookup, create an array to store all sockets
 * listening on the same port.  This allows a decision to be made after finding
 * the first socket.  An optional BPF program can also be configured for
 * selecting the socket index from the array of available sockets.
 */

#include <net/sock_reuseport.h>
#include <linux/bpf.h>
#include <linux/idr.h>
#include <linux/filter.h>
#include <linux/rcupdate.h>

#define INIT_SOCKS 128

DEFINE_SPINLOCK(reuseport_lock);

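/* ID space for reuseport groups; a group keeps its reuseport_id for its whole
 * lifetime, even when the socks[] array is grown.
 */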
static DEFINE_IDA(reuseport_ida);

static struct sock_reuseport *__reuseport_alloc(unsigned int max_socks)
{
        unsigned int size = sizeof(struct sock_reuseport) +
                      sizeof(struct sock *) * max_socks;
        struct sock_reuseport *reuse = kzalloc(size, GFP_ATOMIC);

        if (!reuse)
                return NULL;

        reuse->max_socks = max_socks;

        RCU_INIT_POINTER(reuse->prog, NULL);
        return reuse;
}

int reuseport_alloc(struct sock *sk, bool bind_inany)
{
        struct sock_reuseport *reuse;
        int id, ret = 0;

        /* bh lock used since this function call may precede hlist lock in
         * soft irq of receive path or setsockopt from process context
         */
        spin_lock_bh(&reuseport_lock);

        /* Allocation attempts can occur concurrently via the setsockopt path
         * and the bind/hash path.  Nothing to do when we lose the race.
         */
        reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
                                          lockdep_is_held(&reuseport_lock));
        if (reuse) {
                /* Only set reuse->bind_inany if bind_inany is true.
                 * Otherwise, we would overwrite a reuse->bind_inany
                 * that was already set by the bind/hash path.
                 */
                if (bind_inany)
                        reuse->bind_inany = bind_inany;
                goto out;
        }

        reuse = __reuseport_alloc(INIT_SOCKS);
        if (!reuse) {
                ret = -ENOMEM;
                goto out;
        }

        id = ida_alloc(&reuseport_ida, GFP_ATOMIC);
        if (id < 0) {
                kfree(reuse);
                ret = id;
                goto out;
        }

        reuse->reuseport_id = id;
        reuse->socks[0] = sk;
        reuse->num_socks = 1;
        reuse->bind_inany = bind_inany;
        rcu_assign_pointer(sk->sk_reuseport_cb, reuse);

out:
        spin_unlock_bh(&reuseport_lock);

        return ret;
}
EXPORT_SYMBOL(reuseport_alloc);

static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse)
{
        struct sock_reuseport *more_reuse;
        u32 more_socks_size, i;

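        /* max_socks/num_socks are u16, so a group holds at most U16_MAX sockets */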
        more_socks_size = reuse->max_socks * 2U;
        if (more_socks_size > U16_MAX)
                return NULL;

        more_reuse = __reuseport_alloc(more_socks_size);
        if (!more_reuse)
                return NULL;

        more_reuse->num_socks = reuse->num_socks;
        more_reuse->prog = reuse->prog;
        more_reuse->reuseport_id = reuse->reuseport_id;
        more_reuse->bind_inany = reuse->bind_inany;
        more_reuse->has_conns = reuse->has_conns;

        memcpy(more_reuse->socks, reuse->socks,
               reuse->num_socks * sizeof(struct sock *));
        more_reuse->synq_overflow_ts = READ_ONCE(reuse->synq_overflow_ts);

        for (i = 0; i < reuse->num_socks; ++i)
                rcu_assign_pointer(reuse->socks[i]->sk_reuseport_cb,
                                   more_reuse);

        /* Note: we use kfree_rcu here instead of reuseport_free_rcu so
         * that reuse and more_reuse can temporarily share a reference
         * to prog.
         */
        kfree_rcu(reuse, rcu);
        return more_reuse;
}

static void reuseport_free_rcu(struct rcu_head *head)
{
        struct sock_reuseport *reuse;

        reuse = container_of(head, struct sock_reuseport, rcu);
        sk_reuseport_prog_free(rcu_dereference_protected(reuse->prog, 1));
        ida_free(&reuseport_ida, reuse->reuseport_id);
        kfree(reuse);
}

/**
 *  reuseport_add_sock - Add a socket to the reuseport group of another.
 *  @sk:  New socket to add to the group.
 *  @sk2: Socket belonging to the existing reuseport group.
 *  @bind_inany: Whether or not the group is bound to a local INANY address.
 *
 *  May return ENOMEM and not add socket to group under memory pressure.
 */
int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool bind_inany)
{
        struct sock_reuseport *old_reuse, *reuse;

        if (!rcu_access_pointer(sk2->sk_reuseport_cb)) {
                int err = reuseport_alloc(sk2, bind_inany);

                if (err)
                        return err;
        }

        spin_lock_bh(&reuseport_lock);
        reuse = rcu_dereference_protected(sk2->sk_reuseport_cb,
                                          lockdep_is_held(&reuseport_lock));
        old_reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
                                             lockdep_is_held(&reuseport_lock));
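        /* sk may already have a group of its own, e.g. when a BPF program
         * was attached before the socket was hashed.  That old group is
         * discarded below, but only while sk is its lone member.
         */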
        if (old_reuse && old_reuse->num_socks != 1) {
                spin_unlock_bh(&reuseport_lock);
                return -EBUSY;
        }

        if (reuse->num_socks == reuse->max_socks) {
                reuse = reuseport_grow(reuse);
                if (!reuse) {
                        spin_unlock_bh(&reuseport_lock);
                        return -ENOMEM;
                }
        }

        reuse->socks[reuse->num_socks] = sk;
        /* paired with smp_rmb() in reuseport_select_sock() */
        smp_wmb();
        reuse->num_socks++;
        rcu_assign_pointer(sk->sk_reuseport_cb, reuse);

        spin_unlock_bh(&reuseport_lock);

        if (old_reuse)
                call_rcu(&old_reuse->rcu, reuseport_free_rcu);
        return 0;
}
EXPORT_SYMBOL(reuseport_add_sock);

void reuseport_detach_sock(struct sock *sk)
{
        struct sock_reuseport *reuse;
        int i;

        spin_lock_bh(&reuseport_lock);
        reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
                                          lockdep_is_held(&reuseport_lock));

        /* Notify the bpf side. The sk may be added to a sockarray
         * map. If so, sockarray logic will remove it from the map.
         *
         * Other bpf map types that work with reuseport, like sockmap,
         * don't need an explicit callback from here. They override sk
         * unhash/close ops to remove the sk from the map before we
         * get to this point.
         */
        bpf_sk_reuseport_detach(sk);

        rcu_assign_pointer(sk->sk_reuseport_cb, NULL);

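        /* Remove sk from socks[] by moving the last entry into its slot;
         * free the whole group once it becomes empty.
         */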
        for (i = 0; i < reuse->num_socks; i++) {
                if (reuse->socks[i] == sk) {
                        reuse->socks[i] = reuse->socks[reuse->num_socks - 1];
                        reuse->num_socks--;
                        if (reuse->num_socks == 0)
                                call_rcu(&reuse->rcu, reuseport_free_rcu);
                        break;
                }
        }
        spin_unlock_bh(&reuseport_lock);
}
EXPORT_SYMBOL(reuseport_detach_sock);

static struct sock *run_bpf_filter(struct sock_reuseport *reuse, u16 socks,
                                   struct bpf_prog *prog, struct sk_buff *skb,
                                   int hdr_len)
{
        struct sk_buff *nskb = NULL;
        u32 index;

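        /* Run on a private clone when the skb is shared, since the
         * pskb_pull()/__skb_push() below modify the skb.
         */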
        if (skb_shared(skb)) {
                nskb = skb_clone(skb, GFP_ATOMIC);
                if (!nskb)
                        return NULL;
                skb = nskb;
        }

        /* temporarily advance data past protocol header */
        if (!pskb_pull(skb, hdr_len)) {
                kfree_skb(nskb);
                return NULL;
        }
        index = bpf_prog_run_save_cb(prog, skb);
        __skb_push(skb, hdr_len);

        consume_skb(nskb);

        if (index >= socks)
                return NULL;

        return reuse->socks[index];
}

/**
 *  reuseport_select_sock - Select a socket from an SO_REUSEPORT group.
 *  @sk: First socket in the group.
 *  @hash: When no BPF filter is available, use this hash to select.
 *  @skb: skb to run through BPF filter.
 *  @hdr_len: BPF filter expects skb data pointer at payload data.  If
 *    the skb does not yet point at the payload, this parameter represents
 *    how far the pointer needs to advance to reach the payload.
 *  Returns a socket that should receive the packet (or NULL on error).
 */
struct sock *reuseport_select_sock(struct sock *sk,
                                   u32 hash,
                                   struct sk_buff *skb,
                                   int hdr_len)
{
        struct sock_reuseport *reuse;
        struct bpf_prog *prog;
        struct sock *sk2 = NULL;
        u16 socks;

        rcu_read_lock();
        reuse = rcu_dereference(sk->sk_reuseport_cb);

        /* if memory allocation failed or add call is not yet complete */
        if (!reuse)
                goto out;

        prog = rcu_dereference(reuse->prog);
        socks = READ_ONCE(reuse->num_socks);
        if (likely(socks)) {
                /* paired with smp_wmb() in reuseport_add_sock() */
                smp_rmb();

                if (!prog || !skb)
                        goto select_by_hash;

                if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT)
                        sk2 = bpf_run_sk_reuseport(reuse, sk, prog, skb, hash);
                else
                        sk2 = run_bpf_filter(reuse, socks, prog, skb, hdr_len);

select_by_hash:
                /* no bpf or invalid bpf result: fall back to hash usage */
                if (!sk2) {
                        int i, j;

                        i = j = reciprocal_scale(hash, socks);
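                        /* Skip connected sockets (sk_state == TCP_ESTABLISHED,
                         * e.g. connected UDP sockets); spread packets only
                         * across the unconnected members of the group.
                         */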
                        while (reuse->socks[i]->sk_state == TCP_ESTABLISHED) {
                                i++;
                                if (i >= socks)
                                        i = 0;
                                if (i == j)
                                        goto out;
                        }
                        sk2 = reuse->socks[i];
                }
        }

out:
        rcu_read_unlock();
        return sk2;
}
EXPORT_SYMBOL(reuseport_select_sock);

int reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog)
{
        struct sock_reuseport *reuse;
        struct bpf_prog *old_prog;

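        /* A not-yet-hashed socket with SO_REUSEPORT set has no group yet;
         * create one now so the program has a group to attach to.
         */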
        if (sk_unhashed(sk) && sk->sk_reuseport) {
                int err = reuseport_alloc(sk, false);

                if (err)
                        return err;
        } else if (!rcu_access_pointer(sk->sk_reuseport_cb)) {
                /* The socket wasn't bound with SO_REUSEPORT */
                return -EINVAL;
        }

        spin_lock_bh(&reuseport_lock);
        reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
                                          lockdep_is_held(&reuseport_lock));
        old_prog = rcu_dereference_protected(reuse->prog,
                                             lockdep_is_held(&reuseport_lock));
        rcu_assign_pointer(reuse->prog, prog);
        spin_unlock_bh(&reuseport_lock);

        sk_reuseport_prog_free(old_prog);
        return 0;
}
EXPORT_SYMBOL(reuseport_attach_prog);

int reuseport_detach_prog(struct sock *sk)
{
        struct sock_reuseport *reuse;
        struct bpf_prog *old_prog;

        if (!rcu_access_pointer(sk->sk_reuseport_cb))
                return sk->sk_reuseport ? -ENOENT : -EINVAL;

        old_prog = NULL;
        spin_lock_bh(&reuseport_lock);
        reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
                                          lockdep_is_held(&reuseport_lock));
        old_prog = rcu_replace_pointer(reuse->prog, old_prog,
                                       lockdep_is_held(&reuseport_lock));
        spin_unlock_bh(&reuseport_lock);

        if (!old_prog)
                return -ENOENT;

        sk_reuseport_prog_free(old_prog);
        return 0;
}
EXPORT_SYMBOL(reuseport_detach_prog);