linux/net/ipv4/tcp_cong.c
/*
 * Pluggable TCP congestion control support and newReno
 * congestion control.
 * Based on ideas from I/O scheduler support and Web100.
 *
 * Copyright (C) 2005 Stephen Hemminger <shemminger@osdl.org>
 */

#define pr_fmt(fmt) "TCP: " fmt

#include <linux/module.h>
#include <linux/mm.h>
#include <linux/types.h>
#include <linux/list.h>
#include <linux/gfp.h>
#include <linux/jhash.h>
#include <net/tcp.h>

static DEFINE_SPINLOCK(tcp_cong_list_lock);
static LIST_HEAD(tcp_cong_list);

/* Simple linear search, don't expect many entries! */
static struct tcp_congestion_ops *tcp_ca_find(const char *name)
{
        struct tcp_congestion_ops *e;

        list_for_each_entry_rcu(e, &tcp_cong_list, list) {
                if (strcmp(e->name, name) == 0)
                        return e;
        }

        return NULL;
}

/* Must be called with rcu lock held */
static const struct tcp_congestion_ops *__tcp_ca_find_autoload(const char *name)
{
        const struct tcp_congestion_ops *ca = tcp_ca_find(name);
#ifdef CONFIG_MODULES
        if (!ca && capable(CAP_NET_ADMIN)) {
                rcu_read_unlock();
                request_module("tcp_%s", name);
                rcu_read_lock();
                ca = tcp_ca_find(name);
        }
#endif
        return ca;
}

/* Simple linear search, not much in here. */
struct tcp_congestion_ops *tcp_ca_find_key(u32 key)
{
        struct tcp_congestion_ops *e;

        list_for_each_entry_rcu(e, &tcp_cong_list, list) {
                if (e->key == key)
                        return e;
        }

        return NULL;
}

/*
 * Attach new congestion control algorithm to the list
 * of available options.
 */
int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
{
        int ret = 0;

        /* all algorithms must implement these */
        if (!ca->ssthresh || !ca->undo_cwnd ||
            !(ca->cong_avoid || ca->cong_control)) {
                pr_err("%s does not implement required ops\n", ca->name);
                return -EINVAL;
        }

        ca->key = jhash(ca->name, sizeof(ca->name), strlen(ca->name));

        spin_lock(&tcp_cong_list_lock);
        if (ca->key == TCP_CA_UNSPEC || tcp_ca_find_key(ca->key)) {
                pr_notice("%s already registered or non-unique key\n",
                          ca->name);
                ret = -EEXIST;
        } else {
                list_add_tail_rcu(&ca->list, &tcp_cong_list);
                pr_debug("%s registered\n", ca->name);
        }
        spin_unlock(&tcp_cong_list_lock);

        return ret;
}
EXPORT_SYMBOL_GPL(tcp_register_congestion_control);

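/* Example (illustrative sketch, not part of this file): a minimal module
 * fills in the mandatory ops and registers itself. The names "example",
 * tcp_example and tcp_example_register() are hypothetical; the reno
 * helpers are the ones exported further down in this file. A real module
 * would also call tcp_unregister_congestion_control() from module_exit().
 *
 *	static struct tcp_congestion_ops tcp_example __read_mostly = {
 *		.name		= "example",
 *		.owner		= THIS_MODULE,
 *		.ssthresh	= tcp_reno_ssthresh,
 *		.cong_avoid	= tcp_reno_cong_avoid,
 *		.undo_cwnd	= tcp_reno_undo_cwnd,
 *	};
 *
 *	static int __init tcp_example_register(void)
 *	{
 *		return tcp_register_congestion_control(&tcp_example);
 *	}
 *	module_init(tcp_example_register);
 */
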
/*
 * Remove congestion control algorithm, called from
 * the module's remove function.  Module ref counts are used
 * to ensure that this can't be done until all sockets using
 * that method are closed.
 */
void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca)
{
        spin_lock(&tcp_cong_list_lock);
        list_del_rcu(&ca->list);
        spin_unlock(&tcp_cong_list_lock);

        /* Wait for outstanding readers to complete before the
         * module gets removed entirely.
         *
         * A try_module_get() should fail by now, as our module is
         * in the "going" state: no references are held anymore and
         * the module_exit() handler is being called.
         */
        synchronize_rcu();
}
EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control);

u32 tcp_ca_get_key_by_name(const char *name, bool *ecn_ca)
{
        const struct tcp_congestion_ops *ca;
        u32 key = TCP_CA_UNSPEC;

        might_sleep();

        rcu_read_lock();
        ca = __tcp_ca_find_autoload(name);
        if (ca) {
                key = ca->key;
                *ecn_ca = ca->flags & TCP_CONG_NEEDS_ECN;
        }
        rcu_read_unlock();

        return key;
}
EXPORT_SYMBOL_GPL(tcp_ca_get_key_by_name);

char *tcp_ca_get_name_by_key(u32 key, char *buffer)
{
        const struct tcp_congestion_ops *ca;
        char *ret = NULL;

        rcu_read_lock();
        ca = tcp_ca_find_key(key);
        if (ca)
                ret = strncpy(buffer, ca->name,
                              TCP_CA_NAME_MAX);
        rcu_read_unlock();

        return ret;
}
EXPORT_SYMBOL_GPL(tcp_ca_get_name_by_key);

/* Assign choice of congestion control. */
void tcp_assign_congestion_control(struct sock *sk)
{
        struct inet_connection_sock *icsk = inet_csk(sk);
        struct tcp_congestion_ops *ca;

        rcu_read_lock();
        list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
                if (likely(try_module_get(ca->owner))) {
                        icsk->icsk_ca_ops = ca;
                        goto out;
                }
                /* Fall back to the next available algorithm. Reno,
                 * which is always registered, is the guaranteed
                 * last resort in this list.
                 */
        }
out:
        rcu_read_unlock();
        memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));

        if (ca->flags & TCP_CONG_NEEDS_ECN)
                INET_ECN_xmit(sk);
        else
                INET_ECN_dontxmit(sk);
}

void tcp_init_congestion_control(struct sock *sk)
{
        const struct inet_connection_sock *icsk = inet_csk(sk);

        tcp_sk(sk)->prior_ssthresh = 0;
        if (icsk->icsk_ca_ops->init)
                icsk->icsk_ca_ops->init(sk);
        if (tcp_ca_needs_ecn(sk))
                INET_ECN_xmit(sk);
        else
                INET_ECN_dontxmit(sk);
}

static void tcp_reinit_congestion_control(struct sock *sk,
                                          const struct tcp_congestion_ops *ca)
{
        struct inet_connection_sock *icsk = inet_csk(sk);

        tcp_cleanup_congestion_control(sk);
        icsk->icsk_ca_ops = ca;
        icsk->icsk_ca_setsockopt = 1;
        memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));

        if (sk->sk_state != TCP_CLOSE)
                tcp_init_congestion_control(sk);
}

/* Manage refcounts on socket close. */
void tcp_cleanup_congestion_control(struct sock *sk)
{
        struct inet_connection_sock *icsk = inet_csk(sk);

        if (icsk->icsk_ca_ops->release)
                icsk->icsk_ca_ops->release(sk);
        module_put(icsk->icsk_ca_ops->owner);
}

/* Used by sysctl to change default congestion control */
int tcp_set_default_congestion_control(const char *name)
{
        struct tcp_congestion_ops *ca;
        int ret = -ENOENT;

        spin_lock(&tcp_cong_list_lock);
        ca = tcp_ca_find(name);
#ifdef CONFIG_MODULES
        if (!ca && capable(CAP_NET_ADMIN)) {
                spin_unlock(&tcp_cong_list_lock);

                request_module("tcp_%s", name);
                spin_lock(&tcp_cong_list_lock);
                ca = tcp_ca_find(name);
        }
#endif

        if (ca) {
                ca->flags |= TCP_CONG_NON_RESTRICTED;   /* default is always allowed */
                list_move(&ca->list, &tcp_cong_list);
                ret = 0;
        }
        spin_unlock(&tcp_cong_list_lock);

        return ret;
}

/* Set default value from kernel configuration at bootup */
static int __init tcp_congestion_default(void)
{
        return tcp_set_default_congestion_control(CONFIG_DEFAULT_TCP_CONG);
}
late_initcall(tcp_congestion_default);

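/* Note: the default is simply the head of tcp_cong_list; list_move() above
 * moves the chosen algorithm to the front. At runtime the default is
 * normally changed through the sysctl, e.g. (illustrative shell usage,
 * not part of this file):
 *
 *	sysctl -w net.ipv4.tcp_congestion_control=reno
 */
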
/* Build string with list of available congestion control values */
void tcp_get_available_congestion_control(char *buf, size_t maxlen)
{
        struct tcp_congestion_ops *ca;
        size_t offs = 0;

        rcu_read_lock();
        list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
                offs += snprintf(buf + offs, maxlen - offs,
                                 "%s%s",
                                 offs == 0 ? "" : " ", ca->name);
        }
        rcu_read_unlock();
}

/* Get current default congestion control */
void tcp_get_default_congestion_control(char *name)
{
        struct tcp_congestion_ops *ca;
        /* We will always have reno... */
        BUG_ON(list_empty(&tcp_cong_list));

        rcu_read_lock();
        ca = list_entry(tcp_cong_list.next, struct tcp_congestion_ops, list);
        strncpy(name, ca->name, TCP_CA_NAME_MAX);
        rcu_read_unlock();
}

/* Build list of non-restricted congestion control values */
void tcp_get_allowed_congestion_control(char *buf, size_t maxlen)
{
        struct tcp_congestion_ops *ca;
        size_t offs = 0;

        *buf = '\0';
        rcu_read_lock();
        list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
                if (!(ca->flags & TCP_CONG_NON_RESTRICTED))
                        continue;
                offs += snprintf(buf + offs, maxlen - offs,
                                 "%s%s",
                                 offs == 0 ? "" : " ", ca->name);
        }
        rcu_read_unlock();
}

/* Change the list of non-restricted congestion controls */
int tcp_set_allowed_congestion_control(char *val)
{
        struct tcp_congestion_ops *ca;
        char *saved_clone, *clone, *name;
        int ret = 0;

        saved_clone = clone = kstrdup(val, GFP_USER);
        if (!clone)
                return -ENOMEM;

        spin_lock(&tcp_cong_list_lock);
        /* pass 1: check for bad entries */
        while ((name = strsep(&clone, " ")) && *name) {
                ca = tcp_ca_find(name);
                if (!ca) {
                        ret = -ENOENT;
                        goto out;
                }
        }

        /* pass 2: clear old values */
        list_for_each_entry_rcu(ca, &tcp_cong_list, list)
                ca->flags &= ~TCP_CONG_NON_RESTRICTED;

        /* pass 3: mark as allowed */
        while ((name = strsep(&val, " ")) && *name) {
                ca = tcp_ca_find(name);
                WARN_ON(!ca);
                if (ca)
                        ca->flags |= TCP_CONG_NON_RESTRICTED;
        }
out:
        spin_unlock(&tcp_cong_list_lock);
        kfree(saved_clone);

        return ret;
}

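/* Usage note (illustrative, not part of this file): the non-restricted set
 * is what unprivileged sockets may select via TCP_CONGESTION. It is exposed
 * through /proc/sys/net/ipv4/tcp_allowed_congestion_control, e.g.:
 *
 *	echo "reno cubic" > /proc/sys/net/ipv4/tcp_allowed_congestion_control
 */
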
/* Change congestion control for socket. If load is false, then it is the
 * responsibility of the caller to call tcp_init_congestion_control or
 * tcp_reinit_congestion_control (if the current congestion control was
 * already initialized).
 */
int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, bool reinit)
{
        struct inet_connection_sock *icsk = inet_csk(sk);
        const struct tcp_congestion_ops *ca;
        int err = 0;

        if (icsk->icsk_ca_dst_locked)
                return -EPERM;

        rcu_read_lock();
        if (!load)
                ca = tcp_ca_find(name);
        else
                ca = __tcp_ca_find_autoload(name);
        /* No change: the caller asked for the algorithm already in use */
        if (ca == icsk->icsk_ca_ops) {
                icsk->icsk_ca_setsockopt = 1;
                goto out;
        }
        if (!ca) {
                err = -ENOENT;
        } else if (!load) {
                const struct tcp_congestion_ops *old_ca = icsk->icsk_ca_ops;

                if (try_module_get(ca->owner)) {
                        if (reinit) {
                                tcp_reinit_congestion_control(sk, ca);
                        } else {
                                icsk->icsk_ca_ops = ca;
                                module_put(old_ca->owner);
                        }
                } else {
                        err = -EBUSY;
                }
        } else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) ||
                     ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))) {
                err = -EPERM;
        } else if (!try_module_get(ca->owner)) {
                err = -EBUSY;
        } else {
                tcp_reinit_congestion_control(sk, ca);
        }
 out:
        rcu_read_unlock();
        return err;
}

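/* Example (illustrative userspace sketch, not part of this file): an
 * application selects a congestion control with the TCP_CONGESTION socket
 * option, which reaches this function via do_tcp_setsockopt():
 *
 *	#include <sys/socket.h>
 *	#include <netinet/in.h>
 *	#include <netinet/tcp.h>
 *	#include <string.h>
 *
 *	int set_cc(int fd, const char *name)
 *	{
 *		return setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION,
 *				  name, strlen(name));
 *	}
 */
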
/* Slow start is used when the congestion window is no greater than the slow
 * start threshold. We follow RFC 2581 and also handle stretch ACKs properly.
 * We do not implement RFC 3465 Appropriate Byte Counting (ABC) per se but
 * something better;) a packet is only considered (s)acked in its entirety to
 * defend against the ACK attacks described in the RFC. Slow start processes
 * a stretch ACK of degree N as if N acks of degree 1 are received back to
 * back except ABC caps N to 2. Slow start exits when cwnd grows over
 * ssthresh and returns the leftover acks to adjust cwnd in congestion
 * avoidance mode.
 */
u32 tcp_slow_start(struct tcp_sock *tp, u32 acked)
{
        u32 cwnd = min(tp->snd_cwnd + acked, tp->snd_ssthresh);

        acked -= cwnd - tp->snd_cwnd;
        tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp);

        return acked;
}
EXPORT_SYMBOL_GPL(tcp_slow_start);

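/* Worked example (assumed numbers, snd_cwnd_clamp taken to be large): with
 * snd_cwnd = 8, snd_ssthresh = 10 and acked = 5, cwnd = min(8 + 5, 10) = 10,
 * so snd_cwnd becomes 10 and 5 - (10 - 8) = 3 leftover acks are returned for
 * the caller to feed into congestion avoidance.
 */
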
/* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd (or alternatively 1 / w)
 * for every packet that was ACKed.
 */
void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked)
{
        /* If credits accumulated at a higher w, apply them gently now. */
        if (tp->snd_cwnd_cnt >= w) {
                tp->snd_cwnd_cnt = 0;
                tp->snd_cwnd++;
        }

        tp->snd_cwnd_cnt += acked;
        if (tp->snd_cwnd_cnt >= w) {
                u32 delta = tp->snd_cwnd_cnt / w;

                tp->snd_cwnd_cnt -= delta * w;
                tp->snd_cwnd += delta;
        }
        tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_cwnd_clamp);
}
EXPORT_SYMBOL_GPL(tcp_cong_avoid_ai);

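/* Worked example (assumed numbers): with w = 10, snd_cwnd_cnt = 0 and
 * acked = 25, the counter reaches 25, delta = 25 / 10 = 2, so snd_cwnd grows
 * by 2 and snd_cwnd_cnt keeps the remaining 5 credits for later ACKs.
 */
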
/*
 * TCP Reno congestion control
 * This is a special case, used as the fallback as well.
 */
/* This is Jacobson's slow start and congestion avoidance.
 * SIGCOMM '88, p. 328.
 */
void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
        struct tcp_sock *tp = tcp_sk(sk);

        if (!tcp_is_cwnd_limited(sk))
                return;

        /* In "safe" area, increase. */
        if (tcp_in_slow_start(tp)) {
                acked = tcp_slow_start(tp, acked);
                if (!acked)
                        return;
        }
        /* In dangerous area, increase slowly. */
        tcp_cong_avoid_ai(tp, tp->snd_cwnd, acked);
}
EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid);

/* Slow start threshold is half the congestion window (min 2) */
u32 tcp_reno_ssthresh(struct sock *sk)
{
        const struct tcp_sock *tp = tcp_sk(sk);

        return max(tp->snd_cwnd >> 1U, 2U);
}
EXPORT_SYMBOL_GPL(tcp_reno_ssthresh);

u32 tcp_reno_undo_cwnd(struct sock *sk)
{
        const struct tcp_sock *tp = tcp_sk(sk);

        return max(tp->snd_cwnd, tp->prior_cwnd);
}
EXPORT_SYMBOL_GPL(tcp_reno_undo_cwnd);

struct tcp_congestion_ops tcp_reno = {
        .flags          = TCP_CONG_NON_RESTRICTED,
        .name           = "reno",
        .owner          = THIS_MODULE,
        .ssthresh       = tcp_reno_ssthresh,
        .cong_avoid     = tcp_reno_cong_avoid,
        .undo_cwnd      = tcp_reno_undo_cwnd,
};