linux/net/ipv4/tcp_dctcp.c
<<
>>
Prefs
   1/* DataCenter TCP (DCTCP) congestion control.
   2 *
   3 * http://simula.stanford.edu/~alizade/Site/DCTCP.html
   4 *
   5 * This is an implementation of DCTCP over Reno, an enhancement to the
   6 * TCP congestion control algorithm designed for data centers. DCTCP
   7 * leverages Explicit Congestion Notification (ECN) in the network to
   8 * provide multi-bit feedback to the end hosts. DCTCP's goal is to meet
   9 * the following three data center transport requirements:
  10 *
  11 *  - High burst tolerance (incast due to partition/aggregate)
  12 *  - Low latency (short flows, queries)
  13 *  - High throughput (continuous data updates, large file transfers)
  14 *    with commodity shallow buffered switches
  15 *
  16 * The algorithm is described in detail in the following two papers:
  17 *
  18 * 1) Mohammad Alizadeh, Albert Greenberg, David A. Maltz, Jitendra Padhye,
  19 *    Parveen Patel, Balaji Prabhakar, Sudipta Sengupta, and Murari Sridharan:
  20 *      "Data Center TCP (DCTCP)", Data Center Networks session
  21 *      Proc. ACM SIGCOMM, New Delhi, 2010.
  22 *   http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp-final.pdf
  23 *
  24 * 2) Mohammad Alizadeh, Adel Javanmard, and Balaji Prabhakar:
  25 *      "Analysis of DCTCP: Stability, Convergence, and Fairness"
  26 *      Proc. ACM SIGMETRICS, San Jose, 2011.
  27 *   http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp_analysis-full.pdf
  28 *
  29 * Initial prototype from Abdul Kabbani, Masato Yasuda and Mohammad Alizadeh.
  30 *
  31 * Authors:
  32 *
  33 *      Daniel Borkmann <dborkman@redhat.com>
  34 *      Florian Westphal <fw@strlen.de>
  35 *      Glenn Judd <glenn.judd@morganstanley.com>
  36 *
  37 * This program is free software; you can redistribute it and/or modify
  38 * it under the terms of the GNU General Public License as published by
  39 * the Free Software Foundation; either version 2 of the License, or (at
  40 * your option) any later version.
  41 */
  42
  43#include <linux/module.h>
  44#include <linux/mm.h>
  45#include <net/tcp.h>
  46#include <linux/inet_diag.h>
  47
  48#define DCTCP_MAX_ALPHA 1024U
  49
  50struct dctcp {
  51        u32 acked_bytes_ecn;
  52        u32 acked_bytes_total;
  53        u32 prior_snd_una;
  54        u32 prior_rcv_nxt;
  55        u32 dctcp_alpha;
  56        u32 next_seq;
  57        u32 ce_state;
  58        u32 delayed_ack_reserved;
  59        u32 loss_cwnd;
  60};
  61
  62static unsigned int dctcp_shift_g __read_mostly = 4; /* g = 1/2^4 */
  63module_param(dctcp_shift_g, uint, 0644);
  64MODULE_PARM_DESC(dctcp_shift_g, "parameter g for updating dctcp_alpha");
  65
  66static unsigned int dctcp_alpha_on_init __read_mostly = DCTCP_MAX_ALPHA;
  67module_param(dctcp_alpha_on_init, uint, 0644);
  68MODULE_PARM_DESC(dctcp_alpha_on_init, "parameter for initial alpha value");
  69
  70static unsigned int dctcp_clamp_alpha_on_loss __read_mostly;
  71module_param(dctcp_clamp_alpha_on_loss, uint, 0644);
  72MODULE_PARM_DESC(dctcp_clamp_alpha_on_loss,
  73                 "parameter for clamping alpha on loss");
  74
  75static struct tcp_congestion_ops dctcp_reno;
  76
  77static void dctcp_reset(const struct tcp_sock *tp, struct dctcp *ca)
  78{
  79        ca->next_seq = tp->snd_nxt;
  80
  81        ca->acked_bytes_ecn = 0;
  82        ca->acked_bytes_total = 0;
  83}
  84
  85static void dctcp_init(struct sock *sk)
  86{
  87        const struct tcp_sock *tp = tcp_sk(sk);
  88
  89        if ((tp->ecn_flags & TCP_ECN_OK) ||
  90            (sk->sk_state == TCP_LISTEN ||
  91             sk->sk_state == TCP_CLOSE)) {
  92                struct dctcp *ca = inet_csk_ca(sk);
  93
  94                ca->prior_snd_una = tp->snd_una;
  95                ca->prior_rcv_nxt = tp->rcv_nxt;
  96
  97                ca->dctcp_alpha = min(dctcp_alpha_on_init, DCTCP_MAX_ALPHA);
  98
  99                ca->delayed_ack_reserved = 0;
 100                ca->loss_cwnd = 0;
 101                ca->ce_state = 0;
 102
 103                dctcp_reset(tp, ca);
 104                return;
 105        }
 106
 107        /* No ECN support? Fall back to Reno. Also need to clear
 108         * ECT from sk since it is set during 3WHS for DCTCP.
 109         */
 110        inet_csk(sk)->icsk_ca_ops = &dctcp_reno;
 111        INET_ECN_dontxmit(sk);
 112}
 113
 114static u32 dctcp_ssthresh(struct sock *sk)
 115{
 116        struct dctcp *ca = inet_csk_ca(sk);
 117        struct tcp_sock *tp = tcp_sk(sk);
 118
 119        ca->loss_cwnd = tp->snd_cwnd;
 120        return max(tp->snd_cwnd - ((tp->snd_cwnd * ca->dctcp_alpha) >> 11U), 2U);
 121}
 122
  123/* Minimal DCTCP CE state machine:
 124 *
 125 * S:   0 <- last pkt was non-CE
 126 *      1 <- last pkt was CE
 127 */
 128
 129static void dctcp_ce_state_0_to_1(struct sock *sk)
 130{
 131        struct dctcp *ca = inet_csk_ca(sk);
 132        struct tcp_sock *tp = tcp_sk(sk);
 133
 134        /* State has changed from CE=0 to CE=1 and delayed
 135         * ACK has not sent yet.
 136         */
 137        if (!ca->ce_state && ca->delayed_ack_reserved) {
 138                u32 tmp_rcv_nxt;
 139
 140                /* Save current rcv_nxt. */
 141                tmp_rcv_nxt = tp->rcv_nxt;
 142
 143                /* Generate previous ack with CE=0. */
 144                tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
 145                tp->rcv_nxt = ca->prior_rcv_nxt;
 146
 147                tcp_send_ack(sk);
 148
 149                /* Recover current rcv_nxt. */
 150                tp->rcv_nxt = tmp_rcv_nxt;
 151        }
 152
 153        ca->prior_rcv_nxt = tp->rcv_nxt;
 154        ca->ce_state = 1;
 155
 156        tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
 157}
 158
 159static void dctcp_ce_state_1_to_0(struct sock *sk)
 160{
 161        struct dctcp *ca = inet_csk_ca(sk);
 162        struct tcp_sock *tp = tcp_sk(sk);
 163
 164        /* State has changed from CE=1 to CE=0 and delayed
 165         * ACK has not sent yet.
 166         */
 167        if (ca->ce_state && ca->delayed_ack_reserved) {
 168                u32 tmp_rcv_nxt;
 169
 170                /* Save current rcv_nxt. */
 171                tmp_rcv_nxt = tp->rcv_nxt;
 172
 173                /* Generate previous ack with CE=1. */
 174                tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
 175                tp->rcv_nxt = ca->prior_rcv_nxt;
 176
 177                tcp_send_ack(sk);
 178
 179                /* Recover current rcv_nxt. */
 180                tp->rcv_nxt = tmp_rcv_nxt;
 181        }
 182
 183        ca->prior_rcv_nxt = tp->rcv_nxt;
 184        ca->ce_state = 0;
 185
 186        tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
 187}
 188
 189static void dctcp_update_alpha(struct sock *sk, u32 flags)
 190{
 191        const struct tcp_sock *tp = tcp_sk(sk);
 192        struct dctcp *ca = inet_csk_ca(sk);
 193        u32 acked_bytes = tp->snd_una - ca->prior_snd_una;
 194
 195        /* If ack did not advance snd_una, count dupack as MSS size.
 196         * If ack did update window, do not count it at all.
 197         */
 198        if (acked_bytes == 0 && !(flags & CA_ACK_WIN_UPDATE))
 199                acked_bytes = inet_csk(sk)->icsk_ack.rcv_mss;
 200        if (acked_bytes) {
 201                ca->acked_bytes_total += acked_bytes;
 202                ca->prior_snd_una = tp->snd_una;
 203
 204                if (flags & CA_ACK_ECE)
 205                        ca->acked_bytes_ecn += acked_bytes;
 206        }
 207
 208        /* Expired RTT */
 209        if (!before(tp->snd_una, ca->next_seq)) {
 210                u64 bytes_ecn = ca->acked_bytes_ecn;
 211                u32 alpha = ca->dctcp_alpha;
 212
 213                /* alpha = (1 - g) * alpha + g * F */
 214
 215                alpha -= min_not_zero(alpha, alpha >> dctcp_shift_g);
 216                if (bytes_ecn) {
 217                        /* If dctcp_shift_g == 1, a 32bit value would overflow
 218                         * after 8 Mbytes.
 219                         */
 220                        bytes_ecn <<= (10 - dctcp_shift_g);
 221                        do_div(bytes_ecn, max(1U, ca->acked_bytes_total));
 222
 223                        alpha = min(alpha + (u32)bytes_ecn, DCTCP_MAX_ALPHA);
 224                }
 225                /* dctcp_alpha can be read from dctcp_get_info() without
 226                 * synchro, so we ask compiler to not use dctcp_alpha
 227                 * as a temporary variable in prior operations.
 228                 */
 229                WRITE_ONCE(ca->dctcp_alpha, alpha);
 230                dctcp_reset(tp, ca);
 231        }
 232}
 233
 234static void dctcp_state(struct sock *sk, u8 new_state)
 235{
 236        if (dctcp_clamp_alpha_on_loss && new_state == TCP_CA_Loss) {
 237                struct dctcp *ca = inet_csk_ca(sk);
 238
 239                /* If this extension is enabled, we clamp dctcp_alpha to
 240                 * max on packet loss; the motivation is that dctcp_alpha
 241                 * is an indicator to the extend of congestion and packet
 242                 * loss is an indicator of extreme congestion; setting
 243                 * this in practice turned out to be beneficial, and
 244                 * effectively assumes total congestion which reduces the
 245                 * window by half.
 246                 */
 247                ca->dctcp_alpha = DCTCP_MAX_ALPHA;
 248        }
 249}
 250
 251static void dctcp_update_ack_reserved(struct sock *sk, enum tcp_ca_event ev)
 252{
 253        struct dctcp *ca = inet_csk_ca(sk);
 254
 255        switch (ev) {
 256        case CA_EVENT_DELAYED_ACK:
 257                if (!ca->delayed_ack_reserved)
 258                        ca->delayed_ack_reserved = 1;
 259                break;
 260        case CA_EVENT_NON_DELAYED_ACK:
 261                if (ca->delayed_ack_reserved)
 262                        ca->delayed_ack_reserved = 0;
 263                break;
 264        default:
 265                /* Don't care for the rest. */
 266                break;
 267        }
 268}
 269
 270static void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev)
 271{
 272        switch (ev) {
 273        case CA_EVENT_ECN_IS_CE:
 274                dctcp_ce_state_0_to_1(sk);
 275                break;
 276        case CA_EVENT_ECN_NO_CE:
 277                dctcp_ce_state_1_to_0(sk);
 278                break;
 279        case CA_EVENT_DELAYED_ACK:
 280        case CA_EVENT_NON_DELAYED_ACK:
 281                dctcp_update_ack_reserved(sk, ev);
 282                break;
 283        default:
 284                /* Don't care for the rest. */
 285                break;
 286        }
 287}
 288
 289static int dctcp_get_info(struct sock *sk, u32 ext, struct sk_buff *skb)
 290{
 291        const struct dctcp *ca = inet_csk_ca(sk);
 292
 293        /* Fill it also in case of VEGASINFO due to req struct limits.
 294         * We can still correctly retrieve it later.
 295         */
 296        if (ext & (1 << (INET_DIAG_DCTCPINFO - 1)) ||
 297            ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
 298                struct tcp_dctcp_info info;
 299
 300                memset(&info, 0, sizeof(info));
 301                if (inet_csk(sk)->icsk_ca_ops != &dctcp_reno) {
 302                        info.dctcp_enabled = 1;
 303                        info.dctcp_ce_state = (u16) ca->ce_state;
 304                        info.dctcp_alpha = ca->dctcp_alpha;
 305                        info.dctcp_ab_ecn = ca->acked_bytes_ecn;
 306                        info.dctcp_ab_tot = ca->acked_bytes_total;
 307                }
 308
 309                return nla_put(skb, INET_DIAG_DCTCPINFO, sizeof(info), &info);
 310        }
 311        return 0;
 312}
 313
 314static u32 dctcp_cwnd_undo(struct sock *sk)
 315{
 316        const struct dctcp *ca = inet_csk_ca(sk);
 317
 318        return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd);
 319}
 320
 321static struct tcp_congestion_ops dctcp __read_mostly = {
 322        .init           = dctcp_init,
 323        .in_ack_event   = dctcp_update_alpha,
 324        .cwnd_event     = dctcp_cwnd_event,
 325        .ssthresh       = dctcp_ssthresh,
 326        .cong_avoid     = tcp_reno_cong_avoid,
 327        .undo_cwnd      = dctcp_cwnd_undo,
 328        .set_state      = dctcp_state,
 329        .get_info       = dctcp_get_info,
 330        .flags          = TCP_CONG_NEEDS_ECN,
 331        .owner          = THIS_MODULE,
 332        .name           = "dctcp",
 333};
 334
 335static struct tcp_congestion_ops dctcp_reno __read_mostly = {
 336        .ssthresh       = tcp_reno_ssthresh,
 337        .cong_avoid     = tcp_reno_cong_avoid,
 338        .get_info       = dctcp_get_info,
 339        .owner          = THIS_MODULE,
 340        .name           = "dctcp-reno",
 341};
 342
 343static int __init dctcp_register(void)
 344{
 345        BUILD_BUG_ON(sizeof(struct dctcp) > ICSK_CA_PRIV_SIZE);
 346        return tcp_register_congestion_control(&dctcp);
 347}
 348
 349static void __exit dctcp_unregister(void)
 350{
 351        tcp_unregister_congestion_control(&dctcp);
 352}
 353
 354module_init(dctcp_register);
 355module_exit(dctcp_unregister);
 356
 357MODULE_AUTHOR("Daniel Borkmann <dborkman@redhat.com>");
 358MODULE_AUTHOR("Florian Westphal <fw@strlen.de>");
 359MODULE_AUTHOR("Glenn Judd <glenn.judd@morganstanley.com>");
 360
 361MODULE_LICENSE("GPL v2");
 362MODULE_DESCRIPTION("DataCenter TCP (DCTCP)");
 363