linux/net/mptcp/subflow.c
// SPDX-License-Identifier: GPL-2.0
/* Multipath TCP
 *
 * Copyright (c) 2017 - 2019, Intel Corporation.
 */

#define pr_fmt(fmt) "MPTCP: " fmt

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/netdevice.h>
#include <net/sock.h>
#include <net/inet_common.h>
#include <net/inet_hashtables.h>
#include <net/protocol.h>
#include <net/tcp.h>
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
#include <net/ip6_route.h>
#endif
#include <net/mptcp.h>
#include "protocol.h"

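/* Hook for icsk_af_ops->rebuild_header (installed in mptcp_subflow_init()
 * below): allocate the MPTCP connection token for an outgoing MP_CAPABLE
 * subflow, then hand off to the underlying TCP rebuild_header().
 */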
static int subflow_rebuild_header(struct sock *sk)
{
        struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
        int err = 0;

        if (subflow->request_mptcp && !subflow->token) {
                pr_debug("subflow=%p", sk);
                err = mptcp_token_new_connect(sk);
        }

        if (err)
                return err;

        return subflow->icsk_af_ops->rebuild_header(sk);
}

static void subflow_req_destructor(struct request_sock *req)
{
        struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);

        pr_debug("subflow_req=%p", subflow_req);

        if (subflow_req->mp_capable)
                mptcp_token_destroy_request(subflow_req->token);
        tcp_request_sock_ops.destructor(req);
}

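/* Initialize the MPTCP-specific state of a request socket: parse the MPTCP
 * options carried by the incoming SYN and, if both the peer and the
 * listener requested MPTCP, reserve a token for the nascent connection.
 */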
static void subflow_init_req(struct request_sock *req,
                             const struct sock *sk_listener,
                             struct sk_buff *skb)
{
        struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener);
        struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
        struct tcp_options_received rx_opt;

        pr_debug("subflow_req=%p, listener=%p", subflow_req, listener);

        memset(&rx_opt.mptcp, 0, sizeof(rx_opt.mptcp));
        mptcp_get_options(skb, &rx_opt);

        subflow_req->mp_capable = 0;
        subflow_req->remote_key_valid = 0;

#ifdef CONFIG_TCP_MD5SIG
        /* no MPTCP if MD5SIG is enabled on this socket or we may run out of
         * TCP option space.
         */
        if (rcu_access_pointer(tcp_sk(sk_listener)->md5sig_info))
                return;
#endif

        if (rx_opt.mptcp.mp_capable && listener->request_mptcp) {
                int err;

                err = mptcp_token_new_request(req);
                if (err == 0)
                        subflow_req->mp_capable = 1;

                subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq;
        }
}

static void subflow_v4_init_req(struct request_sock *req,
                                const struct sock *sk_listener,
                                struct sk_buff *skb)
{
        tcp_rsk(req)->is_mptcp = 1;

        tcp_request_sock_ipv4_ops.init_req(req, sk_listener, skb);

        subflow_init_req(req, sk_listener, skb);
}

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
static void subflow_v6_init_req(struct request_sock *req,
                                const struct sock *sk_listener,
                                struct sk_buff *skb)
{
        tcp_rsk(req)->is_mptcp = 1;

        tcp_request_sock_ipv6_ops.init_req(req, sk_listener, skb);

        subflow_init_req(req, sk_listener, skb);
}
#endif

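/* Installed as the sk_rx_dst_set callback (see mptcp_subflow_init() below),
 * so this runs while processing the SYN-ACK of an outgoing subflow: set the
 * rx dst as plain TCP would, then complete the MPTCP-level handshake and
 * record the subflow sequence offset.
 */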
static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
{
        struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);

        subflow->icsk_af_ops->sk_rx_dst_set(sk, skb);

        if (subflow->conn && !subflow->conn_finished) {
                pr_debug("subflow=%p, remote_key=%llu", mptcp_subflow_ctx(sk),
                         subflow->remote_key);
                mptcp_finish_connect(sk);
                subflow->conn_finished = 1;

                if (skb) {
                        pr_debug("synack seq=%u", TCP_SKB_CB(skb)->seq);
                        subflow->ssn_offset = TCP_SKB_CB(skb)->seq;
                }
        }
}

static struct request_sock_ops subflow_request_sock_ops;
static struct tcp_request_sock_ops subflow_request_sock_ipv4_ops;

static int subflow_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
        struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);

        pr_debug("subflow=%p", subflow);

        /* Never answer SYNs sent to broadcast or multicast addresses */
        if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
                goto drop;

        return tcp_conn_request(&subflow_request_sock_ops,
                                &subflow_request_sock_ipv4_ops,
                                sk, skb);
drop:
        tcp_listendrop(sk);
        return 0;
}

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
static struct tcp_request_sock_ops subflow_request_sock_ipv6_ops;
static struct inet_connection_sock_af_ops subflow_v6_specific;
static struct inet_connection_sock_af_ops subflow_v6m_specific;

static int subflow_v6_conn_request(struct sock *sk, struct sk_buff *skb)
{
        struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);

        pr_debug("subflow=%p", subflow);

        if (skb->protocol == htons(ETH_P_IP))
                return subflow_v4_conn_request(sk, skb);

        if (!ipv6_unicast_destination(skb))
                goto drop;

        return tcp_conn_request(&subflow_request_sock_ops,
                                &subflow_request_sock_ipv6_ops, sk, skb);

drop:
        tcp_listendrop(sk);
        return 0; /* don't send reset */
}
#endif

static struct sock *subflow_syn_recv_sock(const struct sock *sk,
                                          struct sk_buff *skb,
                                          struct request_sock *req,
                                          struct dst_entry *dst,
                                          struct request_sock *req_unhash,
                                          bool *own_req)
{
        struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk);
        struct mptcp_subflow_request_sock *subflow_req;
        struct tcp_options_received opt_rx;
        struct sock *child;

        pr_debug("listener=%p, req=%p, conn=%p", listener, req, listener->conn);

        if (tcp_rsk(req)->is_mptcp == 0)
                goto create_child;

        /* if the sk is MP_CAPABLE, we try to fetch the client key */
        subflow_req = mptcp_subflow_rsk(req);
        if (subflow_req->mp_capable) {
                if (TCP_SKB_CB(skb)->seq != subflow_req->ssn_offset + 1) {
                        /* here we can receive and accept an in-window,
                         * out-of-order packet, which will not carry the
                         * MP_CAPABLE option even on MPTCP-enabled paths
                         */
                        goto create_child;
                }

                opt_rx.mptcp.mp_capable = 0;
                mptcp_get_options(skb, &opt_rx);
                if (opt_rx.mptcp.mp_capable) {
                        subflow_req->remote_key = opt_rx.mptcp.sndr_key;
                        subflow_req->remote_key_valid = 1;
                } else {
                        subflow_req->mp_capable = 0;
                }
        }

create_child:
        child = listener->icsk_af_ops->syn_recv_sock(sk, skb, req, dst,
                                                     req_unhash, own_req);

        if (child && *own_req) {
                struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(child);

                /* ctx is NULL on TCP fallback; that is not fatal during
                 * the MPC handshake
                 */
                if (!ctx)
                        return child;

                if (ctx->mp_capable) {
                        if (mptcp_token_new_accept(ctx->token))
                                goto close_child;
                }
        }

        return child;

close_child:
        pr_debug("closing child socket");
        tcp_send_active_reset(child, GFP_ATOMIC);
        inet_csk_prepare_forced_close(child);
        tcp_done(child);
        return NULL;
}

static struct inet_connection_sock_af_ops subflow_specific;

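/* Status of the DSS-mapping lookup for the skb at the head of the subflow's
 * receive queue: MAPPING_OK means the skb is covered by a known valid
 * mapping, MAPPING_INVALID is a protocol error that will reset the subflow,
 * MAPPING_EMPTY means nothing is queued, and MAPPING_DATA_FIN reports a
 * standalone DATA_FIN.
 */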
enum mapping_status {
        MAPPING_OK,
        MAPPING_INVALID,
        MAPPING_EMPTY,
        MAPPING_DATA_FIN
};

static u64 expand_seq(u64 old_seq, u16 old_data_len, u64 seq)
{
        if ((u32)seq == (u32)old_seq)
                return old_seq;

        /* Assume map covers data not mapped yet. */
        return seq | ((old_seq + old_data_len + 1) & GENMASK_ULL(63, 32));
}
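
/* Example of the expansion above: with a 64-bit old_seq of 0xfffffff0 and
 * old_data_len of 0x100, old_seq + old_data_len + 1 is 0x1000000f1, so a
 * wrapped 32-bit seq of 0xe0 expands to 0x1000000e0: the lower half comes
 * from the DSS option, the upper half from the first DSN past the old map.
 */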

static void warn_bad_map(struct mptcp_subflow_context *subflow, u32 ssn)
{
        WARN_ONCE(1, "Bad mapping: ssn=%u map_seq=%u map_data_len=%u",
                  ssn, subflow->map_subflow_seq, subflow->map_data_len);
}

static bool skb_is_fully_mapped(struct sock *ssk, struct sk_buff *skb)
{
        struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
        unsigned int skb_consumed;

        skb_consumed = tcp_sk(ssk)->copied_seq - TCP_SKB_CB(skb)->seq;
        if (WARN_ON_ONCE(skb_consumed >= skb->len))
                return true;

        return skb->len - skb_consumed <= subflow->map_data_len -
                                          mptcp_subflow_get_map_offset(subflow);
}

static bool validate_mapping(struct sock *ssk, struct sk_buff *skb)
{
        struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
        u32 ssn = tcp_sk(ssk)->copied_seq - subflow->ssn_offset;

        if (unlikely(before(ssn, subflow->map_subflow_seq))) {
                /* Mapping covers data later in the subflow stream,
                 * currently unsupported.
                 */
                warn_bad_map(subflow, ssn);
                return false;
        }
        if (unlikely(!before(ssn, subflow->map_subflow_seq +
                                  subflow->map_data_len))) {
                /* Mapping covers only past subflow data, invalid */
                warn_bad_map(subflow, ssn + skb->len);
                return false;
        }
        return true;
}

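/* Extract the DSS mapping, if any, covering the skb at the head of the
 * receive queue. MPTCP runs a 64-bit data sequence space on top of the
 * subflow sequence space; each DSS option maps a range of subflow bytes
 * onto data-sequence numbers, and every incoming byte must be covered by
 * a valid mapping.
 */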
static enum mapping_status get_mapping_status(struct sock *ssk)
{
        struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
        struct mptcp_ext *mpext;
        struct sk_buff *skb;
        u16 data_len;
        u64 map_seq;

        skb = skb_peek(&ssk->sk_receive_queue);
        if (!skb)
                return MAPPING_EMPTY;

        mpext = mptcp_get_ext(skb);
        if (!mpext || !mpext->use_map) {
                if (!subflow->map_valid && !skb->len) {
                        /* the TCP stack delivers zero-length FIN packets to
                         * the receive queue; those are the only zero-length
                         * packets ever expected here, and only they may
                         * legitimately lack a mapping
                         */
                        if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
                                WARN_ONCE(1, "0len seq %u:%u flags %x",
                                          TCP_SKB_CB(skb)->seq,
                                          TCP_SKB_CB(skb)->end_seq,
                                          TCP_SKB_CB(skb)->tcp_flags);
                        sk_eat_skb(ssk, skb);
                        return MAPPING_EMPTY;
                }

                if (!subflow->map_valid)
                        return MAPPING_INVALID;

                goto validate_seq;
        }

        pr_debug("seq=%llu is64=%d ssn=%u data_len=%u data_fin=%d",
                 mpext->data_seq, mpext->dsn64, mpext->subflow_seq,
                 mpext->data_len, mpext->data_fin);

        data_len = mpext->data_len;
        if (data_len == 0) {
                pr_err("Infinite mapping not handled");
                return MAPPING_INVALID;
        }

        if (mpext->data_fin == 1) {
                if (data_len == 1) {
                        pr_debug("DATA_FIN with no payload");
                        if (subflow->map_valid) {
                                /* A DATA_FIN might arrive in a DSS
                                 * option before the previous mapping
                                 * has been fully consumed. Continue
                                 * handling the existing mapping.
                                 */
                                skb_ext_del(skb, SKB_EXT_MPTCP);
                                return MAPPING_OK;
                        } else {
                                return MAPPING_DATA_FIN;
                        }
                }

                /* Adjust for DATA_FIN using 1 byte of sequence space */
                data_len--;
        }

        if (!mpext->dsn64) {
                map_seq = expand_seq(subflow->map_seq, subflow->map_data_len,
                                     mpext->data_seq);
                pr_debug("expanded seq=%llu", map_seq);
        } else {
                map_seq = mpext->data_seq;
        }

        if (subflow->map_valid) {
                /* Allow replacing only with an identical map */
                if (subflow->map_seq == map_seq &&
                    subflow->map_subflow_seq == mpext->subflow_seq &&
                    subflow->map_data_len == data_len) {
                        skb_ext_del(skb, SKB_EXT_MPTCP);
                        return MAPPING_OK;
                }

                /* If this skb's data is fully covered by the current mapping,
                 * the new map would need caching, which is not supported
                 */
                if (skb_is_fully_mapped(ssk, skb))
                        return MAPPING_INVALID;

                /* will validate the next map after consuming the current one */
                return MAPPING_OK;
        }

        subflow->map_seq = map_seq;
        subflow->map_subflow_seq = mpext->subflow_seq;
        subflow->map_data_len = data_len;
        subflow->map_valid = 1;
        subflow->mpc_map = mpext->mpc_map;
        pr_debug("new map seq=%llu subflow_seq=%u data_len=%u",
                 subflow->map_seq, subflow->map_subflow_seq,
                 subflow->map_data_len);

validate_seq:
        /* we revalidate the existing mapping on each new skb, because we must
         * ensure the current skb is completely covered by the available mapping
         */
        if (!validate_mapping(ssk, skb))
                return MAPPING_INVALID;

        skb_ext_del(skb, SKB_EXT_MPTCP);
        return MAPPING_OK;
}

static bool subflow_check_data_avail(struct sock *ssk)
{
        struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
        enum mapping_status status;
        struct mptcp_sock *msk;
        struct sk_buff *skb;

        pr_debug("msk=%p ssk=%p data_avail=%d skb=%p", subflow->conn, ssk,
                 subflow->data_avail, skb_peek(&ssk->sk_receive_queue));
        if (subflow->data_avail)
                return true;

        if (!subflow->conn)
                return false;

        msk = mptcp_sk(subflow->conn);
        for (;;) {
                u32 map_remaining;
                size_t delta;
                u64 ack_seq;
                u64 old_ack;

                status = get_mapping_status(ssk);
                pr_debug("msk=%p ssk=%p status=%d", msk, ssk, status);
                if (status == MAPPING_INVALID) {
                        ssk->sk_err = EBADMSG;
                        goto fatal;
                }

                if (status != MAPPING_OK)
                        return false;

                skb = skb_peek(&ssk->sk_receive_queue);
                if (WARN_ON_ONCE(!skb))
                        return false;

                /* if msk lacks the remote key, this subflow must provide an
                 * MP_CAPABLE-based mapping
                 */
                if (unlikely(!READ_ONCE(msk->can_ack))) {
                        if (!subflow->mpc_map) {
                                ssk->sk_err = EBADMSG;
                                goto fatal;
                        }
                        WRITE_ONCE(msk->remote_key, subflow->remote_key);
                        WRITE_ONCE(msk->ack_seq, subflow->map_seq);
                        WRITE_ONCE(msk->can_ack, true);
                }

                old_ack = READ_ONCE(msk->ack_seq);
                ack_seq = mptcp_subflow_get_mapped_dsn(subflow);
                pr_debug("msk ack_seq=%llx subflow ack_seq=%llx", old_ack,
                         ack_seq);
                if (ack_seq == old_ack)
                        break;

                /* only accept in-sequence mappings. Old values are spurious
                 * retransmissions; we can hit "future" values on an
                 * active/backup subflow switch, and we rely on
                 * retransmissions to get in-sequence data.
                 * Concurrent subflow support will require subflow data
                 * reordering
                 */
                map_remaining = subflow->map_data_len -
                                mptcp_subflow_get_map_offset(subflow);
                if (before64(ack_seq, old_ack))
                        delta = min_t(size_t, old_ack - ack_seq, map_remaining);
                else
                        delta = min_t(size_t, ack_seq - old_ack, map_remaining);

                /* discard mapped data */
                pr_debug("discarding %zu bytes, current map len=%u", delta,
                         map_remaining);
                if (delta) {
                        struct mptcp_read_arg arg = {
                                .msg = NULL,
                        };
                        read_descriptor_t desc = {
                                .count = delta,
                                .arg.data = &arg,
                        };
                        int ret;

                        ret = tcp_read_sock(ssk, &desc, mptcp_read_actor);
                        if (ret < 0) {
                                ssk->sk_err = -ret;
                                goto fatal;
                        }
                        if (ret < delta)
                                return false;
                        if (delta == map_remaining)
                                subflow->map_valid = 0;
                }
        }
        return true;

fatal:
        /* fatal protocol error, close the socket */
        /* This barrier is coupled with smp_rmb() in tcp_poll() */
        smp_wmb();
        ssk->sk_error_report(ssk);
        tcp_set_state(ssk, TCP_CLOSE);
        tcp_send_active_reset(ssk, GFP_ATOMIC);
        return false;
}

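/* Entry point used by the msk receive path and by subflow_data_ready() and
 * subflow_state_change() below: drop the current mapping once it has been
 * fully consumed, then check whether new in-sequence data is available on
 * this subflow.
 */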
bool mptcp_subflow_data_available(struct sock *sk)
{
        struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
        struct sk_buff *skb;

        /* check if current mapping is still valid */
        if (subflow->map_valid &&
            mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len) {
                subflow->map_valid = 0;
                subflow->data_avail = 0;

                pr_debug("Done with mapping: seq=%u data_len=%u",
                         subflow->map_subflow_seq,
                         subflow->map_data_len);
        }

        if (!subflow_check_data_avail(sk)) {
                subflow->data_avail = 0;
                return false;
        }

        skb = skb_peek(&sk->sk_receive_queue);
        subflow->data_avail = skb &&
                       before(tcp_sk(sk)->copied_seq, TCP_SKB_CB(skb)->end_seq);
        return subflow->data_avail;
}

static void subflow_data_ready(struct sock *sk)
{
        struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
        struct sock *parent = subflow->conn;

        if (!parent || !subflow->mp_capable) {
                subflow->tcp_data_ready(sk);

                if (parent)
                        parent->sk_data_ready(parent);
                return;
        }

        if (mptcp_subflow_data_available(sk)) {
                set_bit(MPTCP_DATA_READY, &mptcp_sk(parent)->flags);

                parent->sk_data_ready(parent);
        }
}

static void subflow_write_space(struct sock *sk)
{
        struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
        struct sock *parent = subflow->conn;

        sk_stream_write_space(sk);
        if (parent && sk_stream_is_writeable(sk)) {
                set_bit(MPTCP_SEND_SPACE, &mptcp_sk(parent)->flags);
                smp_mb__after_atomic();
                /* set SEND_SPACE before sk_stream_write_space clears NOSPACE */
                sk_stream_write_space(parent);
        }
}

static struct inet_connection_sock_af_ops *
subflow_default_af_ops(struct sock *sk)
{
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
        if (sk->sk_family == AF_INET6)
                return &subflow_v6_specific;
#endif
        return &subflow_specific;
}

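/* For a v4-mapped v6 subflow the peer speaks IPv4 on the wire: when "mapped"
 * is set, swap in subflow_v6m_specific, which reuses the IPv4 transmit paths
 * (see mptcp_subflow_init() below), and remember the old ops so the switch
 * can be undone.
 */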
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
void mptcpv6_handle_mapped(struct sock *sk, bool mapped)
{
        struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
        struct inet_connection_sock *icsk = inet_csk(sk);
        struct inet_connection_sock_af_ops *target;

        target = mapped ? &subflow_v6m_specific : subflow_default_af_ops(sk);

        pr_debug("subflow=%p family=%d ops=%p target=%p mapped=%d",
                 subflow, sk->sk_family, icsk->icsk_af_ops, target, mapped);

        if (likely(icsk->icsk_af_ops == target))
                return;

        subflow->icsk_af_ops = icsk->icsk_af_ops;
        icsk->icsk_af_ops = target;
}
#endif

int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock)
{
        struct mptcp_subflow_context *subflow;
        struct net *net = sock_net(sk);
        struct socket *sf;
        int err;

        err = sock_create_kern(net, sk->sk_family, SOCK_STREAM, IPPROTO_TCP,
                               &sf);
        if (err)
                return err;

        lock_sock(sf->sk);

        /* kernel sockets do not acquire a net ref by default, but the TCP
         * timer needs one.
         */
        sf->sk->sk_net_refcnt = 1;
        get_net(net);
#ifdef CONFIG_PROC_FS
        this_cpu_add(*net->core.sock_inuse, 1);
#endif
        err = tcp_set_ulp(sf->sk, "mptcp");
        release_sock(sf->sk);

        if (err)
                return err;

        subflow = mptcp_subflow_ctx(sf->sk);
        pr_debug("subflow=%p", subflow);

        *new_sock = sf;
        sock_hold(sk);
        subflow->conn = sk;

        return 0;
}
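
/* Usage sketch (assumed caller, outside this file): the MPTCP socket layer
 * creates its first subflow roughly like
 *
 *      struct socket *ssock;
 *      int err = mptcp_subflow_create_socket(sk, &ssock);
 *
 *      if (!err)
 *              mptcp_subflow_ctx(ssock->sk)->request_mptcp = 1;
 *
 * so every MPTCP-level socket drives one or more kernel TCP sockets with
 * the "mptcp" ULP attached.
 */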

static struct mptcp_subflow_context *subflow_create_ctx(struct sock *sk,
                                                        gfp_t priority)
{
        struct inet_connection_sock *icsk = inet_csk(sk);
        struct mptcp_subflow_context *ctx;

        ctx = kzalloc(sizeof(*ctx), priority);
        if (!ctx)
                return NULL;

        rcu_assign_pointer(icsk->icsk_ulp_data, ctx);
        INIT_LIST_HEAD(&ctx->node);

        pr_debug("subflow=%p", ctx);

        ctx->tcp_sock = sk;

        return ctx;
}

static void __subflow_state_change(struct sock *sk)
{
        struct socket_wq *wq;

        rcu_read_lock();
        wq = rcu_dereference(sk->sk_wq);
        if (skwq_has_sleeper(wq))
                wake_up_interruptible_all(&wq->wait);
        rcu_read_unlock();
}

static bool subflow_is_done(const struct sock *sk)
{
        return sk->sk_shutdown & RCV_SHUTDOWN || sk->sk_state == TCP_CLOSE;
}

static void subflow_state_change(struct sock *sk)
{
        struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
        struct sock *parent = READ_ONCE(subflow->conn);

        __subflow_state_change(sk);

        /* as recvmsg() does not acquire the subflow socket for ssk selection,
         * a FIN packet carrying a DSS can go unnoticed if we don't trigger
         * the data-available machinery here.
         */
        if (parent && subflow->mp_capable && mptcp_subflow_data_available(sk)) {
                set_bit(MPTCP_DATA_READY, &mptcp_sk(parent)->flags);

                parent->sk_data_ready(parent);
        }

        if (parent && !(parent->sk_shutdown & RCV_SHUTDOWN) &&
            !subflow->rx_eof && subflow_is_done(sk)) {
                subflow->rx_eof = 1;
                parent->sk_shutdown |= RCV_SHUTDOWN;
                __subflow_state_change(parent);
        }
}

static int subflow_ulp_init(struct sock *sk)
{
        struct inet_connection_sock *icsk = inet_csk(sk);
        struct mptcp_subflow_context *ctx;
        struct tcp_sock *tp = tcp_sk(sk);
        int err = 0;

        /* disallow attaching ULP to a socket unless it has been
         * created with sock_create_kern()
         */
        if (!sk->sk_kern_sock) {
                err = -EOPNOTSUPP;
                goto out;
        }

        ctx = subflow_create_ctx(sk, GFP_KERNEL);
        if (!ctx) {
                err = -ENOMEM;
                goto out;
        }

        pr_debug("subflow=%p, family=%d", ctx, sk->sk_family);

        tp->is_mptcp = 1;
        ctx->icsk_af_ops = icsk->icsk_af_ops;
        icsk->icsk_af_ops = subflow_default_af_ops(sk);
        ctx->tcp_data_ready = sk->sk_data_ready;
        ctx->tcp_state_change = sk->sk_state_change;
        ctx->tcp_write_space = sk->sk_write_space;
        sk->sk_data_ready = subflow_data_ready;
        sk->sk_write_space = subflow_write_space;
        sk->sk_state_change = subflow_state_change;
out:
        return err;
}

static void subflow_ulp_release(struct sock *sk)
{
        struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(sk);

        if (!ctx)
                return;

        if (ctx->conn)
                sock_put(ctx->conn);

        kfree_rcu(ctx, rcu);
}

static void subflow_ulp_fallback(struct sock *sk,
                                 struct mptcp_subflow_context *old_ctx)
{
        struct inet_connection_sock *icsk = inet_csk(sk);

        mptcp_subflow_tcp_fallback(sk, old_ctx);
        icsk->icsk_ulp_ops = NULL;
        rcu_assign_pointer(icsk->icsk_ulp_data, NULL);
        tcp_sk(sk)->is_mptcp = 0;
}

static void subflow_ulp_clone(const struct request_sock *req,
                              struct sock *newsk,
                              const gfp_t priority)
{
        struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
        struct mptcp_subflow_context *old_ctx = mptcp_subflow_ctx(newsk);
        struct mptcp_subflow_context *new_ctx;

        if (!tcp_rsk(req)->is_mptcp || !subflow_req->mp_capable) {
                subflow_ulp_fallback(newsk, old_ctx);
                return;
        }

        new_ctx = subflow_create_ctx(newsk, priority);
        if (!new_ctx) {
                subflow_ulp_fallback(newsk, old_ctx);
                return;
        }

        /* see comments in subflow_syn_recv_sock(); the MPTCP connection is
         * fully established only after we receive the remote key
         */
        new_ctx->conn_finished = 1;
        new_ctx->icsk_af_ops = old_ctx->icsk_af_ops;
        new_ctx->tcp_data_ready = old_ctx->tcp_data_ready;
        new_ctx->tcp_state_change = old_ctx->tcp_state_change;
        new_ctx->tcp_write_space = old_ctx->tcp_write_space;
        new_ctx->mp_capable = 1;
        new_ctx->fourth_ack = subflow_req->remote_key_valid;
        new_ctx->can_ack = subflow_req->remote_key_valid;
        new_ctx->remote_key = subflow_req->remote_key;
        new_ctx->local_key = subflow_req->local_key;
        new_ctx->token = subflow_req->token;
        new_ctx->ssn_offset = subflow_req->ssn_offset;
        new_ctx->idsn = subflow_req->idsn;
}

static struct tcp_ulp_ops subflow_ulp_ops __read_mostly = {
        .name           = "mptcp",
        .owner          = THIS_MODULE,
        .init           = subflow_ulp_init,
        .release        = subflow_ulp_release,
        .clone          = subflow_ulp_clone,
};

static int subflow_ops_init(struct request_sock_ops *subflow_ops)
{
        subflow_ops->obj_size = sizeof(struct mptcp_subflow_request_sock);
        subflow_ops->slab_name = "request_sock_subflow";

        subflow_ops->slab = kmem_cache_create(subflow_ops->slab_name,
                                              subflow_ops->obj_size, 0,
                                              SLAB_ACCOUNT |
                                              SLAB_TYPESAFE_BY_RCU,
                                              NULL);
        if (!subflow_ops->slab)
                return -ENOMEM;

        subflow_ops->destructor = subflow_req_destructor;

        return 0;
}

void mptcp_subflow_init(void)
{
        subflow_request_sock_ops = tcp_request_sock_ops;
        if (subflow_ops_init(&subflow_request_sock_ops) != 0)
                panic("MPTCP: failed to init subflow request sock ops\n");

        subflow_request_sock_ipv4_ops = tcp_request_sock_ipv4_ops;
        subflow_request_sock_ipv4_ops.init_req = subflow_v4_init_req;

        subflow_specific = ipv4_specific;
        subflow_specific.conn_request = subflow_v4_conn_request;
        subflow_specific.syn_recv_sock = subflow_syn_recv_sock;
        subflow_specific.sk_rx_dst_set = subflow_finish_connect;
        subflow_specific.rebuild_header = subflow_rebuild_header;

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
        subflow_request_sock_ipv6_ops = tcp_request_sock_ipv6_ops;
        subflow_request_sock_ipv6_ops.init_req = subflow_v6_init_req;

        subflow_v6_specific = ipv6_specific;
        subflow_v6_specific.conn_request = subflow_v6_conn_request;
        subflow_v6_specific.syn_recv_sock = subflow_syn_recv_sock;
        subflow_v6_specific.sk_rx_dst_set = subflow_finish_connect;
        subflow_v6_specific.rebuild_header = subflow_rebuild_header;

        subflow_v6m_specific = subflow_v6_specific;
        subflow_v6m_specific.queue_xmit = ipv4_specific.queue_xmit;
        subflow_v6m_specific.send_check = ipv4_specific.send_check;
        subflow_v6m_specific.net_header_len = ipv4_specific.net_header_len;
        subflow_v6m_specific.mtu_reduced = ipv4_specific.mtu_reduced;
        subflow_v6m_specific.net_frag_header_len = 0;
#endif

        if (tcp_register_ulp(&subflow_ulp_ops) != 0)
                panic("MPTCP: failed to register subflows to ULP\n");
}

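/* Usage sketch, assuming the IPPROTO_MPTCP socket type exposed by the rest
 * of this series: once mptcp_subflow_init() has run, an application opens
 * an MPTCP connection with plain socket calls, e.g.
 *
 *      int fd = socket(AF_INET, SOCK_STREAM, IPPROTO_MPTCP);
 *
 *      connect(fd, ...);
 *
 * and each TCP subflow created underneath it flows through the hooks in
 * this file.
 */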