linux/net/smc/af_smc.c
<<
>>
Prefs
   1/*
   2 *  Shared Memory Communications over RDMA (SMC-R) and RoCE
   3 *
   4 *  AF_SMC protocol family socket handler keeping the AF_INET sock address type
   5 *  applies to SOCK_STREAM sockets only
   6 *  offers an alternative communication option for TCP-protocol sockets
   7 *  applicable with RoCE-cards only
   8 *
   9 *  Initial restrictions:
  10 *    - non-blocking connect postponed
  11 *    - IPv6 support postponed
  12 *    - support for alternate links postponed
  13 *    - partial support for non-blocking sockets only
  14 *    - support for urgent data postponed
  15 *
  16 *  Copyright IBM Corp. 2016
  17 *
  18 *  Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
  19 *              based on prototype from Frank Blaschka
  20 */
  21
  22#define KMSG_COMPONENT "smc"
  23#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
  24
  25#include <linux/module.h>
  26#include <linux/socket.h>
  27#include <linux/inetdevice.h>
  28#include <linux/workqueue.h>
  29#include <linux/in.h>
  30#include <linux/sched/signal.h>
  31
  32#include <net/sock.h>
  33#include <net/tcp.h>
  34#include <net/smc.h>
  35
  36#include "smc.h"
  37#include "smc_clc.h"
  38#include "smc_llc.h"
  39#include "smc_cdc.h"
  40#include "smc_core.h"
  41#include "smc_ib.h"
  42#include "smc_pnet.h"
  43#include "smc_tx.h"
  44#include "smc_rx.h"
  45#include "smc_close.h"
  46
  47static DEFINE_MUTEX(smc_create_lgr_pending);    /* serialize link group
  48                                                 * creation
  49                                                 */
  50
  51struct smc_lgr_list smc_lgr_list = {            /* established link groups */
  52        .lock = __SPIN_LOCK_UNLOCKED(smc_lgr_list.lock),
  53        .list = LIST_HEAD_INIT(smc_lgr_list.list),
  54};
  55
  56static void smc_tcp_listen_work(struct work_struct *);
  57
  58static void smc_set_keepalive(struct sock *sk, int val)
  59{
  60        struct smc_sock *smc = smc_sk(sk);
  61
  62        smc->clcsock->sk->sk_prot->keepalive(smc->clcsock->sk, val);
  63}
  64
  65static struct smc_hashinfo smc_v4_hashinfo = {
  66        .lock = __RW_LOCK_UNLOCKED(smc_v4_hashinfo.lock),
  67};
  68
  69int smc_hash_sk(struct sock *sk)
  70{
  71        struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;
  72        struct hlist_head *head;
  73
  74        head = &h->ht;
  75
  76        write_lock_bh(&h->lock);
  77        sk_add_node(sk, head);
  78        sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
  79        write_unlock_bh(&h->lock);
  80
  81        return 0;
  82}
  83EXPORT_SYMBOL_GPL(smc_hash_sk);
  84
  85void smc_unhash_sk(struct sock *sk)
  86{
  87        struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;
  88
  89        write_lock_bh(&h->lock);
  90        if (sk_del_node_init(sk))
  91                sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
  92        write_unlock_bh(&h->lock);
  93}
  94EXPORT_SYMBOL_GPL(smc_unhash_sk);
  95
  96struct proto smc_proto = {
  97        .name           = "SMC",
  98        .owner          = THIS_MODULE,
  99        .keepalive      = smc_set_keepalive,
 100        .hash           = smc_hash_sk,
 101        .unhash         = smc_unhash_sk,
 102        .obj_size       = sizeof(struct smc_sock),
 103        .h.smc_hash     = &smc_v4_hashinfo,
 104        .slab_flags     = SLAB_TYPESAFE_BY_RCU,
 105};
 106EXPORT_SYMBOL_GPL(smc_proto);
 107
 108static int smc_release(struct socket *sock)
 109{
 110        struct sock *sk = sock->sk;
 111        struct smc_sock *smc;
 112        int rc = 0;
 113
 114        if (!sk)
 115                goto out;
 116
 117        smc = smc_sk(sk);
 118        sock_hold(sk);
 119        if (sk->sk_state == SMC_LISTEN)
 120                /* smc_close_non_accepted() is called and acquires
 121                 * sock lock for child sockets again
 122                 */
 123                lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
 124        else
 125                lock_sock(sk);
 126
 127        if (smc->use_fallback) {
 128                sk->sk_state = SMC_CLOSED;
 129                sk->sk_state_change(sk);
 130        } else {
 131                rc = smc_close_active(smc);
 132                sock_set_flag(sk, SOCK_DEAD);
 133                sk->sk_shutdown |= SHUTDOWN_MASK;
 134        }
 135        if (smc->clcsock) {
 136                sock_release(smc->clcsock);
 137                smc->clcsock = NULL;
 138        }
 139
 140        /* detach socket */
 141        sock_orphan(sk);
 142        sock->sk = NULL;
 143        if (smc->use_fallback) {
 144                schedule_delayed_work(&smc->sock_put_work, TCP_TIMEWAIT_LEN);
 145        } else if (sk->sk_state == SMC_CLOSED) {
 146                smc_conn_free(&smc->conn);
 147                schedule_delayed_work(&smc->sock_put_work,
 148                                      SMC_CLOSE_SOCK_PUT_DELAY);
 149        }
 150        release_sock(sk);
 151
 152        sock_put(sk);
 153out:
 154        return rc;
 155}
 156
 157static void smc_destruct(struct sock *sk)
 158{
 159        if (sk->sk_state != SMC_CLOSED)
 160                return;
 161        if (!sock_flag(sk, SOCK_DEAD))
 162                return;
 163
 164        sk_refcnt_debug_dec(sk);
 165}
 166
 167static struct sock *smc_sock_alloc(struct net *net, struct socket *sock)
 168{
 169        struct smc_sock *smc;
 170        struct sock *sk;
 171
 172        sk = sk_alloc(net, PF_SMC, GFP_KERNEL, &smc_proto, 0);
 173        if (!sk)
 174                return NULL;
 175
 176        sock_init_data(sock, sk); /* sets sk_refcnt to 1 */
 177        sk->sk_state = SMC_INIT;
 178        sk->sk_destruct = smc_destruct;
 179        sk->sk_protocol = SMCPROTO_SMC;
 180        smc = smc_sk(sk);
 181        INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
 182        INIT_LIST_HEAD(&smc->accept_q);
 183        spin_lock_init(&smc->accept_q_lock);
 184        INIT_DELAYED_WORK(&smc->sock_put_work, smc_close_sock_put_work);
 185        sk->sk_prot->hash(sk);
 186        sk_refcnt_debug_inc(sk);
 187
 188        return sk;
 189}
 190
 191static int smc_bind(struct socket *sock, struct sockaddr *uaddr,
 192                    int addr_len)
 193{
 194        struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
 195        struct sock *sk = sock->sk;
 196        struct smc_sock *smc;
 197        int rc;
 198
 199        smc = smc_sk(sk);
 200
 201        /* replicate tests from inet_bind(), to be safe wrt. future changes */
 202        rc = -EINVAL;
 203        if (addr_len < sizeof(struct sockaddr_in))
 204                goto out;
 205
 206        rc = -EAFNOSUPPORT;
 207        /* accept AF_UNSPEC (mapped to AF_INET) only if s_addr is INADDR_ANY */
 208        if ((addr->sin_family != AF_INET) &&
 209            ((addr->sin_family != AF_UNSPEC) ||
 210             (addr->sin_addr.s_addr != htonl(INADDR_ANY))))
 211                goto out;
 212
 213        lock_sock(sk);
 214
 215        /* Check if socket is already active */
 216        rc = -EINVAL;
 217        if (sk->sk_state != SMC_INIT)
 218                goto out_rel;
 219
 220        smc->clcsock->sk->sk_reuse = sk->sk_reuse;
 221        rc = kernel_bind(smc->clcsock, uaddr, addr_len);
 222
 223out_rel:
 224        release_sock(sk);
 225out:
 226        return rc;
 227}
 228
 229static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk,
 230                                   unsigned long mask)
 231{
 232        /* options we don't get control via setsockopt for */
 233        nsk->sk_type = osk->sk_type;
 234        nsk->sk_sndbuf = osk->sk_sndbuf;
 235        nsk->sk_rcvbuf = osk->sk_rcvbuf;
 236        nsk->sk_sndtimeo = osk->sk_sndtimeo;
 237        nsk->sk_rcvtimeo = osk->sk_rcvtimeo;
 238        nsk->sk_mark = osk->sk_mark;
 239        nsk->sk_priority = osk->sk_priority;
 240        nsk->sk_rcvlowat = osk->sk_rcvlowat;
 241        nsk->sk_bound_dev_if = osk->sk_bound_dev_if;
 242        nsk->sk_err = osk->sk_err;
 243
 244        nsk->sk_flags &= ~mask;
 245        nsk->sk_flags |= osk->sk_flags & mask;
 246}
 247
 248#define SK_FLAGS_SMC_TO_CLC ((1UL << SOCK_URGINLINE) | \
 249                             (1UL << SOCK_KEEPOPEN) | \
 250                             (1UL << SOCK_LINGER) | \
 251                             (1UL << SOCK_BROADCAST) | \
 252                             (1UL << SOCK_TIMESTAMP) | \
 253                             (1UL << SOCK_DBG) | \
 254                             (1UL << SOCK_RCVTSTAMP) | \
 255                             (1UL << SOCK_RCVTSTAMPNS) | \
 256                             (1UL << SOCK_LOCALROUTE) | \
 257                             (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE) | \
 258                             (1UL << SOCK_RXQ_OVFL) | \
 259                             (1UL << SOCK_WIFI_STATUS) | \
 260                             (1UL << SOCK_NOFCS) | \
 261                             (1UL << SOCK_FILTER_LOCKED))
 262/* copy only relevant settings and flags of SOL_SOCKET level from smc to
 263 * clc socket (since smc is not called for these options from net/core)
 264 */
 265static void smc_copy_sock_settings_to_clc(struct smc_sock *smc)
 266{
 267        smc_copy_sock_settings(smc->clcsock->sk, &smc->sk, SK_FLAGS_SMC_TO_CLC);
 268}
 269
 270#define SK_FLAGS_CLC_TO_SMC ((1UL << SOCK_URGINLINE) | \
 271                             (1UL << SOCK_KEEPOPEN) | \
 272                             (1UL << SOCK_LINGER) | \
 273                             (1UL << SOCK_DBG))
 274/* copy only settings and flags relevant for smc from clc to smc socket */
 275static void smc_copy_sock_settings_to_smc(struct smc_sock *smc)
 276{
 277        smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC);
 278}
 279
 280/* determine subnet and mask of internal TCP socket */
 281int smc_netinfo_by_tcpsk(struct socket *clcsock,
 282                         __be32 *subnet, u8 *prefix_len)
 283{
 284        struct dst_entry *dst = sk_dst_get(clcsock->sk);
 285        struct sockaddr_in addr;
 286        int rc = -ENOENT;
 287        int len;
 288
 289        if (!dst) {
 290                rc = -ENOTCONN;
 291                goto out;
 292        }
 293        if (!dst->dev) {
 294                rc = -ENODEV;
 295                goto out_rel;
 296        }
 297
 298        /* get address to which the internal TCP socket is bound */
 299        kernel_getsockname(clcsock, (struct sockaddr *)&addr, &len);
 300        /* analyze IPv4 specific data of net_device belonging to TCP socket */
 301        for_ifa(dst->dev->ip_ptr) {
 302                if (ifa->ifa_address != addr.sin_addr.s_addr)
 303                        continue;
 304                *prefix_len = inet_mask_len(ifa->ifa_mask);
 305                *subnet = ifa->ifa_address & ifa->ifa_mask;
 306                rc = 0;
 307                break;
 308        } endfor_ifa(dst->dev->ip_ptr);
 309
 310out_rel:
 311        dst_release(dst);
 312out:
 313        return rc;
 314}
 315
 316static int smc_clnt_conf_first_link(struct smc_sock *smc, union ib_gid *gid)
 317{
 318        struct smc_link_group *lgr = smc->conn.lgr;
 319        struct smc_link *link;
 320        int rest;
 321        int rc;
 322
 323        link = &lgr->lnk[SMC_SINGLE_LINK];
 324        /* receive CONFIRM LINK request from server over RoCE fabric */
 325        rest = wait_for_completion_interruptible_timeout(
 326                &link->llc_confirm,
 327                SMC_LLC_WAIT_FIRST_TIME);
 328        if (rest <= 0) {
 329                struct smc_clc_msg_decline dclc;
 330
 331                rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
 332                                      SMC_CLC_DECLINE);
 333                return rc;
 334        }
 335
 336        rc = smc_ib_modify_qp_rts(link);
 337        if (rc)
 338                return SMC_CLC_DECL_INTERR;
 339
 340        smc_wr_remember_qp_attr(link);
 341        /* send CONFIRM LINK response over RoCE fabric */
 342        rc = smc_llc_send_confirm_link(link,
 343                                       link->smcibdev->mac[link->ibport - 1],
 344                                       gid, SMC_LLC_RESP);
 345        if (rc < 0)
 346                return SMC_CLC_DECL_TCL;
 347
 348        return rc;
 349}
 350
 351static void smc_conn_save_peer_info(struct smc_sock *smc,
 352                                    struct smc_clc_msg_accept_confirm *clc)
 353{
 354        smc->conn.peer_conn_idx = clc->conn_idx;
 355        smc->conn.local_tx_ctrl.token = ntohl(clc->rmbe_alert_token);
 356        smc->conn.peer_rmbe_size = smc_uncompress_bufsize(clc->rmbe_size);
 357        atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
 358}
 359
 360static void smc_link_save_peer_info(struct smc_link *link,
 361                                    struct smc_clc_msg_accept_confirm *clc)
 362{
 363        link->peer_qpn = ntoh24(clc->qpn);
 364        memcpy(link->peer_gid, clc->lcl.gid, SMC_GID_SIZE);
 365        memcpy(link->peer_mac, clc->lcl.mac, sizeof(link->peer_mac));
 366        link->peer_psn = ntoh24(clc->psn);
 367        link->peer_mtu = clc->qp_mtu;
 368}
 369
 370/* setup for RDMA connection of client */
 371static int smc_connect_rdma(struct smc_sock *smc)
 372{
 373        struct sockaddr_in *inaddr = (struct sockaddr_in *)smc->addr;
 374        struct smc_clc_msg_accept_confirm aclc;
 375        int local_contact = SMC_FIRST_CONTACT;
 376        struct smc_ib_device *smcibdev;
 377        struct smc_link *link;
 378        u8 srv_first_contact;
 379        int reason_code = 0;
 380        int rc = 0;
 381        u8 ibport;
 382
 383        /* IPSec connections opt out of SMC-R optimizations */
 384        if (using_ipsec(smc)) {
 385                reason_code = SMC_CLC_DECL_IPSEC;
 386                goto decline_rdma;
 387        }
 388
 389        /* PNET table look up: search active ib_device and port
 390         * within same PNETID that also contains the ethernet device
 391         * used for the internal TCP socket
 392         */
 393        smc_pnet_find_roce_resource(smc->clcsock->sk, &smcibdev, &ibport);
 394        if (!smcibdev) {
 395                reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
 396                goto decline_rdma;
 397        }
 398
 399        /* do inband token exchange */
 400        reason_code = smc_clc_send_proposal(smc, smcibdev, ibport);
 401        if (reason_code < 0) {
 402                rc = reason_code;
 403                goto out_err;
 404        }
 405        if (reason_code > 0) /* configuration error */
 406                goto decline_rdma;
 407        /* receive SMC Accept CLC message */
 408        reason_code = smc_clc_wait_msg(smc, &aclc, sizeof(aclc),
 409                                       SMC_CLC_ACCEPT);
 410        if (reason_code < 0) {
 411                rc = reason_code;
 412                goto out_err;
 413        }
 414        if (reason_code > 0)
 415                goto decline_rdma;
 416
 417        srv_first_contact = aclc.hdr.flag;
 418        mutex_lock(&smc_create_lgr_pending);
 419        local_contact = smc_conn_create(smc, inaddr->sin_addr.s_addr, smcibdev,
 420                                        ibport, &aclc.lcl, srv_first_contact);
 421        if (local_contact < 0) {
 422                rc = local_contact;
 423                if (rc == -ENOMEM)
 424                        reason_code = SMC_CLC_DECL_MEM;/* insufficient memory*/
 425                else if (rc == -ENOLINK)
 426                        reason_code = SMC_CLC_DECL_SYNCERR; /* synchr. error */
 427                goto decline_rdma_unlock;
 428        }
 429        link = &smc->conn.lgr->lnk[SMC_SINGLE_LINK];
 430
 431        smc_conn_save_peer_info(smc, &aclc);
 432
 433        rc = smc_sndbuf_create(smc);
 434        if (rc) {
 435                reason_code = SMC_CLC_DECL_MEM;
 436                goto decline_rdma_unlock;
 437        }
 438        rc = smc_rmb_create(smc);
 439        if (rc) {
 440                reason_code = SMC_CLC_DECL_MEM;
 441                goto decline_rdma_unlock;
 442        }
 443
 444        if (local_contact == SMC_FIRST_CONTACT)
 445                smc_link_save_peer_info(link, &aclc);
 446
 447        rc = smc_rmb_rtoken_handling(&smc->conn, &aclc);
 448        if (rc) {
 449                reason_code = SMC_CLC_DECL_INTERR;
 450                goto decline_rdma_unlock;
 451        }
 452
 453        smc_close_init(smc);
 454        smc_rx_init(smc);
 455
 456        if (local_contact == SMC_FIRST_CONTACT) {
 457                rc = smc_ib_ready_link(link);
 458                if (rc) {
 459                        reason_code = SMC_CLC_DECL_INTERR;
 460                        goto decline_rdma_unlock;
 461                }
 462        }
 463
 464        rc = smc_clc_send_confirm(smc);
 465        if (rc)
 466                goto out_err_unlock;
 467
 468        if (local_contact == SMC_FIRST_CONTACT) {
 469                /* QP confirmation over RoCE fabric */
 470                reason_code = smc_clnt_conf_first_link(
 471                        smc, &smcibdev->gid[ibport - 1]);
 472                if (reason_code < 0) {
 473                        rc = reason_code;
 474                        goto out_err_unlock;
 475                }
 476                if (reason_code > 0)
 477                        goto decline_rdma_unlock;
 478        }
 479
 480        mutex_unlock(&smc_create_lgr_pending);
 481        smc_tx_init(smc);
 482
 483out_connected:
 484        smc_copy_sock_settings_to_clc(smc);
 485        if (smc->sk.sk_state == SMC_INIT)
 486                smc->sk.sk_state = SMC_ACTIVE;
 487
 488        return rc ? rc : local_contact;
 489
 490decline_rdma_unlock:
 491        mutex_unlock(&smc_create_lgr_pending);
 492        smc_conn_free(&smc->conn);
 493decline_rdma:
 494        /* RDMA setup failed, switch back to TCP */
 495        smc->use_fallback = true;
 496        if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) {
 497                rc = smc_clc_send_decline(smc, reason_code, 0);
 498                if (rc < sizeof(struct smc_clc_msg_decline))
 499                        goto out_err;
 500        }
 501        goto out_connected;
 502
 503out_err_unlock:
 504        mutex_unlock(&smc_create_lgr_pending);
 505        smc_conn_free(&smc->conn);
 506out_err:
 507        return rc;
 508}
 509
 510static int smc_connect(struct socket *sock, struct sockaddr *addr,
 511                       int alen, int flags)
 512{
 513        struct sock *sk = sock->sk;
 514        struct smc_sock *smc;
 515        int rc = -EINVAL;
 516
 517        smc = smc_sk(sk);
 518
 519        /* separate smc parameter checking to be safe */
 520        if (alen < sizeof(addr->sa_family))
 521                goto out_err;
 522        if (addr->sa_family != AF_INET)
 523                goto out_err;
 524        smc->addr = addr;       /* needed for nonblocking connect */
 525
 526        lock_sock(sk);
 527        switch (sk->sk_state) {
 528        default:
 529                goto out;
 530        case SMC_ACTIVE:
 531                rc = -EISCONN;
 532                goto out;
 533        case SMC_INIT:
 534                rc = 0;
 535                break;
 536        }
 537
 538        smc_copy_sock_settings_to_clc(smc);
 539        rc = kernel_connect(smc->clcsock, addr, alen, flags);
 540        if (rc)
 541                goto out;
 542
 543        /* setup RDMA connection */
 544        rc = smc_connect_rdma(smc);
 545        if (rc < 0)
 546                goto out;
 547        else
 548                rc = 0; /* success cases including fallback */
 549
 550out:
 551        release_sock(sk);
 552out_err:
 553        return rc;
 554}
 555
 556static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc)
 557{
 558        struct sock *sk = &lsmc->sk;
 559        struct socket *new_clcsock;
 560        struct sock *new_sk;
 561        int rc;
 562
 563        release_sock(&lsmc->sk);
 564        new_sk = smc_sock_alloc(sock_net(sk), NULL);
 565        if (!new_sk) {
 566                rc = -ENOMEM;
 567                lsmc->sk.sk_err = ENOMEM;
 568                *new_smc = NULL;
 569                lock_sock(&lsmc->sk);
 570                goto out;
 571        }
 572        *new_smc = smc_sk(new_sk);
 573
 574        rc = kernel_accept(lsmc->clcsock, &new_clcsock, 0);
 575        lock_sock(&lsmc->sk);
 576        if  (rc < 0) {
 577                lsmc->sk.sk_err = -rc;
 578                new_sk->sk_state = SMC_CLOSED;
 579                sock_set_flag(new_sk, SOCK_DEAD);
 580                sk->sk_prot->unhash(new_sk);
 581                sock_put(new_sk);
 582                *new_smc = NULL;
 583                goto out;
 584        }
 585        if (lsmc->sk.sk_state == SMC_CLOSED) {
 586                if (new_clcsock)
 587                        sock_release(new_clcsock);
 588                new_sk->sk_state = SMC_CLOSED;
 589                sock_set_flag(new_sk, SOCK_DEAD);
 590                sk->sk_prot->unhash(new_sk);
 591                sock_put(new_sk);
 592                *new_smc = NULL;
 593                goto out;
 594        }
 595
 596        (*new_smc)->clcsock = new_clcsock;
 597out:
 598        return rc;
 599}
 600
 601/* add a just created sock to the accept queue of the listen sock as
 602 * candidate for a following socket accept call from user space
 603 */
 604static void smc_accept_enqueue(struct sock *parent, struct sock *sk)
 605{
 606        struct smc_sock *par = smc_sk(parent);
 607
 608        sock_hold(sk);
 609        spin_lock(&par->accept_q_lock);
 610        list_add_tail(&smc_sk(sk)->accept_q, &par->accept_q);
 611        spin_unlock(&par->accept_q_lock);
 612        sk_acceptq_added(parent);
 613}
 614
 615/* remove a socket from the accept queue of its parental listening socket */
 616static void smc_accept_unlink(struct sock *sk)
 617{
 618        struct smc_sock *par = smc_sk(sk)->listen_smc;
 619
 620        spin_lock(&par->accept_q_lock);
 621        list_del_init(&smc_sk(sk)->accept_q);
 622        spin_unlock(&par->accept_q_lock);
 623        sk_acceptq_removed(&smc_sk(sk)->listen_smc->sk);
 624        sock_put(sk);
 625}
 626
 627/* remove a sock from the accept queue to bind it to a new socket created
 628 * for a socket accept call from user space
 629 */
 630struct sock *smc_accept_dequeue(struct sock *parent,
 631                                struct socket *new_sock)
 632{
 633        struct smc_sock *isk, *n;
 634        struct sock *new_sk;
 635
 636        list_for_each_entry_safe(isk, n, &smc_sk(parent)->accept_q, accept_q) {
 637                new_sk = (struct sock *)isk;
 638
 639                smc_accept_unlink(new_sk);
 640                if (new_sk->sk_state == SMC_CLOSED) {
 641                        new_sk->sk_prot->unhash(new_sk);
 642                        sock_put(new_sk);
 643                        continue;
 644                }
 645                if (new_sock)
 646                        sock_graft(new_sk, new_sock);
 647                return new_sk;
 648        }
 649        return NULL;
 650}
 651
 652/* clean up for a created but never accepted sock */
 653void smc_close_non_accepted(struct sock *sk)
 654{
 655        struct smc_sock *smc = smc_sk(sk);
 656
 657        sock_hold(sk);
 658        lock_sock(sk);
 659        if (!sk->sk_lingertime)
 660                /* wait for peer closing */
 661                sk->sk_lingertime = SMC_MAX_STREAM_WAIT_TIMEOUT;
 662        if (smc->use_fallback) {
 663                sk->sk_state = SMC_CLOSED;
 664        } else {
 665                smc_close_active(smc);
 666                sock_set_flag(sk, SOCK_DEAD);
 667                sk->sk_shutdown |= SHUTDOWN_MASK;
 668        }
 669        if (smc->clcsock) {
 670                struct socket *tcp;
 671
 672                tcp = smc->clcsock;
 673                smc->clcsock = NULL;
 674                sock_release(tcp);
 675        }
 676        if (smc->use_fallback) {
 677                schedule_delayed_work(&smc->sock_put_work, TCP_TIMEWAIT_LEN);
 678        } else if (sk->sk_state == SMC_CLOSED) {
 679                smc_conn_free(&smc->conn);
 680                schedule_delayed_work(&smc->sock_put_work,
 681                                      SMC_CLOSE_SOCK_PUT_DELAY);
 682        }
 683        release_sock(sk);
 684        sock_put(sk);
 685}
 686
 687static int smc_serv_conf_first_link(struct smc_sock *smc)
 688{
 689        struct smc_link_group *lgr = smc->conn.lgr;
 690        struct smc_link *link;
 691        int rest;
 692        int rc;
 693
 694        link = &lgr->lnk[SMC_SINGLE_LINK];
 695        /* send CONFIRM LINK request to client over the RoCE fabric */
 696        rc = smc_llc_send_confirm_link(link,
 697                                       link->smcibdev->mac[link->ibport - 1],
 698                                       &link->smcibdev->gid[link->ibport - 1],
 699                                       SMC_LLC_REQ);
 700        if (rc < 0)
 701                return SMC_CLC_DECL_TCL;
 702
 703        /* receive CONFIRM LINK response from client over the RoCE fabric */
 704        rest = wait_for_completion_interruptible_timeout(
 705                &link->llc_confirm_resp,
 706                SMC_LLC_WAIT_FIRST_TIME);
 707        if (rest <= 0) {
 708                struct smc_clc_msg_decline dclc;
 709
 710                rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
 711                                      SMC_CLC_DECLINE);
 712        }
 713
 714        return rc;
 715}
 716
 717/* setup for RDMA connection of server */
 718static void smc_listen_work(struct work_struct *work)
 719{
 720        struct smc_sock *new_smc = container_of(work, struct smc_sock,
 721                                                smc_listen_work);
 722        struct socket *newclcsock = new_smc->clcsock;
 723        struct smc_sock *lsmc = new_smc->listen_smc;
 724        struct smc_clc_msg_accept_confirm cclc;
 725        int local_contact = SMC_REUSE_CONTACT;
 726        struct sock *newsmcsk = &new_smc->sk;
 727        struct smc_clc_msg_proposal pclc;
 728        struct smc_ib_device *smcibdev;
 729        struct sockaddr_in peeraddr;
 730        struct smc_link *link;
 731        int reason_code = 0;
 732        int rc = 0, len;
 733        __be32 subnet;
 734        u8 prefix_len;
 735        u8 ibport;
 736
 737        /* do inband token exchange -
 738         *wait for and receive SMC Proposal CLC message
 739         */
 740        reason_code = smc_clc_wait_msg(new_smc, &pclc, sizeof(pclc),
 741                                       SMC_CLC_PROPOSAL);
 742        if (reason_code < 0)
 743                goto out_err;
 744        if (reason_code > 0)
 745                goto decline_rdma;
 746
 747        /* IPSec connections opt out of SMC-R optimizations */
 748        if (using_ipsec(new_smc)) {
 749                reason_code = SMC_CLC_DECL_IPSEC;
 750                goto decline_rdma;
 751        }
 752
 753        /* PNET table look up: search active ib_device and port
 754         * within same PNETID that also contains the ethernet device
 755         * used for the internal TCP socket
 756         */
 757        smc_pnet_find_roce_resource(newclcsock->sk, &smcibdev, &ibport);
 758        if (!smcibdev) {
 759                reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
 760                goto decline_rdma;
 761        }
 762
 763        /* determine subnet and mask from internal TCP socket */
 764        rc = smc_netinfo_by_tcpsk(newclcsock, &subnet, &prefix_len);
 765        if (rc) {
 766                reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
 767                goto decline_rdma;
 768        }
 769        if ((pclc.outgoing_subnet != subnet) ||
 770            (pclc.prefix_len != prefix_len)) {
 771                reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
 772                goto decline_rdma;
 773        }
 774
 775        /* get address of the peer connected to the internal TCP socket */
 776        kernel_getpeername(newclcsock, (struct sockaddr *)&peeraddr, &len);
 777
 778        /* allocate connection / link group */
 779        mutex_lock(&smc_create_lgr_pending);
 780        local_contact = smc_conn_create(new_smc, peeraddr.sin_addr.s_addr,
 781                                        smcibdev, ibport, &pclc.lcl, 0);
 782        if (local_contact == SMC_REUSE_CONTACT)
 783                /* lock no longer needed, free it due to following
 784                 * smc_clc_wait_msg() call
 785                 */
 786                mutex_unlock(&smc_create_lgr_pending);
 787        if (local_contact < 0) {
 788                rc = local_contact;
 789                if (rc == -ENOMEM)
 790                        reason_code = SMC_CLC_DECL_MEM;/* insufficient memory*/
 791                else if (rc == -ENOLINK)
 792                        reason_code = SMC_CLC_DECL_SYNCERR; /* synchr. error */
 793                goto decline_rdma;
 794        }
 795        link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK];
 796
 797        rc = smc_sndbuf_create(new_smc);
 798        if (rc) {
 799                reason_code = SMC_CLC_DECL_MEM;
 800                goto decline_rdma;
 801        }
 802        rc = smc_rmb_create(new_smc);
 803        if (rc) {
 804                reason_code = SMC_CLC_DECL_MEM;
 805                goto decline_rdma;
 806        }
 807
 808        smc_close_init(new_smc);
 809        smc_rx_init(new_smc);
 810
 811        rc = smc_clc_send_accept(new_smc, local_contact);
 812        if (rc)
 813                goto out_err;
 814
 815        /* receive SMC Confirm CLC message */
 816        reason_code = smc_clc_wait_msg(new_smc, &cclc, sizeof(cclc),
 817                                       SMC_CLC_CONFIRM);
 818        if (reason_code < 0)
 819                goto out_err;
 820        if (reason_code > 0)
 821                goto decline_rdma;
 822        smc_conn_save_peer_info(new_smc, &cclc);
 823        if (local_contact == SMC_FIRST_CONTACT)
 824                smc_link_save_peer_info(link, &cclc);
 825
 826        rc = smc_rmb_rtoken_handling(&new_smc->conn, &cclc);
 827        if (rc) {
 828                reason_code = SMC_CLC_DECL_INTERR;
 829                goto decline_rdma;
 830        }
 831
 832        if (local_contact == SMC_FIRST_CONTACT) {
 833                rc = smc_ib_ready_link(link);
 834                if (rc) {
 835                        reason_code = SMC_CLC_DECL_INTERR;
 836                        goto decline_rdma;
 837                }
 838                /* QP confirmation over RoCE fabric */
 839                reason_code = smc_serv_conf_first_link(new_smc);
 840                if (reason_code < 0) {
 841                        /* peer is not aware of a problem */
 842                        rc = reason_code;
 843                        goto out_err;
 844                }
 845                if (reason_code > 0)
 846                        goto decline_rdma;
 847        }
 848
 849        smc_tx_init(new_smc);
 850
 851out_connected:
 852        sk_refcnt_debug_inc(newsmcsk);
 853        if (newsmcsk->sk_state == SMC_INIT)
 854                newsmcsk->sk_state = SMC_ACTIVE;
 855enqueue:
 856        if (local_contact == SMC_FIRST_CONTACT)
 857                mutex_unlock(&smc_create_lgr_pending);
 858        lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING);
 859        if (lsmc->sk.sk_state == SMC_LISTEN) {
 860                smc_accept_enqueue(&lsmc->sk, newsmcsk);
 861        } else { /* no longer listening */
 862                smc_close_non_accepted(newsmcsk);
 863        }
 864        release_sock(&lsmc->sk);
 865
 866        /* Wake up accept */
 867        lsmc->sk.sk_data_ready(&lsmc->sk);
 868        sock_put(&lsmc->sk); /* sock_hold in smc_tcp_listen_work */
 869        return;
 870
 871decline_rdma:
 872        /* RDMA setup failed, switch back to TCP */
 873        smc_conn_free(&new_smc->conn);
 874        new_smc->use_fallback = true;
 875        if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) {
 876                rc = smc_clc_send_decline(new_smc, reason_code, 0);
 877                if (rc < sizeof(struct smc_clc_msg_decline))
 878                        goto out_err;
 879        }
 880        goto out_connected;
 881
 882out_err:
 883        newsmcsk->sk_state = SMC_CLOSED;
 884        smc_conn_free(&new_smc->conn);
 885        goto enqueue; /* queue new sock with sk_err set */
 886}
 887
 888static void smc_tcp_listen_work(struct work_struct *work)
 889{
 890        struct smc_sock *lsmc = container_of(work, struct smc_sock,
 891                                             tcp_listen_work);
 892        struct smc_sock *new_smc;
 893        int rc = 0;
 894
 895        lock_sock(&lsmc->sk);
 896        while (lsmc->sk.sk_state == SMC_LISTEN) {
 897                rc = smc_clcsock_accept(lsmc, &new_smc);
 898                if (rc)
 899                        goto out;
 900                if (!new_smc)
 901                        continue;
 902
 903                new_smc->listen_smc = lsmc;
 904                new_smc->use_fallback = false; /* assume rdma capability first*/
 905                sock_hold(&lsmc->sk); /* sock_put in smc_listen_work */
 906                INIT_WORK(&new_smc->smc_listen_work, smc_listen_work);
 907                smc_copy_sock_settings_to_smc(new_smc);
 908                schedule_work(&new_smc->smc_listen_work);
 909        }
 910
 911out:
 912        release_sock(&lsmc->sk);
 913        lsmc->sk.sk_data_ready(&lsmc->sk); /* no more listening, wake accept */
 914}
 915
 916static int smc_listen(struct socket *sock, int backlog)
 917{
 918        struct sock *sk = sock->sk;
 919        struct smc_sock *smc;
 920        int rc;
 921
 922        smc = smc_sk(sk);
 923        lock_sock(sk);
 924
 925        rc = -EINVAL;
 926        if ((sk->sk_state != SMC_INIT) && (sk->sk_state != SMC_LISTEN))
 927                goto out;
 928
 929        rc = 0;
 930        if (sk->sk_state == SMC_LISTEN) {
 931                sk->sk_max_ack_backlog = backlog;
 932                goto out;
 933        }
 934        /* some socket options are handled in core, so we could not apply
 935         * them to the clc socket -- copy smc socket options to clc socket
 936         */
 937        smc_copy_sock_settings_to_clc(smc);
 938
 939        rc = kernel_listen(smc->clcsock, backlog);
 940        if (rc)
 941                goto out;
 942        sk->sk_max_ack_backlog = backlog;
 943        sk->sk_ack_backlog = 0;
 944        sk->sk_state = SMC_LISTEN;
 945        INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
 946        schedule_work(&smc->tcp_listen_work);
 947
 948out:
 949        release_sock(sk);
 950        return rc;
 951}
 952
 953static int smc_accept(struct socket *sock, struct socket *new_sock,
 954                      int flags, bool kern)
 955{
 956        struct sock *sk = sock->sk, *nsk;
 957        DECLARE_WAITQUEUE(wait, current);
 958        struct smc_sock *lsmc;
 959        long timeo;
 960        int rc = 0;
 961
 962        lsmc = smc_sk(sk);
 963        lock_sock(sk);
 964
 965        if (lsmc->sk.sk_state != SMC_LISTEN) {
 966                rc = -EINVAL;
 967                goto out;
 968        }
 969
 970        /* Wait for an incoming connection */
 971        timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
 972        add_wait_queue_exclusive(sk_sleep(sk), &wait);
 973        while (!(nsk = smc_accept_dequeue(sk, new_sock))) {
 974                set_current_state(TASK_INTERRUPTIBLE);
 975                if (!timeo) {
 976                        rc = -EAGAIN;
 977                        break;
 978                }
 979                release_sock(sk);
 980                timeo = schedule_timeout(timeo);
 981                /* wakeup by sk_data_ready in smc_listen_work() */
 982                sched_annotate_sleep();
 983                lock_sock(sk);
 984                if (signal_pending(current)) {
 985                        rc = sock_intr_errno(timeo);
 986                        break;
 987                }
 988        }
 989        set_current_state(TASK_RUNNING);
 990        remove_wait_queue(sk_sleep(sk), &wait);
 991
 992        if (!rc)
 993                rc = sock_error(nsk);
 994
 995out:
 996        release_sock(sk);
 997        return rc;
 998}
 999
1000static int smc_getname(struct socket *sock, struct sockaddr *addr,
1001                       int *len, int peer)
1002{
1003        struct smc_sock *smc;
1004
1005        if (peer && (sock->sk->sk_state != SMC_ACTIVE) &&
1006            (sock->sk->sk_state != SMC_APPCLOSEWAIT1))
1007                return -ENOTCONN;
1008
1009        smc = smc_sk(sock->sk);
1010
1011        return smc->clcsock->ops->getname(smc->clcsock, addr, len, peer);
1012}
1013
1014static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
1015{
1016        struct sock *sk = sock->sk;
1017        struct smc_sock *smc;
1018        int rc = -EPIPE;
1019
1020        smc = smc_sk(sk);
1021        lock_sock(sk);
1022        if ((sk->sk_state != SMC_ACTIVE) &&
1023            (sk->sk_state != SMC_APPCLOSEWAIT1) &&
1024            (sk->sk_state != SMC_INIT))
1025                goto out;
1026        if (smc->use_fallback)
1027                rc = smc->clcsock->ops->sendmsg(smc->clcsock, msg, len);
1028        else
1029                rc = smc_tx_sendmsg(smc, msg, len);
1030out:
1031        release_sock(sk);
1032        return rc;
1033}
1034
1035static int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
1036                       int flags)
1037{
1038        struct sock *sk = sock->sk;
1039        struct smc_sock *smc;
1040        int rc = -ENOTCONN;
1041
1042        smc = smc_sk(sk);
1043        lock_sock(sk);
1044        if ((sk->sk_state == SMC_INIT) ||
1045            (sk->sk_state == SMC_LISTEN) ||
1046            (sk->sk_state == SMC_CLOSED))
1047                goto out;
1048
1049        if (sk->sk_state == SMC_PEERFINCLOSEWAIT) {
1050                rc = 0;
1051                goto out;
1052        }
1053
1054        if (smc->use_fallback)
1055                rc = smc->clcsock->ops->recvmsg(smc->clcsock, msg, len, flags);
1056        else
1057                rc = smc_rx_recvmsg(smc, msg, len, flags);
1058
1059out:
1060        release_sock(sk);
1061        return rc;
1062}
1063
1064static unsigned int smc_accept_poll(struct sock *parent)
1065{
1066        struct smc_sock *isk;
1067        struct sock *sk;
1068
1069        lock_sock(parent);
1070        list_for_each_entry(isk, &smc_sk(parent)->accept_q, accept_q) {
1071                sk = (struct sock *)isk;
1072
1073                if (sk->sk_state == SMC_ACTIVE) {
1074                        release_sock(parent);
1075                        return POLLIN | POLLRDNORM;
1076                }
1077        }
1078        release_sock(parent);
1079
1080        return 0;
1081}
1082
1083static unsigned int smc_poll(struct file *file, struct socket *sock,
1084                             poll_table *wait)
1085{
1086        struct sock *sk = sock->sk;
1087        unsigned int mask = 0;
1088        struct smc_sock *smc;
1089        int rc;
1090
1091        smc = smc_sk(sock->sk);
1092        if ((sk->sk_state == SMC_INIT) || smc->use_fallback) {
1093                /* delegate to CLC child sock */
1094                mask = smc->clcsock->ops->poll(file, smc->clcsock, wait);
1095                /* if non-blocking connect finished ... */
1096                lock_sock(sk);
1097                if ((sk->sk_state == SMC_INIT) && (mask & POLLOUT)) {
1098                        sk->sk_err = smc->clcsock->sk->sk_err;
1099                        if (sk->sk_err) {
1100                                mask |= POLLERR;
1101                        } else {
1102                                rc = smc_connect_rdma(smc);
1103                                if (rc < 0)
1104                                        mask |= POLLERR;
1105                                else
1106                                        /* success cases including fallback */
1107                                        mask |= POLLOUT | POLLWRNORM;
1108                        }
1109                }
1110                release_sock(sk);
1111        } else {
1112                sock_poll_wait(file, sk_sleep(sk), wait);
1113                if (sk->sk_state == SMC_LISTEN)
1114                        /* woken up by sk_data_ready in smc_listen_work() */
1115                        mask |= smc_accept_poll(sk);
1116                if (sk->sk_err)
1117                        mask |= POLLERR;
1118                if (atomic_read(&smc->conn.sndbuf_space) ||
1119                    (sk->sk_shutdown & SEND_SHUTDOWN)) {
1120                        mask |= POLLOUT | POLLWRNORM;
1121                } else {
1122                        sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1123                        set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1124                }
1125                if (atomic_read(&smc->conn.bytes_to_rcv))
1126                        mask |= POLLIN | POLLRDNORM;
1127                if ((sk->sk_shutdown == SHUTDOWN_MASK) ||
1128                    (sk->sk_state == SMC_CLOSED))
1129                        mask |= POLLHUP;
1130                if (sk->sk_shutdown & RCV_SHUTDOWN)
1131                        mask |= POLLIN | POLLRDNORM | POLLRDHUP;
1132                if (sk->sk_state == SMC_APPCLOSEWAIT1)
1133                        mask |= POLLIN;
1134
1135        }
1136
1137        return mask;
1138}
1139
1140static int smc_shutdown(struct socket *sock, int how)
1141{
1142        struct sock *sk = sock->sk;
1143        struct smc_sock *smc;
1144        int rc = -EINVAL;
1145        int rc1 = 0;
1146
1147        smc = smc_sk(sk);
1148
1149        if ((how < SHUT_RD) || (how > SHUT_RDWR))
1150                return rc;
1151
1152        lock_sock(sk);
1153
1154        rc = -ENOTCONN;
1155        if ((sk->sk_state != SMC_LISTEN) &&
1156            (sk->sk_state != SMC_ACTIVE) &&
1157            (sk->sk_state != SMC_PEERCLOSEWAIT1) &&
1158            (sk->sk_state != SMC_PEERCLOSEWAIT2) &&
1159            (sk->sk_state != SMC_APPCLOSEWAIT1) &&
1160            (sk->sk_state != SMC_APPCLOSEWAIT2) &&
1161            (sk->sk_state != SMC_APPFINCLOSEWAIT))
1162                goto out;
1163        if (smc->use_fallback) {
1164                rc = kernel_sock_shutdown(smc->clcsock, how);
1165                sk->sk_shutdown = smc->clcsock->sk->sk_shutdown;
1166                if (sk->sk_shutdown == SHUTDOWN_MASK)
1167                        sk->sk_state = SMC_CLOSED;
1168                goto out;
1169        }
1170        switch (how) {
1171        case SHUT_RDWR:         /* shutdown in both directions */
1172                rc = smc_close_active(smc);
1173                break;
1174        case SHUT_WR:
1175                rc = smc_close_shutdown_write(smc);
1176                break;
1177        case SHUT_RD:
1178                if (sk->sk_state == SMC_LISTEN)
1179                        rc = smc_close_active(smc);
1180                else
1181                        rc = 0;
1182                        /* nothing more to do because peer is not involved */
1183                break;
1184        }
1185        rc1 = kernel_sock_shutdown(smc->clcsock, how);
1186        /* map sock_shutdown_cmd constants to sk_shutdown value range */
1187        sk->sk_shutdown |= how + 1;
1188
1189out:
1190        release_sock(sk);
1191        return rc ? rc : rc1;
1192}
1193
1194static int smc_setsockopt(struct socket *sock, int level, int optname,
1195                          char __user *optval, unsigned int optlen)
1196{
1197        struct sock *sk = sock->sk;
1198        struct smc_sock *smc;
1199
1200        smc = smc_sk(sk);
1201
1202        /* generic setsockopts reaching us here always apply to the
1203         * CLC socket
1204         */
1205        return smc->clcsock->ops->setsockopt(smc->clcsock, level, optname,
1206                                             optval, optlen);
1207}
1208
1209static int smc_getsockopt(struct socket *sock, int level, int optname,
1210                          char __user *optval, int __user *optlen)
1211{
1212        struct smc_sock *smc;
1213
1214        smc = smc_sk(sock->sk);
1215        /* socket options apply to the CLC socket */
1216        return smc->clcsock->ops->getsockopt(smc->clcsock, level, optname,
1217                                             optval, optlen);
1218}
1219
1220static int smc_ioctl(struct socket *sock, unsigned int cmd,
1221                     unsigned long arg)
1222{
1223        struct smc_sock *smc;
1224
1225        smc = smc_sk(sock->sk);
1226        if (smc->use_fallback)
1227                return smc->clcsock->ops->ioctl(smc->clcsock, cmd, arg);
1228        else
1229                return sock_no_ioctl(sock, cmd, arg);
1230}
1231
1232static ssize_t smc_sendpage(struct socket *sock, struct page *page,
1233                            int offset, size_t size, int flags)
1234{
1235        struct sock *sk = sock->sk;
1236        struct smc_sock *smc;
1237        int rc = -EPIPE;
1238
1239        smc = smc_sk(sk);
1240        lock_sock(sk);
1241        if (sk->sk_state != SMC_ACTIVE)
1242                goto out;
1243        if (smc->use_fallback)
1244                rc = kernel_sendpage(smc->clcsock, page, offset,
1245                                     size, flags);
1246        else
1247                rc = sock_no_sendpage(sock, page, offset, size, flags);
1248
1249out:
1250        release_sock(sk);
1251        return rc;
1252}
1253
1254static ssize_t smc_splice_read(struct socket *sock, loff_t *ppos,
1255                               struct pipe_inode_info *pipe, size_t len,
1256                                    unsigned int flags)
1257{
1258        struct sock *sk = sock->sk;
1259        struct smc_sock *smc;
1260        int rc = -ENOTCONN;
1261
1262        smc = smc_sk(sk);
1263        lock_sock(sk);
1264        if ((sk->sk_state != SMC_ACTIVE) && (sk->sk_state != SMC_CLOSED))
1265                goto out;
1266        if (smc->use_fallback) {
1267                rc = smc->clcsock->ops->splice_read(smc->clcsock, ppos,
1268                                                    pipe, len, flags);
1269        } else {
1270                rc = -EOPNOTSUPP;
1271        }
1272out:
1273        release_sock(sk);
1274        return rc;
1275}
1276
1277/* must look like tcp */
1278static const struct proto_ops smc_sock_ops = {
1279        .family         = PF_SMC,
1280        .owner          = THIS_MODULE,
1281        .release        = smc_release,
1282        .bind           = smc_bind,
1283        .connect        = smc_connect,
1284        .socketpair     = sock_no_socketpair,
1285        .accept         = smc_accept,
1286        .getname        = smc_getname,
1287        .poll           = smc_poll,
1288        .ioctl          = smc_ioctl,
1289        .listen         = smc_listen,
1290        .shutdown       = smc_shutdown,
1291        .setsockopt     = smc_setsockopt,
1292        .getsockopt     = smc_getsockopt,
1293        .sendmsg        = smc_sendmsg,
1294        .recvmsg        = smc_recvmsg,
1295        .mmap           = sock_no_mmap,
1296        .sendpage       = smc_sendpage,
1297        .splice_read    = smc_splice_read,
1298};
1299
1300static int smc_create(struct net *net, struct socket *sock, int protocol,
1301                      int kern)
1302{
1303        struct smc_sock *smc;
1304        struct sock *sk;
1305        int rc;
1306
1307        rc = -ESOCKTNOSUPPORT;
1308        if (sock->type != SOCK_STREAM)
1309                goto out;
1310
1311        rc = -EPROTONOSUPPORT;
1312        if ((protocol != IPPROTO_IP) && (protocol != IPPROTO_TCP))
1313                goto out;
1314
1315        rc = -ENOBUFS;
1316        sock->ops = &smc_sock_ops;
1317        sk = smc_sock_alloc(net, sock);
1318        if (!sk)
1319                goto out;
1320
1321        /* create internal TCP socket for CLC handshake and fallback */
1322        smc = smc_sk(sk);
1323        smc->use_fallback = false; /* assume rdma capability first */
1324        rc = sock_create_kern(net, PF_INET, SOCK_STREAM,
1325                              IPPROTO_TCP, &smc->clcsock);
1326        if (rc)
1327                sk_common_release(sk);
1328        smc->sk.sk_sndbuf = max(smc->clcsock->sk->sk_sndbuf, SMC_BUF_MIN_SIZE);
1329        smc->sk.sk_rcvbuf = max(smc->clcsock->sk->sk_rcvbuf, SMC_BUF_MIN_SIZE);
1330
1331out:
1332        return rc;
1333}
1334
1335static const struct net_proto_family smc_sock_family_ops = {
1336        .family = PF_SMC,
1337        .owner  = THIS_MODULE,
1338        .create = smc_create,
1339};
1340
1341static int __init smc_init(void)
1342{
1343        int rc;
1344
1345        rc = smc_pnet_init();
1346        if (rc)
1347                return rc;
1348
1349        rc = smc_llc_init();
1350        if (rc) {
1351                pr_err("%s: smc_llc_init fails with %d\n", __func__, rc);
1352                goto out_pnet;
1353        }
1354
1355        rc = smc_cdc_init();
1356        if (rc) {
1357                pr_err("%s: smc_cdc_init fails with %d\n", __func__, rc);
1358                goto out_pnet;
1359        }
1360
1361        rc = proto_register(&smc_proto, 1);
1362        if (rc) {
1363                pr_err("%s: proto_register fails with %d\n", __func__, rc);
1364                goto out_pnet;
1365        }
1366
1367        rc = sock_register(&smc_sock_family_ops);
1368        if (rc) {
1369                pr_err("%s: sock_register fails with %d\n", __func__, rc);
1370                goto out_proto;
1371        }
1372        INIT_HLIST_HEAD(&smc_v4_hashinfo.ht);
1373
1374        rc = smc_ib_register_client();
1375        if (rc) {
1376                pr_err("%s: ib_register fails with %d\n", __func__, rc);
1377                goto out_sock;
1378        }
1379
1380        return 0;
1381
1382out_sock:
1383        sock_unregister(PF_SMC);
1384out_proto:
1385        proto_unregister(&smc_proto);
1386out_pnet:
1387        smc_pnet_exit();
1388        return rc;
1389}
1390
1391static void __exit smc_exit(void)
1392{
1393        struct smc_link_group *lgr, *lg;
1394        LIST_HEAD(lgr_freeing_list);
1395
1396        spin_lock_bh(&smc_lgr_list.lock);
1397        if (!list_empty(&smc_lgr_list.list))
1398                list_splice_init(&smc_lgr_list.list, &lgr_freeing_list);
1399        spin_unlock_bh(&smc_lgr_list.lock);
1400        list_for_each_entry_safe(lgr, lg, &lgr_freeing_list, list) {
1401                list_del_init(&lgr->list);
1402                smc_lgr_free(lgr); /* free link group */
1403        }
1404        smc_ib_unregister_client();
1405        sock_unregister(PF_SMC);
1406        proto_unregister(&smc_proto);
1407        smc_pnet_exit();
1408}
1409
1410module_init(smc_init);
1411module_exit(smc_exit);
1412
1413MODULE_AUTHOR("Ursula Braun <ubraun@linux.vnet.ibm.com>");
1414MODULE_DESCRIPTION("smc socket address family");
1415MODULE_LICENSE("GPL");
1416MODULE_ALIAS_NETPROTO(PF_SMC);
1417