linux/net/smc/af_smc.c
<<
>>
Prefs
   1/*
   2 *  Shared Memory Communications over RDMA (SMC-R) and RoCE
   3 *
   4 *  AF_SMC protocol family socket handler keeping the AF_INET sock address type
   5 *  applies to SOCK_STREAM sockets only
   6 *  offers an alternative communication option for TCP-protocol sockets
   7 *  applicable with RoCE-cards only
   8 *
   9 *  Initial restrictions:
  10 *    - non-blocking connect postponed
  11 *    - IPv6 support postponed
  12 *    - support for alternate links postponed
  13 *    - partial support for non-blocking sockets only
  14 *    - support for urgent data postponed
  15 *
  16 *  Copyright IBM Corp. 2016
  17 *
  18 *  Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
  19 *              based on prototype from Frank Blaschka
  20 */
  21
  22#define KMSG_COMPONENT "smc"
  23#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
  24
  25#include <linux/module.h>
  26#include <linux/socket.h>
  27#include <linux/inetdevice.h>
  28#include <linux/workqueue.h>
  29#include <linux/in.h>
  30#include <linux/sched/signal.h>
  31
  32#include <net/sock.h>
  33#include <net/tcp.h>
  34#include <net/smc.h>
  35
  36#include "smc.h"
  37#include "smc_clc.h"
  38#include "smc_llc.h"
  39#include "smc_cdc.h"
  40#include "smc_core.h"
  41#include "smc_ib.h"
  42#include "smc_pnet.h"
  43#include "smc_tx.h"
  44#include "smc_rx.h"
  45#include "smc_close.h"
  46
  47static DEFINE_MUTEX(smc_create_lgr_pending);    /* serialize link group
  48                                                 * creation
  49                                                 */
  50
  51struct smc_lgr_list smc_lgr_list = {            /* established link groups */
  52        .lock = __SPIN_LOCK_UNLOCKED(smc_lgr_list.lock),
  53        .list = LIST_HEAD_INIT(smc_lgr_list.list),
  54};
  55
  56static void smc_tcp_listen_work(struct work_struct *);
  57
  58static void smc_set_keepalive(struct sock *sk, int val)
  59{
  60        struct smc_sock *smc = smc_sk(sk);
  61
  62        smc->clcsock->sk->sk_prot->keepalive(smc->clcsock->sk, val);
  63}
  64
  65static struct smc_hashinfo smc_v4_hashinfo = {
  66        .lock = __RW_LOCK_UNLOCKED(smc_v4_hashinfo.lock),
  67};
  68
  69int smc_hash_sk(struct sock *sk)
  70{
  71        struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;
  72        struct hlist_head *head;
  73
  74        head = &h->ht;
  75
  76        write_lock_bh(&h->lock);
  77        sk_add_node(sk, head);
  78        sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
  79        write_unlock_bh(&h->lock);
  80
  81        return 0;
  82}
  83EXPORT_SYMBOL_GPL(smc_hash_sk);
  84
  85void smc_unhash_sk(struct sock *sk)
  86{
  87        struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;
  88
  89        write_lock_bh(&h->lock);
  90        if (sk_del_node_init(sk))
  91                sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
  92        write_unlock_bh(&h->lock);
  93}
  94EXPORT_SYMBOL_GPL(smc_unhash_sk);
  95
  96struct proto smc_proto = {
  97        .name           = "SMC",
  98        .owner          = THIS_MODULE,
  99        .keepalive      = smc_set_keepalive,
 100        .hash           = smc_hash_sk,
 101        .unhash         = smc_unhash_sk,
 102        .obj_size       = sizeof(struct smc_sock),
 103        .h.smc_hash     = &smc_v4_hashinfo,
 104        .slab_flags     = SLAB_DESTROY_BY_RCU,
 105};
 106EXPORT_SYMBOL_GPL(smc_proto);
 107
 108static int smc_release(struct socket *sock)
 109{
 110        struct sock *sk = sock->sk;
 111        struct smc_sock *smc;
 112        int rc = 0;
 113
 114        if (!sk)
 115                goto out;
 116
 117        smc = smc_sk(sk);
 118        sock_hold(sk);
 119        if (sk->sk_state == SMC_LISTEN)
 120                /* smc_close_non_accepted() is called and acquires
 121                 * sock lock for child sockets again
 122                 */
 123                lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
 124        else
 125                lock_sock(sk);
 126
 127        if (smc->use_fallback) {
 128                sk->sk_state = SMC_CLOSED;
 129                sk->sk_state_change(sk);
 130        } else {
 131                rc = smc_close_active(smc);
 132                sock_set_flag(sk, SOCK_DEAD);
 133                sk->sk_shutdown |= SHUTDOWN_MASK;
 134        }
 135        if (smc->clcsock) {
 136                sock_release(smc->clcsock);
 137                smc->clcsock = NULL;
 138        }
 139
 140        /* detach socket */
 141        sock_orphan(sk);
 142        sock->sk = NULL;
 143        if (smc->use_fallback) {
 144                schedule_delayed_work(&smc->sock_put_work, TCP_TIMEWAIT_LEN);
 145        } else if (sk->sk_state == SMC_CLOSED) {
 146                smc_conn_free(&smc->conn);
 147                schedule_delayed_work(&smc->sock_put_work,
 148                                      SMC_CLOSE_SOCK_PUT_DELAY);
 149        }
 150        sk->sk_prot->unhash(sk);
 151        release_sock(sk);
 152
 153        sock_put(sk);
 154out:
 155        return rc;
 156}
 157
 158static void smc_destruct(struct sock *sk)
 159{
 160        if (sk->sk_state != SMC_CLOSED)
 161                return;
 162        if (!sock_flag(sk, SOCK_DEAD))
 163                return;
 164
 165        sk_refcnt_debug_dec(sk);
 166}
 167
 168static struct sock *smc_sock_alloc(struct net *net, struct socket *sock)
 169{
 170        struct smc_sock *smc;
 171        struct sock *sk;
 172
 173        sk = sk_alloc(net, PF_SMC, GFP_KERNEL, &smc_proto, 0);
 174        if (!sk)
 175                return NULL;
 176
 177        sock_init_data(sock, sk); /* sets sk_refcnt to 1 */
 178        sk->sk_state = SMC_INIT;
 179        sk->sk_destruct = smc_destruct;
 180        sk->sk_protocol = SMCPROTO_SMC;
 181        smc = smc_sk(sk);
 182        INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
 183        INIT_LIST_HEAD(&smc->accept_q);
 184        spin_lock_init(&smc->accept_q_lock);
 185        INIT_DELAYED_WORK(&smc->sock_put_work, smc_close_sock_put_work);
 186        sk->sk_prot->hash(sk);
 187        sk_refcnt_debug_inc(sk);
 188
 189        return sk;
 190}
 191
 192static int smc_bind(struct socket *sock, struct sockaddr *uaddr,
 193                    int addr_len)
 194{
 195        struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
 196        struct sock *sk = sock->sk;
 197        struct smc_sock *smc;
 198        int rc;
 199
 200        smc = smc_sk(sk);
 201
 202        /* replicate tests from inet_bind(), to be safe wrt. future changes */
 203        rc = -EINVAL;
 204        if (addr_len < sizeof(struct sockaddr_in))
 205                goto out;
 206
 207        rc = -EAFNOSUPPORT;
 208        /* accept AF_UNSPEC (mapped to AF_INET) only if s_addr is INADDR_ANY */
 209        if ((addr->sin_family != AF_INET) &&
 210            ((addr->sin_family != AF_UNSPEC) ||
 211             (addr->sin_addr.s_addr != htonl(INADDR_ANY))))
 212                goto out;
 213
 214        lock_sock(sk);
 215
 216        /* Check if socket is already active */
 217        rc = -EINVAL;
 218        if (sk->sk_state != SMC_INIT)
 219                goto out_rel;
 220
 221        smc->clcsock->sk->sk_reuse = sk->sk_reuse;
 222        rc = kernel_bind(smc->clcsock, uaddr, addr_len);
 223
 224out_rel:
 225        release_sock(sk);
 226out:
 227        return rc;
 228}
 229
 230static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk,
 231                                   unsigned long mask)
 232{
 233        /* options we don't get control via setsockopt for */
 234        nsk->sk_type = osk->sk_type;
 235        nsk->sk_sndbuf = osk->sk_sndbuf;
 236        nsk->sk_rcvbuf = osk->sk_rcvbuf;
 237        nsk->sk_sndtimeo = osk->sk_sndtimeo;
 238        nsk->sk_rcvtimeo = osk->sk_rcvtimeo;
 239        nsk->sk_mark = osk->sk_mark;
 240        nsk->sk_priority = osk->sk_priority;
 241        nsk->sk_rcvlowat = osk->sk_rcvlowat;
 242        nsk->sk_bound_dev_if = osk->sk_bound_dev_if;
 243        nsk->sk_err = osk->sk_err;
 244
 245        nsk->sk_flags &= ~mask;
 246        nsk->sk_flags |= osk->sk_flags & mask;
 247}
 248
 249#define SK_FLAGS_SMC_TO_CLC ((1UL << SOCK_URGINLINE) | \
 250                             (1UL << SOCK_KEEPOPEN) | \
 251                             (1UL << SOCK_LINGER) | \
 252                             (1UL << SOCK_BROADCAST) | \
 253                             (1UL << SOCK_TIMESTAMP) | \
 254                             (1UL << SOCK_DBG) | \
 255                             (1UL << SOCK_RCVTSTAMP) | \
 256                             (1UL << SOCK_RCVTSTAMPNS) | \
 257                             (1UL << SOCK_LOCALROUTE) | \
 258                             (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE) | \
 259                             (1UL << SOCK_RXQ_OVFL) | \
 260                             (1UL << SOCK_WIFI_STATUS) | \
 261                             (1UL << SOCK_NOFCS) | \
 262                             (1UL << SOCK_FILTER_LOCKED))
 263/* copy only relevant settings and flags of SOL_SOCKET level from smc to
 264 * clc socket (since smc is not called for these options from net/core)
 265 */
 266static void smc_copy_sock_settings_to_clc(struct smc_sock *smc)
 267{
 268        smc_copy_sock_settings(smc->clcsock->sk, &smc->sk, SK_FLAGS_SMC_TO_CLC);
 269}
 270
 271#define SK_FLAGS_CLC_TO_SMC ((1UL << SOCK_URGINLINE) | \
 272                             (1UL << SOCK_KEEPOPEN) | \
 273                             (1UL << SOCK_LINGER) | \
 274                             (1UL << SOCK_DBG))
 275/* copy only settings and flags relevant for smc from clc to smc socket */
 276static void smc_copy_sock_settings_to_smc(struct smc_sock *smc)
 277{
 278        smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC);
 279}
 280
 281/* determine subnet and mask of internal TCP socket */
 282int smc_netinfo_by_tcpsk(struct socket *clcsock,
 283                         __be32 *subnet, u8 *prefix_len)
 284{
 285        struct dst_entry *dst = sk_dst_get(clcsock->sk);
 286        struct sockaddr_in addr;
 287        int rc = -ENOENT;
 288        int len;
 289
 290        if (!dst) {
 291                rc = -ENOTCONN;
 292                goto out;
 293        }
 294        if (!dst->dev) {
 295                rc = -ENODEV;
 296                goto out_rel;
 297        }
 298
 299        /* get address to which the internal TCP socket is bound */
 300        kernel_getsockname(clcsock, (struct sockaddr *)&addr, &len);
 301        /* analyze IPv4 specific data of net_device belonging to TCP socket */
 302        for_ifa(dst->dev->ip_ptr) {
 303                if (ifa->ifa_address != addr.sin_addr.s_addr)
 304                        continue;
 305                *prefix_len = inet_mask_len(ifa->ifa_mask);
 306                *subnet = ifa->ifa_address & ifa->ifa_mask;
 307                rc = 0;
 308                break;
 309        } endfor_ifa(dst->dev->ip_ptr);
 310
 311out_rel:
 312        dst_release(dst);
 313out:
 314        return rc;
 315}
 316
 317static int smc_clnt_conf_first_link(struct smc_sock *smc, union ib_gid *gid)
 318{
 319        struct smc_link_group *lgr = smc->conn.lgr;
 320        struct smc_link *link;
 321        int rest;
 322        int rc;
 323
 324        link = &lgr->lnk[SMC_SINGLE_LINK];
 325        /* receive CONFIRM LINK request from server over RoCE fabric */
 326        rest = wait_for_completion_interruptible_timeout(
 327                &link->llc_confirm,
 328                SMC_LLC_WAIT_FIRST_TIME);
 329        if (rest <= 0) {
 330                struct smc_clc_msg_decline dclc;
 331
 332                rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
 333                                      SMC_CLC_DECLINE);
 334                return rc;
 335        }
 336
 337        rc = smc_ib_modify_qp_rts(link);
 338        if (rc)
 339                return SMC_CLC_DECL_INTERR;
 340
 341        smc_wr_remember_qp_attr(link);
 342        /* send CONFIRM LINK response over RoCE fabric */
 343        rc = smc_llc_send_confirm_link(link,
 344                                       link->smcibdev->mac[link->ibport - 1],
 345                                       gid, SMC_LLC_RESP);
 346        if (rc < 0)
 347                return SMC_CLC_DECL_TCL;
 348
 349        return rc;
 350}
 351
 352static void smc_conn_save_peer_info(struct smc_sock *smc,
 353                                    struct smc_clc_msg_accept_confirm *clc)
 354{
 355        smc->conn.peer_conn_idx = clc->conn_idx;
 356        smc->conn.local_tx_ctrl.token = ntohl(clc->rmbe_alert_token);
 357        smc->conn.peer_rmbe_size = smc_uncompress_bufsize(clc->rmbe_size);
 358        atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
 359}
 360
 361static void smc_link_save_peer_info(struct smc_link *link,
 362                                    struct smc_clc_msg_accept_confirm *clc)
 363{
 364        link->peer_qpn = ntoh24(clc->qpn);
 365        memcpy(link->peer_gid, clc->lcl.gid, SMC_GID_SIZE);
 366        memcpy(link->peer_mac, clc->lcl.mac, sizeof(link->peer_mac));
 367        link->peer_psn = ntoh24(clc->psn);
 368        link->peer_mtu = clc->qp_mtu;
 369}
 370
 371/* setup for RDMA connection of client */
 372static int smc_connect_rdma(struct smc_sock *smc)
 373{
 374        struct sockaddr_in *inaddr = (struct sockaddr_in *)smc->addr;
 375        struct smc_clc_msg_accept_confirm aclc;
 376        int local_contact = SMC_FIRST_CONTACT;
 377        struct smc_ib_device *smcibdev;
 378        struct smc_link *link;
 379        u8 srv_first_contact;
 380        int reason_code = 0;
 381        int rc = 0;
 382        u8 ibport;
 383
 384        /* IPSec connections opt out of SMC-R optimizations */
 385        if (using_ipsec(smc)) {
 386                reason_code = SMC_CLC_DECL_IPSEC;
 387                goto decline_rdma;
 388        }
 389
 390        /* PNET table look up: search active ib_device and port
 391         * within same PNETID that also contains the ethernet device
 392         * used for the internal TCP socket
 393         */
 394        smc_pnet_find_roce_resource(smc->clcsock->sk, &smcibdev, &ibport);
 395        if (!smcibdev) {
 396                reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
 397                goto decline_rdma;
 398        }
 399
 400        /* do inband token exchange */
 401        reason_code = smc_clc_send_proposal(smc, smcibdev, ibport);
 402        if (reason_code < 0) {
 403                rc = reason_code;
 404                goto out_err;
 405        }
 406        if (reason_code > 0) /* configuration error */
 407                goto decline_rdma;
 408        /* receive SMC Accept CLC message */
 409        reason_code = smc_clc_wait_msg(smc, &aclc, sizeof(aclc),
 410                                       SMC_CLC_ACCEPT);
 411        if (reason_code < 0) {
 412                rc = reason_code;
 413                goto out_err;
 414        }
 415        if (reason_code > 0)
 416                goto decline_rdma;
 417
 418        srv_first_contact = aclc.hdr.flag;
 419        mutex_lock(&smc_create_lgr_pending);
 420        local_contact = smc_conn_create(smc, inaddr->sin_addr.s_addr, smcibdev,
 421                                        ibport, &aclc.lcl, srv_first_contact);
 422        if (local_contact < 0) {
 423                rc = local_contact;
 424                if (rc == -ENOMEM)
 425                        reason_code = SMC_CLC_DECL_MEM;/* insufficient memory*/
 426                else if (rc == -ENOLINK)
 427                        reason_code = SMC_CLC_DECL_SYNCERR; /* synchr. error */
 428                goto decline_rdma_unlock;
 429        }
 430        link = &smc->conn.lgr->lnk[SMC_SINGLE_LINK];
 431
 432        smc_conn_save_peer_info(smc, &aclc);
 433
 434        rc = smc_sndbuf_create(smc);
 435        if (rc) {
 436                reason_code = SMC_CLC_DECL_MEM;
 437                goto decline_rdma_unlock;
 438        }
 439        rc = smc_rmb_create(smc);
 440        if (rc) {
 441                reason_code = SMC_CLC_DECL_MEM;
 442                goto decline_rdma_unlock;
 443        }
 444
 445        if (local_contact == SMC_FIRST_CONTACT)
 446                smc_link_save_peer_info(link, &aclc);
 447
 448        rc = smc_rmb_rtoken_handling(&smc->conn, &aclc);
 449        if (rc) {
 450                reason_code = SMC_CLC_DECL_INTERR;
 451                goto decline_rdma_unlock;
 452        }
 453
 454        if (local_contact == SMC_FIRST_CONTACT) {
 455                rc = smc_ib_ready_link(link);
 456                if (rc) {
 457                        reason_code = SMC_CLC_DECL_INTERR;
 458                        goto decline_rdma_unlock;
 459                }
 460        }
 461
 462        rc = smc_clc_send_confirm(smc);
 463        if (rc)
 464                goto out_err_unlock;
 465
 466        if (local_contact == SMC_FIRST_CONTACT) {
 467                /* QP confirmation over RoCE fabric */
 468                reason_code = smc_clnt_conf_first_link(
 469                        smc, &smcibdev->gid[ibport - 1]);
 470                if (reason_code < 0) {
 471                        rc = reason_code;
 472                        goto out_err_unlock;
 473                }
 474                if (reason_code > 0)
 475                        goto decline_rdma_unlock;
 476        }
 477
 478        mutex_unlock(&smc_create_lgr_pending);
 479        smc_tx_init(smc);
 480        smc_rx_init(smc);
 481
 482out_connected:
 483        smc_copy_sock_settings_to_clc(smc);
 484        if (smc->sk.sk_state == SMC_INIT)
 485                smc->sk.sk_state = SMC_ACTIVE;
 486
 487        return rc ? rc : local_contact;
 488
 489decline_rdma_unlock:
 490        mutex_unlock(&smc_create_lgr_pending);
 491        smc_conn_free(&smc->conn);
 492decline_rdma:
 493        /* RDMA setup failed, switch back to TCP */
 494        smc->use_fallback = true;
 495        if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) {
 496                rc = smc_clc_send_decline(smc, reason_code, 0);
 497                if (rc < sizeof(struct smc_clc_msg_decline))
 498                        goto out_err;
 499        }
 500        goto out_connected;
 501
 502out_err_unlock:
 503        mutex_unlock(&smc_create_lgr_pending);
 504        smc_conn_free(&smc->conn);
 505out_err:
 506        return rc;
 507}
 508
 509static int smc_connect(struct socket *sock, struct sockaddr *addr,
 510                       int alen, int flags)
 511{
 512        struct sock *sk = sock->sk;
 513        struct smc_sock *smc;
 514        int rc = -EINVAL;
 515
 516        smc = smc_sk(sk);
 517
 518        /* separate smc parameter checking to be safe */
 519        if (alen < sizeof(addr->sa_family))
 520                goto out_err;
 521        if (addr->sa_family != AF_INET)
 522                goto out_err;
 523        smc->addr = addr;       /* needed for nonblocking connect */
 524
 525        lock_sock(sk);
 526        switch (sk->sk_state) {
 527        default:
 528                goto out;
 529        case SMC_ACTIVE:
 530                rc = -EISCONN;
 531                goto out;
 532        case SMC_INIT:
 533                rc = 0;
 534                break;
 535        }
 536
 537        smc_copy_sock_settings_to_clc(smc);
 538        rc = kernel_connect(smc->clcsock, addr, alen, flags);
 539        if (rc)
 540                goto out;
 541
 542        /* setup RDMA connection */
 543        rc = smc_connect_rdma(smc);
 544        if (rc < 0)
 545                goto out;
 546        else
 547                rc = 0; /* success cases including fallback */
 548
 549out:
 550        release_sock(sk);
 551out_err:
 552        return rc;
 553}
 554
 555static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc)
 556{
 557        struct sock *sk = &lsmc->sk;
 558        struct socket *new_clcsock;
 559        struct sock *new_sk;
 560        int rc;
 561
 562        release_sock(&lsmc->sk);
 563        new_sk = smc_sock_alloc(sock_net(sk), NULL);
 564        if (!new_sk) {
 565                rc = -ENOMEM;
 566                lsmc->sk.sk_err = ENOMEM;
 567                *new_smc = NULL;
 568                lock_sock(&lsmc->sk);
 569                goto out;
 570        }
 571        *new_smc = smc_sk(new_sk);
 572
 573        rc = kernel_accept(lsmc->clcsock, &new_clcsock, 0);
 574        lock_sock(&lsmc->sk);
 575        if  (rc < 0) {
 576                lsmc->sk.sk_err = -rc;
 577                new_sk->sk_state = SMC_CLOSED;
 578                sock_set_flag(new_sk, SOCK_DEAD);
 579                sk->sk_prot->unhash(new_sk);
 580                sock_put(new_sk);
 581                *new_smc = NULL;
 582                goto out;
 583        }
 584        if (lsmc->sk.sk_state == SMC_CLOSED) {
 585                if (new_clcsock)
 586                        sock_release(new_clcsock);
 587                new_sk->sk_state = SMC_CLOSED;
 588                sock_set_flag(new_sk, SOCK_DEAD);
 589                sk->sk_prot->unhash(new_sk);
 590                sock_put(new_sk);
 591                *new_smc = NULL;
 592                goto out;
 593        }
 594
 595        (*new_smc)->clcsock = new_clcsock;
 596out:
 597        return rc;
 598}
 599
 600/* add a just created sock to the accept queue of the listen sock as
 601 * candidate for a following socket accept call from user space
 602 */
 603static void smc_accept_enqueue(struct sock *parent, struct sock *sk)
 604{
 605        struct smc_sock *par = smc_sk(parent);
 606
 607        sock_hold(sk);
 608        spin_lock(&par->accept_q_lock);
 609        list_add_tail(&smc_sk(sk)->accept_q, &par->accept_q);
 610        spin_unlock(&par->accept_q_lock);
 611        sk_acceptq_added(parent);
 612}
 613
 614/* remove a socket from the accept queue of its parental listening socket */
 615static void smc_accept_unlink(struct sock *sk)
 616{
 617        struct smc_sock *par = smc_sk(sk)->listen_smc;
 618
 619        spin_lock(&par->accept_q_lock);
 620        list_del_init(&smc_sk(sk)->accept_q);
 621        spin_unlock(&par->accept_q_lock);
 622        sk_acceptq_removed(&smc_sk(sk)->listen_smc->sk);
 623        sock_put(sk);
 624}
 625
 626/* remove a sock from the accept queue to bind it to a new socket created
 627 * for a socket accept call from user space
 628 */
 629struct sock *smc_accept_dequeue(struct sock *parent,
 630                                struct socket *new_sock)
 631{
 632        struct smc_sock *isk, *n;
 633        struct sock *new_sk;
 634
 635        list_for_each_entry_safe(isk, n, &smc_sk(parent)->accept_q, accept_q) {
 636                new_sk = (struct sock *)isk;
 637
 638                smc_accept_unlink(new_sk);
 639                if (new_sk->sk_state == SMC_CLOSED) {
 640                        /* tbd in follow-on patch: close this sock */
 641                        continue;
 642                }
 643                if (new_sock)
 644                        sock_graft(new_sk, new_sock);
 645                return new_sk;
 646        }
 647        return NULL;
 648}
 649
 650/* clean up for a created but never accepted sock */
 651void smc_close_non_accepted(struct sock *sk)
 652{
 653        struct smc_sock *smc = smc_sk(sk);
 654
 655        sock_hold(sk);
 656        lock_sock(sk);
 657        if (!sk->sk_lingertime)
 658                /* wait for peer closing */
 659                sk->sk_lingertime = SMC_MAX_STREAM_WAIT_TIMEOUT;
 660        if (!smc->use_fallback)
 661                smc_close_active(smc);
 662        if (smc->clcsock) {
 663                struct socket *tcp;
 664
 665                tcp = smc->clcsock;
 666                smc->clcsock = NULL;
 667                sock_release(tcp);
 668        }
 669        sock_set_flag(sk, SOCK_DEAD);
 670        sk->sk_shutdown |= SHUTDOWN_MASK;
 671        if (smc->use_fallback) {
 672                schedule_delayed_work(&smc->sock_put_work, TCP_TIMEWAIT_LEN);
 673        } else {
 674                smc_conn_free(&smc->conn);
 675                schedule_delayed_work(&smc->sock_put_work,
 676                                      SMC_CLOSE_SOCK_PUT_DELAY);
 677        }
 678        release_sock(sk);
 679        sock_put(sk);
 680}
 681
 682static int smc_serv_conf_first_link(struct smc_sock *smc)
 683{
 684        struct smc_link_group *lgr = smc->conn.lgr;
 685        struct smc_link *link;
 686        int rest;
 687        int rc;
 688
 689        link = &lgr->lnk[SMC_SINGLE_LINK];
 690        /* send CONFIRM LINK request to client over the RoCE fabric */
 691        rc = smc_llc_send_confirm_link(link,
 692                                       link->smcibdev->mac[link->ibport - 1],
 693                                       &link->smcibdev->gid[link->ibport - 1],
 694                                       SMC_LLC_REQ);
 695        if (rc < 0)
 696                return SMC_CLC_DECL_TCL;
 697
 698        /* receive CONFIRM LINK response from client over the RoCE fabric */
 699        rest = wait_for_completion_interruptible_timeout(
 700                &link->llc_confirm_resp,
 701                SMC_LLC_WAIT_FIRST_TIME);
 702        if (rest <= 0) {
 703                struct smc_clc_msg_decline dclc;
 704
 705                rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
 706                                      SMC_CLC_DECLINE);
 707        }
 708
 709        return rc;
 710}
 711
 712/* setup for RDMA connection of server */
 713static void smc_listen_work(struct work_struct *work)
 714{
 715        struct smc_sock *new_smc = container_of(work, struct smc_sock,
 716                                                smc_listen_work);
 717        struct socket *newclcsock = new_smc->clcsock;
 718        struct smc_sock *lsmc = new_smc->listen_smc;
 719        struct smc_clc_msg_accept_confirm cclc;
 720        int local_contact = SMC_REUSE_CONTACT;
 721        struct sock *newsmcsk = &new_smc->sk;
 722        struct smc_clc_msg_proposal pclc;
 723        struct smc_ib_device *smcibdev;
 724        struct sockaddr_in peeraddr;
 725        struct smc_link *link;
 726        int reason_code = 0;
 727        int rc = 0, len;
 728        __be32 subnet;
 729        u8 prefix_len;
 730        u8 ibport;
 731
 732        /* do inband token exchange -
 733         *wait for and receive SMC Proposal CLC message
 734         */
 735        reason_code = smc_clc_wait_msg(new_smc, &pclc, sizeof(pclc),
 736                                       SMC_CLC_PROPOSAL);
 737        if (reason_code < 0)
 738                goto out_err;
 739        if (reason_code > 0)
 740                goto decline_rdma;
 741
 742        /* IPSec connections opt out of SMC-R optimizations */
 743        if (using_ipsec(new_smc)) {
 744                reason_code = SMC_CLC_DECL_IPSEC;
 745                goto decline_rdma;
 746        }
 747
 748        /* PNET table look up: search active ib_device and port
 749         * within same PNETID that also contains the ethernet device
 750         * used for the internal TCP socket
 751         */
 752        smc_pnet_find_roce_resource(newclcsock->sk, &smcibdev, &ibport);
 753        if (!smcibdev) {
 754                reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
 755                goto decline_rdma;
 756        }
 757
 758        /* determine subnet and mask from internal TCP socket */
 759        rc = smc_netinfo_by_tcpsk(newclcsock, &subnet, &prefix_len);
 760        if (rc) {
 761                reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
 762                goto decline_rdma;
 763        }
 764        if ((pclc.outgoing_subnet != subnet) ||
 765            (pclc.prefix_len != prefix_len)) {
 766                reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
 767                goto decline_rdma;
 768        }
 769
 770        /* get address of the peer connected to the internal TCP socket */
 771        kernel_getpeername(newclcsock, (struct sockaddr *)&peeraddr, &len);
 772
 773        /* allocate connection / link group */
 774        mutex_lock(&smc_create_lgr_pending);
 775        local_contact = smc_conn_create(new_smc, peeraddr.sin_addr.s_addr,
 776                                        smcibdev, ibport, &pclc.lcl, 0);
 777        if (local_contact == SMC_REUSE_CONTACT)
 778                /* lock no longer needed, free it due to following
 779                 * smc_clc_wait_msg() call
 780                 */
 781                mutex_unlock(&smc_create_lgr_pending);
 782        if (local_contact < 0) {
 783                rc = local_contact;
 784                if (rc == -ENOMEM)
 785                        reason_code = SMC_CLC_DECL_MEM;/* insufficient memory*/
 786                else if (rc == -ENOLINK)
 787                        reason_code = SMC_CLC_DECL_SYNCERR; /* synchr. error */
 788                goto decline_rdma;
 789        }
 790        link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK];
 791
 792        rc = smc_sndbuf_create(new_smc);
 793        if (rc) {
 794                reason_code = SMC_CLC_DECL_MEM;
 795                goto decline_rdma;
 796        }
 797        rc = smc_rmb_create(new_smc);
 798        if (rc) {
 799                reason_code = SMC_CLC_DECL_MEM;
 800                goto decline_rdma;
 801        }
 802
 803        rc = smc_clc_send_accept(new_smc, local_contact);
 804        if (rc)
 805                goto out_err;
 806
 807        /* receive SMC Confirm CLC message */
 808        reason_code = smc_clc_wait_msg(new_smc, &cclc, sizeof(cclc),
 809                                       SMC_CLC_CONFIRM);
 810        if (reason_code < 0)
 811                goto out_err;
 812        if (reason_code > 0)
 813                goto decline_rdma;
 814        smc_conn_save_peer_info(new_smc, &cclc);
 815        if (local_contact == SMC_FIRST_CONTACT)
 816                smc_link_save_peer_info(link, &cclc);
 817
 818        rc = smc_rmb_rtoken_handling(&new_smc->conn, &cclc);
 819        if (rc) {
 820                reason_code = SMC_CLC_DECL_INTERR;
 821                goto decline_rdma;
 822        }
 823
 824        if (local_contact == SMC_FIRST_CONTACT) {
 825                rc = smc_ib_ready_link(link);
 826                if (rc) {
 827                        reason_code = SMC_CLC_DECL_INTERR;
 828                        goto decline_rdma;
 829                }
 830                /* QP confirmation over RoCE fabric */
 831                reason_code = smc_serv_conf_first_link(new_smc);
 832                if (reason_code < 0) {
 833                        /* peer is not aware of a problem */
 834                        rc = reason_code;
 835                        goto out_err;
 836                }
 837                if (reason_code > 0)
 838                        goto decline_rdma;
 839        }
 840
 841        smc_tx_init(new_smc);
 842        smc_rx_init(new_smc);
 843
 844out_connected:
 845        sk_refcnt_debug_inc(newsmcsk);
 846        if (newsmcsk->sk_state == SMC_INIT)
 847                newsmcsk->sk_state = SMC_ACTIVE;
 848enqueue:
 849        if (local_contact == SMC_FIRST_CONTACT)
 850                mutex_unlock(&smc_create_lgr_pending);
 851        lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING);
 852        if (lsmc->sk.sk_state == SMC_LISTEN) {
 853                smc_accept_enqueue(&lsmc->sk, newsmcsk);
 854        } else { /* no longer listening */
 855                smc_close_non_accepted(newsmcsk);
 856        }
 857        release_sock(&lsmc->sk);
 858
 859        /* Wake up accept */
 860        lsmc->sk.sk_data_ready(&lsmc->sk);
 861        sock_put(&lsmc->sk); /* sock_hold in smc_tcp_listen_work */
 862        return;
 863
 864decline_rdma:
 865        /* RDMA setup failed, switch back to TCP */
 866        smc_conn_free(&new_smc->conn);
 867        new_smc->use_fallback = true;
 868        if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) {
 869                rc = smc_clc_send_decline(new_smc, reason_code, 0);
 870                if (rc < sizeof(struct smc_clc_msg_decline))
 871                        goto out_err;
 872        }
 873        goto out_connected;
 874
 875out_err:
 876        newsmcsk->sk_state = SMC_CLOSED;
 877        smc_conn_free(&new_smc->conn);
 878        goto enqueue; /* queue new sock with sk_err set */
 879}
 880
 881static void smc_tcp_listen_work(struct work_struct *work)
 882{
 883        struct smc_sock *lsmc = container_of(work, struct smc_sock,
 884                                             tcp_listen_work);
 885        struct smc_sock *new_smc;
 886        int rc = 0;
 887
 888        lock_sock(&lsmc->sk);
 889        while (lsmc->sk.sk_state == SMC_LISTEN) {
 890                rc = smc_clcsock_accept(lsmc, &new_smc);
 891                if (rc)
 892                        goto out;
 893                if (!new_smc)
 894                        continue;
 895
 896                new_smc->listen_smc = lsmc;
 897                new_smc->use_fallback = false; /* assume rdma capability first*/
 898                sock_hold(&lsmc->sk); /* sock_put in smc_listen_work */
 899                INIT_WORK(&new_smc->smc_listen_work, smc_listen_work);
 900                smc_copy_sock_settings_to_smc(new_smc);
 901                schedule_work(&new_smc->smc_listen_work);
 902        }
 903
 904out:
 905        release_sock(&lsmc->sk);
 906        lsmc->sk.sk_data_ready(&lsmc->sk); /* no more listening, wake accept */
 907}
 908
 909static int smc_listen(struct socket *sock, int backlog)
 910{
 911        struct sock *sk = sock->sk;
 912        struct smc_sock *smc;
 913        int rc;
 914
 915        smc = smc_sk(sk);
 916        lock_sock(sk);
 917
 918        rc = -EINVAL;
 919        if ((sk->sk_state != SMC_INIT) && (sk->sk_state != SMC_LISTEN))
 920                goto out;
 921
 922        rc = 0;
 923        if (sk->sk_state == SMC_LISTEN) {
 924                sk->sk_max_ack_backlog = backlog;
 925                goto out;
 926        }
 927        /* some socket options are handled in core, so we could not apply
 928         * them to the clc socket -- copy smc socket options to clc socket
 929         */
 930        smc_copy_sock_settings_to_clc(smc);
 931
 932        rc = kernel_listen(smc->clcsock, backlog);
 933        if (rc)
 934                goto out;
 935        sk->sk_max_ack_backlog = backlog;
 936        sk->sk_ack_backlog = 0;
 937        sk->sk_state = SMC_LISTEN;
 938        INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
 939        schedule_work(&smc->tcp_listen_work);
 940
 941out:
 942        release_sock(sk);
 943        return rc;
 944}
 945
 946static int smc_accept(struct socket *sock, struct socket *new_sock,
 947                      int flags, bool kern)
 948{
 949        struct sock *sk = sock->sk, *nsk;
 950        DECLARE_WAITQUEUE(wait, current);
 951        struct smc_sock *lsmc;
 952        long timeo;
 953        int rc = 0;
 954
 955        lsmc = smc_sk(sk);
 956        lock_sock(sk);
 957
 958        if (lsmc->sk.sk_state != SMC_LISTEN) {
 959                rc = -EINVAL;
 960                goto out;
 961        }
 962
 963        /* Wait for an incoming connection */
 964        timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
 965        add_wait_queue_exclusive(sk_sleep(sk), &wait);
 966        while (!(nsk = smc_accept_dequeue(sk, new_sock))) {
 967                set_current_state(TASK_INTERRUPTIBLE);
 968                if (!timeo) {
 969                        rc = -EAGAIN;
 970                        break;
 971                }
 972                release_sock(sk);
 973                timeo = schedule_timeout(timeo);
 974                /* wakeup by sk_data_ready in smc_listen_work() */
 975                sched_annotate_sleep();
 976                lock_sock(sk);
 977                if (signal_pending(current)) {
 978                        rc = sock_intr_errno(timeo);
 979                        break;
 980                }
 981        }
 982        set_current_state(TASK_RUNNING);
 983        remove_wait_queue(sk_sleep(sk), &wait);
 984
 985        if (!rc)
 986                rc = sock_error(nsk);
 987
 988out:
 989        release_sock(sk);
 990        return rc;
 991}
 992
 993static int smc_getname(struct socket *sock, struct sockaddr *addr,
 994                       int *len, int peer)
 995{
 996        struct smc_sock *smc;
 997
 998        if (peer && (sock->sk->sk_state != SMC_ACTIVE) &&
 999            (sock->sk->sk_state != SMC_APPCLOSEWAIT1))
1000                return -ENOTCONN;
1001
1002        smc = smc_sk(sock->sk);
1003
1004        return smc->clcsock->ops->getname(smc->clcsock, addr, len, peer);
1005}
1006
1007static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
1008{
1009        struct sock *sk = sock->sk;
1010        struct smc_sock *smc;
1011        int rc = -EPIPE;
1012
1013        smc = smc_sk(sk);
1014        lock_sock(sk);
1015        if ((sk->sk_state != SMC_ACTIVE) &&
1016            (sk->sk_state != SMC_APPCLOSEWAIT1) &&
1017            (sk->sk_state != SMC_INIT))
1018                goto out;
1019        if (smc->use_fallback)
1020                rc = smc->clcsock->ops->sendmsg(smc->clcsock, msg, len);
1021        else
1022                rc = smc_tx_sendmsg(smc, msg, len);
1023out:
1024        release_sock(sk);
1025        return rc;
1026}
1027
1028static int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
1029                       int flags)
1030{
1031        struct sock *sk = sock->sk;
1032        struct smc_sock *smc;
1033        int rc = -ENOTCONN;
1034
1035        smc = smc_sk(sk);
1036        lock_sock(sk);
1037        if ((sk->sk_state == SMC_INIT) ||
1038            (sk->sk_state == SMC_LISTEN) ||
1039            (sk->sk_state == SMC_CLOSED))
1040                goto out;
1041
1042        if (sk->sk_state == SMC_PEERFINCLOSEWAIT) {
1043                rc = 0;
1044                goto out;
1045        }
1046
1047        if (smc->use_fallback)
1048                rc = smc->clcsock->ops->recvmsg(smc->clcsock, msg, len, flags);
1049        else
1050                rc = smc_rx_recvmsg(smc, msg, len, flags);
1051
1052out:
1053        release_sock(sk);
1054        return rc;
1055}
1056
1057static unsigned int smc_accept_poll(struct sock *parent)
1058{
1059        struct smc_sock *isk;
1060        struct sock *sk;
1061
1062        lock_sock(parent);
1063        list_for_each_entry(isk, &smc_sk(parent)->accept_q, accept_q) {
1064                sk = (struct sock *)isk;
1065
1066                if (sk->sk_state == SMC_ACTIVE) {
1067                        release_sock(parent);
1068                        return POLLIN | POLLRDNORM;
1069                }
1070        }
1071        release_sock(parent);
1072
1073        return 0;
1074}
1075
1076static unsigned int smc_poll(struct file *file, struct socket *sock,
1077                             poll_table *wait)
1078{
1079        struct sock *sk = sock->sk;
1080        unsigned int mask = 0;
1081        struct smc_sock *smc;
1082        int rc;
1083
1084        smc = smc_sk(sock->sk);
1085        if ((sk->sk_state == SMC_INIT) || smc->use_fallback) {
1086                /* delegate to CLC child sock */
1087                mask = smc->clcsock->ops->poll(file, smc->clcsock, wait);
1088                /* if non-blocking connect finished ... */
1089                lock_sock(sk);
1090                if ((sk->sk_state == SMC_INIT) && (mask & POLLOUT)) {
1091                        sk->sk_err = smc->clcsock->sk->sk_err;
1092                        if (sk->sk_err) {
1093                                mask |= POLLERR;
1094                        } else {
1095                                rc = smc_connect_rdma(smc);
1096                                if (rc < 0)
1097                                        mask |= POLLERR;
1098                                else
1099                                        /* success cases including fallback */
1100                                        mask |= POLLOUT | POLLWRNORM;
1101                        }
1102                }
1103                release_sock(sk);
1104        } else {
1105                sock_poll_wait(file, sk_sleep(sk), wait);
1106                if (sk->sk_state == SMC_LISTEN)
1107                        /* woken up by sk_data_ready in smc_listen_work() */
1108                        mask |= smc_accept_poll(sk);
1109                if (sk->sk_err)
1110                        mask |= POLLERR;
1111                if (atomic_read(&smc->conn.sndbuf_space) ||
1112                    (sk->sk_shutdown & SEND_SHUTDOWN)) {
1113                        mask |= POLLOUT | POLLWRNORM;
1114                } else {
1115                        sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1116                        set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1117                }
1118                if (atomic_read(&smc->conn.bytes_to_rcv))
1119                        mask |= POLLIN | POLLRDNORM;
1120                if ((sk->sk_shutdown == SHUTDOWN_MASK) ||
1121                    (sk->sk_state == SMC_CLOSED))
1122                        mask |= POLLHUP;
1123                if (sk->sk_shutdown & RCV_SHUTDOWN)
1124                        mask |= POLLIN | POLLRDNORM | POLLRDHUP;
1125                if (sk->sk_state == SMC_APPCLOSEWAIT1)
1126                        mask |= POLLIN;
1127
1128        }
1129
1130        return mask;
1131}
1132
1133static int smc_shutdown(struct socket *sock, int how)
1134{
1135        struct sock *sk = sock->sk;
1136        struct smc_sock *smc;
1137        int rc = -EINVAL;
1138        int rc1 = 0;
1139
1140        smc = smc_sk(sk);
1141
1142        if ((how < SHUT_RD) || (how > SHUT_RDWR))
1143                return rc;
1144
1145        lock_sock(sk);
1146
1147        rc = -ENOTCONN;
1148        if ((sk->sk_state != SMC_LISTEN) &&
1149            (sk->sk_state != SMC_ACTIVE) &&
1150            (sk->sk_state != SMC_PEERCLOSEWAIT1) &&
1151            (sk->sk_state != SMC_PEERCLOSEWAIT2) &&
1152            (sk->sk_state != SMC_APPCLOSEWAIT1) &&
1153            (sk->sk_state != SMC_APPCLOSEWAIT2) &&
1154            (sk->sk_state != SMC_APPFINCLOSEWAIT))
1155                goto out;
1156        if (smc->use_fallback) {
1157                rc = kernel_sock_shutdown(smc->clcsock, how);
1158                sk->sk_shutdown = smc->clcsock->sk->sk_shutdown;
1159                if (sk->sk_shutdown == SHUTDOWN_MASK)
1160                        sk->sk_state = SMC_CLOSED;
1161                goto out;
1162        }
1163        switch (how) {
1164        case SHUT_RDWR:         /* shutdown in both directions */
1165                rc = smc_close_active(smc);
1166                break;
1167        case SHUT_WR:
1168                rc = smc_close_shutdown_write(smc);
1169                break;
1170        case SHUT_RD:
1171                if (sk->sk_state == SMC_LISTEN)
1172                        rc = smc_close_active(smc);
1173                else
1174                        rc = 0;
1175                        /* nothing more to do because peer is not involved */
1176                break;
1177        }
1178        rc1 = kernel_sock_shutdown(smc->clcsock, how);
1179        /* map sock_shutdown_cmd constants to sk_shutdown value range */
1180        sk->sk_shutdown |= how + 1;
1181
1182out:
1183        release_sock(sk);
1184        return rc ? rc : rc1;
1185}
1186
1187static int smc_setsockopt(struct socket *sock, int level, int optname,
1188                          char __user *optval, unsigned int optlen)
1189{
1190        struct sock *sk = sock->sk;
1191        struct smc_sock *smc;
1192
1193        smc = smc_sk(sk);
1194
1195        /* generic setsockopts reaching us here always apply to the
1196         * CLC socket
1197         */
1198        return smc->clcsock->ops->setsockopt(smc->clcsock, level, optname,
1199                                             optval, optlen);
1200}
1201
1202static int smc_getsockopt(struct socket *sock, int level, int optname,
1203                          char __user *optval, int __user *optlen)
1204{
1205        struct smc_sock *smc;
1206
1207        smc = smc_sk(sock->sk);
1208        /* socket options apply to the CLC socket */
1209        return smc->clcsock->ops->getsockopt(smc->clcsock, level, optname,
1210                                             optval, optlen);
1211}
1212
1213static int smc_ioctl(struct socket *sock, unsigned int cmd,
1214                     unsigned long arg)
1215{
1216        struct smc_sock *smc;
1217
1218        smc = smc_sk(sock->sk);
1219        if (smc->use_fallback)
1220                return smc->clcsock->ops->ioctl(smc->clcsock, cmd, arg);
1221        else
1222                return sock_no_ioctl(sock, cmd, arg);
1223}
1224
1225static ssize_t smc_sendpage(struct socket *sock, struct page *page,
1226                            int offset, size_t size, int flags)
1227{
1228        struct sock *sk = sock->sk;
1229        struct smc_sock *smc;
1230        int rc = -EPIPE;
1231
1232        smc = smc_sk(sk);
1233        lock_sock(sk);
1234        if (sk->sk_state != SMC_ACTIVE)
1235                goto out;
1236        if (smc->use_fallback)
1237                rc = kernel_sendpage(smc->clcsock, page, offset,
1238                                     size, flags);
1239        else
1240                rc = sock_no_sendpage(sock, page, offset, size, flags);
1241
1242out:
1243        release_sock(sk);
1244        return rc;
1245}
1246
1247static ssize_t smc_splice_read(struct socket *sock, loff_t *ppos,
1248                               struct pipe_inode_info *pipe, size_t len,
1249                                    unsigned int flags)
1250{
1251        struct sock *sk = sock->sk;
1252        struct smc_sock *smc;
1253        int rc = -ENOTCONN;
1254
1255        smc = smc_sk(sk);
1256        lock_sock(sk);
1257        if ((sk->sk_state != SMC_ACTIVE) && (sk->sk_state != SMC_CLOSED))
1258                goto out;
1259        if (smc->use_fallback) {
1260                rc = smc->clcsock->ops->splice_read(smc->clcsock, ppos,
1261                                                    pipe, len, flags);
1262        } else {
1263                rc = -EOPNOTSUPP;
1264        }
1265out:
1266        release_sock(sk);
1267        return rc;
1268}
1269
1270/* must look like tcp */
1271static const struct proto_ops smc_sock_ops = {
1272        .family         = PF_SMC,
1273        .owner          = THIS_MODULE,
1274        .release        = smc_release,
1275        .bind           = smc_bind,
1276        .connect        = smc_connect,
1277        .socketpair     = sock_no_socketpair,
1278        .accept         = smc_accept,
1279        .getname        = smc_getname,
1280        .poll           = smc_poll,
1281        .ioctl          = smc_ioctl,
1282        .listen         = smc_listen,
1283        .shutdown       = smc_shutdown,
1284        .setsockopt     = smc_setsockopt,
1285        .getsockopt     = smc_getsockopt,
1286        .sendmsg        = smc_sendmsg,
1287        .recvmsg        = smc_recvmsg,
1288        .mmap           = sock_no_mmap,
1289        .sendpage       = smc_sendpage,
1290        .splice_read    = smc_splice_read,
1291};
1292
1293static int smc_create(struct net *net, struct socket *sock, int protocol,
1294                      int kern)
1295{
1296        struct smc_sock *smc;
1297        struct sock *sk;
1298        int rc;
1299
1300        rc = -ESOCKTNOSUPPORT;
1301        if (sock->type != SOCK_STREAM)
1302                goto out;
1303
1304        rc = -EPROTONOSUPPORT;
1305        if ((protocol != IPPROTO_IP) && (protocol != IPPROTO_TCP))
1306                goto out;
1307
1308        rc = -ENOBUFS;
1309        sock->ops = &smc_sock_ops;
1310        sk = smc_sock_alloc(net, sock);
1311        if (!sk)
1312                goto out;
1313
1314        /* create internal TCP socket for CLC handshake and fallback */
1315        smc = smc_sk(sk);
1316        smc->use_fallback = false; /* assume rdma capability first */
1317        rc = sock_create_kern(net, PF_INET, SOCK_STREAM,
1318                              IPPROTO_TCP, &smc->clcsock);
1319        if (rc)
1320                sk_common_release(sk);
1321        smc->sk.sk_sndbuf = max(smc->clcsock->sk->sk_sndbuf, SMC_BUF_MIN_SIZE);
1322        smc->sk.sk_rcvbuf = max(smc->clcsock->sk->sk_rcvbuf, SMC_BUF_MIN_SIZE);
1323
1324out:
1325        return rc;
1326}
1327
1328static const struct net_proto_family smc_sock_family_ops = {
1329        .family = PF_SMC,
1330        .owner  = THIS_MODULE,
1331        .create = smc_create,
1332};
1333
1334static int __init smc_init(void)
1335{
1336        int rc;
1337
1338        rc = smc_pnet_init();
1339        if (rc)
1340                return rc;
1341
1342        rc = smc_llc_init();
1343        if (rc) {
1344                pr_err("%s: smc_llc_init fails with %d\n", __func__, rc);
1345                goto out_pnet;
1346        }
1347
1348        rc = smc_cdc_init();
1349        if (rc) {
1350                pr_err("%s: smc_cdc_init fails with %d\n", __func__, rc);
1351                goto out_pnet;
1352        }
1353
1354        rc = proto_register(&smc_proto, 1);
1355        if (rc) {
1356                pr_err("%s: proto_register fails with %d\n", __func__, rc);
1357                goto out_pnet;
1358        }
1359
1360        rc = sock_register(&smc_sock_family_ops);
1361        if (rc) {
1362                pr_err("%s: sock_register fails with %d\n", __func__, rc);
1363                goto out_proto;
1364        }
1365        INIT_HLIST_HEAD(&smc_v4_hashinfo.ht);
1366
1367        rc = smc_ib_register_client();
1368        if (rc) {
1369                pr_err("%s: ib_register fails with %d\n", __func__, rc);
1370                goto out_sock;
1371        }
1372
1373        return 0;
1374
1375out_sock:
1376        sock_unregister(PF_SMC);
1377out_proto:
1378        proto_unregister(&smc_proto);
1379out_pnet:
1380        smc_pnet_exit();
1381        return rc;
1382}
1383
1384static void __exit smc_exit(void)
1385{
1386        struct smc_link_group *lgr, *lg;
1387        LIST_HEAD(lgr_freeing_list);
1388
1389        spin_lock_bh(&smc_lgr_list.lock);
1390        if (!list_empty(&smc_lgr_list.list))
1391                list_splice_init(&smc_lgr_list.list, &lgr_freeing_list);
1392        spin_unlock_bh(&smc_lgr_list.lock);
1393        list_for_each_entry_safe(lgr, lg, &lgr_freeing_list, list) {
1394                list_del_init(&lgr->list);
1395                smc_lgr_free(lgr); /* free link group */
1396        }
1397        smc_ib_unregister_client();
1398        sock_unregister(PF_SMC);
1399        proto_unregister(&smc_proto);
1400        smc_pnet_exit();
1401}
1402
/* Module entry and exit points: smc_init() and smc_exit() defined above. */
1403module_init(smc_init);
1404module_exit(smc_exit);
1405
/* Module metadata: author, description, license, and an alias for the
 * PF_SMC protocol family.
 */
1406MODULE_AUTHOR("Ursula Braun <ubraun@linux.vnet.ibm.com>");
1407MODULE_DESCRIPTION("smc socket address family");
1408MODULE_LICENSE("GPL");
1409MODULE_ALIAS_NETPROTO(PF_SMC);
1410