/* linux/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib.c */
   1/*
   2 * GPL HEADER START
   3 *
   4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5 *
   6 * This program is free software; you can redistribute it and/or modify
   7 * it under the terms of the GNU General Public License version 2 only,
   8 * as published by the Free Software Foundation.
   9 *
  10 * This program is distributed in the hope that it will be useful, but
  11 * WITHOUT ANY WARRANTY; without even the implied warranty of
  12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13 * General Public License version 2 for more details (a copy is included
  14 * in the LICENSE file that accompanied this code).
  15 *
  16 * You should have received a copy of the GNU General Public License
  17 * version 2 along with this program; If not, see
  18 * http://www.gnu.org/licenses/gpl-2.0.html
  19 *
  20 * GPL HEADER END
  21 */
  22/*
  23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  24 * Use is subject to license terms.
  25 *
  26 * Copyright (c) 2011, 2012, Intel Corporation.
  27 */
  28/*
  29 * This file is part of Lustre, http://www.lustre.org/
  30 * Lustre is a trademark of Sun Microsystems, Inc.
  31 */
  32
  33#include "socklnd.h"
  34
  35int
  36ksocknal_lib_get_conn_addrs(struct ksock_conn *conn)
  37{
  38        int rc = lnet_sock_getaddr(conn->ksnc_sock, 1, &conn->ksnc_ipaddr,
  39                                   &conn->ksnc_port);
  40
  41        /* Didn't need the {get,put}connsock dance to deref ksnc_sock... */
  42        LASSERT(!conn->ksnc_closing);
  43
  44        if (rc) {
  45                CERROR("Error %d getting sock peer IP\n", rc);
  46                return rc;
  47        }
  48
  49        rc = lnet_sock_getaddr(conn->ksnc_sock, 0, &conn->ksnc_myipaddr, NULL);
  50        if (rc) {
  51                CERROR("Error %d getting sock local IP\n", rc);
  52                return rc;
  53        }
  54
  55        return 0;
  56}
  57
  58int
  59ksocknal_lib_zc_capable(struct ksock_conn *conn)
  60{
  61        int caps = conn->ksnc_sock->sk->sk_route_caps;
  62
  63        if (conn->ksnc_proto == &ksocknal_protocol_v1x)
  64                return 0;
  65
  66        /*
  67         * ZC if the socket supports scatter/gather and doesn't need software
  68         * checksums
  69         */
  70        return ((caps & NETIF_F_SG) && (caps & NETIF_F_CSUM_MASK));
  71}
  72
  73int
  74ksocknal_lib_send_iov(struct ksock_conn *conn, struct ksock_tx *tx)
  75{
  76        struct msghdr msg = {.msg_flags = MSG_DONTWAIT};
  77        struct socket *sock = conn->ksnc_sock;
  78        int nob, i;
  79
  80        if (*ksocknal_tunables.ksnd_enable_csum && /* checksum enabled */
  81            conn->ksnc_proto == &ksocknal_protocol_v2x && /* V2.x connection  */
  82            tx->tx_nob == tx->tx_resid           && /* frist sending    */
  83            !tx->tx_msg.ksm_csum)                    /* not checksummed  */
  84                ksocknal_lib_csum_tx(tx);
  85
  86        for (nob = i = 0; i < tx->tx_niov; i++)
  87                nob += tx->tx_iov[i].iov_len;
  88
  89        if (!list_empty(&conn->ksnc_tx_queue) ||
  90            nob < tx->tx_resid)
  91                msg.msg_flags |= MSG_MORE;
  92
  93        iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC,
  94                      tx->tx_iov, tx->tx_niov, nob);
  95        return sock_sendmsg(sock, &msg);
  96}
  97
  98int
  99ksocknal_lib_send_kiov(struct ksock_conn *conn, struct ksock_tx *tx)
 100{
 101        struct socket *sock = conn->ksnc_sock;
 102        lnet_kiov_t *kiov = tx->tx_kiov;
 103        int rc;
 104        int nob;
 105
 106        /* Not NOOP message */
 107        LASSERT(tx->tx_lnetmsg);
 108
 109        if (tx->tx_msg.ksm_zc_cookies[0]) {
 110                /* Zero copy is enabled */
 111                struct sock *sk = sock->sk;
 112                struct page *page = kiov->bv_page;
 113                int offset = kiov->bv_offset;
 114                int fragsize = kiov->bv_len;
 115                int msgflg = MSG_DONTWAIT;
 116
 117                CDEBUG(D_NET, "page %p + offset %x for %d\n",
 118                       page, offset, kiov->bv_len);
 119
 120                if (!list_empty(&conn->ksnc_tx_queue) ||
 121                    fragsize < tx->tx_resid)
 122                        msgflg |= MSG_MORE;
 123
 124                if (sk->sk_prot->sendpage) {
 125                        rc = sk->sk_prot->sendpage(sk, page,
 126                                                   offset, fragsize, msgflg);
 127                } else {
 128                        rc = tcp_sendpage(sk, page, offset, fragsize, msgflg);
 129                }
 130        } else {
 131                struct msghdr msg = {.msg_flags = MSG_DONTWAIT};
 132                int i;
 133
 134                for (nob = i = 0; i < tx->tx_nkiov; i++)
 135                        nob += kiov[i].bv_len;
 136
 137                if (!list_empty(&conn->ksnc_tx_queue) ||
 138                    nob < tx->tx_resid)
 139                        msg.msg_flags |= MSG_MORE;
 140
 141                iov_iter_bvec(&msg.msg_iter, WRITE | ITER_BVEC,
 142                              kiov, tx->tx_nkiov, nob);
 143                rc = sock_sendmsg(sock, &msg);
 144        }
 145        return rc;
 146}
 147
 148void
 149ksocknal_lib_eager_ack(struct ksock_conn *conn)
 150{
 151        int opt = 1;
 152        struct socket *sock = conn->ksnc_sock;
 153
 154        /*
 155         * Remind the socket to ACK eagerly.  If I don't, the socket might
 156         * think I'm about to send something it could piggy-back the ACK
 157         * on, introducing delay in completing zero-copy sends in my
 158         * peer.
 159         */
 160        kernel_setsockopt(sock, SOL_TCP, TCP_QUICKACK, (char *)&opt,
 161                          sizeof(opt));
 162}
 163
 164int
 165ksocknal_lib_recv_iov(struct ksock_conn *conn)
 166{
 167        unsigned int niov = conn->ksnc_rx_niov;
 168        struct kvec *iov = conn->ksnc_rx_iov;
 169        struct msghdr msg = {
 170                .msg_flags = 0
 171        };
 172        int nob;
 173        int i;
 174        int rc;
 175        int fragnob;
 176        int sum;
 177        __u32 saved_csum;
 178
 179        LASSERT(niov > 0);
 180
 181        for (nob = i = 0; i < niov; i++)
 182                nob += iov[i].iov_len;
 183
 184        LASSERT(nob <= conn->ksnc_rx_nob_wanted);
 185
 186        iov_iter_kvec(&msg.msg_iter, READ | ITER_KVEC, iov, niov, nob);
 187        rc = sock_recvmsg(conn->ksnc_sock, &msg, MSG_DONTWAIT);
 188
 189        saved_csum = 0;
 190        if (conn->ksnc_proto == &ksocknal_protocol_v2x) {
 191                saved_csum = conn->ksnc_msg.ksm_csum;
 192                conn->ksnc_msg.ksm_csum = 0;
 193        }
 194
 195        if (saved_csum) {
 196                /* accumulate checksum */
 197                for (i = 0, sum = rc; sum > 0; i++, sum -= fragnob) {
 198                        LASSERT(i < niov);
 199
 200                        fragnob = iov[i].iov_len;
 201                        if (fragnob > sum)
 202                                fragnob = sum;
 203
 204                        conn->ksnc_rx_csum = ksocknal_csum(conn->ksnc_rx_csum,
 205                                                           iov[i].iov_base, fragnob);
 206                }
 207                conn->ksnc_msg.ksm_csum = saved_csum;
 208        }
 209
 210        return rc;
 211}
 212
 213int
 214ksocknal_lib_recv_kiov(struct ksock_conn *conn)
 215{
 216        unsigned int niov = conn->ksnc_rx_nkiov;
 217        lnet_kiov_t   *kiov = conn->ksnc_rx_kiov;
 218        struct msghdr msg = {
 219                .msg_flags = 0
 220        };
 221        int nob;
 222        int i;
 223        int rc;
 224        void *base;
 225        int sum;
 226        int fragnob;
 227
 228        for (nob = i = 0; i < niov; i++)
 229                nob += kiov[i].bv_len;
 230
 231        LASSERT(nob <= conn->ksnc_rx_nob_wanted);
 232
 233        iov_iter_bvec(&msg.msg_iter, READ | ITER_BVEC, kiov, niov, nob);
 234        rc = sock_recvmsg(conn->ksnc_sock, &msg, MSG_DONTWAIT);
 235
 236        if (conn->ksnc_msg.ksm_csum) {
 237                for (i = 0, sum = rc; sum > 0; i++, sum -= fragnob) {
 238                        LASSERT(i < niov);
 239
 240                        base = kmap(kiov[i].bv_page) + kiov[i].bv_offset;
 241                        fragnob = kiov[i].bv_len;
 242                        if (fragnob > sum)
 243                                fragnob = sum;
 244
 245                        conn->ksnc_rx_csum = ksocknal_csum(conn->ksnc_rx_csum,
 246                                                           base, fragnob);
 247
 248                        kunmap(kiov[i].bv_page);
 249                }
 250        }
 251        return rc;
 252}
 253
 254void
 255ksocknal_lib_csum_tx(struct ksock_tx *tx)
 256{
 257        int i;
 258        __u32 csum;
 259        void *base;
 260
 261        LASSERT(tx->tx_iov[0].iov_base == &tx->tx_msg);
 262        LASSERT(tx->tx_conn);
 263        LASSERT(tx->tx_conn->ksnc_proto == &ksocknal_protocol_v2x);
 264
 265        tx->tx_msg.ksm_csum = 0;
 266
 267        csum = ksocknal_csum(~0, tx->tx_iov[0].iov_base,
 268                             tx->tx_iov[0].iov_len);
 269
 270        if (tx->tx_kiov) {
 271                for (i = 0; i < tx->tx_nkiov; i++) {
 272                        base = kmap(tx->tx_kiov[i].bv_page) +
 273                               tx->tx_kiov[i].bv_offset;
 274
 275                        csum = ksocknal_csum(csum, base, tx->tx_kiov[i].bv_len);
 276
 277                        kunmap(tx->tx_kiov[i].bv_page);
 278                }
 279        } else {
 280                for (i = 1; i < tx->tx_niov; i++)
 281                        csum = ksocknal_csum(csum, tx->tx_iov[i].iov_base,
 282                                             tx->tx_iov[i].iov_len);
 283        }
 284
 285        if (*ksocknal_tunables.ksnd_inject_csum_error) {
 286                csum++;
 287                *ksocknal_tunables.ksnd_inject_csum_error = 0;
 288        }
 289
 290        tx->tx_msg.ksm_csum = csum;
 291}
 292
 293int
 294ksocknal_lib_get_conn_tunables(struct ksock_conn *conn, int *txmem, int *rxmem, int *nagle)
 295{
 296        struct socket *sock = conn->ksnc_sock;
 297        int len;
 298        int rc;
 299
 300        rc = ksocknal_connsock_addref(conn);
 301        if (rc) {
 302                LASSERT(conn->ksnc_closing);
 303                *txmem = *rxmem = *nagle = 0;
 304                return -ESHUTDOWN;
 305        }
 306
 307        rc = lnet_sock_getbuf(sock, txmem, rxmem);
 308        if (!rc) {
 309                len = sizeof(*nagle);
 310                rc = kernel_getsockopt(sock, SOL_TCP, TCP_NODELAY,
 311                                       (char *)nagle, &len);
 312        }
 313
 314        ksocknal_connsock_decref(conn);
 315
 316        if (!rc)
 317                *nagle = !*nagle;
 318        else
 319                *txmem = *rxmem = *nagle = 0;
 320
 321        return rc;
 322}
 323
 324int
 325ksocknal_lib_setup_sock(struct socket *sock)
 326{
 327        int rc;
 328        int option;
 329        int keep_idle;
 330        int keep_intvl;
 331        int keep_count;
 332        int do_keepalive;
 333        struct linger linger;
 334
 335        sock->sk->sk_allocation = GFP_NOFS;
 336
 337        /*
 338         * Ensure this socket aborts active sends immediately when we close
 339         * it.
 340         */
 341        linger.l_onoff = 0;
 342        linger.l_linger = 0;
 343
 344        rc = kernel_setsockopt(sock, SOL_SOCKET, SO_LINGER, (char *)&linger,
 345                               sizeof(linger));
 346        if (rc) {
 347                CERROR("Can't set SO_LINGER: %d\n", rc);
 348                return rc;
 349        }
 350
 351        option = -1;
 352        rc = kernel_setsockopt(sock, SOL_TCP, TCP_LINGER2, (char *)&option,
 353                               sizeof(option));
 354        if (rc) {
 355                CERROR("Can't set SO_LINGER2: %d\n", rc);
 356                return rc;
 357        }
 358
 359        if (!*ksocknal_tunables.ksnd_nagle) {
 360                option = 1;
 361
 362                rc = kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY,
 363                                       (char *)&option, sizeof(option));
 364                if (rc) {
 365                        CERROR("Can't disable nagle: %d\n", rc);
 366                        return rc;
 367                }
 368        }
 369
 370        rc = lnet_sock_setbuf(sock, *ksocknal_tunables.ksnd_tx_buffer_size,
 371                              *ksocknal_tunables.ksnd_rx_buffer_size);
 372        if (rc) {
 373                CERROR("Can't set buffer tx %d, rx %d buffers: %d\n",
 374                       *ksocknal_tunables.ksnd_tx_buffer_size,
 375                       *ksocknal_tunables.ksnd_rx_buffer_size, rc);
 376                return rc;
 377        }
 378
 379/* TCP_BACKOFF_* sockopt tunables unsupported in stock kernels */
 380
 381        /* snapshot tunables */
 382        keep_idle  = *ksocknal_tunables.ksnd_keepalive_idle;
 383        keep_count = *ksocknal_tunables.ksnd_keepalive_count;
 384        keep_intvl = *ksocknal_tunables.ksnd_keepalive_intvl;
 385
 386        do_keepalive = (keep_idle > 0 && keep_count > 0 && keep_intvl > 0);
 387
 388        option = (do_keepalive ? 1 : 0);
 389        rc = kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, (char *)&option,
 390                               sizeof(option));
 391        if (rc) {
 392                CERROR("Can't set SO_KEEPALIVE: %d\n", rc);
 393                return rc;
 394        }
 395
 396        if (!do_keepalive)
 397                return 0;
 398
 399        rc = kernel_setsockopt(sock, SOL_TCP, TCP_KEEPIDLE, (char *)&keep_idle,
 400                               sizeof(keep_idle));
 401        if (rc) {
 402                CERROR("Can't set TCP_KEEPIDLE: %d\n", rc);
 403                return rc;
 404        }
 405
 406        rc = kernel_setsockopt(sock, SOL_TCP, TCP_KEEPINTVL,
 407                               (char *)&keep_intvl, sizeof(keep_intvl));
 408        if (rc) {
 409                CERROR("Can't set TCP_KEEPINTVL: %d\n", rc);
 410                return rc;
 411        }
 412
 413        rc = kernel_setsockopt(sock, SOL_TCP, TCP_KEEPCNT, (char *)&keep_count,
 414                               sizeof(keep_count));
 415        if (rc) {
 416                CERROR("Can't set TCP_KEEPCNT: %d\n", rc);
 417                return rc;
 418        }
 419
 420        return 0;
 421}
 422
 423void
 424ksocknal_lib_push_conn(struct ksock_conn *conn)
 425{
 426        struct sock *sk;
 427        struct tcp_sock *tp;
 428        int nonagle;
 429        int val = 1;
 430        int rc;
 431
 432        rc = ksocknal_connsock_addref(conn);
 433        if (rc)                     /* being shut down */
 434                return;
 435
 436        sk = conn->ksnc_sock->sk;
 437        tp = tcp_sk(sk);
 438
 439        lock_sock(sk);
 440        nonagle = tp->nonagle;
 441        tp->nonagle = 1;
 442        release_sock(sk);
 443
 444        rc = kernel_setsockopt(conn->ksnc_sock, SOL_TCP, TCP_NODELAY,
 445                               (char *)&val, sizeof(val));
 446        LASSERT(!rc);
 447
 448        lock_sock(sk);
 449        tp->nonagle = nonagle;
 450        release_sock(sk);
 451
 452        ksocknal_connsock_decref(conn);
 453}
 454
 455/*
 456 * socket call back in Linux
 457 */
 458static void
 459ksocknal_data_ready(struct sock *sk)
 460{
 461        struct ksock_conn *conn;
 462
 463        /* interleave correctly with closing sockets... */
 464        LASSERT(!in_irq());
 465        read_lock(&ksocknal_data.ksnd_global_lock);
 466
 467        conn = sk->sk_user_data;
 468        if (!conn) {         /* raced with ksocknal_terminate_conn */
 469                LASSERT(sk->sk_data_ready != &ksocknal_data_ready);
 470                sk->sk_data_ready(sk);
 471        } else {
 472                ksocknal_read_callback(conn);
 473        }
 474
 475        read_unlock(&ksocknal_data.ksnd_global_lock);
 476}
 477
 478static void
 479ksocknal_write_space(struct sock *sk)
 480{
 481        struct ksock_conn *conn;
 482        int wspace;
 483        int min_wpace;
 484
 485        /* interleave correctly with closing sockets... */
 486        LASSERT(!in_irq());
 487        read_lock(&ksocknal_data.ksnd_global_lock);
 488
 489        conn = sk->sk_user_data;
 490        wspace = sk_stream_wspace(sk);
 491        min_wpace = sk_stream_min_wspace(sk);
 492
 493        CDEBUG(D_NET, "sk %p wspace %d low water %d conn %p%s%s%s\n",
 494               sk, wspace, min_wpace, conn,
 495               !conn ? "" : (conn->ksnc_tx_ready ?
 496                                      " ready" : " blocked"),
 497               !conn ? "" : (conn->ksnc_tx_scheduled ?
 498                                      " scheduled" : " idle"),
 499               !conn ? "" : (list_empty(&conn->ksnc_tx_queue) ?
 500                                      " empty" : " queued"));
 501
 502        if (!conn) {         /* raced with ksocknal_terminate_conn */
 503                LASSERT(sk->sk_write_space != &ksocknal_write_space);
 504                sk->sk_write_space(sk);
 505
 506                read_unlock(&ksocknal_data.ksnd_global_lock);
 507                return;
 508        }
 509
 510        if (wspace >= min_wpace) {            /* got enough space */
 511                ksocknal_write_callback(conn);
 512
 513                /*
 514                 * Clear SOCK_NOSPACE _after_ ksocknal_write_callback so the
 515                 * ENOMEM check in ksocknal_transmit is race-free (think about
 516                 * it).
 517                 */
 518                clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
 519        }
 520
 521        read_unlock(&ksocknal_data.ksnd_global_lock);
 522}
 523
 524void
 525ksocknal_lib_save_callback(struct socket *sock, struct ksock_conn *conn)
 526{
 527        conn->ksnc_saved_data_ready = sock->sk->sk_data_ready;
 528        conn->ksnc_saved_write_space = sock->sk->sk_write_space;
 529}
 530
 531void
 532ksocknal_lib_set_callback(struct socket *sock,  struct ksock_conn *conn)
 533{
 534        sock->sk->sk_user_data = conn;
 535        sock->sk->sk_data_ready = ksocknal_data_ready;
 536        sock->sk->sk_write_space = ksocknal_write_space;
 537}
 538
 539void
 540ksocknal_lib_reset_callback(struct socket *sock, struct ksock_conn *conn)
 541{
 542        /*
 543         * Remove conn's network callbacks.
 544         * NB I _have_ to restore the callback, rather than storing a noop,
 545         * since the socket could survive past this module being unloaded!!
 546         */
 547        sock->sk->sk_data_ready = conn->ksnc_saved_data_ready;
 548        sock->sk->sk_write_space = conn->ksnc_saved_write_space;
 549
 550        /*
 551         * A callback could be in progress already; they hold a read lock
 552         * on ksnd_global_lock (to serialise with me) and NOOP if
 553         * sk_user_data is NULL.
 554         */
 555        sock->sk->sk_user_data = NULL;
 556}
 557
 558int
 559ksocknal_lib_memory_pressure(struct ksock_conn *conn)
 560{
 561        int rc = 0;
 562        struct ksock_sched *sched;
 563
 564        sched = conn->ksnc_scheduler;
 565        spin_lock_bh(&sched->kss_lock);
 566
 567        if (!test_bit(SOCK_NOSPACE, &conn->ksnc_sock->flags) &&
 568            !conn->ksnc_tx_ready) {
 569                /*
 570                 * SOCK_NOSPACE is set when the socket fills
 571                 * and cleared in the write_space callback
 572                 * (which also sets ksnc_tx_ready).  If
 573                 * SOCK_NOSPACE and ksnc_tx_ready are BOTH
 574                 * zero, I didn't fill the socket and
 575                 * write_space won't reschedule me, so I
 576                 * return -ENOMEM to get my caller to retry
 577                 * after a timeout
 578                 */
 579                rc = -ENOMEM;
 580        }
 581
 582        spin_unlock_bh(&sched->kss_lock);
 583
 584        return rc;
 585}
 586