/* linux/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib-linux.c */
/*
 * GPL HEADER START
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 only,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License version 2 for more details (a copy is included
 * in the LICENSE file that accompanied this code).
 *
 * You should have received a copy of the GNU General Public License
 * version 2 along with this program; If not, see
 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
 *
 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
 * CA 95054 USA or visit www.sun.com if you need additional information or
 * have any questions.
 *
 * GPL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Use is subject to license terms.
 *
 * Copyright (c) 2011, 2012, Intel Corporation.
 */
/*
 * This file is part of Lustre, http://www.lustre.org/
 * Lustre is a trademark of Sun Microsystems, Inc.
 */
  36
  37#include "socklnd.h"
  38
  39# if defined(CONFIG_SYSCTL) && !CFS_SYSFS_MODULE_PARM
  40
  41
  42enum {
  43        SOCKLND_TIMEOUT = 1,
  44        SOCKLND_CREDITS,
  45        SOCKLND_PEER_TXCREDITS,
  46        SOCKLND_PEER_RTRCREDITS,
  47        SOCKLND_PEER_TIMEOUT,
  48        SOCKLND_NCONNDS,
  49        SOCKLND_RECONNECTS_MIN,
  50        SOCKLND_RECONNECTS_MAX,
  51        SOCKLND_EAGER_ACK,
  52        SOCKLND_ZERO_COPY,
  53        SOCKLND_TYPED,
  54        SOCKLND_BULK_MIN,
  55        SOCKLND_RX_BUFFER_SIZE,
  56        SOCKLND_TX_BUFFER_SIZE,
  57        SOCKLND_NAGLE,
  58        SOCKLND_IRQ_AFFINITY,
  59        SOCKLND_ROUND_ROBIN,
  60        SOCKLND_KEEPALIVE,
  61        SOCKLND_KEEPALIVE_IDLE,
  62        SOCKLND_KEEPALIVE_COUNT,
  63        SOCKLND_KEEPALIVE_INTVL,
  64        SOCKLND_BACKOFF_INIT,
  65        SOCKLND_BACKOFF_MAX,
  66        SOCKLND_PROTOCOL,
  67        SOCKLND_ZERO_COPY_RECV,
  68        SOCKLND_ZERO_COPY_RECV_MIN_NFRAGS
  69};
  70
  71static ctl_table_t ksocknal_ctl_table[] = {
  72        {
  73                .ctl_name = SOCKLND_TIMEOUT,
  74                .procname = "timeout",
  75                .data     = &ksocknal_tunables.ksnd_timeout,
  76                .maxlen   = sizeof (int),
  77                .mode     = 0644,
  78                .proc_handler = &proc_dointvec,
  79                .strategy = &sysctl_intvec,
  80        },
  81        {
  82                .ctl_name = SOCKLND_CREDITS,
  83                .procname = "credits",
  84                .data     = &ksocknal_tunables.ksnd_credits,
  85                .maxlen   = sizeof (int),
  86                .mode     = 0444,
  87                .proc_handler = &proc_dointvec,
  88                .strategy = &sysctl_intvec,
  89        },
  90         {
  91                .ctl_name = SOCKLND_PEER_TXCREDITS,
  92                .procname = "peer_credits",
  93                .data     = &ksocknal_tunables.ksnd_peertxcredits,
  94                .maxlen   = sizeof (int),
  95                .mode     = 0444,
  96                .proc_handler = &proc_dointvec,
  97                .strategy = &sysctl_intvec,
  98        },
  99         {
 100                .ctl_name = SOCKLND_PEER_RTRCREDITS,
 101                .procname = "peer_buffer_credits",
 102                .data     = &ksocknal_tunables.ksnd_peerrtrcredits,
 103                .maxlen   = sizeof (int),
 104                .mode     = 0444,
 105                .proc_handler = &proc_dointvec,
 106                .strategy = &sysctl_intvec,
 107        },
 108        {
 109                .ctl_name = SOCKLND_PEER_TIMEOUT,
 110                .procname = "peer_timeout",
 111                .data     = &ksocknal_tunables.ksnd_peertimeout,
 112                .maxlen   = sizeof (int),
 113                .mode     = 0444,
 114                .proc_handler = &proc_dointvec
 115                .strategy = &sysctl_intvec,
 116        },
 117        {
 118                .ctl_name = SOCKLND_NCONNDS,
 119                .procname = "nconnds",
 120                .data     = &ksocknal_tunables.ksnd_nconnds,
 121                .maxlen   = sizeof (int),
 122                .mode     = 0444,
 123                .proc_handler = &proc_dointvec,
 124                .strategy = &sysctl_intvec,
 125        },
 126        {
 127                .ctl_name = SOCKLND_RECONNECTS_MIN,
 128                .procname = "min_reconnectms",
 129                .data     = &ksocknal_tunables.ksnd_min_reconnectms,
 130                .maxlen   = sizeof (int),
 131                .mode     = 0444,
 132                .proc_handler = &proc_dointvec,
 133                .strategy = &sysctl_intvec,
 134        },
 135        {
 136                .ctl_name = SOCKLND_RECONNECTS_MAX,
 137                .procname = "max_reconnectms",
 138                .data     = &ksocknal_tunables.ksnd_max_reconnectms,
 139                .maxlen   = sizeof (int),
 140                .mode     = 0444,
 141                .proc_handler = &proc_dointvec,
 142                .strategy = &sysctl_intvec,
 143        },
 144        {
 145                .ctl_name = SOCKLND_EAGER_ACK,
 146                .procname = "eager_ack",
 147                .data     = &ksocknal_tunables.ksnd_eager_ack,
 148                .maxlen   = sizeof (int),
 149                .mode     = 0644,
 150                .proc_handler = &proc_dointvec,
 151                .strategy = &sysctl_intvec,
 152        },
 153        {
 154                .ctl_name = SOCKLND_ZERO_COPY,
 155                .procname = "zero_copy",
 156                .data     = &ksocknal_tunables.ksnd_zc_min_payload,
 157                .maxlen   = sizeof (int),
 158                .mode     = 0644,
 159                .proc_handler = &proc_dointvec,
 160                .strategy = &sysctl_intvec,
 161        },
 162        {
 163                .ctl_name = SOCKLND_ZERO_COPY_RECV,
 164                .procname = "zero_copy_recv",
 165                .data     = &ksocknal_tunables.ksnd_zc_recv,
 166                .maxlen   = sizeof (int),
 167                .mode     = 0644,
 168                .proc_handler = &proc_dointvec,
 169                .strategy = &sysctl_intvec,
 170        },
 171
 172        {
 173                .ctl_name = SOCKLND_ZERO_COPY_RECV_MIN_NFRAGS,
 174                .procname = "zero_copy_recv",
 175                .data     = &ksocknal_tunables.ksnd_zc_recv_min_nfrags,
 176                .maxlen   = sizeof (int),
 177                .mode     = 0644,
 178                .proc_handler = &proc_dointvec,
 179                .strategy = &sysctl_intvec,
 180        },
 181        {
 182                .ctl_name = SOCKLND_TYPED,
 183                .procname = "typed",
 184                .data     = &ksocknal_tunables.ksnd_typed_conns,
 185                .maxlen   = sizeof (int),
 186                .mode     = 0444,
 187                .proc_handler = &proc_dointvec,
 188                .strategy = &sysctl_intvec,
 189        },
 190        {
 191                .ctl_name = SOCKLND_BULK_MIN,
 192                .procname = "min_bulk",
 193                .data     = &ksocknal_tunables.ksnd_min_bulk,
 194                .maxlen   = sizeof (int),
 195                .mode     = 0644,
 196                .proc_handler = &proc_dointvec,
 197                .strategy = &sysctl_intvec,
 198        },
 199        {
 200                .ctl_name = SOCKLND_RX_BUFFER_SIZE,
 201                .procname = "rx_buffer_size",
 202                .data     = &ksocknal_tunables.ksnd_rx_buffer_size,
 203                .maxlen   = sizeof(int),
 204                .mode     = 0644,
 205                .proc_handler = &proc_dointvec,
 206                .strategy = &sysctl_intvec,
 207        },
 208        {
 209                .ctl_name = SOCKLND_TX_BUFFER_SIZE,
 210                .procname = "tx_buffer_size",
 211                .data     = &ksocknal_tunables.ksnd_tx_buffer_size,
 212                .maxlen   = sizeof(int),
 213                .mode     = 0644,
 214                .proc_handler = &proc_dointvec,
 215                .strategy = &sysctl_intvec,
 216        },
 217        {
 218                .ctl_name = SOCKLND_NAGLE,
 219                .procname = "nagle",
 220                .data     = &ksocknal_tunables.ksnd_nagle,
 221                .maxlen   = sizeof(int),
 222                .mode     = 0644,
 223                .proc_handler = &proc_dointvec,
 224                .strategy = &sysctl_intvec,
 225        },
 226        {
 227                .ctl_name = SOCKLND_ROUND_ROBIN,
 228                .procname = "round_robin",
 229                .data     = &ksocknal_tunables.ksnd_round_robin,
 230                .maxlen   = sizeof(int),
 231                .mode     = 0644,
 232                .proc_handler = &proc_dointvec,
 233                .strategy = &sysctl_intvec,
 234        },
 235        {
 236                .ctl_name = SOCKLND_KEEPALIVE,
 237                .procname = "keepalive",
 238                .data     = &ksocknal_tunables.ksnd_keepalive,
 239                .maxlen   = sizeof(int),
 240                .mode     = 0644,
 241                .proc_handler = &proc_dointvec,
 242                .strategy = &sysctl_intvec,
 243        },
 244        {
 245                .ctl_name = SOCKLND_KEEPALIVE_IDLE,
 246                .procname = "keepalive_idle",
 247                .data     = &ksocknal_tunables.ksnd_keepalive_idle,
 248                .maxlen   = sizeof(int),
 249                .mode     = 0644,
 250                .proc_handler = &proc_dointvec,
 251                .strategy = &sysctl_intvec,
 252        },
 253        {
 254                .ctl_name = SOCKLND_KEEPALIVE_COUNT,
 255                .procname = "keepalive_count",
 256                .data     = &ksocknal_tunables.ksnd_keepalive_count,
 257                .maxlen   = sizeof(int),
 258                .mode     = 0644,
 259                .proc_handler = &proc_dointvec,
 260                .strategy = &sysctl_intvec,
 261        },
 262        {
 263                .ctl_name = SOCKLND_KEEPALIVE_INTVL,
 264                .procname = "keepalive_intvl",
 265                .data     = &ksocknal_tunables.ksnd_keepalive_intvl,
 266                .maxlen   = sizeof(int),
 267                .mode     = 0644,
 268                .proc_handler = &proc_dointvec,
 269                .strategy = &sysctl_intvec,
 270        },
 271#if SOCKNAL_VERSION_DEBUG
 272        {
 273                .ctl_name = SOCKLND_PROTOCOL,
 274                .procname = "protocol",
 275                .data     = &ksocknal_tunables.ksnd_protocol,
 276                .maxlen   = sizeof(int),
 277                .mode     = 0644,
 278                .proc_handler = &proc_dointvec,
 279                .strategy = &sysctl_intvec,
 280        },
 281#endif
 282        {0}
 283};
 284
 285
 286ctl_table_t ksocknal_top_ctl_table[] = {
 287        {
 288                .ctl_name = CTL_SOCKLND,
 289                .procname = "socknal",
 290                .data     = NULL,
 291                .maxlen   = 0,
 292                .mode     = 0555,
 293                .child    = ksocknal_ctl_table
 294        },
 295        { 0 }
 296};
 297
 298int
 299ksocknal_lib_tunables_init ()
 300{
 301        if (!*ksocknal_tunables.ksnd_typed_conns) {
 302                int rc = -EINVAL;
 303#if SOCKNAL_VERSION_DEBUG
 304                if (*ksocknal_tunables.ksnd_protocol < 3)
 305                        rc = 0;
 306#endif
 307                if (rc != 0) {
 308                        CERROR("Protocol V3.x MUST have typed connections\n");
 309                        return rc;
 310                }
 311        }
 312
 313        if (*ksocknal_tunables.ksnd_zc_recv_min_nfrags < 2)
 314                *ksocknal_tunables.ksnd_zc_recv_min_nfrags = 2;
 315        if (*ksocknal_tunables.ksnd_zc_recv_min_nfrags > LNET_MAX_IOV)
 316                *ksocknal_tunables.ksnd_zc_recv_min_nfrags = LNET_MAX_IOV;
 317
 318        ksocknal_tunables.ksnd_sysctl =
 319                cfs_register_sysctl_table(ksocknal_top_ctl_table, 0);
 320
 321        if (ksocknal_tunables.ksnd_sysctl == NULL)
 322                CWARN("Can't setup /proc tunables\n");
 323
 324        return 0;
 325}
 326
 327void
 328ksocknal_lib_tunables_fini ()
 329{
 330        if (ksocknal_tunables.ksnd_sysctl != NULL)
 331                unregister_sysctl_table(ksocknal_tunables.ksnd_sysctl);
 332}
 333#else
 334int
 335ksocknal_lib_tunables_init ()
 336{
 337        return 0;
 338}
 339
 340void
 341ksocknal_lib_tunables_fini ()
 342{
 343}
 344#endif /* # if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM */
 345
 346int
 347ksocknal_lib_get_conn_addrs (ksock_conn_t *conn)
 348{
 349        int rc = libcfs_sock_getaddr(conn->ksnc_sock, 1,
 350                                     &conn->ksnc_ipaddr,
 351                                     &conn->ksnc_port);
 352
 353        /* Didn't need the {get,put}connsock dance to deref ksnc_sock... */
 354        LASSERT (!conn->ksnc_closing);
 355
 356        if (rc != 0) {
 357                CERROR ("Error %d getting sock peer IP\n", rc);
 358                return rc;
 359        }
 360
 361        rc = libcfs_sock_getaddr(conn->ksnc_sock, 0,
 362                                 &conn->ksnc_myipaddr, NULL);
 363        if (rc != 0) {
 364                CERROR ("Error %d getting sock local IP\n", rc);
 365                return rc;
 366        }
 367
 368        return 0;
 369}
 370
 371int
 372ksocknal_lib_zc_capable(ksock_conn_t *conn)
 373{
 374        int  caps = conn->ksnc_sock->sk->sk_route_caps;
 375
 376        if (conn->ksnc_proto == &ksocknal_protocol_v1x)
 377                return 0;
 378
 379        /* ZC if the socket supports scatter/gather and doesn't need software
 380         * checksums */
 381        return ((caps & NETIF_F_SG) != 0 && (caps & NETIF_F_ALL_CSUM) != 0);
 382}
 383
 384int
 385ksocknal_lib_send_iov (ksock_conn_t *conn, ksock_tx_t *tx)
 386{
 387        struct socket *sock = conn->ksnc_sock;
 388        int         nob;
 389        int         rc;
 390
 391        if (*ksocknal_tunables.ksnd_enable_csum && /* checksum enabled */
 392            conn->ksnc_proto == &ksocknal_protocol_v2x && /* V2.x connection  */
 393            tx->tx_nob == tx->tx_resid           && /* frist sending    */
 394            tx->tx_msg.ksm_csum == 0)                /* not checksummed  */
 395                ksocknal_lib_csum_tx(tx);
 396
 397        /* NB we can't trust socket ops to either consume our iovs
 398         * or leave them alone. */
 399
 400        {
 401#if SOCKNAL_SINGLE_FRAG_TX
 402                struct iovec    scratch;
 403                struct iovec   *scratchiov = &scratch;
 404                unsigned int    niov = 1;
 405#else
 406                struct iovec   *scratchiov = conn->ksnc_scheduler->kss_scratch_iov;
 407                unsigned int    niov = tx->tx_niov;
 408#endif
 409                struct msghdr msg = {
 410                        .msg_name       = NULL,
 411                        .msg_namelen    = 0,
 412                        .msg_iov        = scratchiov,
 413                        .msg_iovlen     = niov,
 414                        .msg_control    = NULL,
 415                        .msg_controllen = 0,
 416                        .msg_flags      = MSG_DONTWAIT
 417                };
 418                mm_segment_t oldmm = get_fs();
 419                int  i;
 420
 421                for (nob = i = 0; i < niov; i++) {
 422                        scratchiov[i] = tx->tx_iov[i];
 423                        nob += scratchiov[i].iov_len;
 424                }
 425
 426                if (!list_empty(&conn->ksnc_tx_queue) ||
 427                    nob < tx->tx_resid)
 428                        msg.msg_flags |= MSG_MORE;
 429
 430                set_fs (KERNEL_DS);
 431                rc = sock_sendmsg(sock, &msg, nob);
 432                set_fs (oldmm);
 433        }
 434        return rc;
 435}
 436
 437int
 438ksocknal_lib_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx)
 439{
 440        struct socket *sock = conn->ksnc_sock;
 441        lnet_kiov_t   *kiov = tx->tx_kiov;
 442        int         rc;
 443        int         nob;
 444
 445        /* Not NOOP message */
 446        LASSERT (tx->tx_lnetmsg != NULL);
 447
 448        /* NB we can't trust socket ops to either consume our iovs
 449         * or leave them alone. */
 450        if (tx->tx_msg.ksm_zc_cookies[0] != 0) {
 451                /* Zero copy is enabled */
 452                struct sock   *sk = sock->sk;
 453                struct page   *page = kiov->kiov_page;
 454                int         offset = kiov->kiov_offset;
 455                int         fragsize = kiov->kiov_len;
 456                int         msgflg = MSG_DONTWAIT;
 457
 458                CDEBUG(D_NET, "page %p + offset %x for %d\n",
 459                               page, offset, kiov->kiov_len);
 460
 461                if (!list_empty(&conn->ksnc_tx_queue) ||
 462                    fragsize < tx->tx_resid)
 463                        msgflg |= MSG_MORE;
 464
 465                if (sk->sk_prot->sendpage != NULL) {
 466                        rc = sk->sk_prot->sendpage(sk, page,
 467                                                   offset, fragsize, msgflg);
 468                } else {
 469                        rc = cfs_tcp_sendpage(sk, page, offset, fragsize,
 470                                              msgflg);
 471                }
 472        } else {
 473#if SOCKNAL_SINGLE_FRAG_TX || !SOCKNAL_RISK_KMAP_DEADLOCK
 474                struct iovec  scratch;
 475                struct iovec *scratchiov = &scratch;
 476                unsigned int  niov = 1;
 477#else
 478#ifdef CONFIG_HIGHMEM
 479#warning "XXX risk of kmap deadlock on multiple frags..."
 480#endif
 481                struct iovec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov;
 482                unsigned int  niov = tx->tx_nkiov;
 483#endif
 484                struct msghdr msg = {
 485                        .msg_name       = NULL,
 486                        .msg_namelen    = 0,
 487                        .msg_iov        = scratchiov,
 488                        .msg_iovlen     = niov,
 489                        .msg_control    = NULL,
 490                        .msg_controllen = 0,
 491                        .msg_flags      = MSG_DONTWAIT
 492                };
 493                mm_segment_t  oldmm = get_fs();
 494                int        i;
 495
 496                for (nob = i = 0; i < niov; i++) {
 497                        scratchiov[i].iov_base = kmap(kiov[i].kiov_page) +
 498                                                 kiov[i].kiov_offset;
 499                        nob += scratchiov[i].iov_len = kiov[i].kiov_len;
 500                }
 501
 502                if (!list_empty(&conn->ksnc_tx_queue) ||
 503                    nob < tx->tx_resid)
 504                        msg.msg_flags |= MSG_MORE;
 505
 506                set_fs (KERNEL_DS);
 507                rc = sock_sendmsg(sock, &msg, nob);
 508                set_fs (oldmm);
 509
 510                for (i = 0; i < niov; i++)
 511                        kunmap(kiov[i].kiov_page);
 512        }
 513        return rc;
 514}
 515
 516void
 517ksocknal_lib_eager_ack (ksock_conn_t *conn)
 518{
 519        int         opt = 1;
 520        mm_segment_t   oldmm = get_fs();
 521        struct socket *sock = conn->ksnc_sock;
 522
 523        /* Remind the socket to ACK eagerly.  If I don't, the socket might
 524         * think I'm about to send something it could piggy-back the ACK
 525         * on, introducing delay in completing zero-copy sends in my
 526         * peer. */
 527
 528        set_fs(KERNEL_DS);
 529        sock->ops->setsockopt (sock, SOL_TCP, TCP_QUICKACK,
 530                               (char *)&opt, sizeof (opt));
 531        set_fs(oldmm);
 532}
 533
 534int
 535ksocknal_lib_recv_iov (ksock_conn_t *conn)
 536{
 537#if SOCKNAL_SINGLE_FRAG_RX
 538        struct iovec  scratch;
 539        struct iovec *scratchiov = &scratch;
 540        unsigned int  niov = 1;
 541#else
 542        struct iovec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov;
 543        unsigned int  niov = conn->ksnc_rx_niov;
 544#endif
 545        struct iovec *iov = conn->ksnc_rx_iov;
 546        struct msghdr msg = {
 547                .msg_name       = NULL,
 548                .msg_namelen    = 0,
 549                .msg_iov        = scratchiov,
 550                .msg_iovlen     = niov,
 551                .msg_control    = NULL,
 552                .msg_controllen = 0,
 553                .msg_flags      = 0
 554        };
 555        mm_segment_t oldmm = get_fs();
 556        int       nob;
 557        int       i;
 558        int       rc;
 559        int       fragnob;
 560        int       sum;
 561        __u32   saved_csum;
 562
 563        /* NB we can't trust socket ops to either consume our iovs
 564         * or leave them alone. */
 565        LASSERT (niov > 0);
 566
 567        for (nob = i = 0; i < niov; i++) {
 568                scratchiov[i] = iov[i];
 569                nob += scratchiov[i].iov_len;
 570        }
 571        LASSERT (nob <= conn->ksnc_rx_nob_wanted);
 572
 573        set_fs (KERNEL_DS);
 574        rc = sock_recvmsg (conn->ksnc_sock, &msg, nob, MSG_DONTWAIT);
 575        /* NB this is just a boolean..........................^ */
 576        set_fs (oldmm);
 577
 578        saved_csum = 0;
 579        if (conn->ksnc_proto == &ksocknal_protocol_v2x) {
 580                saved_csum = conn->ksnc_msg.ksm_csum;
 581                conn->ksnc_msg.ksm_csum = 0;
 582        }
 583
 584        if (saved_csum != 0) {
 585                /* accumulate checksum */
 586                for (i = 0, sum = rc; sum > 0; i++, sum -= fragnob) {
 587                        LASSERT (i < niov);
 588
 589                        fragnob = iov[i].iov_len;
 590                        if (fragnob > sum)
 591                                fragnob = sum;
 592
 593                        conn->ksnc_rx_csum = ksocknal_csum(conn->ksnc_rx_csum,
 594                                                           iov[i].iov_base, fragnob);
 595                }
 596                conn->ksnc_msg.ksm_csum = saved_csum;
 597        }
 598
 599        return rc;
 600}
 601
 602static void
 603ksocknal_lib_kiov_vunmap(void *addr)
 604{
 605        if (addr == NULL)
 606                return;
 607
 608        vunmap(addr);
 609}
 610
 611static void *
 612ksocknal_lib_kiov_vmap(lnet_kiov_t *kiov, int niov,
 613                       struct iovec *iov, struct page **pages)
 614{
 615        void         *addr;
 616        int            nob;
 617        int            i;
 618
 619        if (!*ksocknal_tunables.ksnd_zc_recv || pages == NULL)
 620                return NULL;
 621
 622        LASSERT (niov <= LNET_MAX_IOV);
 623
 624        if (niov < 2 ||
 625            niov < *ksocknal_tunables.ksnd_zc_recv_min_nfrags)
 626                return NULL;
 627
 628        for (nob = i = 0; i < niov; i++) {
 629                if ((kiov[i].kiov_offset != 0 && i > 0) ||
 630                    (kiov[i].kiov_offset + kiov[i].kiov_len != PAGE_CACHE_SIZE && i < niov - 1))
 631                        return NULL;
 632
 633                pages[i] = kiov[i].kiov_page;
 634                nob += kiov[i].kiov_len;
 635        }
 636
 637        addr = vmap(pages, niov, VM_MAP, PAGE_KERNEL);
 638        if (addr == NULL)
 639                return NULL;
 640
 641        iov->iov_base = addr + kiov[0].kiov_offset;
 642        iov->iov_len = nob;
 643
 644        return addr;
 645}
 646
 647int
 648ksocknal_lib_recv_kiov (ksock_conn_t *conn)
 649{
 650#if SOCKNAL_SINGLE_FRAG_RX || !SOCKNAL_RISK_KMAP_DEADLOCK
 651        struct iovec   scratch;
 652        struct iovec  *scratchiov = &scratch;
 653        struct page  **pages      = NULL;
 654        unsigned int   niov       = 1;
 655#else
 656#ifdef CONFIG_HIGHMEM
 657#warning "XXX risk of kmap deadlock on multiple frags..."
 658#endif
 659        struct iovec  *scratchiov = conn->ksnc_scheduler->kss_scratch_iov;
 660        struct page  **pages      = conn->ksnc_scheduler->kss_rx_scratch_pgs;
 661        unsigned int   niov       = conn->ksnc_rx_nkiov;
 662#endif
 663        lnet_kiov_t   *kiov = conn->ksnc_rx_kiov;
 664        struct msghdr msg = {
 665                .msg_name       = NULL,
 666                .msg_namelen    = 0,
 667                .msg_iov        = scratchiov,
 668                .msg_control    = NULL,
 669                .msg_controllen = 0,
 670                .msg_flags      = 0
 671        };
 672        mm_segment_t oldmm = get_fs();
 673        int       nob;
 674        int       i;
 675        int       rc;
 676        void    *base;
 677        void    *addr;
 678        int       sum;
 679        int       fragnob;
 680
 681        /* NB we can't trust socket ops to either consume our iovs
 682         * or leave them alone. */
 683        if ((addr = ksocknal_lib_kiov_vmap(kiov, niov, scratchiov, pages)) != NULL) {
 684                nob = scratchiov[0].iov_len;
 685                msg.msg_iovlen = 1;
 686
 687        } else {
 688                for (nob = i = 0; i < niov; i++) {
 689                        nob += scratchiov[i].iov_len = kiov[i].kiov_len;
 690                        scratchiov[i].iov_base = kmap(kiov[i].kiov_page) +
 691                                                 kiov[i].kiov_offset;
 692                }
 693                msg.msg_iovlen = niov;
 694        }
 695
 696        LASSERT (nob <= conn->ksnc_rx_nob_wanted);
 697
 698        set_fs (KERNEL_DS);
 699        rc = sock_recvmsg (conn->ksnc_sock, &msg, nob, MSG_DONTWAIT);
 700        /* NB this is just a boolean.......................^ */
 701        set_fs (oldmm);
 702
 703        if (conn->ksnc_msg.ksm_csum != 0) {
 704                for (i = 0, sum = rc; sum > 0; i++, sum -= fragnob) {
 705                        LASSERT (i < niov);
 706
 707                        /* Dang! have to kmap again because I have nowhere to stash the
 708                         * mapped address.  But by doing it while the page is still
 709                         * mapped, the kernel just bumps the map count and returns me
 710                         * the address it stashed. */
 711                        base = kmap(kiov[i].kiov_page) + kiov[i].kiov_offset;
 712                        fragnob = kiov[i].kiov_len;
 713                        if (fragnob > sum)
 714                                fragnob = sum;
 715
 716                        conn->ksnc_rx_csum = ksocknal_csum(conn->ksnc_rx_csum,
 717                                                           base, fragnob);
 718
 719                        kunmap(kiov[i].kiov_page);
 720                }
 721        }
 722
 723        if (addr != NULL) {
 724                ksocknal_lib_kiov_vunmap(addr);
 725        } else {
 726                for (i = 0; i < niov; i++)
 727                        kunmap(kiov[i].kiov_page);
 728        }
 729
 730        return (rc);
 731}
 732
 733void
 734ksocknal_lib_csum_tx(ksock_tx_t *tx)
 735{
 736        int       i;
 737        __u32   csum;
 738        void    *base;
 739
 740        LASSERT(tx->tx_iov[0].iov_base == (void *)&tx->tx_msg);
 741        LASSERT(tx->tx_conn != NULL);
 742        LASSERT(tx->tx_conn->ksnc_proto == &ksocknal_protocol_v2x);
 743
 744        tx->tx_msg.ksm_csum = 0;
 745
 746        csum = ksocknal_csum(~0, (void *)tx->tx_iov[0].iov_base,
 747                             tx->tx_iov[0].iov_len);
 748
 749        if (tx->tx_kiov != NULL) {
 750                for (i = 0; i < tx->tx_nkiov; i++) {
 751                        base = kmap(tx->tx_kiov[i].kiov_page) +
 752                               tx->tx_kiov[i].kiov_offset;
 753
 754                        csum = ksocknal_csum(csum, base, tx->tx_kiov[i].kiov_len);
 755
 756                        kunmap(tx->tx_kiov[i].kiov_page);
 757                }
 758        } else {
 759                for (i = 1; i < tx->tx_niov; i++)
 760                        csum = ksocknal_csum(csum, tx->tx_iov[i].iov_base,
 761                                             tx->tx_iov[i].iov_len);
 762        }
 763
 764        if (*ksocknal_tunables.ksnd_inject_csum_error) {
 765                csum++;
 766                *ksocknal_tunables.ksnd_inject_csum_error = 0;
 767        }
 768
 769        tx->tx_msg.ksm_csum = csum;
 770}
 771
 772int
 773ksocknal_lib_get_conn_tunables (ksock_conn_t *conn, int *txmem, int *rxmem, int *nagle)
 774{
 775        mm_segment_t   oldmm = get_fs ();
 776        struct socket *sock = conn->ksnc_sock;
 777        int         len;
 778        int         rc;
 779
 780        rc = ksocknal_connsock_addref(conn);
 781        if (rc != 0) {
 782                LASSERT (conn->ksnc_closing);
 783                *txmem = *rxmem = *nagle = 0;
 784                return (-ESHUTDOWN);
 785        }
 786
 787        rc = libcfs_sock_getbuf(sock, txmem, rxmem);
 788        if (rc == 0) {
 789                len = sizeof(*nagle);
 790                set_fs(KERNEL_DS);
 791                rc = sock->ops->getsockopt(sock, SOL_TCP, TCP_NODELAY,
 792                                           (char *)nagle, &len);
 793                set_fs(oldmm);
 794        }
 795
 796        ksocknal_connsock_decref(conn);
 797
 798        if (rc == 0)
 799                *nagle = !*nagle;
 800        else
 801                *txmem = *rxmem = *nagle = 0;
 802
 803        return (rc);
 804}
 805
 806int
 807ksocknal_lib_setup_sock (struct socket *sock)
 808{
 809        mm_segment_t    oldmm = get_fs ();
 810        int          rc;
 811        int          option;
 812        int          keep_idle;
 813        int          keep_intvl;
 814        int          keep_count;
 815        int          do_keepalive;
 816        struct linger   linger;
 817
 818        sock->sk->sk_allocation = GFP_NOFS;
 819
 820        /* Ensure this socket aborts active sends immediately when we close
 821         * it. */
 822
 823        linger.l_onoff = 0;
 824        linger.l_linger = 0;
 825
 826        set_fs (KERNEL_DS);
 827        rc = sock_setsockopt (sock, SOL_SOCKET, SO_LINGER,
 828                              (char *)&linger, sizeof (linger));
 829        set_fs (oldmm);
 830        if (rc != 0) {
 831                CERROR ("Can't set SO_LINGER: %d\n", rc);
 832                return (rc);
 833        }
 834
 835        option = -1;
 836        set_fs (KERNEL_DS);
 837        rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_LINGER2,
 838                                    (char *)&option, sizeof (option));
 839        set_fs (oldmm);
 840        if (rc != 0) {
 841                CERROR ("Can't set SO_LINGER2: %d\n", rc);
 842                return (rc);
 843        }
 844
 845        if (!*ksocknal_tunables.ksnd_nagle) {
 846                option = 1;
 847
 848                set_fs (KERNEL_DS);
 849                rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_NODELAY,
 850                                            (char *)&option, sizeof (option));
 851                set_fs (oldmm);
 852                if (rc != 0) {
 853                        CERROR ("Can't disable nagle: %d\n", rc);
 854                        return (rc);
 855                }
 856        }
 857
 858        rc = libcfs_sock_setbuf(sock,
 859                                *ksocknal_tunables.ksnd_tx_buffer_size,
 860                                *ksocknal_tunables.ksnd_rx_buffer_size);
 861        if (rc != 0) {
 862                CERROR ("Can't set buffer tx %d, rx %d buffers: %d\n",
 863                        *ksocknal_tunables.ksnd_tx_buffer_size,
 864                        *ksocknal_tunables.ksnd_rx_buffer_size, rc);
 865                return (rc);
 866        }
 867
 868/* TCP_BACKOFF_* sockopt tunables unsupported in stock kernels */
 869
 870        /* snapshot tunables */
 871        keep_idle  = *ksocknal_tunables.ksnd_keepalive_idle;
 872        keep_count = *ksocknal_tunables.ksnd_keepalive_count;
 873        keep_intvl = *ksocknal_tunables.ksnd_keepalive_intvl;
 874
 875        do_keepalive = (keep_idle > 0 && keep_count > 0 && keep_intvl > 0);
 876
 877        option = (do_keepalive ? 1 : 0);
 878        set_fs (KERNEL_DS);
 879        rc = sock_setsockopt (sock, SOL_SOCKET, SO_KEEPALIVE,
 880                              (char *)&option, sizeof (option));
 881        set_fs (oldmm);
 882        if (rc != 0) {
 883                CERROR ("Can't set SO_KEEPALIVE: %d\n", rc);
 884                return (rc);
 885        }
 886
 887        if (!do_keepalive)
 888                return (0);
 889
 890        set_fs (KERNEL_DS);
 891        rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_KEEPIDLE,
 892                                    (char *)&keep_idle, sizeof (keep_idle));
 893        set_fs (oldmm);
 894        if (rc != 0) {
 895                CERROR ("Can't set TCP_KEEPIDLE: %d\n", rc);
 896                return (rc);
 897        }
 898
 899        set_fs (KERNEL_DS);
 900        rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_KEEPINTVL,
 901                                    (char *)&keep_intvl, sizeof (keep_intvl));
 902        set_fs (oldmm);
 903        if (rc != 0) {
 904                CERROR ("Can't set TCP_KEEPINTVL: %d\n", rc);
 905                return (rc);
 906        }
 907
 908        set_fs (KERNEL_DS);
 909        rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_KEEPCNT,
 910                                    (char *)&keep_count, sizeof (keep_count));
 911        set_fs (oldmm);
 912        if (rc != 0) {
 913                CERROR ("Can't set TCP_KEEPCNT: %d\n", rc);
 914                return (rc);
 915        }
 916
 917        return (0);
 918}
 919
 920void
 921ksocknal_lib_push_conn (ksock_conn_t *conn)
 922{
 923        struct sock    *sk;
 924        struct tcp_sock *tp;
 925        int          nonagle;
 926        int          val = 1;
 927        int          rc;
 928        mm_segment_t    oldmm;
 929
 930        rc = ksocknal_connsock_addref(conn);
 931        if (rc != 0)                        /* being shut down */
 932                return;
 933
 934        sk = conn->ksnc_sock->sk;
 935        tp = tcp_sk(sk);
 936
 937        lock_sock (sk);
 938        nonagle = tp->nonagle;
 939        tp->nonagle = 1;
 940        release_sock (sk);
 941
 942        oldmm = get_fs ();
 943        set_fs (KERNEL_DS);
 944
 945        rc = sk->sk_prot->setsockopt (sk, SOL_TCP, TCP_NODELAY,
 946                                      (char *)&val, sizeof (val));
 947        LASSERT (rc == 0);
 948
 949        set_fs (oldmm);
 950
 951        lock_sock (sk);
 952        tp->nonagle = nonagle;
 953        release_sock (sk);
 954
 955        ksocknal_connsock_decref(conn);
 956}
 957
/* Not defined in this part of the file.  ksocknal_data_ready() and
 * ksocknal_write_space() call these with ksnd_global_lock read-held once
 * they have found the connection attached to the socket. */
extern void ksocknal_read_callback (ksock_conn_t *conn);
extern void ksocknal_write_callback (ksock_conn_t *conn);
/*
 * Socket callbacks for Linux.
 */
 963static void
 964ksocknal_data_ready (struct sock *sk, int n)
 965{
 966        ksock_conn_t  *conn;
 967        ENTRY;
 968
 969        /* interleave correctly with closing sockets... */
 970        LASSERT(!in_irq());
 971        read_lock(&ksocknal_data.ksnd_global_lock);
 972
 973        conn = sk->sk_user_data;
 974        if (conn == NULL) {          /* raced with ksocknal_terminate_conn */
 975                LASSERT (sk->sk_data_ready != &ksocknal_data_ready);
 976                sk->sk_data_ready (sk, n);
 977        } else
 978                ksocknal_read_callback(conn);
 979
 980        read_unlock(&ksocknal_data.ksnd_global_lock);
 981
 982        EXIT;
 983}
 984
 985static void
 986ksocknal_write_space (struct sock *sk)
 987{
 988        ksock_conn_t  *conn;
 989        int         wspace;
 990        int         min_wpace;
 991
 992        /* interleave correctly with closing sockets... */
 993        LASSERT(!in_irq());
 994        read_lock(&ksocknal_data.ksnd_global_lock);
 995
 996        conn = sk->sk_user_data;
 997        wspace = SOCKNAL_WSPACE(sk);
 998        min_wpace = SOCKNAL_MIN_WSPACE(sk);
 999
1000        CDEBUG(D_NET, "sk %p wspace %d low water %d conn %p%s%s%s\n",
1001               sk, wspace, min_wpace, conn,
1002               (conn == NULL) ? "" : (conn->ksnc_tx_ready ?
1003                                      " ready" : " blocked"),
1004               (conn == NULL) ? "" : (conn->ksnc_tx_scheduled ?
1005                                      " scheduled" : " idle"),
1006               (conn == NULL) ? "" : (list_empty (&conn->ksnc_tx_queue) ?
1007                                      " empty" : " queued"));
1008
1009        if (conn == NULL) {          /* raced with ksocknal_terminate_conn */
1010                LASSERT (sk->sk_write_space != &ksocknal_write_space);
1011                sk->sk_write_space (sk);
1012
1013                read_unlock(&ksocknal_data.ksnd_global_lock);
1014                return;
1015        }
1016
1017        if (wspace >= min_wpace) {            /* got enough space */
1018                ksocknal_write_callback(conn);
1019
1020                /* Clear SOCK_NOSPACE _after_ ksocknal_write_callback so the
1021                 * ENOMEM check in ksocknal_transmit is race-free (think about
1022                 * it). */
1023
1024                clear_bit (SOCK_NOSPACE, &sk->sk_socket->flags);
1025        }
1026
1027        read_unlock(&ksocknal_data.ksnd_global_lock);
1028}
1029
1030void
1031ksocknal_lib_save_callback(struct socket *sock, ksock_conn_t *conn)
1032{
1033        conn->ksnc_saved_data_ready = sock->sk->sk_data_ready;
1034        conn->ksnc_saved_write_space = sock->sk->sk_write_space;
1035}
1036
1037void
1038ksocknal_lib_set_callback(struct socket *sock,  ksock_conn_t *conn)
1039{
1040        sock->sk->sk_user_data = conn;
1041        sock->sk->sk_data_ready = ksocknal_data_ready;
1042        sock->sk->sk_write_space = ksocknal_write_space;
1043        return;
1044}
1045
1046void
1047ksocknal_lib_reset_callback(struct socket *sock, ksock_conn_t *conn)
1048{
1049        /* Remove conn's network callbacks.
1050         * NB I _have_ to restore the callback, rather than storing a noop,
1051         * since the socket could survive past this module being unloaded!! */
1052        sock->sk->sk_data_ready = conn->ksnc_saved_data_ready;
1053        sock->sk->sk_write_space = conn->ksnc_saved_write_space;
1054
1055        /* A callback could be in progress already; they hold a read lock
1056         * on ksnd_global_lock (to serialise with me) and NOOP if
1057         * sk_user_data is NULL. */
1058        sock->sk->sk_user_data = NULL;
1059
1060        return ;
1061}
1062
1063int
1064ksocknal_lib_memory_pressure(ksock_conn_t *conn)
1065{
1066        int         rc = 0;
1067        ksock_sched_t *sched;
1068
1069        sched = conn->ksnc_scheduler;
1070        spin_lock_bh(&sched->kss_lock);
1071
1072        if (!SOCK_TEST_NOSPACE(conn->ksnc_sock) &&
1073            !conn->ksnc_tx_ready) {
1074                /* SOCK_NOSPACE is set when the socket fills
1075                 * and cleared in the write_space callback
1076                 * (which also sets ksnc_tx_ready).  If
1077                 * SOCK_NOSPACE and ksnc_tx_ready are BOTH
1078                 * zero, I didn't fill the socket and
1079                 * write_space won't reschedule me, so I
1080                 * return -ENOMEM to get my caller to retry
1081                 * after a timeout */
1082                rc = -ENOMEM;
1083        }
1084
1085        spin_unlock_bh(&sched->kss_lock);
1086
1087        return rc;
1088}
1089