linux/drivers/infiniband/ulp/ipoib/ipoib_ib.c
   1/*
   2 * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
   3 * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
   4 * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
   5 * Copyright (c) 2004, 2005 Voltaire, Inc. All rights reserved.
   6 *
   7 * This software is available to you under a choice of one of two
   8 * licenses.  You may choose to be licensed under the terms of the GNU
   9 * General Public License (GPL) Version 2, available from the file
  10 * COPYING in the main directory of this source tree, or the
  11 * OpenIB.org BSD license below:
  12 *
  13 *     Redistribution and use in source and binary forms, with or
  14 *     without modification, are permitted provided that the following
  15 *     conditions are met:
  16 *
  17 *      - Redistributions of source code must retain the above
  18 *        copyright notice, this list of conditions and the following
  19 *        disclaimer.
  20 *
  21 *      - Redistributions in binary form must reproduce the above
  22 *        copyright notice, this list of conditions and the following
  23 *        disclaimer in the documentation and/or other materials
  24 *        provided with the distribution.
  25 *
  26 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  27 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  28 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  29 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  30 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  31 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  32 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  33 * SOFTWARE.
  34 */
  35
  36#include <linux/delay.h>
  37#include <linux/moduleparam.h>
  38#include <linux/dma-mapping.h>
  39#include <linux/slab.h>
  40
  41#include <linux/ip.h>
  42#include <linux/tcp.h>
  43
  44#include "ipoib.h"
  45
  46#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG_DATA
  47static int data_debug_level;
  48
  49module_param(data_debug_level, int, 0644);
  50MODULE_PARM_DESC(data_debug_level,
  51                 "Enable data path debug tracing if > 0");
  52#endif
  53
  54static DEFINE_MUTEX(pkey_mutex);
  55
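     /*
      * Allocate an ipoib_ah wrapper and the underlying IB address handle.
      * If ib_create_ah() fails, the wrapper is freed and the ERR_PTR is
      * returned (cast to struct ipoib_ah *), so callers must check IS_ERR().
      */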
  56struct ipoib_ah *ipoib_create_ah(struct net_device *dev,
  57                                 struct ib_pd *pd, struct ib_ah_attr *attr)
  58{
  59        struct ipoib_ah *ah;
  60        struct ib_ah *vah;
  61
  62        ah = kmalloc(sizeof *ah, GFP_KERNEL);
  63        if (!ah)
  64                return ERR_PTR(-ENOMEM);
  65
  66        ah->dev       = dev;
  67        ah->last_send = 0;
  68        kref_init(&ah->ref);
  69
  70        vah = ib_create_ah(pd, attr);
  71        if (IS_ERR(vah)) {
  72                kfree(ah);
  73                ah = (struct ipoib_ah *)vah;
  74        } else {
  75                ah->ah = vah;
  76                ipoib_dbg(netdev_priv(dev), "Created ah %p\n", ah->ah);
  77        }
  78
  79        return ah;
  80}
  81
  82void ipoib_free_ah(struct kref *kref)
  83{
  84        struct ipoib_ah *ah = container_of(kref, struct ipoib_ah, ref);
  85        struct ipoib_dev_priv *priv = netdev_priv(ah->dev);
  86
  87        unsigned long flags;
  88
  89        spin_lock_irqsave(&priv->lock, flags);
  90        list_add_tail(&ah->list, &priv->dead_ahs);
  91        spin_unlock_irqrestore(&priv->lock, flags);
  92}
  93
  94static void ipoib_ud_dma_unmap_rx(struct ipoib_dev_priv *priv,
  95                                  u64 mapping[IPOIB_UD_RX_SG])
  96{
  97        ib_dma_unmap_single(priv->ca, mapping[0],
  98                            IPOIB_UD_BUF_SIZE(priv->max_ib_mtu),
  99                            DMA_FROM_DEVICE);
 100}
 101
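     /*
      * Post a single receive work request for ring slot @id.  On failure
      * the buffer is unmapped and its skb freed, leaving the slot empty.
      */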
 102static int ipoib_ib_post_receive(struct net_device *dev, int id)
 103{
 104        struct ipoib_dev_priv *priv = netdev_priv(dev);
 105        struct ib_recv_wr *bad_wr;
 106        int ret;
 107
 108        priv->rx_wr.wr_id   = id | IPOIB_OP_RECV;
 109        priv->rx_sge[0].addr = priv->rx_ring[id].mapping[0];
 110        priv->rx_sge[1].addr = priv->rx_ring[id].mapping[1];
 111
 112
 113        ret = ib_post_recv(priv->qp, &priv->rx_wr, &bad_wr);
 114        if (unlikely(ret)) {
 115                ipoib_warn(priv, "receive failed for buf %d (%d)\n", id, ret);
 116                ipoib_ud_dma_unmap_rx(priv, priv->rx_ring[id].mapping);
 117                dev_kfree_skb_any(priv->rx_ring[id].skb);
 118                priv->rx_ring[id].skb = NULL;
 119        }
 120
 121        return ret;
 122}
 123
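     /*
      * Allocate and DMA-map a fresh receive skb for ring slot @id.  Returns
      * the skb (also stored in priv->rx_ring[id].skb) or NULL on allocation
      * or mapping failure.
      */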
 124static struct sk_buff *ipoib_alloc_rx_skb(struct net_device *dev, int id)
 125{
 126        struct ipoib_dev_priv *priv = netdev_priv(dev);
 127        struct sk_buff *skb;
 128        int buf_size;
 129        u64 *mapping;
 130
 131        buf_size = IPOIB_UD_BUF_SIZE(priv->max_ib_mtu);
 132
 133        skb = dev_alloc_skb(buf_size + IPOIB_ENCAP_LEN);
 134        if (unlikely(!skb))
 135                return NULL;
 136
 137        /*
 138         * IB will leave a 40 byte gap for a GRH and IPoIB adds a 4 byte
 139         * header.  So we need 4 more bytes to get to 48 and align the
 140         * IP header to a multiple of 16.
 141         */
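             /* 4 (reserved here) + 40 (GRH) + 4 (IPoIB encap) = 48, a multiple of 16 */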
 142        skb_reserve(skb, 4);
 143
 144        mapping = priv->rx_ring[id].mapping;
 145        mapping[0] = ib_dma_map_single(priv->ca, skb->data, buf_size,
 146                                       DMA_FROM_DEVICE);
 147        if (unlikely(ib_dma_mapping_error(priv->ca, mapping[0])))
 148                goto error;
 149
 150        priv->rx_ring[id].skb = skb;
 151        return skb;
 152error:
 153        dev_kfree_skb_any(skb);
 154        return NULL;
 155}
 156
 157static int ipoib_ib_post_receives(struct net_device *dev)
 158{
 159        struct ipoib_dev_priv *priv = netdev_priv(dev);
 160        int i;
 161
 162        for (i = 0; i < ipoib_recvq_size; ++i) {
 163                if (!ipoib_alloc_rx_skb(dev, i)) {
 164                        ipoib_warn(priv, "failed to allocate receive buffer %d\n", i);
 165                        return -ENOMEM;
 166                }
 167                if (ipoib_ib_post_receive(dev, i)) {
 168                        ipoib_warn(priv, "ipoib_ib_post_receive failed for buf %d\n", i);
 169                        return -EIO;
 170                }
 171        }
 172
 173        return 0;
 174}
 175
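     /*
      * Handle a UD receive completion: drop our own multicast copies,
      * replace the ring buffer with a fresh skb, set the packet type from
      * the GRH dgid, hand the skb to GRO and repost the ring slot.
      */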
 176static void ipoib_ib_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
 177{
 178        struct ipoib_dev_priv *priv = netdev_priv(dev);
 179        unsigned int wr_id = wc->wr_id & ~IPOIB_OP_RECV;
 180        struct sk_buff *skb;
 181        u64 mapping[IPOIB_UD_RX_SG];
 182        union ib_gid *dgid;
 183
 184        ipoib_dbg_data(priv, "recv completion: id %d, status: %d\n",
 185                       wr_id, wc->status);
 186
 187        if (unlikely(wr_id >= ipoib_recvq_size)) {
 188                ipoib_warn(priv, "recv completion event with wrid %d (> %d)\n",
 189                           wr_id, ipoib_recvq_size);
 190                return;
 191        }
 192
 193        skb  = priv->rx_ring[wr_id].skb;
 194
 195        if (unlikely(wc->status != IB_WC_SUCCESS)) {
 196                if (wc->status != IB_WC_WR_FLUSH_ERR)
 197                        ipoib_warn(priv, "failed recv event "
 198                                   "(status=%d, wrid=%d vend_err %x)\n",
 199                                   wc->status, wr_id, wc->vendor_err);
 200                ipoib_ud_dma_unmap_rx(priv, priv->rx_ring[wr_id].mapping);
 201                dev_kfree_skb_any(skb);
 202                priv->rx_ring[wr_id].skb = NULL;
 203                return;
 204        }
 205
 206        /*
  207         * Drop packets that this interface sent, i.e. multicast packets
 208         * that the HCA has replicated.
 209         */
 210        if (wc->slid == priv->local_lid && wc->src_qp == priv->qp->qp_num)
 211                goto repost;
 212
 213        memcpy(mapping, priv->rx_ring[wr_id].mapping,
 214               IPOIB_UD_RX_SG * sizeof *mapping);
 215
 216        /*
 217         * If we can't allocate a new RX buffer, dump
 218         * this packet and reuse the old buffer.
 219         */
 220        if (unlikely(!ipoib_alloc_rx_skb(dev, wr_id))) {
 221                ++dev->stats.rx_dropped;
 222                goto repost;
 223        }
 224
 225        ipoib_dbg_data(priv, "received %d bytes, SLID 0x%04x\n",
 226                       wc->byte_len, wc->slid);
 227
 228        ipoib_ud_dma_unmap_rx(priv, mapping);
 229
 230        skb_put(skb, wc->byte_len);
 231
 232        /* First byte of dgid signals multicast when 0xff */
 233        dgid = &((struct ib_grh *)skb->data)->dgid;
 234
 235        if (!(wc->wc_flags & IB_WC_GRH) || dgid->raw[0] != 0xff)
 236                skb->pkt_type = PACKET_HOST;
 237        else if (memcmp(dgid, dev->broadcast + 4, sizeof(union ib_gid)) == 0)
 238                skb->pkt_type = PACKET_BROADCAST;
 239        else
 240                skb->pkt_type = PACKET_MULTICAST;
 241
 242        skb_pull(skb, IB_GRH_BYTES);
 243
 244        skb->protocol = ((struct ipoib_header *) skb->data)->proto;
 245        skb_reset_mac_header(skb);
 246        skb_pull(skb, IPOIB_ENCAP_LEN);
 247
 248        skb->truesize = SKB_TRUESIZE(skb->len);
 249
 250        ++dev->stats.rx_packets;
 251        dev->stats.rx_bytes += skb->len;
 252
 253        skb->dev = dev;
 254        if ((dev->features & NETIF_F_RXCSUM) &&
 255                        likely(wc->wc_flags & IB_WC_IP_CSUM_OK))
 256                skb->ip_summed = CHECKSUM_UNNECESSARY;
 257
 258        napi_gro_receive(&priv->napi, skb);
 259
 260repost:
 261        if (unlikely(ipoib_ib_post_receive(dev, wr_id)))
 262                ipoib_warn(priv, "ipoib_ib_post_receive failed "
 263                           "for buf %d\n", wr_id);
 264}
 265
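     /*
      * DMA-map a send skb: the linear head (if any) plus every page
      * fragment.  On partial failure all mappings made so far are unwound
      * and -EIO is returned.
      */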
 266int ipoib_dma_map_tx(struct ib_device *ca, struct ipoib_tx_buf *tx_req)
 267{
 268        struct sk_buff *skb = tx_req->skb;
 269        u64 *mapping = tx_req->mapping;
 270        int i;
 271        int off;
 272
 273        if (skb_headlen(skb)) {
 274                mapping[0] = ib_dma_map_single(ca, skb->data, skb_headlen(skb),
 275                                               DMA_TO_DEVICE);
 276                if (unlikely(ib_dma_mapping_error(ca, mapping[0])))
 277                        return -EIO;
 278
 279                off = 1;
 280        } else
 281                off = 0;
 282
 283        for (i = 0; i < skb_shinfo(skb)->nr_frags; ++i) {
 284                const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
 285                mapping[i + off] = ib_dma_map_page(ca,
 286                                                 skb_frag_page(frag),
 287                                                 frag->page_offset, skb_frag_size(frag),
 288                                                 DMA_TO_DEVICE);
 289                if (unlikely(ib_dma_mapping_error(ca, mapping[i + off])))
 290                        goto partial_error;
 291        }
 292        return 0;
 293
 294partial_error:
 295        for (; i > 0; --i) {
 296                const skb_frag_t *frag = &skb_shinfo(skb)->frags[i - 1];
 297
 298                ib_dma_unmap_page(ca, mapping[i - !off], skb_frag_size(frag), DMA_TO_DEVICE);
 299        }
 300
 301        if (off)
 302                ib_dma_unmap_single(ca, mapping[0], skb_headlen(skb), DMA_TO_DEVICE);
 303
 304        return -EIO;
 305}
 306
 307void ipoib_dma_unmap_tx(struct ipoib_dev_priv *priv,
 308                        struct ipoib_tx_buf *tx_req)
 309{
 310        struct sk_buff *skb = tx_req->skb;
 311        u64 *mapping = tx_req->mapping;
 312        int i;
 313        int off;
 314
 315        if (skb_headlen(skb)) {
 316                ib_dma_unmap_single(priv->ca, mapping[0], skb_headlen(skb),
 317                                    DMA_TO_DEVICE);
 318                off = 1;
 319        } else
 320                off = 0;
 321
 322        for (i = 0; i < skb_shinfo(skb)->nr_frags; ++i) {
 323                const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
 324
 325                ib_dma_unmap_page(priv->ca, mapping[i + off],
 326                                  skb_frag_size(frag), DMA_TO_DEVICE);
 327        }
 328}
 329
  330/*
  331 * As a result of a completion error, the QP can be transferred to the SQE
  332 * state.  This function checks whether the (send) QP is in the SQE state
  333 * and, if so, moves it back to the RTS state to make it functional again.
  334 */
 335static void ipoib_qp_state_validate_work(struct work_struct *work)
 336{
 337        struct ipoib_qp_state_validate *qp_work =
 338                container_of(work, struct ipoib_qp_state_validate, work);
 339
 340        struct ipoib_dev_priv *priv = qp_work->priv;
 341        struct ib_qp_attr qp_attr;
 342        struct ib_qp_init_attr query_init_attr;
 343        int ret;
 344
 345        ret = ib_query_qp(priv->qp, &qp_attr, IB_QP_STATE, &query_init_attr);
 346        if (ret) {
 347                ipoib_warn(priv, "%s: Failed to query QP ret: %d\n",
 348                           __func__, ret);
 349                goto free_res;
 350        }
 351        pr_info("%s: QP: 0x%x is in state: %d\n",
 352                __func__, priv->qp->qp_num, qp_attr.qp_state);
 353
  354        /* currently we only support the SQE->RTS transition */
 355        if (qp_attr.qp_state == IB_QPS_SQE) {
 356                qp_attr.qp_state = IB_QPS_RTS;
 357
 358                ret = ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE);
 359                if (ret) {
 360                        pr_warn("failed(%d) modify QP:0x%x SQE->RTS\n",
 361                                ret, priv->qp->qp_num);
 362                        goto free_res;
 363                }
 364                pr_info("%s: QP: 0x%x moved from IB_QPS_SQE to IB_QPS_RTS\n",
 365                        __func__, priv->qp->qp_num);
 366        } else {
 367                pr_warn("QP (%d) will stay in state: %d\n",
 368                        priv->qp->qp_num, qp_attr.qp_state);
 369        }
 370
 371free_res:
 372        kfree(qp_work);
 373}
 374
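     /*
      * Handle a UD send completion: unmap the buffers, update stats, free
      * the skb, wake the netdev queue once outstanding sends drop to half
      * the ring size, and on a hard error queue work to move the QP back
      * from SQE to RTS.
      */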
 375static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
 376{
 377        struct ipoib_dev_priv *priv = netdev_priv(dev);
 378        unsigned int wr_id = wc->wr_id;
 379        struct ipoib_tx_buf *tx_req;
 380
 381        ipoib_dbg_data(priv, "send completion: id %d, status: %d\n",
 382                       wr_id, wc->status);
 383
 384        if (unlikely(wr_id >= ipoib_sendq_size)) {
 385                ipoib_warn(priv, "send completion event with wrid %d (> %d)\n",
 386                           wr_id, ipoib_sendq_size);
 387                return;
 388        }
 389
 390        tx_req = &priv->tx_ring[wr_id];
 391
 392        ipoib_dma_unmap_tx(priv, tx_req);
 393
 394        ++dev->stats.tx_packets;
 395        dev->stats.tx_bytes += tx_req->skb->len;
 396
 397        dev_kfree_skb_any(tx_req->skb);
 398
 399        ++priv->tx_tail;
 400        if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) &&
 401            netif_queue_stopped(dev) &&
 402            test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
 403                netif_wake_queue(dev);
 404
 405        if (wc->status != IB_WC_SUCCESS &&
 406            wc->status != IB_WC_WR_FLUSH_ERR) {
 407                struct ipoib_qp_state_validate *qp_work;
 408                ipoib_warn(priv, "failed send event "
 409                           "(status=%d, wrid=%d vend_err %x)\n",
 410                           wc->status, wr_id, wc->vendor_err);
 411                qp_work = kzalloc(sizeof(*qp_work), GFP_ATOMIC);
 412                if (!qp_work) {
 413                        ipoib_warn(priv, "%s Failed alloc ipoib_qp_state_validate for qp: 0x%x\n",
 414                                   __func__, priv->qp->qp_num);
 415                        return;
 416                }
 417
 418                INIT_WORK(&qp_work->work, ipoib_qp_state_validate_work);
 419                qp_work->priv = priv;
 420                queue_work(priv->wq, &qp_work->work);
 421        }
 422}
 423
 424static int poll_tx(struct ipoib_dev_priv *priv)
 425{
 426        int n, i;
 427
 428        n = ib_poll_cq(priv->send_cq, MAX_SEND_CQE, priv->send_wc);
 429        for (i = 0; i < n; ++i)
 430                ipoib_ib_handle_tx_wc(priv->dev, priv->send_wc + i);
 431
 432        return n == MAX_SEND_CQE;
 433}
 434
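     /*
      * NAPI poll: drain the receive CQ in batches of up to IPOIB_NUM_WC,
      * dispatching each completion to the UD or connected-mode handler.
      * When the budget is not exhausted, complete NAPI and re-arm the CQ;
      * if completions were missed while re-arming, poll again.
      */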
 435int ipoib_poll(struct napi_struct *napi, int budget)
 436{
 437        struct ipoib_dev_priv *priv = container_of(napi, struct ipoib_dev_priv, napi);
 438        struct net_device *dev = priv->dev;
 439        int done;
 440        int t;
 441        int n, i;
 442
 443        done  = 0;
 444
 445poll_more:
 446        while (done < budget) {
 447                int max = (budget - done);
 448
 449                t = min(IPOIB_NUM_WC, max);
 450                n = ib_poll_cq(priv->recv_cq, t, priv->ibwc);
 451
 452                for (i = 0; i < n; i++) {
 453                        struct ib_wc *wc = priv->ibwc + i;
 454
 455                        if (wc->wr_id & IPOIB_OP_RECV) {
 456                                ++done;
 457                                if (wc->wr_id & IPOIB_OP_CM)
 458                                        ipoib_cm_handle_rx_wc(dev, wc);
 459                                else
 460                                        ipoib_ib_handle_rx_wc(dev, wc);
 461                        } else
 462                                ipoib_cm_handle_tx_wc(priv->dev, wc);
 463                }
 464
 465                if (n != t)
 466                        break;
 467        }
 468
 469        if (done < budget) {
 470                napi_complete(napi);
 471                if (unlikely(ib_req_notify_cq(priv->recv_cq,
 472                                              IB_CQ_NEXT_COMP |
 473                                              IB_CQ_REPORT_MISSED_EVENTS)) &&
 474                    napi_reschedule(napi))
 475                        goto poll_more;
 476        }
 477
 478        return done;
 479}
 480
 481void ipoib_ib_completion(struct ib_cq *cq, void *dev_ptr)
 482{
 483        struct net_device *dev = dev_ptr;
 484        struct ipoib_dev_priv *priv = netdev_priv(dev);
 485
 486        napi_schedule(&priv->napi);
 487}
 488
 489static void drain_tx_cq(struct net_device *dev)
 490{
 491        struct ipoib_dev_priv *priv = netdev_priv(dev);
 492
 493        netif_tx_lock(dev);
 494        while (poll_tx(priv))
 495                ; /* nothing */
 496
 497        if (netif_queue_stopped(dev))
 498                mod_timer(&priv->poll_timer, jiffies + 1);
 499
 500        netif_tx_unlock(dev);
 501}
 502
 503void ipoib_send_comp_handler(struct ib_cq *cq, void *dev_ptr)
 504{
 505        struct ipoib_dev_priv *priv = netdev_priv(dev_ptr);
 506
 507        mod_timer(&priv->poll_timer, jiffies);
 508}
 509
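     /*
      * Build the gather list for @tx_req and post it on the UD QP: as an
      * IB_WR_LSO work request when a pulled TCP/IP header is supplied in
      * @head, or as a plain IB_WR_SEND otherwise.
      */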
 510static inline int post_send(struct ipoib_dev_priv *priv,
 511                            unsigned int wr_id,
 512                            struct ib_ah *address, u32 qpn,
 513                            struct ipoib_tx_buf *tx_req,
 514                            void *head, int hlen)
 515{
 516        struct ib_send_wr *bad_wr;
 517        struct sk_buff *skb = tx_req->skb;
 518
 519        ipoib_build_sge(priv, tx_req);
 520
 521        priv->tx_wr.wr.wr_id    = wr_id;
 522        priv->tx_wr.remote_qpn  = qpn;
 523        priv->tx_wr.ah          = address;
 524
 525        if (head) {
 526                priv->tx_wr.mss         = skb_shinfo(skb)->gso_size;
 527                priv->tx_wr.header      = head;
 528                priv->tx_wr.hlen        = hlen;
 529                priv->tx_wr.wr.opcode   = IB_WR_LSO;
 530        } else
 531                priv->tx_wr.wr.opcode   = IB_WR_SEND;
 532
 533        return ib_post_send(priv->qp, &priv->tx_wr.wr, &bad_wr);
 534}
 535
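     /*
      * Transmit one skb on the UD QP.  GSO skbs have their TCP/IP headers
      * pulled and are sent as LSO; non-GSO skbs that do not fit in the
      * multicast MTU are dropped.  The skb is recorded in the TX ring
      * before posting so the completion handler always sees consistent
      * state.
      */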
 536void ipoib_send(struct net_device *dev, struct sk_buff *skb,
 537                struct ipoib_ah *address, u32 qpn)
 538{
 539        struct ipoib_dev_priv *priv = netdev_priv(dev);
 540        struct ipoib_tx_buf *tx_req;
 541        int hlen, rc;
 542        void *phead;
 543
 544        if (skb_is_gso(skb)) {
 545                hlen = skb_transport_offset(skb) + tcp_hdrlen(skb);
 546                phead = skb->data;
 547                if (unlikely(!skb_pull(skb, hlen))) {
 548                        ipoib_warn(priv, "linear data too small\n");
 549                        ++dev->stats.tx_dropped;
 550                        ++dev->stats.tx_errors;
 551                        dev_kfree_skb_any(skb);
 552                        return;
 553                }
 554        } else {
 555                if (unlikely(skb->len > priv->mcast_mtu + IPOIB_ENCAP_LEN)) {
 556                        ipoib_warn(priv, "packet len %d (> %d) too long to send, dropping\n",
 557                                   skb->len, priv->mcast_mtu + IPOIB_ENCAP_LEN);
 558                        ++dev->stats.tx_dropped;
 559                        ++dev->stats.tx_errors;
 560                        ipoib_cm_skb_too_long(dev, skb, priv->mcast_mtu);
 561                        return;
 562                }
 563                phead = NULL;
 564                hlen  = 0;
 565        }
 566
 567        ipoib_dbg_data(priv, "sending packet, length=%d address=%p qpn=0x%06x\n",
 568                       skb->len, address, qpn);
 569
 570        /*
 571         * We put the skb into the tx_ring _before_ we call post_send()
 572         * because it's entirely possible that the completion handler will
 573         * run before we execute anything after the post_send().  That
 574         * means we have to make sure everything is properly recorded and
 575         * our state is consistent before we call post_send().
 576         */
 577        tx_req = &priv->tx_ring[priv->tx_head & (ipoib_sendq_size - 1)];
 578        tx_req->skb = skb;
 579        if (unlikely(ipoib_dma_map_tx(priv->ca, tx_req))) {
 580                ++dev->stats.tx_errors;
 581                dev_kfree_skb_any(skb);
 582                return;
 583        }
 584
 585        if (skb->ip_summed == CHECKSUM_PARTIAL)
 586                priv->tx_wr.wr.send_flags |= IB_SEND_IP_CSUM;
 587        else
 588                priv->tx_wr.wr.send_flags &= ~IB_SEND_IP_CSUM;
 589
 590        if (++priv->tx_outstanding == ipoib_sendq_size) {
 591                ipoib_dbg(priv, "TX ring full, stopping kernel net queue\n");
 592                if (ib_req_notify_cq(priv->send_cq, IB_CQ_NEXT_COMP))
 593                        ipoib_warn(priv, "request notify on send CQ failed\n");
 594                netif_stop_queue(dev);
 595        }
 596
 597        skb_orphan(skb);
 598        skb_dst_drop(skb);
 599
 600        rc = post_send(priv, priv->tx_head & (ipoib_sendq_size - 1),
 601                       address->ah, qpn, tx_req, phead, hlen);
 602        if (unlikely(rc)) {
 603                ipoib_warn(priv, "post_send failed, error %d\n", rc);
 604                ++dev->stats.tx_errors;
 605                --priv->tx_outstanding;
 606                ipoib_dma_unmap_tx(priv, tx_req);
 607                dev_kfree_skb_any(skb);
 608                if (netif_queue_stopped(dev))
 609                        netif_wake_queue(dev);
 610        } else {
 611                dev->trans_start = jiffies;
 612
 613                address->last_send = priv->tx_head;
 614                ++priv->tx_head;
 615        }
 616
 617        if (unlikely(priv->tx_outstanding > MAX_SEND_CQE))
 618                while (poll_tx(priv))
 619                        ; /* nothing */
 620}
 621
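     /*
      * Free dead address handles whose last_send has already been passed
      * by tx_tail, i.e. no posted send work request can still reference
      * them.
      */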
 622static void __ipoib_reap_ah(struct net_device *dev)
 623{
 624        struct ipoib_dev_priv *priv = netdev_priv(dev);
 625        struct ipoib_ah *ah, *tah;
 626        LIST_HEAD(remove_list);
 627        unsigned long flags;
 628
 629        netif_tx_lock_bh(dev);
 630        spin_lock_irqsave(&priv->lock, flags);
 631
 632        list_for_each_entry_safe(ah, tah, &priv->dead_ahs, list)
 633                if ((int) priv->tx_tail - (int) ah->last_send >= 0) {
 634                        list_del(&ah->list);
 635                        ib_destroy_ah(ah->ah);
 636                        kfree(ah);
 637                }
 638
 639        spin_unlock_irqrestore(&priv->lock, flags);
 640        netif_tx_unlock_bh(dev);
 641}
 642
 643void ipoib_reap_ah(struct work_struct *work)
 644{
 645        struct ipoib_dev_priv *priv =
 646                container_of(work, struct ipoib_dev_priv, ah_reap_task.work);
 647        struct net_device *dev = priv->dev;
 648
 649        __ipoib_reap_ah(dev);
 650
 651        if (!test_bit(IPOIB_STOP_REAPER, &priv->flags))
 652                queue_delayed_work(priv->wq, &priv->ah_reap_task,
 653                                   round_jiffies_relative(HZ));
 654}
 655
 656static void ipoib_flush_ah(struct net_device *dev)
 657{
 658        struct ipoib_dev_priv *priv = netdev_priv(dev);
 659
 660        cancel_delayed_work(&priv->ah_reap_task);
 661        flush_workqueue(priv->wq);
 662        ipoib_reap_ah(&priv->ah_reap_task.work);
 663}
 664
 665static void ipoib_stop_ah(struct net_device *dev)
 666{
 667        struct ipoib_dev_priv *priv = netdev_priv(dev);
 668
 669        set_bit(IPOIB_STOP_REAPER, &priv->flags);
 670        ipoib_flush_ah(dev);
 671}
 672
 673static void ipoib_ib_tx_timer_func(unsigned long ctx)
 674{
 675        drain_tx_cq((struct net_device *)ctx);
 676}
 677
 678int ipoib_ib_dev_open(struct net_device *dev)
 679{
 680        struct ipoib_dev_priv *priv = netdev_priv(dev);
 681        int ret;
 682
 683        ipoib_pkey_dev_check_presence(dev);
 684
 685        if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) {
 686                ipoib_warn(priv, "P_Key 0x%04x is %s\n", priv->pkey,
 687                           (!(priv->pkey & 0x7fff) ? "Invalid" : "not found"));
 688                return -1;
 689        }
 690
 691        ret = ipoib_init_qp(dev);
 692        if (ret) {
 693                ipoib_warn(priv, "ipoib_init_qp returned %d\n", ret);
 694                return -1;
 695        }
 696
 697        ret = ipoib_ib_post_receives(dev);
 698        if (ret) {
 699                ipoib_warn(priv, "ipoib_ib_post_receives returned %d\n", ret);
 700                goto dev_stop;
 701        }
 702
 703        ret = ipoib_cm_dev_open(dev);
 704        if (ret) {
 705                ipoib_warn(priv, "ipoib_cm_dev_open returned %d\n", ret);
 706                goto dev_stop;
 707        }
 708
 709        clear_bit(IPOIB_STOP_REAPER, &priv->flags);
 710        queue_delayed_work(priv->wq, &priv->ah_reap_task,
 711                           round_jiffies_relative(HZ));
 712
 713        if (!test_and_set_bit(IPOIB_FLAG_INITIALIZED, &priv->flags))
 714                napi_enable(&priv->napi);
 715
 716        return 0;
 717dev_stop:
 718        if (!test_and_set_bit(IPOIB_FLAG_INITIALIZED, &priv->flags))
 719                napi_enable(&priv->napi);
 720        ipoib_ib_dev_stop(dev);
 721        return -1;
 722}
 723
 724void ipoib_pkey_dev_check_presence(struct net_device *dev)
 725{
 726        struct ipoib_dev_priv *priv = netdev_priv(dev);
 727
 728        if (!(priv->pkey & 0x7fff) ||
 729            ib_find_pkey(priv->ca, priv->port, priv->pkey,
 730                         &priv->pkey_index))
 731                clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
 732        else
 733                set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
 734}
 735
 736int ipoib_ib_dev_up(struct net_device *dev)
 737{
 738        struct ipoib_dev_priv *priv = netdev_priv(dev);
 739
 740        ipoib_pkey_dev_check_presence(dev);
 741
 742        if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) {
 743                ipoib_dbg(priv, "PKEY is not assigned.\n");
 744                return 0;
 745        }
 746
 747        set_bit(IPOIB_FLAG_OPER_UP, &priv->flags);
 748
 749        return ipoib_mcast_start_thread(dev);
 750}
 751
 752int ipoib_ib_dev_down(struct net_device *dev)
 753{
 754        struct ipoib_dev_priv *priv = netdev_priv(dev);
 755
 756        ipoib_dbg(priv, "downing ib_dev\n");
 757
 758        clear_bit(IPOIB_FLAG_OPER_UP, &priv->flags);
 759        netif_carrier_off(dev);
 760
 761        ipoib_mcast_stop_thread(dev);
 762        ipoib_mcast_dev_flush(dev);
 763
 764        ipoib_flush_paths(dev);
 765
 766        return 0;
 767}
 768
 769static int recvs_pending(struct net_device *dev)
 770{
 771        struct ipoib_dev_priv *priv = netdev_priv(dev);
 772        int pending = 0;
 773        int i;
 774
 775        for (i = 0; i < ipoib_recvq_size; ++i)
 776                if (priv->rx_ring[i].skb)
 777                        ++pending;
 778
 779        return pending;
 780}
 781
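     /*
      * Drain the receive CQ during teardown, turning successful
      * completions into flush errors so no packet is delivered up the
      * stack, then poll the send CQ until it is empty.
      */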
 782void ipoib_drain_cq(struct net_device *dev)
 783{
 784        struct ipoib_dev_priv *priv = netdev_priv(dev);
 785        int i, n;
 786
 787        /*
 788         * We call completion handling routines that expect to be
 789         * called from the BH-disabled NAPI poll context, so disable
 790         * BHs here too.
 791         */
 792        local_bh_disable();
 793
 794        do {
 795                n = ib_poll_cq(priv->recv_cq, IPOIB_NUM_WC, priv->ibwc);
 796                for (i = 0; i < n; ++i) {
 797                        /*
 798                         * Convert any successful completions to flush
 799                         * errors to avoid passing packets up the
 800                         * stack after bringing the device down.
 801                         */
 802                        if (priv->ibwc[i].status == IB_WC_SUCCESS)
 803                                priv->ibwc[i].status = IB_WC_WR_FLUSH_ERR;
 804
 805                        if (priv->ibwc[i].wr_id & IPOIB_OP_RECV) {
 806                                if (priv->ibwc[i].wr_id & IPOIB_OP_CM)
 807                                        ipoib_cm_handle_rx_wc(dev, priv->ibwc + i);
 808                                else
 809                                        ipoib_ib_handle_rx_wc(dev, priv->ibwc + i);
 810                        } else
 811                                ipoib_cm_handle_tx_wc(dev, priv->ibwc + i);
 812                }
 813        } while (n == IPOIB_NUM_WC);
 814
 815        while (poll_tx(priv))
 816                ; /* nothing */
 817
 818        local_bh_enable();
 819}
 820
 821int ipoib_ib_dev_stop(struct net_device *dev)
 822{
 823        struct ipoib_dev_priv *priv = netdev_priv(dev);
 824        struct ib_qp_attr qp_attr;
 825        unsigned long begin;
 826        struct ipoib_tx_buf *tx_req;
 827        int i;
 828
 829        if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &priv->flags))
 830                napi_disable(&priv->napi);
 831
 832        ipoib_cm_dev_stop(dev);
 833
 834        /*
  835         * Move our QP to the error state and then reinitialize it
  836         * when all work requests have completed or have been flushed.
 837         */
 838        qp_attr.qp_state = IB_QPS_ERR;
 839        if (ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE))
 840                ipoib_warn(priv, "Failed to modify QP to ERROR state\n");
 841
 842        /* Wait for all sends and receives to complete */
 843        begin = jiffies;
 844
 845        while (priv->tx_head != priv->tx_tail || recvs_pending(dev)) {
 846                if (time_after(jiffies, begin + 5 * HZ)) {
 847                        ipoib_warn(priv, "timing out; %d sends %d receives not completed\n",
 848                                   priv->tx_head - priv->tx_tail, recvs_pending(dev));
 849
 850                        /*
 851                         * assume the HW is wedged and just free up
 852                         * all our pending work requests.
 853                         */
 854                        while ((int) priv->tx_tail - (int) priv->tx_head < 0) {
 855                                tx_req = &priv->tx_ring[priv->tx_tail &
 856                                                        (ipoib_sendq_size - 1)];
 857                                ipoib_dma_unmap_tx(priv, tx_req);
 858                                dev_kfree_skb_any(tx_req->skb);
 859                                ++priv->tx_tail;
 860                                --priv->tx_outstanding;
 861                        }
 862
 863                        for (i = 0; i < ipoib_recvq_size; ++i) {
 864                                struct ipoib_rx_buf *rx_req;
 865
 866                                rx_req = &priv->rx_ring[i];
 867                                if (!rx_req->skb)
 868                                        continue;
 869                                ipoib_ud_dma_unmap_rx(priv,
 870                                                      priv->rx_ring[i].mapping);
 871                                dev_kfree_skb_any(rx_req->skb);
 872                                rx_req->skb = NULL;
 873                        }
 874
 875                        goto timeout;
 876                }
 877
 878                ipoib_drain_cq(dev);
 879
 880                msleep(1);
 881        }
 882
 883        ipoib_dbg(priv, "All sends and receives done.\n");
 884
 885timeout:
 886        del_timer_sync(&priv->poll_timer);
 887        qp_attr.qp_state = IB_QPS_RESET;
 888        if (ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE))
 889                ipoib_warn(priv, "Failed to modify QP to RESET state\n");
 890
 891        ipoib_flush_ah(dev);
 892
 893        ib_req_notify_cq(priv->recv_cq, IB_CQ_NEXT_COMP);
 894
 895        return 0;
 896}
 897
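     /*
      * One-time IB-side initialization: record the HCA and port, call
      * ipoib_transport_dev_init() to set up the IB transport resources,
      * initialize the TX poll timer, and open the device right away if
      * the interface is already IFF_UP.
      */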
 898int ipoib_ib_dev_init(struct net_device *dev, struct ib_device *ca, int port)
 899{
 900        struct ipoib_dev_priv *priv = netdev_priv(dev);
 901
 902        priv->ca = ca;
 903        priv->port = port;
 904        priv->qp = NULL;
 905
 906        if (ipoib_transport_dev_init(dev, ca)) {
 907                printk(KERN_WARNING "%s: ipoib_transport_dev_init failed\n", ca->name);
 908                return -ENODEV;
 909        }
 910
 911        setup_timer(&priv->poll_timer, ipoib_ib_tx_timer_func,
 912                    (unsigned long) dev);
 913
 914        if (dev->flags & IFF_UP) {
 915                if (ipoib_ib_dev_open(dev)) {
 916                        ipoib_transport_dev_cleanup(dev);
 917                        return -ENODEV;
 918                }
 919        }
 920
 921        return 0;
 922}
 923
 924/*
  925 * Takes whatever value is in pkey index 0 and updates priv->pkey.
  926 * Returns 0 if the pkey value was changed.
 927 */
 928static inline int update_parent_pkey(struct ipoib_dev_priv *priv)
 929{
 930        int result;
 931        u16 prev_pkey;
 932
 933        prev_pkey = priv->pkey;
 934        result = ib_query_pkey(priv->ca, priv->port, 0, &priv->pkey);
 935        if (result) {
 936                ipoib_warn(priv, "ib_query_pkey port %d failed (ret = %d)\n",
 937                           priv->port, result);
 938                return result;
 939        }
 940
 941        priv->pkey |= 0x8000;
 942
 943        if (prev_pkey != priv->pkey) {
 944                ipoib_dbg(priv, "pkey changed from 0x%x to 0x%x\n",
 945                          prev_pkey, priv->pkey);
 946                /*
 947                 * Update the pkey in the broadcast address, while making sure to set
 948                 * the full membership bit, so that we join the right broadcast group.
 949                 */
 950                priv->dev->broadcast[8] = priv->pkey >> 8;
 951                priv->dev->broadcast[9] = priv->pkey & 0xff;
 952                return 0;
 953        }
 954
 955        return 1;
 956}
 957/*
  958 * Returns 0 if the pkey was found at a different index, or not found at all.
 959 */
 960static inline int update_child_pkey(struct ipoib_dev_priv *priv)
 961{
 962        u16 old_index = priv->pkey_index;
 963
 964        priv->pkey_index = 0;
 965        ipoib_pkey_dev_check_presence(priv->dev);
 966
 967        if (test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags) &&
 968            (old_index == priv->pkey_index))
 969                return 1;
 970        return 0;
 971}
 972
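     /*
      * Core of the flush handlers.  A LIGHT flush invalidates paths and
      * flushes multicast groups and address handles; a NORMAL flush takes
      * the IB device down; a HEAVY flush first re-checks the P_Key, bails
      * out if it is unchanged, and otherwise takes the device down, stops
      * it and reopens it to restart the QP.  Child interfaces are flushed
      * first, recursively.
      */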
 973static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv,
 974                                enum ipoib_flush_level level,
 975                                int nesting)
 976{
 977        struct ipoib_dev_priv *cpriv;
 978        struct net_device *dev = priv->dev;
 979        int result;
 980
 981        down_read_nested(&priv->vlan_rwsem, nesting);
 982
 983        /*
 984         * Flush any child interfaces too -- they might be up even if
 985         * the parent is down.
 986         */
 987        list_for_each_entry(cpriv, &priv->child_intfs, list)
 988                __ipoib_ib_dev_flush(cpriv, level, nesting + 1);
 989
 990        up_read(&priv->vlan_rwsem);
 991
 992        if (!test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags) &&
 993            level != IPOIB_FLUSH_HEAVY) {
 994                ipoib_dbg(priv, "Not flushing - IPOIB_FLAG_INITIALIZED not set.\n");
 995                return;
 996        }
 997
 998        if (!test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) {
 999                /* interface is down. update pkey and leave. */
1000                if (level == IPOIB_FLUSH_HEAVY) {
1001                        if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags))
1002                                update_parent_pkey(priv);
1003                        else
1004                                update_child_pkey(priv);
1005                }
1006                ipoib_dbg(priv, "Not flushing - IPOIB_FLAG_ADMIN_UP not set.\n");
1007                return;
1008        }
1009
1010        if (level == IPOIB_FLUSH_HEAVY) {
 1011                /* child devices follow their original pkey value, while non-child
 1012                 * (parent) devices should always take whatever is present in pkey index 0
1013                 */
1014                if (test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
1015                        result = update_child_pkey(priv);
1016                        if (result) {
1017                                /* restart QP only if P_Key index is changed */
1018                                ipoib_dbg(priv, "Not flushing - P_Key index not changed.\n");
1019                                return;
1020                        }
1021
1022                } else {
1023                        result = update_parent_pkey(priv);
1024                        /* restart QP only if P_Key value changed */
1025                        if (result) {
1026                                ipoib_dbg(priv, "Not flushing - P_Key value not changed.\n");
1027                                return;
1028                        }
1029                }
1030        }
1031
1032        if (level == IPOIB_FLUSH_LIGHT) {
1033                ipoib_mark_paths_invalid(dev);
1034                ipoib_mcast_dev_flush(dev);
1035                ipoib_flush_ah(dev);
1036        }
1037
1038        if (level >= IPOIB_FLUSH_NORMAL)
1039                ipoib_ib_dev_down(dev);
1040
1041        if (level == IPOIB_FLUSH_HEAVY) {
1042                if (test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags))
1043                        ipoib_ib_dev_stop(dev);
1044                if (ipoib_ib_dev_open(dev) != 0)
1045                        return;
1046                if (netif_queue_stopped(dev))
1047                        netif_start_queue(dev);
1048        }
1049
1050        /*
1051         * The device could have been brought down between the start and when
 1052         * we get here; don't bring it back up if it's not configured up.
1053         */
1054        if (test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) {
1055                if (level >= IPOIB_FLUSH_NORMAL)
1056                        ipoib_ib_dev_up(dev);
1057                ipoib_mcast_restart_task(&priv->restart_task);
1058        }
1059}
1060
1061void ipoib_ib_dev_flush_light(struct work_struct *work)
1062{
1063        struct ipoib_dev_priv *priv =
1064                container_of(work, struct ipoib_dev_priv, flush_light);
1065
1066        __ipoib_ib_dev_flush(priv, IPOIB_FLUSH_LIGHT, 0);
1067}
1068
1069void ipoib_ib_dev_flush_normal(struct work_struct *work)
1070{
1071        struct ipoib_dev_priv *priv =
1072                container_of(work, struct ipoib_dev_priv, flush_normal);
1073
1074        __ipoib_ib_dev_flush(priv, IPOIB_FLUSH_NORMAL, 0);
1075}
1076
1077void ipoib_ib_dev_flush_heavy(struct work_struct *work)
1078{
1079        struct ipoib_dev_priv *priv =
1080                container_of(work, struct ipoib_dev_priv, flush_heavy);
1081
1082        __ipoib_ib_dev_flush(priv, IPOIB_FLUSH_HEAVY, 0);
1083}
1084
1085void ipoib_ib_dev_cleanup(struct net_device *dev)
1086{
1087        struct ipoib_dev_priv *priv = netdev_priv(dev);
1088
1089        ipoib_dbg(priv, "cleaning up ib_dev\n");
1090        /*
1091         * We must make sure there are no more (path) completions
1092         * that may wish to touch priv fields that are no longer valid
1093         */
1094        ipoib_flush_paths(dev);
1095
1096        ipoib_mcast_stop_thread(dev);
1097        ipoib_mcast_dev_flush(dev);
1098
1099        /*
 1100         * None of our ah references are freed until after
 1101         * ipoib_mcast_dev_flush(), ipoib_flush_paths() and the
 1102         * stopping of the neighbor garbage collection have completed.
1103         * That should all be done now, so make a final ah flush.
1104         */
1105        ipoib_stop_ah(dev);
1106
1107        ipoib_transport_dev_cleanup(dev);
1108}
1109