linux/drivers/infiniband/ulp/iser/iser_verbs.c
   1/*
   2 * Copyright (c) 2004, 2005, 2006 Voltaire, Inc. All rights reserved.
   3 * Copyright (c) 2005, 2006 Cisco Systems.  All rights reserved.
   4 * Copyright (c) 2013 Mellanox Technologies. All rights reserved.
   5 *
   6 * This software is available to you under a choice of one of two
   7 * licenses.  You may choose to be licensed under the terms of the GNU
   8 * General Public License (GPL) Version 2, available from the file
   9 * COPYING in the main directory of this source tree, or the
  10 * OpenIB.org BSD license below:
  11 *
  12 *     Redistribution and use in source and binary forms, with or
  13 *     without modification, are permitted provided that the following
  14 *     conditions are met:
  15 *
  16 *      - Redistributions of source code must retain the above
  17 *        copyright notice, this list of conditions and the following
  18 *        disclaimer.
  19 *
  20 *      - Redistributions in binary form must reproduce the above
  21 *        copyright notice, this list of conditions and the following
  22 *        disclaimer in the documentation and/or other materials
  23 *        provided with the distribution.
  24 *
  25 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  26 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  27 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  28 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  29 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  30 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  31 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  32 * SOFTWARE.
  33 */
  34#include <linux/kernel.h>
  35#include <linux/module.h>
  36#include <linux/slab.h>
  37#include <linux/delay.h>
  38
  39#include "iscsi_iser.h"
  40
  41#define ISCSI_ISER_MAX_CONN     8
  42#define ISER_MAX_RX_CQ_LEN      (ISER_QP_MAX_RECV_DTOS * ISCSI_ISER_MAX_CONN)
  43#define ISER_MAX_TX_CQ_LEN      (ISER_QP_MAX_REQ_DTOS  * ISCSI_ISER_MAX_CONN)
  44
  45static void iser_cq_tasklet_fn(unsigned long data);
  46static void iser_cq_callback(struct ib_cq *cq, void *cq_context);
  47
  48static void iser_cq_event_callback(struct ib_event *cause, void *context)
  49{
   50        iser_err("got cq event %d\n", cause->event);
  51}
  52
  53static void iser_qp_event_callback(struct ib_event *cause, void *context)
  54{
   55        iser_err("got qp event %d\n", cause->event);
  56}
  57
  58static void iser_event_handler(struct ib_event_handler *handler,
  59                                struct ib_event *event)
  60{
  61        iser_err("async event %d on device %s port %d\n", event->event,
  62                event->device->name, event->element.port_num);
  63}
  64
  65/**
   66 * iser_create_device_ib_res - creates Protection Domain (PD), Completion
   67 * Queues (CQs) and DMA Memory Region (DMA MR) with the device associated
   68 * with the adapter.
   69 *
   70 * returns 0 on success, negative value on failure
  71 */
  72static int iser_create_device_ib_res(struct iser_device *device)
  73{
  74        int i, j;
  75        struct iser_cq_desc *cq_desc;
  76        struct ib_device_attr *dev_attr;
  77
  78        dev_attr = kmalloc(sizeof(*dev_attr), GFP_KERNEL);
  79        if (!dev_attr)
  80                return -ENOMEM;
  81
  82        if (ib_query_device(device->ib_device, dev_attr)) {
  83                pr_warn("Query device failed for %s\n", device->ib_device->name);
  84                goto dev_attr_err;
  85        }
  86
  87        /* Assign function handles  - based on FMR support */
  88        if (device->ib_device->alloc_fmr && device->ib_device->dealloc_fmr &&
  89            device->ib_device->map_phys_fmr && device->ib_device->unmap_fmr) {
  90                iser_info("FMR supported, using FMR for registration\n");
  91                device->iser_alloc_rdma_reg_res = iser_create_fmr_pool;
  92                device->iser_free_rdma_reg_res = iser_free_fmr_pool;
  93                device->iser_reg_rdma_mem = iser_reg_rdma_mem_fmr;
  94                device->iser_unreg_rdma_mem = iser_unreg_mem_fmr;
  95        } else
  96        if (dev_attr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) {
  97                iser_info("FRWR supported, using FRWR for registration\n");
  98                device->iser_alloc_rdma_reg_res = iser_create_frwr_pool;
  99                device->iser_free_rdma_reg_res = iser_free_frwr_pool;
 100                device->iser_reg_rdma_mem = iser_reg_rdma_mem_frwr;
 101                device->iser_unreg_rdma_mem = iser_unreg_mem_frwr;
 102        } else {
 103                iser_err("IB device does not support FMRs nor FRWRs, can't register memory\n");
 104                goto dev_attr_err;
 105        }
 106
 107        device->cqs_used = min(ISER_MAX_CQ, device->ib_device->num_comp_vectors);
 108        iser_info("using %d CQs, device %s supports %d vectors\n",
 109                  device->cqs_used, device->ib_device->name,
 110                  device->ib_device->num_comp_vectors);
 111
 112        device->cq_desc = kmalloc(sizeof(struct iser_cq_desc) * device->cqs_used,
 113                                  GFP_KERNEL);
 114        if (device->cq_desc == NULL)
 115                goto cq_desc_err;
 116        cq_desc = device->cq_desc;
 117
 118        device->pd = ib_alloc_pd(device->ib_device);
 119        if (IS_ERR(device->pd))
 120                goto pd_err;
 121
 122        for (i = 0; i < device->cqs_used; i++) {
 123                cq_desc[i].device   = device;
 124                cq_desc[i].cq_index = i;
 125
 126                device->rx_cq[i] = ib_create_cq(device->ib_device,
 127                                          iser_cq_callback,
 128                                          iser_cq_event_callback,
 129                                          (void *)&cq_desc[i],
 130                                          ISER_MAX_RX_CQ_LEN, i);
 131                if (IS_ERR(device->rx_cq[i]))
 132                        goto cq_err;
 133
 134                device->tx_cq[i] = ib_create_cq(device->ib_device,
 135                                          NULL, iser_cq_event_callback,
 136                                          (void *)&cq_desc[i],
 137                                          ISER_MAX_TX_CQ_LEN, i);
 138
 139                if (IS_ERR(device->tx_cq[i]))
 140                        goto cq_err;
 141
 142                if (ib_req_notify_cq(device->rx_cq[i], IB_CQ_NEXT_COMP))
 143                        goto cq_err;
 144
 145                tasklet_init(&device->cq_tasklet[i],
 146                             iser_cq_tasklet_fn,
 147                        (unsigned long)&cq_desc[i]);
 148        }
 149
 150        device->mr = ib_get_dma_mr(device->pd, IB_ACCESS_LOCAL_WRITE |
 151                                   IB_ACCESS_REMOTE_WRITE |
 152                                   IB_ACCESS_REMOTE_READ);
 153        if (IS_ERR(device->mr))
 154                goto dma_mr_err;
 155
 156        INIT_IB_EVENT_HANDLER(&device->event_handler, device->ib_device,
 157                                iser_event_handler);
 158        if (ib_register_event_handler(&device->event_handler))
 159                goto handler_err;
 160
 161        kfree(dev_attr);
 162        return 0;
 163
 164handler_err:
 165        ib_dereg_mr(device->mr);
 166dma_mr_err:
 167        for (j = 0; j < device->cqs_used; j++)
 168                tasklet_kill(&device->cq_tasklet[j]);
 169cq_err:
 170        for (j = 0; j < i; j++) {
 171                if (device->tx_cq[j])
 172                        ib_destroy_cq(device->tx_cq[j]);
 173                if (device->rx_cq[j])
 174                        ib_destroy_cq(device->rx_cq[j]);
 175        }
 176        ib_dealloc_pd(device->pd);
 177pd_err:
 178        kfree(device->cq_desc);
 179cq_desc_err:
 180        iser_err("failed to allocate an IB resource\n");
 181dev_attr_err:
 182        kfree(dev_attr);
 183        return -1;
 184}
 185
 186/**
  187 * iser_free_device_ib_res - destroy/dealloc/dereg the DMA MR,
  188 * CQs and PD created with the device associated with the adapter.
 189 */
 190static void iser_free_device_ib_res(struct iser_device *device)
 191{
 192        int i;
 193        BUG_ON(device->mr == NULL);
 194
 195        for (i = 0; i < device->cqs_used; i++) {
 196                tasklet_kill(&device->cq_tasklet[i]);
 197                (void)ib_destroy_cq(device->tx_cq[i]);
 198                (void)ib_destroy_cq(device->rx_cq[i]);
 199                device->tx_cq[i] = NULL;
 200                device->rx_cq[i] = NULL;
 201        }
 202
 203        (void)ib_unregister_event_handler(&device->event_handler);
 204        (void)ib_dereg_mr(device->mr);
 205        (void)ib_dealloc_pd(device->pd);
 206
 207        kfree(device->cq_desc);
 208
 209        device->mr = NULL;
 210        device->pd = NULL;
 211}
 212
 213/**
 214 * iser_create_fmr_pool - Creates FMR pool and page_vector
 215 *
 216 * returns 0 on success, or errno code on failure
 217 */
 218int iser_create_fmr_pool(struct iser_conn *ib_conn, unsigned cmds_max)
 219{
 220        struct iser_device *device = ib_conn->device;
 221        struct ib_fmr_pool_param params;
 222        int ret = -ENOMEM;
 223
 224        ib_conn->fastreg.fmr.page_vec = kmalloc(sizeof(struct iser_page_vec) +
 225                                                (sizeof(u64)*(ISCSI_ISER_SG_TABLESIZE + 1)),
 226                                                GFP_KERNEL);
 227        if (!ib_conn->fastreg.fmr.page_vec)
 228                return ret;
 229
 230        ib_conn->fastreg.fmr.page_vec->pages = (u64 *)(ib_conn->fastreg.fmr.page_vec + 1);
 231
 232        params.page_shift        = SHIFT_4K;
  233        /* when the first/last SG elements are not start/end *
  234         * page aligned, the map would be of N+1 pages       */
  235        params.max_pages_per_fmr = ISCSI_ISER_SG_TABLESIZE + 1;
  236        /* make the pool size twice the max number of SCSI commands    *
  237         * the SCSI mid-layer is expected to queue, watermark for unmap at 50% */
 238        params.pool_size         = cmds_max * 2;
 239        params.dirty_watermark   = cmds_max;
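            /* e.g. with cmds_max == 128 the pool holds 256 FMRs and a *
             * flush is triggered once 128 of them are dirty           */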
 240        params.cache             = 0;
 241        params.flush_function    = NULL;
 242        params.access            = (IB_ACCESS_LOCAL_WRITE  |
 243                                    IB_ACCESS_REMOTE_WRITE |
 244                                    IB_ACCESS_REMOTE_READ);
 245
 246        ib_conn->fastreg.fmr.pool = ib_create_fmr_pool(device->pd, &params);
 247        if (!IS_ERR(ib_conn->fastreg.fmr.pool))
 248                return 0;
 249
 250        /* no FMR => no need for page_vec */
 251        kfree(ib_conn->fastreg.fmr.page_vec);
 252        ib_conn->fastreg.fmr.page_vec = NULL;
 253
 254        ret = PTR_ERR(ib_conn->fastreg.fmr.pool);
 255        ib_conn->fastreg.fmr.pool = NULL;
 256        if (ret != -ENOSYS) {
 257                iser_err("FMR allocation failed, err %d\n", ret);
 258                return ret;
 259        } else {
 260                iser_warn("FMRs are not supported, using unaligned mode\n");
 261                return 0;
 262        }
 263}
 264
 265/**
 266 * iser_free_fmr_pool - releases the FMR pool and page vec
 267 */
 268void iser_free_fmr_pool(struct iser_conn *ib_conn)
 269{
 270        iser_info("freeing conn %p fmr pool %p\n",
 271                  ib_conn, ib_conn->fastreg.fmr.pool);
 272
 273        if (ib_conn->fastreg.fmr.pool != NULL)
 274                ib_destroy_fmr_pool(ib_conn->fastreg.fmr.pool);
 275
 276        ib_conn->fastreg.fmr.pool = NULL;
 277
 278        kfree(ib_conn->fastreg.fmr.page_vec);
 279        ib_conn->fastreg.fmr.page_vec = NULL;
 280}
 281
 282/**
 283 * iser_create_frwr_pool - Creates pool of fast_reg descriptors
 284 * for fast registration work requests.
 285 * returns 0 on success, or errno code on failure
 286 */
 287int iser_create_frwr_pool(struct iser_conn *ib_conn, unsigned cmds_max)
 288{
 289        struct iser_device      *device = ib_conn->device;
 290        struct fast_reg_descriptor      *desc;
 291        int i, ret;
 292
 293        INIT_LIST_HEAD(&ib_conn->fastreg.frwr.pool);
 294        ib_conn->fastreg.frwr.pool_size = 0;
 295        for (i = 0; i < cmds_max; i++) {
 296                desc = kmalloc(sizeof(*desc), GFP_KERNEL);
 297                if (!desc) {
 298                        iser_err("Failed to allocate a new fast_reg descriptor\n");
 299                        ret = -ENOMEM;
 300                        goto err;
 301                }
 302
 303                desc->data_frpl = ib_alloc_fast_reg_page_list(device->ib_device,
 304                                                         ISCSI_ISER_SG_TABLESIZE + 1);
 305                if (IS_ERR(desc->data_frpl)) {
 306                        ret = PTR_ERR(desc->data_frpl);
 307                        iser_err("Failed to allocate ib_fast_reg_page_list err=%d\n", ret);
 308                        goto fast_reg_page_failure;
 309                }
 310
 311                desc->data_mr = ib_alloc_fast_reg_mr(device->pd,
 312                                                     ISCSI_ISER_SG_TABLESIZE + 1);
 313                if (IS_ERR(desc->data_mr)) {
 314                        ret = PTR_ERR(desc->data_mr);
 315                        iser_err("Failed to allocate ib_fast_reg_mr err=%d\n", ret);
 316                        goto fast_reg_mr_failure;
 317                }
 318                desc->valid = true;
 319                list_add_tail(&desc->list, &ib_conn->fastreg.frwr.pool);
 320                ib_conn->fastreg.frwr.pool_size++;
 321        }
 322
 323        return 0;
 324
 325fast_reg_mr_failure:
 326        ib_free_fast_reg_page_list(desc->data_frpl);
 327fast_reg_page_failure:
 328        kfree(desc);
 329err:
 330        iser_free_frwr_pool(ib_conn);
 331        return ret;
 332}
 333
 334/**
 335 * iser_free_frwr_pool - releases the pool of fast_reg descriptors
 336 */
 337void iser_free_frwr_pool(struct iser_conn *ib_conn)
 338{
 339        struct fast_reg_descriptor *desc, *tmp;
 340        int i = 0;
 341
 342        if (list_empty(&ib_conn->fastreg.frwr.pool))
 343                return;
 344
 345        iser_info("freeing conn %p frwr pool\n", ib_conn);
 346
 347        list_for_each_entry_safe(desc, tmp, &ib_conn->fastreg.frwr.pool, list) {
 348                list_del(&desc->list);
 349                ib_free_fast_reg_page_list(desc->data_frpl);
 350                ib_dereg_mr(desc->data_mr);
 351                kfree(desc);
 352                ++i;
 353        }
 354
 355        if (i < ib_conn->fastreg.frwr.pool_size)
 356                iser_warn("pool still has %d regions registered\n",
 357                          ib_conn->fastreg.frwr.pool_size - i);
 358}
 359
 360/**
  361 * iser_create_ib_conn_res - creates a Queue-Pair (QP)
  362 *
  363 * returns 0 on success, errno code on failure
 364 */
 365static int iser_create_ib_conn_res(struct iser_conn *ib_conn)
 366{
 367        struct iser_device      *device;
 368        struct ib_qp_init_attr  init_attr;
 369        int                     ret = -ENOMEM;
 370        int index, min_index = 0;
 371
 372        BUG_ON(ib_conn->device == NULL);
 373
 374        device = ib_conn->device;
 375
 376        memset(&init_attr, 0, sizeof init_attr);
 377
 378        mutex_lock(&ig.connlist_mutex);
  379        /* select the CQ with the fewest active QPs */
 380        for (index = 0; index < device->cqs_used; index++)
 381                if (device->cq_active_qps[index] <
 382                    device->cq_active_qps[min_index])
 383                        min_index = index;
 384        device->cq_active_qps[min_index]++;
 385        mutex_unlock(&ig.connlist_mutex);
 386        iser_info("cq index %d used for ib_conn %p\n", min_index, ib_conn);
 387
 388        init_attr.event_handler = iser_qp_event_callback;
 389        init_attr.qp_context    = (void *)ib_conn;
 390        init_attr.send_cq       = device->tx_cq[min_index];
 391        init_attr.recv_cq       = device->rx_cq[min_index];
 392        init_attr.cap.max_send_wr  = ISER_QP_MAX_REQ_DTOS;
 393        init_attr.cap.max_recv_wr  = ISER_QP_MAX_RECV_DTOS;
 394        init_attr.cap.max_send_sge = 2;
 395        init_attr.cap.max_recv_sge = 1;
 396        init_attr.sq_sig_type   = IB_SIGNAL_REQ_WR;
 397        init_attr.qp_type       = IB_QPT_RC;
 398
 399        ret = rdma_create_qp(ib_conn->cma_id, device->pd, &init_attr);
 400        if (ret)
 401                goto out_err;
 402
 403        ib_conn->qp = ib_conn->cma_id->qp;
 404        iser_info("setting conn %p cma_id %p qp %p\n",
 405                  ib_conn, ib_conn->cma_id,
 406                  ib_conn->cma_id->qp);
 407        return ret;
 408
 409out_err:
 410        iser_err("unable to alloc mem or create resource, err %d\n", ret);
 411        return ret;
 412}
 413
 414/**
  415 * releases the QP object and decrements the active QP count
  416 * of the CQ it was attached to. Always returns 0.
 417 */
 418static int iser_free_ib_conn_res(struct iser_conn *ib_conn)
 419{
 420        int cq_index;
 421        BUG_ON(ib_conn == NULL);
 422
 423        iser_info("freeing conn %p cma_id %p qp %p\n",
 424                  ib_conn, ib_conn->cma_id,
 425                  ib_conn->qp);
 426
 427        /* qp is created only once both addr & route are resolved */
 428
 429        if (ib_conn->qp != NULL) {
 430                cq_index = ((struct iser_cq_desc *)ib_conn->qp->recv_cq->cq_context)->cq_index;
 431                ib_conn->device->cq_active_qps[cq_index]--;
 432
 433                rdma_destroy_qp(ib_conn->cma_id);
 434        }
 435
 436        ib_conn->qp       = NULL;
 437
 438        return 0;
 439}
 440
 441/**
  442 * based on the resolved device node GUID, see if there is an already
  443 * allocated iser device for this IB device. If there is none, create one.
 444 */
 445static
 446struct iser_device *iser_device_find_by_ib_device(struct rdma_cm_id *cma_id)
 447{
 448        struct iser_device *device;
 449
 450        mutex_lock(&ig.device_list_mutex);
 451
 452        list_for_each_entry(device, &ig.device_list, ig_list)
 453                /* find if there's a match using the node GUID */
 454                if (device->ib_device->node_guid == cma_id->device->node_guid)
 455                        goto inc_refcnt;
 456
 457        device = kzalloc(sizeof *device, GFP_KERNEL);
 458        if (device == NULL)
 459                goto out;
 460
  461        /* assign this IB device to the iser device */
 462        device->ib_device = cma_id->device;
 463        /* init the device and link it into ig device list */
 464        if (iser_create_device_ib_res(device)) {
 465                kfree(device);
 466                device = NULL;
 467                goto out;
 468        }
 469        list_add(&device->ig_list, &ig.device_list);
 470
 471inc_refcnt:
 472        device->refcount++;
 473out:
 474        mutex_unlock(&ig.device_list_mutex);
 475        return device;
 476}
 477
 478/* if there's no demand for this device, release it */
 479static void iser_device_try_release(struct iser_device *device)
 480{
 481        mutex_lock(&ig.device_list_mutex);
 482        device->refcount--;
 483        iser_info("device %p refcount %d\n", device, device->refcount);
 484        if (!device->refcount) {
 485                iser_free_device_ib_res(device);
 486                list_del(&device->ig_list);
 487                kfree(device);
 488        }
 489        mutex_unlock(&ig.device_list_mutex);
 490}
 491
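    /*
     * atomically compare-and-exchange the connection state under the
     * connection lock: if the current state equals @comp it is changed
     * to @exch. Returns non-zero if the exchange took place.
     */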
 492static int iser_conn_state_comp_exch(struct iser_conn *ib_conn,
 493                                     enum iser_ib_conn_state comp,
 494                                     enum iser_ib_conn_state exch)
 495{
 496        int ret;
 497
 498        spin_lock_bh(&ib_conn->lock);
 499        if ((ret = (ib_conn->state == comp)))
 500                ib_conn->state = exch;
 501        spin_unlock_bh(&ib_conn->lock);
 502        return ret;
 503}
 504
 505/**
  506 * Frees all conn objects and deallocates the conn descriptor
 507 */
 508static void iser_conn_release(struct iser_conn *ib_conn, int can_destroy_id)
 509{
 510        struct iser_device  *device = ib_conn->device;
 511
 512        BUG_ON(ib_conn->state != ISER_CONN_DOWN);
 513
 514        mutex_lock(&ig.connlist_mutex);
 515        list_del(&ib_conn->conn_list);
 516        mutex_unlock(&ig.connlist_mutex);
 517        iser_free_rx_descriptors(ib_conn);
 518        iser_free_ib_conn_res(ib_conn);
 519        ib_conn->device = NULL;
 520        /* on EVENT_ADDR_ERROR there's no device yet for this conn */
 521        if (device != NULL)
 522                iser_device_try_release(device);
  523        /* if in cma handler context, the caller actually destroys the id */
 524        if (ib_conn->cma_id != NULL && can_destroy_id) {
 525                rdma_destroy_id(ib_conn->cma_id);
 526                ib_conn->cma_id = NULL;
 527        }
 528        iscsi_destroy_endpoint(ib_conn->ep);
 529}
 530
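    /* take a reference on the iser connection */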
 531void iser_conn_get(struct iser_conn *ib_conn)
 532{
 533        atomic_inc(&ib_conn->refcount);
 534}
 535
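    /*
     * drop a reference on the iser connection; the last reference releases
     * the connection. Returns 1 if the connection was released, 0 otherwise.
     */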
 536int iser_conn_put(struct iser_conn *ib_conn, int can_destroy_id)
 537{
 538        if (atomic_dec_and_test(&ib_conn->refcount)) {
 539                iser_conn_release(ib_conn, can_destroy_id);
 540                return 1;
 541        }
 542        return 0;
 543}
 544
 545/**
  546 * triggers start of the disconnect procedures and waits for them to be done
 547 */
 548void iser_conn_terminate(struct iser_conn *ib_conn)
 549{
 550        int err = 0;
 551
 552        /* change the ib conn state only if the conn is UP, however always call
 553         * rdma_disconnect since this is the only way to cause the CMA to change
 554         * the QP state to ERROR
 555         */
 556
 557        iser_conn_state_comp_exch(ib_conn, ISER_CONN_UP, ISER_CONN_TERMINATING);
 558        err = rdma_disconnect(ib_conn->cma_id);
 559        if (err)
 560                iser_err("Failed to disconnect, conn: 0x%p err %d\n",
  561                         ib_conn, err);
 562
 563        wait_event_interruptible(ib_conn->wait,
 564                                 ib_conn->state == ISER_CONN_DOWN);
 565
 566        iser_conn_put(ib_conn, 1); /* deref ib conn deallocate */
 567}
 568
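    /*
     * connection establishment failed: move the connection to DOWN, wake
     * up the waiter and drop the reference held for the cma id
     */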
 569static int iser_connect_error(struct rdma_cm_id *cma_id)
 570{
 571        struct iser_conn *ib_conn;
 572        ib_conn = (struct iser_conn *)cma_id->context;
 573
 574        ib_conn->state = ISER_CONN_DOWN;
 575        wake_up_interruptible(&ib_conn->wait);
 576        return iser_conn_put(ib_conn, 0); /* deref ib conn's cma id */
 577}
 578
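    /*
     * RDMA_CM_EVENT_ADDR_RESOLVED: bind the connection to an iser device
     * and start resolving the route to the target
     */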
 579static int iser_addr_handler(struct rdma_cm_id *cma_id)
 580{
 581        struct iser_device *device;
 582        struct iser_conn   *ib_conn;
 583        int    ret;
 584
 585        device = iser_device_find_by_ib_device(cma_id);
 586        if (!device) {
 587                iser_err("device lookup/creation failed\n");
 588                return iser_connect_error(cma_id);
 589        }
 590
 591        ib_conn = (struct iser_conn *)cma_id->context;
 592        ib_conn->device = device;
 593
 594        ret = rdma_resolve_route(cma_id, 1000);
 595        if (ret) {
 596                iser_err("resolve route failed: %d\n", ret);
 597                return iser_connect_error(cma_id);
 598        }
 599
 600        return 0;
 601}
 602
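    /*
     * RDMA_CM_EVENT_ROUTE_RESOLVED: create the QP and send the connect
     * request, advertising that ZBVA and Send-with-Invalidate are not
     * supported by this initiator
     */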
 603static int iser_route_handler(struct rdma_cm_id *cma_id)
 604{
 605        struct rdma_conn_param conn_param;
 606        int    ret;
 607        struct iser_cm_hdr req_hdr;
 608
 609        ret = iser_create_ib_conn_res((struct iser_conn *)cma_id->context);
 610        if (ret)
 611                goto failure;
 612
 613        memset(&conn_param, 0, sizeof conn_param);
 614        conn_param.responder_resources = 4;
 615        conn_param.initiator_depth     = 1;
 616        conn_param.retry_count         = 7;
 617        conn_param.rnr_retry_count     = 6;
 618
 619        memset(&req_hdr, 0, sizeof(req_hdr));
 620        req_hdr.flags = (ISER_ZBVA_NOT_SUPPORTED |
 621                        ISER_SEND_W_INV_NOT_SUPPORTED);
 622        conn_param.private_data         = (void *)&req_hdr;
 623        conn_param.private_data_len     = sizeof(struct iser_cm_hdr);
 624
 625        ret = rdma_connect(cma_id, &conn_param);
 626        if (ret) {
 627                iser_err("failure connecting: %d\n", ret);
 628                goto failure;
 629        }
 630
 631        return 0;
 632failure:
 633        return iser_connect_error(cma_id);
 634}
 635
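    /* RDMA_CM_EVENT_ESTABLISHED: mark the connection UP and wake up iser_connect() */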
 636static void iser_connected_handler(struct rdma_cm_id *cma_id)
 637{
 638        struct iser_conn *ib_conn;
 639
 640        ib_conn = (struct iser_conn *)cma_id->context;
 641        ib_conn->state = ISER_CONN_UP;
 642        wake_up_interruptible(&ib_conn->wait);
 643}
 644
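    /*
     * disconnect / device removal / address change: notify the iSCSI layer
     * and complete the termination if no posts are outstanding
     */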
 645static int iser_disconnected_handler(struct rdma_cm_id *cma_id)
 646{
 647        struct iser_conn *ib_conn;
 648        int ret;
 649
 650        ib_conn = (struct iser_conn *)cma_id->context;
 651
 652        /* getting here when the state is UP means that the conn is being *
 653         * terminated asynchronously from the iSCSI layer's perspective.  */
 654        if (iser_conn_state_comp_exch(ib_conn, ISER_CONN_UP,
 655                                        ISER_CONN_TERMINATING)){
 656                if (ib_conn->iser_conn)
 657                        iscsi_conn_failure(ib_conn->iser_conn->iscsi_conn,
 658                                           ISCSI_ERR_CONN_FAILED);
 659                else
 660                        iser_err("iscsi_iser connection isn't bound\n");
 661        }
 662
 663        /* Complete the termination process if no posts are pending */
 664        if (ib_conn->post_recv_buf_count == 0 &&
 665            (atomic_read(&ib_conn->post_send_buf_count) == 0)) {
 666                ib_conn->state = ISER_CONN_DOWN;
 667                wake_up_interruptible(&ib_conn->wait);
 668        }
 669
 670        ret = iser_conn_put(ib_conn, 0); /* deref ib conn's cma id */
 671        return ret;
 672}
 673
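    /* dispatch RDMA CM events to the handlers above */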
 674static int iser_cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *event)
 675{
 676        int ret = 0;
 677
 678        iser_info("event %d status %d conn %p id %p\n",
 679                  event->event, event->status, cma_id->context, cma_id);
 680
 681        switch (event->event) {
 682        case RDMA_CM_EVENT_ADDR_RESOLVED:
 683                ret = iser_addr_handler(cma_id);
 684                break;
 685        case RDMA_CM_EVENT_ROUTE_RESOLVED:
 686                ret = iser_route_handler(cma_id);
 687                break;
 688        case RDMA_CM_EVENT_ESTABLISHED:
 689                iser_connected_handler(cma_id);
 690                break;
 691        case RDMA_CM_EVENT_ADDR_ERROR:
 692        case RDMA_CM_EVENT_ROUTE_ERROR:
 693        case RDMA_CM_EVENT_CONNECT_ERROR:
 694        case RDMA_CM_EVENT_UNREACHABLE:
 695        case RDMA_CM_EVENT_REJECTED:
 696                ret = iser_connect_error(cma_id);
 697                break;
 698        case RDMA_CM_EVENT_DISCONNECTED:
 699        case RDMA_CM_EVENT_DEVICE_REMOVAL:
 700        case RDMA_CM_EVENT_ADDR_CHANGE:
 701                ret = iser_disconnected_handler(cma_id);
 702                break;
 703        default:
 704                iser_err("Unexpected RDMA CM event (%d)\n", event->event);
 705                break;
 706        }
 707        return ret;
 708}
 709
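    /* initialize the iser connection members to their initial state */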
 710void iser_conn_init(struct iser_conn *ib_conn)
 711{
 712        ib_conn->state = ISER_CONN_INIT;
 713        init_waitqueue_head(&ib_conn->wait);
 714        ib_conn->post_recv_buf_count = 0;
 715        atomic_set(&ib_conn->post_send_buf_count, 0);
 716        atomic_set(&ib_conn->refcount, 1); /* ref ib conn allocation */
 717        INIT_LIST_HEAD(&ib_conn->conn_list);
 718        spin_lock_init(&ib_conn->lock);
 719}
 720
  721/**
  722 * starts the process of connecting to the target and, unless non_blocking
  723 * is set, sleeps until the connection is established or rejected
 724 */
 725int iser_connect(struct iser_conn   *ib_conn,
 726                 struct sockaddr_in *src_addr,
 727                 struct sockaddr_in *dst_addr,
 728                 int                 non_blocking)
 729{
 730        struct sockaddr *src, *dst;
 731        int err = 0;
 732
 733        sprintf(ib_conn->name, "%pI4:%d",
 734                &dst_addr->sin_addr.s_addr, dst_addr->sin_port);
 735
 736        /* the device is known only --after-- address resolution */
 737        ib_conn->device = NULL;
 738
 739        iser_info("connecting to: %pI4, port 0x%x\n",
 740                  &dst_addr->sin_addr, dst_addr->sin_port);
 741
 742        ib_conn->state = ISER_CONN_PENDING;
 743
 744        iser_conn_get(ib_conn); /* ref ib conn's cma id */
 745        ib_conn->cma_id = rdma_create_id(iser_cma_handler,
 746                                             (void *)ib_conn,
 747                                             RDMA_PS_TCP, IB_QPT_RC);
 748        if (IS_ERR(ib_conn->cma_id)) {
 749                err = PTR_ERR(ib_conn->cma_id);
 750                iser_err("rdma_create_id failed: %d\n", err);
 751                goto id_failure;
 752        }
 753
 754        src = (struct sockaddr *)src_addr;
 755        dst = (struct sockaddr *)dst_addr;
 756        err = rdma_resolve_addr(ib_conn->cma_id, src, dst, 1000);
 757        if (err) {
 758                iser_err("rdma_resolve_addr failed: %d\n", err);
 759                goto addr_failure;
 760        }
 761
 762        if (!non_blocking) {
 763                wait_event_interruptible(ib_conn->wait,
 764                                         (ib_conn->state != ISER_CONN_PENDING));
 765
 766                if (ib_conn->state != ISER_CONN_UP) {
 767                        err =  -EIO;
 768                        goto connect_failure;
 769                }
 770        }
 771
 772        mutex_lock(&ig.connlist_mutex);
 773        list_add(&ib_conn->conn_list, &ig.connlist);
 774        mutex_unlock(&ig.connlist_mutex);
 775        return 0;
 776
 777id_failure:
 778        ib_conn->cma_id = NULL;
 779addr_failure:
 780        ib_conn->state = ISER_CONN_DOWN;
 781        iser_conn_put(ib_conn, 1); /* deref ib conn's cma id */
 782connect_failure:
 783        iser_conn_put(ib_conn, 1); /* deref ib conn deallocate */
 784        return err;
 785}
 786
 787/**
 788 * iser_reg_page_vec - Register physical memory
 789 *
 790 * returns: 0 on success, errno code on failure
 791 */
 792int iser_reg_page_vec(struct iser_conn     *ib_conn,
 793                      struct iser_page_vec *page_vec,
 794                      struct iser_mem_reg  *mem_reg)
 795{
 796        struct ib_pool_fmr *mem;
 797        u64                io_addr;
 798        u64                *page_list;
 799        int                status;
 800
 801        page_list = page_vec->pages;
 802        io_addr   = page_list[0];
 803
 804        mem  = ib_fmr_pool_map_phys(ib_conn->fastreg.fmr.pool,
 805                                    page_list,
 806                                    page_vec->length,
 807                                    io_addr);
 808
 809        if (IS_ERR(mem)) {
 810                status = (int)PTR_ERR(mem);
 811                iser_err("ib_fmr_pool_map_phys failed: %d\n", status);
 812                return status;
 813        }
 814
 815        mem_reg->lkey  = mem->fmr->lkey;
 816        mem_reg->rkey  = mem->fmr->rkey;
 817        mem_reg->len   = page_vec->length * SIZE_4K;
 818        mem_reg->va    = io_addr;
 819        mem_reg->is_mr = 1;
 820        mem_reg->mem_h = (void *)mem;
 821
 822        mem_reg->va   += page_vec->offset;
 823        mem_reg->len   = page_vec->data_size;
 824
 825        iser_dbg("PHYSICAL Mem.register, [PHYS p_array: 0x%p, sz: %d, "
 826                 "entry[0]: (0x%08lx,%ld)] -> "
 827                 "[lkey: 0x%08X mem_h: 0x%p va: 0x%08lX sz: %ld]\n",
 828                 page_vec, page_vec->length,
 829                 (unsigned long)page_vec->pages[0],
 830                 (unsigned long)page_vec->data_size,
 831                 (unsigned int)mem_reg->lkey, mem_reg->mem_h,
 832                 (unsigned long)mem_reg->va, (unsigned long)mem_reg->len);
 833        return 0;
 834}
 835
 836/**
  837 * Unregister (previously registered using FMR) memory.
  838 * If the memory is non-FMR, do nothing.
 839 */
 840void iser_unreg_mem_fmr(struct iscsi_iser_task *iser_task,
 841                        enum iser_data_dir cmd_dir)
 842{
 843        struct iser_mem_reg *reg = &iser_task->rdma_regd[cmd_dir].reg;
 844        int ret;
 845
 846        if (!reg->is_mr)
 847                return;
 848
  849        iser_dbg("PHYSICAL Mem.Unregister mem_h %p\n", reg->mem_h);
 850
 851        ret = ib_fmr_pool_unmap((struct ib_pool_fmr *)reg->mem_h);
 852        if (ret)
 853                iser_err("ib_fmr_pool_unmap failed %d\n", ret);
 854
 855        reg->mem_h = NULL;
 856}
 857
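    /* return the task's fast_reg descriptor to the connection's pool */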
 858void iser_unreg_mem_frwr(struct iscsi_iser_task *iser_task,
 859                         enum iser_data_dir cmd_dir)
 860{
 861        struct iser_mem_reg *reg = &iser_task->rdma_regd[cmd_dir].reg;
 862        struct iser_conn *ib_conn = iser_task->iser_conn->ib_conn;
 863        struct fast_reg_descriptor *desc = reg->mem_h;
 864
 865        if (!reg->is_mr)
 866                return;
 867
 868        reg->mem_h = NULL;
 869        reg->is_mr = 0;
 870        spin_lock_bh(&ib_conn->lock);
 871        list_add_tail(&desc->list, &ib_conn->fastreg.frwr.pool);
 872        spin_unlock_bh(&ib_conn->lock);
 873}
 874
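    /* post a single receive buffer for the login response */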
 875int iser_post_recvl(struct iser_conn *ib_conn)
 876{
 877        struct ib_recv_wr rx_wr, *rx_wr_failed;
 878        struct ib_sge     sge;
 879        int ib_ret;
 880
 881        sge.addr   = ib_conn->login_resp_dma;
 882        sge.length = ISER_RX_LOGIN_SIZE;
 883        sge.lkey   = ib_conn->device->mr->lkey;
 884
 885        rx_wr.wr_id   = (unsigned long)ib_conn->login_resp_buf;
 886        rx_wr.sg_list = &sge;
 887        rx_wr.num_sge = 1;
 888        rx_wr.next    = NULL;
 889
 890        ib_conn->post_recv_buf_count++;
 891        ib_ret  = ib_post_recv(ib_conn->qp, &rx_wr, &rx_wr_failed);
 892        if (ib_ret) {
 893                iser_err("ib_post_recv failed ret=%d\n", ib_ret);
 894                ib_conn->post_recv_buf_count--;
 895        }
 896        return ib_ret;
 897}
 898
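    /* post @count receive buffers as one chained receive work request list */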
 899int iser_post_recvm(struct iser_conn *ib_conn, int count)
 900{
 901        struct ib_recv_wr *rx_wr, *rx_wr_failed;
 902        int i, ib_ret;
 903        unsigned int my_rx_head = ib_conn->rx_desc_head;
 904        struct iser_rx_desc *rx_desc;
 905
 906        for (rx_wr = ib_conn->rx_wr, i = 0; i < count; i++, rx_wr++) {
 907                rx_desc         = &ib_conn->rx_descs[my_rx_head];
 908                rx_wr->wr_id    = (unsigned long)rx_desc;
 909                rx_wr->sg_list  = &rx_desc->rx_sg;
 910                rx_wr->num_sge  = 1;
 911                rx_wr->next     = rx_wr + 1;
 912                my_rx_head = (my_rx_head + 1) & ib_conn->qp_max_recv_dtos_mask;
 913        }
 914
 915        rx_wr--;
 916        rx_wr->next = NULL; /* mark end of work requests list */
 917
 918        ib_conn->post_recv_buf_count += count;
 919        ib_ret  = ib_post_recv(ib_conn->qp, ib_conn->rx_wr, &rx_wr_failed);
 920        if (ib_ret) {
 921                iser_err("ib_post_recv failed ret=%d\n", ib_ret);
 922                ib_conn->post_recv_buf_count -= count;
 923        } else
 924                ib_conn->rx_desc_head = my_rx_head;
 925        return ib_ret;
 926}
 927
 928
 929/**
  930 * iser_post_send - Initiate a Send DTO operation
  931 *
  932 * returns 0 on success, errno code on failure
 933 */
 934int iser_post_send(struct iser_conn *ib_conn, struct iser_tx_desc *tx_desc)
 935{
 936        int               ib_ret;
 937        struct ib_send_wr send_wr, *send_wr_failed;
 938
 939        ib_dma_sync_single_for_device(ib_conn->device->ib_device,
 940                tx_desc->dma_addr, ISER_HEADERS_LEN, DMA_TO_DEVICE);
 941
 942        send_wr.next       = NULL;
 943        send_wr.wr_id      = (unsigned long)tx_desc;
 944        send_wr.sg_list    = tx_desc->tx_sg;
 945        send_wr.num_sge    = tx_desc->num_sge;
 946        send_wr.opcode     = IB_WR_SEND;
 947        send_wr.send_flags = IB_SEND_SIGNALED;
 948
 949        atomic_inc(&ib_conn->post_send_buf_count);
 950
 951        ib_ret = ib_post_send(ib_conn->qp, &send_wr, &send_wr_failed);
 952        if (ib_ret) {
 953                iser_err("ib_post_send failed, ret:%d\n", ib_ret);
 954                atomic_dec(&ib_conn->post_send_buf_count);
 955        }
 956        return ib_ret;
 957}
 958
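    /*
     * handle a completion with error status: once no posts are outstanding,
     * move the connection to DOWN (notifying the iSCSI layer if the conn is
     * still UP) and wake up the waiter in iser_conn_terminate()
     */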
 959static void iser_handle_comp_error(struct iser_tx_desc *desc,
 960                                struct iser_conn *ib_conn)
 961{
 962        if (desc && desc->type == ISCSI_TX_DATAOUT)
 963                kmem_cache_free(ig.desc_cache, desc);
 964
 965        if (ib_conn->post_recv_buf_count == 0 &&
 966            atomic_read(&ib_conn->post_send_buf_count) == 0) {
 967                /* getting here when the state is UP means that the conn is *
 968                 * being terminated asynchronously from the iSCSI layer's   *
 969                 * perspective.                                             */
 970                if (iser_conn_state_comp_exch(ib_conn, ISER_CONN_UP,
 971                    ISER_CONN_TERMINATING))
 972                        iscsi_conn_failure(ib_conn->iser_conn->iscsi_conn,
 973                                           ISCSI_ERR_CONN_FAILED);
 974
  975                /* no more non-completed posts to the QP, complete the
  976                 * termination process without worrying about a disconnect event */
 977                ib_conn->state = ISER_CONN_DOWN;
 978                wake_up_interruptible(&ib_conn->wait);
 979        }
 980}
 981
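    /* drain the TX CQ of the given index; returns the number of handled completions */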
 982static int iser_drain_tx_cq(struct iser_device  *device, int cq_index)
 983{
 984        struct ib_cq  *cq = device->tx_cq[cq_index];
 985        struct ib_wc  wc;
 986        struct iser_tx_desc *tx_desc;
 987        struct iser_conn *ib_conn;
 988        int completed_tx = 0;
 989
 990        while (ib_poll_cq(cq, 1, &wc) == 1) {
 991                tx_desc = (struct iser_tx_desc *) (unsigned long) wc.wr_id;
 992                ib_conn = wc.qp->qp_context;
 993                if (wc.status == IB_WC_SUCCESS) {
 994                        if (wc.opcode == IB_WC_SEND)
 995                                iser_snd_completion(tx_desc, ib_conn);
 996                        else if (wc.opcode == IB_WC_LOCAL_INV ||
 997                                 wc.opcode == IB_WC_FAST_REG_MR) {
 998                                atomic_dec(&ib_conn->post_send_buf_count);
 999                                continue;
1000                        } else
1001                                iser_err("expected opcode %d got %d\n",
1002                                        IB_WC_SEND, wc.opcode);
1003                } else {
1004                        iser_err("tx id %llx status %d vend_err %x\n",
1005                                wc.wr_id, wc.status, wc.vendor_err);
1006                        atomic_dec(&ib_conn->post_send_buf_count);
1007                        iser_handle_comp_error(tx_desc, ib_conn);
1008                }
1009                completed_tx++;
1010        }
1011        return completed_tx;
1012}
1013
1014
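    /*
     * per-CQ tasklet: drain the RX CQ, draining the TX CQ every 64 RX
     * completions and once more at the end, then re-arm the RX CQ
     */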
1015static void iser_cq_tasklet_fn(unsigned long data)
1016{
1017        struct iser_cq_desc *cq_desc = (struct iser_cq_desc *)data;
1018        struct iser_device  *device = cq_desc->device;
1019        int cq_index = cq_desc->cq_index;
1020        struct ib_cq         *cq = device->rx_cq[cq_index];
 1021        struct ib_wc        wc;
 1022        struct iser_rx_desc *desc;
 1023        unsigned long       xfer_len;
1024        struct iser_conn *ib_conn;
1025        int completed_tx, completed_rx;
1026        completed_tx = completed_rx = 0;
1027
1028        while (ib_poll_cq(cq, 1, &wc) == 1) {
1029                desc     = (struct iser_rx_desc *) (unsigned long) wc.wr_id;
1030                BUG_ON(desc == NULL);
1031                ib_conn = wc.qp->qp_context;
1032                if (wc.status == IB_WC_SUCCESS) {
1033                        if (wc.opcode == IB_WC_RECV) {
1034                                xfer_len = (unsigned long)wc.byte_len;
1035                                iser_rcv_completion(desc, xfer_len, ib_conn);
1036                        } else
1037                                iser_err("expected opcode %d got %d\n",
1038                                        IB_WC_RECV, wc.opcode);
1039                } else {
1040                        if (wc.status != IB_WC_WR_FLUSH_ERR)
1041                                iser_err("rx id %llx status %d vend_err %x\n",
1042                                        wc.wr_id, wc.status, wc.vendor_err);
1043                        ib_conn->post_recv_buf_count--;
1044                        iser_handle_comp_error(NULL, ib_conn);
1045                }
1046                completed_rx++;
1047                if (!(completed_rx & 63))
1048                        completed_tx += iser_drain_tx_cq(device, cq_index);
1049        }
 1050        /* it is assumed here that arming the CQ only once it is empty *
 1051         * would not cause interrupts to be missed                     */
1052        ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
1053
1054        completed_tx += iser_drain_tx_cq(device, cq_index);
1055        iser_dbg("got %d rx %d tx completions\n", completed_rx, completed_tx);
1056}
1057
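    /* RX CQ completion interrupt - defer processing to the per-CQ tasklet */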
1058static void iser_cq_callback(struct ib_cq *cq, void *cq_context)
1059{
1060        struct iser_cq_desc *cq_desc = (struct iser_cq_desc *)cq_context;
1061        struct iser_device  *device = cq_desc->device;
1062        int cq_index = cq_desc->cq_index;
1063
1064        tasklet_schedule(&device->cq_tasklet[cq_index]);
1065}
1066