linux/drivers/infiniband/ulp/iser/iser_memory.c
/*
 * Copyright (c) 2004, 2005, 2006 Voltaire, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/scatterlist.h>

#include "iscsi_iser.h"

#define ISER_KMALLOC_THRESHOLD 0x20000 /* 128K - kmalloc limit */

/**
 * iser_regd_buff_release - Decrements the reference count of the
 * registered buffer and releases it when the count reaches zero.
 *
 * returns 0 if released, 1 if deferred
 */
int iser_regd_buff_release(struct iser_regd_buf *regd_buf)
{
        struct ib_device *dev;

        if ((atomic_read(&regd_buf->ref_count) == 0) ||
            atomic_dec_and_test(&regd_buf->ref_count)) {
                /* if we used the dma mr, unreg is just NOP */
                if (regd_buf->reg.is_fmr)
                        iser_unreg_mem(&regd_buf->reg);

                if (regd_buf->dma_addr) {
                        dev = regd_buf->device->ib_device;
                        ib_dma_unmap_single(dev,
                                            regd_buf->dma_addr,
                                            regd_buf->data_size,
                                            regd_buf->direction);
                }
                /* else this regd buf is associated with a task for  */
                /* which we call dma_unmap_single/sg later           */
                return 0;
        } else {
                iser_dbg("Release deferred, regd.buff: 0x%p\n", regd_buf);
                return 1;
        }
}

/**
 * iser_reg_single - fills registered buffer descriptor with
 *                   registration information
 */
void iser_reg_single(struct iser_device *device,
                     struct iser_regd_buf *regd_buf,
                     enum dma_data_direction direction)
{
        u64 dma_addr;

        dma_addr = ib_dma_map_single(device->ib_device,
                                     regd_buf->virt_addr,
                                     regd_buf->data_size, direction);
        BUG_ON(ib_dma_mapping_error(device->ib_device, dma_addr));

        regd_buf->reg.lkey = device->mr->lkey;
        regd_buf->reg.len  = regd_buf->data_size;
        regd_buf->reg.va   = dma_addr;
        regd_buf->reg.is_fmr = 0;

        regd_buf->dma_addr  = dma_addr;
        regd_buf->direction = direction;
}

/**
 * iser_start_rdma_unaligned_sg - Allocates a single, contiguous copy
 * buffer for a task whose scatterlist violates the RDMA alignment
 * rules, copies the data into it for WRITE commands, and DMA-maps the
 * copy so it can be registered instead of the original scatterlist.
 */
static int iser_start_rdma_unaligned_sg(struct iscsi_iser_task *iser_task,
                                        enum iser_data_dir cmd_dir)
{
        int dma_nents;
        struct ib_device *dev;
        char *mem = NULL;
        struct iser_data_buf *data = &iser_task->data[cmd_dir];
        unsigned long  cmd_data_len = data->data_len;

        if (cmd_data_len > ISER_KMALLOC_THRESHOLD)
                mem = (void *)__get_free_pages(GFP_NOIO,
                      ilog2(roundup_pow_of_two(cmd_data_len)) - PAGE_SHIFT);
        else
                mem = kmalloc(cmd_data_len, GFP_NOIO);

        if (mem == NULL) {
                iser_err("Failed to allocate mem size %d %d for copying sglist\n",
                         data->size, (int)cmd_data_len);
                return -ENOMEM;
        }

        if (cmd_dir == ISER_DIR_OUT) {
                /* copy the unaligned sg to the buffer which is used for RDMA */
                struct scatterlist *sgl = (struct scatterlist *)data->buf;
                struct scatterlist *sg;
                int i;
                char *p, *from;

                p = mem;
                for_each_sg(sgl, sg, data->size, i) {
                        from = kmap_atomic(sg_page(sg), KM_USER0);
                        memcpy(p,
                               from + sg->offset,
                               sg->length);
                        kunmap_atomic(from, KM_USER0);
                        p += sg->length;
                }
        }

        sg_init_one(&iser_task->data_copy[cmd_dir].sg_single, mem, cmd_data_len);
        iser_task->data_copy[cmd_dir].buf  =
                &iser_task->data_copy[cmd_dir].sg_single;
        iser_task->data_copy[cmd_dir].size = 1;

        iser_task->data_copy[cmd_dir].copy_buf  = mem;

        dev = iser_task->iser_conn->ib_conn->device->ib_device;
        dma_nents = ib_dma_map_sg(dev,
                                  &iser_task->data_copy[cmd_dir].sg_single,
                                  1,
                                  (cmd_dir == ISER_DIR_OUT) ?
                                  DMA_TO_DEVICE : DMA_FROM_DEVICE);
        BUG_ON(dma_nents == 0);

        iser_task->data_copy[cmd_dir].dma_nents = dma_nents;
        return 0;
}

/**
 * iser_finalize_rdma_unaligned_sg - Unmaps the copy buffer, copies the
 * data back to the original scatterlist for READ commands, and frees
 * the copy buffer.
 */
void iser_finalize_rdma_unaligned_sg(struct iscsi_iser_task *iser_task,
                                     enum iser_data_dir         cmd_dir)
{
        struct ib_device *dev;
        struct iser_data_buf *mem_copy;
        unsigned long  cmd_data_len;

        dev = iser_task->iser_conn->ib_conn->device->ib_device;
        mem_copy = &iser_task->data_copy[cmd_dir];

        ib_dma_unmap_sg(dev, &mem_copy->sg_single, 1,
                        (cmd_dir == ISER_DIR_OUT) ?
                        DMA_TO_DEVICE : DMA_FROM_DEVICE);

        if (cmd_dir == ISER_DIR_IN) {
                char *mem;
                struct scatterlist *sgl, *sg;
                unsigned char *p, *to;
                unsigned int sg_size;
                int i;

                /* copy back read RDMA to unaligned sg */
                mem     = mem_copy->copy_buf;

                sgl     = (struct scatterlist *)iser_task->data[ISER_DIR_IN].buf;
                sg_size = iser_task->data[ISER_DIR_IN].size;

                p = mem;
                for_each_sg(sgl, sg, sg_size, i) {
                        to = kmap_atomic(sg_page(sg), KM_SOFTIRQ0);
                        memcpy(to + sg->offset,
                               p,
                               sg->length);
                        kunmap_atomic(to, KM_SOFTIRQ0);
                        p += sg->length;
                }
        }

        cmd_data_len = iser_task->data[cmd_dir].data_len;

        if (cmd_data_len > ISER_KMALLOC_THRESHOLD)
                free_pages((unsigned long)mem_copy->copy_buf,
                           ilog2(roundup_pow_of_two(cmd_data_len)) - PAGE_SHIFT);
        else
                kfree(mem_copy->copy_buf);

        mem_copy->copy_buf = NULL;
}

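/*
 * FMR registration below describes a buffer as a vector of 4K page
 * addresses plus a byte offset into the first page.  That only works
 * when every SG element except the first starts on a 4K boundary and
 * every element except the last ends on one.  As a purely hypothetical
 * illustration: elements of 4K + 8K + 2K, each starting page aligned,
 * compact into four page addresses, whereas a middle element ending in
 * the middle of a page fails iser_data_buf_aligned_len() and forces
 * the copy-buffer path in iser_start_rdma_unaligned_sg().
 */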
/**
 * iser_sg_to_page_vec - Translates scatterlist entries to physical addresses
 * and returns the length of the resulting physical address array (may be less
 * than the original due to possible compaction).
 *
 * We build a "page vec" under the assumption that the SG meets the RDMA
 * alignment requirements. Other than the first and last SG elements, all
 * the "internal" elements can be compacted into a list whose elements are
 * dma addresses of physical pages. The code also supports the case where
 * a few fragments of the same page are present in the SG as consecutive
 * elements. Also, it handles one entry SG.
 */
static int iser_sg_to_page_vec(struct iser_data_buf *data,
                               struct iser_page_vec *page_vec,
                               struct ib_device *ibdev)
{
        struct scatterlist *sgl = (struct scatterlist *)data->buf;
        struct scatterlist *sg;
        u64 first_addr, last_addr, page;
        int end_aligned;
        unsigned int cur_page = 0;
        unsigned long total_sz = 0;
        int i;

        /* compute the offset of first element */
        page_vec->offset = (u64) sgl[0].offset & ~MASK_4K;

        for_each_sg(sgl, sg, data->dma_nents, i) {
                unsigned int dma_len = ib_sg_dma_len(ibdev, sg);

                total_sz += dma_len;

                first_addr = ib_sg_dma_address(ibdev, sg);
                last_addr  = first_addr + dma_len;

                end_aligned = !(last_addr & ~MASK_4K);

                /* continue to collect page fragments till aligned or SG ends */
                while (!end_aligned && (i + 1 < data->dma_nents)) {
                        sg = sg_next(sg);
                        i++;
                        dma_len = ib_sg_dma_len(ibdev, sg);
                        total_sz += dma_len;
                        last_addr = ib_sg_dma_address(ibdev, sg) + dma_len;
                        end_aligned = !(last_addr & ~MASK_4K);
                }

                /* handle the 1st page in the 1st DMA element */
                if (cur_page == 0) {
                        page = first_addr & MASK_4K;
                        page_vec->pages[cur_page] = page;
                        cur_page++;
                        page += SIZE_4K;
                } else
                        page = first_addr;

                for (; page < last_addr; page += SIZE_4K) {
                        page_vec->pages[cur_page] = page;
                        cur_page++;
                }

        }
        page_vec->data_size = total_sz;
        iser_dbg("page_vec->data_size:%d cur_page %d\n", page_vec->data_size, cur_page);
        return cur_page;
}

#define IS_4K_ALIGNED(addr)     ((((unsigned long)addr) & ~MASK_4K) == 0)

/**
 * iser_data_buf_aligned_len - Tries to determine the maximal sub-list of a
 * scatter-gather list of memory buffers that is correctly aligned for RDMA,
 * and returns the number of entries which are aligned correctly. Supports
 * the case where consecutive SG elements are actually fragments of the same
 * physical page.
 */
static unsigned int iser_data_buf_aligned_len(struct iser_data_buf *data,
                                              struct ib_device *ibdev)
{
        struct scatterlist *sgl, *sg;
        u64 end_addr, next_addr;
        int i, cnt;
        unsigned int ret_len = 0;

        sgl = (struct scatterlist *)data->buf;

        cnt = 0;
        for_each_sg(sgl, sg, data->dma_nents, i) {
                /* iser_dbg("Checking sg iobuf [%d]: phys=0x%08lX "
                   "offset: %ld sz: %ld\n", i,
                   (unsigned long)sg_phys(sg),
                   (unsigned long)sg->offset,
                   (unsigned long)sg->length); */
                end_addr = ib_sg_dma_address(ibdev, sg) +
                           ib_sg_dma_len(ibdev, sg);
                /* iser_dbg("Checking sg iobuf end address "
                       "0x%08lX\n", end_addr); */
                if (i + 1 < data->dma_nents) {
                        next_addr = ib_sg_dma_address(ibdev, sg_next(sg));
                        /* are i, i+1 fragments of the same page? */
                        if (end_addr == next_addr) {
                                cnt++;
                                continue;
                        } else if (!IS_4K_ALIGNED(end_addr)) {
                                ret_len = cnt + 1;
                                break;
                        }
                }
                cnt++;
        }
        if (i == data->dma_nents)
                ret_len = cnt;  /* loop ended */
        iser_dbg("Found %d aligned entries out of %d in sg:0x%p\n",
                 ret_len, data->dma_nents, data);
        return ret_len;
}

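/*
 * Debug helper: dumps every SG element of the data buffer (dma address,
 * page, offset, length and dma length).  Does nothing unless debug
 * output is enabled.
 */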
static void iser_data_buf_dump(struct iser_data_buf *data,
                               struct ib_device *ibdev)
{
        struct scatterlist *sgl = (struct scatterlist *)data->buf;
        struct scatterlist *sg;
        int i;

        if (iser_debug_level == 0)
                return;

        for_each_sg(sgl, sg, data->dma_nents, i)
                iser_warn("sg[%d] dma_addr:0x%lX page:0x%p "
                          "off:0x%x sz:0x%x dma_len:0x%x\n",
                          i, (unsigned long)ib_sg_dma_address(ibdev, sg),
                          sg_page(sg), sg->offset,
                          sg->length, ib_sg_dma_len(ibdev, sg));
}

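/*
 * Debug helper: prints the page vector length, its total data size and
 * every page address it holds.
 */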
static void iser_dump_page_vec(struct iser_page_vec *page_vec)
{
        int i;

        iser_err("page vec length %d data size %d\n",
                 page_vec->length, page_vec->data_size);
        for (i = 0; i < page_vec->length; i++)
                iser_err("%d %lx\n", i, (unsigned long)page_vec->pages[i]);
}

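/*
 * Resets the page vector, translates the scatterlist into it via
 * iser_sg_to_page_vec(), and sanity-checks that the resulting pages
 * actually cover the data size (otherwise the SG was not RDMA aligned).
 */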
static void iser_page_vec_build(struct iser_data_buf *data,
                                struct iser_page_vec *page_vec,
                                struct ib_device *ibdev)
{
        int page_vec_len = 0;

        page_vec->length = 0;
        page_vec->offset = 0;

        iser_dbg("Translating sg sz: %d\n", data->dma_nents);
        page_vec_len = iser_sg_to_page_vec(data, page_vec, ibdev);
        iser_dbg("sg len %d page_vec_len %d\n", data->dma_nents, page_vec_len);

        page_vec->length = page_vec_len;

        if (page_vec_len * SIZE_4K < page_vec->data_size) {
                iser_err("page_vec too short to hold this SG\n");
                iser_data_buf_dump(data, ibdev);
                iser_dump_page_vec(page_vec);
                BUG();
        }
}

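/*
 * Marks the task as carrying data in the given iSER direction and
 * DMA-maps the supplied scatterlist; returns -EINVAL if the mapping
 * produced no entries.
 */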
int iser_dma_map_task_data(struct iscsi_iser_task *iser_task,
                            struct iser_data_buf *data,
                            enum iser_data_dir iser_dir,
                            enum dma_data_direction dma_dir)
{
        struct ib_device *dev;

        iser_task->dir[iser_dir] = 1;
        dev = iser_task->iser_conn->ib_conn->device->ib_device;

        data->dma_nents = ib_dma_map_sg(dev, data->buf, data->size, dma_dir);
        if (data->dma_nents == 0) {
                iser_err("dma_map_sg failed!!!\n");
                return -EINVAL;
        }
        return 0;
}

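/*
 * DMA-unmaps whichever of the task's IN/OUT scatterlists were mapped
 * by iser_dma_map_task_data().
 */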
void iser_dma_unmap_task_data(struct iscsi_iser_task *iser_task)
{
        struct ib_device *dev;
        struct iser_data_buf *data;

        dev = iser_task->iser_conn->ib_conn->device->ib_device;

        if (iser_task->dir[ISER_DIR_IN]) {
                data = &iser_task->data[ISER_DIR_IN];
                ib_dma_unmap_sg(dev, data->buf, data->size, DMA_FROM_DEVICE);
        }

        if (iser_task->dir[ISER_DIR_OUT]) {
                data = &iser_task->data[ISER_DIR_OUT];
                ib_dma_unmap_sg(dev, data->buf, data->size, DMA_TO_DEVICE);
        }
}

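/*
 * Registration flow: the task's SG list is first checked for RDMA
 * alignment; a violating list is unmapped, copied into a contiguous
 * copy buffer and remapped (iser_start_rdma_unaligned_sg).  A buffer
 * that maps to a single DMA entry is then described directly through
 * the device DMA MR, while multi-entry buffers are registered via FMR
 * using the connection's page_vec.
 */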
/**
 * iser_reg_rdma_mem - Registers memory intended for RDMA,
 * obtaining rkey and va
 *
 * returns 0 on success, errno code on failure
 */
int iser_reg_rdma_mem(struct iscsi_iser_task *iser_task,
                      enum   iser_data_dir        cmd_dir)
{
        struct iscsi_conn    *iscsi_conn = iser_task->iser_conn->iscsi_conn;
        struct iser_conn     *ib_conn = iser_task->iser_conn->ib_conn;
        struct iser_device   *device = ib_conn->device;
        struct ib_device     *ibdev = device->ib_device;
        struct iser_data_buf *mem = &iser_task->data[cmd_dir];
        struct iser_regd_buf *regd_buf;
        int aligned_len;
        int err;
        int i;
        struct scatterlist *sg;

        regd_buf = &iser_task->rdma_regd[cmd_dir];

        aligned_len = iser_data_buf_aligned_len(mem, ibdev);
        if (aligned_len != mem->dma_nents) {
                iscsi_conn->fmr_unalign_cnt++;
                iser_warn("rdma alignment violation %d/%d aligned\n",
                          aligned_len, mem->size);
                iser_data_buf_dump(mem, ibdev);

                /* unmap the command data before accessing it */
                iser_dma_unmap_task_data(iser_task);

                /* allocate copy buf, if we are writing, copy the */
                /* unaligned scatterlist, dma map the copy        */
                if (iser_start_rdma_unaligned_sg(iser_task, cmd_dir) != 0)
                        return -ENOMEM;
                mem = &iser_task->data_copy[cmd_dir];
        }

        /* if there is a single dma entry, FMR is not needed */
        if (mem->dma_nents == 1) {
                sg = (struct scatterlist *)mem->buf;

                regd_buf->reg.lkey = device->mr->lkey;
                regd_buf->reg.rkey = device->mr->rkey;
                regd_buf->reg.len  = ib_sg_dma_len(ibdev, &sg[0]);
                regd_buf->reg.va   = ib_sg_dma_address(ibdev, &sg[0]);
                regd_buf->reg.is_fmr = 0;

                iser_dbg("PHYSICAL Mem.register: lkey: 0x%08X rkey: 0x%08X "
                         "va: 0x%08lX sz: %ld\n",
                         (unsigned int)regd_buf->reg.lkey,
                         (unsigned int)regd_buf->reg.rkey,
                         (unsigned long)regd_buf->reg.va,
                         (unsigned long)regd_buf->reg.len);
        } else { /* use FMR for multiple dma entries */
                iser_page_vec_build(mem, ib_conn->page_vec, ibdev);
                err = iser_reg_page_vec(ib_conn, ib_conn->page_vec, &regd_buf->reg);
                if (err) {
                        iser_data_buf_dump(mem, ibdev);
                        iser_err("mem->dma_nents = %d (dlength = 0x%x)\n",
                                 mem->dma_nents,
                                 ntoh24(iser_task->desc.iscsi_header.dlength));
                        iser_err("page_vec: data_size = 0x%x, length = %d, offset = 0x%x\n",
                                 ib_conn->page_vec->data_size, ib_conn->page_vec->length,
                                 ib_conn->page_vec->offset);
                        for (i = 0; i < ib_conn->page_vec->length; i++)
                                iser_err("page_vec[%d] = 0x%llx\n", i,
                                         (unsigned long long) ib_conn->page_vec->pages[i]);
                        return err;
                }
        }

        /* take a reference on this regd buf such that it will not be released *
         * (eg in send dto completion) before we get the scsi response         */
        atomic_inc(&regd_buf->ref_count);
        return 0;
}