linux/drivers/infiniband/ulp/iser/iser_memory.c
/*
 * Copyright (c) 2004, 2005, 2006 Voltaire, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/scatterlist.h>

#include "iscsi_iser.h"

#define ISER_KMALLOC_THRESHOLD 0x20000 /* 128K - kmalloc limit */

/**
 * iser_start_rdma_unaligned_sg - allocate a bounce buffer for a task whose
 * scatterlist is not RDMA-aligned, copy the data into it for WRITE commands,
 * and DMA map the resulting single-entry copy
 */
static int iser_start_rdma_unaligned_sg(struct iscsi_iser_task *iser_task,
                                        enum iser_data_dir cmd_dir)
{
        int dma_nents;
        struct ib_device *dev;
        char *mem = NULL;
        struct iser_data_buf *data = &iser_task->data[cmd_dir];
        unsigned long  cmd_data_len = data->data_len;

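        /* payloads above the kmalloc limit get whole pages,
         * smaller ones a kmalloc'ed buffer */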
        if (cmd_data_len > ISER_KMALLOC_THRESHOLD)
                mem = (void *)__get_free_pages(GFP_ATOMIC,
                      ilog2(roundup_pow_of_two(cmd_data_len)) - PAGE_SHIFT);
        else
                mem = kmalloc(cmd_data_len, GFP_ATOMIC);

        if (mem == NULL) {
                iser_err("Failed to allocate mem size %d %d for copying sglist\n",
                         data->size, (int)cmd_data_len);
                return -ENOMEM;
        }

        if (cmd_dir == ISER_DIR_OUT) {
                /* copy the unaligned sg into the buffer which is used for RDMA */
                struct scatterlist *sgl = (struct scatterlist *)data->buf;
                struct scatterlist *sg;
                int i;
                char *p, *from;

                p = mem;
                for_each_sg(sgl, sg, data->size, i) {
                        from = kmap_atomic(sg_page(sg), KM_USER0);
                        memcpy(p,
                               from + sg->offset,
                               sg->length);
                        kunmap_atomic(from, KM_USER0);
                        p += sg->length;
                }
        }

        sg_init_one(&iser_task->data_copy[cmd_dir].sg_single, mem, cmd_data_len);
        iser_task->data_copy[cmd_dir].buf  =
                &iser_task->data_copy[cmd_dir].sg_single;
        iser_task->data_copy[cmd_dir].size = 1;

        iser_task->data_copy[cmd_dir].copy_buf  = mem;

        dev = iser_task->iser_conn->ib_conn->device->ib_device;
        dma_nents = ib_dma_map_sg(dev,
                                  &iser_task->data_copy[cmd_dir].sg_single,
                                  1,
                                  (cmd_dir == ISER_DIR_OUT) ?
                                  DMA_TO_DEVICE : DMA_FROM_DEVICE);
        BUG_ON(dma_nents == 0);

        iser_task->data_copy[cmd_dir].dma_nents = dma_nents;
        return 0;
}

/**
 * iser_finalize_rdma_unaligned_sg - DMA unmap the bounce buffer, copy the
 * received data back to the original unaligned scatterlist for READ commands,
 * and free the bounce buffer
 */
void iser_finalize_rdma_unaligned_sg(struct iscsi_iser_task *iser_task,
                                     enum iser_data_dir         cmd_dir)
{
        struct ib_device *dev;
        struct iser_data_buf *mem_copy;
        unsigned long  cmd_data_len;

        dev = iser_task->iser_conn->ib_conn->device->ib_device;
        mem_copy = &iser_task->data_copy[cmd_dir];

        ib_dma_unmap_sg(dev, &mem_copy->sg_single, 1,
                        (cmd_dir == ISER_DIR_OUT) ?
                        DMA_TO_DEVICE : DMA_FROM_DEVICE);

        if (cmd_dir == ISER_DIR_IN) {
                char *mem;
                struct scatterlist *sgl, *sg;
                unsigned char *p, *to;
                unsigned int sg_size;
                int i;

                /* copy back read RDMA to unaligned sg */
                mem     = mem_copy->copy_buf;

                sgl     = (struct scatterlist *)iser_task->data[ISER_DIR_IN].buf;
                sg_size = iser_task->data[ISER_DIR_IN].size;

                p = mem;
                for_each_sg(sgl, sg, sg_size, i) {
                        to = kmap_atomic(sg_page(sg), KM_SOFTIRQ0);
                        memcpy(to + sg->offset,
                               p,
                               sg->length);
                        kunmap_atomic(to, KM_SOFTIRQ0);
                        p += sg->length;
                }
        }

        cmd_data_len = iser_task->data[cmd_dir].data_len;

        if (cmd_data_len > ISER_KMALLOC_THRESHOLD)
                free_pages((unsigned long)mem_copy->copy_buf,
                           ilog2(roundup_pow_of_two(cmd_data_len)) - PAGE_SHIFT);
        else
                kfree(mem_copy->copy_buf);

        mem_copy->copy_buf = NULL;
}

#define IS_4K_ALIGNED(addr)     ((((unsigned long)addr) & ~MASK_4K) == 0)

/**
 * iser_sg_to_page_vec - Translates scatterlist entries to physical addresses
 * and returns the length of the resulting physical address array (may be less
 * than the original due to possible compaction).
 *
 * we build a "page vec" under the assumption that the SG meets the RDMA
 * alignment requirements. Other than the first and last SG elements, all
 * the "internal" elements can be compacted into a list whose elements are
 * dma addresses of physical pages. The code also supports the weird case
 * where a few fragments of the same page are present in the SG as
 * consecutive elements. In addition, it handles a one entry SG.
 */
static int iser_sg_to_page_vec(struct iser_data_buf *data,
                               struct iser_page_vec *page_vec,
                               struct ib_device *ibdev)
{
        struct scatterlist *sg, *sgl = (struct scatterlist *)data->buf;
        u64 start_addr, end_addr, page, chunk_start = 0;
        unsigned long total_sz = 0;
        unsigned int dma_len;
        int i, new_chunk, cur_page, last_ent = data->dma_nents - 1;

        /* compute the offset of first element */
        page_vec->offset = (u64) sgl[0].offset & ~MASK_4K;

        new_chunk = 1;
        cur_page  = 0;
        for_each_sg(sgl, sg, data->dma_nents, i) {
                start_addr = ib_sg_dma_address(ibdev, sg);
                if (new_chunk)
                        chunk_start = start_addr;
                dma_len = ib_sg_dma_len(ibdev, sg);
                end_addr = start_addr + dma_len;
                total_sz += dma_len;

                /* collect page fragments until aligned or end of SG list */
                if (!IS_4K_ALIGNED(end_addr) && i < last_ent) {
                        new_chunk = 0;
                        continue;
                }
                new_chunk = 1;

                /* address of the first page in the contiguous chunk;
                   masking relevant for the very first SG entry,
                   which might be unaligned */
                page = chunk_start & MASK_4K;
                do {
                        page_vec->pages[cur_page++] = page;
                        page += SIZE_4K;
                } while (page < end_addr);
        }

        page_vec->data_size = total_sz;
        iser_dbg("page_vec->data_size:%d cur_page %d\n",
                 page_vec->data_size, cur_page);
        return cur_page;
}

/**
 * iser_data_buf_aligned_len - Tries to determine the maximal sub-list of a
 * scatter-gather list of memory buffers which is correctly aligned for RDMA,
 * and returns the number of entries which are aligned correctly. Supports the
 * case where consecutive SG elements are actually fragments of the same
 * physical page.
 */
static int iser_data_buf_aligned_len(struct iser_data_buf *data,
                                      struct ib_device *ibdev)
{
        struct scatterlist *sgl, *sg, *next_sg = NULL;
        u64 start_addr, end_addr;
        int i, ret_len, start_check = 0;

        if (data->dma_nents == 1)
                return 1;

        sgl = (struct scatterlist *)data->buf;
        start_addr  = ib_sg_dma_address(ibdev, sgl);

        for_each_sg(sgl, sg, data->dma_nents, i) {
                if (start_check && !IS_4K_ALIGNED(start_addr))
                        break;

                next_sg = sg_next(sg);
                if (!next_sg)
                        break;

                end_addr    = start_addr + ib_sg_dma_len(ibdev, sg);
                start_addr  = ib_sg_dma_address(ibdev, next_sg);

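                /* the next element starts exactly where this one ends, so the
                 * two are physically contiguous and no alignment check is
                 * needed at this boundary */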
                if (end_addr == start_addr) {
                        start_check = 0;
                        continue;
                } else
                        start_check = 1;

                if (!IS_4K_ALIGNED(end_addr))
                        break;
        }
        ret_len = (next_sg) ? i : i + 1;
        iser_dbg("Found %d aligned entries out of %d in sg:0x%p\n",
                 ret_len, data->dma_nents, data);
        return ret_len;
}

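/* iser_data_buf_dump - dump the DMA mapped entries of an iser data buffer (debug aid) */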
static void iser_data_buf_dump(struct iser_data_buf *data,
                               struct ib_device *ibdev)
{
        struct scatterlist *sgl = (struct scatterlist *)data->buf;
        struct scatterlist *sg;
        int i;

        if (iser_debug_level == 0)
                return;

        for_each_sg(sgl, sg, data->dma_nents, i)
                iser_warn("sg[%d] dma_addr:0x%lX page:0x%p "
                          "off:0x%x sz:0x%x dma_len:0x%x\n",
                          i, (unsigned long)ib_sg_dma_address(ibdev, sg),
                          sg_page(sg), sg->offset,
                          sg->length, ib_sg_dma_len(ibdev, sg));
}

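/* iser_dump_page_vec - dump the contents of a page vector (debug aid) */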
static void iser_dump_page_vec(struct iser_page_vec *page_vec)
{
        int i;

        iser_err("page vec length %d data size %d\n",
                 page_vec->length, page_vec->data_size);
        for (i = 0; i < page_vec->length; i++)
                iser_err("%d %lx\n", i, (unsigned long)page_vec->pages[i]);
}

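/*
 * iser_page_vec_build - translate a DMA mapped scatterlist into a page
 * vector, sanity checking that the vector covers the whole data size
 */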
static void iser_page_vec_build(struct iser_data_buf *data,
                                struct iser_page_vec *page_vec,
                                struct ib_device *ibdev)
{
        int page_vec_len = 0;

        page_vec->length = 0;
        page_vec->offset = 0;

        iser_dbg("Translating sg sz: %d\n", data->dma_nents);
        page_vec_len = iser_sg_to_page_vec(data, page_vec, ibdev);
        iser_dbg("sg len %d page_vec_len %d\n", data->dma_nents, page_vec_len);

        page_vec->length = page_vec_len;

        if (page_vec_len * SIZE_4K < page_vec->data_size) {
                iser_err("page_vec too short to hold this SG\n");
                iser_data_buf_dump(data, ibdev);
                iser_dump_page_vec(page_vec);
                BUG();
        }
}

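/*
 * iser_dma_map_task_data - DMA map the task data buffer for the given
 * direction and record the number of mapped entries
 */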
int iser_dma_map_task_data(struct iscsi_iser_task *iser_task,
                            struct iser_data_buf *data,
                            enum iser_data_dir iser_dir,
                            enum dma_data_direction dma_dir)
{
        struct ib_device *dev;

        iser_task->dir[iser_dir] = 1;
        dev = iser_task->iser_conn->ib_conn->device->ib_device;

        data->dma_nents = ib_dma_map_sg(dev, data->buf, data->size, dma_dir);
        if (data->dma_nents == 0) {
                iser_err("dma_map_sg failed!!!\n");
                return -EINVAL;
        }
        return 0;
}

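/* iser_dma_unmap_task_data - DMA unmap whichever data directions were mapped for the task */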
void iser_dma_unmap_task_data(struct iscsi_iser_task *iser_task)
{
        struct ib_device *dev;
        struct iser_data_buf *data;

        dev = iser_task->iser_conn->ib_conn->device->ib_device;

        if (iser_task->dir[ISER_DIR_IN]) {
                data = &iser_task->data[ISER_DIR_IN];
                ib_dma_unmap_sg(dev, data->buf, data->size, DMA_FROM_DEVICE);
        }

        if (iser_task->dir[ISER_DIR_OUT]) {
                data = &iser_task->data[ISER_DIR_OUT];
                ib_dma_unmap_sg(dev, data->buf, data->size, DMA_TO_DEVICE);
        }
}

/**
 * iser_reg_rdma_mem - Registers memory intended for RDMA,
 * obtaining rkey and va
 *
 * returns 0 on success, errno code on failure
 */
int iser_reg_rdma_mem(struct iscsi_iser_task *iser_task,
                      enum   iser_data_dir        cmd_dir)
{
        struct iscsi_conn    *iscsi_conn = iser_task->iser_conn->iscsi_conn;
        struct iser_conn     *ib_conn = iser_task->iser_conn->ib_conn;
        struct iser_device   *device = ib_conn->device;
        struct ib_device     *ibdev = device->ib_device;
        struct iser_data_buf *mem = &iser_task->data[cmd_dir];
        struct iser_regd_buf *regd_buf;
        int aligned_len;
        int err;
        int i;
        struct scatterlist *sg;

        regd_buf = &iser_task->rdma_regd[cmd_dir];

        aligned_len = iser_data_buf_aligned_len(mem, ibdev);
        if (aligned_len != mem->dma_nents) {
                iscsi_conn->fmr_unalign_cnt++;
                iser_warn("rdma alignment violation %d/%d aligned\n",
                          aligned_len, mem->size);
                iser_data_buf_dump(mem, ibdev);

                /* unmap the command data before accessing it */
                iser_dma_unmap_task_data(iser_task);

                /* allocate copy buf, if we are writing, copy the
                 * unaligned scatterlist, dma map the copy */
                if (iser_start_rdma_unaligned_sg(iser_task, cmd_dir) != 0)
                        return -ENOMEM;
                mem = &iser_task->data_copy[cmd_dir];
        }

        /* if there is a single dma entry, FMR is not needed */
        if (mem->dma_nents == 1) {
                sg = (struct scatterlist *)mem->buf;

                regd_buf->reg.lkey = device->mr->lkey;
                regd_buf->reg.rkey = device->mr->rkey;
                regd_buf->reg.len  = ib_sg_dma_len(ibdev, &sg[0]);
                regd_buf->reg.va   = ib_sg_dma_address(ibdev, &sg[0]);
                regd_buf->reg.is_fmr = 0;

                iser_dbg("PHYSICAL Mem.register: lkey: 0x%08X rkey: 0x%08X  "
                         "va: 0x%08lX sz: %ld]\n",
                         (unsigned int)regd_buf->reg.lkey,
                         (unsigned int)regd_buf->reg.rkey,
                         (unsigned long)regd_buf->reg.va,
                         (unsigned long)regd_buf->reg.len);
        } else { /* use FMR for multiple dma entries */
                iser_page_vec_build(mem, ib_conn->page_vec, ibdev);
                err = iser_reg_page_vec(ib_conn, ib_conn->page_vec, &regd_buf->reg);
                if (err) {
                        iser_data_buf_dump(mem, ibdev);
                        iser_err("mem->dma_nents = %d (dlength = 0x%x)\n",
                                 mem->dma_nents,
                                 ntoh24(iser_task->desc.iscsi_header.dlength));
                        iser_err("page_vec: data_size = 0x%x, length = %d, offset = 0x%x\n",
                                 ib_conn->page_vec->data_size, ib_conn->page_vec->length,
                                 ib_conn->page_vec->offset);
                        for (i = 0; i < ib_conn->page_vec->length; i++)
                                iser_err("page_vec[%d] = 0x%llx\n", i,
                                         (unsigned long long) ib_conn->page_vec->pages[i]);
                        return err;
                }
        }
        return 0;
}