linux/drivers/infiniband/core/cq.c
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2015 HGST, a Western Digital Company.
 */
#include <linux/module.h>
#include <linux/err.h>
#include <linux/slab.h>
#include <rdma/ib_verbs.h>

#include "core_priv.h"

#include <trace/events/rdma_core.h>
/* Max size for shared CQ, may require tuning */
#define IB_MAX_SHARED_CQ_SZ             4096U

/* # of WCs to poll for with a single call to ib_poll_cq */
#define IB_POLL_BATCH                   16
#define IB_POLL_BATCH_DIRECT            8

/* # of WCs to iterate over before yielding */
#define IB_POLL_BUDGET_IRQ              256
#define IB_POLL_BUDGET_WORKQUEUE        65536

#define IB_POLL_FLAGS \
        (IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS)

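/*
 * Completion moderation profiles used by the RDMA dynamic interrupt
 * moderation (DIM) worker below.  Each entry lists the struct dim_cq_moder
 * fields in declaration order ({usec, pkts, comps, cq_period_mode}); only
 * .usec and .comps are consumed by ib_cq_rdma_dim_work().
 */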
static const struct dim_cq_moder
rdma_dim_prof[RDMA_DIM_PARAMS_NUM_PROFILES] = {
        {1,   0, 1,  0},
        {1,   0, 4,  0},
        {2,   0, 4,  0},
        {2,   0, 8,  0},
        {4,   0, 8,  0},
        {16,  0, 8,  0},
        {16,  0, 16, 0},
        {32,  0, 16, 0},
        {32,  0, 32, 0},
};

static void ib_cq_rdma_dim_work(struct work_struct *w)
{
        struct dim *dim = container_of(w, struct dim, work);
        struct ib_cq *cq = dim->priv;

        u16 usec = rdma_dim_prof[dim->profile_ix].usec;
        u16 comps = rdma_dim_prof[dim->profile_ix].comps;

        dim->state = DIM_START_MEASURE;

        trace_cq_modify(cq, comps, usec);
        cq->device->ops.modify_cq(cq, comps, usec);
}

static void rdma_dim_init(struct ib_cq *cq)
{
        struct dim *dim;

        if (!cq->device->ops.modify_cq || !cq->device->use_cq_dim ||
            cq->poll_ctx == IB_POLL_DIRECT)
                return;

        dim = kzalloc(sizeof(struct dim), GFP_KERNEL);
        if (!dim)
                return;

        dim->state = DIM_START_MEASURE;
        dim->tune_state = DIM_GOING_RIGHT;
        dim->profile_ix = RDMA_DIM_START_PROFILE;
        dim->priv = cq;
        cq->dim = dim;

        INIT_WORK(&dim->work, ib_cq_rdma_dim_work);
}

static void rdma_dim_destroy(struct ib_cq *cq)
{
        if (!cq->dim)
                return;

        cancel_work_sync(&cq->dim->work);
        kfree(cq->dim);
}

static int __poll_cq(struct ib_cq *cq, int num_entries, struct ib_wc *wc)
{
        int rc;

        rc = ib_poll_cq(cq, num_entries, wc);
        trace_cq_poll(cq, num_entries, rc);
        return rc;
}

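/*
 * Drain up to @budget completions from @cq into the @wcs array, at most
 * @batch at a time, dispatching each to its wr_cqe->done() handler.  A
 * @budget of -1 means "no limit".  Worked example: the softirq path calls
 * this with budget = IB_POLL_BUDGET_IRQ (256) and batch = IB_POLL_BATCH
 * (16), so a single pass makes at most 16 calls to ib_poll_cq() before the
 * irq_poll budget is exhausted.
 */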
static int __ib_process_cq(struct ib_cq *cq, int budget, struct ib_wc *wcs,
                           int batch)
{
        int i, n, completed = 0;

        trace_cq_process(cq);

        /*
         * budget might be (-1) if the caller does not
         * want to bound this call, thus we need unsigned
         * minimum here.
         */
        while ((n = __poll_cq(cq, min_t(u32, batch,
                                        budget - completed), wcs)) > 0) {
                for (i = 0; i < n; i++) {
                        struct ib_wc *wc = &wcs[i];

                        if (wc->wr_cqe)
                                wc->wr_cqe->done(cq, wc);
                        else
                                WARN_ON_ONCE(wc->status == IB_WC_SUCCESS);
                }

                completed += n;

                if (n != batch || (budget != -1 && completed >= budget))
                        break;
        }

        return completed;
}

/**
 * ib_process_cq_direct - process a CQ in caller context
 * @cq:         CQ to process
 * @budget:     number of CQEs to poll for
 *
 * This function is used to process all outstanding CQ entries.
 * It does not offload CQ processing to a different context and does
 * not ask for completion interrupts from the HCA.
 * Using direct processing on a CQ whose poll context is not IB_POLL_DIRECT
 * may trigger concurrent processing.
 *
 * Note: do not pass -1 as %budget unless it is guaranteed that the number
 * of completions that will be processed is small.
 */
int ib_process_cq_direct(struct ib_cq *cq, int budget)
{
        struct ib_wc wcs[IB_POLL_BATCH_DIRECT];

        return __ib_process_cq(cq, budget, wcs, IB_POLL_BATCH_DIRECT);
}
EXPORT_SYMBOL(ib_process_cq_direct);
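
/*
 * Illustrative sketch (not part of the original file): a ULP that owns an
 * IB_POLL_DIRECT CQ embeds a struct ib_cqe in each request, points
 * wr->wr_cqe at it when posting, and then reaps completions from its own
 * context with ib_process_cq_direct().  The example_* names and the
 * zero-length receive are hypothetical.
 */
struct example_request {
        struct ib_cqe   cqe;            /* wr->wr_cqe points here when posting */
};

static void example_done(struct ib_cq *cq, struct ib_wc *wc)
{
        struct example_request *req =
                container_of(wc->wr_cqe, struct example_request, cqe);

        if (wc->status != IB_WC_SUCCESS)
                pr_err("example request failed: %s\n",
                       ib_wc_status_msg(wc->status));
        kfree(req);                     /* assumes req was kmalloc()ed */
}

static int example_post_and_reap(struct ib_qp *qp, struct ib_cq *cq,
                                 struct example_request *req)
{
        struct ib_recv_wr wr = { .wr_cqe = &req->cqe };
        int ret;

        req->cqe.done = example_done;   /* dispatched by __ib_process_cq() */
        ret = ib_post_recv(qp, &wr, NULL);
        if (ret)
                return ret;

        /* bounded budget; never pass -1 unless few completions can be pending */
        return ib_process_cq_direct(cq, IB_POLL_BATCH_DIRECT);
}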

static void ib_cq_completion_direct(struct ib_cq *cq, void *private)
{
        WARN_ONCE(1, "got unsolicited completion for CQ 0x%p\n", cq);
}

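/*
 * irq_poll (softirq) callback: drain up to @budget completions.  If the CQ
 * was not fully drained, irq_poll keeps it scheduled.  Otherwise the poll
 * pass is completed and the CQ is re-armed; a positive return from
 * ib_req_notify_cq() means completions slipped in meanwhile, so polling is
 * rescheduled rather than lost.
 */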
static int ib_poll_handler(struct irq_poll *iop, int budget)
{
        struct ib_cq *cq = container_of(iop, struct ib_cq, iop);
        struct dim *dim = cq->dim;
        int completed;

        completed = __ib_process_cq(cq, budget, cq->wc, IB_POLL_BATCH);
        if (completed < budget) {
                irq_poll_complete(&cq->iop);
                if (ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0) {
                        trace_cq_reschedule(cq);
                        irq_poll_sched(&cq->iop);
                }
        }

        if (dim)
                rdma_dim(dim, completed);

        return completed;
}

static void ib_cq_completion_softirq(struct ib_cq *cq, void *private)
{
        trace_cq_schedule(cq);
        irq_poll_sched(&cq->iop);
}

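/*
 * Workqueue variant: reap up to IB_POLL_BUDGET_WORKQUEUE completions per
 * work item.  The work is requeued if the budget was exhausted or if
 * re-arming the CQ reports missed events; otherwise the completion count
 * is fed to DIM (when enabled) to adapt interrupt moderation.
 */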
static void ib_cq_poll_work(struct work_struct *work)
{
        struct ib_cq *cq = container_of(work, struct ib_cq, work);
        int completed;

        completed = __ib_process_cq(cq, IB_POLL_BUDGET_WORKQUEUE, cq->wc,
                                    IB_POLL_BATCH);
        if (completed >= IB_POLL_BUDGET_WORKQUEUE ||
            ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0)
                queue_work(cq->comp_wq, &cq->work);
        else if (cq->dim)
                rdma_dim(cq->dim, completed);
}

static void ib_cq_completion_workqueue(struct ib_cq *cq, void *private)
{
        trace_cq_schedule(cq);
        queue_work(cq->comp_wq, &cq->work);
}

/**
 * __ib_alloc_cq_user - allocate a completion queue
 * @dev:                device to allocate the CQ for
 * @private:            driver private data, accessible from cq->cq_context
 * @nr_cqe:             number of CQEs to allocate
 * @comp_vector:        HCA completion vector for this CQ
 * @poll_ctx:           context to poll the CQ from
 * @caller:             module owner name
 * @udata:              valid user data or NULL for kernel objects
 *
 * This is the proper interface to allocate a CQ for in-kernel users. A
 * CQ allocated with this interface will automatically be polled from the
 * specified context. The ULP must use wr->wr_cqe instead of wr->wr_id
 * to use this CQ abstraction.
 */
struct ib_cq *__ib_alloc_cq_user(struct ib_device *dev, void *private,
                                 int nr_cqe, int comp_vector,
                                 enum ib_poll_context poll_ctx,
                                 const char *caller, struct ib_udata *udata)
{
        struct ib_cq_init_attr cq_attr = {
                .cqe            = nr_cqe,
                .comp_vector    = comp_vector,
        };
        struct ib_cq *cq;
        int ret = -ENOMEM;

        cq = rdma_zalloc_drv_obj(dev, ib_cq);
        if (!cq)
                return ERR_PTR(ret);

        cq->device = dev;
        cq->cq_context = private;
        cq->poll_ctx = poll_ctx;
        atomic_set(&cq->usecnt, 0);
        cq->comp_vector = comp_vector;

        cq->wc = kmalloc_array(IB_POLL_BATCH, sizeof(*cq->wc), GFP_KERNEL);
        if (!cq->wc)
                goto out_free_cq;

        cq->res.type = RDMA_RESTRACK_CQ;
        rdma_restrack_set_task(&cq->res, caller);

        ret = dev->ops.create_cq(cq, &cq_attr, NULL);
        if (ret)
                goto out_free_wc;

        rdma_restrack_kadd(&cq->res);

        rdma_dim_init(cq);

        switch (cq->poll_ctx) {
        case IB_POLL_DIRECT:
                cq->comp_handler = ib_cq_completion_direct;
                break;
        case IB_POLL_SOFTIRQ:
                cq->comp_handler = ib_cq_completion_softirq;

                irq_poll_init(&cq->iop, IB_POLL_BUDGET_IRQ, ib_poll_handler);
                ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
                break;
        case IB_POLL_WORKQUEUE:
        case IB_POLL_UNBOUND_WORKQUEUE:
                cq->comp_handler = ib_cq_completion_workqueue;
                INIT_WORK(&cq->work, ib_cq_poll_work);
                ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
                cq->comp_wq = (cq->poll_ctx == IB_POLL_WORKQUEUE) ?
                                ib_comp_wq : ib_comp_unbound_wq;
                break;
        default:
                ret = -EINVAL;
                goto out_destroy_cq;
        }

        trace_cq_alloc(cq, nr_cqe, comp_vector, poll_ctx);
        return cq;

out_destroy_cq:
        rdma_dim_destroy(cq);
        rdma_restrack_del(&cq->res);
        cq->device->ops.destroy_cq(cq, udata);
out_free_wc:
        kfree(cq->wc);
out_free_cq:
        kfree(cq);
        trace_cq_alloc_error(nr_cqe, comp_vector, poll_ctx, ret);
        return ERR_PTR(ret);
}
EXPORT_SYMBOL(__ib_alloc_cq_user);
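
/*
 * Illustrative sketch (not part of the original file): a typical in-kernel
 * user allocates its CQ through the ib_alloc_cq() wrapper (also used by
 * ib_alloc_cqs() below) and releases it with ib_free_cq().  The
 * example_setup_cq name and the CQE count are hypothetical.
 */
static struct ib_cq *example_setup_cq(struct ib_device *dev, void *priv,
                                      int comp_vector)
{
        struct ib_cq *cq;

        /* 128 CQEs, completions reaped from softirq (irq_poll) context */
        cq = ib_alloc_cq(dev, priv, 128, comp_vector, IB_POLL_SOFTIRQ);
        if (IS_ERR(cq))
                return cq;

        /* ... create QPs that post WRs carrying wr_cqe onto this CQ ... */

        return cq;      /* pair with ib_free_cq(cq) on teardown */
}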

/**
 * __ib_alloc_cq_any - allocate a completion queue
 * @dev:                device to allocate the CQ for
 * @private:            driver private data, accessible from cq->cq_context
 * @nr_cqe:             number of CQEs to allocate
 * @poll_ctx:           context to poll the CQ from
 * @caller:             module owner name
 *
 * Attempt to spread ULP Completion Queues over each device's interrupt
 * vectors. A simple best-effort mechanism is used.
 */
struct ib_cq *__ib_alloc_cq_any(struct ib_device *dev, void *private,
                                int nr_cqe, enum ib_poll_context poll_ctx,
                                const char *caller)
{
        static atomic_t counter;
        int comp_vector = 0;

        if (dev->num_comp_vectors > 1)
                comp_vector =
                        atomic_inc_return(&counter) %
                        min_t(int, dev->num_comp_vectors, num_online_cpus());

        return __ib_alloc_cq_user(dev, private, nr_cqe, comp_vector, poll_ctx,
                                  caller, NULL);
}
EXPORT_SYMBOL(__ib_alloc_cq_any);
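
/*
 * Illustrative sketch (not part of the original file): a ULP that does not
 * care about completion vector placement lets the core spread CQs across
 * vectors.  The example calls the exported __ib_alloc_cq_any() directly;
 * callers normally go through a convenience wrapper that fills in @caller.
 * The example_alloc_any name and the CQE count are hypothetical.
 */
static struct ib_cq *example_alloc_any(struct ib_device *dev, void *priv)
{
        /* 256 CQEs, completions handled from the bound completion workqueue */
        return __ib_alloc_cq_any(dev, priv, 256, IB_POLL_WORKQUEUE,
                                 KBUILD_MODNAME);
}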

/**
 * ib_free_cq_user - free a completion queue
 * @cq:         completion queue to free.
 * @udata:      User data or NULL for kernel object
 */
void ib_free_cq_user(struct ib_cq *cq, struct ib_udata *udata)
{
        if (WARN_ON_ONCE(atomic_read(&cq->usecnt)))
                return;
        if (WARN_ON_ONCE(cq->cqe_used))
                return;

        switch (cq->poll_ctx) {
        case IB_POLL_DIRECT:
                break;
        case IB_POLL_SOFTIRQ:
                irq_poll_disable(&cq->iop);
                break;
        case IB_POLL_WORKQUEUE:
        case IB_POLL_UNBOUND_WORKQUEUE:
                cancel_work_sync(&cq->work);
                break;
        default:
                WARN_ON_ONCE(1);
        }

        rdma_dim_destroy(cq);
        trace_cq_free(cq);
        rdma_restrack_del(&cq->res);
        cq->device->ops.destroy_cq(cq, udata);
        kfree(cq->wc);
        kfree(cq);
}
EXPORT_SYMBOL(ib_free_cq_user);

void ib_cq_pool_init(struct ib_device *dev)
{
        unsigned int i;

        spin_lock_init(&dev->cq_pools_lock);
        for (i = 0; i < ARRAY_SIZE(dev->cq_pools); i++)
                INIT_LIST_HEAD(&dev->cq_pools[i]);
}

void ib_cq_pool_destroy(struct ib_device *dev)
{
        struct ib_cq *cq, *n;
        unsigned int i;

        for (i = 0; i < ARRAY_SIZE(dev->cq_pools); i++) {
                list_for_each_entry_safe(cq, n, &dev->cq_pools[i],
                                         pool_entry) {
                        WARN_ON(cq->cqe_used);
                        cq->shared = false;
                        ib_free_cq(cq);
                }
        }
}

static int ib_alloc_cqs(struct ib_device *dev, unsigned int nr_cqes,
                        enum ib_poll_context poll_ctx)
{
        LIST_HEAD(tmp_list);
        unsigned int nr_cqs, i;
        struct ib_cq *cq, *n;
        int ret;

        if (poll_ctx > IB_POLL_LAST_POOL_TYPE) {
                WARN_ON_ONCE(poll_ctx > IB_POLL_LAST_POOL_TYPE);
                return -EINVAL;
        }

        /*
         * Allocate at least as many CQEs as requested, and otherwise
         * a reasonable batch size so that we can share CQs between
         * multiple users instead of allocating a larger number of CQs.
         */
        nr_cqes = min_t(unsigned int, dev->attrs.max_cqe,
                        max(nr_cqes, IB_MAX_SHARED_CQ_SZ));
        nr_cqs = min_t(unsigned int, dev->num_comp_vectors, num_online_cpus());
        for (i = 0; i < nr_cqs; i++) {
                cq = ib_alloc_cq(dev, NULL, nr_cqes, i, poll_ctx);
                if (IS_ERR(cq)) {
                        ret = PTR_ERR(cq);
                        goto out_free_cqs;
                }
                cq->shared = true;
                list_add_tail(&cq->pool_entry, &tmp_list);
        }

        spin_lock_irq(&dev->cq_pools_lock);
        list_splice(&tmp_list, &dev->cq_pools[poll_ctx]);
        spin_unlock_irq(&dev->cq_pools_lock);

        return 0;

out_free_cqs:
        list_for_each_entry_safe(cq, n, &tmp_list, pool_entry) {
                cq->shared = false;
                ib_free_cq(cq);
        }
        return ret;
}

/**
 * ib_cq_pool_get() - Find the least used completion queue that matches
 *   a given cpu hint (or least used for wildcard affinity) and fits
 *   nr_cqe.
 * @dev: rdma device
 * @nr_cqe: number of needed cqe entries
 * @comp_vector_hint: completion vector hint (-1) for the driver to assign
 *   a comp vector based on internal counter
 * @poll_ctx: cq polling context
 *
 * Finds a cq that satisfies @comp_vector_hint and @nr_cqe requirements and
 * claims entries in it for us.  If no suitable cq is available, a new cq
 * is allocated according to the requirements and added to the device pool.
 * IB_POLL_DIRECT cannot be used for shared cqs so it is not a valid value
 * for @poll_ctx.
 */
struct ib_cq *ib_cq_pool_get(struct ib_device *dev, unsigned int nr_cqe,
                             int comp_vector_hint,
                             enum ib_poll_context poll_ctx)
{
        static unsigned int default_comp_vector;
        unsigned int vector, num_comp_vectors;
        struct ib_cq *cq, *found = NULL;
        int ret;

        if (poll_ctx > IB_POLL_LAST_POOL_TYPE) {
                WARN_ON_ONCE(poll_ctx > IB_POLL_LAST_POOL_TYPE);
                return ERR_PTR(-EINVAL);
        }

        num_comp_vectors =
                min_t(unsigned int, dev->num_comp_vectors, num_online_cpus());
        /* Project the affinity to the device completion vector range */
        if (comp_vector_hint < 0) {
                comp_vector_hint =
                        (READ_ONCE(default_comp_vector) + 1) % num_comp_vectors;
                WRITE_ONCE(default_comp_vector, comp_vector_hint);
        }
        vector = comp_vector_hint % num_comp_vectors;

        /*
         * Find the least used CQ with correct affinity and
         * enough free CQ entries
         */
        while (!found) {
                spin_lock_irq(&dev->cq_pools_lock);
                list_for_each_entry(cq, &dev->cq_pools[poll_ctx],
                                    pool_entry) {
                        /*
                         * Check to see if we have found a CQ with the
                         * correct completion vector
                         */
                        if (vector != cq->comp_vector)
                                continue;
                        if (cq->cqe_used + nr_cqe > cq->cqe)
                                continue;
                        found = cq;
                        break;
                }

                if (found) {
                        found->cqe_used += nr_cqe;
                        spin_unlock_irq(&dev->cq_pools_lock);

                        return found;
                }
                spin_unlock_irq(&dev->cq_pools_lock);

                /*
                 * Didn't find a match or ran out of CQs in the device
                 * pool, allocate a new array of CQs.
                 */
                ret = ib_alloc_cqs(dev, nr_cqe, poll_ctx);
                if (ret)
                        return ERR_PTR(ret);
        }

        return found;
}
EXPORT_SYMBOL(ib_cq_pool_get);

/**
 * ib_cq_pool_put - Return a CQ taken from a shared pool.
 * @cq: The CQ to return.
 * @nr_cqe: The max number of cqes that the user had requested.
 */
void ib_cq_pool_put(struct ib_cq *cq, unsigned int nr_cqe)
{
        if (WARN_ON_ONCE(nr_cqe > cq->cqe_used))
                return;

        spin_lock_irq(&cq->device->cq_pools_lock);
        cq->cqe_used -= nr_cqe;
        spin_unlock_irq(&cq->device->cq_pools_lock);
}
EXPORT_SYMBOL(ib_cq_pool_put);
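
/*
 * Illustrative sketch (not part of the original file): a user of the shared
 * CQ pool claims room for its expected number of outstanding completions
 * with ib_cq_pool_get() and returns exactly the same count with
 * ib_cq_pool_put() on teardown.  The example_attach/example_detach names
 * and the queue depth parameter are hypothetical.
 */
static struct ib_cq *example_attach(struct ib_device *dev, unsigned int depth)
{
        /* -1: let the core pick a completion vector round-robin */
        return ib_cq_pool_get(dev, depth, -1, IB_POLL_WORKQUEUE);
}

static void example_detach(struct ib_cq *cq, unsigned int depth)
{
        /* must pass the same nr_cqe that was claimed via ib_cq_pool_get() */
        ib_cq_pool_put(cq, depth);
}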