linux/drivers/infiniband/core/cq.c
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2015 HGST, a Western Digital Company.
 */
#include <linux/module.h>
#include <linux/err.h>
#include <linux/slab.h>
#include <rdma/ib_verbs.h>

#include "core_priv.h"

#include <trace/events/rdma_core.h>

/* Max size for shared CQ, may require tuning */
#define IB_MAX_SHARED_CQ_SZ             4096U

/* # of WCs to poll for with a single call to ib_poll_cq */
#define IB_POLL_BATCH                   16
#define IB_POLL_BATCH_DIRECT            8

/* # of WCs to iterate over before yielding */
#define IB_POLL_BUDGET_IRQ              256
#define IB_POLL_BUDGET_WORKQUEUE        65536

#define IB_POLL_FLAGS \
        (IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS)
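
/*
 * Completion moderation profiles stepped through by RDMA DIM. Each row is a
 * positional struct dim_cq_moder initializer, i.e. (per the field order in
 * <linux/dim.h>) {usec, pkts, comps, cq_period_mode}: higher-indexed profiles
 * wait longer and/or batch more completions per interrupt.
 * ib_cq_rdma_dim_work() below applies the chosen profile via modify_cq().
 */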
static const struct dim_cq_moder
rdma_dim_prof[RDMA_DIM_PARAMS_NUM_PROFILES] = {
        {1,   0, 1,  0},
        {1,   0, 4,  0},
        {2,   0, 4,  0},
        {2,   0, 8,  0},
        {4,   0, 8,  0},
        {16,  0, 8,  0},
        {16,  0, 16, 0},
        {32,  0, 16, 0},
        {32,  0, 32, 0},
};

static void ib_cq_rdma_dim_work(struct work_struct *w)
{
        struct dim *dim = container_of(w, struct dim, work);
        struct ib_cq *cq = dim->priv;

        u16 usec = rdma_dim_prof[dim->profile_ix].usec;
        u16 comps = rdma_dim_prof[dim->profile_ix].comps;

        dim->state = DIM_START_MEASURE;

        trace_cq_modify(cq, comps, usec);
        cq->device->ops.modify_cq(cq, comps, usec);
}

static void rdma_dim_init(struct ib_cq *cq)
{
        struct dim *dim;

        if (!cq->device->ops.modify_cq || !cq->device->use_cq_dim ||
            cq->poll_ctx == IB_POLL_DIRECT)
                return;

        dim = kzalloc(sizeof(struct dim), GFP_KERNEL);
        if (!dim)
                return;

        dim->state = DIM_START_MEASURE;
        dim->tune_state = DIM_GOING_RIGHT;
        dim->profile_ix = RDMA_DIM_START_PROFILE;
        dim->priv = cq;
        cq->dim = dim;

        INIT_WORK(&dim->work, ib_cq_rdma_dim_work);
}

static void rdma_dim_destroy(struct ib_cq *cq)
{
        if (!cq->dim)
                return;

        cancel_work_sync(&cq->dim->work);
        kfree(cq->dim);
}

static int __poll_cq(struct ib_cq *cq, int num_entries, struct ib_wc *wc)
{
        int rc;

        rc = ib_poll_cq(cq, num_entries, wc);
        trace_cq_poll(cq, num_entries, rc);
        return rc;
}

static int __ib_process_cq(struct ib_cq *cq, int budget, struct ib_wc *wcs,
                           int batch)
{
        int i, n, completed = 0;

        trace_cq_process(cq);

        /*
         * budget might be (-1) if the caller does not
         * want to bound this call, thus we need unsigned
         * minimum here.
         */
        while ((n = __poll_cq(cq, min_t(u32, batch,
                                        budget - completed), wcs)) > 0) {
                for (i = 0; i < n; i++) {
                        struct ib_wc *wc = &wcs[i];

                        if (wc->wr_cqe)
                                wc->wr_cqe->done(cq, wc);
                        else
                                WARN_ON_ONCE(wc->status == IB_WC_SUCCESS);
                }

                completed += n;

                if (n != batch || (budget != -1 && completed >= budget))
                        break;
        }

        return completed;
}

/**
 * ib_process_cq_direct - process a CQ in caller context
 * @cq:         CQ to process
 * @budget:     number of CQEs to poll for
 *
 * This function is used to process all outstanding CQ entries.
 * It does not offload CQ processing to a different context and does
 * not ask for completion interrupts from the HCA.
 * Using direct processing on a CQ whose poll context is not IB_POLL_DIRECT
 * may trigger concurrent processing.
 *
 * Note: do not pass -1 as %budget unless it is guaranteed that the number
 * of completions that will be processed is small.
 */
int ib_process_cq_direct(struct ib_cq *cq, int budget)
{
        struct ib_wc wcs[IB_POLL_BATCH_DIRECT];

        return __ib_process_cq(cq, budget, wcs, IB_POLL_BATCH_DIRECT);
}
EXPORT_SYMBOL(ib_process_cq_direct);
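
/*
 * Usage sketch (illustrative only, not part of this file): a ULP that owns
 * an IB_POLL_DIRECT CQ reaps completions from its own context. The struct
 * and function names below are hypothetical.
 *
 *      static void my_done(struct ib_cq *cq, struct ib_wc *wc)
 *      {
 *              struct my_request *req =
 *                      container_of(wc->wr_cqe, struct my_request, cqe);
 *
 *              my_complete_request(req, wc->status);
 *      }
 *
 *      static void my_reap(struct my_queue *q)
 *      {
 *              ib_process_cq_direct(q->cq, 16);
 *      }
 *
 * The CQ would have been created with
 * ib_alloc_cq(dev, q, nr_cqe, 0, IB_POLL_DIRECT), and every posted WR must
 * carry wr_cqe pointing at a struct ib_cqe whose ->done is my_done().
 */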

static void ib_cq_completion_direct(struct ib_cq *cq, void *private)
{
        WARN_ONCE(1, "got unsolicited completion for CQ 0x%p\n", cq);
}

static int ib_poll_handler(struct irq_poll *iop, int budget)
{
        struct ib_cq *cq = container_of(iop, struct ib_cq, iop);
        struct dim *dim = cq->dim;
        int completed;

        completed = __ib_process_cq(cq, budget, cq->wc, IB_POLL_BATCH);
        if (completed < budget) {
                irq_poll_complete(&cq->iop);
                if (ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0) {
                        trace_cq_reschedule(cq);
                        irq_poll_sched(&cq->iop);
                }
        }

        if (dim)
                rdma_dim(dim, completed);

        return completed;
}

static void ib_cq_completion_softirq(struct ib_cq *cq, void *private)
{
        trace_cq_schedule(cq);
        irq_poll_sched(&cq->iop);
}

static void ib_cq_poll_work(struct work_struct *work)
{
        struct ib_cq *cq = container_of(work, struct ib_cq, work);
        int completed;

        completed = __ib_process_cq(cq, IB_POLL_BUDGET_WORKQUEUE, cq->wc,
                                    IB_POLL_BATCH);
        if (completed >= IB_POLL_BUDGET_WORKQUEUE ||
            ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0)
                queue_work(cq->comp_wq, &cq->work);
        else if (cq->dim)
                rdma_dim(cq->dim, completed);
}

static void ib_cq_completion_workqueue(struct ib_cq *cq, void *private)
{
        trace_cq_schedule(cq);
        queue_work(cq->comp_wq, &cq->work);
}

/**
 * __ib_alloc_cq - allocate a completion queue
 * @dev:                device to allocate the CQ for
 * @private:            driver private data, accessible from cq->cq_context
 * @nr_cqe:             number of CQEs to allocate
 * @comp_vector:        HCA completion vectors for this CQ
 * @poll_ctx:           context to poll the CQ from.
 * @caller:             module owner name.
 *
 * This is the proper interface to allocate a CQ for in-kernel users. A
 * CQ allocated with this interface will automatically be polled from the
 * specified context. The ULP must use wr->wr_cqe instead of wr->wr_id
 * to use this CQ abstraction.
 */
struct ib_cq *__ib_alloc_cq(struct ib_device *dev, void *private, int nr_cqe,
                            int comp_vector, enum ib_poll_context poll_ctx,
                            const char *caller)
{
        struct ib_cq_init_attr cq_attr = {
                .cqe            = nr_cqe,
                .comp_vector    = comp_vector,
        };
        struct ib_cq *cq;
        int ret = -ENOMEM;

        cq = rdma_zalloc_drv_obj(dev, ib_cq);
        if (!cq)
                return ERR_PTR(ret);

        cq->device = dev;
        cq->cq_context = private;
        cq->poll_ctx = poll_ctx;
        atomic_set(&cq->usecnt, 0);
        cq->comp_vector = comp_vector;

        cq->wc = kmalloc_array(IB_POLL_BATCH, sizeof(*cq->wc), GFP_KERNEL);
        if (!cq->wc)
                goto out_free_cq;

        rdma_restrack_new(&cq->res, RDMA_RESTRACK_CQ);
        rdma_restrack_set_name(&cq->res, caller);

        ret = dev->ops.create_cq(cq, &cq_attr, NULL);
        if (ret)
                goto out_free_wc;

        rdma_dim_init(cq);

        switch (cq->poll_ctx) {
        case IB_POLL_DIRECT:
                cq->comp_handler = ib_cq_completion_direct;
                break;
        case IB_POLL_SOFTIRQ:
                cq->comp_handler = ib_cq_completion_softirq;

                irq_poll_init(&cq->iop, IB_POLL_BUDGET_IRQ, ib_poll_handler);
                ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
                break;
        case IB_POLL_WORKQUEUE:
        case IB_POLL_UNBOUND_WORKQUEUE:
                cq->comp_handler = ib_cq_completion_workqueue;
                INIT_WORK(&cq->work, ib_cq_poll_work);
                ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
                cq->comp_wq = (cq->poll_ctx == IB_POLL_WORKQUEUE) ?
                                ib_comp_wq : ib_comp_unbound_wq;
                break;
        default:
                ret = -EINVAL;
                goto out_destroy_cq;
        }

        rdma_restrack_add(&cq->res);
        trace_cq_alloc(cq, nr_cqe, comp_vector, poll_ctx);
        return cq;

out_destroy_cq:
        rdma_dim_destroy(cq);
        cq->device->ops.destroy_cq(cq, NULL);
out_free_wc:
        rdma_restrack_put(&cq->res);
        kfree(cq->wc);
out_free_cq:
        kfree(cq);
        trace_cq_alloc_error(nr_cqe, comp_vector, poll_ctx, ret);
        return ERR_PTR(ret);
}
EXPORT_SYMBOL(__ib_alloc_cq);
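
/*
 * Usage sketch (illustrative only, not part of this file): typical ULP use
 * of this CQ abstraction. All names other than the core verbs below are
 * hypothetical.
 *
 *      struct my_request {
 *              struct ib_cqe cqe;
 *              struct ib_send_wr wr;
 *      };
 *
 *      static void my_send_done(struct ib_cq *cq, struct ib_wc *wc)
 *      {
 *              struct my_request *req =
 *                      container_of(wc->wr_cqe, struct my_request, cqe);
 *
 *              if (wc->status != IB_WC_SUCCESS)
 *                      my_fail_request(req, wc->status);
 *      }
 *
 *      cq = ib_alloc_cq(dev, ctx, nr_cqe, comp_vector, IB_POLL_WORKQUEUE);
 *      if (IS_ERR(cq))
 *              return PTR_ERR(cq);
 *
 *      req->cqe.done = my_send_done;
 *      req->wr.wr_cqe = &req->cqe;
 *      ret = ib_post_send(qp, &req->wr, NULL);
 *
 * Note that wr_cqe is set instead of wr_id, as this interface requires.
 * Completions are then delivered to my_send_done() from the chosen poll
 * context; the ULP never calls ib_poll_cq() on such a CQ itself.
 */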

/**
 * __ib_alloc_cq_any - allocate a completion queue
 * @dev:                device to allocate the CQ for
 * @private:            driver private data, accessible from cq->cq_context
 * @nr_cqe:             number of CQEs to allocate
 * @poll_ctx:           context to poll the CQ from
 * @caller:             module owner name
 *
 * Attempt to spread ULP Completion Queues over each device's interrupt
 * vectors. A simple best-effort mechanism is used.
 */
struct ib_cq *__ib_alloc_cq_any(struct ib_device *dev, void *private,
                                int nr_cqe, enum ib_poll_context poll_ctx,
                                const char *caller)
{
        static atomic_t counter;
        int comp_vector = 0;

        if (dev->num_comp_vectors > 1)
                comp_vector =
                        atomic_inc_return(&counter) %
                        min_t(int, dev->num_comp_vectors, num_online_cpus());

        return __ib_alloc_cq(dev, private, nr_cqe, comp_vector, poll_ctx,
                             caller);
}
EXPORT_SYMBOL(__ib_alloc_cq_any);
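
/*
 * Usage sketch (illustrative only, not part of this file): ULPs with no
 * affinity requirement of their own can let the core rotate the completion
 * vector for them:
 *
 *      cq = ib_alloc_cq_any(dev, ctx, nr_cqe, IB_POLL_WORKQUEUE);
 *      if (IS_ERR(cq))
 *              return PTR_ERR(cq);
 *      ...
 *      ib_free_cq(cq);
 *
 * Successive callers are spread across min(num_comp_vectors,
 * num_online_cpus()) vectors by the static counter above.
 */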

/**
 * ib_free_cq - free a completion queue
 * @cq:         completion queue to free.
 */
void ib_free_cq(struct ib_cq *cq)
{
        int ret;

        if (WARN_ON_ONCE(atomic_read(&cq->usecnt)))
                return;
        if (WARN_ON_ONCE(cq->cqe_used))
                return;

        switch (cq->poll_ctx) {
        case IB_POLL_DIRECT:
                break;
        case IB_POLL_SOFTIRQ:
                irq_poll_disable(&cq->iop);
                break;
        case IB_POLL_WORKQUEUE:
        case IB_POLL_UNBOUND_WORKQUEUE:
                cancel_work_sync(&cq->work);
                break;
        default:
                WARN_ON_ONCE(1);
        }

        rdma_dim_destroy(cq);
        trace_cq_free(cq);
        ret = cq->device->ops.destroy_cq(cq, NULL);
        WARN_ONCE(ret, "Destroy of kernel CQ shouldn't fail");
        rdma_restrack_del(&cq->res);
        kfree(cq->wc);
        kfree(cq);
}
EXPORT_SYMBOL(ib_free_cq);

void ib_cq_pool_cleanup(struct ib_device *dev)
{
        struct ib_cq *cq, *n;
        unsigned int i;

        for (i = 0; i < ARRAY_SIZE(dev->cq_pools); i++) {
                list_for_each_entry_safe(cq, n, &dev->cq_pools[i],
                                         pool_entry) {
                        WARN_ON(cq->cqe_used);
                        list_del(&cq->pool_entry);
                        cq->shared = false;
                        ib_free_cq(cq);
                }
        }
}

static int ib_alloc_cqs(struct ib_device *dev, unsigned int nr_cqes,
                        enum ib_poll_context poll_ctx)
{
        LIST_HEAD(tmp_list);
        unsigned int nr_cqs, i;
        struct ib_cq *cq, *n;
        int ret;

        if (poll_ctx > IB_POLL_LAST_POOL_TYPE) {
                WARN_ON_ONCE(poll_ctx > IB_POLL_LAST_POOL_TYPE);
                return -EINVAL;
        }

        /*
         * Allocate at least as many CQEs as requested, and otherwise
         * a reasonable batch size so that we can share CQs between
         * multiple users instead of allocating a larger number of CQs.
         */
        nr_cqes = min_t(unsigned int, dev->attrs.max_cqe,
                        max(nr_cqes, IB_MAX_SHARED_CQ_SZ));
        nr_cqs = min_t(unsigned int, dev->num_comp_vectors, num_online_cpus());
        for (i = 0; i < nr_cqs; i++) {
                cq = ib_alloc_cq(dev, NULL, nr_cqes, i, poll_ctx);
                if (IS_ERR(cq)) {
                        ret = PTR_ERR(cq);
                        goto out_free_cqs;
                }
                cq->shared = true;
                list_add_tail(&cq->pool_entry, &tmp_list);
        }

        spin_lock_irq(&dev->cq_pools_lock);
        list_splice(&tmp_list, &dev->cq_pools[poll_ctx]);
        spin_unlock_irq(&dev->cq_pools_lock);

        return 0;

out_free_cqs:
        list_for_each_entry_safe(cq, n, &tmp_list, pool_entry) {
                cq->shared = false;
                ib_free_cq(cq);
        }
        return ret;
}

/**
 * ib_cq_pool_get() - Find the least used completion queue that matches
 *   a given cpu hint (or least used for wild card affinity) and fits
 *   nr_cqe.
 * @dev: rdma device
 * @nr_cqe: number of needed cqe entries
 * @comp_vector_hint: completion vector hint (-1) for the driver to assign
 *   a comp vector based on internal counter
 * @poll_ctx: cq polling context
 *
 * Finds a cq that satisfies @comp_vector_hint and @nr_cqe requirements and
 * claims entries in it for us.  If there is no available cq, a new cq is
 * allocated according to the requirements and added to the device pool.
 * IB_POLL_DIRECT cannot be used for shared cqs so it is not a valid value
 * for @poll_ctx.
 */
struct ib_cq *ib_cq_pool_get(struct ib_device *dev, unsigned int nr_cqe,
                             int comp_vector_hint,
                             enum ib_poll_context poll_ctx)
{
        static unsigned int default_comp_vector;
        unsigned int vector, num_comp_vectors;
        struct ib_cq *cq, *found = NULL;
        int ret;

        if (poll_ctx > IB_POLL_LAST_POOL_TYPE) {
                WARN_ON_ONCE(poll_ctx > IB_POLL_LAST_POOL_TYPE);
                return ERR_PTR(-EINVAL);
        }

        num_comp_vectors =
                min_t(unsigned int, dev->num_comp_vectors, num_online_cpus());
        /* Project the affinity to the device completion vector range */
        if (comp_vector_hint < 0) {
                comp_vector_hint =
                        (READ_ONCE(default_comp_vector) + 1) % num_comp_vectors;
                WRITE_ONCE(default_comp_vector, comp_vector_hint);
        }
        vector = comp_vector_hint % num_comp_vectors;

        /*
         * Find the least used CQ with correct affinity and
         * enough free CQ entries
         */
        while (!found) {
                spin_lock_irq(&dev->cq_pools_lock);
                list_for_each_entry(cq, &dev->cq_pools[poll_ctx],
                                    pool_entry) {
                        /*
                         * Check to see if we have found a CQ with the
                         * correct completion vector
                         */
                        if (vector != cq->comp_vector)
                                continue;
                        if (cq->cqe_used + nr_cqe > cq->cqe)
                                continue;
                        found = cq;
                        break;
                }

                if (found) {
                        found->cqe_used += nr_cqe;
                        spin_unlock_irq(&dev->cq_pools_lock);

                        return found;
                }
                spin_unlock_irq(&dev->cq_pools_lock);

                /*
                 * Didn't find a match or ran out of CQs in the device
                 * pool, allocate a new array of CQs.
                 */
                ret = ib_alloc_cqs(dev, nr_cqe, poll_ctx);
                if (ret)
                        return ERR_PTR(ret);
        }

        return found;
}
EXPORT_SYMBOL(ib_cq_pool_get);
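
/*
 * Usage sketch (illustrative only, not part of this file): a shared-CQ user
 * reserves CQE capacity from the pool and hands it back when done. Variable
 * names are hypothetical.
 *
 *      cq = ib_cq_pool_get(dev, nr_cqe, -1, IB_POLL_WORKQUEUE);
 *      if (IS_ERR(cq))
 *              return PTR_ERR(cq);
 *      ...
 *      ib_cq_pool_put(cq, nr_cqe);
 *
 * A comp_vector_hint of -1 lets the core round-robin the affinity, and the
 * same nr_cqe must be returned to ib_cq_pool_put() so the CQ's cqe_used
 * accounting stays balanced.
 */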

/**
 * ib_cq_pool_put - Return a CQ taken from a shared pool.
 * @cq: The CQ to return.
 * @nr_cqe: The max number of cqes that the user had requested.
 */
void ib_cq_pool_put(struct ib_cq *cq, unsigned int nr_cqe)
{
        if (WARN_ON_ONCE(nr_cqe > cq->cqe_used))
                return;

        spin_lock_irq(&cq->device->cq_pools_lock);
        cq->cqe_used -= nr_cqe;
        spin_unlock_irq(&cq->device->cq_pools_lock);
}
EXPORT_SYMBOL(ib_cq_pool_put);