linux/drivers/infiniband/core/uverbs_main.c
/*
 * Copyright (c) 2005 Topspin Communications.  All rights reserved.
 * Copyright (c) 2005, 2006 Cisco Systems.  All rights reserved.
 * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
 * Copyright (c) 2005 Voltaire, Inc. All rights reserved.
 * Copyright (c) 2005 PathScale, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <linux/module.h>
#include <linux/init.h>
#include <linux/device.h>
#include <linux/err.h>
#include <linux/fs.h>
#include <linux/poll.h>
#include <linux/sched.h>
#include <linux/file.h>
#include <linux/cdev.h>
#include <linux/anon_inodes.h>
#include <linux/slab.h>
#include <linux/sched/mm.h>

#include <linux/uaccess.h>

#include <rdma/ib.h>
#include <rdma/uverbs_std_types.h>
#include <rdma/rdma_netlink.h>

#include "uverbs.h"
#include "core_priv.h"
#include "rdma_core.h"

MODULE_AUTHOR("Roland Dreier");
MODULE_DESCRIPTION("InfiniBand userspace verbs access");
MODULE_LICENSE("Dual BSD/GPL");

enum {
        IB_UVERBS_MAJOR       = 231,
        IB_UVERBS_BASE_MINOR  = 192,
        IB_UVERBS_MAX_DEVICES = RDMA_MAX_PORTS,
        IB_UVERBS_NUM_FIXED_MINOR = 32,
        IB_UVERBS_NUM_DYNAMIC_MINOR = IB_UVERBS_MAX_DEVICES - IB_UVERBS_NUM_FIXED_MINOR,
};

#define IB_UVERBS_BASE_DEV      MKDEV(IB_UVERBS_MAJOR, IB_UVERBS_BASE_MINOR)
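
/*
 * A worked example of the minor-number layout above, derived only from these
 * constants: devnum values 0..31 land in the fixed region, so devnum 5 maps
 * to MKDEV(231, 192 + 5), i.e. char device 231:197; devnum 32 and up land in
 * the dynamically allocated region, so devnum 40 maps to
 * dynamic_uverbs_dev + (40 - 32). ib_uverbs_add_one() below performs exactly
 * this split.
 */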

static dev_t dynamic_uverbs_dev;
static struct class *uverbs_class;

static DEFINE_IDA(uverbs_ida);
static void ib_uverbs_add_one(struct ib_device *device);
static void ib_uverbs_remove_one(struct ib_device *device, void *client_data);

/*
 * Must be called with the ufile->device->disassociate_srcu held, and the lock
 * must be held until use of the ucontext is finished.
 */
struct ib_ucontext *ib_uverbs_get_ucontext_file(struct ib_uverbs_file *ufile)
{
        /*
         * We do not hold the hw_destroy_rwsem lock for this flow, instead
         * srcu is used. It does not matter if someone races this with
         * get_context, we get NULL or valid ucontext.
         */
        struct ib_ucontext *ucontext = smp_load_acquire(&ufile->ucontext);

        if (!srcu_dereference(ufile->device->ib_dev,
                              &ufile->device->disassociate_srcu))
                return ERR_PTR(-EIO);

        if (!ucontext)
                return ERR_PTR(-EINVAL);

        return ucontext;
}
EXPORT_SYMBOL(ib_uverbs_get_ucontext_file);
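
/*
 * A minimal caller sketch, mirroring what ib_uverbs_mmap() does further
 * below: the SRCU read lock must bracket every use of the returned ucontext.
 * do_something() is only a hypothetical stand-in for that use:
 *
 *      srcu_key = srcu_read_lock(&ufile->device->disassociate_srcu);
 *      ucontext = ib_uverbs_get_ucontext_file(ufile);
 *      if (!IS_ERR(ucontext))
 *              do_something(ucontext);
 *      srcu_read_unlock(&ufile->device->disassociate_srcu, srcu_key);
 */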

int uverbs_dealloc_mw(struct ib_mw *mw)
{
        struct ib_pd *pd = mw->pd;
        int ret;

        ret = mw->device->ops.dealloc_mw(mw);
        if (!ret)
                atomic_dec(&pd->usecnt);
        return ret;
}

static void ib_uverbs_release_dev(struct device *device)
{
        struct ib_uverbs_device *dev =
                        container_of(device, struct ib_uverbs_device, dev);

        uverbs_destroy_api(dev->uapi);
        cleanup_srcu_struct(&dev->disassociate_srcu);
        kfree(dev);
}

static void ib_uverbs_release_async_event_file(struct kref *ref)
{
        struct ib_uverbs_async_event_file *file =
                container_of(ref, struct ib_uverbs_async_event_file, ref);

        kfree(file);
}

void ib_uverbs_release_ucq(struct ib_uverbs_file *file,
                          struct ib_uverbs_completion_event_file *ev_file,
                          struct ib_ucq_object *uobj)
{
        struct ib_uverbs_event *evt, *tmp;

        if (ev_file) {
                spin_lock_irq(&ev_file->ev_queue.lock);
                list_for_each_entry_safe(evt, tmp, &uobj->comp_list, obj_list) {
                        list_del(&evt->list);
                        kfree(evt);
                }
                spin_unlock_irq(&ev_file->ev_queue.lock);

                uverbs_uobject_put(&ev_file->uobj);
        }

        spin_lock_irq(&file->async_file->ev_queue.lock);
        list_for_each_entry_safe(evt, tmp, &uobj->async_list, obj_list) {
                list_del(&evt->list);
                kfree(evt);
        }
        spin_unlock_irq(&file->async_file->ev_queue.lock);
}

void ib_uverbs_release_uevent(struct ib_uverbs_file *file,
                              struct ib_uevent_object *uobj)
{
        struct ib_uverbs_event *evt, *tmp;

        spin_lock_irq(&file->async_file->ev_queue.lock);
        list_for_each_entry_safe(evt, tmp, &uobj->event_list, obj_list) {
                list_del(&evt->list);
                kfree(evt);
        }
        spin_unlock_irq(&file->async_file->ev_queue.lock);
}

void ib_uverbs_detach_umcast(struct ib_qp *qp,
                             struct ib_uqp_object *uobj)
{
        struct ib_uverbs_mcast_entry *mcast, *tmp;

        list_for_each_entry_safe(mcast, tmp, &uobj->mcast_list, list) {
                ib_detach_mcast(qp, &mcast->gid, mcast->lid);
                list_del(&mcast->list);
                kfree(mcast);
        }
}

static void ib_uverbs_comp_dev(struct ib_uverbs_device *dev)
{
        complete(&dev->comp);
}

void ib_uverbs_release_file(struct kref *ref)
{
        struct ib_uverbs_file *file =
                container_of(ref, struct ib_uverbs_file, ref);
        struct ib_device *ib_dev;
        int srcu_key;

        release_ufile_idr_uobject(file);

        srcu_key = srcu_read_lock(&file->device->disassociate_srcu);
        ib_dev = srcu_dereference(file->device->ib_dev,
                                  &file->device->disassociate_srcu);
        if (ib_dev && !ib_dev->ops.disassociate_ucontext)
                module_put(ib_dev->ops.owner);
        srcu_read_unlock(&file->device->disassociate_srcu, srcu_key);

        if (atomic_dec_and_test(&file->device->refcount))
                ib_uverbs_comp_dev(file->device);

        if (file->async_file)
                kref_put(&file->async_file->ref,
                         ib_uverbs_release_async_event_file);
        put_device(&file->device->dev);

        if (file->disassociate_page)
                __free_pages(file->disassociate_page, 0);
        kfree(file);
}

static ssize_t ib_uverbs_event_read(struct ib_uverbs_event_queue *ev_queue,
                                    struct ib_uverbs_file *uverbs_file,
                                    struct file *filp, char __user *buf,
                                    size_t count, loff_t *pos,
                                    size_t eventsz)
{
        struct ib_uverbs_event *event;
        int ret = 0;

        spin_lock_irq(&ev_queue->lock);

        while (list_empty(&ev_queue->event_list)) {
                spin_unlock_irq(&ev_queue->lock);

                if (filp->f_flags & O_NONBLOCK)
                        return -EAGAIN;

                if (wait_event_interruptible(ev_queue->poll_wait,
                                             (!list_empty(&ev_queue->event_list) ||
                        /* The barriers built into wait_event_interruptible()
                         * and wake_up() guarantee this will see ib_dev set to
                         * NULL without using RCU.
                         */
                                             !uverbs_file->device->ib_dev)))
                        return -ERESTARTSYS;

                /* If the device was disassociated and no event remains, return an error */
                if (list_empty(&ev_queue->event_list) &&
                    !uverbs_file->device->ib_dev)
                        return -EIO;

                spin_lock_irq(&ev_queue->lock);
        }

        event = list_entry(ev_queue->event_list.next, struct ib_uverbs_event, list);

        if (eventsz > count) {
                ret   = -EINVAL;
                event = NULL;
        } else {
                list_del(ev_queue->event_list.next);
                if (event->counter) {
                        ++(*event->counter);
                        list_del(&event->obj_list);
                }
        }

        spin_unlock_irq(&ev_queue->lock);

        if (event) {
                if (copy_to_user(buf, event, eventsz))
                        ret = -EFAULT;
                else
                        ret = eventsz;
        }

        kfree(event);

        return ret;
}

static ssize_t ib_uverbs_async_event_read(struct file *filp, char __user *buf,
                                          size_t count, loff_t *pos)
{
        struct ib_uverbs_async_event_file *file = filp->private_data;

        return ib_uverbs_event_read(&file->ev_queue, file->uverbs_file, filp,
                                    buf, count, pos,
                                    sizeof(struct ib_uverbs_async_event_desc));
}

static ssize_t ib_uverbs_comp_event_read(struct file *filp, char __user *buf,
                                         size_t count, loff_t *pos)
{
        struct ib_uverbs_completion_event_file *comp_ev_file =
                filp->private_data;

        return ib_uverbs_event_read(&comp_ev_file->ev_queue,
                                    comp_ev_file->uobj.ufile, filp,
                                    buf, count, pos,
                                    sizeof(struct ib_uverbs_comp_event_desc));
}
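
/*
 * Userspace drains these queues with plain read(2)/poll(2) on the event fd.
 * A hedged sketch of reading one completion event -- essentially what
 * libibverbs does under ibv_get_cq_event(). Here "comp_fd" is assumed to be
 * a completion channel fd obtained via the CREATE_COMP_CHANNEL command, and
 * struct ib_uverbs_comp_event_desc comes from the <rdma/ib_user_verbs.h>
 * uapi header:
 *
 *      struct ib_uverbs_comp_event_desc desc;
 *
 *      if (read(comp_fd, &desc, sizeof(desc)) == sizeof(desc))
 *              process_cq(desc.cq_handle);
 *
 * where desc.cq_handle carries the user_handle stored by
 * ib_uverbs_comp_handler() below, and process_cq() is a hypothetical
 * consumer.
 */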

static __poll_t ib_uverbs_event_poll(struct ib_uverbs_event_queue *ev_queue,
                                         struct file *filp,
                                         struct poll_table_struct *wait)
{
        __poll_t pollflags = 0;

        poll_wait(filp, &ev_queue->poll_wait, wait);

        spin_lock_irq(&ev_queue->lock);
        if (!list_empty(&ev_queue->event_list))
                pollflags = EPOLLIN | EPOLLRDNORM;
        spin_unlock_irq(&ev_queue->lock);

        return pollflags;
}

static __poll_t ib_uverbs_async_event_poll(struct file *filp,
                                               struct poll_table_struct *wait)
{
        return ib_uverbs_event_poll(filp->private_data, filp, wait);
}

static __poll_t ib_uverbs_comp_event_poll(struct file *filp,
                                              struct poll_table_struct *wait)
{
        struct ib_uverbs_completion_event_file *comp_ev_file =
                filp->private_data;

        return ib_uverbs_event_poll(&comp_ev_file->ev_queue, filp, wait);
}

static int ib_uverbs_async_event_fasync(int fd, struct file *filp, int on)
{
        struct ib_uverbs_event_queue *ev_queue = filp->private_data;

        return fasync_helper(fd, filp, on, &ev_queue->async_queue);
}

static int ib_uverbs_comp_event_fasync(int fd, struct file *filp, int on)
{
        struct ib_uverbs_completion_event_file *comp_ev_file =
                filp->private_data;

        return fasync_helper(fd, filp, on, &comp_ev_file->ev_queue.async_queue);
}

static int ib_uverbs_async_event_close(struct inode *inode, struct file *filp)
{
        struct ib_uverbs_async_event_file *file = filp->private_data;
        struct ib_uverbs_file *uverbs_file = file->uverbs_file;
        struct ib_uverbs_event *entry, *tmp;
        int closed_already = 0;

        mutex_lock(&uverbs_file->device->lists_mutex);
        spin_lock_irq(&file->ev_queue.lock);
        closed_already = file->ev_queue.is_closed;
        file->ev_queue.is_closed = 1;
        list_for_each_entry_safe(entry, tmp, &file->ev_queue.event_list, list) {
                if (entry->counter)
                        list_del(&entry->obj_list);
                kfree(entry);
        }
        spin_unlock_irq(&file->ev_queue.lock);
        if (!closed_already) {
                list_del(&file->list);
                ib_unregister_event_handler(&uverbs_file->event_handler);
        }
        mutex_unlock(&uverbs_file->device->lists_mutex);

        kref_put(&uverbs_file->ref, ib_uverbs_release_file);
        kref_put(&file->ref, ib_uverbs_release_async_event_file);

        return 0;
}

static int ib_uverbs_comp_event_close(struct inode *inode, struct file *filp)
{
        struct ib_uobject *uobj = filp->private_data;
        struct ib_uverbs_completion_event_file *file = container_of(
                uobj, struct ib_uverbs_completion_event_file, uobj);
        struct ib_uverbs_event *entry, *tmp;

        spin_lock_irq(&file->ev_queue.lock);
        list_for_each_entry_safe(entry, tmp, &file->ev_queue.event_list, list) {
                if (entry->counter)
                        list_del(&entry->obj_list);
                kfree(entry);
        }
        file->ev_queue.is_closed = 1;
        spin_unlock_irq(&file->ev_queue.lock);

        uverbs_close_fd(filp);

        return 0;
}

const struct file_operations uverbs_event_fops = {
        .owner   = THIS_MODULE,
        .read    = ib_uverbs_comp_event_read,
        .poll    = ib_uverbs_comp_event_poll,
        .release = ib_uverbs_comp_event_close,
        .fasync  = ib_uverbs_comp_event_fasync,
        .llseek  = no_llseek,
};

static const struct file_operations uverbs_async_event_fops = {
        .owner   = THIS_MODULE,
        .read    = ib_uverbs_async_event_read,
        .poll    = ib_uverbs_async_event_poll,
        .release = ib_uverbs_async_event_close,
        .fasync  = ib_uverbs_async_event_fasync,
        .llseek  = no_llseek,
};

void ib_uverbs_comp_handler(struct ib_cq *cq, void *cq_context)
{
        struct ib_uverbs_event_queue   *ev_queue = cq_context;
        struct ib_ucq_object           *uobj;
        struct ib_uverbs_event         *entry;
        unsigned long                   flags;

        if (!ev_queue)
                return;

        spin_lock_irqsave(&ev_queue->lock, flags);
        if (ev_queue->is_closed) {
                spin_unlock_irqrestore(&ev_queue->lock, flags);
                return;
        }

        entry = kmalloc(sizeof(*entry), GFP_ATOMIC);
        if (!entry) {
                spin_unlock_irqrestore(&ev_queue->lock, flags);
                return;
        }

        uobj = container_of(cq->uobject, struct ib_ucq_object, uobject);

        entry->desc.comp.cq_handle = cq->uobject->user_handle;
        entry->counter             = &uobj->comp_events_reported;

        list_add_tail(&entry->list, &ev_queue->event_list);
        list_add_tail(&entry->obj_list, &uobj->comp_list);
        spin_unlock_irqrestore(&ev_queue->lock, flags);

        wake_up_interruptible(&ev_queue->poll_wait);
        kill_fasync(&ev_queue->async_queue, SIGIO, POLL_IN);
}

static void ib_uverbs_async_handler(struct ib_uverbs_file *file,
                                    __u64 element, __u64 event,
                                    struct list_head *obj_list,
                                    u32 *counter)
{
        struct ib_uverbs_event *entry;
        unsigned long flags;

        spin_lock_irqsave(&file->async_file->ev_queue.lock, flags);
        if (file->async_file->ev_queue.is_closed) {
                spin_unlock_irqrestore(&file->async_file->ev_queue.lock, flags);
                return;
        }

        entry = kmalloc(sizeof(*entry), GFP_ATOMIC);
        if (!entry) {
                spin_unlock_irqrestore(&file->async_file->ev_queue.lock, flags);
                return;
        }

        entry->desc.async.element    = element;
        entry->desc.async.event_type = event;
        entry->desc.async.reserved   = 0;
        entry->counter               = counter;

        list_add_tail(&entry->list, &file->async_file->ev_queue.event_list);
        if (obj_list)
                list_add_tail(&entry->obj_list, obj_list);
        spin_unlock_irqrestore(&file->async_file->ev_queue.lock, flags);

        wake_up_interruptible(&file->async_file->ev_queue.poll_wait);
        kill_fasync(&file->async_file->ev_queue.async_queue, SIGIO, POLL_IN);
}

void ib_uverbs_cq_event_handler(struct ib_event *event, void *context_ptr)
{
        struct ib_ucq_object *uobj = container_of(event->element.cq->uobject,
                                                  struct ib_ucq_object, uobject);

        ib_uverbs_async_handler(uobj->uobject.ufile, uobj->uobject.user_handle,
                                event->event, &uobj->async_list,
                                &uobj->async_events_reported);
}

void ib_uverbs_qp_event_handler(struct ib_event *event, void *context_ptr)
{
        struct ib_uevent_object *uobj;

        /* for XRC target qp's, check that qp is live */
        if (!event->element.qp->uobject)
                return;

        uobj = container_of(event->element.qp->uobject,
                            struct ib_uevent_object, uobject);

        ib_uverbs_async_handler(context_ptr, uobj->uobject.user_handle,
                                event->event, &uobj->event_list,
                                &uobj->events_reported);
}

void ib_uverbs_wq_event_handler(struct ib_event *event, void *context_ptr)
{
        struct ib_uevent_object *uobj = container_of(event->element.wq->uobject,
                                                  struct ib_uevent_object, uobject);

        ib_uverbs_async_handler(context_ptr, uobj->uobject.user_handle,
                                event->event, &uobj->event_list,
                                &uobj->events_reported);
}

void ib_uverbs_srq_event_handler(struct ib_event *event, void *context_ptr)
{
        struct ib_uevent_object *uobj;

        uobj = container_of(event->element.srq->uobject,
                            struct ib_uevent_object, uobject);

        ib_uverbs_async_handler(context_ptr, uobj->uobject.user_handle,
                                event->event, &uobj->event_list,
                                &uobj->events_reported);
}

void ib_uverbs_event_handler(struct ib_event_handler *handler,
                             struct ib_event *event)
{
        struct ib_uverbs_file *file =
                container_of(handler, struct ib_uverbs_file, event_handler);

        ib_uverbs_async_handler(file, event->element.port_num, event->event,
                                NULL, NULL);
}

void ib_uverbs_free_async_event_file(struct ib_uverbs_file *file)
{
        kref_put(&file->async_file->ref, ib_uverbs_release_async_event_file);
        file->async_file = NULL;
}

void ib_uverbs_init_event_queue(struct ib_uverbs_event_queue *ev_queue)
{
        spin_lock_init(&ev_queue->lock);
        INIT_LIST_HEAD(&ev_queue->event_list);
        init_waitqueue_head(&ev_queue->poll_wait);
        ev_queue->is_closed   = 0;
        ev_queue->async_queue = NULL;
}

struct file *ib_uverbs_alloc_async_event_file(struct ib_uverbs_file *uverbs_file,
                                              struct ib_device  *ib_dev)
{
        struct ib_uverbs_async_event_file *ev_file;
        struct file *filp;

        ev_file = kzalloc(sizeof(*ev_file), GFP_KERNEL);
        if (!ev_file)
                return ERR_PTR(-ENOMEM);

        ib_uverbs_init_event_queue(&ev_file->ev_queue);
        ev_file->uverbs_file = uverbs_file;
        kref_get(&ev_file->uverbs_file->ref);
        kref_init(&ev_file->ref);
        filp = anon_inode_getfile("[infinibandevent]", &uverbs_async_event_fops,
                                  ev_file, O_RDONLY);
        if (IS_ERR(filp))
                goto err_put_refs;

        mutex_lock(&uverbs_file->device->lists_mutex);
        list_add_tail(&ev_file->list,
                      &uverbs_file->device->uverbs_events_file_list);
        mutex_unlock(&uverbs_file->device->lists_mutex);

        WARN_ON(uverbs_file->async_file);
        uverbs_file->async_file = ev_file;
        kref_get(&uverbs_file->async_file->ref);
        INIT_IB_EVENT_HANDLER(&uverbs_file->event_handler,
                              ib_dev,
                              ib_uverbs_event_handler);
        ib_register_event_handler(&uverbs_file->event_handler);
        /* At this point the async event file is fully set up */

        return filp;

err_put_refs:
        kref_put(&ev_file->uverbs_file->ref, ib_uverbs_release_file);
        kref_put(&ev_file->ref, ib_uverbs_release_async_event_file);
        return filp;
}

static ssize_t verify_hdr(struct ib_uverbs_cmd_hdr *hdr,
                          struct ib_uverbs_ex_cmd_hdr *ex_hdr, size_t count,
                          const struct uverbs_api_write_method *method_elm)
{
        if (method_elm->is_ex) {
                count -= sizeof(*hdr) + sizeof(*ex_hdr);

                if ((hdr->in_words + ex_hdr->provider_in_words) * 8 != count)
                        return -EINVAL;

                if (hdr->in_words * 8 < method_elm->req_size)
                        return -ENOSPC;

                if (ex_hdr->cmd_hdr_reserved)
                        return -EINVAL;

                if (ex_hdr->response) {
                        if (!hdr->out_words && !ex_hdr->provider_out_words)
                                return -EINVAL;

                        if (hdr->out_words * 8 < method_elm->resp_size)
                                return -ENOSPC;

                        if (!access_ok(u64_to_user_ptr(ex_hdr->response),
                                       (hdr->out_words + ex_hdr->provider_out_words) * 8))
                                return -EFAULT;
                } else {
                        if (hdr->out_words || ex_hdr->provider_out_words)
                                return -EINVAL;
                }

                return 0;
        }

        /* not extended command */
        if (hdr->in_words * 4 != count)
                return -EINVAL;

        if (count < method_elm->req_size + sizeof(hdr)) {
                /*
                 * rdma-core v18 and v19 have a bug where they send DESTROY_CQ
                 * with a 16 byte write instead of 24. Old kernels didn't
                 * check the size so they allowed this. Now that the size is
                 * checked provide a compatibility work around to not break
                 * those userspaces.
                 */
                if (hdr->command == IB_USER_VERBS_CMD_DESTROY_CQ &&
                    count == 16) {
                        hdr->in_words = 6;
                        return 0;
                }
                return -ENOSPC;
        }
        if (hdr->out_words * 4 < method_elm->resp_size)
                return -ENOSPC;

        return 0;
}
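
/*
 * A worked example of the sizing rules verify_hdr() enforces (the numbers
 * follow from the structure sizes in <rdma/ib_user_verbs.h>): a legacy
 * command counts the whole transfer in 4-byte words including the 8-byte
 * ib_uverbs_cmd_hdr, so a header plus a 16-byte request body must be sent
 * with in_words = (8 + 16) / 4 = 6 -- exactly the value the DESTROY_CQ
 * workaround above fakes for short writes. Extended commands instead count
 * 8-byte words and exclude both headers, so the same 16-byte body would
 * carry in_words = 16 / 8 = 2 after the two headers.
 */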

static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf,
                             size_t count, loff_t *pos)
{
        struct ib_uverbs_file *file = filp->private_data;
        const struct uverbs_api_write_method *method_elm;
        struct uverbs_api *uapi = file->device->uapi;
        struct ib_uverbs_ex_cmd_hdr ex_hdr;
        struct ib_uverbs_cmd_hdr hdr;
        struct uverbs_attr_bundle bundle;
        int srcu_key;
        ssize_t ret;

        if (!ib_safe_file_access(filp)) {
                pr_err_once("uverbs_write: process %d (%s) changed security contexts after opening file descriptor, this is not allowed.\n",
                            task_tgid_vnr(current), current->comm);
                return -EACCES;
        }

        if (count < sizeof(hdr))
                return -EINVAL;

        if (copy_from_user(&hdr, buf, sizeof(hdr)))
                return -EFAULT;

        method_elm = uapi_get_method(uapi, hdr.command);
        if (IS_ERR(method_elm))
                return PTR_ERR(method_elm);

        if (method_elm->is_ex) {
                if (count < (sizeof(hdr) + sizeof(ex_hdr)))
                        return -EINVAL;
                if (copy_from_user(&ex_hdr, buf + sizeof(hdr), sizeof(ex_hdr)))
                        return -EFAULT;
        }

        ret = verify_hdr(&hdr, &ex_hdr, count, method_elm);
        if (ret)
                return ret;

        srcu_key = srcu_read_lock(&file->device->disassociate_srcu);

        buf += sizeof(hdr);

        memset(bundle.attr_present, 0, sizeof(bundle.attr_present));
        bundle.ufile = file;
        bundle.context = NULL; /* only valid if bundle has uobject */
        if (!method_elm->is_ex) {
                size_t in_len = hdr.in_words * 4 - sizeof(hdr);
                size_t out_len = hdr.out_words * 4;
                u64 response = 0;

                if (method_elm->has_udata) {
                        bundle.driver_udata.inlen =
                                in_len - method_elm->req_size;
                        in_len = method_elm->req_size;
                        if (bundle.driver_udata.inlen)
                                bundle.driver_udata.inbuf = buf + in_len;
                        else
                                bundle.driver_udata.inbuf = NULL;
                } else {
                        memset(&bundle.driver_udata, 0,
                               sizeof(bundle.driver_udata));
                }

                if (method_elm->has_resp) {
                        /*
                         * The macros check that if has_resp is set
                         * then the command request structure starts
                         * with a '__aligned u64 response' member.
                         */
                        ret = get_user(response, (const u64 __user *)buf);
                        if (ret)
                                goto out_unlock;

                        if (method_elm->has_udata) {
                                bundle.driver_udata.outlen =
                                        out_len - method_elm->resp_size;
                                out_len = method_elm->resp_size;
                                if (bundle.driver_udata.outlen)
                                        bundle.driver_udata.outbuf =
                                                u64_to_user_ptr(response +
                                                                out_len);
                                else
                                        bundle.driver_udata.outbuf = NULL;
                        }
                } else {
                        bundle.driver_udata.outlen = 0;
                        bundle.driver_udata.outbuf = NULL;
                }

                ib_uverbs_init_udata_buf_or_null(
                        &bundle.ucore, buf, u64_to_user_ptr(response),
                        in_len, out_len);
        } else {
                buf += sizeof(ex_hdr);

                ib_uverbs_init_udata_buf_or_null(&bundle.ucore, buf,
                                        u64_to_user_ptr(ex_hdr.response),
                                        hdr.in_words * 8, hdr.out_words * 8);

                ib_uverbs_init_udata_buf_or_null(
                        &bundle.driver_udata, buf + bundle.ucore.inlen,
                        u64_to_user_ptr(ex_hdr.response) + bundle.ucore.outlen,
                        ex_hdr.provider_in_words * 8,
                        ex_hdr.provider_out_words * 8);

        }

        ret = method_elm->handler(&bundle);
out_unlock:
        srcu_read_unlock(&file->device->disassociate_srcu, srcu_key);
        return (ret) ? : count;
}

static int ib_uverbs_mmap(struct file *filp, struct vm_area_struct *vma)
{
        struct ib_uverbs_file *file = filp->private_data;
        struct ib_ucontext *ucontext;
        int ret = 0;
        int srcu_key;

        srcu_key = srcu_read_lock(&file->device->disassociate_srcu);
        ucontext = ib_uverbs_get_ucontext_file(file);
        if (IS_ERR(ucontext)) {
                ret = PTR_ERR(ucontext);
                goto out;
        }

        ret = ucontext->device->ops.mmap(ucontext, vma);
out:
        srcu_read_unlock(&file->device->disassociate_srcu, srcu_key);
        return ret;
}

/*
 * Each time we map IO memory into user space this keeps track of the mapping.
 * When the device is hot-unplugged we 'zap' the mmaps in user space to point
 * to the zero page and allow the hot unplug to proceed.
 *
 * This is necessary for cases like PCI physical hot unplug as the actual BAR
 * memory may vanish after this and access to it from userspace could MCE.
 *
 * RDMA drivers supporting disassociation must have their user space designed
 * to cope in some way with their IO pages going to the zero page.
 */
struct rdma_umap_priv {
        struct vm_area_struct *vma;
        struct list_head list;
};

static const struct vm_operations_struct rdma_umap_ops;

static void rdma_umap_priv_init(struct rdma_umap_priv *priv,
                                struct vm_area_struct *vma)
{
        struct ib_uverbs_file *ufile = vma->vm_file->private_data;

        priv->vma = vma;
        vma->vm_private_data = priv;
        vma->vm_ops = &rdma_umap_ops;

        mutex_lock(&ufile->umap_lock);
        list_add(&priv->list, &ufile->umaps);
        mutex_unlock(&ufile->umap_lock);
}

/*
 * The VMA has been dup'd, initialize the vm_private_data with a new tracking
 * struct
 */
static void rdma_umap_open(struct vm_area_struct *vma)
{
        struct ib_uverbs_file *ufile = vma->vm_file->private_data;
        struct rdma_umap_priv *opriv = vma->vm_private_data;
        struct rdma_umap_priv *priv;

        if (!opriv)
                return;

        /* We are racing with disassociation */
        if (!down_read_trylock(&ufile->hw_destroy_rwsem))
                goto out_zap;
        /*
         * Disassociation already completed, the VMA should already be zapped.
         */
        if (!ufile->ucontext)
                goto out_unlock;

        priv = kzalloc(sizeof(*priv), GFP_KERNEL);
        if (!priv)
                goto out_unlock;
        rdma_umap_priv_init(priv, vma);

        up_read(&ufile->hw_destroy_rwsem);
        return;

out_unlock:
        up_read(&ufile->hw_destroy_rwsem);
out_zap:
        /*
         * We can't allow the VMA to be created with the actual IO pages, that
         * would break our API contract, and it can't be stopped at this
         * point, so zap it.
         */
        vma->vm_private_data = NULL;
        zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start);
}

static void rdma_umap_close(struct vm_area_struct *vma)
{
        struct ib_uverbs_file *ufile = vma->vm_file->private_data;
        struct rdma_umap_priv *priv = vma->vm_private_data;

        if (!priv)
                return;

        /*
         * The vma holds a reference on the struct file that created it, which
         * in turn means that the ib_uverbs_file is guaranteed to exist at
         * this point.
         */
        mutex_lock(&ufile->umap_lock);
        list_del(&priv->list);
        mutex_unlock(&ufile->umap_lock);
        kfree(priv);
}
/*
 * Once zap_vma_ptes() has been called, touches to the VMA will come here,
 * and we return a dummy writable zero page for all the pfns.
 */
static vm_fault_t rdma_umap_fault(struct vm_fault *vmf)
{
        struct ib_uverbs_file *ufile = vmf->vma->vm_file->private_data;
        struct rdma_umap_priv *priv = vmf->vma->vm_private_data;
        vm_fault_t ret = 0;

        if (!priv)
                return VM_FAULT_SIGBUS;

        /* Read only pages can just use the system zero page. */
        if (!(vmf->vma->vm_flags & (VM_WRITE | VM_MAYWRITE))) {
                vmf->page = ZERO_PAGE(vmf->address);
                get_page(vmf->page);
                return 0;
        }

        mutex_lock(&ufile->umap_lock);
        if (!ufile->disassociate_page)
                ufile->disassociate_page =
                        alloc_pages(vmf->gfp_mask | __GFP_ZERO, 0);

        if (ufile->disassociate_page) {
                /*
                 * This VMA is forced to always be shared so this doesn't have
                 * to worry about COW.
                 */
                vmf->page = ufile->disassociate_page;
                get_page(vmf->page);
        } else {
                ret = VM_FAULT_SIGBUS;
        }
        mutex_unlock(&ufile->umap_lock);

        return ret;
}

static const struct vm_operations_struct rdma_umap_ops = {
        .open = rdma_umap_open,
        .close = rdma_umap_close,
        .fault = rdma_umap_fault,
};

/*
 * Map IO memory into a process. This is to be called by drivers as part of
 * their mmap() functions if they wish to send something like PCI-E BAR memory
 * to userspace.
 */
int rdma_user_mmap_io(struct ib_ucontext *ucontext, struct vm_area_struct *vma,
                      unsigned long pfn, unsigned long size, pgprot_t prot)
{
        struct ib_uverbs_file *ufile = ucontext->ufile;
        struct rdma_umap_priv *priv;

        if (!(vma->vm_flags & VM_SHARED))
                return -EINVAL;

        if (vma->vm_end - vma->vm_start != size)
                return -EINVAL;

        /* Driver is using this wrong, must be called by ib_uverbs_mmap */
        if (WARN_ON(!vma->vm_file ||
                    vma->vm_file->private_data != ufile))
                return -EINVAL;
        lockdep_assert_held(&ufile->device->disassociate_srcu);

        priv = kzalloc(sizeof(*priv), GFP_KERNEL);
        if (!priv)
                return -ENOMEM;

        vma->vm_page_prot = prot;
        if (io_remap_pfn_range(vma, vma->vm_start, pfn, size, prot)) {
                kfree(priv);
                return -EAGAIN;
        }

        rdma_umap_priv_init(priv, vma);
        return 0;
}
EXPORT_SYMBOL(rdma_user_mmap_io);
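
/*
 * A hedged sketch of a driver-side caller, reached through the driver's
 * ops.mmap callback via ib_uverbs_mmap() above (so the SRCU lock asserted by
 * lockdep is already held). example_ucontext, to_example_ucontext() and the
 * db_pfn member are hypothetical stand-ins for driver-private state:
 *
 *      static int example_mmap(struct ib_ucontext *ucontext,
 *                              struct vm_area_struct *vma)
 *      {
 *              struct example_ucontext *ctx = to_example_ucontext(ucontext);
 *
 *              if (vma->vm_end - vma->vm_start != PAGE_SIZE)
 *                      return -EINVAL;
 *
 *              return rdma_user_mmap_io(ucontext, vma, ctx->db_pfn, PAGE_SIZE,
 *                                       pgprot_noncached(vma->vm_page_prot));
 *      }
 *
 * Mapping a single doorbell page non-cached this way is the common pattern;
 * the size check mirrors the one rdma_user_mmap_io() performs itself.
 */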

void uverbs_user_mmap_disassociate(struct ib_uverbs_file *ufile)
{
        struct rdma_umap_priv *priv, *next_priv;

        lockdep_assert_held(&ufile->hw_destroy_rwsem);

        while (1) {
                struct mm_struct *mm = NULL;

                /* Get an arbitrary mm pointer that hasn't been cleaned yet */
                mutex_lock(&ufile->umap_lock);
                while (!list_empty(&ufile->umaps)) {
                        int ret;

                        priv = list_first_entry(&ufile->umaps,
                                                struct rdma_umap_priv, list);
                        mm = priv->vma->vm_mm;
                        ret = mmget_not_zero(mm);
                        if (!ret) {
                                list_del_init(&priv->list);
                                mm = NULL;
                                continue;
                        }
                        break;
                }
                mutex_unlock(&ufile->umap_lock);
                if (!mm)
                        return;

                /*
                 * The umap_lock is nested under mmap_sem since it is used
                 * within the vma_ops callbacks, so we have to clean the list
                 * one mm at a time to get the lock ordering right. Typically
                 * there will only be one mm, so no big deal.
                 */
                down_read(&mm->mmap_sem);
                if (!mmget_still_valid(mm))
                        goto skip_mm;
                mutex_lock(&ufile->umap_lock);
                list_for_each_entry_safe (priv, next_priv, &ufile->umaps,
                                          list) {
                        struct vm_area_struct *vma = priv->vma;

                        if (vma->vm_mm != mm)
                                continue;
                        list_del_init(&priv->list);

                        zap_vma_ptes(vma, vma->vm_start,
                                     vma->vm_end - vma->vm_start);
                }
                mutex_unlock(&ufile->umap_lock);
        skip_mm:
                up_read(&mm->mmap_sem);
                mmput(mm);
        }
}

/*
 * ib_uverbs_open() does not need the BKL:
 *
 *  - the ib_uverbs_device structures are properly reference counted and
 *    everything else is purely local to the file being created, so
 *    races against other open calls are not a problem;
 *  - there is no ioctl method to race against;
 *  - the open method will either immediately return -ENXIO, or all
 *    required initialization will be done.
 */
static int ib_uverbs_open(struct inode *inode, struct file *filp)
{
        struct ib_uverbs_device *dev;
        struct ib_uverbs_file *file;
        struct ib_device *ib_dev;
        int ret;
        int module_dependent;
        int srcu_key;

        dev = container_of(inode->i_cdev, struct ib_uverbs_device, cdev);
        if (!atomic_inc_not_zero(&dev->refcount))
                return -ENXIO;

        get_device(&dev->dev);
        srcu_key = srcu_read_lock(&dev->disassociate_srcu);
        mutex_lock(&dev->lists_mutex);
        ib_dev = srcu_dereference(dev->ib_dev,
                                  &dev->disassociate_srcu);
        if (!ib_dev) {
                ret = -EIO;
                goto err;
        }

        if (!rdma_dev_access_netns(ib_dev, current->nsproxy->net_ns)) {
                ret = -EPERM;
                goto err;
        }

        /* If the IB device supports disassociating the ucontext, there is no
         * hard dependency between the uverbs device and its low-level device.
         */
        module_dependent = !(ib_dev->ops.disassociate_ucontext);

        if (module_dependent) {
                if (!try_module_get(ib_dev->ops.owner)) {
                        ret = -ENODEV;
                        goto err;
                }
        }

        file = kzalloc(sizeof(*file), GFP_KERNEL);
        if (!file) {
                ret = -ENOMEM;
                if (module_dependent)
                        goto err_module;

                goto err;
        }

        file->device     = dev;
        kref_init(&file->ref);
        mutex_init(&file->ucontext_lock);

        spin_lock_init(&file->uobjects_lock);
        INIT_LIST_HEAD(&file->uobjects);
        init_rwsem(&file->hw_destroy_rwsem);
        mutex_init(&file->umap_lock);
        INIT_LIST_HEAD(&file->umaps);

        filp->private_data = file;
        list_add_tail(&file->list, &dev->uverbs_file_list);
        mutex_unlock(&dev->lists_mutex);
        srcu_read_unlock(&dev->disassociate_srcu, srcu_key);

        setup_ufile_idr_uobject(file);

        return stream_open(inode, filp);

err_module:
        module_put(ib_dev->ops.owner);

err:
        mutex_unlock(&dev->lists_mutex);
        srcu_read_unlock(&dev->disassociate_srcu, srcu_key);
        if (atomic_dec_and_test(&dev->refcount))
                ib_uverbs_comp_dev(dev);

        put_device(&dev->dev);
        return ret;
}

static int ib_uverbs_close(struct inode *inode, struct file *filp)
{
        struct ib_uverbs_file *file = filp->private_data;

        uverbs_destroy_ufile_hw(file, RDMA_REMOVE_CLOSE);

        mutex_lock(&file->device->lists_mutex);
        list_del_init(&file->list);
        mutex_unlock(&file->device->lists_mutex);

        kref_put(&file->ref, ib_uverbs_release_file);

        return 0;
}

static const struct file_operations uverbs_fops = {
        .owner   = THIS_MODULE,
        .write   = ib_uverbs_write,
        .open    = ib_uverbs_open,
        .release = ib_uverbs_close,
        .llseek  = no_llseek,
        .unlocked_ioctl = ib_uverbs_ioctl,
        .compat_ioctl = ib_uverbs_ioctl,
};

static const struct file_operations uverbs_mmap_fops = {
        .owner   = THIS_MODULE,
        .write   = ib_uverbs_write,
        .mmap    = ib_uverbs_mmap,
        .open    = ib_uverbs_open,
        .release = ib_uverbs_close,
        .llseek  = no_llseek,
        .unlocked_ioctl = ib_uverbs_ioctl,
        .compat_ioctl = ib_uverbs_ioctl,
};

static int ib_uverbs_get_nl_info(struct ib_device *ibdev, void *client_data,
                                 struct ib_client_nl_info *res)
{
        struct ib_uverbs_device *uverbs_dev = client_data;
        int ret;

        if (res->port != -1)
                return -EINVAL;

        res->abi = ibdev->ops.uverbs_abi_ver;
        res->cdev = &uverbs_dev->dev;

        /*
         * To support DRIVER_ID binding in userspace some of the drivers need
         * upgrading to expose their PCI dependent revision information
         * through get_context instead of relying on modalias matching. When
         * the drivers are fixed they can drop this flag.
         */
        if (!ibdev->ops.uverbs_no_driver_id_binding) {
                ret = nla_put_u32(res->nl_msg, RDMA_NLDEV_ATTR_UVERBS_DRIVER_ID,
                                  ibdev->ops.driver_id);
                if (ret)
                        return ret;
        }
        return 0;
}

static struct ib_client uverbs_client = {
        .name   = "uverbs",
        .no_kverbs_req = true,
        .add    = ib_uverbs_add_one,
        .remove = ib_uverbs_remove_one,
        .get_nl_info = ib_uverbs_get_nl_info,
};
MODULE_ALIAS_RDMA_CLIENT("uverbs");

static ssize_t ibdev_show(struct device *device, struct device_attribute *attr,
                          char *buf)
{
        struct ib_uverbs_device *dev =
                        container_of(device, struct ib_uverbs_device, dev);
        int ret = -ENODEV;
        int srcu_key;
        struct ib_device *ib_dev;

        srcu_key = srcu_read_lock(&dev->disassociate_srcu);
        ib_dev = srcu_dereference(dev->ib_dev, &dev->disassociate_srcu);
        if (ib_dev)
                ret = sprintf(buf, "%s\n", dev_name(&ib_dev->dev));
        srcu_read_unlock(&dev->disassociate_srcu, srcu_key);

        return ret;
}
static DEVICE_ATTR_RO(ibdev);

static ssize_t abi_version_show(struct device *device,
                                struct device_attribute *attr, char *buf)
{
        struct ib_uverbs_device *dev =
                        container_of(device, struct ib_uverbs_device, dev);
        int ret = -ENODEV;
        int srcu_key;
        struct ib_device *ib_dev;

        srcu_key = srcu_read_lock(&dev->disassociate_srcu);
        ib_dev = srcu_dereference(dev->ib_dev, &dev->disassociate_srcu);
        if (ib_dev)
                ret = sprintf(buf, "%u\n", ib_dev->ops.uverbs_abi_ver);
        srcu_read_unlock(&dev->disassociate_srcu, srcu_key);

        return ret;
}
static DEVICE_ATTR_RO(abi_version);

static struct attribute *ib_dev_attrs[] = {
        &dev_attr_abi_version.attr,
        &dev_attr_ibdev.attr,
        NULL,
};

static const struct attribute_group dev_attr_group = {
        .attrs = ib_dev_attrs,
};

static CLASS_ATTR_STRING(abi_version, S_IRUGO,
                         __stringify(IB_USER_VERBS_ABI_VERSION));

static int ib_uverbs_create_uapi(struct ib_device *device,
                                 struct ib_uverbs_device *uverbs_dev)
{
        struct uverbs_api *uapi;

        uapi = uverbs_alloc_api(device);
        if (IS_ERR(uapi))
                return PTR_ERR(uapi);

        uverbs_dev->uapi = uapi;
        return 0;
}

static void ib_uverbs_add_one(struct ib_device *device)
{
        int devnum;
        dev_t base;
        struct ib_uverbs_device *uverbs_dev;
        int ret;

        if (!device->ops.alloc_ucontext)
                return;

        uverbs_dev = kzalloc(sizeof(*uverbs_dev), GFP_KERNEL);
        if (!uverbs_dev)
                return;

        ret = init_srcu_struct(&uverbs_dev->disassociate_srcu);
        if (ret) {
                kfree(uverbs_dev);
                return;
        }

        device_initialize(&uverbs_dev->dev);
        uverbs_dev->dev.class = uverbs_class;
        uverbs_dev->dev.parent = device->dev.parent;
        uverbs_dev->dev.release = ib_uverbs_release_dev;
        uverbs_dev->groups[0] = &dev_attr_group;
        uverbs_dev->dev.groups = uverbs_dev->groups;
        atomic_set(&uverbs_dev->refcount, 1);
        init_completion(&uverbs_dev->comp);
        uverbs_dev->xrcd_tree = RB_ROOT;
        mutex_init(&uverbs_dev->xrcd_tree_mutex);
        mutex_init(&uverbs_dev->lists_mutex);
        INIT_LIST_HEAD(&uverbs_dev->uverbs_file_list);
        INIT_LIST_HEAD(&uverbs_dev->uverbs_events_file_list);
        rcu_assign_pointer(uverbs_dev->ib_dev, device);
        uverbs_dev->num_comp_vectors = device->num_comp_vectors;

        devnum = ida_alloc_max(&uverbs_ida, IB_UVERBS_MAX_DEVICES - 1,
                               GFP_KERNEL);
        if (devnum < 0)
                goto err;
        uverbs_dev->devnum = devnum;
        if (devnum >= IB_UVERBS_NUM_FIXED_MINOR)
                base = dynamic_uverbs_dev + devnum - IB_UVERBS_NUM_FIXED_MINOR;
        else
                base = IB_UVERBS_BASE_DEV + devnum;

        if (ib_uverbs_create_uapi(device, uverbs_dev))
                goto err_uapi;

        uverbs_dev->dev.devt = base;
        dev_set_name(&uverbs_dev->dev, "uverbs%d", uverbs_dev->devnum);

        cdev_init(&uverbs_dev->cdev,
                  device->ops.mmap ? &uverbs_mmap_fops : &uverbs_fops);
        uverbs_dev->cdev.owner = THIS_MODULE;

        ret = cdev_device_add(&uverbs_dev->cdev, &uverbs_dev->dev);
        if (ret)
                goto err_uapi;

        ib_set_client_data(device, &uverbs_client, uverbs_dev);
        return;

err_uapi:
        ida_free(&uverbs_ida, devnum);
err:
        if (atomic_dec_and_test(&uverbs_dev->refcount))
                ib_uverbs_comp_dev(uverbs_dev);
        wait_for_completion(&uverbs_dev->comp);
        put_device(&uverbs_dev->dev);
        return;
}

static void ib_uverbs_free_hw_resources(struct ib_uverbs_device *uverbs_dev,
                                        struct ib_device *ib_dev)
{
        struct ib_uverbs_file *file;
        struct ib_uverbs_async_event_file *event_file;
        struct ib_event event;

        /* Cause pending running commands to terminate */
        uverbs_disassociate_api_pre(uverbs_dev);
        event.event = IB_EVENT_DEVICE_FATAL;
        event.element.port_num = 0;
        event.device = ib_dev;

        mutex_lock(&uverbs_dev->lists_mutex);
        while (!list_empty(&uverbs_dev->uverbs_file_list)) {
                file = list_first_entry(&uverbs_dev->uverbs_file_list,
                                        struct ib_uverbs_file, list);
                list_del_init(&file->list);
                kref_get(&file->ref);

                /* We must release the mutex before going ahead and calling
                 * uverbs_destroy_ufile_hw(), as it might end up indirectly
                 * calling uverbs_close, for example due to freeing the
                 * resources (e.g. mmput).
                 */
                mutex_unlock(&uverbs_dev->lists_mutex);

                ib_uverbs_event_handler(&file->event_handler, &event);
                uverbs_destroy_ufile_hw(file, RDMA_REMOVE_DRIVER_REMOVE);
                kref_put(&file->ref, ib_uverbs_release_file);

                mutex_lock(&uverbs_dev->lists_mutex);
        }

        while (!list_empty(&uverbs_dev->uverbs_events_file_list)) {
                event_file = list_first_entry(&uverbs_dev->
                                              uverbs_events_file_list,
                                              struct ib_uverbs_async_event_file,
                                              list);
                spin_lock_irq(&event_file->ev_queue.lock);
                event_file->ev_queue.is_closed = 1;
                spin_unlock_irq(&event_file->ev_queue.lock);

                list_del(&event_file->list);
                ib_unregister_event_handler(
                        &event_file->uverbs_file->event_handler);
                event_file->uverbs_file->event_handler.device =
                        NULL;

                wake_up_interruptible(&event_file->ev_queue.poll_wait);
                kill_fasync(&event_file->ev_queue.async_queue, SIGIO, POLL_IN);
        }
        mutex_unlock(&uverbs_dev->lists_mutex);

        uverbs_disassociate_api(uverbs_dev->uapi);
}

static void ib_uverbs_remove_one(struct ib_device *device, void *client_data)
{
        struct ib_uverbs_device *uverbs_dev = client_data;
        int wait_clients = 1;

        if (!uverbs_dev)
                return;

        cdev_device_del(&uverbs_dev->cdev, &uverbs_dev->dev);
        ida_free(&uverbs_ida, uverbs_dev->devnum);

        if (device->ops.disassociate_ucontext) {
                /* We disassociate HW resources and immediately return.
                 * Userspace will see an EIO errno for all future access.
                 * Upon returning, ib_device may be freed internally and is not
                 * valid any more.
                 * uverbs_device is still available until all clients close
                 * their files, then the uverbs device ref count will be zero
                 * and its resources will be freed.
                 * Note: At this point no more files can be opened since the
                 * cdev was deleted, however active clients can still issue
                 * commands and close their open files.
                 */
                ib_uverbs_free_hw_resources(uverbs_dev, device);
                wait_clients = 0;
        }

        if (atomic_dec_and_test(&uverbs_dev->refcount))
                ib_uverbs_comp_dev(uverbs_dev);
        if (wait_clients)
                wait_for_completion(&uverbs_dev->comp);

        put_device(&uverbs_dev->dev);
}

static char *uverbs_devnode(struct device *dev, umode_t *mode)
{
        if (mode)
                *mode = 0666;
        return kasprintf(GFP_KERNEL, "infiniband/%s", dev_name(dev));
}
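
/*
 * The devnode callback above is what makes each uverbs device show up as
 * /dev/infiniband/uverbsN with mode 0666. A minimal userspace sketch of
 * reaching ib_uverbs_open() (the index 0 is an arbitrary assumption, and
 * error handling is elided):
 *
 *      int fd = open("/dev/infiniband/uverbs0", O_RDWR | O_CLOEXEC);
 */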

static int __init ib_uverbs_init(void)
{
        int ret;

        ret = register_chrdev_region(IB_UVERBS_BASE_DEV,
                                     IB_UVERBS_NUM_FIXED_MINOR,
                                     "infiniband_verbs");
        if (ret) {
                pr_err("user_verbs: couldn't register device number\n");
                goto out;
        }

        ret = alloc_chrdev_region(&dynamic_uverbs_dev, 0,
                                  IB_UVERBS_NUM_DYNAMIC_MINOR,
                                  "infiniband_verbs");
        if (ret) {
                pr_err("user_verbs: couldn't register dynamic device number\n");
                goto out_alloc;
        }

        uverbs_class = class_create(THIS_MODULE, "infiniband_verbs");
        if (IS_ERR(uverbs_class)) {
                ret = PTR_ERR(uverbs_class);
                pr_err("user_verbs: couldn't create class infiniband_verbs\n");
                goto out_chrdev;
        }

        uverbs_class->devnode = uverbs_devnode;

        ret = class_create_file(uverbs_class, &class_attr_abi_version.attr);
        if (ret) {
                pr_err("user_verbs: couldn't create abi_version attribute\n");
                goto out_class;
        }

        ret = ib_register_client(&uverbs_client);
        if (ret) {
                pr_err("user_verbs: couldn't register client\n");
                goto out_class;
        }

        return 0;

out_class:
        class_destroy(uverbs_class);

out_chrdev:
        unregister_chrdev_region(dynamic_uverbs_dev,
                                 IB_UVERBS_NUM_DYNAMIC_MINOR);

out_alloc:
        unregister_chrdev_region(IB_UVERBS_BASE_DEV,
                                 IB_UVERBS_NUM_FIXED_MINOR);

out:
        return ret;
}

static void __exit ib_uverbs_cleanup(void)
{
        ib_unregister_client(&uverbs_client);
        class_destroy(uverbs_class);
        unregister_chrdev_region(IB_UVERBS_BASE_DEV,
                                 IB_UVERBS_NUM_FIXED_MINOR);
        unregister_chrdev_region(dynamic_uverbs_dev,
                                 IB_UVERBS_NUM_DYNAMIC_MINOR);
}

module_init(ib_uverbs_init);
module_exit(ib_uverbs_cleanup);