linux/drivers/block/aoe/aoedev.c
/* Copyright (c) 2013 Coraid, Inc.  See COPYING for GPL terms. */
/*
 * aoedev.c
 * AoE device utility functions; maintains device list.
 */

#include <linux/hdreg.h>
#include <linux/blkdev.h>
#include <linux/netdevice.h>
#include <linux/delay.h>
#include <linux/slab.h>
#include <linux/bitmap.h>
#include <linux/kdev_t.h>
#include <linux/moduleparam.h>
#include <linux/string.h>
#include "aoe.h"

static void freetgt(struct aoedev *d, struct aoetgt *t);
static void skbpoolfree(struct aoedev *d);

static int aoe_dyndevs = 1;
module_param(aoe_dyndevs, int, 0644);
MODULE_PARM_DESC(aoe_dyndevs, "Use dynamic minor numbers for devices.");

static struct aoedev *devlist;
static DEFINE_SPINLOCK(devlist_lock);

/* Because some systems will have one, many, or no
 *   - partitions,
 *   - slots per shelf,
 *   - or shelves,
 * we need some flexibility in the way the minor numbers
 * are allocated.  So they are dynamic.
 */
#define N_DEVS ((1U<<MINORBITS)/AOE_PARTITIONS)
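/* Example of the resulting arithmetic (assuming the usual values of
 * MINORBITS == 20 and AOE_PARTITIONS == 16): N_DEVS is
 * (1 << 20) / 16 == 65536 whole devices, each owning AOE_PARTITIONS
 * consecutive minor numbers.
 */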

static DEFINE_SPINLOCK(used_minors_lock);
static DECLARE_BITMAP(used_minors, N_DEVS);

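/* Dynamic allocation: hand out the lowest free device slot, returning
 * its first minor number via *sysminor, or -1 when all N_DEVS slots
 * are in use.
 */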
static int
minor_get_dyn(ulong *sysminor)
{
	ulong flags;
	ulong n;
	int error = 0;

	spin_lock_irqsave(&used_minors_lock, flags);
	n = find_first_zero_bit(used_minors, N_DEVS);
	if (n < N_DEVS)
		set_bit(n, used_minors);
	else
		error = -1;
	spin_unlock_irqrestore(&used_minors_lock, flags);

	*sysminor = n * AOE_PARTITIONS;
	return error;
}

static int
minor_get_static(ulong *sysminor, ulong aoemaj, int aoemin)
{
	ulong flags;
	ulong n;
	int error = 0;
	enum {
		/* for backwards compatibility when !aoe_dyndevs,
		 * a static number of supported slots per shelf */
		NPERSHELF = 16,
	};

	if (aoemin >= NPERSHELF) {
		pr_err("aoe: %s %d slots per shelf\n",
			"static minor device numbers support only",
			NPERSHELF);
		error = -1;
		goto out;
	}

	n = aoemaj * NPERSHELF + aoemin;
	if (n >= N_DEVS) {
		pr_err("aoe: %s with e%ld.%d\n",
			"cannot use static minor device numbers",
			aoemaj, aoemin);
		error = -1;
		goto out;
	}

	spin_lock_irqsave(&used_minors_lock, flags);
	if (test_bit(n, used_minors)) {
		pr_err("aoe: %s %lu\n",
			"existing device already has static minor number",
			n);
		error = -1;
	} else
		set_bit(n, used_minors);
	spin_unlock_irqrestore(&used_minors_lock, flags);
	*sysminor = n * AOE_PARTITIONS;
out:
	return error;
}

static int
minor_get(ulong *sysminor, ulong aoemaj, int aoemin)
{
	if (aoe_dyndevs)
		return minor_get_dyn(sysminor);
	else
		return minor_get_static(sysminor, aoemaj, aoemin);
}

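/* Return a device's slot to used_minors so it can be handed out again. */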
static void
minor_free(ulong minor)
{
	ulong flags;

	minor /= AOE_PARTITIONS;
	BUG_ON(minor >= N_DEVS);

	spin_lock_irqsave(&used_minors_lock, flags);
	BUG_ON(!test_bit(minor, used_minors));
	clear_bit(minor, used_minors);
	spin_unlock_irqrestore(&used_minors_lock, flags);
}

/*
 * Users who grab a pointer to the device with aoedev_by_aoeaddr
 * automatically get a reference count and must be responsible
 * for performing an aoedev_put.  With the addition of async
 * kthread processing I'm no longer confident that we can
 * guarantee consistency in the face of device flushes.
 *
 * For the time being, we only bother to add extra references for
 * frames sitting on the iocq.  When the kthreads finish processing
 * these frames, they will aoedev_put the device.
 */
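/* Illustrative caller-side sketch of that contract (not a real call
 * site in this file):
 *
 *	d = aoedev_by_aoeaddr(aoemajor, aoeminor, 0);
 *	if (d) {
 *		... use the device ...
 *		aoedev_put(d);
 *	}
 */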

void
aoedev_put(struct aoedev *d)
{
	ulong flags;

	spin_lock_irqsave(&devlist_lock, flags);
	d->ref--;
	spin_unlock_irqrestore(&devlist_lock, flags);
}

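/* Self-rearming one-second timer; it does no work of its own and stops
 * rearming once the device is marked DEVFL_TKILL.
 */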
static void
dummy_timer(struct timer_list *t)
{
	struct aoedev *d;

	d = from_timer(d, t, timer);
	if (d->flags & DEVFL_TKILL)
		return;
	d->timer.expires = jiffies + HZ;
	add_timer(&d->timer);
}

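/* Fail the in-process buffer and any bios not yet started on the
 * in-process request; the request itself is ended once its remaining
 * bio count (tracked in rq->special) drops to zero.
 */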
static void
aoe_failip(struct aoedev *d)
{
	struct request *rq;
	struct bio *bio;
	unsigned long n;

	aoe_failbuf(d, d->ip.buf);

	rq = d->ip.rq;
	if (rq == NULL)
		return;
	while ((bio = d->ip.nxbio)) {
		bio->bi_status = BLK_STS_IOERR;
		d->ip.nxbio = bio->bi_next;
		n = (unsigned long) rq->special;
		rq->special = (void *) --n;
	}
	if ((unsigned long) rq->special == 0)
		aoe_end_request(d, rq, 0);
}

static void
downdev_frame(struct list_head *pos)
{
	struct frame *f;

	f = list_entry(pos, struct frame, head);
	list_del(pos);
	if (f->buf) {
		f->buf->nframesout--;
		aoe_failbuf(f->t->d, f->buf);
	}
	aoe_freetframe(f);
}

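/* Take the device down: fail every active and to-be-retransmitted
 * frame, reset each target's window, fail the in-process request and
 * all pending I/O, and zero the advertised capacity.
 */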
void
aoedev_downdev(struct aoedev *d)
{
	struct aoetgt *t, **tt, **te;
	struct list_head *head, *pos, *nx;
	struct request *rq;
	int i;

	d->flags &= ~DEVFL_UP;

	/* clean out active and to-be-retransmitted buffers */
	for (i = 0; i < NFACTIVE; i++) {
		head = &d->factive[i];
		list_for_each_safe(pos, nx, head)
			downdev_frame(pos);
	}
	head = &d->rexmitq;
	list_for_each_safe(pos, nx, head)
		downdev_frame(pos);

	/* reset window dressings */
	tt = d->targets;
	te = tt + d->ntargets;
	for (; tt < te && (t = *tt); tt++) {
		aoecmd_wreset(t);
		t->nout = 0;
	}

	/* clean out the in-process request (if any) */
	aoe_failip(d);

	/* fast fail all pending I/O */
	if (d->blkq) {
		while ((rq = blk_peek_request(d->blkq))) {
			blk_start_request(rq);
			aoe_end_request(d, rq, 1);
		}
	}

	if (d->gd)
		set_capacity(d->gd, 0);
}

/* return whether the user asked for this particular
 * device to be flushed
 */
static int
user_req(char *s, size_t slen, struct aoedev *d)
{
	const char *p;
	size_t lim;

	if (!d->gd)
		return 0;
	p = kbasename(d->gd->disk_name);
	lim = sizeof(d->gd->disk_name);
	lim -= p - d->gd->disk_name;
	if (slen < lim)
		lim = slen;

	return !strncmp(s, p, lim);
}

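/* Tear down a device that has been marked DEVFL_TKILL: remove the
 * gendisk and queue, free targets, pools, and the minor number, then
 * mark the aoedev DEVFL_FREED so flush() can unlink and kfree it.
 */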
static void
freedev(struct aoedev *d)
{
	struct aoetgt **t, **e;
	int freeing = 0;
	unsigned long flags;

	spin_lock_irqsave(&d->lock, flags);
	if (d->flags & DEVFL_TKILL
	&& !(d->flags & DEVFL_FREEING)) {
		d->flags |= DEVFL_FREEING;
		freeing = 1;
	}
	spin_unlock_irqrestore(&d->lock, flags);
	if (!freeing)
		return;

	del_timer_sync(&d->timer);
	if (d->gd) {
		aoedisk_rm_debugfs(d);
		aoedisk_rm_sysfs(d);
		del_gendisk(d->gd);
		put_disk(d->gd);
		blk_cleanup_queue(d->blkq);
	}
	t = d->targets;
	e = t + d->ntargets;
	for (; t < e && *t; t++)
		freetgt(d, *t);
	if (d->bufpool)
		mempool_destroy(d->bufpool);
	skbpoolfree(d);
	minor_free(d->sysminor);

	spin_lock_irqsave(&d->lock, flags);
	d->flags |= DEVFL_FREED;
	spin_unlock_irqrestore(&d->lock, flags);
}

enum flush_parms {
	NOT_EXITING = 0,
	EXITING = 1,
};

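/* Flush (or, when exiting, destroy) devices in three passes: pass one
 * takes matching devices down and marks them DEVFL_TKILL without
 * sleeping, pass two calls freedev() (which may sleep), and pass three
 * unlinks and frees aoedevs marked DEVFL_FREED.
 */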
static int
flush(const char __user *str, size_t cnt, int exiting)
{
	ulong flags;
	struct aoedev *d, **dd;
	char buf[16];
	int all = 0;
	int specified = 0;	/* flush a specific device */
	unsigned int skipflags;

	skipflags = DEVFL_GDALLOC | DEVFL_NEWSIZE | DEVFL_TKILL;

	if (!exiting && cnt >= 3) {
		if (cnt > sizeof buf)
			cnt = sizeof buf;
		if (copy_from_user(buf, str, cnt))
			return -EFAULT;
		all = !strncmp(buf, "all", 3);
		if (!all)
			specified = 1;
	}

	flush_scheduled_work();
	/* pass one: without sleeping, do aoedev_downdev */
	spin_lock_irqsave(&devlist_lock, flags);
	for (d = devlist; d; d = d->next) {
		spin_lock(&d->lock);
		if (exiting) {
			/* unconditionally take each device down */
		} else if (specified) {
			if (!user_req(buf, cnt, d))
				goto cont;
		} else if ((!all && (d->flags & DEVFL_UP))
		|| d->flags & skipflags
		|| d->nopen
		|| d->ref)
			goto cont;

		aoedev_downdev(d);
		d->flags |= DEVFL_TKILL;
cont:
		spin_unlock(&d->lock);
	}
	spin_unlock_irqrestore(&devlist_lock, flags);

	/* pass two: call freedev, which might sleep,
	 * for aoedevs marked with DEVFL_TKILL
	 */
restart:
	spin_lock_irqsave(&devlist_lock, flags);
	for (d = devlist; d; d = d->next) {
		spin_lock(&d->lock);
		if (d->flags & DEVFL_TKILL
		&& !(d->flags & DEVFL_FREEING)) {
			spin_unlock(&d->lock);
			spin_unlock_irqrestore(&devlist_lock, flags);
			freedev(d);
			goto restart;
		}
		spin_unlock(&d->lock);
	}

	/* pass three: remove aoedevs marked with DEVFL_FREED */
	for (dd = &devlist, d = *dd; d; d = *dd) {
		struct aoedev *doomed = NULL;

		spin_lock(&d->lock);
		if (d->flags & DEVFL_FREED) {
			*dd = d->next;
			doomed = d;
		} else {
			dd = &d->next;
		}
		spin_unlock(&d->lock);
		if (doomed)
			kfree(doomed->targets);
		kfree(doomed);
	}
	spin_unlock_irqrestore(&devlist_lock, flags);

	return 0;
}

int
aoedev_flush(const char __user *str, size_t cnt)
{
	return flush(str, cnt, NOT_EXITING);
}

/* This has been confirmed to occur once with Tms=3*1000 due to the
 * driver changing link and not processing its transmit ring.  Solving
 * the problem properly, by returning an error, is hard enough that
 * I'm still punting on "solving" this.
 */
static void
skbfree(struct sk_buff *skb)
{
	enum { Sms = 250, Tms = 30 * 1000};
	int i = Tms / Sms;

	if (skb == NULL)
		return;
	while (atomic_read(&skb_shinfo(skb)->dataref) != 1 && i-- > 0)
		msleep(Sms);
	if (i < 0) {
		printk(KERN_ERR
			"aoe: %s holds ref: %s\n",
			skb->dev ? skb->dev->name : "netif",
			"cannot free skb -- memory leaked.");
		return;
	}
	skb->truesize -= skb->data_len;
	skb_shinfo(skb)->nr_frags = skb->data_len = 0;
	skb_trim(skb, 0);
	dev_kfree_skb(skb);
}

static void
skbpoolfree(struct aoedev *d)
{
	struct sk_buff *skb, *tmp;

	skb_queue_walk_safe(&d->skbpool, skb, tmp)
		skbfree(skb);

	__skb_queue_head_init(&d->skbpool);
}

/* find it or allocate it */
struct aoedev *
aoedev_by_aoeaddr(ulong maj, int min, int do_alloc)
{
	struct aoedev *d;
	int i;
	ulong flags;
	ulong sysminor = 0;

	spin_lock_irqsave(&devlist_lock, flags);

	for (d=devlist; d; d=d->next)
		if (d->aoemajor == maj && d->aoeminor == min) {
			spin_lock(&d->lock);
			if (d->flags & DEVFL_TKILL) {
				spin_unlock(&d->lock);
				d = NULL;
				goto out;
			}
			d->ref++;
			spin_unlock(&d->lock);
			break;
		}
	if (d || !do_alloc || minor_get(&sysminor, maj, min) < 0)
		goto out;
	d = kcalloc(1, sizeof *d, GFP_ATOMIC);
	if (!d)
		goto out;
	d->targets = kcalloc(NTARGETS, sizeof(*d->targets), GFP_ATOMIC);
	if (!d->targets) {
		kfree(d);
		d = NULL;
		goto out;
	}
	d->ntargets = NTARGETS;
	INIT_WORK(&d->work, aoecmd_sleepwork);
	spin_lock_init(&d->lock);
	skb_queue_head_init(&d->skbpool);
	timer_setup(&d->timer, dummy_timer, 0);
	d->timer.expires = jiffies + HZ;
	add_timer(&d->timer);
	d->bufpool = NULL;	/* defer to aoeblk_gdalloc */
	d->tgt = d->targets;
	d->ref = 1;
	for (i = 0; i < NFACTIVE; i++)
		INIT_LIST_HEAD(&d->factive[i]);
	INIT_LIST_HEAD(&d->rexmitq);
	d->sysminor = sysminor;
	d->aoemajor = maj;
	d->aoeminor = min;
	d->rttavg = RTTAVG_INIT;
	d->rttdev = RTTDEV_INIT;
	d->next = devlist;
	devlist = d;
 out:
	spin_unlock_irqrestore(&devlist_lock, flags);
	return d;
}

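/* Release a target: drop the references held on its network
 * interfaces and free the frames on its free list before freeing the
 * target itself.
 */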
static void
freetgt(struct aoedev *d, struct aoetgt *t)
{
	struct frame *f;
	struct list_head *pos, *nx, *head;
	struct aoeif *ifp;

	for (ifp = t->ifs; ifp < &t->ifs[NAOEIFS]; ++ifp) {
		if (!ifp->nd)
			break;
		dev_put(ifp->nd);
	}

	head = &t->ffree;
	list_for_each_safe(pos, nx, head) {
		list_del(pos);
		f = list_entry(pos, struct frame, head);
		skbfree(f->skb);
		kfree(f);
	}
	kfree(t);
}

void
aoedev_exit(void)
{
	flush_scheduled_work();
	flush(NULL, 0, EXITING);
}

int __init
aoedev_init(void)
{
	return 0;
}