linux/drivers/block/aoe/aoecmd.c
   1/* Copyright (c) 2013 Coraid, Inc.  See COPYING for GPL terms. */
   2/*
   3 * aoecmd.c
   4 * Filesystem request handling methods
   5 */
   6
   7#include <linux/ata.h>
   8#include <linux/slab.h>
   9#include <linux/hdreg.h>
  10#include <linux/blkdev.h>
  11#include <linux/skbuff.h>
  12#include <linux/netdevice.h>
  13#include <linux/genhd.h>
  14#include <linux/moduleparam.h>
  15#include <linux/workqueue.h>
  16#include <linux/kthread.h>
  17#include <net/net_namespace.h>
  18#include <asm/unaligned.h>
  19#include <linux/uio.h>
  20#include "aoe.h"
  21
  22#define MAXIOC (8192)   /* default meant to avoid most soft lockups */
  23
  24static void ktcomplete(struct frame *, struct sk_buff *);
  25static int count_targets(struct aoedev *d, int *untainted);
  26
  27static struct buf *nextbuf(struct aoedev *);
  28
  29static int aoe_deadsecs = 60 * 3;
  30module_param(aoe_deadsecs, int, 0644);
  31MODULE_PARM_DESC(aoe_deadsecs, "After aoe_deadsecs seconds, give up and fail dev.");
  32
  33static int aoe_maxout = 64;
  34module_param(aoe_maxout, int, 0644);
  35MODULE_PARM_DESC(aoe_maxout,
  36        "Only aoe_maxout outstanding packets for every MAC on eX.Y.");
  37
  38/* The number of online cpus during module initialization gives us a
  39 * convenient heuristic cap on the parallelism used for ktio threads
  40 * doing I/O completion.  It is not important that the cap equal the
  41 * actual number of running CPUs at any given time, but because of CPU
   42 * hotplug, we take care to use ncpus rather than
   43 * num_online_cpus() after module initialization.
  44 */
  45static int ncpus;
  46
   47/* mutex used to serialize ktio thread spawning */
  48static DEFINE_MUTEX(ktio_spawn_lock);
  49
  50static wait_queue_head_t *ktiowq;
  51static struct ktstate *kts;
  52
  53/* io completion queue */
  54struct iocq_ktio {
  55        struct list_head head;
  56        spinlock_t lock;
  57};
  58static struct iocq_ktio *iocq;
  59
  60static struct page *empty_page;
  61
  62static struct sk_buff *
  63new_skb(ulong len)
  64{
  65        struct sk_buff *skb;
  66
  67        skb = alloc_skb(len + MAX_HEADER, GFP_ATOMIC);
  68        if (skb) {
  69                skb_reserve(skb, MAX_HEADER);
  70                skb_reset_mac_header(skb);
  71                skb_reset_network_header(skb);
  72                skb->protocol = __constant_htons(ETH_P_AOE);
  73                skb_checksum_none_assert(skb);
  74        }
  75        return skb;
  76}
  77
  78static struct frame *
  79getframe_deferred(struct aoedev *d, u32 tag)
  80{
  81        struct list_head *head, *pos, *nx;
  82        struct frame *f;
  83
  84        head = &d->rexmitq;
  85        list_for_each_safe(pos, nx, head) {
  86                f = list_entry(pos, struct frame, head);
  87                if (f->tag == tag) {
  88                        list_del(pos);
  89                        return f;
  90                }
  91        }
  92        return NULL;
  93}
  94
  95static struct frame *
  96getframe(struct aoedev *d, u32 tag)
  97{
  98        struct frame *f;
  99        struct list_head *head, *pos, *nx;
 100        u32 n;
 101
 102        n = tag % NFACTIVE;
 103        head = &d->factive[n];
 104        list_for_each_safe(pos, nx, head) {
 105                f = list_entry(pos, struct frame, head);
 106                if (f->tag == tag) {
 107                        list_del(pos);
 108                        return f;
 109                }
 110        }
 111        return NULL;
 112}
 113
 114/*
 115 * Leave the top bit clear so we have tagspace for userland.
 116 * The bottom 16 bits are the xmit tick for rexmit/rttavg processing.
 117 * This driver reserves tag -1 to mean "unused frame."
 118 */
 119static int
 120newtag(struct aoedev *d)
 121{
 122        register ulong n;
 123
 124        n = jiffies & 0xffff;
 125        return n |= (++d->lasttag & 0x7fff) << 16;
 126}
 127
 128static u32
 129aoehdr_atainit(struct aoedev *d, struct aoetgt *t, struct aoe_hdr *h)
 130{
 131        u32 host_tag = newtag(d);
 132
 133        memcpy(h->src, t->ifp->nd->dev_addr, sizeof h->src);
 134        memcpy(h->dst, t->addr, sizeof h->dst);
 135        h->type = __constant_cpu_to_be16(ETH_P_AOE);
 136        h->verfl = AOE_HVER;
 137        h->major = cpu_to_be16(d->aoemajor);
 138        h->minor = d->aoeminor;
 139        h->cmd = AOECMD_ATA;
 140        h->tag = cpu_to_be32(host_tag);
 141
 142        return host_tag;
 143}
 144
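/* Store the low 48 bits of the LBA into the six single-byte lba
 * fields of the ATA header, least significant byte first.
 */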
 145static inline void
 146put_lba(struct aoe_atahdr *ah, sector_t lba)
 147{
 148        ah->lba0 = lba;
 149        ah->lba1 = lba >>= 8;
 150        ah->lba2 = lba >>= 8;
 151        ah->lba3 = lba >>= 8;
 152        ah->lba4 = lba >>= 8;
 153        ah->lba5 = lba >>= 8;
 154}
 155
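/* Advance the target's round-robin interface pointer to the next
 * slot that has a network device, wrapping to the start of the
 * array.  Returns NULL if the target has no usable interface left.
 */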
 156static struct aoeif *
 157ifrotate(struct aoetgt *t)
 158{
 159        struct aoeif *ifp;
 160
 161        ifp = t->ifp;
 162        ifp++;
 163        if (ifp >= &t->ifs[NAOEIFS] || ifp->nd == NULL)
 164                ifp = t->ifs;
 165        if (ifp->nd == NULL)
 166                return NULL;
 167        return t->ifp = ifp;
 168}
 169
 170static void
 171skb_pool_put(struct aoedev *d, struct sk_buff *skb)
 172{
 173        __skb_queue_tail(&d->skbpool, skb);
 174}
 175
 176static struct sk_buff *
 177skb_pool_get(struct aoedev *d)
 178{
 179        struct sk_buff *skb = skb_peek(&d->skbpool);
 180
 181        if (skb && atomic_read(&skb_shinfo(skb)->dataref) == 1) {
 182                __skb_unlink(skb, &d->skbpool);
 183                return skb;
 184        }
 185        if (skb_queue_len(&d->skbpool) < NSKBPOOLMAX &&
 186            (skb = new_skb(ETH_ZLEN)))
 187                return skb;
 188
 189        return NULL;
 190}
 191
 192void
 193aoe_freetframe(struct frame *f)
 194{
 195        struct aoetgt *t;
 196
 197        t = f->t;
 198        f->buf = NULL;
 199        memset(&f->iter, 0, sizeof(f->iter));
 200        f->r_skb = NULL;
 201        f->flags = 0;
 202        list_add(&f->head, &t->ffree);
 203}
 204
 205static struct frame *
 206newtframe(struct aoedev *d, struct aoetgt *t)
 207{
 208        struct frame *f;
 209        struct sk_buff *skb;
 210        struct list_head *pos;
 211
 212        if (list_empty(&t->ffree)) {
 213                if (t->falloc >= NSKBPOOLMAX*2)
 214                        return NULL;
 215                f = kcalloc(1, sizeof(*f), GFP_ATOMIC);
 216                if (f == NULL)
 217                        return NULL;
 218                t->falloc++;
 219                f->t = t;
 220        } else {
 221                pos = t->ffree.next;
 222                list_del(pos);
 223                f = list_entry(pos, struct frame, head);
 224        }
 225
 226        skb = f->skb;
 227        if (skb == NULL) {
 228                f->skb = skb = new_skb(ETH_ZLEN);
 229                if (!skb) {
 230bail:                   aoe_freetframe(f);
 231                        return NULL;
 232                }
 233        }
 234
 235        if (atomic_read(&skb_shinfo(skb)->dataref) != 1) {
 236                skb = skb_pool_get(d);
 237                if (skb == NULL)
 238                        goto bail;
 239                skb_pool_put(d, f->skb);
 240                f->skb = skb;
 241        }
 242
 243        skb->truesize -= skb->data_len;
 244        skb_shinfo(skb)->nr_frags = skb->data_len = 0;
 245        skb_trim(skb, 0);
 246        return f;
 247}
 248
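/* Allocate a frame on the next usable target, rotating from the
 * target used last.  Targets already at their outstanding-frame
 * limit are skipped, and tainted targets are tried only when no
 * untainted target is available.  If no frame can be had and
 * nothing is outstanding on untainted targets, mark the device so
 * the request queue gets kicked later.
 */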
 249static struct frame *
 250newframe(struct aoedev *d)
 251{
 252        struct frame *f;
 253        struct aoetgt *t, **tt;
 254        int totout = 0;
 255        int use_tainted;
 256        int has_untainted;
 257
 258        if (!d->targets || !d->targets[0]) {
 259                printk(KERN_ERR "aoe: NULL TARGETS!\n");
 260                return NULL;
 261        }
 262        tt = d->tgt;    /* last used target */
 263        for (use_tainted = 0, has_untainted = 0;;) {
 264                tt++;
 265                if (tt >= &d->targets[d->ntargets] || !*tt)
 266                        tt = d->targets;
 267                t = *tt;
 268                if (!t->taint) {
 269                        has_untainted = 1;
 270                        totout += t->nout;
 271                }
 272                if (t->nout < t->maxout
 273                && (use_tainted || !t->taint)
 274                && t->ifp->nd) {
 275                        f = newtframe(d, t);
 276                        if (f) {
 277                                ifrotate(t);
 278                                d->tgt = tt;
 279                                return f;
 280                        }
 281                }
 282                if (tt == d->tgt) {     /* we've looped and found nada */
 283                        if (!use_tainted && !has_untainted)
 284                                use_tainted = 1;
 285                        else
 286                                break;
 287                }
 288        }
 289        if (totout == 0) {
 290                d->kicked++;
 291                d->flags |= DEVFL_KICKME;
 292        }
 293        return NULL;
 294}
 295
 296static void
 297skb_fillup(struct sk_buff *skb, struct bio *bio, struct bvec_iter iter)
 298{
 299        int frag = 0;
 300        struct bio_vec bv;
 301
 302        __bio_for_each_segment(bv, bio, iter, iter)
 303                skb_fill_page_desc(skb, frag++, bv.bv_page,
 304                                   bv.bv_offset, bv.bv_len);
 305}
 306
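/* Hash an outstanding frame onto one of the NFACTIVE active lists
 * by its tag so that getframe can find it when the response arrives.
 */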
 307static void
 308fhash(struct frame *f)
 309{
 310        struct aoedev *d = f->t->d;
 311        u32 n;
 312
 313        n = f->tag % NFACTIVE;
 314        list_add_tail(&f->head, &d->factive[n]);
 315}
 316
 317static void
 318ata_rw_frameinit(struct frame *f)
 319{
 320        struct aoetgt *t;
 321        struct aoe_hdr *h;
 322        struct aoe_atahdr *ah;
 323        struct sk_buff *skb;
 324        char writebit, extbit;
 325
 326        skb = f->skb;
 327        h = (struct aoe_hdr *) skb_mac_header(skb);
 328        ah = (struct aoe_atahdr *) (h + 1);
 329        skb_put(skb, sizeof(*h) + sizeof(*ah));
 330        memset(h, 0, skb->len);
 331
 332        writebit = 0x10;
 333        extbit = 0x4;
 334
 335        t = f->t;
 336        f->tag = aoehdr_atainit(t->d, t, h);
 337        fhash(f);
 338        t->nout++;
 339        f->waited = 0;
 340        f->waited_total = 0;
 341
 342        /* set up ata header */
 343        ah->scnt = f->iter.bi_size >> 9;
 344        put_lba(ah, f->iter.bi_sector);
 345        if (t->d->flags & DEVFL_EXT) {
 346                ah->aflags |= AOEAFL_EXT;
 347        } else {
 348                extbit = 0;
 349                ah->lba3 &= 0x0f;
 350                ah->lba3 |= 0xe0;       /* LBA bit + obsolete 0xa0 */
 351        }
 352        if (f->buf && bio_data_dir(f->buf->bio) == WRITE) {
 353                skb_fillup(skb, f->buf->bio, f->iter);
 354                ah->aflags |= AOEAFL_WRITE;
 355                skb->len += f->iter.bi_size;
 356                skb->data_len = f->iter.bi_size;
 357                skb->truesize += f->iter.bi_size;
 358                t->wpkts++;
 359        } else {
 360                t->rpkts++;
 361                writebit = 0;
 362        }
 363
 364        ah->cmdstat = ATA_CMD_PIO_READ | writebit | extbit;
 365        skb->dev = t->ifp->nd;
 366}
 367
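/* Build and send one ATA read/write frame for the device's current
 * buf.  Returns 1 when a frame was set up, so the caller should
 * loop for more, and 0 when there is nothing to do or no frame is
 * available.
 */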
 368static int
 369aoecmd_ata_rw(struct aoedev *d)
 370{
 371        struct frame *f;
 372        struct buf *buf;
 373        struct sk_buff *skb;
 374        struct sk_buff_head queue;
 375
 376        buf = nextbuf(d);
 377        if (buf == NULL)
 378                return 0;
 379        f = newframe(d);
 380        if (f == NULL)
 381                return 0;
 382
 383        /* initialize the headers & frame */
 384        f->buf = buf;
 385        f->iter = buf->iter;
 386        f->iter.bi_size = min_t(unsigned long,
 387                                d->maxbcnt ?: DEFAULTBCNT,
 388                                f->iter.bi_size);
 389        bio_advance_iter(buf->bio, &buf->iter, f->iter.bi_size);
 390
 391        if (!buf->iter.bi_size)
 392                d->ip.buf = NULL;
 393
 394        /* mark all tracking fields and load out */
 395        buf->nframesout += 1;
 396
 397        ata_rw_frameinit(f);
 398
 399        skb = skb_clone(f->skb, GFP_ATOMIC);
 400        if (skb) {
 401                do_gettimeofday(&f->sent);
 402                f->sent_jiffs = (u32) jiffies;
 403                __skb_queue_head_init(&queue);
 404                __skb_queue_tail(&queue, skb);
 405                aoenet_xmit(&queue);
 406        }
 407        return 1;
 408}
 409
  410/* Some callers cannot sleep; they can call this function to build the
  411 * config packets and then transmit them later, when interrupts are on.
  412 */
 413static void
 414aoecmd_cfg_pkts(ushort aoemajor, unsigned char aoeminor, struct sk_buff_head *queue)
 415{
 416        struct aoe_hdr *h;
 417        struct aoe_cfghdr *ch;
 418        struct sk_buff *skb;
 419        struct net_device *ifp;
 420
 421        rcu_read_lock();
 422        for_each_netdev_rcu(&init_net, ifp) {
 423                dev_hold(ifp);
 424                if (!is_aoe_netif(ifp))
 425                        goto cont;
 426
 427                skb = new_skb(sizeof *h + sizeof *ch);
 428                if (skb == NULL) {
 429                        printk(KERN_INFO "aoe: skb alloc failure\n");
 430                        goto cont;
 431                }
 432                skb_put(skb, sizeof *h + sizeof *ch);
 433                skb->dev = ifp;
 434                __skb_queue_tail(queue, skb);
 435                h = (struct aoe_hdr *) skb_mac_header(skb);
 436                memset(h, 0, sizeof *h + sizeof *ch);
 437
 438                memset(h->dst, 0xff, sizeof h->dst);
 439                memcpy(h->src, ifp->dev_addr, sizeof h->src);
 440                h->type = __constant_cpu_to_be16(ETH_P_AOE);
 441                h->verfl = AOE_HVER;
 442                h->major = cpu_to_be16(aoemajor);
 443                h->minor = aoeminor;
 444                h->cmd = AOECMD_CFG;
 445
 446cont:
 447                dev_put(ifp);
 448        }
 449        rcu_read_unlock();
 450}
 451
 452static void
 453resend(struct aoedev *d, struct frame *f)
 454{
 455        struct sk_buff *skb;
 456        struct sk_buff_head queue;
 457        struct aoe_hdr *h;
 458        struct aoetgt *t;
 459        char buf[128];
 460        u32 n;
 461
 462        t = f->t;
 463        n = newtag(d);
 464        skb = f->skb;
 465        if (ifrotate(t) == NULL) {
 466                /* probably can't happen, but set it up to fail anyway */
 467                pr_info("aoe: resend: no interfaces to rotate to.\n");
 468                ktcomplete(f, NULL);
 469                return;
 470        }
 471        h = (struct aoe_hdr *) skb_mac_header(skb);
 472
 473        if (!(f->flags & FFL_PROBE)) {
 474                snprintf(buf, sizeof(buf),
 475                        "%15s e%ld.%d oldtag=%08x@%08lx newtag=%08x s=%pm d=%pm nout=%d\n",
 476                        "retransmit", d->aoemajor, d->aoeminor,
 477                        f->tag, jiffies, n,
 478                        h->src, h->dst, t->nout);
 479                aoechr_error(buf);
 480        }
 481
 482        f->tag = n;
 483        fhash(f);
 484        h->tag = cpu_to_be32(n);
 485        memcpy(h->dst, t->addr, sizeof h->dst);
 486        memcpy(h->src, t->ifp->nd->dev_addr, sizeof h->src);
 487
 488        skb->dev = t->ifp->nd;
 489        skb = skb_clone(skb, GFP_ATOMIC);
 490        if (skb == NULL)
 491                return;
 492        do_gettimeofday(&f->sent);
 493        f->sent_jiffs = (u32) jiffies;
 494        __skb_queue_head_init(&queue);
 495        __skb_queue_tail(&queue, skb);
 496        aoenet_xmit(&queue);
 497}
 498
 499static int
 500tsince_hr(struct frame *f)
 501{
 502        struct timeval now;
 503        int n;
 504
 505        do_gettimeofday(&now);
 506        n = now.tv_usec - f->sent.tv_usec;
 507        n += (now.tv_sec - f->sent.tv_sec) * USEC_PER_SEC;
 508
 509        if (n < 0)
 510                n = -n;
 511
 512        /* For relatively long periods, use jiffies to avoid
 513         * discrepancies caused by updates to the system time.
 514         *
  515 * On a system with HZ of 1000, 32 bits is over 49 days'
  516 * worth of jiffies, or over 71 minutes' worth of usecs.
 517         *
 518         * Jiffies overflow is handled by subtraction of unsigned ints:
 519         * (gdb) print (unsigned) 2 - (unsigned) 0xfffffffe
 520         * $3 = 4
 521         * (gdb)
 522         */
 523        if (n > USEC_PER_SEC / 4) {
 524                n = ((u32) jiffies) - f->sent_jiffs;
 525                n *= USEC_PER_SEC / HZ;
 526        }
 527
 528        return n;
 529}
 530
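/* Approximate age of a tag in microseconds, computed from the
 * 16-bit send-time ticks kept in the low bits of the tag (see
 * newtag).  Used when the original frame can no longer be found.
 */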
 531static int
 532tsince(u32 tag)
 533{
 534        int n;
 535
 536        n = jiffies & 0xffff;
 537        n -= tag & 0xffff;
 538        if (n < 0)
 539                n += 1<<16;
 540        return jiffies_to_usecs(n + 1);
 541}
 542
 543static struct aoeif *
 544getif(struct aoetgt *t, struct net_device *nd)
 545{
 546        struct aoeif *p, *e;
 547
 548        p = t->ifs;
 549        e = p + NAOEIFS;
 550        for (; p < e; p++)
 551                if (p->nd == nd)
 552                        return p;
 553        return NULL;
 554}
 555
 556static void
 557ejectif(struct aoetgt *t, struct aoeif *ifp)
 558{
 559        struct aoeif *e;
 560        struct net_device *nd;
 561        ulong n;
 562
 563        nd = ifp->nd;
 564        e = t->ifs + NAOEIFS - 1;
 565        n = (e - ifp) * sizeof *ifp;
 566        memmove(ifp, ifp+1, n);
 567        e->nd = NULL;
 568        dev_put(nd);
 569}
 570
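/* Hand a frame's outstanding work to a different target: allocate
 * a new frame (newframe picks another target), move the original
 * skb, buf, iterator, and timing over to it, and leave the fresh
 * skb with the old frame.  Returns NULL when no other target can
 * take the work.
 */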
 571static struct frame *
 572reassign_frame(struct frame *f)
 573{
 574        struct frame *nf;
 575        struct sk_buff *skb;
 576
 577        nf = newframe(f->t->d);
 578        if (!nf)
 579                return NULL;
 580        if (nf->t == f->t) {
 581                aoe_freetframe(nf);
 582                return NULL;
 583        }
 584
 585        skb = nf->skb;
 586        nf->skb = f->skb;
 587        nf->buf = f->buf;
 588        nf->iter = f->iter;
 589        nf->waited = 0;
 590        nf->waited_total = f->waited_total;
 591        nf->sent = f->sent;
 592        nf->sent_jiffs = f->sent_jiffs;
 593        f->skb = skb;
 594
 595        return nf;
 596}
 597
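/* Send a throwaway read (FFL_PROBE) of the device's full data
 * frame size, backed by empty_page, to find out whether a tainted
 * target has become usable again.
 */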
 598static void
 599probe(struct aoetgt *t)
 600{
 601        struct aoedev *d;
 602        struct frame *f;
 603        struct sk_buff *skb;
 604        struct sk_buff_head queue;
 605        size_t n, m;
 606        int frag;
 607
 608        d = t->d;
 609        f = newtframe(d, t);
 610        if (!f) {
 611                pr_err("%s %pm for e%ld.%d: %s\n",
 612                        "aoe: cannot probe remote address",
 613                        t->addr,
 614                        (long) d->aoemajor, d->aoeminor,
 615                        "no frame available");
 616                return;
 617        }
 618        f->flags |= FFL_PROBE;
 619        ifrotate(t);
 620        f->iter.bi_size = t->d->maxbcnt ? t->d->maxbcnt : DEFAULTBCNT;
 621        ata_rw_frameinit(f);
 622        skb = f->skb;
 623        for (frag = 0, n = f->iter.bi_size; n > 0; ++frag, n -= m) {
 624                if (n < PAGE_SIZE)
 625                        m = n;
 626                else
 627                        m = PAGE_SIZE;
 628                skb_fill_page_desc(skb, frag, empty_page, 0, m);
 629        }
 630        skb->len += f->iter.bi_size;
 631        skb->data_len = f->iter.bi_size;
 632        skb->truesize += f->iter.bi_size;
 633
 634        skb = skb_clone(f->skb, GFP_ATOMIC);
 635        if (skb) {
 636                do_gettimeofday(&f->sent);
 637                f->sent_jiffs = (u32) jiffies;
 638                __skb_queue_head_init(&queue);
 639                __skb_queue_tail(&queue, skb);
 640                aoenet_xmit(&queue);
 641        }
 642}
 643
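/* Retransmit timeout, in the same units as the RTT samples: twice
 * the smoothed round-trip average plus eight times the smoothed
 * deviation, never less than one.
 */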
 644static long
 645rto(struct aoedev *d)
 646{
 647        long t;
 648
 649        t = 2 * d->rttavg >> RTTSCALE;
 650        t += 8 * d->rttdev >> RTTDSCALE;
 651        if (t == 0)
 652                t = 1;
 653
 654        return t;
 655}
 656
 657static void
 658rexmit_deferred(struct aoedev *d)
 659{
 660        struct aoetgt *t;
 661        struct frame *f;
 662        struct frame *nf;
 663        struct list_head *pos, *nx, *head;
 664        int since;
 665        int untainted;
 666
 667        count_targets(d, &untainted);
 668
 669        head = &d->rexmitq;
 670        list_for_each_safe(pos, nx, head) {
 671                f = list_entry(pos, struct frame, head);
 672                t = f->t;
 673                if (t->taint) {
 674                        if (!(f->flags & FFL_PROBE)) {
 675                                nf = reassign_frame(f);
 676                                if (nf) {
 677                                        if (t->nout_probes == 0
 678                                        && untainted > 0) {
 679                                                probe(t);
 680                                                t->nout_probes++;
 681                                        }
 682                                        list_replace(&f->head, &nf->head);
 683                                        pos = &nf->head;
 684                                        aoe_freetframe(f);
 685                                        f = nf;
 686                                        t = f->t;
 687                                }
 688                        } else if (untainted < 1) {
 689                                /* don't probe w/o other untainted aoetgts */
 690                                goto stop_probe;
 691                        } else if (tsince_hr(f) < t->taint * rto(d)) {
 692                                /* reprobe slowly when taint is high */
 693                                continue;
 694                        }
 695                } else if (f->flags & FFL_PROBE) {
 696stop_probe:             /* don't probe untainted aoetgts */
 697                        list_del(pos);
 698                        aoe_freetframe(f);
 699                        /* leaving d->kicked, because this is routine */
 700                        f->t->d->flags |= DEVFL_KICKME;
 701                        continue;
 702                }
 703                if (t->nout >= t->maxout)
 704                        continue;
 705                list_del(pos);
 706                t->nout++;
 707                if (f->flags & FFL_PROBE)
 708                        t->nout_probes++;
 709                since = tsince_hr(f);
 710                f->waited += since;
 711                f->waited_total += since;
 712                resend(d, f);
 713        }
 714}
 715
 716/* An aoetgt accumulates demerits quickly, and successful
 717 * probing redeems the aoetgt slowly.
 718 */
 719static void
 720scorn(struct aoetgt *t)
 721{
 722        int n;
 723
 724        n = t->taint++;
 725        t->taint += t->taint * 2;
 726        if (n > t->taint)
 727                t->taint = n;
 728        if (t->taint > MAX_TAINT)
 729                t->taint = MAX_TAINT;
 730}
 731
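/* Count the target slots in use for this device; if untainted is
 * not NULL, also report how many of those targets have no taint.
 */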
 732static int
 733count_targets(struct aoedev *d, int *untainted)
 734{
 735        int i, good;
 736
 737        for (i = good = 0; i < d->ntargets && d->targets[i]; ++i)
 738                if (d->targets[i]->taint == 0)
 739                        good++;
 740
 741        if (untainted)
 742                *untainted = good;
 743        return i;
 744}
 745
 746static void
 747rexmit_timer(ulong vp)
 748{
 749        struct aoedev *d;
 750        struct aoetgt *t;
 751        struct aoeif *ifp;
 752        struct frame *f;
 753        struct list_head *head, *pos, *nx;
 754        LIST_HEAD(flist);
 755        register long timeout;
 756        ulong flags, n;
 757        int i;
 758        int utgts;      /* number of aoetgt descriptors (not slots) */
 759        int since;
 760
 761        d = (struct aoedev *) vp;
 762
 763        spin_lock_irqsave(&d->lock, flags);
 764
 765        /* timeout based on observed timings and variations */
 766        timeout = rto(d);
 767
 768        utgts = count_targets(d, NULL);
 769
 770        if (d->flags & DEVFL_TKILL) {
 771                spin_unlock_irqrestore(&d->lock, flags);
 772                return;
 773        }
 774
 775        /* collect all frames to rexmit into flist */
 776        for (i = 0; i < NFACTIVE; i++) {
 777                head = &d->factive[i];
 778                list_for_each_safe(pos, nx, head) {
 779                        f = list_entry(pos, struct frame, head);
 780                        if (tsince_hr(f) < timeout)
 781                                break;  /* end of expired frames */
 782                        /* move to flist for later processing */
 783                        list_move_tail(pos, &flist);
 784                }
 785        }
 786
 787        /* process expired frames */
 788        while (!list_empty(&flist)) {
 789                pos = flist.next;
 790                f = list_entry(pos, struct frame, head);
 791                since = tsince_hr(f);
 792                n = f->waited_total + since;
 793                n /= USEC_PER_SEC;
 794                if (aoe_deadsecs
 795                && n > aoe_deadsecs
 796                && !(f->flags & FFL_PROBE)) {
 797                        /* Waited too long.  Device failure.
 798                         * Hang all frames on first hash bucket for downdev
 799                         * to clean up.
 800                         */
 801                        list_splice(&flist, &d->factive[0]);
 802                        aoedev_downdev(d);
 803                        goto out;
 804                }
 805
 806                t = f->t;
 807                n = f->waited + since;
 808                n /= USEC_PER_SEC;
 809                if (aoe_deadsecs && utgts > 0
 810                && (n > aoe_deadsecs / utgts || n > HARD_SCORN_SECS))
 811                        scorn(t); /* avoid this target */
 812
 813                if (t->maxout != 1) {
 814                        t->ssthresh = t->maxout / 2;
 815                        t->maxout = 1;
 816                }
 817
 818                if (f->flags & FFL_PROBE) {
 819                        t->nout_probes--;
 820                } else {
 821                        ifp = getif(t, f->skb->dev);
 822                        if (ifp && ++ifp->lost > (t->nframes << 1)
 823                        && (ifp != t->ifs || t->ifs[1].nd)) {
 824                                ejectif(t, ifp);
 825                                ifp = NULL;
 826                        }
 827                }
 828                list_move_tail(pos, &d->rexmitq);
 829                t->nout--;
 830        }
 831        rexmit_deferred(d);
 832
 833out:
 834        if ((d->flags & DEVFL_KICKME) && d->blkq) {
 835                d->flags &= ~DEVFL_KICKME;
 836                d->blkq->request_fn(d->blkq);
 837        }
 838
 839        d->timer.expires = jiffies + TIMERTICK;
 840        add_timer(&d->timer);
 841
 842        spin_unlock_irqrestore(&d->lock, flags);
 843}
 844
 845static unsigned long
 846rqbiocnt(struct request *r)
 847{
 848        struct bio *bio;
 849        unsigned long n = 0;
 850
 851        __rq_for_each_bio(bio, r)
 852                n++;
 853        return n;
 854}
 855
 856/* This can be removed if we are certain that no users of the block
 857 * layer will ever use zero-count pages in bios.  Otherwise we have to
 858 * protect against the put_page sometimes done by the network layer.
 859 *
 860 * See http://oss.sgi.com/archives/xfs/2007-01/msg00594.html for
 861 * discussion.
 862 *
 863 * We cannot use get_page in the workaround, because it insists on a
 864 * positive page count as a precondition.  So we use _count directly.
 865 */
 866static void
 867bio_pageinc(struct bio *bio)
 868{
 869        struct bio_vec bv;
 870        struct page *page;
 871        struct bvec_iter iter;
 872
 873        bio_for_each_segment(bv, bio, iter) {
 874                /* Non-zero page count for non-head members of
 875                 * compound pages is no longer allowed by the kernel.
 876                 */
 877                page = compound_head(bv.bv_page);
 878                atomic_inc(&page->_count);
 879        }
 880}
 881
 882static void
 883bio_pagedec(struct bio *bio)
 884{
 885        struct page *page;
 886        struct bio_vec bv;
 887        struct bvec_iter iter;
 888
 889        bio_for_each_segment(bv, bio, iter) {
 890                page = compound_head(bv.bv_page);
 891                atomic_dec(&page->_count);
 892        }
 893}
 894
 895static void
 896bufinit(struct buf *buf, struct request *rq, struct bio *bio)
 897{
 898        memset(buf, 0, sizeof(*buf));
 899        buf->rq = rq;
 900        buf->bio = bio;
 901        buf->iter = bio->bi_iter;
 902        bio_pageinc(bio);
 903}
 904
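/* Return the buf currently being carved into frames, or start the
 * next request from the block queue and wrap its next bio in a
 * freshly allocated buf.  Returns NULL when there is no work or no
 * memory for a buf.
 */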
 905static struct buf *
 906nextbuf(struct aoedev *d)
 907{
 908        struct request *rq;
 909        struct request_queue *q;
 910        struct buf *buf;
 911        struct bio *bio;
 912
 913        q = d->blkq;
 914        if (q == NULL)
 915                return NULL;    /* initializing */
 916        if (d->ip.buf)
 917                return d->ip.buf;
 918        rq = d->ip.rq;
 919        if (rq == NULL) {
 920                rq = blk_peek_request(q);
 921                if (rq == NULL)
 922                        return NULL;
 923                blk_start_request(rq);
 924                d->ip.rq = rq;
 925                d->ip.nxbio = rq->bio;
 926                rq->special = (void *) rqbiocnt(rq);
 927        }
 928        buf = mempool_alloc(d->bufpool, GFP_ATOMIC);
 929        if (buf == NULL) {
 930                pr_err("aoe: nextbuf: unable to mempool_alloc!\n");
 931                return NULL;
 932        }
 933        bio = d->ip.nxbio;
 934        bufinit(buf, rq, bio);
 935        bio = bio->bi_next;
 936        d->ip.nxbio = bio;
 937        if (bio == NULL)
 938                d->ip.rq = NULL;
 939        return d->ip.buf = buf;
 940}
 941
 942/* enters with d->lock held */
 943void
 944aoecmd_work(struct aoedev *d)
 945{
 946        rexmit_deferred(d);
 947        while (aoecmd_ata_rw(d))
 948                ;
 949}
 950
 951/* this function performs work that has been deferred until sleeping is OK
 952 */
 953void
 954aoecmd_sleepwork(struct work_struct *work)
 955{
 956        struct aoedev *d = container_of(work, struct aoedev, work);
 957        struct block_device *bd;
 958        u64 ssize;
 959
 960        if (d->flags & DEVFL_GDALLOC)
 961                aoeblk_gdalloc(d);
 962
 963        if (d->flags & DEVFL_NEWSIZE) {
 964                ssize = get_capacity(d->gd);
 965                bd = bdget_disk(d->gd, 0);
 966                if (bd) {
 967                        mutex_lock(&bd->bd_inode->i_mutex);
 968                        i_size_write(bd->bd_inode, (loff_t)ssize<<9);
 969                        mutex_unlock(&bd->bd_inode->i_mutex);
 970                        bdput(bd);
 971                }
 972                spin_lock_irq(&d->lock);
 973                d->flags |= DEVFL_UP;
 974                d->flags &= ~DEVFL_NEWSIZE;
 975                spin_unlock_irq(&d->lock);
 976        }
 977}
 978
 979static void
 980ata_ident_fixstring(u16 *id, int ns)
 981{
 982        u16 s;
 983
 984        while (ns-- > 0) {
 985                s = *id;
 986                *id++ = s >> 8 | s << 8;
 987        }
 988}
 989
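/* Digest an ATA IDENTIFY DEVICE response: choose the LBA48 or
 * LBA28 capacity, fill in the fake CHS geometry, byte-swap the
 * text fields, and schedule gendisk allocation or resize work as
 * needed.
 */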
 990static void
 991ataid_complete(struct aoedev *d, struct aoetgt *t, unsigned char *id)
 992{
 993        u64 ssize;
 994        u16 n;
 995
 996        /* word 83: command set supported */
 997        n = get_unaligned_le16(&id[83 << 1]);
 998
 999        /* word 86: command set/feature enabled */
1000        n |= get_unaligned_le16(&id[86 << 1]);
1001
1002        if (n & (1<<10)) {      /* bit 10: LBA 48 */
1003                d->flags |= DEVFL_EXT;
1004
1005                /* word 100: number lba48 sectors */
1006                ssize = get_unaligned_le64(&id[100 << 1]);
1007
1008                /* set as in ide-disk.c:init_idedisk_capacity */
1009                d->geo.cylinders = ssize;
1010                d->geo.cylinders /= (255 * 63);
1011                d->geo.heads = 255;
1012                d->geo.sectors = 63;
1013        } else {
1014                d->flags &= ~DEVFL_EXT;
1015
1016                /* number lba28 sectors */
1017                ssize = get_unaligned_le32(&id[60 << 1]);
1018
1019                /* NOTE: obsolete in ATA 6 */
1020                d->geo.cylinders = get_unaligned_le16(&id[54 << 1]);
1021                d->geo.heads = get_unaligned_le16(&id[55 << 1]);
1022                d->geo.sectors = get_unaligned_le16(&id[56 << 1]);
1023        }
1024
1025        ata_ident_fixstring((u16 *) &id[10<<1], 10);    /* serial */
1026        ata_ident_fixstring((u16 *) &id[23<<1], 4);     /* firmware */
1027        ata_ident_fixstring((u16 *) &id[27<<1], 20);    /* model */
1028        memcpy(d->ident, id, sizeof(d->ident));
1029
1030        if (d->ssize != ssize)
1031                printk(KERN_INFO
1032                        "aoe: %pm e%ld.%d v%04x has %llu sectors\n",
1033                        t->addr,
1034                        d->aoemajor, d->aoeminor,
1035                        d->fw_ver, (long long)ssize);
1036        d->ssize = ssize;
1037        d->geo.start = 0;
1038        if (d->flags & (DEVFL_GDALLOC|DEVFL_NEWSIZE))
1039                return;
1040        if (d->gd != NULL) {
1041                set_capacity(d->gd, ssize);
1042                d->flags |= DEVFL_NEWSIZE;
1043        } else
1044                d->flags |= DEVFL_GDALLOC;
1045        schedule_work(&d->work);
1046}
1047
1048static void
1049calc_rttavg(struct aoedev *d, struct aoetgt *t, int rtt)
1050{
1051        register long n;
1052
1053        n = rtt;
1054
1055        /* cf. Congestion Avoidance and Control, Jacobson & Karels, 1988 */
1056        n -= d->rttavg >> RTTSCALE;
1057        d->rttavg += n;
1058        if (n < 0)
1059                n = -n;
1060        n -= d->rttdev >> RTTDSCALE;
1061        d->rttdev += n;
1062
1063        if (!t || t->maxout >= t->nframes)
1064                return;
1065        if (t->maxout < t->ssthresh)
1066                t->maxout += 1;
1067        else if (t->nout == t->maxout && t->next_cwnd-- == 0) {
1068                t->maxout += 1;
1069                t->next_cwnd = t->maxout;
1070        }
1071}
1072
1073static struct aoetgt *
1074gettgt(struct aoedev *d, char *addr)
1075{
1076        struct aoetgt **t, **e;
1077
1078        t = d->targets;
1079        e = t + d->ntargets;
1080        for (; t < e && *t; t++)
1081                if (memcmp((*t)->addr, addr, sizeof((*t)->addr)) == 0)
1082                        return *t;
1083        return NULL;
1084}
1085
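/* Copy cnt bytes of response payload from the skb into the bio's
 * data pages, one segment at a time.
 */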
1086static void
1087bvcpy(struct sk_buff *skb, struct bio *bio, struct bvec_iter iter, long cnt)
1088{
1089        int soff = 0;
1090        struct bio_vec bv;
1091
1092        iter.bi_size = cnt;
1093
1094        __bio_for_each_segment(bv, bio, iter, iter) {
1095                char *p = page_address(bv.bv_page) + bv.bv_offset;
1096                skb_copy_bits(skb, soff, p, bv.bv_len);
1097                soff += bv.bv_len;
1098        }
1099}
1100
1101void
1102aoe_end_request(struct aoedev *d, struct request *rq, int fastfail)
1103{
1104        struct bio *bio;
1105        int bok;
1106        struct request_queue *q;
1107
1108        q = d->blkq;
1109        if (rq == d->ip.rq)
1110                d->ip.rq = NULL;
1111        do {
1112                bio = rq->bio;
1113                bok = !fastfail && test_bit(BIO_UPTODATE, &bio->bi_flags);
1114        } while (__blk_end_request(rq, bok ? 0 : -EIO, bio->bi_iter.bi_size));
1115
1116        /* cf. http://lkml.org/lkml/2006/10/31/28 */
1117        if (!fastfail)
1118                __blk_run_queue(q);
1119}
1120
1121static void
1122aoe_end_buf(struct aoedev *d, struct buf *buf)
1123{
1124        struct request *rq;
1125        unsigned long n;
1126
1127        if (buf == d->ip.buf)
1128                d->ip.buf = NULL;
1129        rq = buf->rq;
1130        bio_pagedec(buf->bio);
1131        mempool_free(buf, d->bufpool);
1132        n = (unsigned long) rq->special;
1133        rq->special = (void *) --n;
1134        if (n == 0)
1135                aoe_end_request(d, rq, 0);
1136}
1137
1138static void
1139ktiocomplete(struct frame *f)
1140{
1141        struct aoe_hdr *hin, *hout;
1142        struct aoe_atahdr *ahin, *ahout;
1143        struct buf *buf;
1144        struct sk_buff *skb;
1145        struct aoetgt *t;
1146        struct aoeif *ifp;
1147        struct aoedev *d;
1148        long n;
1149        int untainted;
1150
1151        if (f == NULL)
1152                return;
1153
1154        t = f->t;
1155        d = t->d;
1156        skb = f->r_skb;
1157        buf = f->buf;
1158        if (f->flags & FFL_PROBE)
1159                goto out;
1160        if (!skb)               /* just fail the buf. */
1161                goto noskb;
1162
1163        hout = (struct aoe_hdr *) skb_mac_header(f->skb);
1164        ahout = (struct aoe_atahdr *) (hout+1);
1165
1166        hin = (struct aoe_hdr *) skb->data;
1167        skb_pull(skb, sizeof(*hin));
1168        ahin = (struct aoe_atahdr *) skb->data;
1169        skb_pull(skb, sizeof(*ahin));
1170        if (ahin->cmdstat & 0xa9) {     /* these bits cleared on success */
1171                pr_err("aoe: ata error cmd=%2.2Xh stat=%2.2Xh from e%ld.%d\n",
1172                        ahout->cmdstat, ahin->cmdstat,
1173                        d->aoemajor, d->aoeminor);
1174noskb:          if (buf)
1175                        clear_bit(BIO_UPTODATE, &buf->bio->bi_flags);
1176                goto out;
1177        }
1178
1179        n = ahout->scnt << 9;
1180        switch (ahout->cmdstat) {
1181        case ATA_CMD_PIO_READ:
1182        case ATA_CMD_PIO_READ_EXT:
1183                if (skb->len < n) {
1184                        pr_err("%s e%ld.%d.  skb->len=%d need=%ld\n",
1185                                "aoe: runt data size in read from",
1186                                (long) d->aoemajor, d->aoeminor,
1187                               skb->len, n);
1188                        clear_bit(BIO_UPTODATE, &buf->bio->bi_flags);
1189                        break;
1190                }
1191                if (n > f->iter.bi_size) {
1192                        pr_err_ratelimited("%s e%ld.%d.  bytes=%ld need=%u\n",
1193                                "aoe: too-large data size in read from",
1194                                (long) d->aoemajor, d->aoeminor,
1195                                n, f->iter.bi_size);
1196                        clear_bit(BIO_UPTODATE, &buf->bio->bi_flags);
1197                        break;
1198                }
1199                bvcpy(skb, f->buf->bio, f->iter, n);
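                /* fall through: reads also reset the interface loss count below */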
1200        case ATA_CMD_PIO_WRITE:
1201        case ATA_CMD_PIO_WRITE_EXT:
1202                spin_lock_irq(&d->lock);
1203                ifp = getif(t, skb->dev);
1204                if (ifp)
1205                        ifp->lost = 0;
1206                spin_unlock_irq(&d->lock);
1207                break;
1208        case ATA_CMD_ID_ATA:
1209                if (skb->len < 512) {
1210                        pr_info("%s e%ld.%d.  skb->len=%d need=512\n",
1211                                "aoe: runt data size in ataid from",
1212                                (long) d->aoemajor, d->aoeminor,
1213                                skb->len);
1214                        break;
1215                }
1216                if (skb_linearize(skb))
1217                        break;
1218                spin_lock_irq(&d->lock);
1219                ataid_complete(d, t, skb->data);
1220                spin_unlock_irq(&d->lock);
1221                break;
1222        default:
1223                pr_info("aoe: unrecognized ata command %2.2Xh for %d.%d\n",
1224                        ahout->cmdstat,
1225                        be16_to_cpu(get_unaligned(&hin->major)),
1226                        hin->minor);
1227        }
1228out:
1229        spin_lock_irq(&d->lock);
1230        if (t->taint > 0
1231        && --t->taint > 0
1232        && t->nout_probes == 0) {
1233                count_targets(d, &untainted);
1234                if (untainted > 0) {
1235                        probe(t);
1236                        t->nout_probes++;
1237                }
1238        }
1239
1240        aoe_freetframe(f);
1241
1242        if (buf && --buf->nframesout == 0 && buf->iter.bi_size == 0)
1243                aoe_end_buf(d, buf);
1244
1245        spin_unlock_irq(&d->lock);
1246        aoedev_put(d);
1247        dev_kfree_skb(skb);
1248}
1249
1250/* Enters with iocq.lock held.
1251 * Returns true iff responses needing processing remain.
1252 */
1253static int
1254ktio(int id)
1255{
1256        struct frame *f;
1257        struct list_head *pos;
1258        int i;
1259        int actual_id;
1260
1261        for (i = 0; ; ++i) {
1262                if (i == MAXIOC)
1263                        return 1;
1264                if (list_empty(&iocq[id].head))
1265                        return 0;
1266                pos = iocq[id].head.next;
1267                list_del(pos);
1268                f = list_entry(pos, struct frame, head);
1269                spin_unlock_irq(&iocq[id].lock);
1270                ktiocomplete(f);
1271
1272                /* Figure out if extra threads are required. */
1273                actual_id = f->t->d->aoeminor % ncpus;
1274
1275                if (!kts[actual_id].active) {
1276                        BUG_ON(id != 0);
1277                        mutex_lock(&ktio_spawn_lock);
1278                        if (!kts[actual_id].active
1279                                && aoe_ktstart(&kts[actual_id]) == 0)
1280                                kts[actual_id].active = 1;
1281                        mutex_unlock(&ktio_spawn_lock);
1282                }
1283                spin_lock_irq(&iocq[id].lock);
1284        }
1285}
1286
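/* Worker loop for a ktstate: run the state's fn under its lock
 * until fn reports no more work, then sleep on the wait queue
 * until woken or told to stop.
 */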
1287static int
1288kthread(void *vp)
1289{
1290        struct ktstate *k;
1291        DECLARE_WAITQUEUE(wait, current);
1292        int more;
1293
1294        k = vp;
1295        current->flags |= PF_NOFREEZE;
1296        set_user_nice(current, -10);
1297        complete(&k->rendez);   /* tell spawner we're running */
1298        do {
1299                spin_lock_irq(k->lock);
1300                more = k->fn(k->id);
1301                if (!more) {
1302                        add_wait_queue(k->waitq, &wait);
1303                        __set_current_state(TASK_INTERRUPTIBLE);
1304                }
1305                spin_unlock_irq(k->lock);
1306                if (!more) {
1307                        schedule();
1308                        remove_wait_queue(k->waitq, &wait);
1309                } else
1310                        cond_resched();
1311        } while (!kthread_should_stop());
1312        complete(&k->rendez);   /* tell spawner we're stopping */
1313        return 0;
1314}
1315
1316void
1317aoe_ktstop(struct ktstate *k)
1318{
1319        kthread_stop(k->task);
1320        wait_for_completion(&k->rendez);
1321}
1322
1323int
1324aoe_ktstart(struct ktstate *k)
1325{
1326        struct task_struct *task;
1327
1328        init_completion(&k->rendez);
1329        task = kthread_run(kthread, k, "%s", k->name);
1330        if (task == NULL || IS_ERR(task))
1331                return -ENOMEM;
1332        k->task = task;
1333        wait_for_completion(&k->rendez); /* allow kthread to start */
1334        init_completion(&k->rendez);    /* for waiting for exit later */
1335        return 0;
1336}
1337
1338/* pass it off to kthreads for processing */
1339static void
1340ktcomplete(struct frame *f, struct sk_buff *skb)
1341{
1342        int id;
1343        ulong flags;
1344
1345        f->r_skb = skb;
1346        id = f->t->d->aoeminor % ncpus;
1347        spin_lock_irqsave(&iocq[id].lock, flags);
1348        if (!kts[id].active) {
1349                spin_unlock_irqrestore(&iocq[id].lock, flags);
1350                /* The thread with id has not been spawned yet,
1351                 * so delegate the work to the main thread and
1352                 * try spawning a new thread.
1353                 */
1354                id = 0;
1355                spin_lock_irqsave(&iocq[id].lock, flags);
1356        }
1357        list_add_tail(&f->head, &iocq[id].head);
1358        spin_unlock_irqrestore(&iocq[id].lock, flags);
1359        wake_up(&ktiowq[id]);
1360}
1361
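/* Handle an incoming ATA response: look up the frame by tag,
 * update the RTT estimate and outstanding counts, restart deferred
 * work, and hand the response to a ktio thread for completion.
 * Returns the skb only when it was not consumed (unknown device or
 * unexpected tag).
 */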
1362struct sk_buff *
1363aoecmd_ata_rsp(struct sk_buff *skb)
1364{
1365        struct aoedev *d;
1366        struct aoe_hdr *h;
1367        struct frame *f;
1368        u32 n;
1369        ulong flags;
1370        char ebuf[128];
1371        u16 aoemajor;
1372
1373        h = (struct aoe_hdr *) skb->data;
1374        aoemajor = be16_to_cpu(get_unaligned(&h->major));
1375        d = aoedev_by_aoeaddr(aoemajor, h->minor, 0);
1376        if (d == NULL) {
1377                snprintf(ebuf, sizeof ebuf, "aoecmd_ata_rsp: ata response "
1378                        "for unknown device %d.%d\n",
1379                        aoemajor, h->minor);
1380                aoechr_error(ebuf);
1381                return skb;
1382        }
1383
1384        spin_lock_irqsave(&d->lock, flags);
1385
1386        n = be32_to_cpu(get_unaligned(&h->tag));
1387        f = getframe(d, n);
1388        if (f) {
1389                calc_rttavg(d, f->t, tsince_hr(f));
1390                f->t->nout--;
1391                if (f->flags & FFL_PROBE)
1392                        f->t->nout_probes--;
1393        } else {
1394                f = getframe_deferred(d, n);
1395                if (f) {
1396                        calc_rttavg(d, NULL, tsince_hr(f));
1397                } else {
1398                        calc_rttavg(d, NULL, tsince(n));
1399                        spin_unlock_irqrestore(&d->lock, flags);
1400                        aoedev_put(d);
1401                        snprintf(ebuf, sizeof(ebuf),
1402                                 "%15s e%d.%d    tag=%08x@%08lx s=%pm d=%pm\n",
1403                                 "unexpected rsp",
1404                                 get_unaligned_be16(&h->major),
1405                                 h->minor,
1406                                 get_unaligned_be32(&h->tag),
1407                                 jiffies,
1408                                 h->src,
1409                                 h->dst);
1410                        aoechr_error(ebuf);
1411                        return skb;
1412                }
1413        }
1414        aoecmd_work(d);
1415
1416        spin_unlock_irqrestore(&d->lock, flags);
1417
1418        ktcomplete(f, skb);
1419
1420        /*
1421         * Note here that we do not perform an aoedev_put, as we are
1422         * leaving this reference for the ktio to release.
1423         */
1424        return NULL;
1425}
1426
1427void
1428aoecmd_cfg(ushort aoemajor, unsigned char aoeminor)
1429{
1430        struct sk_buff_head queue;
1431
1432        __skb_queue_head_init(&queue);
1433        aoecmd_cfg_pkts(aoemajor, aoeminor, &queue);
1434        aoenet_xmit(&queue);
1435}
1436
1437struct sk_buff *
1438aoecmd_ata_id(struct aoedev *d)
1439{
1440        struct aoe_hdr *h;
1441        struct aoe_atahdr *ah;
1442        struct frame *f;
1443        struct sk_buff *skb;
1444        struct aoetgt *t;
1445
1446        f = newframe(d);
1447        if (f == NULL)
1448                return NULL;
1449
1450        t = *d->tgt;
1451
1452        /* initialize the headers & frame */
1453        skb = f->skb;
1454        h = (struct aoe_hdr *) skb_mac_header(skb);
1455        ah = (struct aoe_atahdr *) (h+1);
1456        skb_put(skb, sizeof *h + sizeof *ah);
1457        memset(h, 0, skb->len);
1458        f->tag = aoehdr_atainit(d, t, h);
1459        fhash(f);
1460        t->nout++;
1461        f->waited = 0;
1462        f->waited_total = 0;
1463
1464        /* set up ata header */
1465        ah->scnt = 1;
1466        ah->cmdstat = ATA_CMD_ID_ATA;
1467        ah->lba3 = 0xa0;
1468
1469        skb->dev = t->ifp->nd;
1470
1471        d->rttavg = RTTAVG_INIT;
1472        d->rttdev = RTTDEV_INIT;
1473        d->timer.function = rexmit_timer;
1474
1475        skb = skb_clone(skb, GFP_ATOMIC);
1476        if (skb) {
1477                do_gettimeofday(&f->sent);
1478                f->sent_jiffs = (u32) jiffies;
1479        }
1480
1481        return skb;
1482}
1483
1484static struct aoetgt **
1485grow_targets(struct aoedev *d)
1486{
1487        ulong oldn, newn;
1488        struct aoetgt **tt;
1489
1490        oldn = d->ntargets;
1491        newn = oldn * 2;
1492        tt = kcalloc(newn, sizeof(*d->targets), GFP_ATOMIC);
1493        if (!tt)
1494                return NULL;
1495        memmove(tt, d->targets, sizeof(*d->targets) * oldn);
1496        d->tgt = tt + (d->tgt - d->targets);
1497        kfree(d->targets);
1498        d->targets = tt;
1499        d->ntargets = newn;
1500
1501        return &d->targets[oldn];
1502}
1503
1504static struct aoetgt *
1505addtgt(struct aoedev *d, char *addr, ulong nframes)
1506{
1507        struct aoetgt *t, **tt, **te;
1508
1509        tt = d->targets;
1510        te = tt + d->ntargets;
1511        for (; tt < te && *tt; tt++)
1512                ;
1513
1514        if (tt == te) {
1515                tt = grow_targets(d);
1516                if (!tt)
1517                        goto nomem;
1518        }
1519        t = kzalloc(sizeof(*t), GFP_ATOMIC);
1520        if (!t)
1521                goto nomem;
1522        t->nframes = nframes;
1523        t->d = d;
1524        memcpy(t->addr, addr, sizeof t->addr);
1525        t->ifp = t->ifs;
1526        aoecmd_wreset(t);
1527        t->maxout = t->nframes / 2;
1528        INIT_LIST_HEAD(&t->ffree);
1529        return *tt = t;
1530
1531 nomem:
1532        pr_info("aoe: cannot allocate memory to add target\n");
1533        return NULL;
1534}
1535
1536static void
1537setdbcnt(struct aoedev *d)
1538{
1539        struct aoetgt **t, **e;
1540        int bcnt = 0;
1541
1542        t = d->targets;
1543        e = t + d->ntargets;
1544        for (; t < e && *t; t++)
1545                if (bcnt == 0 || bcnt > (*t)->minbcnt)
1546                        bcnt = (*t)->minbcnt;
1547        if (bcnt != d->maxbcnt) {
1548                d->maxbcnt = bcnt;
1549                pr_info("aoe: e%ld.%d: setting %d byte data frames\n",
1550                        d->aoemajor, d->aoeminor, bcnt);
1551        }
1552}
1553
1554static void
1555setifbcnt(struct aoetgt *t, struct net_device *nd, int bcnt)
1556{
1557        struct aoedev *d;
1558        struct aoeif *p, *e;
1559        int minbcnt;
1560
1561        d = t->d;
1562        minbcnt = bcnt;
1563        p = t->ifs;
1564        e = p + NAOEIFS;
1565        for (; p < e; p++) {
1566                if (p->nd == NULL)
1567                        break;          /* end of the valid interfaces */
1568                if (p->nd == nd) {
1569                        p->bcnt = bcnt; /* we're updating */
1570                        nd = NULL;
1571                } else if (minbcnt > p->bcnt)
1572                        minbcnt = p->bcnt; /* find the min interface */
1573        }
1574        if (nd) {
1575                if (p == e) {
1576                        pr_err("aoe: device setifbcnt failure; too many interfaces.\n");
1577                        return;
1578                }
1579                dev_hold(nd);
1580                p->nd = nd;
1581                p->bcnt = bcnt;
1582        }
1583        t->minbcnt = minbcnt;
1584        setdbcnt(d);
1585}
1586
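/* Handle an AoE config response: create or update the target for
 * the responding MAC address, recompute the per-interface and
 * per-device data frame sizes, and, if the device is not yet open,
 * send an ATA identify to (re)read its capacity.
 */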
1587void
1588aoecmd_cfg_rsp(struct sk_buff *skb)
1589{
1590        struct aoedev *d;
1591        struct aoe_hdr *h;
1592        struct aoe_cfghdr *ch;
1593        struct aoetgt *t;
1594        ulong flags, aoemajor;
1595        struct sk_buff *sl;
1596        struct sk_buff_head queue;
1597        u16 n;
1598
1599        sl = NULL;
1600        h = (struct aoe_hdr *) skb_mac_header(skb);
1601        ch = (struct aoe_cfghdr *) (h+1);
1602
1603        /*
1604         * Enough people have their dip switches set backwards to
1605         * warrant a loud message for this special case.
1606         */
1607        aoemajor = get_unaligned_be16(&h->major);
1608        if (aoemajor == 0xfff) {
1609                printk(KERN_ERR "aoe: Warning: shelf address is all ones.  "
1610                        "Check shelf dip switches.\n");
1611                return;
1612        }
1613        if (aoemajor == 0xffff) {
1614                pr_info("aoe: e%ld.%d: broadcast shelf number invalid\n",
1615                        aoemajor, (int) h->minor);
1616                return;
1617        }
1618        if (h->minor == 0xff) {
1619                pr_info("aoe: e%ld.%d: broadcast slot number invalid\n",
1620                        aoemajor, (int) h->minor);
1621                return;
1622        }
1623
1624        n = be16_to_cpu(ch->bufcnt);
1625        if (n > aoe_maxout)     /* keep it reasonable */
1626                n = aoe_maxout;
1627
1628        d = aoedev_by_aoeaddr(aoemajor, h->minor, 1);
1629        if (d == NULL) {
1630                pr_info("aoe: device allocation failure\n");
1631                return;
1632        }
1633
1634        spin_lock_irqsave(&d->lock, flags);
1635
1636        t = gettgt(d, h->src);
1637        if (t) {
1638                t->nframes = n;
1639                if (n < t->maxout)
1640                        aoecmd_wreset(t);
1641        } else {
1642                t = addtgt(d, h->src, n);
1643                if (!t)
1644                        goto bail;
1645        }
1646        n = skb->dev->mtu;
1647        n -= sizeof(struct aoe_hdr) + sizeof(struct aoe_atahdr);
1648        n /= 512;
1649        if (n > ch->scnt)
1650                n = ch->scnt;
1651        n = n ? n * 512 : DEFAULTBCNT;
1652        setifbcnt(t, skb->dev, n);
1653
1654        /* don't change users' perspective */
1655        if (d->nopen == 0) {
1656                d->fw_ver = be16_to_cpu(ch->fwver);
1657                sl = aoecmd_ata_id(d);
1658        }
1659bail:
1660        spin_unlock_irqrestore(&d->lock, flags);
1661        aoedev_put(d);
1662        if (sl) {
1663                __skb_queue_head_init(&queue);
1664                __skb_queue_tail(&queue, sl);
1665                aoenet_xmit(&queue);
1666        }
1667}
1668
1669void
1670aoecmd_wreset(struct aoetgt *t)
1671{
1672        t->maxout = 1;
1673        t->ssthresh = t->nframes / 2;
1674        t->next_cwnd = t->nframes;
1675}
1676
1677void
1678aoecmd_cleanslate(struct aoedev *d)
1679{
1680        struct aoetgt **t, **te;
1681
1682        d->rttavg = RTTAVG_INIT;
1683        d->rttdev = RTTDEV_INIT;
1684        d->maxbcnt = 0;
1685
1686        t = d->targets;
1687        te = t + d->ntargets;
1688        for (; t < te && *t; t++)
1689                aoecmd_wreset(*t);
1690}
1691
1692void
1693aoe_failbuf(struct aoedev *d, struct buf *buf)
1694{
1695        if (buf == NULL)
1696                return;
1697        buf->iter.bi_size = 0;
1698        clear_bit(BIO_UPTODATE, &buf->bio->bi_flags);
1699        if (buf->nframesout == 0)
1700                aoe_end_buf(d, buf);
1701}
1702
1703void
1704aoe_flush_iocq(void)
1705{
1706        int i;
1707
1708        for (i = 0; i < ncpus; i++) {
1709                if (kts[i].active)
1710                        aoe_flush_iocq_by_index(i);
1711        }
1712}
1713
1714void
1715aoe_flush_iocq_by_index(int id)
1716{
1717        struct frame *f;
1718        struct aoedev *d;
1719        LIST_HEAD(flist);
1720        struct list_head *pos;
1721        struct sk_buff *skb;
1722        ulong flags;
1723
1724        spin_lock_irqsave(&iocq[id].lock, flags);
1725        list_splice_init(&iocq[id].head, &flist);
1726        spin_unlock_irqrestore(&iocq[id].lock, flags);
1727        while (!list_empty(&flist)) {
1728                pos = flist.next;
1729                list_del(pos);
1730                f = list_entry(pos, struct frame, head);
1731                d = f->t->d;
1732                skb = f->r_skb;
1733                spin_lock_irqsave(&d->lock, flags);
1734                if (f->buf) {
1735                        f->buf->nframesout--;
1736                        aoe_failbuf(d, f->buf);
1737                }
1738                aoe_freetframe(f);
1739                spin_unlock_irqrestore(&d->lock, flags);
1740                dev_kfree_skb(skb);
1741                aoedev_put(d);
1742        }
1743}
1744
1745int __init
1746aoecmd_init(void)
1747{
1748        void *p;
1749        int i;
1750        int ret;
1751
1752        /* get_zeroed_page returns page with ref count 1 */
1753        p = (void *) get_zeroed_page(GFP_KERNEL | __GFP_REPEAT);
1754        if (!p)
1755                return -ENOMEM;
1756        empty_page = virt_to_page(p);
1757
1758        ncpus = num_online_cpus();
1759
1760        iocq = kcalloc(ncpus, sizeof(struct iocq_ktio), GFP_KERNEL);
1761        if (!iocq)
1762                return -ENOMEM;
1763
1764        kts = kcalloc(ncpus, sizeof(struct ktstate), GFP_KERNEL);
1765        if (!kts) {
1766                ret = -ENOMEM;
1767                goto kts_fail;
1768        }
1769
1770        ktiowq = kcalloc(ncpus, sizeof(wait_queue_head_t), GFP_KERNEL);
1771        if (!ktiowq) {
1772                ret = -ENOMEM;
1773                goto ktiowq_fail;
1774        }
1775
1776        mutex_init(&ktio_spawn_lock);
1777
1778        for (i = 0; i < ncpus; i++) {
1779                INIT_LIST_HEAD(&iocq[i].head);
1780                spin_lock_init(&iocq[i].lock);
1781                init_waitqueue_head(&ktiowq[i]);
1782                snprintf(kts[i].name, sizeof(kts[i].name), "aoe_ktio%d", i);
1783                kts[i].fn = ktio;
1784                kts[i].waitq = &ktiowq[i];
1785                kts[i].lock = &iocq[i].lock;
1786                kts[i].id = i;
1787                kts[i].active = 0;
1788        }
1789        kts[0].active = 1;
1790        if (aoe_ktstart(&kts[0])) {
1791                ret = -ENOMEM;
1792                goto ktstart_fail;
1793        }
1794        return 0;
1795
1796ktstart_fail:
1797        kfree(ktiowq);
1798ktiowq_fail:
1799        kfree(kts);
1800kts_fail:
1801        kfree(iocq);
1802
1803        return ret;
1804}
1805
1806void
1807aoecmd_exit(void)
1808{
1809        int i;
1810
1811        for (i = 0; i < ncpus; i++)
1812                if (kts[i].active)
1813                        aoe_ktstop(&kts[i]);
1814
1815        aoe_flush_iocq();
1816
 1817        /* Free up the iocq and thread-specific configuration
 1818         * allocated during startup.
 1819         */
1820        kfree(iocq);
1821        kfree(kts);
1822        kfree(ktiowq);
1823
1824        free_page((unsigned long) page_address(empty_page));
1825        empty_page = NULL;
1826}
1827