linux/drivers/net/xen-netfront.c
   1/*
   2 * Virtual network driver for conversing with remote driver backends.
   3 *
   4 * Copyright (c) 2002-2005, K A Fraser
   5 * Copyright (c) 2005, XenSource Ltd
   6 *
   7 * This program is free software; you can redistribute it and/or
   8 * modify it under the terms of the GNU General Public License version 2
   9 * as published by the Free Software Foundation; or, when distributed
  10 * separately from the Linux kernel or incorporated into other
  11 * software packages, subject to the following license:
  12 *
  13 * Permission is hereby granted, free of charge, to any person obtaining a copy
  14 * of this source file (the "Software"), to deal in the Software without
  15 * restriction, including without limitation the rights to use, copy, modify,
  16 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
  17 * and to permit persons to whom the Software is furnished to do so, subject to
  18 * the following conditions:
  19 *
  20 * The above copyright notice and this permission notice shall be included in
  21 * all copies or substantial portions of the Software.
  22 *
  23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  24 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  25 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  26 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  27 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  28 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  29 * IN THE SOFTWARE.
  30 */
  31
  32#include <linux/module.h>
  33#include <linux/kernel.h>
  34#include <linux/netdevice.h>
  35#include <linux/etherdevice.h>
  36#include <linux/skbuff.h>
  37#include <linux/ethtool.h>
  38#include <linux/if_ether.h>
  39#include <linux/tcp.h>
  40#include <linux/udp.h>
  41#include <linux/moduleparam.h>
  42#include <linux/mm.h>
  43#include <net/ip.h>
  44
  45#include <xen/xenbus.h>
  46#include <xen/events.h>
  47#include <xen/page.h>
  48#include <xen/grant_table.h>
  49
  50#include <xen/interface/io/netif.h>
  51#include <xen/interface/memory.h>
  52#include <xen/interface/grant_table.h>
  53
  54static const struct ethtool_ops xennet_ethtool_ops;
  55
  56struct netfront_cb {
  57        struct page *page;
  58        unsigned offset;
  59};
  60
  61#define NETFRONT_SKB_CB(skb)    ((struct netfront_cb *)((skb)->cb))
  62
  63#define RX_COPY_THRESHOLD 256
  64
  65#define GRANT_INVALID_REF       0
  66
  67#define NET_TX_RING_SIZE __RING_SIZE((struct xen_netif_tx_sring *)0, PAGE_SIZE)
  68#define NET_RX_RING_SIZE __RING_SIZE((struct xen_netif_rx_sring *)0, PAGE_SIZE)
  69#define TX_MAX_TARGET min_t(int, NET_RX_RING_SIZE, 256)
  70
  71struct netfront_info {
  72        struct list_head list;
  73        struct net_device *netdev;
  74
  75        struct napi_struct napi;
  76
  77        unsigned int evtchn;
  78        struct xenbus_device *xbdev;
  79
  80        spinlock_t   tx_lock;
  81        struct xen_netif_tx_front_ring tx;
  82        int tx_ring_ref;
  83
  84        /*
  85         * {tx,rx}_skbs store outstanding skbuffs. Free tx_skb entries
  86         * are linked from tx_skb_freelist through skb_entry.link.
  87         *
   88         *  NB. Freelist index entries are always less than PAGE_OFFSET,
   89         *  whereas pointers to skbs will always be equal to or greater
   90         *  than PAGE_OFFSET: we use this property to distinguish
   91         *  them.
  92         */
  93        union skb_entry {
  94                struct sk_buff *skb;
  95                unsigned long link;
  96        } tx_skbs[NET_TX_RING_SIZE];
  97        grant_ref_t gref_tx_head;
  98        grant_ref_t grant_tx_ref[NET_TX_RING_SIZE];
  99        unsigned tx_skb_freelist;
 100
 101        spinlock_t   rx_lock ____cacheline_aligned_in_smp;
 102        struct xen_netif_rx_front_ring rx;
 103        int rx_ring_ref;
 104
 105        /* Receive-ring batched refills. */
 106#define RX_MIN_TARGET 8
 107#define RX_DFL_MIN_TARGET 64
 108#define RX_MAX_TARGET min_t(int, NET_RX_RING_SIZE, 256)
 109        unsigned rx_min_target, rx_max_target, rx_target;
 110        struct sk_buff_head rx_batch;
 111
 112        struct timer_list rx_refill_timer;
 113
 114        struct sk_buff *rx_skbs[NET_RX_RING_SIZE];
 115        grant_ref_t gref_rx_head;
 116        grant_ref_t grant_rx_ref[NET_RX_RING_SIZE];
 117
 118        unsigned long rx_pfn_array[NET_RX_RING_SIZE];
 119        struct multicall_entry rx_mcl[NET_RX_RING_SIZE+1];
 120        struct mmu_update rx_mmu[NET_RX_RING_SIZE];
 121};
 122
 123struct netfront_rx_info {
 124        struct xen_netif_rx_response rx;
 125        struct xen_netif_extra_info extras[XEN_NETIF_EXTRA_TYPE_MAX - 1];
 126};
 127
 128static void skb_entry_set_link(union skb_entry *list, unsigned short id)
 129{
 130        list->link = id;
 131}
 132
 133static int skb_entry_is_link(const union skb_entry *list)
 134{
 135        BUILD_BUG_ON(sizeof(list->skb) != sizeof(list->link));
 136        return ((unsigned long)list->skb < PAGE_OFFSET);
 137}
 138
 139/*
  140 * Access routines for acquiring and freeing slots in tx_skbs[].
 141 */
 142
 143static void add_id_to_freelist(unsigned *head, union skb_entry *list,
 144                               unsigned short id)
 145{
 146        skb_entry_set_link(&list[id], *head);
 147        *head = id;
 148}
 149
 150static unsigned short get_id_from_freelist(unsigned *head,
 151                                           union skb_entry *list)
 152{
 153        unsigned int id = *head;
 154        *head = list[id].link;
 155        return id;
 156}
 157
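     /* Map a ring index onto a slot in rx_skbs[]/grant_rx_ref[]; the ring
      * size is a power of two, so a simple mask suffices. */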
 158static int xennet_rxidx(RING_IDX idx)
 159{
 160        return idx & (NET_RX_RING_SIZE - 1);
 161}
 162
 163static struct sk_buff *xennet_get_rx_skb(struct netfront_info *np,
 164                                         RING_IDX ri)
 165{
 166        int i = xennet_rxidx(ri);
 167        struct sk_buff *skb = np->rx_skbs[i];
 168        np->rx_skbs[i] = NULL;
 169        return skb;
 170}
 171
 172static grant_ref_t xennet_get_rx_ref(struct netfront_info *np,
 173                                            RING_IDX ri)
 174{
 175        int i = xennet_rxidx(ri);
 176        grant_ref_t ref = np->grant_rx_ref[i];
 177        np->grant_rx_ref[i] = GRANT_INVALID_REF;
 178        return ref;
 179}
 180
 181#ifdef CONFIG_SYSFS
 182static int xennet_sysfs_addif(struct net_device *netdev);
 183static void xennet_sysfs_delif(struct net_device *netdev);
 184#else /* !CONFIG_SYSFS */
 185#define xennet_sysfs_addif(dev) (0)
 186#define xennet_sysfs_delif(dev) do { } while (0)
 187#endif
 188
 189static int xennet_can_sg(struct net_device *dev)
 190{
 191        return dev->features & NETIF_F_SG;
 192}
 193
 194
 195static void rx_refill_timeout(unsigned long data)
 196{
 197        struct net_device *dev = (struct net_device *)data;
 198        struct netfront_info *np = netdev_priv(dev);
 199        napi_schedule(&np->napi);
 200}
 201
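     /* Check that enough TX ring slots remain to accept another worst-case
      * skb (header and extra-info slots plus MAX_SKB_FRAGS fragments). */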
 202static int netfront_tx_slot_available(struct netfront_info *np)
 203{
 204        return ((np->tx.req_prod_pvt - np->tx.rsp_cons) <
 205                (TX_MAX_TARGET - MAX_SKB_FRAGS - 2));
 206}
 207
 208static void xennet_maybe_wake_tx(struct net_device *dev)
 209{
 210        struct netfront_info *np = netdev_priv(dev);
 211
 212        if (unlikely(netif_queue_stopped(dev)) &&
 213            netfront_tx_slot_available(np) &&
 214            likely(netif_running(dev)))
 215                netif_wake_queue(dev);
 216}
 217
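     /* Refill the receive ring: allocate skbs that each carry one page as
      * their first fragment, grant the backend access to those pages, post
      * the rx requests and notify the backend if necessary. */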
 218static void xennet_alloc_rx_buffers(struct net_device *dev)
 219{
 220        unsigned short id;
 221        struct netfront_info *np = netdev_priv(dev);
 222        struct sk_buff *skb;
 223        struct page *page;
 224        int i, batch_target, notify;
 225        RING_IDX req_prod = np->rx.req_prod_pvt;
 226        grant_ref_t ref;
 227        unsigned long pfn;
 228        void *vaddr;
 229        struct xen_netif_rx_request *req;
 230
 231        if (unlikely(!netif_carrier_ok(dev)))
 232                return;
 233
 234        /*
 235         * Allocate skbuffs greedily, even though we batch updates to the
 236         * receive ring. This creates a less bursty demand on the memory
 237         * allocator, so should reduce the chance of failed allocation requests
  238         * both for ourselves and for other kernel subsystems.
 239         */
 240        batch_target = np->rx_target - (req_prod - np->rx.rsp_cons);
 241        for (i = skb_queue_len(&np->rx_batch); i < batch_target; i++) {
 242                skb = __netdev_alloc_skb(dev, RX_COPY_THRESHOLD + NET_IP_ALIGN,
 243                                         GFP_ATOMIC | __GFP_NOWARN);
 244                if (unlikely(!skb))
 245                        goto no_skb;
 246
  247                /* Align the IP header to a 16-byte boundary */
 248                skb_reserve(skb, NET_IP_ALIGN);
 249
 250                page = alloc_page(GFP_ATOMIC | __GFP_NOWARN);
 251                if (!page) {
 252                        kfree_skb(skb);
 253no_skb:
 254                        /* Any skbuffs queued for refill? Force them out. */
 255                        if (i != 0)
 256                                goto refill;
 257                        /* Could not allocate any skbuffs. Try again later. */
 258                        mod_timer(&np->rx_refill_timer,
 259                                  jiffies + (HZ/10));
 260                        break;
 261                }
 262
 263                skb_shinfo(skb)->frags[0].page = page;
 264                skb_shinfo(skb)->nr_frags = 1;
 265                __skb_queue_tail(&np->rx_batch, skb);
 266        }
 267
 268        /* Is the batch large enough to be worthwhile? */
 269        if (i < (np->rx_target/2)) {
 270                if (req_prod > np->rx.sring->req_prod)
 271                        goto push;
 272                return;
 273        }
 274
 275        /* Adjust our fill target if we risked running out of buffers. */
 276        if (((req_prod - np->rx.sring->rsp_prod) < (np->rx_target / 4)) &&
 277            ((np->rx_target *= 2) > np->rx_max_target))
 278                np->rx_target = np->rx_max_target;
 279
 280 refill:
 281        for (i = 0; ; i++) {
 282                skb = __skb_dequeue(&np->rx_batch);
 283                if (skb == NULL)
 284                        break;
 285
 286                skb->dev = dev;
 287
 288                id = xennet_rxidx(req_prod + i);
 289
 290                BUG_ON(np->rx_skbs[id]);
 291                np->rx_skbs[id] = skb;
 292
 293                ref = gnttab_claim_grant_reference(&np->gref_rx_head);
 294                BUG_ON((signed short)ref < 0);
 295                np->grant_rx_ref[id] = ref;
 296
 297                pfn = page_to_pfn(skb_shinfo(skb)->frags[0].page);
 298                vaddr = page_address(skb_shinfo(skb)->frags[0].page);
 299
 300                req = RING_GET_REQUEST(&np->rx, req_prod + i);
 301                gnttab_grant_foreign_access_ref(ref,
 302                                                np->xbdev->otherend_id,
 303                                                pfn_to_mfn(pfn),
 304                                                0);
 305
 306                req->id = id;
 307                req->gref = ref;
 308        }
 309
  310        wmb();          /* barrier so backend sees requests */
 311
 312        /* Above is a suitable barrier to ensure backend will see requests. */
 313        np->rx.req_prod_pvt = req_prod + i;
 314 push:
 315        RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&np->rx, notify);
 316        if (notify)
 317                notify_remote_via_irq(np->netdev->irq);
 318}
 319
 320static int xennet_open(struct net_device *dev)
 321{
 322        struct netfront_info *np = netdev_priv(dev);
 323
 324        napi_enable(&np->napi);
 325
 326        spin_lock_bh(&np->rx_lock);
 327        if (netif_carrier_ok(dev)) {
 328                xennet_alloc_rx_buffers(dev);
 329                np->rx.sring->rsp_event = np->rx.rsp_cons + 1;
 330                if (RING_HAS_UNCONSUMED_RESPONSES(&np->rx))
 331                        napi_schedule(&np->napi);
 332        }
 333        spin_unlock_bh(&np->rx_lock);
 334
 335        netif_start_queue(dev);
 336
 337        return 0;
 338}
 339
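     /* Reclaim skbs whose transmission the backend has acknowledged: end the
      * foreign access on each grant, return it to the pool, recycle the
      * tx_skbs slot and free the skb. */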
 340static void xennet_tx_buf_gc(struct net_device *dev)
 341{
 342        RING_IDX cons, prod;
 343        unsigned short id;
 344        struct netfront_info *np = netdev_priv(dev);
 345        struct sk_buff *skb;
 346
 347        BUG_ON(!netif_carrier_ok(dev));
 348
 349        do {
 350                prod = np->tx.sring->rsp_prod;
 351                rmb(); /* Ensure we see responses up to 'rp'. */
 352
 353                for (cons = np->tx.rsp_cons; cons != prod; cons++) {
 354                        struct xen_netif_tx_response *txrsp;
 355
 356                        txrsp = RING_GET_RESPONSE(&np->tx, cons);
 357                        if (txrsp->status == NETIF_RSP_NULL)
 358                                continue;
 359
 360                        id  = txrsp->id;
 361                        skb = np->tx_skbs[id].skb;
 362                        if (unlikely(gnttab_query_foreign_access(
 363                                np->grant_tx_ref[id]) != 0)) {
 364                                printk(KERN_ALERT "xennet_tx_buf_gc: warning "
 365                                       "-- grant still in use by backend "
 366                                       "domain.\n");
 367                                BUG();
 368                        }
 369                        gnttab_end_foreign_access_ref(
 370                                np->grant_tx_ref[id], GNTMAP_readonly);
 371                        gnttab_release_grant_reference(
 372                                &np->gref_tx_head, np->grant_tx_ref[id]);
 373                        np->grant_tx_ref[id] = GRANT_INVALID_REF;
 374                        add_id_to_freelist(&np->tx_skb_freelist, np->tx_skbs, id);
 375                        dev_kfree_skb_irq(skb);
 376                }
 377
 378                np->tx.rsp_cons = prod;
 379
 380                /*
 381                 * Set a new event, then check for race with update of tx_cons.
 382                 * Note that it is essential to schedule a callback, no matter
 383                 * how few buffers are pending. Even if there is space in the
 384                 * transmit ring, higher layers may be blocked because too much
 385                 * data is outstanding: in such cases notification from Xen is
 386                 * likely to be the only kick that we'll get.
 387                 */
 388                np->tx.sring->rsp_event =
 389                        prod + ((np->tx.sring->req_prod - prod) >> 1) + 1;
 390                mb();           /* update shared area */
 391        } while ((cons == prod) && (prod != np->tx.sring->rsp_prod));
 392
 393        xennet_maybe_wake_tx(dev);
 394}
 395
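     /* Queue additional tx requests for the part of the linear header that
      * crosses a page boundary and for every page fragment of the skb. */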
 396static void xennet_make_frags(struct sk_buff *skb, struct net_device *dev,
 397                              struct xen_netif_tx_request *tx)
 398{
 399        struct netfront_info *np = netdev_priv(dev);
 400        char *data = skb->data;
 401        unsigned long mfn;
 402        RING_IDX prod = np->tx.req_prod_pvt;
 403        int frags = skb_shinfo(skb)->nr_frags;
 404        unsigned int offset = offset_in_page(data);
 405        unsigned int len = skb_headlen(skb);
 406        unsigned int id;
 407        grant_ref_t ref;
 408        int i;
 409
 410        /* While the header overlaps a page boundary (including being
  411           larger than a page), split it into page-sized chunks. */
 412        while (len > PAGE_SIZE - offset) {
 413                tx->size = PAGE_SIZE - offset;
 414                tx->flags |= NETTXF_more_data;
 415                len -= tx->size;
 416                data += tx->size;
 417                offset = 0;
 418
 419                id = get_id_from_freelist(&np->tx_skb_freelist, np->tx_skbs);
 420                np->tx_skbs[id].skb = skb_get(skb);
 421                tx = RING_GET_REQUEST(&np->tx, prod++);
 422                tx->id = id;
 423                ref = gnttab_claim_grant_reference(&np->gref_tx_head);
 424                BUG_ON((signed short)ref < 0);
 425
 426                mfn = virt_to_mfn(data);
 427                gnttab_grant_foreign_access_ref(ref, np->xbdev->otherend_id,
 428                                                mfn, GNTMAP_readonly);
 429
 430                tx->gref = np->grant_tx_ref[id] = ref;
 431                tx->offset = offset;
 432                tx->size = len;
 433                tx->flags = 0;
 434        }
 435
 436        /* Grant backend access to each skb fragment page. */
 437        for (i = 0; i < frags; i++) {
 438                skb_frag_t *frag = skb_shinfo(skb)->frags + i;
 439
 440                tx->flags |= NETTXF_more_data;
 441
 442                id = get_id_from_freelist(&np->tx_skb_freelist, np->tx_skbs);
 443                np->tx_skbs[id].skb = skb_get(skb);
 444                tx = RING_GET_REQUEST(&np->tx, prod++);
 445                tx->id = id;
 446                ref = gnttab_claim_grant_reference(&np->gref_tx_head);
 447                BUG_ON((signed short)ref < 0);
 448
 449                mfn = pfn_to_mfn(page_to_pfn(frag->page));
 450                gnttab_grant_foreign_access_ref(ref, np->xbdev->otherend_id,
 451                                                mfn, GNTMAP_readonly);
 452
 453                tx->gref = np->grant_tx_ref[id] = ref;
 454                tx->offset = frag->page_offset;
 455                tx->size = frag->size;
 456                tx->flags = 0;
 457        }
 458
 459        np->tx.req_prod_pvt = prod;
 460}
 461
 462static int xennet_start_xmit(struct sk_buff *skb, struct net_device *dev)
 463{
 464        unsigned short id;
 465        struct netfront_info *np = netdev_priv(dev);
 466        struct xen_netif_tx_request *tx;
 467        struct xen_netif_extra_info *extra;
 468        char *data = skb->data;
 469        RING_IDX i;
 470        grant_ref_t ref;
 471        unsigned long mfn;
 472        int notify;
 473        int frags = skb_shinfo(skb)->nr_frags;
 474        unsigned int offset = offset_in_page(data);
 475        unsigned int len = skb_headlen(skb);
 476
 477        frags += DIV_ROUND_UP(offset + len, PAGE_SIZE);
 478        if (unlikely(frags > MAX_SKB_FRAGS + 1)) {
 479                printk(KERN_ALERT "xennet: skb rides the rocket: %d frags\n",
 480                       frags);
 481                dump_stack();
 482                goto drop;
 483        }
 484
 485        spin_lock_irq(&np->tx_lock);
 486
 487        if (unlikely(!netif_carrier_ok(dev) ||
 488                     (frags > 1 && !xennet_can_sg(dev)) ||
 489                     netif_needs_gso(dev, skb))) {
 490                spin_unlock_irq(&np->tx_lock);
 491                goto drop;
 492        }
 493
 494        i = np->tx.req_prod_pvt;
 495
 496        id = get_id_from_freelist(&np->tx_skb_freelist, np->tx_skbs);
 497        np->tx_skbs[id].skb = skb;
 498
 499        tx = RING_GET_REQUEST(&np->tx, i);
 500
 501        tx->id   = id;
 502        ref = gnttab_claim_grant_reference(&np->gref_tx_head);
 503        BUG_ON((signed short)ref < 0);
 504        mfn = virt_to_mfn(data);
 505        gnttab_grant_foreign_access_ref(
 506                ref, np->xbdev->otherend_id, mfn, GNTMAP_readonly);
 507        tx->gref = np->grant_tx_ref[id] = ref;
 508        tx->offset = offset;
 509        tx->size = len;
 510        extra = NULL;
 511
 512        tx->flags = 0;
 513        if (skb->ip_summed == CHECKSUM_PARTIAL)
 514                /* local packet? */
 515                tx->flags |= NETTXF_csum_blank | NETTXF_data_validated;
 516        else if (skb->ip_summed == CHECKSUM_UNNECESSARY)
 517                /* remote but checksummed. */
 518                tx->flags |= NETTXF_data_validated;
 519
 520        if (skb_shinfo(skb)->gso_size) {
 521                struct xen_netif_extra_info *gso;
 522
 523                gso = (struct xen_netif_extra_info *)
 524                        RING_GET_REQUEST(&np->tx, ++i);
 525
 526                if (extra)
 527                        extra->flags |= XEN_NETIF_EXTRA_FLAG_MORE;
 528                else
 529                        tx->flags |= NETTXF_extra_info;
 530
 531                gso->u.gso.size = skb_shinfo(skb)->gso_size;
 532                gso->u.gso.type = XEN_NETIF_GSO_TYPE_TCPV4;
 533                gso->u.gso.pad = 0;
 534                gso->u.gso.features = 0;
 535
 536                gso->type = XEN_NETIF_EXTRA_TYPE_GSO;
 537                gso->flags = 0;
 538                extra = gso;
 539        }
 540
 541        np->tx.req_prod_pvt = i + 1;
 542
 543        xennet_make_frags(skb, dev, tx);
 544        tx->size = skb->len;
 545
 546        RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&np->tx, notify);
 547        if (notify)
 548                notify_remote_via_irq(np->netdev->irq);
 549
 550        dev->stats.tx_bytes += skb->len;
 551        dev->stats.tx_packets++;
 552
 553        /* Note: It is not safe to access skb after xennet_tx_buf_gc()! */
 554        xennet_tx_buf_gc(dev);
 555
 556        if (!netfront_tx_slot_available(np))
 557                netif_stop_queue(dev);
 558
 559        spin_unlock_irq(&np->tx_lock);
 560
 561        return NETDEV_TX_OK;
 562
 563 drop:
 564        dev->stats.tx_dropped++;
 565        dev_kfree_skb(skb);
 566        return NETDEV_TX_OK;
 567}
 568
 569static int xennet_close(struct net_device *dev)
 570{
 571        struct netfront_info *np = netdev_priv(dev);
 572        netif_stop_queue(np->netdev);
 573        napi_disable(&np->napi);
 574        return 0;
 575}
 576
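     /* Re-post an skb/grant pair whose rx response was consumed without
      * being passed up the stack, so the buffer is offered to the backend
      * again. */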
 577static void xennet_move_rx_slot(struct netfront_info *np, struct sk_buff *skb,
 578                                grant_ref_t ref)
 579{
 580        int new = xennet_rxidx(np->rx.req_prod_pvt);
 581
 582        BUG_ON(np->rx_skbs[new]);
 583        np->rx_skbs[new] = skb;
 584        np->grant_rx_ref[new] = ref;
 585        RING_GET_REQUEST(&np->rx, np->rx.req_prod_pvt)->id = new;
 586        RING_GET_REQUEST(&np->rx, np->rx.req_prod_pvt)->gref = ref;
 587        np->rx.req_prod_pvt++;
 588}
 589
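     /* Consume the chain of extra-info responses (e.g. GSO metadata) that
      * follow an rx response flagged NETRXF_extra_info, recycling their
      * ring slots. */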
 590static int xennet_get_extras(struct netfront_info *np,
 591                             struct xen_netif_extra_info *extras,
 592                             RING_IDX rp)
 593
 594{
 595        struct xen_netif_extra_info *extra;
 596        struct device *dev = &np->netdev->dev;
 597        RING_IDX cons = np->rx.rsp_cons;
 598        int err = 0;
 599
 600        do {
 601                struct sk_buff *skb;
 602                grant_ref_t ref;
 603
 604                if (unlikely(cons + 1 == rp)) {
 605                        if (net_ratelimit())
 606                                dev_warn(dev, "Missing extra info\n");
 607                        err = -EBADR;
 608                        break;
 609                }
 610
 611                extra = (struct xen_netif_extra_info *)
 612                        RING_GET_RESPONSE(&np->rx, ++cons);
 613
 614                if (unlikely(!extra->type ||
 615                             extra->type >= XEN_NETIF_EXTRA_TYPE_MAX)) {
 616                        if (net_ratelimit())
 617                                dev_warn(dev, "Invalid extra type: %d\n",
 618                                        extra->type);
 619                        err = -EINVAL;
 620                } else {
 621                        memcpy(&extras[extra->type - 1], extra,
 622                               sizeof(*extra));
 623                }
 624
 625                skb = xennet_get_rx_skb(np, cons);
 626                ref = xennet_get_rx_ref(np, cons);
 627                xennet_move_rx_slot(np, skb, ref);
 628        } while (extra->flags & XEN_NETIF_EXTRA_FLAG_MORE);
 629
 630        np->rx.rsp_cons = cons;
 631        return err;
 632}
 633
 634static int xennet_get_responses(struct netfront_info *np,
 635                                struct netfront_rx_info *rinfo, RING_IDX rp,
 636                                struct sk_buff_head *list)
 637{
 638        struct xen_netif_rx_response *rx = &rinfo->rx;
 639        struct xen_netif_extra_info *extras = rinfo->extras;
 640        struct device *dev = &np->netdev->dev;
 641        RING_IDX cons = np->rx.rsp_cons;
 642        struct sk_buff *skb = xennet_get_rx_skb(np, cons);
 643        grant_ref_t ref = xennet_get_rx_ref(np, cons);
 644        int max = MAX_SKB_FRAGS + (rx->status <= RX_COPY_THRESHOLD);
 645        int frags = 1;
 646        int err = 0;
 647        unsigned long ret;
 648
 649        if (rx->flags & NETRXF_extra_info) {
 650                err = xennet_get_extras(np, extras, rp);
 651                cons = np->rx.rsp_cons;
 652        }
 653
 654        for (;;) {
 655                if (unlikely(rx->status < 0 ||
 656                             rx->offset + rx->status > PAGE_SIZE)) {
 657                        if (net_ratelimit())
 658                                dev_warn(dev, "rx->offset: %x, size: %u\n",
 659                                         rx->offset, rx->status);
 660                        xennet_move_rx_slot(np, skb, ref);
 661                        err = -EINVAL;
 662                        goto next;
 663                }
 664
 665                /*
 666                 * This definitely indicates a bug, either in this driver or in
 667                 * the backend driver. In future this should flag the bad
  668                 * situation to the system controller to reboot the backend.
 669                 */
 670                if (ref == GRANT_INVALID_REF) {
 671                        if (net_ratelimit())
 672                                dev_warn(dev, "Bad rx response id %d.\n",
 673                                         rx->id);
 674                        err = -EINVAL;
 675                        goto next;
 676                }
 677
 678                ret = gnttab_end_foreign_access_ref(ref, 0);
 679                BUG_ON(!ret);
 680
 681                gnttab_release_grant_reference(&np->gref_rx_head, ref);
 682
 683                __skb_queue_tail(list, skb);
 684
 685next:
 686                if (!(rx->flags & NETRXF_more_data))
 687                        break;
 688
 689                if (cons + frags == rp) {
 690                        if (net_ratelimit())
 691                                dev_warn(dev, "Need more frags\n");
 692                        err = -ENOENT;
 693                        break;
 694                }
 695
 696                rx = RING_GET_RESPONSE(&np->rx, cons + frags);
 697                skb = xennet_get_rx_skb(np, cons + frags);
 698                ref = xennet_get_rx_ref(np, cons + frags);
 699                frags++;
 700        }
 701
 702        if (unlikely(frags > max)) {
 703                if (net_ratelimit())
 704                        dev_warn(dev, "Too many frags\n");
 705                err = -E2BIG;
 706        }
 707
 708        if (unlikely(err))
 709                np->rx.rsp_cons = cons + frags;
 710
 711        return err;
 712}
 713
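     /* Apply the backend-supplied GSO metadata to the skb; only TCPv4
      * segmentation offload is accepted. */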
 714static int xennet_set_skb_gso(struct sk_buff *skb,
 715                              struct xen_netif_extra_info *gso)
 716{
 717        if (!gso->u.gso.size) {
 718                if (net_ratelimit())
 719                        printk(KERN_WARNING "GSO size must not be zero.\n");
 720                return -EINVAL;
 721        }
 722
  723        /* Currently only TCPv4 segmentation offload is supported. */
 724        if (gso->u.gso.type != XEN_NETIF_GSO_TYPE_TCPV4) {
 725                if (net_ratelimit())
 726                        printk(KERN_WARNING "Bad GSO type %d.\n", gso->u.gso.type);
 727                return -EINVAL;
 728        }
 729
 730        skb_shinfo(skb)->gso_size = gso->u.gso.size;
 731        skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
 732
 733        /* Header must be checked, and gso_segs computed. */
 734        skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
 735        skb_shinfo(skb)->gso_segs = 0;
 736
 737        return 0;
 738}
 739
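     /* Attach the pages of the follow-on responses as fragments of the head
      * skb, consuming one rx response per fragment. */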
 740static RING_IDX xennet_fill_frags(struct netfront_info *np,
 741                                  struct sk_buff *skb,
 742                                  struct sk_buff_head *list)
 743{
 744        struct skb_shared_info *shinfo = skb_shinfo(skb);
 745        int nr_frags = shinfo->nr_frags;
 746        RING_IDX cons = np->rx.rsp_cons;
 747        skb_frag_t *frag = shinfo->frags + nr_frags;
 748        struct sk_buff *nskb;
 749
 750        while ((nskb = __skb_dequeue(list))) {
 751                struct xen_netif_rx_response *rx =
 752                        RING_GET_RESPONSE(&np->rx, ++cons);
 753
 754                frag->page = skb_shinfo(nskb)->frags[0].page;
 755                frag->page_offset = rx->offset;
 756                frag->size = rx->status;
 757
 758                skb->data_len += rx->status;
 759
 760                skb_shinfo(nskb)->nr_frags = 0;
 761                kfree_skb(nskb);
 762
 763                frag++;
 764                nr_frags++;
 765        }
 766
 767        shinfo->nr_frags = nr_frags;
 768        return cons;
 769}
 770
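     /* For CHECKSUM_PARTIAL packets, locate the TCP/UDP header and set
      * csum_start/csum_offset so the stack can finish the checksum. */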
 771static int skb_checksum_setup(struct sk_buff *skb)
 772{
 773        struct iphdr *iph;
 774        unsigned char *th;
 775        int err = -EPROTO;
 776
 777        if (skb->protocol != htons(ETH_P_IP))
 778                goto out;
 779
 780        iph = (void *)skb->data;
 781        th = skb->data + 4 * iph->ihl;
 782        if (th >= skb_tail_pointer(skb))
 783                goto out;
 784
 785        skb->csum_start = th - skb->head;
 786        switch (iph->protocol) {
 787        case IPPROTO_TCP:
 788                skb->csum_offset = offsetof(struct tcphdr, check);
 789                break;
 790        case IPPROTO_UDP:
 791                skb->csum_offset = offsetof(struct udphdr, check);
 792                break;
 793        default:
 794                if (net_ratelimit())
  795                        printk(KERN_ERR "Attempting to checksum a non-"
  796                               "TCP/UDP packet, dropping a protocol"
  797                               " %d packet\n", iph->protocol);
 798                goto out;
 799        }
 800
 801        if ((th + skb->csum_offset + 2) > skb_tail_pointer(skb))
 802                goto out;
 803
 804        err = 0;
 805
 806out:
 807        return err;
 808}
 809
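     /* Copy each packet's header data into its linear area, run checksum
      * setup for CHECKSUM_PARTIAL packets and hand the packet to the stack.
      * Returns the number of packets dropped. */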
 810static int handle_incoming_queue(struct net_device *dev,
 811                                 struct sk_buff_head *rxq)
 812{
 813        int packets_dropped = 0;
 814        struct sk_buff *skb;
 815
 816        while ((skb = __skb_dequeue(rxq)) != NULL) {
 817                struct page *page = NETFRONT_SKB_CB(skb)->page;
 818                void *vaddr = page_address(page);
 819                unsigned offset = NETFRONT_SKB_CB(skb)->offset;
 820
 821                memcpy(skb->data, vaddr + offset,
 822                       skb_headlen(skb));
 823
 824                if (page != skb_shinfo(skb)->frags[0].page)
 825                        __free_page(page);
 826
 827                /* Ethernet work: Delayed to here as it peeks the header. */
 828                skb->protocol = eth_type_trans(skb, dev);
 829
 830                if (skb->ip_summed == CHECKSUM_PARTIAL) {
 831                        if (skb_checksum_setup(skb)) {
 832                                kfree_skb(skb);
 833                                packets_dropped++;
 834                                dev->stats.rx_errors++;
 835                                continue;
 836                        }
 837                }
 838
 839                dev->stats.rx_packets++;
 840                dev->stats.rx_bytes += skb->len;
 841
 842                /* Pass it up. */
 843                netif_receive_skb(skb);
 844        }
 845
 846        return packets_dropped;
 847}
 848
 849static int xennet_poll(struct napi_struct *napi, int budget)
 850{
 851        struct netfront_info *np = container_of(napi, struct netfront_info, napi);
 852        struct net_device *dev = np->netdev;
 853        struct sk_buff *skb;
 854        struct netfront_rx_info rinfo;
 855        struct xen_netif_rx_response *rx = &rinfo.rx;
 856        struct xen_netif_extra_info *extras = rinfo.extras;
 857        RING_IDX i, rp;
 858        int work_done;
 859        struct sk_buff_head rxq;
 860        struct sk_buff_head errq;
 861        struct sk_buff_head tmpq;
 862        unsigned long flags;
 863        unsigned int len;
 864        int err;
 865
 866        spin_lock(&np->rx_lock);
 867
 868        skb_queue_head_init(&rxq);
 869        skb_queue_head_init(&errq);
 870        skb_queue_head_init(&tmpq);
 871
 872        rp = np->rx.sring->rsp_prod;
 873        rmb(); /* Ensure we see queued responses up to 'rp'. */
 874
 875        i = np->rx.rsp_cons;
 876        work_done = 0;
 877        while ((i != rp) && (work_done < budget)) {
 878                memcpy(rx, RING_GET_RESPONSE(&np->rx, i), sizeof(*rx));
 879                memset(extras, 0, sizeof(rinfo.extras));
 880
 881                err = xennet_get_responses(np, &rinfo, rp, &tmpq);
 882
 883                if (unlikely(err)) {
 884err:
 885                        while ((skb = __skb_dequeue(&tmpq)))
 886                                __skb_queue_tail(&errq, skb);
 887                        dev->stats.rx_errors++;
 888                        i = np->rx.rsp_cons;
 889                        continue;
 890                }
 891
 892                skb = __skb_dequeue(&tmpq);
 893
 894                if (extras[XEN_NETIF_EXTRA_TYPE_GSO - 1].type) {
 895                        struct xen_netif_extra_info *gso;
 896                        gso = &extras[XEN_NETIF_EXTRA_TYPE_GSO - 1];
 897
 898                        if (unlikely(xennet_set_skb_gso(skb, gso))) {
 899                                __skb_queue_head(&tmpq, skb);
 900                                np->rx.rsp_cons += skb_queue_len(&tmpq);
 901                                goto err;
 902                        }
 903                }
 904
 905                NETFRONT_SKB_CB(skb)->page = skb_shinfo(skb)->frags[0].page;
 906                NETFRONT_SKB_CB(skb)->offset = rx->offset;
 907
 908                len = rx->status;
 909                if (len > RX_COPY_THRESHOLD)
 910                        len = RX_COPY_THRESHOLD;
 911                skb_put(skb, len);
 912
 913                if (rx->status > len) {
 914                        skb_shinfo(skb)->frags[0].page_offset =
 915                                rx->offset + len;
 916                        skb_shinfo(skb)->frags[0].size = rx->status - len;
 917                        skb->data_len = rx->status - len;
 918                } else {
 919                        skb_shinfo(skb)->frags[0].page = NULL;
 920                        skb_shinfo(skb)->nr_frags = 0;
 921                }
 922
 923                i = xennet_fill_frags(np, skb, &tmpq);
 924
 925                /*
 926                 * Truesize approximates the size of true data plus
 927                 * any supervisor overheads. Adding hypervisor
 928                 * overheads has been shown to significantly reduce
 929                 * achievable bandwidth with the default receive
 930                 * buffer size. It is therefore not wise to account
 931                 * for it here.
 932                 *
 933                 * After alloc_skb(RX_COPY_THRESHOLD), truesize is set
 934                 * to RX_COPY_THRESHOLD + the supervisor
 935                 * overheads. Here, we add the size of the data pulled
 936                 * in xennet_fill_frags().
 937                 *
 938                 * We also adjust for any unused space in the main
 939                 * data area by subtracting (RX_COPY_THRESHOLD -
 940                 * len). This is especially important with drivers
 941                 * which split incoming packets into header and data,
 942                 * using only 66 bytes of the main data area (see the
 943                 * e1000 driver for example.)  On such systems,
  944                 * without this last adjustment, our achievable
  945                 * receive throughput using the standard receive
 946                 * buffer size was cut by 25%(!!!).
 947                 */
 948                skb->truesize += skb->data_len - (RX_COPY_THRESHOLD - len);
 949                skb->len += skb->data_len;
 950
 951                if (rx->flags & NETRXF_csum_blank)
 952                        skb->ip_summed = CHECKSUM_PARTIAL;
 953                else if (rx->flags & NETRXF_data_validated)
 954                        skb->ip_summed = CHECKSUM_UNNECESSARY;
 955
 956                __skb_queue_tail(&rxq, skb);
 957
 958                np->rx.rsp_cons = ++i;
 959                work_done++;
 960        }
 961
 962        __skb_queue_purge(&errq);
 963
 964        work_done -= handle_incoming_queue(dev, &rxq);
 965
 966        /* If we get a callback with very few responses, reduce fill target. */
 967        /* NB. Note exponential increase, linear decrease. */
 968        if (((np->rx.req_prod_pvt - np->rx.sring->rsp_prod) >
 969             ((3*np->rx_target) / 4)) &&
 970            (--np->rx_target < np->rx_min_target))
 971                np->rx_target = np->rx_min_target;
 972
 973        xennet_alloc_rx_buffers(dev);
 974
 975        if (work_done < budget) {
 976                int more_to_do = 0;
 977
 978                local_irq_save(flags);
 979
 980                RING_FINAL_CHECK_FOR_RESPONSES(&np->rx, more_to_do);
 981                if (!more_to_do)
 982                        __napi_complete(napi);
 983
 984                local_irq_restore(flags);
 985        }
 986
 987        spin_unlock(&np->rx_lock);
 988
 989        return work_done;
 990}
 991
 992static int xennet_change_mtu(struct net_device *dev, int mtu)
 993{
 994        int max = xennet_can_sg(dev) ? 65535 - ETH_HLEN : ETH_DATA_LEN;
 995
 996        if (mtu > max)
 997                return -EINVAL;
 998        dev->mtu = mtu;
 999        return 0;
1000}
1001
1002static void xennet_release_tx_bufs(struct netfront_info *np)
1003{
1004        struct sk_buff *skb;
1005        int i;
1006
1007        for (i = 0; i < NET_TX_RING_SIZE; i++) {
1008                /* Skip over entries which are actually freelist references */
1009                if (skb_entry_is_link(&np->tx_skbs[i]))
1010                        continue;
1011
1012                skb = np->tx_skbs[i].skb;
1013                gnttab_end_foreign_access_ref(np->grant_tx_ref[i],
1014                                              GNTMAP_readonly);
1015                gnttab_release_grant_reference(&np->gref_tx_head,
1016                                               np->grant_tx_ref[i]);
1017                np->grant_tx_ref[i] = GRANT_INVALID_REF;
1018                add_id_to_freelist(&np->tx_skb_freelist, np->tx_skbs, i);
1019                dev_kfree_skb_irq(skb);
1020        }
1021}
1022
1023static void xennet_release_rx_bufs(struct netfront_info *np)
1024{
1025        struct mmu_update      *mmu = np->rx_mmu;
1026        struct multicall_entry *mcl = np->rx_mcl;
1027        struct sk_buff_head free_list;
1028        struct sk_buff *skb;
1029        unsigned long mfn;
1030        int xfer = 0, noxfer = 0, unused = 0;
1031        int id, ref;
1032
1033        dev_warn(&np->netdev->dev, "%s: fix me for copying receiver.\n",
1034                         __func__);
1035        return;
1036
1037        skb_queue_head_init(&free_list);
1038
1039        spin_lock_bh(&np->rx_lock);
1040
1041        for (id = 0; id < NET_RX_RING_SIZE; id++) {
1042                ref = np->grant_rx_ref[id];
1043                if (ref == GRANT_INVALID_REF) {
1044                        unused++;
1045                        continue;
1046                }
1047
1048                skb = np->rx_skbs[id];
1049                mfn = gnttab_end_foreign_transfer_ref(ref);
1050                gnttab_release_grant_reference(&np->gref_rx_head, ref);
1051                np->grant_rx_ref[id] = GRANT_INVALID_REF;
1052
1053                if (0 == mfn) {
1054                        skb_shinfo(skb)->nr_frags = 0;
1055                        dev_kfree_skb(skb);
1056                        noxfer++;
1057                        continue;
1058                }
1059
1060                if (!xen_feature(XENFEAT_auto_translated_physmap)) {
1061                        /* Remap the page. */
1062                        struct page *page = skb_shinfo(skb)->frags[0].page;
1063                        unsigned long pfn = page_to_pfn(page);
1064                        void *vaddr = page_address(page);
1065
1066                        MULTI_update_va_mapping(mcl, (unsigned long)vaddr,
1067                                                mfn_pte(mfn, PAGE_KERNEL),
1068                                                0);
1069                        mcl++;
1070                        mmu->ptr = ((u64)mfn << PAGE_SHIFT)
1071                                | MMU_MACHPHYS_UPDATE;
1072                        mmu->val = pfn;
1073                        mmu++;
1074
1075                        set_phys_to_machine(pfn, mfn);
1076                }
1077                __skb_queue_tail(&free_list, skb);
1078                xfer++;
1079        }
1080
1081        dev_info(&np->netdev->dev, "%s: %d xfer, %d noxfer, %d unused\n",
1082                 __func__, xfer, noxfer, unused);
1083
1084        if (xfer) {
1085                if (!xen_feature(XENFEAT_auto_translated_physmap)) {
1086                        /* Do all the remapping work and M2P updates. */
1087                        MULTI_mmu_update(mcl, np->rx_mmu, mmu - np->rx_mmu,
1088                                         NULL, DOMID_SELF);
1089                        mcl++;
1090                        HYPERVISOR_multicall(np->rx_mcl, mcl - np->rx_mcl);
1091                }
1092        }
1093
1094        __skb_queue_purge(&free_list);
1095
1096        spin_unlock_bh(&np->rx_lock);
1097}
1098
1099static void xennet_uninit(struct net_device *dev)
1100{
1101        struct netfront_info *np = netdev_priv(dev);
1102        xennet_release_tx_bufs(np);
1103        xennet_release_rx_bufs(np);
1104        gnttab_free_grant_references(np->gref_tx_head);
1105        gnttab_free_grant_references(np->gref_rx_head);
1106}
1107
1108static const struct net_device_ops xennet_netdev_ops = {
1109        .ndo_open            = xennet_open,
1110        .ndo_uninit          = xennet_uninit,
1111        .ndo_stop            = xennet_close,
1112        .ndo_start_xmit      = xennet_start_xmit,
1113        .ndo_change_mtu      = xennet_change_mtu,
1114        .ndo_set_mac_address = eth_mac_addr,
1115        .ndo_validate_addr   = eth_validate_addr,
1116};
1117
1118static struct net_device * __devinit xennet_create_dev(struct xenbus_device *dev)
1119{
1120        int i, err;
1121        struct net_device *netdev;
1122        struct netfront_info *np;
1123
1124        netdev = alloc_etherdev(sizeof(struct netfront_info));
1125        if (!netdev) {
1126                printk(KERN_WARNING "%s> alloc_etherdev failed.\n",
1127                       __func__);
1128                return ERR_PTR(-ENOMEM);
1129        }
1130
1131        np                   = netdev_priv(netdev);
1132        np->xbdev            = dev;
1133
1134        spin_lock_init(&np->tx_lock);
1135        spin_lock_init(&np->rx_lock);
1136
1137        skb_queue_head_init(&np->rx_batch);
1138        np->rx_target     = RX_DFL_MIN_TARGET;
1139        np->rx_min_target = RX_DFL_MIN_TARGET;
1140        np->rx_max_target = RX_MAX_TARGET;
1141
1142        init_timer(&np->rx_refill_timer);
1143        np->rx_refill_timer.data = (unsigned long)netdev;
1144        np->rx_refill_timer.function = rx_refill_timeout;
1145
1146        /* Initialise tx_skbs as a free chain containing every entry. */
1147        np->tx_skb_freelist = 0;
1148        for (i = 0; i < NET_TX_RING_SIZE; i++) {
1149                skb_entry_set_link(&np->tx_skbs[i], i+1);
1150                np->grant_tx_ref[i] = GRANT_INVALID_REF;
1151        }
1152
1153        /* Clear out rx_skbs */
1154        for (i = 0; i < NET_RX_RING_SIZE; i++) {
1155                np->rx_skbs[i] = NULL;
1156                np->grant_rx_ref[i] = GRANT_INVALID_REF;
1157        }
1158
1159        /* A grant for every tx ring slot */
1160        if (gnttab_alloc_grant_references(TX_MAX_TARGET,
1161                                          &np->gref_tx_head) < 0) {
1162                printk(KERN_ALERT "#### netfront can't alloc tx grant refs\n");
1163                err = -ENOMEM;
1164                goto exit;
1165        }
1166        /* A grant for every rx ring slot */
1167        if (gnttab_alloc_grant_references(RX_MAX_TARGET,
1168                                          &np->gref_rx_head) < 0) {
1169                printk(KERN_ALERT "#### netfront can't alloc rx grant refs\n");
1170                err = -ENOMEM;
1171                goto exit_free_tx;
1172        }
1173
1174        netdev->netdev_ops      = &xennet_netdev_ops;
1175
1176        netif_napi_add(netdev, &np->napi, xennet_poll, 64);
1177        netdev->features        = NETIF_F_IP_CSUM;
1178
1179        SET_ETHTOOL_OPS(netdev, &xennet_ethtool_ops);
1180        SET_NETDEV_DEV(netdev, &dev->dev);
1181
1182        np->netdev = netdev;
1183
1184        netif_carrier_off(netdev);
1185
1186        return netdev;
1187
1188 exit_free_tx:
1189        gnttab_free_grant_references(np->gref_tx_head);
1190 exit:
1191        free_netdev(netdev);
1192        return ERR_PTR(err);
1193}
1194
1195/**
1196 * Entry point to this code when a new device is created.  Allocate the basic
1197 * structures and the ring buffers for communication with the backend, and
1198 * inform the backend of the appropriate details for those.
1199 */
1200static int __devinit netfront_probe(struct xenbus_device *dev,
1201                                    const struct xenbus_device_id *id)
1202{
1203        int err;
1204        struct net_device *netdev;
1205        struct netfront_info *info;
1206
1207        netdev = xennet_create_dev(dev);
1208        if (IS_ERR(netdev)) {
1209                err = PTR_ERR(netdev);
1210                xenbus_dev_fatal(dev, err, "creating netdev");
1211                return err;
1212        }
1213
1214        info = netdev_priv(netdev);
1215        dev_set_drvdata(&dev->dev, info);
1216
1217        err = register_netdev(info->netdev);
1218        if (err) {
1219                printk(KERN_WARNING "%s: register_netdev err=%d\n",
1220                       __func__, err);
1221                goto fail;
1222        }
1223
1224        err = xennet_sysfs_addif(info->netdev);
1225        if (err) {
1226                unregister_netdev(info->netdev);
1227                printk(KERN_WARNING "%s: add sysfs failed err=%d\n",
1228                       __func__, err);
1229                goto fail;
1230        }
1231
1232        return 0;
1233
1234 fail:
1235        free_netdev(netdev);
1236        dev_set_drvdata(&dev->dev, NULL);
1237        return err;
1238}
1239
1240static void xennet_end_access(int ref, void *page)
1241{
1242        /* This frees the page as a side-effect */
1243        if (ref != GRANT_INVALID_REF)
1244                gnttab_end_foreign_access(ref, 0, (unsigned long)page);
1245}
1246
1247static void xennet_disconnect_backend(struct netfront_info *info)
1248{
1249        /* Stop old i/f to prevent errors whilst we rebuild the state. */
1250        spin_lock_bh(&info->rx_lock);
1251        spin_lock_irq(&info->tx_lock);
1252        netif_carrier_off(info->netdev);
1253        spin_unlock_irq(&info->tx_lock);
1254        spin_unlock_bh(&info->rx_lock);
1255
1256        if (info->netdev->irq)
1257                unbind_from_irqhandler(info->netdev->irq, info->netdev);
1258        info->evtchn = info->netdev->irq = 0;
1259
1260        /* End access and free the pages */
1261        xennet_end_access(info->tx_ring_ref, info->tx.sring);
1262        xennet_end_access(info->rx_ring_ref, info->rx.sring);
1263
1264        info->tx_ring_ref = GRANT_INVALID_REF;
1265        info->rx_ring_ref = GRANT_INVALID_REF;
1266        info->tx.sring = NULL;
1267        info->rx.sring = NULL;
1268}
1269
1270/**
1271 * We are reconnecting to the backend, due to a suspend/resume, or a backend
1272 * driver restart.  We tear down our netif structure and recreate it, but
1273 * leave the device-layer structures intact so that this is transparent to the
1274 * rest of the kernel.
1275 */
1276static int netfront_resume(struct xenbus_device *dev)
1277{
1278        struct netfront_info *info = dev_get_drvdata(&dev->dev);
1279
1280        dev_dbg(&dev->dev, "%s\n", dev->nodename);
1281
1282        xennet_disconnect_backend(info);
1283        return 0;
1284}
1285
1286static int xen_net_read_mac(struct xenbus_device *dev, u8 mac[])
1287{
1288        char *s, *e, *macstr;
1289        int i;
1290
1291        macstr = s = xenbus_read(XBT_NIL, dev->nodename, "mac", NULL);
1292        if (IS_ERR(macstr))
1293                return PTR_ERR(macstr);
1294
1295        for (i = 0; i < ETH_ALEN; i++) {
1296                mac[i] = simple_strtoul(s, &e, 16);
1297                if ((s == e) || (*e != ((i == ETH_ALEN-1) ? '\0' : ':'))) {
1298                        kfree(macstr);
1299                        return -ENOENT;
1300                }
1301                s = e+1;
1302        }
1303
1304        kfree(macstr);
1305        return 0;
1306}
1307
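     /* Event-channel interrupt handler: reclaim completed tx slots and
      * schedule NAPI polling if rx responses are pending. */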
1308static irqreturn_t xennet_interrupt(int irq, void *dev_id)
1309{
1310        struct net_device *dev = dev_id;
1311        struct netfront_info *np = netdev_priv(dev);
1312        unsigned long flags;
1313
1314        spin_lock_irqsave(&np->tx_lock, flags);
1315
1316        if (likely(netif_carrier_ok(dev))) {
1317                xennet_tx_buf_gc(dev);
1318                /* Under tx_lock: protects access to rx shared-ring indexes. */
1319                if (RING_HAS_UNCONSUMED_RESPONSES(&np->rx))
1320                        napi_schedule(&np->napi);
1321        }
1322
1323        spin_unlock_irqrestore(&np->tx_lock, flags);
1324
1325        return IRQ_HANDLED;
1326}
1327
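     /* Read the MAC address from xenstore, allocate and grant the shared
      * tx/rx rings, and bind an event channel for backend notifications. */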
1328static int setup_netfront(struct xenbus_device *dev, struct netfront_info *info)
1329{
1330        struct xen_netif_tx_sring *txs;
1331        struct xen_netif_rx_sring *rxs;
1332        int err;
1333        struct net_device *netdev = info->netdev;
1334
1335        info->tx_ring_ref = GRANT_INVALID_REF;
1336        info->rx_ring_ref = GRANT_INVALID_REF;
1337        info->rx.sring = NULL;
1338        info->tx.sring = NULL;
1339        netdev->irq = 0;
1340
1341        err = xen_net_read_mac(dev, netdev->dev_addr);
1342        if (err) {
1343                xenbus_dev_fatal(dev, err, "parsing %s/mac", dev->nodename);
1344                goto fail;
1345        }
1346
1347        txs = (struct xen_netif_tx_sring *)get_zeroed_page(GFP_NOIO | __GFP_HIGH);
1348        if (!txs) {
1349                err = -ENOMEM;
1350                xenbus_dev_fatal(dev, err, "allocating tx ring page");
1351                goto fail;
1352        }
1353        SHARED_RING_INIT(txs);
1354        FRONT_RING_INIT(&info->tx, txs, PAGE_SIZE);
1355
1356        err = xenbus_grant_ring(dev, virt_to_mfn(txs));
1357        if (err < 0) {
1358                free_page((unsigned long)txs);
1359                goto fail;
1360        }
1361
1362        info->tx_ring_ref = err;
1363        rxs = (struct xen_netif_rx_sring *)get_zeroed_page(GFP_NOIO | __GFP_HIGH);
1364        if (!rxs) {
1365                err = -ENOMEM;
1366                xenbus_dev_fatal(dev, err, "allocating rx ring page");
1367                goto fail;
1368        }
1369        SHARED_RING_INIT(rxs);
1370        FRONT_RING_INIT(&info->rx, rxs, PAGE_SIZE);
1371
1372        err = xenbus_grant_ring(dev, virt_to_mfn(rxs));
1373        if (err < 0) {
1374                free_page((unsigned long)rxs);
1375                goto fail;
1376        }
1377        info->rx_ring_ref = err;
1378
1379        err = xenbus_alloc_evtchn(dev, &info->evtchn);
1380        if (err)
1381                goto fail;
1382
1383        err = bind_evtchn_to_irqhandler(info->evtchn, xennet_interrupt,
1384                                        IRQF_SAMPLE_RANDOM, netdev->name,
1385                                        netdev);
1386        if (err < 0)
1387                goto fail;
1388        netdev->irq = err;
1389        return 0;
1390
1391 fail:
1392        return err;
1393}
1394
1395/* Common code used when first setting up, and when resuming. */
1396static int talk_to_backend(struct xenbus_device *dev,
1397                           struct netfront_info *info)
1398{
1399        const char *message;
1400        struct xenbus_transaction xbt;
1401        int err;
1402
1403        /* Create shared ring, alloc event channel. */
1404        err = setup_netfront(dev, info);
1405        if (err)
1406                goto out;
1407
1408again:
1409        err = xenbus_transaction_start(&xbt);
1410        if (err) {
1411                xenbus_dev_fatal(dev, err, "starting transaction");
1412                goto destroy_ring;
1413        }
1414
1415        err = xenbus_printf(xbt, dev->nodename, "tx-ring-ref", "%u",
1416                            info->tx_ring_ref);
1417        if (err) {
1418                message = "writing tx ring-ref";
1419                goto abort_transaction;
1420        }
1421        err = xenbus_printf(xbt, dev->nodename, "rx-ring-ref", "%u",
1422                            info->rx_ring_ref);
1423        if (err) {
1424                message = "writing rx ring-ref";
1425                goto abort_transaction;
1426        }
1427        err = xenbus_printf(xbt, dev->nodename,
1428                            "event-channel", "%u", info->evtchn);
1429        if (err) {
1430                message = "writing event-channel";
1431                goto abort_transaction;
1432        }
1433
1434        err = xenbus_printf(xbt, dev->nodename, "request-rx-copy", "%u",
1435                            1);
1436        if (err) {
1437                message = "writing request-rx-copy";
1438                goto abort_transaction;
1439        }
1440
1441        err = xenbus_printf(xbt, dev->nodename, "feature-rx-notify", "%d", 1);
1442        if (err) {
1443                message = "writing feature-rx-notify";
1444                goto abort_transaction;
1445        }
1446
1447        err = xenbus_printf(xbt, dev->nodename, "feature-sg", "%d", 1);
1448        if (err) {
1449                message = "writing feature-sg";
1450                goto abort_transaction;
1451        }
1452
1453        err = xenbus_printf(xbt, dev->nodename, "feature-gso-tcpv4", "%d", 1);
1454        if (err) {
1455                message = "writing feature-gso-tcpv4";
1456                goto abort_transaction;
1457        }
1458
1459        err = xenbus_transaction_end(xbt, 0);
1460        if (err) {
1461                if (err == -EAGAIN)
1462                        goto again;
1463                xenbus_dev_fatal(dev, err, "completing transaction");
1464                goto destroy_ring;
1465        }
1466
1467        return 0;
1468
1469 abort_transaction:
1470        xenbus_transaction_end(xbt, 1);
1471        xenbus_dev_fatal(dev, err, "%s", message);
1472 destroy_ring:
1473        xennet_disconnect_backend(info);
1474 out:
1475        return err;
1476}
1477
1478static int xennet_set_sg(struct net_device *dev, u32 data)
1479{
1480        if (data) {
1481                struct netfront_info *np = netdev_priv(dev);
1482                int val;
1483
1484                if (xenbus_scanf(XBT_NIL, np->xbdev->otherend, "feature-sg",
1485                                 "%d", &val) < 0)
1486                        val = 0;
1487                if (!val)
1488                        return -ENOSYS;
1489        } else if (dev->mtu > ETH_DATA_LEN)
1490                dev->mtu = ETH_DATA_LEN;
1491
1492        return ethtool_op_set_sg(dev, data);
1493}
1494
1495static int xennet_set_tso(struct net_device *dev, u32 data)
1496{
1497        if (data) {
1498                struct netfront_info *np = netdev_priv(dev);
1499                int val;
1500
1501                if (xenbus_scanf(XBT_NIL, np->xbdev->otherend,
1502                                 "feature-gso-tcpv4", "%d", &val) < 0)
1503                        val = 0;
1504                if (!val)
1505                        return -ENOSYS;
1506        }
1507
1508        return ethtool_op_set_tso(dev, data);
1509}
1510
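/*
 * (Re)negotiate offload features with the backend: start from a
 * conservative baseline (no GSO apart from the ROBUST flag, no SG) and
 * enable scatter-gather and then TSO only if checksum offload is
 * available and the backend supports them.
 */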
1511static void xennet_set_features(struct net_device *dev)
1512{
1513        /* Turn off all GSO bits except ROBUST. */
1514        dev->features &= ~NETIF_F_GSO_MASK;
1515        dev->features |= NETIF_F_GSO_ROBUST;
1516        xennet_set_sg(dev, 0);
1517
1518        /* We need checksum offload to enable scatter/gather and TSO. */
1519        if (!(dev->features & NETIF_F_IP_CSUM))
1520                return;
1521
1522        if (!xennet_set_sg(dev, 1))
1523                xennet_set_tso(dev, 1);
1524}
1525
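/*
 * (Re)connect to the backend: insist on the RX-copy receive path,
 * publish our ring references and event channel via xenbus, renegotiate
 * offload features, then rebuild the RX ring from the buffers that
 * survived the disconnect and kick the backend.
 */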
1526static int xennet_connect(struct net_device *dev)
1527{
1528        struct netfront_info *np = netdev_priv(dev);
1529        int i, requeue_idx, err;
1530        struct sk_buff *skb;
1531        grant_ref_t ref;
1532        struct xen_netif_rx_request *req;
1533        unsigned int feature_rx_copy;
1534
1535        err = xenbus_scanf(XBT_NIL, np->xbdev->otherend,
1536                           "feature-rx-copy", "%u", &feature_rx_copy);
1537        if (err != 1)
1538                feature_rx_copy = 0;
1539
1540        if (!feature_rx_copy) {
1541                dev_info(&dev->dev,
1542                         "backend does not support copying receive path\n");
1543                return -ENODEV;
1544        }
1545
1546        err = talk_to_backend(np->xbdev, np);
1547        if (err)
1548                return err;
1549
1550        xennet_set_features(dev);
1551
1552        spin_lock_bh(&np->rx_lock);
1553        spin_lock_irq(&np->tx_lock);
1554
1555        /* Step 1: Discard all pending TX packet fragments. */
1556        xennet_release_tx_bufs(np);
1557
1558        /* Step 2: Rebuild the RX buffer freelist and the RX ring itself. */
1559        for (requeue_idx = 0, i = 0; i < NET_RX_RING_SIZE; i++) {
1560                if (!np->rx_skbs[i])
1561                        continue;
1562
1563                skb = np->rx_skbs[requeue_idx] = xennet_get_rx_skb(np, i);
1564                ref = np->grant_rx_ref[requeue_idx] = xennet_get_rx_ref(np, i);
1565                req = RING_GET_REQUEST(&np->rx, requeue_idx);
1566
1567                gnttab_grant_foreign_access_ref(
1568                        ref, np->xbdev->otherend_id,
1569                        pfn_to_mfn(page_to_pfn(skb_shinfo(skb)->
1570                                               frags->page)),
1571                        0);
1572                req->gref = ref;
1573                req->id   = requeue_idx;
1574
1575                requeue_idx++;
1576        }
1577
1578        np->rx.req_prod_pvt = requeue_idx;
1579
1580        /*
1581         * Step 3: All public and private state should now be sane.  Get
1582         * ready to start sending and receiving packets and give the driver
1583         * domain a kick because we've probably just requeued some
1584         * packets.
1585         */
1586        netif_carrier_on(np->netdev);
1587        notify_remote_via_irq(np->netdev->irq);
1588        xennet_tx_buf_gc(dev);
1589        xennet_alloc_rx_buffers(dev);
1590
1591        spin_unlock_irq(&np->tx_lock);
1592        spin_unlock_bh(&np->rx_lock);
1593
1594        return 0;
1595}
1596
/**
 * backend_changed - callback received when the backend's state changes
 * @dev: xenbus device for this frontend
 * @backend_state: new state of the backend
 */
1600static void backend_changed(struct xenbus_device *dev,
1601                            enum xenbus_state backend_state)
1602{
1603        struct netfront_info *np = dev_get_drvdata(&dev->dev);
1604        struct net_device *netdev = np->netdev;
1605
1606        dev_dbg(&dev->dev, "%s\n", xenbus_strstate(backend_state));
1607
1608        switch (backend_state) {
1609        case XenbusStateInitialising:
1610        case XenbusStateInitialised:
1611        case XenbusStateConnected:
1612        case XenbusStateUnknown:
1613        case XenbusStateClosed:
1614                break;
1615
1616        case XenbusStateInitWait:
1617                if (dev->state != XenbusStateInitialising)
1618                        break;
1619                if (xennet_connect(netdev) != 0)
1620                        break;
1621                xenbus_switch_state(dev, XenbusStateConnected);
1622                break;
1623
1624        case XenbusStateClosing:
1625                xenbus_frontend_closed(dev);
1626                break;
1627        }
1628}
1629
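/*
 * ethtool operations: TX checksumming uses the generic helper, while the
 * SG and TSO toggles are wrapped so they can be refused when the backend
 * lacks the corresponding feature.
 */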
static const struct ethtool_ops xennet_ethtool_ops = {
1632        .set_tx_csum = ethtool_op_set_tx_csum,
1633        .set_sg = xennet_set_sg,
1634        .set_tso = xennet_set_tso,
1635        .get_link = ethtool_op_get_link,
1636};
1637
1638#ifdef CONFIG_SYSFS
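/*
 * sysfs attributes exposing the RX buffer fill targets.  rxbuf_min and
 * rxbuf_max are writable (CAP_NET_ADMIN) and are clamped to the driver's
 * RX_MIN_TARGET/RX_MAX_TARGET limits; rxbuf_cur reports the current
 * target and is read-only.  The files appear under
 * /sys/class/net/<ifname>/, so e.g.
 *
 *   echo 128 > /sys/class/net/eth0/rxbuf_min
 *
 * (interface name purely illustrative) raises the minimum fill target.
 */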
1639static ssize_t show_rxbuf_min(struct device *dev,
1640                              struct device_attribute *attr, char *buf)
1641{
1642        struct net_device *netdev = to_net_dev(dev);
1643        struct netfront_info *info = netdev_priv(netdev);
1644
1645        return sprintf(buf, "%u\n", info->rx_min_target);
1646}
1647
1648static ssize_t store_rxbuf_min(struct device *dev,
1649                               struct device_attribute *attr,
1650                               const char *buf, size_t len)
1651{
1652        struct net_device *netdev = to_net_dev(dev);
1653        struct netfront_info *np = netdev_priv(netdev);
1654        char *endp;
1655        unsigned long target;
1656
1657        if (!capable(CAP_NET_ADMIN))
1658                return -EPERM;
1659
1660        target = simple_strtoul(buf, &endp, 0);
1661        if (endp == buf)
1662                return -EBADMSG;
1663
1664        if (target < RX_MIN_TARGET)
1665                target = RX_MIN_TARGET;
1666        if (target > RX_MAX_TARGET)
1667                target = RX_MAX_TARGET;
1668
1669        spin_lock_bh(&np->rx_lock);
1670        if (target > np->rx_max_target)
1671                np->rx_max_target = target;
1672        np->rx_min_target = target;
1673        if (target > np->rx_target)
1674                np->rx_target = target;
1675
1676        xennet_alloc_rx_buffers(netdev);
1677
1678        spin_unlock_bh(&np->rx_lock);
1679        return len;
1680}
1681
1682static ssize_t show_rxbuf_max(struct device *dev,
1683                              struct device_attribute *attr, char *buf)
1684{
1685        struct net_device *netdev = to_net_dev(dev);
1686        struct netfront_info *info = netdev_priv(netdev);
1687
1688        return sprintf(buf, "%u\n", info->rx_max_target);
1689}
1690
1691static ssize_t store_rxbuf_max(struct device *dev,
1692                               struct device_attribute *attr,
1693                               const char *buf, size_t len)
1694{
1695        struct net_device *netdev = to_net_dev(dev);
1696        struct netfront_info *np = netdev_priv(netdev);
1697        char *endp;
1698        unsigned long target;
1699
1700        if (!capable(CAP_NET_ADMIN))
1701                return -EPERM;
1702
1703        target = simple_strtoul(buf, &endp, 0);
1704        if (endp == buf)
1705                return -EBADMSG;
1706
1707        if (target < RX_MIN_TARGET)
1708                target = RX_MIN_TARGET;
1709        if (target > RX_MAX_TARGET)
1710                target = RX_MAX_TARGET;
1711
1712        spin_lock_bh(&np->rx_lock);
1713        if (target < np->rx_min_target)
1714                np->rx_min_target = target;
1715        np->rx_max_target = target;
1716        if (target < np->rx_target)
1717                np->rx_target = target;
1718
1719        xennet_alloc_rx_buffers(netdev);
1720
1721        spin_unlock_bh(&np->rx_lock);
1722        return len;
1723}
1724
1725static ssize_t show_rxbuf_cur(struct device *dev,
1726                              struct device_attribute *attr, char *buf)
1727{
1728        struct net_device *netdev = to_net_dev(dev);
1729        struct netfront_info *info = netdev_priv(netdev);
1730
1731        return sprintf(buf, "%u\n", info->rx_target);
1732}
1733
1734static struct device_attribute xennet_attrs[] = {
1735        __ATTR(rxbuf_min, S_IRUGO|S_IWUSR, show_rxbuf_min, store_rxbuf_min),
1736        __ATTR(rxbuf_max, S_IRUGO|S_IWUSR, show_rxbuf_max, store_rxbuf_max),
1737        __ATTR(rxbuf_cur, S_IRUGO, show_rxbuf_cur, NULL),
1738};
1739
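/* Create the rxbuf_* attributes, removing any already added on failure. */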
1740static int xennet_sysfs_addif(struct net_device *netdev)
1741{
1742        int i;
1743        int err;
1744
1745        for (i = 0; i < ARRAY_SIZE(xennet_attrs); i++) {
1746                err = device_create_file(&netdev->dev,
1747                                           &xennet_attrs[i]);
1748                if (err)
1749                        goto fail;
1750        }
1751        return 0;
1752
1753 fail:
1754        while (--i >= 0)
1755                device_remove_file(&netdev->dev, &xennet_attrs[i]);
1756        return err;
1757}
1758
1759static void xennet_sysfs_delif(struct net_device *netdev)
1760{
1761        int i;
1762
1763        for (i = 0; i < ARRAY_SIZE(xennet_attrs); i++)
1764                device_remove_file(&netdev->dev, &xennet_attrs[i]);
1765}
1766
1767#endif /* CONFIG_SYSFS */
1768
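/* Bind to any xenbus device of class "vif". */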
1769static struct xenbus_device_id netfront_ids[] = {
1770        { "vif" },
1771        { "" }
1772};
1774
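/*
 * Tear the device down: unregister the netdev first so no new I/O can be
 * started, disconnect from the backend, stop the RX refill timer, remove
 * the sysfs attributes and finally free the netdev.
 */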
1775static int __devexit xennet_remove(struct xenbus_device *dev)
1776{
1777        struct netfront_info *info = dev_get_drvdata(&dev->dev);
1778
1779        dev_dbg(&dev->dev, "%s\n", dev->nodename);
1780
1781        unregister_netdev(info->netdev);
1782
1783        xennet_disconnect_backend(info);
1784
1785        del_timer_sync(&info->rx_refill_timer);
1786
1787        xennet_sysfs_delif(info->netdev);
1788
1789        free_netdev(info->netdev);
1790
1791        return 0;
1792}
1793
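/*
 * xenbus driver glue; backend_changed is invoked whenever the backend
 * (the "other end") changes state.
 */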
1794static struct xenbus_driver netfront_driver = {
1795        .name = "vif",
1796        .owner = THIS_MODULE,
1797        .ids = netfront_ids,
1798        .probe = netfront_probe,
1799        .remove = __devexit_p(xennet_remove),
1800        .resume = netfront_resume,
1801        .otherend_changed = backend_changed,
1802};
1803
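/*
 * The frontend is only useful inside a Xen guest; dom0 hosts the backend
 * and has nothing to do here.
 */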
1804static int __init netif_init(void)
1805{
1806        if (!xen_domain())
1807                return -ENODEV;
1808
1809        if (xen_initial_domain())
1810                return 0;
1811
1812        printk(KERN_INFO "Initialising Xen virtual ethernet driver.\n");
1813
1814        return xenbus_register_frontend(&netfront_driver);
1815}
1816module_init(netif_init);
1817
1819static void __exit netif_exit(void)
1820{
1821        if (xen_initial_domain())
1822                return;
1823
1824        xenbus_unregister_driver(&netfront_driver);
1825}
1826module_exit(netif_exit);
1827
1828MODULE_DESCRIPTION("Xen virtual network device frontend");
1829MODULE_LICENSE("GPL");
1830MODULE_ALIAS("xen:vif");
1831MODULE_ALIAS("xennet");
1832