linux/net/ipv4/ipmr.c
/*
 *      IP multicast routing support for mrouted 3.6/3.8
 *
 *              (c) 1995 Alan Cox, <alan@redhat.com>
 *        Linux Consultancy and Custom Driver Development
 *
 *      This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 *
 *      Version: $Id: ipmr.c,v 1.65 2001/10/31 21:55:54 davem Exp $
 *
 *      Fixes:
 *      Michael Chastain        :       Incorrect size of copying.
 *      Alan Cox                :       Added the cache manager code
 *      Alan Cox                :       Fixed the clone/copy bug and device race.
 *      Mike McLagan            :       Routing by source
 *      Malcolm Beattie         :       Buffer handling fixes.
 *      Alexey Kuznetsov        :       Double buffer free and other fixes.
 *      SVR Anand               :       Fixed several multicast bugs and problems.
 *      Alexey Kuznetsov        :       Status, optimisations and more.
 *      Brad Parker             :       Better behaviour on mrouted upcall
 *                                      overflow.
 *      Carlos Picoto           :       PIMv1 Support
 *      Pavlin Ivanov Radoslavov:       PIMv2 Registers must checksum only PIM header
 *                                      Relax this requirement to work with older peers.
 *
 */

#include <asm/system.h>
#include <asm/uaccess.h>
#include <linux/types.h>
#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <linux/mm.h>
#include <linux/kernel.h>
#include <linux/fcntl.h>
#include <linux/stat.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/mroute.h>
#include <linux/init.h>
#include <linux/if_ether.h>
#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/route.h>
#include <net/sock.h>
#include <net/icmp.h>
#include <net/udp.h>
#include <net/raw.h>
#include <linux/notifier.h>
#include <linux/if_arp.h>
#include <linux/netfilter_ipv4.h>
#include <net/ipip.h>
#include <net/checksum.h>
#include <net/netlink.h>

#if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
#define CONFIG_IP_PIMSM 1
#endif

static struct sock *mroute_socket;


/* Big lock, protecting vif table, mrt cache and mroute socket state.
   Note that all changes are serialised under rtnl_lock.
 */

static DEFINE_RWLOCK(mrt_lock);

/*
 *      Multicast router control variables
 */

static struct vif_device vif_table[MAXVIFS];            /* Devices              */
static int maxvif;

#define VIF_EXISTS(idx) (vif_table[idx].dev != NULL)

static int mroute_do_assert;                            /* Set in PIM assert    */
static int mroute_do_pim;

static struct mfc_cache *mfc_cache_array[MFC_LINES];    /* Forwarding cache     */

static struct mfc_cache *mfc_unres_queue;               /* Queue of unresolved entries */
static atomic_t cache_resolve_queue_len;                /* Size of unresolved   */

/* Special spinlock for queue of unresolved entries */
static DEFINE_SPINLOCK(mfc_unres_lock);

/* We return to Alan's original scheme. The hash table of resolved
   entries is changed only in process context and protected
   with the weak lock mrt_lock. The queue of unresolved entries
   is protected with the strong spinlock mfc_unres_lock.

   This way the data path is entirely free of exclusive locks.
 */

static struct kmem_cache *mrt_cachep __read_mostly;

static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local);
static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert);
static int ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm);

#ifdef CONFIG_IP_PIMSM_V2
static struct net_protocol pim_protocol;
#endif

static struct timer_list ipmr_expire_timer;

/* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */

static
struct net_device *ipmr_new_tunnel(struct vifctl *v)
{
        struct net_device  *dev;

        dev = __dev_get_by_name(&init_net, "tunl0");

        if (dev) {
                int err;
                struct ifreq ifr;
                mm_segment_t    oldfs;
                struct ip_tunnel_parm p;
                struct in_device  *in_dev;

                memset(&p, 0, sizeof(p));
                p.iph.daddr = v->vifc_rmt_addr.s_addr;
                p.iph.saddr = v->vifc_lcl_addr.s_addr;
                p.iph.version = 4;
                p.iph.ihl = 5;
                p.iph.protocol = IPPROTO_IPIP;
                sprintf(p.name, "dvmrp%d", v->vifc_vifi);
                ifr.ifr_ifru.ifru_data = (void*)&p;

                oldfs = get_fs(); set_fs(KERNEL_DS);
                err = dev->do_ioctl(dev, &ifr, SIOCADDTUNNEL);
                set_fs(oldfs);

                dev = NULL;

                if (err == 0 && (dev = __dev_get_by_name(&init_net, p.name)) != NULL) {
                        dev->flags |= IFF_MULTICAST;

                        in_dev = __in_dev_get_rtnl(dev);
                        if (in_dev == NULL)
                                goto failure;

                        ipv4_devconf_setall(in_dev);
                        IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0;

                        if (dev_open(dev))
                                goto failure;
                }
        }
        return dev;

failure:
        /* allow the register to be completed before unregistering. */
        rtnl_unlock();
        rtnl_lock();

        unregister_netdevice(dev);
        return NULL;
}

#ifdef CONFIG_IP_PIMSM

static int reg_vif_num = -1;

static int reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
{
        read_lock(&mrt_lock);
        ((struct net_device_stats*)netdev_priv(dev))->tx_bytes += skb->len;
        ((struct net_device_stats*)netdev_priv(dev))->tx_packets++;
        ipmr_cache_report(skb, reg_vif_num, IGMPMSG_WHOLEPKT);
        read_unlock(&mrt_lock);
        kfree_skb(skb);
        return 0;
}

static struct net_device_stats *reg_vif_get_stats(struct net_device *dev)
{
        return (struct net_device_stats*)netdev_priv(dev);
}

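/* Set up and register the "pimreg" pseudo-device that represents the
 * PIM REGISTER vif; packets sent through it are bounced up to the
 * daemon via reg_vif_xmit() above.
 */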
static void reg_vif_setup(struct net_device *dev)
{
        dev->type               = ARPHRD_PIMREG;
        dev->mtu                = ETH_DATA_LEN - sizeof(struct iphdr) - 8;
        dev->flags              = IFF_NOARP;
        dev->hard_start_xmit    = reg_vif_xmit;
        dev->get_stats          = reg_vif_get_stats;
        dev->destructor         = free_netdev;
}

static struct net_device *ipmr_reg_vif(void)
{
        struct net_device *dev;
        struct in_device *in_dev;

        dev = alloc_netdev(sizeof(struct net_device_stats), "pimreg",
                           reg_vif_setup);

        if (dev == NULL)
                return NULL;

        if (register_netdevice(dev)) {
                free_netdev(dev);
                return NULL;
        }
        dev->iflink = 0;

        rcu_read_lock();
        if ((in_dev = __in_dev_get_rcu(dev)) == NULL) {
                rcu_read_unlock();
                goto failure;
        }

        ipv4_devconf_setall(in_dev);
        IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0;
        rcu_read_unlock();

        if (dev_open(dev))
                goto failure;

        return dev;

failure:
        /* allow the register to be completed before unregistering. */
        rtnl_unlock();
        rtnl_lock();

        unregister_netdevice(dev);
        return NULL;
}
#endif

/*
 *      Delete a VIF entry
 */

static int vif_delete(int vifi)
{
        struct vif_device *v;
        struct net_device *dev;
        struct in_device *in_dev;

        if (vifi < 0 || vifi >= maxvif)
                return -EADDRNOTAVAIL;

        v = &vif_table[vifi];

        write_lock_bh(&mrt_lock);
        dev = v->dev;
        v->dev = NULL;

        if (!dev) {
                write_unlock_bh(&mrt_lock);
                return -EADDRNOTAVAIL;
        }

#ifdef CONFIG_IP_PIMSM
        if (vifi == reg_vif_num)
                reg_vif_num = -1;
#endif

        if (vifi+1 == maxvif) {
                int tmp;
                for (tmp=vifi-1; tmp>=0; tmp--) {
                        if (VIF_EXISTS(tmp))
                                break;
                }
                maxvif = tmp+1;
        }

        write_unlock_bh(&mrt_lock);

        dev_set_allmulti(dev, -1);

        if ((in_dev = __in_dev_get_rtnl(dev)) != NULL) {
                IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)--;
                ip_rt_multicast_event(in_dev);
        }

        if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER))
                unregister_netdevice(dev);

        dev_put(dev);
        return 0;
}

/* Destroy an unresolved cache entry, killing queued skbs
   and reporting error to netlink readers.
 */

static void ipmr_destroy_unres(struct mfc_cache *c)
{
        struct sk_buff *skb;
        struct nlmsgerr *e;

        atomic_dec(&cache_resolve_queue_len);

        while ((skb=skb_dequeue(&c->mfc_un.unres.unresolved))) {
                if (ip_hdr(skb)->version == 0) {
                        struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
                        nlh->nlmsg_type = NLMSG_ERROR;
                        nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
                        skb_trim(skb, nlh->nlmsg_len);
                        e = NLMSG_DATA(nlh);
                        e->error = -ETIMEDOUT;
                        memset(&e->msg, 0, sizeof(e->msg));

                        rtnl_unicast(skb, NETLINK_CB(skb).pid);
                } else
                        kfree_skb(skb);
        }

        kmem_cache_free(mrt_cachep, c);
}


/* Single timer process for all the unresolved queue. */

static void ipmr_expire_process(unsigned long dummy)
{
        unsigned long now;
        unsigned long expires;
        struct mfc_cache *c, **cp;

        if (!spin_trylock(&mfc_unres_lock)) {
                mod_timer(&ipmr_expire_timer, jiffies+HZ/10);
                return;
        }

        if (atomic_read(&cache_resolve_queue_len) == 0)
                goto out;

        now = jiffies;
        expires = 10*HZ;
        cp = &mfc_unres_queue;

        while ((c=*cp) != NULL) {
                if (time_after(c->mfc_un.unres.expires, now)) {
                        unsigned long interval = c->mfc_un.unres.expires - now;
                        if (interval < expires)
                                expires = interval;
                        cp = &c->next;
                        continue;
                }

                *cp = c->next;

                ipmr_destroy_unres(c);
        }

        if (atomic_read(&cache_resolve_queue_len))
                mod_timer(&ipmr_expire_timer, jiffies + expires);

out:
        spin_unlock(&mfc_unres_lock);
}

/* Fill oifs list. It is called under write locked mrt_lock. */

static void ipmr_update_thresholds(struct mfc_cache *cache, unsigned char *ttls)
{
        int vifi;

        cache->mfc_un.res.minvif = MAXVIFS;
        cache->mfc_un.res.maxvif = 0;
        memset(cache->mfc_un.res.ttls, 255, MAXVIFS);

        for (vifi=0; vifi<maxvif; vifi++) {
                if (VIF_EXISTS(vifi) && ttls[vifi] && ttls[vifi] < 255) {
                        cache->mfc_un.res.ttls[vifi] = ttls[vifi];
                        if (cache->mfc_un.res.minvif > vifi)
                                cache->mfc_un.res.minvif = vifi;
                        if (cache->mfc_un.res.maxvif <= vifi)
                                cache->mfc_un.res.maxvif = vifi + 1;
                }
        }
}

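/* Add the virtual interface described by @vifc. Runs under rtnl_lock
 * (taken by the setsockopt path); mrt_lock is taken only for the
 * final table update.
 */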
static int vif_add(struct vifctl *vifc, int mrtsock)
{
        int vifi = vifc->vifc_vifi;
        struct vif_device *v = &vif_table[vifi];
        struct net_device *dev;
        struct in_device *in_dev;

        /* Is vif busy? */
        if (VIF_EXISTS(vifi))
                return -EADDRINUSE;

        switch (vifc->vifc_flags) {
#ifdef CONFIG_IP_PIMSM
        case VIFF_REGISTER:
                /*
                 * Special Purpose VIF in PIM
                 * All the packets will be sent to the daemon
                 */
                if (reg_vif_num >= 0)
                        return -EADDRINUSE;
                dev = ipmr_reg_vif();
                if (!dev)
                        return -ENOBUFS;
                break;
#endif
        case VIFF_TUNNEL:
                dev = ipmr_new_tunnel(vifc);
                if (!dev)
                        return -ENOBUFS;
                break;
        case 0:
                dev = ip_dev_find(vifc->vifc_lcl_addr.s_addr);
                if (!dev)
                        return -EADDRNOTAVAIL;
                dev_put(dev);
                break;
        default:
                return -EINVAL;
        }

        if ((in_dev = __in_dev_get_rtnl(dev)) == NULL)
                return -EADDRNOTAVAIL;
        IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)++;
        dev_set_allmulti(dev, +1);
        ip_rt_multicast_event(in_dev);

        /*
         *      Fill in the VIF structures
         */
        v->rate_limit=vifc->vifc_rate_limit;
        v->local=vifc->vifc_lcl_addr.s_addr;
        v->remote=vifc->vifc_rmt_addr.s_addr;
        v->flags=vifc->vifc_flags;
        if (!mrtsock)
                v->flags |= VIFF_STATIC;
        v->threshold=vifc->vifc_threshold;
        v->bytes_in = 0;
        v->bytes_out = 0;
        v->pkt_in = 0;
        v->pkt_out = 0;
        v->link = dev->ifindex;
        if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER))
                v->link = dev->iflink;

        /* And finish update writing critical data */
        write_lock_bh(&mrt_lock);
        dev_hold(dev);
        v->dev=dev;
#ifdef CONFIG_IP_PIMSM
        if (v->flags&VIFF_REGISTER)
                reg_vif_num = vifi;
#endif
        if (vifi+1 > maxvif)
                maxvif = vifi+1;
        write_unlock_bh(&mrt_lock);
        return 0;
}

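/* Look up a resolved (origin, group) entry; the caller must hold mrt_lock. */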
static struct mfc_cache *ipmr_cache_find(__be32 origin, __be32 mcastgrp)
{
        int line=MFC_HASH(mcastgrp,origin);
        struct mfc_cache *c;

        for (c=mfc_cache_array[line]; c; c = c->next) {
                if (c->mfc_origin==origin && c->mfc_mcastgrp==mcastgrp)
                        break;
        }
        return c;
}

/*
 *      Allocate a multicast cache entry
 */
static struct mfc_cache *ipmr_cache_alloc(void)
{
        struct mfc_cache *c=kmem_cache_zalloc(mrt_cachep, GFP_KERNEL);
        if (c==NULL)
                return NULL;
        c->mfc_un.res.minvif = MAXVIFS;
        return c;
}

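/* Allocate an entry for the unresolved queue; callable from softirq
 * context, hence GFP_ATOMIC, with a 10 second resolution deadline.
 */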
static struct mfc_cache *ipmr_cache_alloc_unres(void)
{
        struct mfc_cache *c=kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC);
        if (c==NULL)
                return NULL;
        skb_queue_head_init(&c->mfc_un.unres.unresolved);
        c->mfc_un.unres.expires = jiffies + 10*HZ;
        return c;
}

/*
 *      A cache entry has gone from the unresolved queue to the resolved state
 */

static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c)
{
        struct sk_buff *skb;
        struct nlmsgerr *e;

        /*
         *      Play the pending entries through our router
         */

        while ((skb=__skb_dequeue(&uc->mfc_un.unres.unresolved))) {
                if (ip_hdr(skb)->version == 0) {
                        struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));

                        if (ipmr_fill_mroute(skb, c, NLMSG_DATA(nlh)) > 0) {
                                nlh->nlmsg_len = (skb_tail_pointer(skb) -
                                                  (u8 *)nlh);
                        } else {
                                nlh->nlmsg_type = NLMSG_ERROR;
                                nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
                                skb_trim(skb, nlh->nlmsg_len);
                                e = NLMSG_DATA(nlh);
                                e->error = -EMSGSIZE;
                                memset(&e->msg, 0, sizeof(e->msg));
                        }

                        rtnl_unicast(skb, NETLINK_CB(skb).pid);
                } else
                        ip_mr_forward(skb, c, 0);
        }
}

/*
 *      Bounce a cache query up to mrouted. We could use netlink for this but mrouted
 *      expects the following bizarre scheme.
 *
 *      Called under mrt_lock.
 */

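/* Layout note: for IGMPMSG_NOCACHE/IGMPMSG_WRONGVIF reports the copied
 * IP header below doubles as the struct igmpmsg seen by the daemon
 * (see linux/mroute.h, where the message fields overlay the header);
 * clearing iph->protocol is what marks the packet as an upcall rather
 * than real IGMP.
 */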
static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert)
{
        struct sk_buff *skb;
        const int ihl = ip_hdrlen(pkt);
        struct igmphdr *igmp;
        struct igmpmsg *msg;
        int ret;

#ifdef CONFIG_IP_PIMSM
        if (assert == IGMPMSG_WHOLEPKT)
                skb = skb_realloc_headroom(pkt, sizeof(struct iphdr));
        else
#endif
                skb = alloc_skb(128, GFP_ATOMIC);

        if (!skb)
                return -ENOBUFS;

#ifdef CONFIG_IP_PIMSM
        if (assert == IGMPMSG_WHOLEPKT) {
                /* Ugly, but we have no choice with this interface.
                   Duplicate old header, fix ihl, length etc.
                   And all this only to mangle msg->im_msgtype and
                   to set msg->im_mbz to "mbz" :-)
                 */
                skb_push(skb, sizeof(struct iphdr));
                skb_reset_network_header(skb);
                skb_reset_transport_header(skb);
                msg = (struct igmpmsg *)skb_network_header(skb);
                memcpy(msg, skb_network_header(pkt), sizeof(struct iphdr));
                msg->im_msgtype = IGMPMSG_WHOLEPKT;
                msg->im_mbz = 0;
                msg->im_vif = reg_vif_num;
                ip_hdr(skb)->ihl = sizeof(struct iphdr) >> 2;
                ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(pkt)->tot_len) +
                                             sizeof(struct iphdr));
        } else
#endif
        {

        /*
         *      Copy the IP header
         */

        skb->network_header = skb->tail;
        skb_put(skb, ihl);
        skb_copy_to_linear_data(skb, pkt->data, ihl);
        ip_hdr(skb)->protocol = 0;                      /* Flag to the kernel this is a route add */
        msg = (struct igmpmsg *)skb_network_header(skb);
        msg->im_vif = vifi;
        skb->dst = dst_clone(pkt->dst);

        /*
         *      Add our header
         */

        igmp=(struct igmphdr *)skb_put(skb,sizeof(struct igmphdr));
        igmp->type      =
        msg->im_msgtype = assert;
        igmp->code      =       0;
        ip_hdr(skb)->tot_len = htons(skb->len);                 /* Fix the length */
        skb->transport_header = skb->network_header;
        }

        if (mroute_socket == NULL) {
                kfree_skb(skb);
                return -EINVAL;
        }

        /*
         *      Deliver to mrouted
         */
        if ((ret=sock_queue_rcv_skb(mroute_socket,skb))<0) {
                if (net_ratelimit())
                        printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n");
                kfree_skb(skb);
        }

        return ret;
}

/*
 *      Queue a packet for resolution. It gets a locked cache entry!
 */

static int
ipmr_cache_unresolved(vifi_t vifi, struct sk_buff *skb)
{
        int err;
        struct mfc_cache *c;
        const struct iphdr *iph = ip_hdr(skb);

        spin_lock_bh(&mfc_unres_lock);
        for (c=mfc_unres_queue; c; c=c->next) {
                if (c->mfc_mcastgrp == iph->daddr &&
                    c->mfc_origin == iph->saddr)
                        break;
        }

        if (c == NULL) {
                /*
                 *      Create a new entry if allowable
                 */

                if (atomic_read(&cache_resolve_queue_len)>=10 ||
                    (c=ipmr_cache_alloc_unres())==NULL) {
                        spin_unlock_bh(&mfc_unres_lock);

                        kfree_skb(skb);
                        return -ENOBUFS;
                }

                /*
                 *      Fill in the new cache entry
                 */
                c->mfc_parent   = -1;
                c->mfc_origin   = iph->saddr;
                c->mfc_mcastgrp = iph->daddr;

                /*
                 *      Reflect first query at mrouted.
                 */
                if ((err = ipmr_cache_report(skb, vifi, IGMPMSG_NOCACHE))<0) {
                        /* If the report failed throw the cache entry
                           out - Brad Parker
                         */
                        spin_unlock_bh(&mfc_unres_lock);

                        kmem_cache_free(mrt_cachep, c);
                        kfree_skb(skb);
                        return err;
                }

                atomic_inc(&cache_resolve_queue_len);
                c->next = mfc_unres_queue;
                mfc_unres_queue = c;

                mod_timer(&ipmr_expire_timer, c->mfc_un.unres.expires);
        }

        /*
         *      See if we can append the packet
         */
        if (c->mfc_un.unres.unresolved.qlen>3) {
                kfree_skb(skb);
                err = -ENOBUFS;
        } else {
                skb_queue_tail(&c->mfc_un.unres.unresolved,skb);
                err = 0;
        }

        spin_unlock_bh(&mfc_unres_lock);
        return err;
}

/*
 *      MFC cache manipulation by user space mroute daemon
 */

static int ipmr_mfc_delete(struct mfcctl *mfc)
{
        int line;
        struct mfc_cache *c, **cp;

        line=MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);

        for (cp=&mfc_cache_array[line]; (c=*cp) != NULL; cp = &c->next) {
                if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
                    c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) {
                        write_lock_bh(&mrt_lock);
                        *cp = c->next;
                        write_unlock_bh(&mrt_lock);

                        kmem_cache_free(mrt_cachep, c);
                        return 0;
                }
        }
        return -ENOENT;
}

static int ipmr_mfc_add(struct mfcctl *mfc, int mrtsock)
{
        int line;
        struct mfc_cache *uc, *c, **cp;

        line=MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);

        for (cp=&mfc_cache_array[line]; (c=*cp) != NULL; cp = &c->next) {
                if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
                    c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr)
                        break;
        }

        if (c != NULL) {
                write_lock_bh(&mrt_lock);
                c->mfc_parent = mfc->mfcc_parent;
                ipmr_update_thresholds(c, mfc->mfcc_ttls);
                if (!mrtsock)
                        c->mfc_flags |= MFC_STATIC;
                write_unlock_bh(&mrt_lock);
                return 0;
        }

        if (!MULTICAST(mfc->mfcc_mcastgrp.s_addr))
                return -EINVAL;

        c=ipmr_cache_alloc();
        if (c==NULL)
                return -ENOMEM;

        c->mfc_origin=mfc->mfcc_origin.s_addr;
        c->mfc_mcastgrp=mfc->mfcc_mcastgrp.s_addr;
        c->mfc_parent=mfc->mfcc_parent;
        ipmr_update_thresholds(c, mfc->mfcc_ttls);
        if (!mrtsock)
                c->mfc_flags |= MFC_STATIC;

        write_lock_bh(&mrt_lock);
        c->next = mfc_cache_array[line];
        mfc_cache_array[line] = c;
        write_unlock_bh(&mrt_lock);

        /*
         *      Check to see if we resolved a queued list. If so we
         *      need to send on the frames and tidy up.
         */
        spin_lock_bh(&mfc_unres_lock);
        for (cp = &mfc_unres_queue; (uc=*cp) != NULL;
             cp = &uc->next) {
                if (uc->mfc_origin == c->mfc_origin &&
                    uc->mfc_mcastgrp == c->mfc_mcastgrp) {
                        *cp = uc->next;
                        if (atomic_dec_and_test(&cache_resolve_queue_len))
                                del_timer(&ipmr_expire_timer);
                        break;
                }
        }
        spin_unlock_bh(&mfc_unres_lock);

        if (uc) {
                ipmr_cache_resolve(uc, c);
                kmem_cache_free(mrt_cachep, uc);
        }
        return 0;
}

/*
 *      Close the multicast socket, and clear the vif tables etc
 */

static void mroute_clean_tables(struct sock *sk)
{
        int i;

        /*
         *      Shut down all active vif entries
         */
        for (i=0; i<maxvif; i++) {
                if (!(vif_table[i].flags&VIFF_STATIC))
                        vif_delete(i);
        }

        /*
         *      Wipe the cache
         */
        for (i=0;i<MFC_LINES;i++) {
                struct mfc_cache *c, **cp;

                cp = &mfc_cache_array[i];
                while ((c = *cp) != NULL) {
                        if (c->mfc_flags&MFC_STATIC) {
                                cp = &c->next;
                                continue;
                        }
                        write_lock_bh(&mrt_lock);
                        *cp = c->next;
                        write_unlock_bh(&mrt_lock);

                        kmem_cache_free(mrt_cachep, c);
                }
        }

        if (atomic_read(&cache_resolve_queue_len) != 0) {
                struct mfc_cache *c;

                spin_lock_bh(&mfc_unres_lock);
                while (mfc_unres_queue != NULL) {
                        c = mfc_unres_queue;
                        mfc_unres_queue = c->next;
                        spin_unlock_bh(&mfc_unres_lock);

                        ipmr_destroy_unres(c);

                        spin_lock_bh(&mfc_unres_lock);
                }
                spin_unlock_bh(&mfc_unres_lock);
        }
}

static void mrtsock_destruct(struct sock *sk)
{
        rtnl_lock();
        if (sk == mroute_socket) {
                IPV4_DEVCONF_ALL(MC_FORWARDING)--;

                write_lock_bh(&mrt_lock);
                mroute_socket=NULL;
                write_unlock_bh(&mrt_lock);

                mroute_clean_tables(sk);
        }
        rtnl_unlock();
}

/*
 *      Socket options and virtual interface manipulation. The whole
 *      virtual interface system is a complete heap, but unfortunately
 *      that's how BSD mrouted happens to think. Maybe one day with a proper
 *      MOSPF/PIM router set up we can clean this up.
 */

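/* For reference, a minimal sketch of how a daemon such as mrouted
 * claims the mroute socket from userspace (illustrative only, not
 * part of this file):
 *
 *      int fd = socket(AF_INET, SOCK_RAW, IPPROTO_IGMP);
 *      int one = 1;
 *      setsockopt(fd, IPPROTO_IP, MRT_INIT, &one, sizeof(one));
 *      ...
 *      setsockopt(fd, IPPROTO_IP, MRT_DONE, NULL, 0);
 *
 * MRT_INIT is rejected unless the socket is SOCK_RAW/IPPROTO_IGMP,
 * and only one mroute socket may exist at a time (-EADDRINUSE).
 */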
int ip_mroute_setsockopt(struct sock *sk,int optname,char __user *optval,int optlen)
{
        int ret;
        struct vifctl vif;
        struct mfcctl mfc;

        if (optname != MRT_INIT) {
                if (sk != mroute_socket && !capable(CAP_NET_ADMIN))
                        return -EACCES;
        }

        switch (optname) {
        case MRT_INIT:
                if (sk->sk_type != SOCK_RAW ||
                    inet_sk(sk)->num != IPPROTO_IGMP)
                        return -EOPNOTSUPP;
                if (optlen!=sizeof(int))
                        return -ENOPROTOOPT;

                rtnl_lock();
                if (mroute_socket) {
                        rtnl_unlock();
                        return -EADDRINUSE;
                }

                ret = ip_ra_control(sk, 1, mrtsock_destruct);
                if (ret == 0) {
                        write_lock_bh(&mrt_lock);
                        mroute_socket=sk;
                        write_unlock_bh(&mrt_lock);

                        IPV4_DEVCONF_ALL(MC_FORWARDING)++;
                }
                rtnl_unlock();
                return ret;
        case MRT_DONE:
                if (sk!=mroute_socket)
                        return -EACCES;
                return ip_ra_control(sk, 0, NULL);
        case MRT_ADD_VIF:
        case MRT_DEL_VIF:
                if (optlen!=sizeof(vif))
                        return -EINVAL;
                if (copy_from_user(&vif,optval,sizeof(vif)))
                        return -EFAULT;
                if (vif.vifc_vifi >= MAXVIFS)
                        return -ENFILE;
                rtnl_lock();
                if (optname==MRT_ADD_VIF) {
                        ret = vif_add(&vif, sk==mroute_socket);
                } else {
                        ret = vif_delete(vif.vifc_vifi);
                }
                rtnl_unlock();
                return ret;

                /*
                 *      Manipulate the forwarding caches. These live
                 *      in a sort of kernel/user symbiosis.
                 */
        case MRT_ADD_MFC:
        case MRT_DEL_MFC:
                if (optlen!=sizeof(mfc))
                        return -EINVAL;
                if (copy_from_user(&mfc,optval, sizeof(mfc)))
                        return -EFAULT;
                rtnl_lock();
                if (optname==MRT_DEL_MFC)
                        ret = ipmr_mfc_delete(&mfc);
                else
                        ret = ipmr_mfc_add(&mfc, sk==mroute_socket);
                rtnl_unlock();
                return ret;
                /*
                 *      Control PIM assert.
                 */
        case MRT_ASSERT:
        {
                int v;
                if (get_user(v,(int __user *)optval))
                        return -EFAULT;
                mroute_do_assert=(v)?1:0;
                return 0;
        }
#ifdef CONFIG_IP_PIMSM
        case MRT_PIM:
        {
                int v, ret;
                if (get_user(v,(int __user *)optval))
                        return -EFAULT;
                v = (v)?1:0;
                rtnl_lock();
                ret = 0;
                if (v != mroute_do_pim) {
                        mroute_do_pim = v;
                        mroute_do_assert = v;
#ifdef CONFIG_IP_PIMSM_V2
                        if (mroute_do_pim)
                                ret = inet_add_protocol(&pim_protocol,
                                                        IPPROTO_PIM);
                        else
                                ret = inet_del_protocol(&pim_protocol,
                                                        IPPROTO_PIM);
                        if (ret < 0)
                                ret = -EAGAIN;
#endif
                }
                rtnl_unlock();
                return ret;
        }
#endif
        /*
         *      Spurious command, or MRT_VERSION which you cannot
         *      set.
         */
        default:
                return -ENOPROTOOPT;
        }
}

/*
 *      Getsockopt support for the multicast routing system.
 */

int ip_mroute_getsockopt(struct sock *sk,int optname,char __user *optval,int __user *optlen)
{
        int olr;
        int val;

        if (optname!=MRT_VERSION &&
#ifdef CONFIG_IP_PIMSM
           optname!=MRT_PIM &&
#endif
           optname!=MRT_ASSERT)
                return -ENOPROTOOPT;

        if (get_user(olr, optlen))
                return -EFAULT;

        /* Reject negative lengths before the unsigned clamp below. */
        if (olr < 0)
                return -EINVAL;
        olr = min_t(unsigned int, olr, sizeof(int));

        if (put_user(olr,optlen))
                return -EFAULT;
        if (optname==MRT_VERSION)
                val=0x0305;
#ifdef CONFIG_IP_PIMSM
        else if (optname==MRT_PIM)
                val=mroute_do_pim;
#endif
        else
                val=mroute_do_assert;
        if (copy_to_user(optval,&val,olr))
                return -EFAULT;
        return 0;
}

/*
 *      The IP multicast ioctl support routines.
 */

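/* A minimal sketch of the userspace side (illustrative only):
 * fetching per-vif counters through the mroute socket:
 *
 *      struct sioc_vif_req vr = { .vifi = 0 };
 *      if (ioctl(fd, SIOCGETVIFCNT, &vr) == 0)
 *              printf("%lu pkts in, %lu pkts out\n", vr.icount, vr.ocount);
 */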
int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
{
        struct sioc_sg_req sr;
        struct sioc_vif_req vr;
        struct vif_device *vif;
        struct mfc_cache *c;

        switch (cmd) {
        case SIOCGETVIFCNT:
                if (copy_from_user(&vr,arg,sizeof(vr)))
                        return -EFAULT;
                if (vr.vifi>=maxvif)
                        return -EINVAL;
                read_lock(&mrt_lock);
                vif=&vif_table[vr.vifi];
                if (VIF_EXISTS(vr.vifi)) {
                        vr.icount=vif->pkt_in;
                        vr.ocount=vif->pkt_out;
                        vr.ibytes=vif->bytes_in;
                        vr.obytes=vif->bytes_out;
                        read_unlock(&mrt_lock);

                        if (copy_to_user(arg,&vr,sizeof(vr)))
                                return -EFAULT;
                        return 0;
                }
                read_unlock(&mrt_lock);
                return -EADDRNOTAVAIL;
        case SIOCGETSGCNT:
                if (copy_from_user(&sr,arg,sizeof(sr)))
                        return -EFAULT;

                read_lock(&mrt_lock);
                c = ipmr_cache_find(sr.src.s_addr, sr.grp.s_addr);
                if (c) {
                        sr.pktcnt = c->mfc_un.res.pkt;
                        sr.bytecnt = c->mfc_un.res.bytes;
                        sr.wrong_if = c->mfc_un.res.wrong_if;
                        read_unlock(&mrt_lock);

                        if (copy_to_user(arg,&sr,sizeof(sr)))
                                return -EFAULT;
                        return 0;
                }
                read_unlock(&mrt_lock);
                return -EADDRNOTAVAIL;
        default:
                return -ENOIOCTLCMD;
        }
}


static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr)
{
        struct net_device *dev = ptr;
        struct vif_device *v;
        int ct;

        if (dev->nd_net != &init_net)
                return NOTIFY_DONE;

        if (event != NETDEV_UNREGISTER)
                return NOTIFY_DONE;
        v=&vif_table[0];
        for (ct=0;ct<maxvif;ct++,v++) {
                if (v->dev==dev)
                        vif_delete(ct);
        }
        return NOTIFY_DONE;
}


static struct notifier_block ip_mr_notifier={
        .notifier_call = ipmr_device_event,
};

/*
 *      Encapsulate a packet by attaching a valid IPIP header to it.
 *      This avoids tunnel drivers and other mess and gives us the speed so
 *      important for multicast video.
 */

static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
        struct iphdr *iph;
        struct iphdr *old_iph = ip_hdr(skb);

        skb_push(skb, sizeof(struct iphdr));
        skb->transport_header = skb->network_header;
        skb_reset_network_header(skb);
        iph = ip_hdr(skb);

        iph->version    =       4;
        iph->tos        =       old_iph->tos;
        iph->ttl        =       old_iph->ttl;
        iph->frag_off   =       0;
        iph->daddr      =       daddr;
        iph->saddr      =       saddr;
        iph->protocol   =       IPPROTO_IPIP;
        iph->ihl        =       5;
        iph->tot_len    =       htons(skb->len);
        ip_select_ident(iph, skb->dst, NULL);
        ip_send_check(iph);

        memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
        nf_reset(skb);
}

static inline int ipmr_forward_finish(struct sk_buff *skb)
{
        struct ip_options * opt = &(IPCB(skb)->opt);

        IP_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS);

        if (unlikely(opt->optlen))
                ip_forward_options(skb);

        return dst_output(skb);
}

/*
 *      Processing handlers for ipmr_forward
 */

static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi)
{
        const struct iphdr *iph = ip_hdr(skb);
        struct vif_device *vif = &vif_table[vifi];
        struct net_device *dev;
        struct rtable *rt;
        int    encap = 0;

        if (vif->dev == NULL)
                goto out_free;

#ifdef CONFIG_IP_PIMSM
        if (vif->flags & VIFF_REGISTER) {
                vif->pkt_out++;
                vif->bytes_out+=skb->len;
                ((struct net_device_stats*)netdev_priv(vif->dev))->tx_bytes += skb->len;
                ((struct net_device_stats*)netdev_priv(vif->dev))->tx_packets++;
                ipmr_cache_report(skb, vifi, IGMPMSG_WHOLEPKT);
                kfree_skb(skb);
                return;
        }
#endif

        if (vif->flags&VIFF_TUNNEL) {
                struct flowi fl = { .oif = vif->link,
                                    .nl_u = { .ip4_u =
                                              { .daddr = vif->remote,
                                                .saddr = vif->local,
                                                .tos = RT_TOS(iph->tos) } },
                                    .proto = IPPROTO_IPIP };
                if (ip_route_output_key(&rt, &fl))
                        goto out_free;
                encap = sizeof(struct iphdr);
        } else {
                struct flowi fl = { .oif = vif->link,
                                    .nl_u = { .ip4_u =
                                              { .daddr = iph->daddr,
                                                .tos = RT_TOS(iph->tos) } },
                                    .proto = IPPROTO_IPIP };
                if (ip_route_output_key(&rt, &fl))
                        goto out_free;
        }

        dev = rt->u.dst.dev;

        if (skb->len+encap > dst_mtu(&rt->u.dst) && (ntohs(iph->frag_off) & IP_DF)) {
                /* Do not fragment multicasts. Alas, IPv4 does not
                   allow us to send ICMP here, so such packets simply
                   disappear into a black hole.
                 */

                IP_INC_STATS_BH(IPSTATS_MIB_FRAGFAILS);
                ip_rt_put(rt);
                goto out_free;
        }

        encap += LL_RESERVED_SPACE(dev) + rt->u.dst.header_len;

        if (skb_cow(skb, encap)) {
                ip_rt_put(rt);
                goto out_free;
        }

        vif->pkt_out++;
        vif->bytes_out+=skb->len;

        dst_release(skb->dst);
        skb->dst = &rt->u.dst;
        ip_decrease_ttl(ip_hdr(skb));

        /* FIXME: forward and output firewalls used to be called here.
         * What do we do with netfilter? -- RR */
        if (vif->flags & VIFF_TUNNEL) {
                ip_encap(skb, vif->local, vif->remote);
                /* FIXME: extra output firewall step used to be here. --RR */
                ((struct ip_tunnel *)netdev_priv(vif->dev))->stat.tx_packets++;
                ((struct ip_tunnel *)netdev_priv(vif->dev))->stat.tx_bytes+=skb->len;
        }

        IPCB(skb)->flags |= IPSKB_FORWARDED;

        /*
         * RFC 1584 teaches that a DVMRP/PIM router must deliver packets
         * locally not only before forwarding, but also after forwarding
         * on all output interfaces. Clearly, if an mrouter runs a
         * multicast program, that program should receive packets
         * regardless of the interface it joined on. Otherwise the
         * program would have to join on all interfaces. On the other
         * hand, a multihomed host (or a router, but not an mrouter)
         * cannot join on more than one interface; doing so would result
         * in receiving duplicate packets.
         */
        NF_HOOK(PF_INET, NF_IP_FORWARD, skb, skb->dev, dev,
                ipmr_forward_finish);
        return;

out_free:
        kfree_skb(skb);
        return;
}

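/* Map a device back to its vif index, or -1 if it carries no vif.
 * Called under mrt_lock.
 */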
static int ipmr_find_vif(struct net_device *dev)
{
        int ct;
        for (ct=maxvif-1; ct>=0; ct--) {
                if (vif_table[ct].dev == dev)
                        break;
        }
        return ct;
}

/* "local" means that we should preserve one skb (for local delivery) */

static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local)
{
        int psend = -1;
        int vif, ct;

        vif = cache->mfc_parent;
        cache->mfc_un.res.pkt++;
        cache->mfc_un.res.bytes += skb->len;

        /*
         * Wrong interface: drop packet and (maybe) send PIM assert.
         */
        if (vif_table[vif].dev != skb->dev) {
                int true_vifi;

                if (((struct rtable*)skb->dst)->fl.iif == 0) {
                        /* It is our own packet, looped back.
                           Very complicated situation...

                           The best workaround until the routing daemons
                           are fixed is not to redistribute a packet that
                           was sent through the wrong interface. It means
                           that multicast applications WILL NOT work for
                           (S,G) entries whose default multicast route
                           points to the wrong oif. In any case, it is
                           not a good idea to run multicast applications
                           on a router.
                         */
                        goto dont_forward;
                }

                cache->mfc_un.res.wrong_if++;
                true_vifi = ipmr_find_vif(skb->dev);

                if (true_vifi >= 0 && mroute_do_assert &&
                    /* pimsm uses asserts when switching from RPT to SPT,
                       so we cannot check that the packet arrived on an
                       oif. That is bad, but the alternative would be to
                       move a pretty large chunk of pimd into the
                       kernel. Ough... --ANK
                     */
                    (mroute_do_pim || cache->mfc_un.res.ttls[true_vifi] < 255) &&
                    time_after(jiffies,
                               cache->mfc_un.res.last_assert + MFC_ASSERT_THRESH)) {
                        cache->mfc_un.res.last_assert = jiffies;
                        ipmr_cache_report(skb, true_vifi, IGMPMSG_WRONGVIF);
                }
                goto dont_forward;
        }

        vif_table[vif].pkt_in++;
        vif_table[vif].bytes_in+=skb->len;

        /*
         *      Forward the frame
         */
        for (ct = cache->mfc_un.res.maxvif-1; ct >= cache->mfc_un.res.minvif; ct--) {
                if (ip_hdr(skb)->ttl > cache->mfc_un.res.ttls[ct]) {
                        if (psend != -1) {
                                struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
                                if (skb2)
                                        ipmr_queue_xmit(skb2, cache, psend);
                        }
                        psend=ct;
                }
        }
        if (psend != -1) {
                if (local) {
                        struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
                        if (skb2)
                                ipmr_queue_xmit(skb2, cache, psend);
                } else {
                        ipmr_queue_xmit(skb, cache, psend);
                        return 0;
                }
        }

dont_forward:
        if (!local)
                kfree_skb(skb);
        return 0;
}


/*
 *      Multicast packets for forwarding arrive here
 */

int ip_mr_input(struct sk_buff *skb)
{
        struct mfc_cache *cache;
        int local = ((struct rtable*)skb->dst)->rt_flags&RTCF_LOCAL;

        /* Packet is looped back after forwarding; it must not be
           forwarded a second time, but it can still be delivered
           locally.
         */
        if (IPCB(skb)->flags&IPSKB_FORWARDED)
                goto dont_forward;

        if (!local) {
                if (IPCB(skb)->opt.router_alert) {
                        if (ip_call_ra_chain(skb))
                                return 0;
                } else if (ip_hdr(skb)->protocol == IPPROTO_IGMP) {
                        /* IGMPv1 (and broken IGMPv2 implementations, such
                           as Cisco IOS <= 11.2(8)) do not put the router
                           alert option into IGMP packets destined to
                           routable groups. This is very bad, because it
                           means that we can forward NO IGMP messages.
                         */
                        read_lock(&mrt_lock);
                        if (mroute_socket) {
                                nf_reset(skb);
                                raw_rcv(mroute_socket, skb);
                                read_unlock(&mrt_lock);
                                return 0;
                        }
                        read_unlock(&mrt_lock);
                }
        }

        read_lock(&mrt_lock);
        cache = ipmr_cache_find(ip_hdr(skb)->saddr, ip_hdr(skb)->daddr);

        /*
         *      No usable cache entry
         */
        if (cache==NULL) {
                int vif;

                if (local) {
                        struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
                        ip_local_deliver(skb);
                        if (skb2 == NULL) {
                                read_unlock(&mrt_lock);
                                return -ENOBUFS;
                        }
                        skb = skb2;
                }

                vif = ipmr_find_vif(skb->dev);
                if (vif >= 0) {
                        int err = ipmr_cache_unresolved(vif, skb);
                        read_unlock(&mrt_lock);

                        return err;
                }
                read_unlock(&mrt_lock);
                kfree_skb(skb);
                return -ENODEV;
        }

        ip_mr_forward(skb, cache, local);

        read_unlock(&mrt_lock);

        if (local)
                return ip_local_deliver(skb);

        return 0;

dont_forward:
        if (local)
                return ip_local_deliver(skb);
        kfree_skb(skb);
        return 0;
}

#ifdef CONFIG_IP_PIMSM_V1
/*
 * Handle IGMP messages of PIMv1
 */

int pim_rcv_v1(struct sk_buff * skb)
{
        struct igmphdr *pim;
        struct iphdr   *encap;
        struct net_device  *reg_dev = NULL;

        if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap)))
                goto drop;

        pim = igmp_hdr(skb);

        if (!mroute_do_pim ||
            skb->len < sizeof(*pim) + sizeof(*encap) ||
            pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER)
                goto drop;

        encap = (struct iphdr *)(skb_transport_header(skb) +
                                 sizeof(struct igmphdr));
        /*
           Check that:
           a. packet is really destined to a multicast group
           b. packet is not a NULL-REGISTER
           c. packet is not truncated
         */
        if (!MULTICAST(encap->daddr) ||
            encap->tot_len == 0 ||
            ntohs(encap->tot_len) + sizeof(*pim) > skb->len)
                goto drop;

        read_lock(&mrt_lock);
        if (reg_vif_num >= 0)
                reg_dev = vif_table[reg_vif_num].dev;
        if (reg_dev)
                dev_hold(reg_dev);
        read_unlock(&mrt_lock);

        if (reg_dev == NULL)
                goto drop;

        skb->mac_header = skb->network_header;
        skb_pull(skb, (u8*)encap - skb->data);
        skb_reset_network_header(skb);
        skb->dev = reg_dev;
        skb->protocol = htons(ETH_P_IP);
        skb->ip_summed = 0;
        skb->pkt_type = PACKET_HOST;
        dst_release(skb->dst);
        skb->dst = NULL;
        ((struct net_device_stats*)netdev_priv(reg_dev))->rx_bytes += skb->len;
        ((struct net_device_stats*)netdev_priv(reg_dev))->rx_packets++;
        nf_reset(skb);
        netif_rx(skb);
        dev_put(reg_dev);
        return 0;
 drop:
        kfree_skb(skb);
        return 0;
}
#endif

#ifdef CONFIG_IP_PIMSM_V2
static int pim_rcv(struct sk_buff * skb)
{
        struct pimreghdr *pim;
        struct iphdr   *encap;
        struct net_device  *reg_dev = NULL;

        if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap)))
                goto drop;

        pim = (struct pimreghdr *)skb_transport_header(skb);
        if (pim->type != ((PIM_VERSION<<4)|(PIM_REGISTER)) ||
            (pim->flags&PIM_NULL_REGISTER) ||
            (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 &&
             csum_fold(skb_checksum(skb, 0, skb->len, 0))))
                goto drop;

        /* check if the inner packet is destined to mcast group */
        encap = (struct iphdr *)(skb_transport_header(skb) +
                                 sizeof(struct pimreghdr));
        if (!MULTICAST(encap->daddr) ||
            encap->tot_len == 0 ||
            ntohs(encap->tot_len) + sizeof(*pim) > skb->len)
                goto drop;

        read_lock(&mrt_lock);
        if (reg_vif_num >= 0)
                reg_dev = vif_table[reg_vif_num].dev;
        if (reg_dev)
                dev_hold(reg_dev);
        read_unlock(&mrt_lock);

        if (reg_dev == NULL)
                goto drop;

        skb->mac_header = skb->network_header;
        skb_pull(skb, (u8*)encap - skb->data);
        skb_reset_network_header(skb);
        skb->dev = reg_dev;
        skb->protocol = htons(ETH_P_IP);
        skb->ip_summed = 0;
        skb->pkt_type = PACKET_HOST;
        dst_release(skb->dst);
        ((struct net_device_stats*)netdev_priv(reg_dev))->rx_bytes += skb->len;
        ((struct net_device_stats*)netdev_priv(reg_dev))->rx_packets++;
        skb->dst = NULL;
        nf_reset(skb);
        netif_rx(skb);
        dev_put(reg_dev);
        return 0;
 drop:
        kfree_skb(skb);
        return 0;
}
#endif

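/* Dump one resolved cache entry into an rtnetlink message: the parent
 * vif becomes RTA_IIF and the output vifs become an RTA_MULTIPATH
 * nexthop list, with each vif's TTL threshold carried in rtnh_hops.
 */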
static int
ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm)
{
        int ct;
        struct rtnexthop *nhp;
        struct net_device *dev = vif_table[c->mfc_parent].dev;
        u8 *b = skb_tail_pointer(skb);
        struct rtattr *mp_head;

        if (dev)
                RTA_PUT(skb, RTA_IIF, 4, &dev->ifindex);

        mp_head = (struct rtattr*)skb_put(skb, RTA_LENGTH(0));

        for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
                if (c->mfc_un.res.ttls[ct] < 255) {
                        if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4))
                                goto rtattr_failure;
                        nhp = (struct rtnexthop*)skb_put(skb, RTA_ALIGN(sizeof(*nhp)));
                        nhp->rtnh_flags = 0;
                        nhp->rtnh_hops = c->mfc_un.res.ttls[ct];
                        nhp->rtnh_ifindex = vif_table[ct].dev->ifindex;
                        nhp->rtnh_len = sizeof(*nhp);
                }
        }
        mp_head->rta_type = RTA_MULTIPATH;
        mp_head->rta_len = skb_tail_pointer(skb) - (u8 *)mp_head;
        rtm->rtm_type = RTN_MULTICAST;
        return 1;

rtattr_failure:
        nlmsg_trim(skb, b);
        return -EMSGSIZE;
}

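/* Answer an rtnetlink route query from the multicast cache. If the
 * entry is unknown and @nowait is unset, queue a dummy packet (marked
 * by iph->version == 0) so the reply is sent once mrouted resolves it.
 */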
1591int ipmr_get_route(struct sk_buff *skb, struct rtmsg *rtm, int nowait)
1592{
1593        int err;
1594        struct mfc_cache *cache;
1595        struct rtable *rt = (struct rtable*)skb->dst;
1596
1597        read_lock(&mrt_lock);
1598        cache = ipmr_cache_find(rt->rt_src, rt->rt_dst);
1599
1600        if (cache == NULL) {
1601                struct sk_buff *skb2;
1602                struct iphdr *iph;
1603                struct net_device *dev;
1604                int vif;
1605
1606                if (nowait) {
1607                        read_unlock(&mrt_lock);
1608                        return -EAGAIN;
1609                }
1610
1611                dev = skb->dev;
1612                if (dev == NULL || (vif = ipmr_find_vif(dev)) < 0) {
1613                        read_unlock(&mrt_lock);
1614                        return -ENODEV;
1615                }
1616                skb2 = skb_clone(skb, GFP_ATOMIC);
1617                if (!skb2) {
1618                        read_unlock(&mrt_lock);
1619                        return -ENOMEM;
1620                }
1621
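                /*
                 * Stamp a skeleton IP header on the clone; version 0
                 * cannot occur in a real packet, so it marks this skb
                 * as a pending netlink request for the resolver.
                 */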
1622                skb_push(skb2, sizeof(struct iphdr));
1623                skb_reset_network_header(skb2);
1624                iph = ip_hdr(skb2);
1625                iph->ihl = sizeof(struct iphdr) >> 2;
1626                iph->saddr = rt->rt_src;
1627                iph->daddr = rt->rt_dst;
1628                iph->version = 0;
1629                err = ipmr_cache_unresolved(vif, skb2);
1630                read_unlock(&mrt_lock);
1631                return err;
1632        }
1633
1634        if (!nowait && (rtm->rtm_flags&RTM_F_NOTIFY))
1635                cache->mfc_flags |= MFC_NOTIFY;
1636        err = ipmr_fill_mroute(skb, cache, rtm);
1637        read_unlock(&mrt_lock);
1638        return err;
1639}
1640
1641#ifdef CONFIG_PROC_FS
1642/*
1643 *      The /proc interfaces to multicast routing: /proc/net/ip_mr_cache and /proc/net/ip_mr_vif
1644 */
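/*
 *      Illustrative output (all values are made up; addresses print as
 *      raw hex words, so their byte order depends on the host):
 *
 *      # cat /proc/net/ip_mr_vif
 *      Interface      BytesIn  PktsIn  BytesOut PktsOut Flags Local    Remote
 *       0 eth0          123456     789     65432     210 00000 0101A8C0 00000000
 *       1 pimreg             0       0         0       0 00004 00000000 00000000
 *
 *      # cat /proc/net/ip_mr_cache
 *      Group    Origin   Iif     Pkts    Bytes    Wrong Oifs
 *      010101E0 0101A8C0 0         42    12345        0  1:1
 */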
1645struct ipmr_vif_iter {
1646        int ct;
1647};
1648
1649static struct vif_device *ipmr_vif_seq_idx(struct ipmr_vif_iter *iter,
1650                                           loff_t pos)
1651{
1652        for (iter->ct = 0; iter->ct < maxvif; ++iter->ct) {
1653                if (!VIF_EXISTS(iter->ct))
1654                        continue;
1655                if (pos-- == 0)
1656                        return &vif_table[iter->ct];
1657        }
1658        return NULL;
1659}
1660
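/*
 *      mrt_lock is taken in ->start and released in ->stop, so the vif
 *      table cannot change under a reader walking /proc/net/ip_mr_vif.
 */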
1661static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos)
1662{
1663        read_lock(&mrt_lock);
1664        return *pos ? ipmr_vif_seq_idx(seq->private, *pos - 1)
1665                : SEQ_START_TOKEN;
1666}
1667
1668static void *ipmr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1669{
1670        struct ipmr_vif_iter *iter = seq->private;
1671
1672        ++*pos;
1673        if (v == SEQ_START_TOKEN)
1674                return ipmr_vif_seq_idx(iter, 0);
1675
1676        while (++iter->ct < maxvif) {
1677                if (!VIF_EXISTS(iter->ct))
1678                        continue;
1679                return &vif_table[iter->ct];
1680        }
1681        return NULL;
1682}
1683
1684static void ipmr_vif_seq_stop(struct seq_file *seq, void *v)
1685{
1686        read_unlock(&mrt_lock);
1687}
1688
1689static int ipmr_vif_seq_show(struct seq_file *seq, void *v)
1690{
1691        if (v == SEQ_START_TOKEN) {
1692                seq_puts(seq,
1693                         "Interface      BytesIn  PktsIn  BytesOut PktsOut Flags Local    Remote\n");
1694        } else {
1695                const struct vif_device *vif = v;
1696                const char *name = vif->dev ? vif->dev->name : "none";
1697
1698                seq_printf(seq,
1699                           "%2td %-10s %8ld %7ld  %8ld %7ld %05X %08X %08X\n",
1700                           vif - vif_table,
1701                           name, vif->bytes_in, vif->pkt_in,
1702                           vif->bytes_out, vif->pkt_out,
1703                           vif->flags, vif->local, vif->remote);
1704        }
1705        return 0;
1706}
1707
1708static const struct seq_operations ipmr_vif_seq_ops = {
1709        .start = ipmr_vif_seq_start,
1710        .next  = ipmr_vif_seq_next,
1711        .stop  = ipmr_vif_seq_stop,
1712        .show  = ipmr_vif_seq_show,
1713};
1714
1715static int ipmr_vif_open(struct inode *inode, struct file *file)
1716{
1717        return seq_open_private(file, &ipmr_vif_seq_ops,
1718                        sizeof(struct ipmr_vif_iter));
1719}
1720
1721static const struct file_operations ipmr_vif_fops = {
1722        .owner   = THIS_MODULE,
1723        .open    = ipmr_vif_open,
1724        .read    = seq_read,
1725        .llseek  = seq_lseek,
1726        .release = seq_release_private,
1727};
1728
1729struct ipmr_mfc_iter {
1730        struct mfc_cache **cache;
1731        int ct;
1732};
1733
1734
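/*
 *      The mfc seq_file walks two lists under two different locks: the
 *      hash table under mrt_lock, then the unresolved queue under
 *      mfc_unres_lock.  it->cache records which list (and therefore
 *      which lock) the iterator currently holds, so ->next can hand
 *      over from one lock to the other and ->stop releases the right
 *      one.
 */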
1735static struct mfc_cache *ipmr_mfc_seq_idx(struct ipmr_mfc_iter *it, loff_t pos)
1736{
1737        struct mfc_cache *mfc;
1738
1739        it->cache = mfc_cache_array;
1740        read_lock(&mrt_lock);
1741        for (it->ct = 0; it->ct < MFC_LINES; it->ct++)
1742                for (mfc = mfc_cache_array[it->ct]; mfc; mfc = mfc->next)
1743                        if (pos-- == 0)
1744                                return mfc;
1745        read_unlock(&mrt_lock);
1746
1747        it->cache = &mfc_unres_queue;
1748        spin_lock_bh(&mfc_unres_lock);
1749        for (mfc = mfc_unres_queue; mfc; mfc = mfc->next)
1750                if (pos-- == 0)
1751                        return mfc;
1752        spin_unlock_bh(&mfc_unres_lock);
1753
1754        it->cache = NULL;
1755        return NULL;
1756}
1757
1758
1759static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos)
1760{
1761        struct ipmr_mfc_iter *it = seq->private;
1762        it->cache = NULL;
1763        it->ct = 0;
1764        return *pos ? ipmr_mfc_seq_idx(seq->private, *pos - 1)
1765                : SEQ_START_TOKEN;
1766}
1767
1768static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1769{
1770        struct mfc_cache *mfc = v;
1771        struct ipmr_mfc_iter *it = seq->private;
1772
1773        ++*pos;
1774
1775        if (v == SEQ_START_TOKEN)
1776                return ipmr_mfc_seq_idx(seq->private, 0);
1777
1778        if (mfc->next)
1779                return mfc->next;
1780
1781        if (it->cache == &mfc_unres_queue)
1782                goto end_of_list;
1783
1784        BUG_ON(it->cache != mfc_cache_array);
1785
1786        while (++it->ct < MFC_LINES) {
1787                mfc = mfc_cache_array[it->ct];
1788                if (mfc)
1789                        return mfc;
1790        }
1791
1792        /* exhausted cache_array, show unresolved */
1793        read_unlock(&mrt_lock);
1794        it->cache = &mfc_unres_queue;
1795        it->ct = 0;
1796
1797        spin_lock_bh(&mfc_unres_lock);
1798        mfc = mfc_unres_queue;
1799        if (mfc)
1800                return mfc;
1801
1802 end_of_list:
1803        spin_unlock_bh(&mfc_unres_lock);
1804        it->cache = NULL;
1805
1806        return NULL;
1807}
1808
1809static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v)
1810{
1811        struct ipmr_mfc_iter *it = seq->private;
1812
1813        if (it->cache == &mfc_unres_queue)
1814                spin_unlock_bh(&mfc_unres_lock);
1815        else if (it->cache == mfc_cache_array)
1816                read_unlock(&mrt_lock);
1817}
1818
1819static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
1820{
1821        int n;
1822
1823        if (v == SEQ_START_TOKEN) {
1824                seq_puts(seq,
1825                 "Group    Origin   Iif     Pkts    Bytes    Wrong Oifs\n");
1826        } else {
1827                const struct mfc_cache *mfc = v;
1828                const struct ipmr_mfc_iter *it = seq->private;
1829
1830                seq_printf(seq, "%08lX %08lX %-3d %8ld %8ld %8ld",
1831                           (unsigned long) mfc->mfc_mcastgrp,
1832                           (unsigned long) mfc->mfc_origin,
1833                           mfc->mfc_parent,
1834                           mfc->mfc_un.res.pkt,
1835                           mfc->mfc_un.res.bytes,
1836                           mfc->mfc_un.res.wrong_if);
1837
1838                if (it->cache != &mfc_unres_queue) {
1839                        for (n = mfc->mfc_un.res.minvif;
1840                             n < mfc->mfc_un.res.maxvif; n++) {
1841                                if (VIF_EXISTS(n) &&
1842                                    mfc->mfc_un.res.ttls[n] < 255)
1843                                        seq_printf(seq,
1844                                                   " %2d:%-3d",
1845                                                   n, mfc->mfc_un.res.ttls[n]);
1846                        }
1847                }
1848                seq_putc(seq, '\n');
1849        }
1850        return 0;
1851}
1852
1853static const struct seq_operations ipmr_mfc_seq_ops = {
1854        .start = ipmr_mfc_seq_start,
1855        .next  = ipmr_mfc_seq_next,
1856        .stop  = ipmr_mfc_seq_stop,
1857        .show  = ipmr_mfc_seq_show,
1858};
1859
1860static int ipmr_mfc_open(struct inode *inode, struct file *file)
1861{
1862        return seq_open_private(file, &ipmr_mfc_seq_ops,
1863                        sizeof(struct ipmr_mfc_iter));
1864}
1865
1866static const struct file_operations ipmr_mfc_fops = {
1867        .owner   = THIS_MODULE,
1868        .open    = ipmr_mfc_open,
1869        .read    = seq_read,
1870        .llseek  = seq_lseek,
1871        .release = seq_release_private,
1872};
1873#endif
1874
1875#ifdef CONFIG_IP_PIMSM_V2
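/*
 *      IPPROTO_PIM receive handler.  Not registered here: it is added
 *      with inet_add_protocol() once the daemon enables PIM routing
 *      through the MRT_PIM setsockopt.
 */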
1876static struct net_protocol pim_protocol = {
1877        .handler        =       pim_rcv,
1878};
1879#endif
1880
1881
1882/*
1883 *      Setup for IP multicast routing
1884 */
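/*
 *      The slab cache is created with SLAB_PANIC, so initialisation
 *      cannot fail.  The expiry timer is prepared here but only armed
 *      when the first unresolved entry is queued.
 */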
1885
1886void __init ip_mr_init(void)
1887{
1888        mrt_cachep = kmem_cache_create("ip_mrt_cache",
1889                                       sizeof(struct mfc_cache),
1890                                       0, SLAB_HWCACHE_ALIGN|SLAB_PANIC,
1891                                       NULL);
1892        init_timer(&ipmr_expire_timer);
1893        ipmr_expire_timer.function = ipmr_expire_process;
1894        register_netdevice_notifier(&ip_mr_notifier);
1895#ifdef CONFIG_PROC_FS
1896        proc_net_fops_create(&init_net, "ip_mr_vif", 0, &ipmr_vif_fops);
1897        proc_net_fops_create(&init_net, "ip_mr_cache", 0, &ipmr_mfc_fops);
1898#endif
1899}
1900