linux/net/ipv4/ipmr.c
/*
 *      IP multicast routing support for mrouted 3.6/3.8
 *
 *              (c) 1995 Alan Cox, <alan@lxorguk.ukuu.org.uk>
 *        Linux Consultancy and Custom Driver Development
 *
 *      This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 *
 *      Fixes:
 *      Michael Chastain        :       Incorrect size of copying.
 *      Alan Cox                :       Added the cache manager code
 *      Alan Cox                :       Fixed the clone/copy bug and device race.
 *      Mike McLagan            :       Routing by source
 *      Malcolm Beattie         :       Buffer handling fixes.
 *      Alexey Kuznetsov        :       Double buffer free and other fixes.
 *      SVR Anand               :       Fixed several multicast bugs and problems.
 *      Alexey Kuznetsov        :       Status, optimisations and more.
 *      Brad Parker             :       Better behaviour on mrouted upcall
 *                                      overflow.
 *      Carlos Picoto           :       PIMv1 Support
 *      Pavlin Ivanov Radoslavov:       PIMv2 Registers must checksum only PIM header
 *                                      Relax this requirement to work with older peers.
 *
 */

#include <linux/uaccess.h>
#include <linux/types.h>
#include <linux/cache.h>
#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/kernel.h>
#include <linux/fcntl.h>
#include <linux/stat.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/mroute.h>
#include <linux/init.h>
#include <linux/if_ether.h>
#include <linux/slab.h>
#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/route.h>
#include <net/icmp.h>
#include <net/udp.h>
#include <net/raw.h>
#include <linux/notifier.h>
#include <linux/if_arp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/compat.h>
#include <linux/export.h>
#include <net/ip_tunnels.h>
#include <net/checksum.h>
#include <net/netlink.h>
#include <net/fib_rules.h>
#include <linux/netconf.h>
#include <net/nexthop.h>
#include <net/switchdev.h>

#include <linux/nospec.h>

struct ipmr_rule {
        struct fib_rule         common;
};

struct ipmr_result {
        struct mr_table         *mrt;
};

/* Big lock, protecting vif table, mrt cache and mroute socket state.
 * Note that the changes are semaphored via rtnl_lock.
 */

static DEFINE_RWLOCK(mrt_lock);

/* Multicast router control variables */

/* Special spinlock for queue of unresolved entries */
static DEFINE_SPINLOCK(mfc_unres_lock);

/* We return to Alan's original scheme. The hash table of resolved
 * entries is changed only in process context and is protected
 * with the weak lock mrt_lock. The queue of unresolved entries is
 * protected with the strong spinlock mfc_unres_lock.
 *
 * This way the data path is entirely free of exclusive locks.
 */
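
/* Illustrative sketch of the resulting locking discipline (editorial
 * addition, not in the original source): a data-path reader takes only
 * RCU for resolved entries and the read side of mrt_lock for the vif
 * table, e.g.
 *
 *	rcu_read_lock();
 *	cache = ipmr_cache_find(mrt, saddr, daddr);
 *	rcu_read_unlock();
 *
 *	read_lock(&mrt_lock);
 *	dev = mrt->vif_table[vifi].dev;
 *	read_unlock(&mrt_lock);
 *
 * Writers run in process context under rtnl_lock and take the write
 * side of mrt_lock only for the short sections that publish new state.
 */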

static struct kmem_cache *mrt_cachep __ro_after_init;

static struct mr_table *ipmr_new_table(struct net *net, u32 id);
static void ipmr_free_table(struct mr_table *mrt);

static void ip_mr_forward(struct net *net, struct mr_table *mrt,
                          struct net_device *dev, struct sk_buff *skb,
                          struct mfc_cache *cache, int local);
static int ipmr_cache_report(struct mr_table *mrt,
                             struct sk_buff *pkt, vifi_t vifi, int assert);
static void mroute_netlink_event(struct mr_table *mrt, struct mfc_cache *mfc,
                                 int cmd);
static void igmpmsg_netlink_event(struct mr_table *mrt, struct sk_buff *pkt);
static void mroute_clean_tables(struct mr_table *mrt, bool all);
static void ipmr_expire_process(struct timer_list *t);

#ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES
#define ipmr_for_each_table(mrt, net) \
        list_for_each_entry_rcu(mrt, &net->ipv4.mr_tables, list)

static struct mr_table *ipmr_mr_table_iter(struct net *net,
                                           struct mr_table *mrt)
{
        struct mr_table *ret;

        if (!mrt)
                ret = list_entry_rcu(net->ipv4.mr_tables.next,
                                     struct mr_table, list);
        else
                ret = list_entry_rcu(mrt->list.next,
                                     struct mr_table, list);

        if (&ret->list == &net->ipv4.mr_tables)
                return NULL;
        return ret;
}

static struct mr_table *ipmr_get_table(struct net *net, u32 id)
{
        struct mr_table *mrt;

        ipmr_for_each_table(mrt, net) {
                if (mrt->id == id)
                        return mrt;
        }
        return NULL;
}

static int ipmr_fib_lookup(struct net *net, struct flowi4 *flp4,
                           struct mr_table **mrt)
{
        int err;
        struct ipmr_result res;
        struct fib_lookup_arg arg = {
                .result = &res,
                .flags = FIB_LOOKUP_NOREF,
        };

        /* update flow if oif or iif point to device enslaved to l3mdev */
        l3mdev_update_flow(net, flowi4_to_flowi(flp4));

        err = fib_rules_lookup(net->ipv4.mr_rules_ops,
                               flowi4_to_flowi(flp4), 0, &arg);
        if (err < 0)
                return err;
        *mrt = res.mrt;
        return 0;
}

static int ipmr_rule_action(struct fib_rule *rule, struct flowi *flp,
                            int flags, struct fib_lookup_arg *arg)
{
        struct ipmr_result *res = arg->result;
        struct mr_table *mrt;

        switch (rule->action) {
        case FR_ACT_TO_TBL:
                break;
        case FR_ACT_UNREACHABLE:
                return -ENETUNREACH;
        case FR_ACT_PROHIBIT:
                return -EACCES;
        case FR_ACT_BLACKHOLE:
        default:
                return -EINVAL;
        }

        arg->table = fib_rule_get_table(rule, arg);

        mrt = ipmr_get_table(rule->fr_net, arg->table);
        if (!mrt)
                return -EAGAIN;
        res->mrt = mrt;
        return 0;
}

static int ipmr_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)
{
        return 1;
}

static const struct nla_policy ipmr_rule_policy[FRA_MAX + 1] = {
        FRA_GENERIC_POLICY,
};

static int ipmr_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
                               struct fib_rule_hdr *frh, struct nlattr **tb,
                               struct netlink_ext_ack *extack)
{
        return 0;
}

static int ipmr_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
                             struct nlattr **tb)
{
        return 1;
}

static int ipmr_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
                          struct fib_rule_hdr *frh)
{
        frh->dst_len = 0;
        frh->src_len = 0;
        frh->tos     = 0;
        return 0;
}

static const struct fib_rules_ops __net_initconst ipmr_rules_ops_template = {
        .family         = RTNL_FAMILY_IPMR,
        .rule_size      = sizeof(struct ipmr_rule),
        .addr_size      = sizeof(u32),
        .action         = ipmr_rule_action,
        .match          = ipmr_rule_match,
        .configure      = ipmr_rule_configure,
        .compare        = ipmr_rule_compare,
        .fill           = ipmr_rule_fill,
        .nlgroup        = RTNLGRP_IPV4_RULE,
        .policy         = ipmr_rule_policy,
        .owner          = THIS_MODULE,
};
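
/* Usage example (hedged, editorial): with CONFIG_IP_MROUTE_MULTIPLE_TABLES
 * a daemon binds itself to a non-default table before MRT_INIT, and
 * RTNL_FAMILY_IPMR policy rules (the ops template above) steer packets
 * to that table, roughly:
 *
 *	u32 table = 42;
 *	setsockopt(sk, IPPROTO_IP, MRT_TABLE, &table, sizeof(table));
 *
 * paired with a matching multicast routing rule installed from
 * userspace (e.g. iproute2's "ip mrule"; the exact tooling is an
 * assumption, the kernel-side MRT_TABLE option is handled below in
 * ip_mroute_setsockopt()).
 */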

static int __net_init ipmr_rules_init(struct net *net)
{
        struct fib_rules_ops *ops;
        struct mr_table *mrt;
        int err;

        ops = fib_rules_register(&ipmr_rules_ops_template, net);
        if (IS_ERR(ops))
                return PTR_ERR(ops);

        INIT_LIST_HEAD(&net->ipv4.mr_tables);

        mrt = ipmr_new_table(net, RT_TABLE_DEFAULT);
        if (IS_ERR(mrt)) {
                err = PTR_ERR(mrt);
                goto err1;
        }

        err = fib_default_rule_add(ops, 0x7fff, RT_TABLE_DEFAULT, 0);
        if (err < 0)
                goto err2;

        net->ipv4.mr_rules_ops = ops;
        return 0;

err2:
        ipmr_free_table(mrt);
err1:
        fib_rules_unregister(ops);
        return err;
}

static void __net_exit ipmr_rules_exit(struct net *net)
{
        struct mr_table *mrt, *next;

        rtnl_lock();
        list_for_each_entry_safe(mrt, next, &net->ipv4.mr_tables, list) {
                list_del(&mrt->list);
                ipmr_free_table(mrt);
        }
        fib_rules_unregister(net->ipv4.mr_rules_ops);
        rtnl_unlock();
}

static int ipmr_rules_dump(struct net *net, struct notifier_block *nb)
{
        return fib_rules_dump(net, nb, RTNL_FAMILY_IPMR);
}

static unsigned int ipmr_rules_seq_read(struct net *net)
{
        return fib_rules_seq_read(net, RTNL_FAMILY_IPMR);
}

bool ipmr_rule_default(const struct fib_rule *rule)
{
        return fib_rule_matchall(rule) && rule->table == RT_TABLE_DEFAULT;
}
EXPORT_SYMBOL(ipmr_rule_default);
#else
#define ipmr_for_each_table(mrt, net) \
        for (mrt = net->ipv4.mrt; mrt; mrt = NULL)

static struct mr_table *ipmr_mr_table_iter(struct net *net,
                                           struct mr_table *mrt)
{
        if (!mrt)
                return net->ipv4.mrt;
        return NULL;
}

static struct mr_table *ipmr_get_table(struct net *net, u32 id)
{
        return net->ipv4.mrt;
}

static int ipmr_fib_lookup(struct net *net, struct flowi4 *flp4,
                           struct mr_table **mrt)
{
        *mrt = net->ipv4.mrt;
        return 0;
}

static int __net_init ipmr_rules_init(struct net *net)
{
        struct mr_table *mrt;

        mrt = ipmr_new_table(net, RT_TABLE_DEFAULT);
        if (IS_ERR(mrt))
                return PTR_ERR(mrt);
        net->ipv4.mrt = mrt;
        return 0;
}

static void __net_exit ipmr_rules_exit(struct net *net)
{
        rtnl_lock();
        ipmr_free_table(net->ipv4.mrt);
        net->ipv4.mrt = NULL;
        rtnl_unlock();
}

static int ipmr_rules_dump(struct net *net, struct notifier_block *nb)
{
        return 0;
}

static unsigned int ipmr_rules_seq_read(struct net *net)
{
        return 0;
}

bool ipmr_rule_default(const struct fib_rule *rule)
{
        return true;
}
EXPORT_SYMBOL(ipmr_rule_default);
#endif

static inline int ipmr_hash_cmp(struct rhashtable_compare_arg *arg,
                                const void *ptr)
{
        const struct mfc_cache_cmp_arg *cmparg = arg->key;
        struct mfc_cache *c = (struct mfc_cache *)ptr;

        return cmparg->mfc_mcastgrp != c->mfc_mcastgrp ||
               cmparg->mfc_origin != c->mfc_origin;
}

static const struct rhashtable_params ipmr_rht_params = {
        .head_offset = offsetof(struct mr_mfc, mnode),
        .key_offset = offsetof(struct mfc_cache, cmparg),
        .key_len = sizeof(struct mfc_cache_cmp_arg),
        .nelem_hint = 3,
        .locks_mul = 1,
        .obj_cmpfn = ipmr_hash_cmp,
        .automatic_shrinking = true,
};

static void ipmr_new_table_set(struct mr_table *mrt,
                               struct net *net)
{
#ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES
        list_add_tail_rcu(&mrt->list, &net->ipv4.mr_tables);
#endif
}

static struct mfc_cache_cmp_arg ipmr_mr_table_ops_cmparg_any = {
        .mfc_mcastgrp = htonl(INADDR_ANY),
        .mfc_origin = htonl(INADDR_ANY),
};

static struct mr_table_ops ipmr_mr_table_ops = {
        .rht_params = &ipmr_rht_params,
        .cmparg_any = &ipmr_mr_table_ops_cmparg_any,
};

static struct mr_table *ipmr_new_table(struct net *net, u32 id)
{
        struct mr_table *mrt;

        /* "pimreg%u" should not exceed 16 bytes (IFNAMSIZ) */
        if (id != RT_TABLE_DEFAULT && id >= 1000000000)
                return ERR_PTR(-EINVAL);

        mrt = ipmr_get_table(net, id);
        if (mrt)
                return mrt;

        return mr_table_alloc(net, id, &ipmr_mr_table_ops,
                              ipmr_expire_process, ipmr_new_table_set);
}
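
/* Why the 1000000000 bound (editorial note): the per-table register
 * device is named "pimreg%u" in ipmr_reg_vif() below, and IFNAMSIZ is
 * 16. The largest permitted id, 999999999, gives "pimreg999999999":
 * 6 + 9 = 15 characters plus the terminating NUL, exactly 16 bytes.
 */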

static void ipmr_free_table(struct mr_table *mrt)
{
        del_timer_sync(&mrt->ipmr_expire_timer);
        mroute_clean_tables(mrt, true);
        rhltable_destroy(&mrt->mfc_hash);
        kfree(mrt);
}

/* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */

static void ipmr_del_tunnel(struct net_device *dev, struct vifctl *v)
{
        struct net *net = dev_net(dev);

        dev_close(dev);

        dev = __dev_get_by_name(net, "tunl0");
        if (dev) {
                const struct net_device_ops *ops = dev->netdev_ops;
                struct ifreq ifr;
                struct ip_tunnel_parm p;

                memset(&p, 0, sizeof(p));
                p.iph.daddr = v->vifc_rmt_addr.s_addr;
                p.iph.saddr = v->vifc_lcl_addr.s_addr;
                p.iph.version = 4;
                p.iph.ihl = 5;
                p.iph.protocol = IPPROTO_IPIP;
                sprintf(p.name, "dvmrp%d", v->vifc_vifi);
                ifr.ifr_ifru.ifru_data = (__force void __user *)&p;

                if (ops->ndo_do_ioctl) {
                        mm_segment_t oldfs = get_fs();

                        set_fs(KERNEL_DS);
                        ops->ndo_do_ioctl(dev, &ifr, SIOCDELTUNNEL);
                        set_fs(oldfs);
                }
        }
}

/* Initialize ipmr pimreg/tunnel in_device */
static bool ipmr_init_vif_indev(const struct net_device *dev)
{
        struct in_device *in_dev;

        ASSERT_RTNL();

        in_dev = __in_dev_get_rtnl(dev);
        if (!in_dev)
                return false;
        ipv4_devconf_setall(in_dev);
        neigh_parms_data_state_setall(in_dev->arp_parms);
        IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0;

        return true;
}

static struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v)
{
        struct net_device  *dev;

        dev = __dev_get_by_name(net, "tunl0");

        if (dev) {
                const struct net_device_ops *ops = dev->netdev_ops;
                int err;
                struct ifreq ifr;
                struct ip_tunnel_parm p;

                memset(&p, 0, sizeof(p));
                p.iph.daddr = v->vifc_rmt_addr.s_addr;
                p.iph.saddr = v->vifc_lcl_addr.s_addr;
                p.iph.version = 4;
                p.iph.ihl = 5;
                p.iph.protocol = IPPROTO_IPIP;
                sprintf(p.name, "dvmrp%d", v->vifc_vifi);
                ifr.ifr_ifru.ifru_data = (__force void __user *)&p;

                if (ops->ndo_do_ioctl) {
                        mm_segment_t oldfs = get_fs();

                        set_fs(KERNEL_DS);
                        err = ops->ndo_do_ioctl(dev, &ifr, SIOCADDTUNNEL);
                        set_fs(oldfs);
                } else {
                        err = -EOPNOTSUPP;
                }
                dev = NULL;

                if (err == 0 &&
                    (dev = __dev_get_by_name(net, p.name)) != NULL) {
                        dev->flags |= IFF_MULTICAST;
                        if (!ipmr_init_vif_indev(dev))
                                goto failure;
                        if (dev_open(dev))
                                goto failure;
                        dev_hold(dev);
                }
        }
        return dev;

failure:
        unregister_netdevice(dev);
        return NULL;
}
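
/* Usage note (editorial): ipmr_del_tunnel() and ipmr_new_tunnel() drive
 * the generic "tunl0" IPIP device through its own ioctl handler, much
 * as if userspace had run something like
 *
 *	ip tunnel add dvmrp<N> mode ipip local <lcl> remote <rmt>
 *
 * hence the ifreq/ip_tunnel_parm plumbing and the temporary
 * set_fs(KERNEL_DS) so the handler accepts a kernel-space pointer.
 * The "ip tunnel" command is only an analogy, not what the kernel runs.
 */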

#if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
{
        struct net *net = dev_net(dev);
        struct mr_table *mrt;
        struct flowi4 fl4 = {
                .flowi4_oif     = dev->ifindex,
                .flowi4_iif     = skb->skb_iif ? : LOOPBACK_IFINDEX,
                .flowi4_mark    = skb->mark,
        };
        int err;

        err = ipmr_fib_lookup(net, &fl4, &mrt);
        if (err < 0) {
                kfree_skb(skb);
                return err;
        }

        read_lock(&mrt_lock);
        dev->stats.tx_bytes += skb->len;
        dev->stats.tx_packets++;
        ipmr_cache_report(mrt, skb, mrt->mroute_reg_vif_num, IGMPMSG_WHOLEPKT);
        read_unlock(&mrt_lock);
        kfree_skb(skb);
        return NETDEV_TX_OK;
}

static int reg_vif_get_iflink(const struct net_device *dev)
{
        return 0;
}

static const struct net_device_ops reg_vif_netdev_ops = {
        .ndo_start_xmit = reg_vif_xmit,
        .ndo_get_iflink = reg_vif_get_iflink,
};

static void reg_vif_setup(struct net_device *dev)
{
        dev->type               = ARPHRD_PIMREG;
        dev->mtu                = ETH_DATA_LEN - sizeof(struct iphdr) - 8;
        dev->flags              = IFF_NOARP;
        dev->netdev_ops         = &reg_vif_netdev_ops;
        dev->needs_free_netdev  = true;
        dev->features           |= NETIF_F_NETNS_LOCAL;
}

static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt)
{
        struct net_device *dev;
        char name[IFNAMSIZ];

        if (mrt->id == RT_TABLE_DEFAULT)
                sprintf(name, "pimreg");
        else
                sprintf(name, "pimreg%u", mrt->id);

        dev = alloc_netdev(0, name, NET_NAME_UNKNOWN, reg_vif_setup);

        if (!dev)
                return NULL;

        dev_net_set(dev, net);

        if (register_netdevice(dev)) {
                free_netdev(dev);
                return NULL;
        }

        if (!ipmr_init_vif_indev(dev))
                goto failure;
        if (dev_open(dev))
                goto failure;

        dev_hold(dev);

        return dev;

failure:
        unregister_netdevice(dev);
        return NULL;
}

/* called with rcu_read_lock() */
static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb,
                     unsigned int pimlen)
{
        struct net_device *reg_dev = NULL;
        struct iphdr *encap;

        encap = (struct iphdr *)(skb_transport_header(skb) + pimlen);
        /* Check that:
         * a. packet is really sent to a multicast group
         * b. packet is not a NULL-REGISTER
         * c. packet is not truncated
         */
        if (!ipv4_is_multicast(encap->daddr) ||
            encap->tot_len == 0 ||
            ntohs(encap->tot_len) + pimlen > skb->len)
                return 1;

        read_lock(&mrt_lock);
        if (mrt->mroute_reg_vif_num >= 0)
                reg_dev = mrt->vif_table[mrt->mroute_reg_vif_num].dev;
        read_unlock(&mrt_lock);

        if (!reg_dev)
                return 1;

        skb->mac_header = skb->network_header;
        skb_pull(skb, (u8 *)encap - skb->data);
        skb_reset_network_header(skb);
        skb->protocol = htons(ETH_P_IP);
        skb->ip_summed = CHECKSUM_NONE;

        skb_tunnel_rx(skb, reg_dev, dev_net(reg_dev));

        netif_rx(skb);

        return NET_RX_SUCCESS;
}
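
/* Packet layout handled by __pim_rcv() (editorial sketch):
 *
 *	| outer IP | PIM register header (pimlen bytes) | inner IP ... |
 *
 * The checks above reject registers whose inner header is not a
 * multicast destination, is a NULL-REGISTER (tot_len == 0), or is
 * truncated. On success the inner packet is decapsulated and
 * re-injected on the pimreg device via skb_tunnel_rx()/netif_rx(),
 * as if it had arrived there natively.
 */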
#else
static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt)
{
        return NULL;
}
#endif

static int call_ipmr_vif_entry_notifiers(struct net *net,
                                         enum fib_event_type event_type,
                                         struct vif_device *vif,
                                         vifi_t vif_index, u32 tb_id)
{
        return mr_call_vif_notifiers(net, RTNL_FAMILY_IPMR, event_type,
                                     vif, vif_index, tb_id,
                                     &net->ipv4.ipmr_seq);
}

static int call_ipmr_mfc_entry_notifiers(struct net *net,
                                         enum fib_event_type event_type,
                                         struct mfc_cache *mfc, u32 tb_id)
{
        return mr_call_mfc_notifiers(net, RTNL_FAMILY_IPMR, event_type,
                                     &mfc->_c, tb_id, &net->ipv4.ipmr_seq);
}

/**
 *      vif_delete - Delete a VIF entry
 *      @mrt: multicast routing table the VIF belongs to
 *      @vifi: index of the VIF to delete
 *      @notify: set to 1 if the caller is a notifier_call
 *      @head: list on which to queue the device for unregistration, may be NULL
 */
static int vif_delete(struct mr_table *mrt, int vifi, int notify,
                      struct list_head *head)
{
        struct net *net = read_pnet(&mrt->net);
        struct vif_device *v;
        struct net_device *dev;
        struct in_device *in_dev;

        if (vifi < 0 || vifi >= mrt->maxvif)
                return -EADDRNOTAVAIL;

        v = &mrt->vif_table[vifi];

        if (VIF_EXISTS(mrt, vifi))
                call_ipmr_vif_entry_notifiers(net, FIB_EVENT_VIF_DEL, v, vifi,
                                              mrt->id);

        write_lock_bh(&mrt_lock);
        dev = v->dev;
        v->dev = NULL;

        if (!dev) {
                write_unlock_bh(&mrt_lock);
                return -EADDRNOTAVAIL;
        }

        if (vifi == mrt->mroute_reg_vif_num)
                mrt->mroute_reg_vif_num = -1;

        if (vifi + 1 == mrt->maxvif) {
                int tmp;

                for (tmp = vifi - 1; tmp >= 0; tmp--) {
                        if (VIF_EXISTS(mrt, tmp))
                                break;
                }
                mrt->maxvif = tmp+1;
        }

        write_unlock_bh(&mrt_lock);

        dev_set_allmulti(dev, -1);

        in_dev = __in_dev_get_rtnl(dev);
        if (in_dev) {
                IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)--;
                inet_netconf_notify_devconf(dev_net(dev), RTM_NEWNETCONF,
                                            NETCONFA_MC_FORWARDING,
                                            dev->ifindex, &in_dev->cnf);
                ip_rt_multicast_event(in_dev);
        }

        if (v->flags & (VIFF_TUNNEL | VIFF_REGISTER) && !notify)
                unregister_netdevice_queue(dev, head);

        dev_put(dev);
        return 0;
}

static void ipmr_cache_free_rcu(struct rcu_head *head)
{
        struct mr_mfc *c = container_of(head, struct mr_mfc, rcu);

        kmem_cache_free(mrt_cachep, (struct mfc_cache *)c);
}

static void ipmr_cache_free(struct mfc_cache *c)
{
        call_rcu(&c->_c.rcu, ipmr_cache_free_rcu);
}

/* Destroy an unresolved cache entry, killing queued skbs
 * and reporting error to netlink readers.
 */
static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c)
{
        struct net *net = read_pnet(&mrt->net);
        struct sk_buff *skb;
        struct nlmsgerr *e;

        atomic_dec(&mrt->cache_resolve_queue_len);

        while ((skb = skb_dequeue(&c->_c.mfc_un.unres.unresolved))) {
                if (ip_hdr(skb)->version == 0) {
                        struct nlmsghdr *nlh = skb_pull(skb,
                                                        sizeof(struct iphdr));
                        nlh->nlmsg_type = NLMSG_ERROR;
                        nlh->nlmsg_len = nlmsg_msg_size(sizeof(struct nlmsgerr));
                        skb_trim(skb, nlh->nlmsg_len);
                        e = nlmsg_data(nlh);
                        e->error = -ETIMEDOUT;
                        memset(&e->msg, 0, sizeof(e->msg));

                        rtnl_unicast(skb, net, NETLINK_CB(skb).portid);
                } else {
                        kfree_skb(skb);
                }
        }

        ipmr_cache_free(c);
}

/* Timer process for the unresolved queue. */
static void ipmr_expire_process(struct timer_list *t)
{
        struct mr_table *mrt = from_timer(mrt, t, ipmr_expire_timer);
        struct mr_mfc *c, *next;
        unsigned long expires;
        unsigned long now;

        if (!spin_trylock(&mfc_unres_lock)) {
                mod_timer(&mrt->ipmr_expire_timer, jiffies+HZ/10);
                return;
        }

        if (list_empty(&mrt->mfc_unres_queue))
                goto out;

        now = jiffies;
        expires = 10*HZ;

        list_for_each_entry_safe(c, next, &mrt->mfc_unres_queue, list) {
                if (time_after(c->mfc_un.unres.expires, now)) {
                        unsigned long interval = c->mfc_un.unres.expires - now;
                        if (interval < expires)
                                expires = interval;
                        continue;
                }

                list_del(&c->list);
                mroute_netlink_event(mrt, (struct mfc_cache *)c, RTM_DELROUTE);
                ipmr_destroy_unres(mrt, (struct mfc_cache *)c);
        }

        if (!list_empty(&mrt->mfc_unres_queue))
                mod_timer(&mrt->ipmr_expire_timer, jiffies + expires);

out:
        spin_unlock(&mfc_unres_lock);
}

/* Fill oifs list. It is called under write locked mrt_lock. */
static void ipmr_update_thresholds(struct mr_table *mrt, struct mr_mfc *cache,
                                   unsigned char *ttls)
{
        int vifi;

        cache->mfc_un.res.minvif = MAXVIFS;
        cache->mfc_un.res.maxvif = 0;
        memset(cache->mfc_un.res.ttls, 255, MAXVIFS);

        for (vifi = 0; vifi < mrt->maxvif; vifi++) {
                if (VIF_EXISTS(mrt, vifi) &&
                    ttls[vifi] && ttls[vifi] < 255) {
                        cache->mfc_un.res.ttls[vifi] = ttls[vifi];
                        if (cache->mfc_un.res.minvif > vifi)
                                cache->mfc_un.res.minvif = vifi;
                        if (cache->mfc_un.res.maxvif <= vifi)
                                cache->mfc_un.res.maxvif = vifi + 1;
                }
        }
        cache->mfc_un.res.lastuse = jiffies;
}

static int vif_add(struct net *net, struct mr_table *mrt,
                   struct vifctl *vifc, int mrtsock)
{
        int vifi = vifc->vifc_vifi;
        struct switchdev_attr attr = {
                .id = SWITCHDEV_ATTR_ID_PORT_PARENT_ID,
        };
        struct vif_device *v = &mrt->vif_table[vifi];
        struct net_device *dev;
        struct in_device *in_dev;
        int err;

        /* Is vif busy ? */
        if (VIF_EXISTS(mrt, vifi))
                return -EADDRINUSE;

        switch (vifc->vifc_flags) {
        case VIFF_REGISTER:
                if (!ipmr_pimsm_enabled())
                        return -EINVAL;
                /* Special Purpose VIF in PIM
                 * All the packets will be sent to the daemon
                 */
                if (mrt->mroute_reg_vif_num >= 0)
                        return -EADDRINUSE;
                dev = ipmr_reg_vif(net, mrt);
                if (!dev)
                        return -ENOBUFS;
                err = dev_set_allmulti(dev, 1);
                if (err) {
                        unregister_netdevice(dev);
                        dev_put(dev);
                        return err;
                }
                break;
        case VIFF_TUNNEL:
                dev = ipmr_new_tunnel(net, vifc);
                if (!dev)
                        return -ENOBUFS;
                err = dev_set_allmulti(dev, 1);
                if (err) {
                        ipmr_del_tunnel(dev, vifc);
                        dev_put(dev);
                        return err;
                }
                break;
        case VIFF_USE_IFINDEX:
        case 0:
                if (vifc->vifc_flags == VIFF_USE_IFINDEX) {
                        dev = dev_get_by_index(net, vifc->vifc_lcl_ifindex);
                        if (dev && !__in_dev_get_rtnl(dev)) {
                                dev_put(dev);
                                return -EADDRNOTAVAIL;
                        }
                } else {
                        dev = ip_dev_find(net, vifc->vifc_lcl_addr.s_addr);
                }
                if (!dev)
                        return -EADDRNOTAVAIL;
                err = dev_set_allmulti(dev, 1);
                if (err) {
                        dev_put(dev);
                        return err;
                }
                break;
        default:
                return -EINVAL;
        }

        in_dev = __in_dev_get_rtnl(dev);
        if (!in_dev) {
                dev_put(dev);
                return -EADDRNOTAVAIL;
        }
        IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)++;
        inet_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_MC_FORWARDING,
                                    dev->ifindex, &in_dev->cnf);
        ip_rt_multicast_event(in_dev);

        /* Fill in the VIF structures */
        vif_device_init(v, dev, vifc->vifc_rate_limit,
                        vifc->vifc_threshold,
                        vifc->vifc_flags | (!mrtsock ? VIFF_STATIC : 0),
                        (VIFF_TUNNEL | VIFF_REGISTER));

        attr.orig_dev = dev;
        if (!switchdev_port_attr_get(dev, &attr)) {
                memcpy(v->dev_parent_id.id, attr.u.ppid.id, attr.u.ppid.id_len);
                v->dev_parent_id.id_len = attr.u.ppid.id_len;
        } else {
                v->dev_parent_id.id_len = 0;
        }

        v->local = vifc->vifc_lcl_addr.s_addr;
        v->remote = vifc->vifc_rmt_addr.s_addr;

        /* And finish update writing critical data */
        write_lock_bh(&mrt_lock);
        v->dev = dev;
        if (v->flags & VIFF_REGISTER)
                mrt->mroute_reg_vif_num = vifi;
        if (vifi+1 > mrt->maxvif)
                mrt->maxvif = vifi+1;
        write_unlock_bh(&mrt_lock);
        call_ipmr_vif_entry_notifiers(net, FIB_EVENT_VIF_ADD, v, vifi, mrt->id);
        return 0;
}
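
/* Userspace view of vif_add() (hedged editorial example): mrouted-style
 * daemons create a VIF roughly like this; the address and field values
 * are purely illustrative:
 *
 *	struct vifctl vc = {
 *		.vifc_vifi	 = 0,
 *		.vifc_flags	 = 0,	// or VIFF_TUNNEL, VIFF_REGISTER, ...
 *		.vifc_threshold	 = 1,
 *		.vifc_rate_limit = 0,
 *	};
 *	vc.vifc_lcl_addr.s_addr = inet_addr("192.0.2.1");
 *	setsockopt(sk, IPPROTO_IP, MRT_ADD_VIF, &vc, sizeof(vc));
 */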

/* called with rcu_read_lock() */
static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt,
                                         __be32 origin,
                                         __be32 mcastgrp)
{
        struct mfc_cache_cmp_arg arg = {
                        .mfc_mcastgrp = mcastgrp,
                        .mfc_origin = origin
        };

        return mr_mfc_find(mrt, &arg);
}

/* Look for a (*,G) entry */
static struct mfc_cache *ipmr_cache_find_any(struct mr_table *mrt,
                                             __be32 mcastgrp, int vifi)
{
        struct mfc_cache_cmp_arg arg = {
                        .mfc_mcastgrp = mcastgrp,
                        .mfc_origin = htonl(INADDR_ANY)
        };

        if (mcastgrp == htonl(INADDR_ANY))
                return mr_mfc_find_any_parent(mrt, vifi);
        return mr_mfc_find_any(mrt, vifi, &arg);
}

/* Look for a (S,G,iif) entry if parent != -1 */
static struct mfc_cache *ipmr_cache_find_parent(struct mr_table *mrt,
                                                __be32 origin, __be32 mcastgrp,
                                                int parent)
{
        struct mfc_cache_cmp_arg arg = {
                        .mfc_mcastgrp = mcastgrp,
                        .mfc_origin = origin,
        };

        return mr_mfc_find_parent(mrt, &arg, parent);
}

/* Allocate a multicast cache entry */
static struct mfc_cache *ipmr_cache_alloc(void)
{
        struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL);

        if (c) {
                c->_c.mfc_un.res.last_assert = jiffies - MFC_ASSERT_THRESH - 1;
                c->_c.mfc_un.res.minvif = MAXVIFS;
                c->_c.free = ipmr_cache_free_rcu;
                refcount_set(&c->_c.mfc_un.res.refcount, 1);
        }
        return c;
}

static struct mfc_cache *ipmr_cache_alloc_unres(void)
{
        struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC);

        if (c) {
                skb_queue_head_init(&c->_c.mfc_un.unres.unresolved);
                c->_c.mfc_un.unres.expires = jiffies + 10 * HZ;
        }
        return c;
}

/* A cache entry has gone into a resolved state from queued */
static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt,
                               struct mfc_cache *uc, struct mfc_cache *c)
{
        struct sk_buff *skb;
        struct nlmsgerr *e;

        /* Play the pending entries through our router */
        while ((skb = __skb_dequeue(&uc->_c.mfc_un.unres.unresolved))) {
                if (ip_hdr(skb)->version == 0) {
                        struct nlmsghdr *nlh = skb_pull(skb,
                                                        sizeof(struct iphdr));

                        if (mr_fill_mroute(mrt, skb, &c->_c,
                                           nlmsg_data(nlh)) > 0) {
                                nlh->nlmsg_len = skb_tail_pointer(skb) -
                                                 (u8 *)nlh;
                        } else {
                                nlh->nlmsg_type = NLMSG_ERROR;
                                nlh->nlmsg_len = nlmsg_msg_size(sizeof(struct nlmsgerr));
                                skb_trim(skb, nlh->nlmsg_len);
                                e = nlmsg_data(nlh);
                                e->error = -EMSGSIZE;
                                memset(&e->msg, 0, sizeof(e->msg));
                        }

                        rtnl_unicast(skb, net, NETLINK_CB(skb).portid);
                } else {
                        ip_mr_forward(net, mrt, skb->dev, skb, c, 0);
                }
        }
}

/* Bounce a cache query up to mrouted and netlink.
 *
 * Called under mrt_lock.
 */
static int ipmr_cache_report(struct mr_table *mrt,
                             struct sk_buff *pkt, vifi_t vifi, int assert)
{
        const int ihl = ip_hdrlen(pkt);
        struct sock *mroute_sk;
        struct igmphdr *igmp;
        struct igmpmsg *msg;
        struct sk_buff *skb;
        int ret;

        if (assert == IGMPMSG_WHOLEPKT)
                skb = skb_realloc_headroom(pkt, sizeof(struct iphdr));
        else
                skb = alloc_skb(128, GFP_ATOMIC);

        if (!skb)
                return -ENOBUFS;

        if (assert == IGMPMSG_WHOLEPKT) {
                /* Ugly, but we have no choice with this interface.
                 * Duplicate old header, fix ihl, length etc.
                 * And all this only to mangle msg->im_msgtype and
                 * to set msg->im_mbz to "mbz" :-)
                 */
                skb_push(skb, sizeof(struct iphdr));
                skb_reset_network_header(skb);
                skb_reset_transport_header(skb);
                msg = (struct igmpmsg *)skb_network_header(skb);
                memcpy(msg, skb_network_header(pkt), sizeof(struct iphdr));
                msg->im_msgtype = IGMPMSG_WHOLEPKT;
                msg->im_mbz = 0;
                msg->im_vif = mrt->mroute_reg_vif_num;
                ip_hdr(skb)->ihl = sizeof(struct iphdr) >> 2;
                ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(pkt)->tot_len) +
                                             sizeof(struct iphdr));
        } else {
                /* Copy the IP header */
                skb_set_network_header(skb, skb->len);
                skb_put(skb, ihl);
                skb_copy_to_linear_data(skb, pkt->data, ihl);
                /* Flag to the kernel this is a route add */
                ip_hdr(skb)->protocol = 0;
                msg = (struct igmpmsg *)skb_network_header(skb);
                msg->im_vif = vifi;
                skb_dst_set(skb, dst_clone(skb_dst(pkt)));
                /* Add our header */
                igmp = skb_put(skb, sizeof(struct igmphdr));
                igmp->type = assert;
                msg->im_msgtype = assert;
                igmp->code = 0;
                ip_hdr(skb)->tot_len = htons(skb->len); /* Fix the length */
                skb->transport_header = skb->network_header;
        }

        rcu_read_lock();
        mroute_sk = rcu_dereference(mrt->mroute_sk);
        if (!mroute_sk) {
                rcu_read_unlock();
                kfree_skb(skb);
                return -EINVAL;
        }

        igmpmsg_netlink_event(mrt, skb);

        /* Deliver to mrouted */
        ret = sock_queue_rcv_skb(mroute_sk, skb);
        rcu_read_unlock();
        if (ret < 0) {
                net_warn_ratelimited("mroute: pending queue full, dropping entries\n");
                kfree_skb(skb);
        }

        return ret;
}
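
/* Upcall format (editorial note): what ipmr_cache_report() queues here
 * is read by the daemon from its raw IGMP socket as a struct igmpmsg
 * overlaying the IP header, roughly:
 *
 *	n = read(sk, buf, sizeof(buf));
 *	struct igmpmsg *m = (struct igmpmsg *)buf;
 *	// m->im_msgtype: IGMPMSG_NOCACHE, IGMPMSG_WRONGVIF or
 *	// IGMPMSG_WHOLEPKT; m->im_vif: the VIF involved.
 *
 * This mirrors the mrouted convention; the snippet is illustrative,
 * not taken from any particular daemon.
 */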

/* Queue a packet for resolution. It gets locked cache entry! */
static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi,
                                 struct sk_buff *skb, struct net_device *dev)
{
        const struct iphdr *iph = ip_hdr(skb);
        struct mfc_cache *c;
        bool found = false;
        int err;

        spin_lock_bh(&mfc_unres_lock);
        list_for_each_entry(c, &mrt->mfc_unres_queue, _c.list) {
                if (c->mfc_mcastgrp == iph->daddr &&
                    c->mfc_origin == iph->saddr) {
                        found = true;
                        break;
                }
        }

        if (!found) {
                /* Create a new entry if allowable */
                if (atomic_read(&mrt->cache_resolve_queue_len) >= 10 ||
                    (c = ipmr_cache_alloc_unres()) == NULL) {
                        spin_unlock_bh(&mfc_unres_lock);

                        kfree_skb(skb);
                        return -ENOBUFS;
                }

                /* Fill in the new cache entry */
                c->_c.mfc_parent = -1;
                c->mfc_origin   = iph->saddr;
                c->mfc_mcastgrp = iph->daddr;

                /* Reflect first query at mrouted. */
                err = ipmr_cache_report(mrt, skb, vifi, IGMPMSG_NOCACHE);

                if (err < 0) {
                        /* If the report failed throw the cache entry
                         * out - Brad Parker
                         */
                        spin_unlock_bh(&mfc_unres_lock);

                        ipmr_cache_free(c);
                        kfree_skb(skb);
                        return err;
                }

                atomic_inc(&mrt->cache_resolve_queue_len);
                list_add(&c->_c.list, &mrt->mfc_unres_queue);
                mroute_netlink_event(mrt, c, RTM_NEWROUTE);

                if (atomic_read(&mrt->cache_resolve_queue_len) == 1)
                        mod_timer(&mrt->ipmr_expire_timer,
                                  c->_c.mfc_un.unres.expires);
        }

        /* See if we can append the packet */
        if (c->_c.mfc_un.unres.unresolved.qlen > 3) {
                kfree_skb(skb);
                err = -ENOBUFS;
        } else {
                if (dev) {
                        skb->dev = dev;
                        skb->skb_iif = dev->ifindex;
                }
                skb_queue_tail(&c->_c.mfc_un.unres.unresolved, skb);
                err = 0;
        }

        spin_unlock_bh(&mfc_unres_lock);
        return err;
}

/* MFC cache manipulation by user space mroute daemon */

static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc, int parent)
{
        struct net *net = read_pnet(&mrt->net);
        struct mfc_cache *c;

        /* The entries are added/deleted only under RTNL */
        rcu_read_lock();
        c = ipmr_cache_find_parent(mrt, mfc->mfcc_origin.s_addr,
                                   mfc->mfcc_mcastgrp.s_addr, parent);
        rcu_read_unlock();
        if (!c)
                return -ENOENT;
        rhltable_remove(&mrt->mfc_hash, &c->_c.mnode, ipmr_rht_params);
        list_del_rcu(&c->_c.list);
        call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, c, mrt->id);
        mroute_netlink_event(mrt, c, RTM_DELROUTE);
        mr_cache_put(&c->_c);

        return 0;
}

static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
                        struct mfcctl *mfc, int mrtsock, int parent)
{
        struct mfc_cache *uc, *c;
        struct mr_mfc *_uc;
        bool found;
        int ret;

        if (mfc->mfcc_parent >= MAXVIFS)
                return -ENFILE;

        /* The entries are added/deleted only under RTNL */
        rcu_read_lock();
        c = ipmr_cache_find_parent(mrt, mfc->mfcc_origin.s_addr,
                                   mfc->mfcc_mcastgrp.s_addr, parent);
        rcu_read_unlock();
        if (c) {
                write_lock_bh(&mrt_lock);
                c->_c.mfc_parent = mfc->mfcc_parent;
                ipmr_update_thresholds(mrt, &c->_c, mfc->mfcc_ttls);
                if (!mrtsock)
                        c->_c.mfc_flags |= MFC_STATIC;
                write_unlock_bh(&mrt_lock);
                call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE, c,
                                              mrt->id);
                mroute_netlink_event(mrt, c, RTM_NEWROUTE);
                return 0;
        }

        if (mfc->mfcc_mcastgrp.s_addr != htonl(INADDR_ANY) &&
            !ipv4_is_multicast(mfc->mfcc_mcastgrp.s_addr))
                return -EINVAL;

        c = ipmr_cache_alloc();
        if (!c)
                return -ENOMEM;

        c->mfc_origin = mfc->mfcc_origin.s_addr;
        c->mfc_mcastgrp = mfc->mfcc_mcastgrp.s_addr;
        c->_c.mfc_parent = mfc->mfcc_parent;
        ipmr_update_thresholds(mrt, &c->_c, mfc->mfcc_ttls);
        if (!mrtsock)
                c->_c.mfc_flags |= MFC_STATIC;

        ret = rhltable_insert_key(&mrt->mfc_hash, &c->cmparg, &c->_c.mnode,
                                  ipmr_rht_params);
        if (ret) {
                pr_err("ipmr: rhtable insert error %d\n", ret);
                ipmr_cache_free(c);
                return ret;
        }
        list_add_tail_rcu(&c->_c.list, &mrt->mfc_cache_list);
        /* Check to see if we resolved a queued list. If so we
         * need to send on the frames and tidy up.
         */
        found = false;
        spin_lock_bh(&mfc_unres_lock);
        list_for_each_entry(_uc, &mrt->mfc_unres_queue, list) {
                uc = (struct mfc_cache *)_uc;
                if (uc->mfc_origin == c->mfc_origin &&
                    uc->mfc_mcastgrp == c->mfc_mcastgrp) {
                        list_del(&_uc->list);
                        atomic_dec(&mrt->cache_resolve_queue_len);
                        found = true;
                        break;
                }
        }
        if (list_empty(&mrt->mfc_unres_queue))
                del_timer(&mrt->ipmr_expire_timer);
        spin_unlock_bh(&mfc_unres_lock);

        if (found) {
                ipmr_cache_resolve(net, mrt, uc, c);
                ipmr_cache_free(uc);
        }
        call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_ADD, c, mrt->id);
        mroute_netlink_event(mrt, c, RTM_NEWROUTE);
        return 0;
}
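
/* Userspace view of ipmr_mfc_add() (hedged editorial example; addresses
 * and VIF numbers are purely illustrative):
 *
 *	struct mfcctl mc = { .mfcc_parent = 0 };	// input VIF
 *	mc.mfcc_origin.s_addr	= inet_addr("192.0.2.1");
 *	mc.mfcc_mcastgrp.s_addr	= inet_addr("233.252.0.1");
 *	mc.mfcc_ttls[1] = 1;	// forward on VIF 1 when the packet
 *				// TTL exceeds this threshold
 *	setsockopt(sk, IPPROTO_IP, MRT_ADD_MFC, &mc, sizeof(mc));
 *
 * A (*,G) entry uses INADDR_ANY as the origin and is installed via the
 * MRT_ADD_MFC_PROXY variant handled in ip_mroute_setsockopt() below.
 */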

/* Close the multicast socket, and clear the vif tables etc */
static void mroute_clean_tables(struct mr_table *mrt, bool all)
{
        struct net *net = read_pnet(&mrt->net);
        struct mr_mfc *c, *tmp;
        struct mfc_cache *cache;
        LIST_HEAD(list);
        int i;

        /* Shut down all active vif entries */
        for (i = 0; i < mrt->maxvif; i++) {
                if (!all && (mrt->vif_table[i].flags & VIFF_STATIC))
                        continue;
                vif_delete(mrt, i, 0, &list);
        }
        unregister_netdevice_many(&list);

        /* Wipe the cache */
        list_for_each_entry_safe(c, tmp, &mrt->mfc_cache_list, list) {
                if (!all && (c->mfc_flags & MFC_STATIC))
                        continue;
                rhltable_remove(&mrt->mfc_hash, &c->mnode, ipmr_rht_params);
                list_del_rcu(&c->list);
                cache = (struct mfc_cache *)c;
                call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, cache,
                                              mrt->id);
                mroute_netlink_event(mrt, cache, RTM_DELROUTE);
                mr_cache_put(c);
        }

        if (atomic_read(&mrt->cache_resolve_queue_len) != 0) {
                spin_lock_bh(&mfc_unres_lock);
                list_for_each_entry_safe(c, tmp, &mrt->mfc_unres_queue, list) {
                        list_del(&c->list);
                        cache = (struct mfc_cache *)c;
                        mroute_netlink_event(mrt, cache, RTM_DELROUTE);
                        ipmr_destroy_unres(mrt, cache);
                }
                spin_unlock_bh(&mfc_unres_lock);
        }
}

/* Called from ip_ra_control(), before an RCU grace period,
 * so we don't need to call synchronize_rcu() here.
 */
1342static void mrtsock_destruct(struct sock *sk)
1343{
1344        struct net *net = sock_net(sk);
1345        struct mr_table *mrt;
1346
1347        rtnl_lock();
1348        ipmr_for_each_table(mrt, net) {
1349                if (sk == rtnl_dereference(mrt->mroute_sk)) {
1350                        IPV4_DEVCONF_ALL(net, MC_FORWARDING)--;
1351                        inet_netconf_notify_devconf(net, RTM_NEWNETCONF,
1352                                                    NETCONFA_MC_FORWARDING,
1353                                                    NETCONFA_IFINDEX_ALL,
1354                                                    net->ipv4.devconf_all);
1355                        RCU_INIT_POINTER(mrt->mroute_sk, NULL);
1356                        mroute_clean_tables(mrt, false);
1357                }
1358        }
1359        rtnl_unlock();
1360}
1361
1362/* Socket options and virtual interface manipulation. The whole
1363 * virtual interface system is a complete heap, but unfortunately
1364 * that's how BSD mrouted happens to think. Maybe one day with a proper
1365 * MOSPF/PIM router set up we can clean this up.
1366 */
1367
1368int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval,
1369                         unsigned int optlen)
1370{
1371        struct net *net = sock_net(sk);
1372        int val, ret = 0, parent = 0;
1373        struct mr_table *mrt;
1374        struct vifctl vif;
1375        struct mfcctl mfc;
1376        u32 uval;
1377
1378        /* There's one exception to the lock - MRT_DONE which needs to unlock */
1379        rtnl_lock();
1380        if (sk->sk_type != SOCK_RAW ||
1381            inet_sk(sk)->inet_num != IPPROTO_IGMP) {
1382                ret = -EOPNOTSUPP;
1383                goto out_unlock;
1384        }
1385
1386        mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
1387        if (!mrt) {
1388                ret = -ENOENT;
1389                goto out_unlock;
1390        }
1391        if (optname != MRT_INIT) {
1392                if (sk != rcu_access_pointer(mrt->mroute_sk) &&
1393                    !ns_capable(net->user_ns, CAP_NET_ADMIN)) {
1394                        ret = -EACCES;
1395                        goto out_unlock;
1396                }
1397        }
1398
1399        switch (optname) {
1400        case MRT_INIT:
1401                if (optlen != sizeof(int)) {
1402                        ret = -EINVAL;
1403                        break;
1404                }
1405                if (rtnl_dereference(mrt->mroute_sk)) {
1406                        ret = -EADDRINUSE;
1407                        break;
1408                }
1409
1410                ret = ip_ra_control(sk, 1, mrtsock_destruct);
1411                if (ret == 0) {
1412                        rcu_assign_pointer(mrt->mroute_sk, sk);
1413                        IPV4_DEVCONF_ALL(net, MC_FORWARDING)++;
1414                        inet_netconf_notify_devconf(net, RTM_NEWNETCONF,
1415                                                    NETCONFA_MC_FORWARDING,
1416                                                    NETCONFA_IFINDEX_ALL,
1417                                                    net->ipv4.devconf_all);
1418                }
1419                break;
1420        case MRT_DONE:
1421                if (sk != rcu_access_pointer(mrt->mroute_sk)) {
1422                        ret = -EACCES;
1423                } else {
1424                        /* We need to unlock here because mrtsock_destruct takes
1425                         * care of rtnl itself and we can't change that due to
1426                         * the IP_ROUTER_ALERT setsockopt which runs without it.
1427                         */
1428                        rtnl_unlock();
1429                        ret = ip_ra_control(sk, 0, NULL);
1430                        goto out;
1431                }
1432                break;
1433        case MRT_ADD_VIF:
1434        case MRT_DEL_VIF:
1435                if (optlen != sizeof(vif)) {
1436                        ret = -EINVAL;
1437                        break;
1438                }
1439                if (copy_from_user(&vif, optval, sizeof(vif))) {
1440                        ret = -EFAULT;
1441                        break;
1442                }
1443                if (vif.vifc_vifi >= MAXVIFS) {
1444                        ret = -ENFILE;
1445                        break;
1446                }
1447                if (optname == MRT_ADD_VIF) {
1448                        ret = vif_add(net, mrt, &vif,
1449                                      sk == rtnl_dereference(mrt->mroute_sk));
1450                } else {
1451                        ret = vif_delete(mrt, vif.vifc_vifi, 0, NULL);
1452                }
1453                break;
1454        /* Manipulate the forwarding caches. These live
1455         * in a sort of kernel/user symbiosis.
1456         */
1457        case MRT_ADD_MFC:
1458        case MRT_DEL_MFC:
1459                parent = -1;
1460                /* fall through */
1461        case MRT_ADD_MFC_PROXY:
1462        case MRT_DEL_MFC_PROXY:
1463                if (optlen != sizeof(mfc)) {
1464                        ret = -EINVAL;
1465                        break;
1466                }
1467                if (copy_from_user(&mfc, optval, sizeof(mfc))) {
1468                        ret = -EFAULT;
1469                        break;
1470                }
1471                if (parent == 0)
1472                        parent = mfc.mfcc_parent;
1473                if (optname == MRT_DEL_MFC || optname == MRT_DEL_MFC_PROXY)
1474                        ret = ipmr_mfc_delete(mrt, &mfc, parent);
1475                else
1476                        ret = ipmr_mfc_add(net, mrt, &mfc,
1477                                           sk == rtnl_dereference(mrt->mroute_sk),
1478                                           parent);
1479                break;
1480        /* Control PIM assert. */
1481        case MRT_ASSERT:
1482                if (optlen != sizeof(val)) {
1483                        ret = -EINVAL;
1484                        break;
1485                }
1486                if (get_user(val, (int __user *)optval)) {
1487                        ret = -EFAULT;
1488                        break;
1489                }
1490                mrt->mroute_do_assert = val;
1491                break;
1492        case MRT_PIM:
1493                if (!ipmr_pimsm_enabled()) {
1494                        ret = -ENOPROTOOPT;
1495                        break;
1496                }
1497                if (optlen != sizeof(val)) {
1498                        ret = -EINVAL;
1499                        break;
1500                }
1501                if (get_user(val, (int __user *)optval)) {
1502                        ret = -EFAULT;
1503                        break;
1504                }
1505
1506                val = !!val;
1507                if (val != mrt->mroute_do_pim) {
1508                        mrt->mroute_do_pim = val;
1509                        mrt->mroute_do_assert = val;
1510                }
1511                break;
1512        case MRT_TABLE:
1513                if (!IS_BUILTIN(CONFIG_IP_MROUTE_MULTIPLE_TABLES)) {
1514                        ret = -ENOPROTOOPT;
1515                        break;
1516                }
1517                if (optlen != sizeof(uval)) {
1518                        ret = -EINVAL;
1519                        break;
1520                }
1521                if (get_user(uval, (u32 __user *)optval)) {
1522                        ret = -EFAULT;
1523                        break;
1524                }
1525
1526                if (sk == rtnl_dereference(mrt->mroute_sk)) {
1527                        ret = -EBUSY;
1528                } else {
1529                        mrt = ipmr_new_table(net, uval);
1530                        if (IS_ERR(mrt))
1531                                ret = PTR_ERR(mrt);
1532                        else
1533                                raw_sk(sk)->ipmr_table = uval;
1534                }
1535                break;
1536        /* Spurious command, or MRT_VERSION which you cannot set. */
1537        default:
1538                ret = -ENOPROTOOPT;
1539        }
1540out_unlock:
1541        rtnl_unlock();
1542out:
1543        return ret;
1544}
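/* Illustrative userspace sketch (assumptions, not kernel code): becoming
 * the mroute control socket and adding one vif through the setsockopt
 * path above.  The address 192.0.2.10 stands in for a local interface
 * address:
 *
 *      int fd = socket(AF_INET, SOCK_RAW, IPPROTO_IGMP);
 *      int one = 1;
 *      struct vifctl vc = { 0 };
 *
 *      setsockopt(fd, IPPROTO_IP, MRT_INIT, &one, sizeof(one));
 *      vc.vifc_vifi = 0;
 *      vc.vifc_threshold = 1;
 *      vc.vifc_lcl_addr.s_addr = inet_addr("192.0.2.10");
 *      setsockopt(fd, IPPROTO_IP, MRT_ADD_VIF, &vc, sizeof(vc));
 */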
1545
1546/* Getsock opt support for the multicast routing system. */
1547int ip_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, int __user *optlen)
1548{
1549        int olr;
1550        int val;
1551        struct net *net = sock_net(sk);
1552        struct mr_table *mrt;
1553
1554        if (sk->sk_type != SOCK_RAW ||
1555            inet_sk(sk)->inet_num != IPPROTO_IGMP)
1556                return -EOPNOTSUPP;
1557
1558        mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
1559        if (!mrt)
1560                return -ENOENT;
1561
1562        switch (optname) {
1563        case MRT_VERSION:
1564                val = 0x0305;
1565                break;
1566        case MRT_PIM:
1567                if (!ipmr_pimsm_enabled())
1568                        return -ENOPROTOOPT;
1569                val = mrt->mroute_do_pim;
1570                break;
1571        case MRT_ASSERT:
1572                val = mrt->mroute_do_assert;
1573                break;
1574        default:
1575                return -ENOPROTOOPT;
1576        }
1577
1578        if (get_user(olr, optlen))
1579                return -EFAULT;
1580        if (olr < 0)
1581                return -EINVAL;
1582        olr = min_t(unsigned int, olr, sizeof(int));
1583        if (put_user(olr, optlen))
1584                return -EFAULT;
1585        if (copy_to_user(optval, &val, olr))
1586                return -EFAULT;
1587        return 0;
1588}
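/* Illustrative userspace sketch: reading an option back through the
 * getsockopt path above.  MRT_VERSION would yield 0x0305 here:
 *
 *      int ver;
 *      socklen_t len = sizeof(ver);
 *
 *      getsockopt(fd, IPPROTO_IP, MRT_VERSION, &ver, &len);
 */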
1589
1590/* The IP multicast ioctl support routines. */
1591int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
1592{
1593        struct sioc_sg_req sr;
1594        struct sioc_vif_req vr;
1595        struct vif_device *vif;
1596        struct mfc_cache *c;
1597        struct net *net = sock_net(sk);
1598        struct mr_table *mrt;
1599
1600        mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
1601        if (!mrt)
1602                return -ENOENT;
1603
1604        switch (cmd) {
1605        case SIOCGETVIFCNT:
1606                if (copy_from_user(&vr, arg, sizeof(vr)))
1607                        return -EFAULT;
1608                if (vr.vifi >= mrt->maxvif)
1609                        return -EINVAL;
1610                vr.vifi = array_index_nospec(vr.vifi, mrt->maxvif);
1611                read_lock(&mrt_lock);
1612                vif = &mrt->vif_table[vr.vifi];
1613                if (VIF_EXISTS(mrt, vr.vifi)) {
1614                        vr.icount = vif->pkt_in;
1615                        vr.ocount = vif->pkt_out;
1616                        vr.ibytes = vif->bytes_in;
1617                        vr.obytes = vif->bytes_out;
1618                        read_unlock(&mrt_lock);
1619
1620                        if (copy_to_user(arg, &vr, sizeof(vr)))
1621                                return -EFAULT;
1622                        return 0;
1623                }
1624                read_unlock(&mrt_lock);
1625                return -EADDRNOTAVAIL;
1626        case SIOCGETSGCNT:
1627                if (copy_from_user(&sr, arg, sizeof(sr)))
1628                        return -EFAULT;
1629
1630                rcu_read_lock();
1631                c = ipmr_cache_find(mrt, sr.src.s_addr, sr.grp.s_addr);
1632                if (c) {
1633                        sr.pktcnt = c->_c.mfc_un.res.pkt;
1634                        sr.bytecnt = c->_c.mfc_un.res.bytes;
1635                        sr.wrong_if = c->_c.mfc_un.res.wrong_if;
1636                        rcu_read_unlock();
1637
1638                        if (copy_to_user(arg, &sr, sizeof(sr)))
1639                                return -EFAULT;
1640                        return 0;
1641                }
1642                rcu_read_unlock();
1643                return -EADDRNOTAVAIL;
1644        default:
1645                return -ENOIOCTLCMD;
1646        }
1647}
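/* Illustrative userspace sketch: querying per-vif counters through the
 * SIOCGETVIFCNT ioctl handled above:
 *
 *      struct sioc_vif_req vr = { .vifi = 0 };
 *
 *      if (ioctl(fd, SIOCGETVIFCNT, &vr) == 0)
 *              printf("vif0: %lu pkts in, %lu pkts out\n",
 *                     vr.icount, vr.ocount);
 */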
1648
1649#ifdef CONFIG_COMPAT
1650struct compat_sioc_sg_req {
1651        struct in_addr src;
1652        struct in_addr grp;
1653        compat_ulong_t pktcnt;
1654        compat_ulong_t bytecnt;
1655        compat_ulong_t wrong_if;
1656};
1657
1658struct compat_sioc_vif_req {
1659        vifi_t  vifi;           /* Which iface */
1660        compat_ulong_t icount;
1661        compat_ulong_t ocount;
1662        compat_ulong_t ibytes;
1663        compat_ulong_t obytes;
1664};
1665
1666int ipmr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
1667{
1668        struct compat_sioc_sg_req sr;
1669        struct compat_sioc_vif_req vr;
1670        struct vif_device *vif;
1671        struct mfc_cache *c;
1672        struct net *net = sock_net(sk);
1673        struct mr_table *mrt;
1674
1675        mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
1676        if (!mrt)
1677                return -ENOENT;
1678
1679        switch (cmd) {
1680        case SIOCGETVIFCNT:
1681                if (copy_from_user(&vr, arg, sizeof(vr)))
1682                        return -EFAULT;
1683                if (vr.vifi >= mrt->maxvif)
1684                        return -EINVAL;
1685                vr.vifi = array_index_nospec(vr.vifi, mrt->maxvif);
1686                read_lock(&mrt_lock);
1687                vif = &mrt->vif_table[vr.vifi];
1688                if (VIF_EXISTS(mrt, vr.vifi)) {
1689                        vr.icount = vif->pkt_in;
1690                        vr.ocount = vif->pkt_out;
1691                        vr.ibytes = vif->bytes_in;
1692                        vr.obytes = vif->bytes_out;
1693                        read_unlock(&mrt_lock);
1694
1695                        if (copy_to_user(arg, &vr, sizeof(vr)))
1696                                return -EFAULT;
1697                        return 0;
1698                }
1699                read_unlock(&mrt_lock);
1700                return -EADDRNOTAVAIL;
1701        case SIOCGETSGCNT:
1702                if (copy_from_user(&sr, arg, sizeof(sr)))
1703                        return -EFAULT;
1704
1705                rcu_read_lock();
1706                c = ipmr_cache_find(mrt, sr.src.s_addr, sr.grp.s_addr);
1707                if (c) {
1708                        sr.pktcnt = c->_c.mfc_un.res.pkt;
1709                        sr.bytecnt = c->_c.mfc_un.res.bytes;
1710                        sr.wrong_if = c->_c.mfc_un.res.wrong_if;
1711                        rcu_read_unlock();
1712
1713                        if (copy_to_user(arg, &sr, sizeof(sr)))
1714                                return -EFAULT;
1715                        return 0;
1716                }
1717                rcu_read_unlock();
1718                return -EADDRNOTAVAIL;
1719        default:
1720                return -ENOIOCTLCMD;
1721        }
1722}
1723#endif
1724
1725static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr)
1726{
1727        struct net_device *dev = netdev_notifier_info_to_dev(ptr);
1728        struct net *net = dev_net(dev);
1729        struct mr_table *mrt;
1730        struct vif_device *v;
1731        int ct;
1732
1733        if (event != NETDEV_UNREGISTER)
1734                return NOTIFY_DONE;
1735
1736        ipmr_for_each_table(mrt, net) {
1737                v = &mrt->vif_table[0];
1738                for (ct = 0; ct < mrt->maxvif; ct++, v++) {
1739                        if (v->dev == dev)
1740                                vif_delete(mrt, ct, 1, NULL);
1741                }
1742        }
1743        return NOTIFY_DONE;
1744}
1745
1746static struct notifier_block ip_mr_notifier = {
1747        .notifier_call = ipmr_device_event,
1748};
1749
1750/* Encapsulate a packet by attaching a valid IPIP header to it.
1751 * This avoids tunnel drivers and other mess and gives us the speed so
1752 * important for multicast video.
1753 */
1754static void ip_encap(struct net *net, struct sk_buff *skb,
1755                     __be32 saddr, __be32 daddr)
1756{
1757        struct iphdr *iph;
1758        const struct iphdr *old_iph = ip_hdr(skb);
1759
1760        skb_push(skb, sizeof(struct iphdr));
1761        skb->transport_header = skb->network_header;
1762        skb_reset_network_header(skb);
1763        iph = ip_hdr(skb);
1764
1765        iph->version    =       4;
1766        iph->tos        =       old_iph->tos;
1767        iph->ttl        =       old_iph->ttl;
1768        iph->frag_off   =       0;
1769        iph->daddr      =       daddr;
1770        iph->saddr      =       saddr;
1771        iph->protocol   =       IPPROTO_IPIP;
1772        iph->ihl        =       5;
1773        iph->tot_len    =       htons(skb->len);
1774        ip_select_ident(net, skb, NULL);
1775        ip_send_check(iph);
1776
1777        memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
1778        nf_reset(skb);
1779}
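/* The resulting packet layout after ip_encap() is a minimal static IPIP
 * encapsulation:
 *
 *      | outer iphdr, 20 bytes (ihl = 5, proto = IPPROTO_IPIP) | inner IP packet |
 *
 * with TOS and TTL copied from the inner header and frag_off cleared, so
 * the outer header never carries DF.
 */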
1780
1781static inline int ipmr_forward_finish(struct net *net, struct sock *sk,
1782                                      struct sk_buff *skb)
1783{
1784        struct ip_options *opt = &(IPCB(skb)->opt);
1785
1786        IP_INC_STATS(net, IPSTATS_MIB_OUTFORWDATAGRAMS);
1787        IP_ADD_STATS(net, IPSTATS_MIB_OUTOCTETS, skb->len);
1788
1789        if (unlikely(opt->optlen))
1790                ip_forward_options(skb);
1791
1792        return dst_output(net, sk, skb);
1793}
1794
1795#ifdef CONFIG_NET_SWITCHDEV
1796static bool ipmr_forward_offloaded(struct sk_buff *skb, struct mr_table *mrt,
1797                                   int in_vifi, int out_vifi)
1798{
1799        struct vif_device *out_vif = &mrt->vif_table[out_vifi];
1800        struct vif_device *in_vif = &mrt->vif_table[in_vifi];
1801
1802        if (!skb->offload_mr_fwd_mark)
1803                return false;
1804        if (!out_vif->dev_parent_id.id_len || !in_vif->dev_parent_id.id_len)
1805                return false;
1806        return netdev_phys_item_id_same(&out_vif->dev_parent_id,
1807                                        &in_vif->dev_parent_id);
1808}
1809#else
1810static bool ipmr_forward_offloaded(struct sk_buff *skb, struct mr_table *mrt,
1811                                   int in_vifi, int out_vifi)
1812{
1813        return false;
1814}
1815#endif
1816
1817/* Processing handlers for ipmr_forward */
1818
1819static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
1820                            int in_vifi, struct sk_buff *skb,
1821                            struct mfc_cache *c, int vifi)
1822{
1823        const struct iphdr *iph = ip_hdr(skb);
1824        struct vif_device *vif = &mrt->vif_table[vifi];
1825        struct net_device *dev;
1826        struct rtable *rt;
1827        struct flowi4 fl4;
1828        int    encap = 0;
1829
1830        if (!vif->dev)
1831                goto out_free;
1832
1833        if (vif->flags & VIFF_REGISTER) {
1834                vif->pkt_out++;
1835                vif->bytes_out += skb->len;
1836                vif->dev->stats.tx_bytes += skb->len;
1837                vif->dev->stats.tx_packets++;
1838                ipmr_cache_report(mrt, skb, vifi, IGMPMSG_WHOLEPKT);
1839                goto out_free;
1840        }
1841
1842        if (ipmr_forward_offloaded(skb, mrt, in_vifi, vifi))
1843                goto out_free;
1844
1845        if (vif->flags & VIFF_TUNNEL) {
1846                rt = ip_route_output_ports(net, &fl4, NULL,
1847                                           vif->remote, vif->local,
1848                                           0, 0,
1849                                           IPPROTO_IPIP,
1850                                           RT_TOS(iph->tos), vif->link);
1851                if (IS_ERR(rt))
1852                        goto out_free;
1853                encap = sizeof(struct iphdr);
1854        } else {
1855                rt = ip_route_output_ports(net, &fl4, NULL, iph->daddr, 0,
1856                                           0, 0,
1857                                           IPPROTO_IPIP,
1858                                           RT_TOS(iph->tos), vif->link);
1859                if (IS_ERR(rt))
1860                        goto out_free;
1861        }
1862
1863        dev = rt->dst.dev;
1864
1865        if (skb->len + encap > dst_mtu(&rt->dst) && (ntohs(iph->frag_off) & IP_DF)) {
1866                /* Do not fragment multicasts. Alas, IPv4 does not
1867                 * allow us to send ICMP here, so such packets will
1868                 * simply disappear into a black hole.
1869                 */
1870                IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
1871                ip_rt_put(rt);
1872                goto out_free;
1873        }
1874
1875        encap += LL_RESERVED_SPACE(dev) + rt->dst.header_len;
1876
1877        if (skb_cow(skb, encap)) {
1878                ip_rt_put(rt);
1879                goto out_free;
1880        }
1881
1882        vif->pkt_out++;
1883        vif->bytes_out += skb->len;
1884
1885        skb_dst_drop(skb);
1886        skb_dst_set(skb, &rt->dst);
1887        ip_decrease_ttl(ip_hdr(skb));
1888
1889        /* FIXME: forward and output firewalls used to be called here.
1890         * What do we do with netfilter? -- RR
1891         */
1892        if (vif->flags & VIFF_TUNNEL) {
1893                ip_encap(net, skb, vif->local, vif->remote);
1894                /* FIXME: extra output firewall step used to be here. --RR */
1895                vif->dev->stats.tx_packets++;
1896                vif->dev->stats.tx_bytes += skb->len;
1897        }
1898
1899        IPCB(skb)->flags |= IPSKB_FORWARDED;
1900
1901        /* RFC 1584 teaches that a DVMRP/PIM router must deliver packets locally
1902         * not only before forwarding, but also after forwarding on all output
1903         * interfaces. Clearly, if the mrouter runs a multicast
1904         * program, that program should receive packets regardless of which
1905         * interface it is joined on.
1906         * If we did not do this, the program would have to join on all
1907         * interfaces. On the other hand, a multihomed host (or router, but
1908         * not an mrouter) cannot join on more than one interface - that
1909         * would result in receiving duplicate packets.
1910         */
1911        NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD,
1912                net, NULL, skb, skb->dev, dev,
1913                ipmr_forward_finish);
1914        return;
1915
1916out_free:
1917        kfree_skb(skb);
1918}
1919
1920static int ipmr_find_vif(struct mr_table *mrt, struct net_device *dev)
1921{
1922        int ct;
1923
1924        for (ct = mrt->maxvif-1; ct >= 0; ct--) {
1925                if (mrt->vif_table[ct].dev == dev)
1926                        break;
1927        }
1928        return ct;
1929}
1930
1931/* "local" means that we should preserve one skb (for local delivery) */
1932static void ip_mr_forward(struct net *net, struct mr_table *mrt,
1933                          struct net_device *dev, struct sk_buff *skb,
1934                          struct mfc_cache *c, int local)
1935{
1936        int true_vifi = ipmr_find_vif(mrt, dev);
1937        int psend = -1;
1938        int vif, ct;
1939
1940        vif = c->_c.mfc_parent;
1941        c->_c.mfc_un.res.pkt++;
1942        c->_c.mfc_un.res.bytes += skb->len;
1943        c->_c.mfc_un.res.lastuse = jiffies;
1944
1945        if (c->mfc_origin == htonl(INADDR_ANY) && true_vifi >= 0) {
1946                struct mfc_cache *cache_proxy;
1947
1948                /* For an (*,G) entry, we only check that the incoming
1949                 * interface is part of the static tree.
1950                 */
1951                cache_proxy = mr_mfc_find_any_parent(mrt, vif);
1952                if (cache_proxy &&
1953                    cache_proxy->_c.mfc_un.res.ttls[true_vifi] < 255)
1954                        goto forward;
1955        }
1956
1957        /* Wrong interface: drop packet and (maybe) send PIM assert. */
1958        if (mrt->vif_table[vif].dev != dev) {
1959                if (rt_is_output_route(skb_rtable(skb))) {
1960                        /* It is our own packet, looped back.
1961                         * Very complicated situation...
1962                         *
1963                         * The best workaround until the routing daemons are
1964                         * fixed is not to redistribute a packet if it was
1965                         * sent through the wrong interface. That means
1966                         * multicast applications WILL NOT work for
1967                         * (S,G) entries whose default multicast route points
1968                         * to the wrong oif. In any case, it is not a good
1969                         * idea to run multicast applications on a router.
1970                         */
1971                        goto dont_forward;
1972                }
1973
1974                c->_c.mfc_un.res.wrong_if++;
1975
1976                if (true_vifi >= 0 && mrt->mroute_do_assert &&
1977                    /* PIM-SM uses asserts when switching from RPT to SPT,
1978                     * so we cannot check that the packet arrived on an oif.
1979                     * That is bad, but otherwise we would need to move a
1980                     * pretty large chunk of pimd into the kernel. Ough... --ANK
1981                     */
1982                    (mrt->mroute_do_pim ||
1983                     c->_c.mfc_un.res.ttls[true_vifi] < 255) &&
1984                    time_after(jiffies,
1985                               c->_c.mfc_un.res.last_assert +
1986                               MFC_ASSERT_THRESH)) {
1987                        c->_c.mfc_un.res.last_assert = jiffies;
1988                        ipmr_cache_report(mrt, skb, true_vifi, IGMPMSG_WRONGVIF);
1989                }
1990                goto dont_forward;
1991        }
1992
1993forward:
1994        mrt->vif_table[vif].pkt_in++;
1995        mrt->vif_table[vif].bytes_in += skb->len;
1996
1997        /* Forward the frame */
1998        if (c->mfc_origin == htonl(INADDR_ANY) &&
1999            c->mfc_mcastgrp == htonl(INADDR_ANY)) {
2000                if (true_vifi >= 0 &&
2001                    true_vifi != c->_c.mfc_parent &&
2002                    ip_hdr(skb)->ttl >
2003                                c->_c.mfc_un.res.ttls[c->_c.mfc_parent]) {
2004                        /* It's an (*,*) entry and the packet is not coming from
2005                         * the upstream: forward the packet to the upstream
2006                         * only.
2007                         */
2008                        psend = c->_c.mfc_parent;
2009                        goto last_forward;
2010                }
2011                goto dont_forward;
2012        }
2013        for (ct = c->_c.mfc_un.res.maxvif - 1;
2014             ct >= c->_c.mfc_un.res.minvif; ct--) {
2015                /* For a (*,G) entry, don't forward to the incoming interface */
2016                if ((c->mfc_origin != htonl(INADDR_ANY) ||
2017                     ct != true_vifi) &&
2018                    ip_hdr(skb)->ttl > c->_c.mfc_un.res.ttls[ct]) {
2019                        if (psend != -1) {
2020                                struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
2021
2022                                if (skb2)
2023                                        ipmr_queue_xmit(net, mrt, true_vifi,
2024                                                        skb2, c, psend);
2025                        }
2026                        psend = ct;
2027                }
2028        }
2029last_forward:
2030        if (psend != -1) {
2031                if (local) {
2032                        struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
2033
2034                        if (skb2)
2035                                ipmr_queue_xmit(net, mrt, true_vifi, skb2,
2036                                                c, psend);
2037                } else {
2038                        ipmr_queue_xmit(net, mrt, true_vifi, skb, c, psend);
2039                        return;
2040                }
2041        }
2042
2043dont_forward:
2044        if (!local)
2045                kfree_skb(skb);
2046}
2047
2048static struct mr_table *ipmr_rt_fib_lookup(struct net *net, struct sk_buff *skb)
2049{
2050        struct rtable *rt = skb_rtable(skb);
2051        struct iphdr *iph = ip_hdr(skb);
2052        struct flowi4 fl4 = {
2053                .daddr = iph->daddr,
2054                .saddr = iph->saddr,
2055                .flowi4_tos = RT_TOS(iph->tos),
2056                .flowi4_oif = (rt_is_output_route(rt) ?
2057                               skb->dev->ifindex : 0),
2058                .flowi4_iif = (rt_is_output_route(rt) ?
2059                               LOOPBACK_IFINDEX :
2060                               skb->dev->ifindex),
2061                .flowi4_mark = skb->mark,
2062        };
2063        struct mr_table *mrt;
2064        int err;
2065
2066        err = ipmr_fib_lookup(net, &fl4, &mrt);
2067        if (err)
2068                return ERR_PTR(err);
2069        return mrt;
2070}
2071
2072/* Multicast packets for forwarding arrive here.
2073 * Called with rcu_read_lock() held.
2074 */
2075int ip_mr_input(struct sk_buff *skb)
2076{
2077        struct mfc_cache *cache;
2078        struct net *net = dev_net(skb->dev);
2079        int local = skb_rtable(skb)->rt_flags & RTCF_LOCAL;
2080        struct mr_table *mrt;
2081        struct net_device *dev;
2082
2083        /* skb->dev passed in is the loX master dev for vrfs.
2084         * As there are no vifs associated with loopback devices,
2085         * get the proper interface that does have a vif associated with it.
2086         */
2087        dev = skb->dev;
2088        if (netif_is_l3_master(skb->dev)) {
2089                dev = dev_get_by_index_rcu(net, IPCB(skb)->iif);
2090                if (!dev) {
2091                        kfree_skb(skb);
2092                        return -ENODEV;
2093                }
2094        }
2095
2096        /* A packet looped back after forwarding must not be forwarded
2097         * a second time, but it can still be delivered locally.
2098         */
2099        if (IPCB(skb)->flags & IPSKB_FORWARDED)
2100                goto dont_forward;
2101
2102        mrt = ipmr_rt_fib_lookup(net, skb);
2103        if (IS_ERR(mrt)) {
2104                kfree_skb(skb);
2105                return PTR_ERR(mrt);
2106        }
2107        if (!local) {
2108                if (IPCB(skb)->opt.router_alert) {
2109                        if (ip_call_ra_chain(skb))
2110                                return 0;
2111                } else if (ip_hdr(skb)->protocol == IPPROTO_IGMP) {
2112                        /* IGMPv1 (and broken IGMPv2 implementations such as
2113                         * Cisco IOS <= 11.2(8)) do not put the router alert
2114                         * option into IGMP packets destined for routable
2115                         * groups. This is very bad, because it means
2116                         * that we can forward NO IGMP messages.
2117                         */
2118                        struct sock *mroute_sk;
2119
2120                        mroute_sk = rcu_dereference(mrt->mroute_sk);
2121                        if (mroute_sk) {
2122                                nf_reset(skb);
2123                                raw_rcv(mroute_sk, skb);
2124                                return 0;
2125                        }
2126                }
2127        }
2128
2129        /* already under rcu_read_lock() */
2130        cache = ipmr_cache_find(mrt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr);
2131        if (!cache) {
2132                int vif = ipmr_find_vif(mrt, dev);
2133
2134                if (vif >= 0)
2135                        cache = ipmr_cache_find_any(mrt, ip_hdr(skb)->daddr,
2136                                                    vif);
2137        }
2138
2139        /* No usable cache entry */
2140        if (!cache) {
2141                int vif;
2142
2143                if (local) {
2144                        struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
2145                        ip_local_deliver(skb);
2146                        if (!skb2)
2147                                return -ENOBUFS;
2148                        skb = skb2;
2149                }
2150
2151                read_lock(&mrt_lock);
2152                vif = ipmr_find_vif(mrt, dev);
2153                if (vif >= 0) {
2154                        int err2 = ipmr_cache_unresolved(mrt, vif, skb, dev);
2155                        read_unlock(&mrt_lock);
2156
2157                        return err2;
2158                }
2159                read_unlock(&mrt_lock);
2160                kfree_skb(skb);
2161                return -ENODEV;
2162        }
2163
2164        read_lock(&mrt_lock);
2165        ip_mr_forward(net, mrt, dev, skb, cache, local);
2166        read_unlock(&mrt_lock);
2167
2168        if (local)
2169                return ip_local_deliver(skb);
2170
2171        return 0;
2172
2173dont_forward:
2174        if (local)
2175                return ip_local_deliver(skb);
2176        kfree_skb(skb);
2177        return 0;
2178}
2179
2180#ifdef CONFIG_IP_PIMSM_V1
2181/* Handle IGMP messages of PIMv1 */
2182int pim_rcv_v1(struct sk_buff *skb)
2183{
2184        struct igmphdr *pim;
2185        struct net *net = dev_net(skb->dev);
2186        struct mr_table *mrt;
2187
2188        if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(struct iphdr)))
2189                goto drop;
2190
2191        pim = igmp_hdr(skb);
2192
2193        mrt = ipmr_rt_fib_lookup(net, skb);
2194        if (IS_ERR(mrt))
2195                goto drop;
2196        if (!mrt->mroute_do_pim ||
2197            pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER)
2198                goto drop;
2199
2200        if (__pim_rcv(mrt, skb, sizeof(*pim))) {
2201drop:
2202                kfree_skb(skb);
2203        }
2204        return 0;
2205}
2206#endif
2207
2208#ifdef CONFIG_IP_PIMSM_V2
2209static int pim_rcv(struct sk_buff *skb)
2210{
2211        struct pimreghdr *pim;
2212        struct net *net = dev_net(skb->dev);
2213        struct mr_table *mrt;
2214
2215        if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(struct iphdr)))
2216                goto drop;
2217
2218        pim = (struct pimreghdr *)skb_transport_header(skb);
2219        if (pim->type != ((PIM_VERSION << 4) | (PIM_TYPE_REGISTER)) ||
2220            (pim->flags & PIM_NULL_REGISTER) ||
2221            (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 &&
2222             csum_fold(skb_checksum(skb, 0, skb->len, 0))))
2223                goto drop;
2224
2225        mrt = ipmr_rt_fib_lookup(net, skb);
2226        if (IS_ERR(mrt))
2227                goto drop;
2228        if (__pim_rcv(mrt, skb, sizeof(*pim))) {
2229drop:
2230                kfree_skb(skb);
2231        }
2232        return 0;
2233}
2234#endif
2235
2236int ipmr_get_route(struct net *net, struct sk_buff *skb,
2237                   __be32 saddr, __be32 daddr,
2238                   struct rtmsg *rtm, u32 portid)
2239{
2240        struct mfc_cache *cache;
2241        struct mr_table *mrt;
2242        int err;
2243
2244        mrt = ipmr_get_table(net, RT_TABLE_DEFAULT);
2245        if (!mrt)
2246                return -ENOENT;
2247
2248        rcu_read_lock();
2249        cache = ipmr_cache_find(mrt, saddr, daddr);
2250        if (!cache && skb->dev) {
2251                int vif = ipmr_find_vif(mrt, skb->dev);
2252
2253                if (vif >= 0)
2254                        cache = ipmr_cache_find_any(mrt, daddr, vif);
2255        }
2256        if (!cache) {
2257                struct sk_buff *skb2;
2258                struct iphdr *iph;
2259                struct net_device *dev;
2260                int vif = -1;
2261
2262                dev = skb->dev;
2263                read_lock(&mrt_lock);
2264                if (dev)
2265                        vif = ipmr_find_vif(mrt, dev);
2266                if (vif < 0) {
2267                        read_unlock(&mrt_lock);
2268                        rcu_read_unlock();
2269                        return -ENODEV;
2270                }
2271                skb2 = skb_clone(skb, GFP_ATOMIC);
2272                if (!skb2) {
2273                        read_unlock(&mrt_lock);
2274                        rcu_read_unlock();
2275                        return -ENOMEM;
2276                }
2277
2278                NETLINK_CB(skb2).portid = portid;
2279                skb_push(skb2, sizeof(struct iphdr));
2280                skb_reset_network_header(skb2);
2281                iph = ip_hdr(skb2);
2282                iph->ihl = sizeof(struct iphdr) >> 2;
2283                iph->saddr = saddr;
2284                iph->daddr = daddr;
2285                iph->version = 0;
2286                err = ipmr_cache_unresolved(mrt, vif, skb2, dev);
2287                read_unlock(&mrt_lock);
2288                rcu_read_unlock();
2289                return err;
2290        }
2291
2292        read_lock(&mrt_lock);
2293        err = mr_fill_mroute(mrt, skb, &cache->_c, rtm);
2294        read_unlock(&mrt_lock);
2295        rcu_read_unlock();
2296        return err;
2297}
2298
2299static int ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
2300                            u32 portid, u32 seq, struct mfc_cache *c, int cmd,
2301                            int flags)
2302{
2303        struct nlmsghdr *nlh;
2304        struct rtmsg *rtm;
2305        int err;
2306
2307        nlh = nlmsg_put(skb, portid, seq, cmd, sizeof(*rtm), flags);
2308        if (!nlh)
2309                return -EMSGSIZE;
2310
2311        rtm = nlmsg_data(nlh);
2312        rtm->rtm_family   = RTNL_FAMILY_IPMR;
2313        rtm->rtm_dst_len  = 32;
2314        rtm->rtm_src_len  = 32;
2315        rtm->rtm_tos      = 0;
2316        rtm->rtm_table    = mrt->id;
2317        if (nla_put_u32(skb, RTA_TABLE, mrt->id))
2318                goto nla_put_failure;
2319        rtm->rtm_type     = RTN_MULTICAST;
2320        rtm->rtm_scope    = RT_SCOPE_UNIVERSE;
2321        if (c->_c.mfc_flags & MFC_STATIC)
2322                rtm->rtm_protocol = RTPROT_STATIC;
2323        else
2324                rtm->rtm_protocol = RTPROT_MROUTED;
2325        rtm->rtm_flags    = 0;
2326
2327        if (nla_put_in_addr(skb, RTA_SRC, c->mfc_origin) ||
2328            nla_put_in_addr(skb, RTA_DST, c->mfc_mcastgrp))
2329                goto nla_put_failure;
2330        err = mr_fill_mroute(mrt, skb, &c->_c, rtm);
2331        /* do not break the dump if cache is unresolved */
2332        if (err < 0 && err != -ENOENT)
2333                goto nla_put_failure;
2334
2335        nlmsg_end(skb, nlh);
2336        return 0;
2337
2338nla_put_failure:
2339        nlmsg_cancel(skb, nlh);
2340        return -EMSGSIZE;
2341}
2342
2343static int _ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
2344                             u32 portid, u32 seq, struct mr_mfc *c, int cmd,
2345                             int flags)
2346{
2347        return ipmr_fill_mroute(mrt, skb, portid, seq, (struct mfc_cache *)c,
2348                                cmd, flags);
2349}
2350
2351static size_t mroute_msgsize(bool unresolved, int maxvif)
2352{
2353        size_t len =
2354                NLMSG_ALIGN(sizeof(struct rtmsg))
2355                + nla_total_size(4)     /* RTA_TABLE */
2356                + nla_total_size(4)     /* RTA_SRC */
2357                + nla_total_size(4)     /* RTA_DST */
2358                ;
2359
2360        if (!unresolved)
2361                len = len
2362                      + nla_total_size(4)       /* RTA_IIF */
2363                      + nla_total_size(0)       /* RTA_MULTIPATH */
2364                      + maxvif * NLA_ALIGN(sizeof(struct rtnexthop))
2365                                                /* RTA_MFC_STATS */
2366                      + nla_total_size_64bit(sizeof(struct rta_mfc_stats))
2367                ;
2368
2369        return len;
2370}
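/* Worked example: for a resolved entry with maxvif == 2 the above is
 *
 *      NLMSG_ALIGN(sizeof(struct rtmsg))
 *      + 4 * nla_total_size(4)                         (TABLE, SRC, DST, IIF)
 *      + nla_total_size(0)                             (RTA_MULTIPATH nest)
 *      + 2 * NLA_ALIGN(sizeof(struct rtnexthop))       (one rtnexthop per vif)
 *      + nla_total_size_64bit(sizeof(struct rta_mfc_stats))
 *
 * which lets callers size the notification skb up front.
 */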
2371
2372static void mroute_netlink_event(struct mr_table *mrt, struct mfc_cache *mfc,
2373                                 int cmd)
2374{
2375        struct net *net = read_pnet(&mrt->net);
2376        struct sk_buff *skb;
2377        int err = -ENOBUFS;
2378
2379        skb = nlmsg_new(mroute_msgsize(mfc->_c.mfc_parent >= MAXVIFS,
2380                                       mrt->maxvif),
2381                        GFP_ATOMIC);
2382        if (!skb)
2383                goto errout;
2384
2385        err = ipmr_fill_mroute(mrt, skb, 0, 0, mfc, cmd, 0);
2386        if (err < 0)
2387                goto errout;
2388
2389        rtnl_notify(skb, net, 0, RTNLGRP_IPV4_MROUTE, NULL, GFP_ATOMIC);
2390        return;
2391
2392errout:
2393        kfree_skb(skb);
2394        if (err < 0)
2395                rtnl_set_sk_err(net, RTNLGRP_IPV4_MROUTE, err);
2396}
2397
2398static size_t igmpmsg_netlink_msgsize(size_t payloadlen)
2399{
2400        size_t len =
2401                NLMSG_ALIGN(sizeof(struct rtgenmsg))
2402                + nla_total_size(1)     /* IPMRA_CREPORT_MSGTYPE */
2403                + nla_total_size(4)     /* IPMRA_CREPORT_VIF_ID */
2404                + nla_total_size(4)     /* IPMRA_CREPORT_SRC_ADDR */
2405                + nla_total_size(4)     /* IPMRA_CREPORT_DST_ADDR */
2406                                        /* IPMRA_CREPORT_PKT */
2407                + nla_total_size(payloadlen)
2408                ;
2409
2410        return len;
2411}
2412
2413static void igmpmsg_netlink_event(struct mr_table *mrt, struct sk_buff *pkt)
2414{
2415        struct net *net = read_pnet(&mrt->net);
2416        struct nlmsghdr *nlh;
2417        struct rtgenmsg *rtgenm;
2418        struct igmpmsg *msg;
2419        struct sk_buff *skb;
2420        struct nlattr *nla;
2421        int payloadlen;
2422
2423        payloadlen = pkt->len - sizeof(struct igmpmsg);
2424        msg = (struct igmpmsg *)skb_network_header(pkt);
2425
2426        skb = nlmsg_new(igmpmsg_netlink_msgsize(payloadlen), GFP_ATOMIC);
2427        if (!skb)
2428                goto errout;
2429
2430        nlh = nlmsg_put(skb, 0, 0, RTM_NEWCACHEREPORT,
2431                        sizeof(struct rtgenmsg), 0);
2432        if (!nlh)
2433                goto errout;
2434        rtgenm = nlmsg_data(nlh);
2435        rtgenm->rtgen_family = RTNL_FAMILY_IPMR;
2436        if (nla_put_u8(skb, IPMRA_CREPORT_MSGTYPE, msg->im_msgtype) ||
2437            nla_put_u32(skb, IPMRA_CREPORT_VIF_ID, msg->im_vif) ||
2438            nla_put_in_addr(skb, IPMRA_CREPORT_SRC_ADDR,
2439                            msg->im_src.s_addr) ||
2440            nla_put_in_addr(skb, IPMRA_CREPORT_DST_ADDR,
2441                            msg->im_dst.s_addr))
2442                goto nla_put_failure;
2443
2444        nla = nla_reserve(skb, IPMRA_CREPORT_PKT, payloadlen);
2445        if (!nla || skb_copy_bits(pkt, sizeof(struct igmpmsg),
2446                                  nla_data(nla), payloadlen))
2447                goto nla_put_failure;
2448
2449        nlmsg_end(skb, nlh);
2450
2451        rtnl_notify(skb, net, 0, RTNLGRP_IPV4_MROUTE_R, NULL, GFP_ATOMIC);
2452        return;
2453
2454nla_put_failure:
2455        nlmsg_cancel(skb, nlh);
2456errout:
2457        kfree_skb(skb);
2458        rtnl_set_sk_err(net, RTNLGRP_IPV4_MROUTE_R, -ENOBUFS);
2459}
2460
2461static int ipmr_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
2462                             struct netlink_ext_ack *extack)
2463{
2464        struct net *net = sock_net(in_skb->sk);
2465        struct nlattr *tb[RTA_MAX + 1];
2466        struct sk_buff *skb = NULL;
2467        struct mfc_cache *cache;
2468        struct mr_table *mrt;
2469        struct rtmsg *rtm;
2470        __be32 src, grp;
2471        u32 tableid;
2472        int err;
2473
2474        err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX,
2475                          rtm_ipv4_policy, extack);
2476        if (err < 0)
2477                goto errout;
2478
2479        rtm = nlmsg_data(nlh);
2480
2481        src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
2482        grp = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
2483        tableid = tb[RTA_TABLE] ? nla_get_u32(tb[RTA_TABLE]) : 0;
2484
2485        mrt = ipmr_get_table(net, tableid ? tableid : RT_TABLE_DEFAULT);
2486        if (!mrt) {
2487                err = -ENOENT;
2488                goto errout_free;
2489        }
2490
2491        /* entries are added/deleted only under RTNL */
2492        rcu_read_lock();
2493        cache = ipmr_cache_find(mrt, src, grp);
2494        rcu_read_unlock();
2495        if (!cache) {
2496                err = -ENOENT;
2497                goto errout_free;
2498        }
2499
2500        skb = nlmsg_new(mroute_msgsize(false, mrt->maxvif), GFP_KERNEL);
2501        if (!skb) {
2502                err = -ENOBUFS;
2503                goto errout_free;
2504        }
2505
2506        err = ipmr_fill_mroute(mrt, skb, NETLINK_CB(in_skb).portid,
2507                               nlh->nlmsg_seq, cache,
2508                               RTM_NEWROUTE, 0);
2509        if (err < 0)
2510                goto errout_free;
2511
2512        err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2513
2514errout:
2515        return err;
2516
2517errout_free:
2518        kfree_skb(skb);
2519        goto errout;
2520}
2521
2522static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
2523{
2524        return mr_rtm_dumproute(skb, cb, ipmr_mr_table_iter,
2525                                _ipmr_fill_mroute, &mfc_unres_lock);
2526}
2527
2528static const struct nla_policy rtm_ipmr_policy[RTA_MAX + 1] = {
2529        [RTA_SRC]       = { .type = NLA_U32 },
2530        [RTA_DST]       = { .type = NLA_U32 },
2531        [RTA_IIF]       = { .type = NLA_U32 },
2532        [RTA_TABLE]     = { .type = NLA_U32 },
2533        [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) },
2534};
2535
2536static bool ipmr_rtm_validate_proto(unsigned char rtm_protocol)
2537{
2538        switch (rtm_protocol) {
2539        case RTPROT_STATIC:
2540        case RTPROT_MROUTED:
2541                return true;
2542        }
2543        return false;
2544}
2545
2546static int ipmr_nla_get_ttls(const struct nlattr *nla, struct mfcctl *mfcc)
2547{
2548        struct rtnexthop *rtnh = nla_data(nla);
2549        int remaining = nla_len(nla), vifi = 0;
2550
2551        while (rtnh_ok(rtnh, remaining)) {
2552                mfcc->mfcc_ttls[vifi] = rtnh->rtnh_hops;
2553                if (++vifi == MAXVIFS)
2554                        break;
2555                rtnh = rtnh_next(rtnh, &remaining);
2556        }
2557
2558        return remaining > 0 ? -EINVAL : vifi;
2559}
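/* Example: an RTA_MULTIPATH attribute holding two rtnexthop entries with
 * rtnh_hops = 1 and rtnh_hops = 255 yields mfcc_ttls[0] = 1,
 * mfcc_ttls[1] = 255 and a return value of 2; trailing bytes that do not
 * form a complete rtnexthop make the whole attribute -EINVAL.
 */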
2560
2561/* returns < 0 on error, 0 for ADD_MFC and 1 for ADD_MFC_PROXY */
2562static int rtm_to_ipmr_mfcc(struct net *net, struct nlmsghdr *nlh,
2563                            struct mfcctl *mfcc, int *mrtsock,
2564                            struct mr_table **mrtret,
2565                            struct netlink_ext_ack *extack)
2566{
2567        struct net_device *dev = NULL;
2568        u32 tblid = RT_TABLE_DEFAULT;
2569        struct mr_table *mrt;
2570        struct nlattr *attr;
2571        struct rtmsg *rtm;
2572        int ret, rem;
2573
2574        ret = nlmsg_validate(nlh, sizeof(*rtm), RTA_MAX, rtm_ipmr_policy,
2575                             extack);
2576        if (ret < 0)
2577                goto out;
2578        rtm = nlmsg_data(nlh);
2579
2580        ret = -EINVAL;
2581        if (rtm->rtm_family != RTNL_FAMILY_IPMR || rtm->rtm_dst_len != 32 ||
2582            rtm->rtm_type != RTN_MULTICAST ||
2583            rtm->rtm_scope != RT_SCOPE_UNIVERSE ||
2584            !ipmr_rtm_validate_proto(rtm->rtm_protocol))
2585                goto out;
2586
2587        memset(mfcc, 0, sizeof(*mfcc));
2588        mfcc->mfcc_parent = -1;
2589        ret = 0;
2590        nlmsg_for_each_attr(attr, nlh, sizeof(struct rtmsg), rem) {
2591                switch (nla_type(attr)) {
2592                case RTA_SRC:
2593                        mfcc->mfcc_origin.s_addr = nla_get_be32(attr);
2594                        break;
2595                case RTA_DST:
2596                        mfcc->mfcc_mcastgrp.s_addr = nla_get_be32(attr);
2597                        break;
2598                case RTA_IIF:
2599                        dev = __dev_get_by_index(net, nla_get_u32(attr));
2600                        if (!dev) {
2601                                ret = -ENODEV;
2602                                goto out;
2603                        }
2604                        break;
2605                case RTA_MULTIPATH:
2606                        if (ipmr_nla_get_ttls(attr, mfcc) < 0) {
2607                                ret = -EINVAL;
2608                                goto out;
2609                        }
2610                        break;
2611                case RTA_PREFSRC:
2612                        ret = 1;
2613                        break;
2614                case RTA_TABLE:
2615                        tblid = nla_get_u32(attr);
2616                        break;
2617                }
2618        }
2619        mrt = ipmr_get_table(net, tblid);
2620        if (!mrt) {
2621                ret = -ENOENT;
2622                goto out;
2623        }
2624        *mrtret = mrt;
2625        *mrtsock = rtm->rtm_protocol == RTPROT_MROUTED ? 1 : 0;
2626        if (dev)
2627                mfcc->mfcc_parent = ipmr_find_vif(mrt, dev);
2628
2629out:
2630        return ret;
2631}
2632
2633/* takes care of both newroute and delroute */
2634static int ipmr_rtm_route(struct sk_buff *skb, struct nlmsghdr *nlh,
2635                          struct netlink_ext_ack *extack)
2636{
2637        struct net *net = sock_net(skb->sk);
2638        int ret, mrtsock, parent;
2639        struct mr_table *tbl;
2640        struct mfcctl mfcc;
2641
2642        mrtsock = 0;
2643        tbl = NULL;
2644        ret = rtm_to_ipmr_mfcc(net, nlh, &mfcc, &mrtsock, &tbl, extack);
2645        if (ret < 0)
2646                return ret;
2647
2648        parent = ret ? mfcc.mfcc_parent : -1;
2649        if (nlh->nlmsg_type == RTM_NEWROUTE)
2650                return ipmr_mfc_add(net, tbl, &mfcc, mrtsock, parent);
2651        else
2652                return ipmr_mfc_delete(tbl, &mfcc, parent);
2653}
2654
2655static bool ipmr_fill_table(struct mr_table *mrt, struct sk_buff *skb)
2656{
2657        u32 queue_len = atomic_read(&mrt->cache_resolve_queue_len);
2658
2659        if (nla_put_u32(skb, IPMRA_TABLE_ID, mrt->id) ||
2660            nla_put_u32(skb, IPMRA_TABLE_CACHE_RES_QUEUE_LEN, queue_len) ||
2661            nla_put_s32(skb, IPMRA_TABLE_MROUTE_REG_VIF_NUM,
2662                        mrt->mroute_reg_vif_num) ||
2663            nla_put_u8(skb, IPMRA_TABLE_MROUTE_DO_ASSERT,
2664                       mrt->mroute_do_assert) ||
2665            nla_put_u8(skb, IPMRA_TABLE_MROUTE_DO_PIM, mrt->mroute_do_pim))
2666                return false;
2667
2668        return true;
2669}
2670
2671static bool ipmr_fill_vif(struct mr_table *mrt, u32 vifid, struct sk_buff *skb)
2672{
2673        struct nlattr *vif_nest;
2674        struct vif_device *vif;
2675
2676        /* if the VIF doesn't exist just continue */
2677        if (!VIF_EXISTS(mrt, vifid))
2678                return true;
2679
2680        vif = &mrt->vif_table[vifid];
2681        vif_nest = nla_nest_start(skb, IPMRA_VIF);
2682        if (!vif_nest)
2683                return false;
2684        if (nla_put_u32(skb, IPMRA_VIFA_IFINDEX, vif->dev->ifindex) ||
2685            nla_put_u32(skb, IPMRA_VIFA_VIF_ID, vifid) ||
2686            nla_put_u16(skb, IPMRA_VIFA_FLAGS, vif->flags) ||
2687            nla_put_u64_64bit(skb, IPMRA_VIFA_BYTES_IN, vif->bytes_in,
2688                              IPMRA_VIFA_PAD) ||
2689            nla_put_u64_64bit(skb, IPMRA_VIFA_BYTES_OUT, vif->bytes_out,
2690                              IPMRA_VIFA_PAD) ||
2691            nla_put_u64_64bit(skb, IPMRA_VIFA_PACKETS_IN, vif->pkt_in,
2692                              IPMRA_VIFA_PAD) ||
2693            nla_put_u64_64bit(skb, IPMRA_VIFA_PACKETS_OUT, vif->pkt_out,
2694                              IPMRA_VIFA_PAD) ||
2695            nla_put_be32(skb, IPMRA_VIFA_LOCAL_ADDR, vif->local) ||
2696            nla_put_be32(skb, IPMRA_VIFA_REMOTE_ADDR, vif->remote)) {
2697                nla_nest_cancel(skb, vif_nest);
2698                return false;
2699        }
2700        nla_nest_end(skb, vif_nest);
2701
2702        return true;
2703}
2704
2705static int ipmr_rtm_dumplink(struct sk_buff *skb, struct netlink_callback *cb)
2706{
2707        struct net *net = sock_net(skb->sk);
2708        struct nlmsghdr *nlh = NULL;
2709        unsigned int t = 0, s_t;
2710        unsigned int e = 0, s_e;
2711        struct mr_table *mrt;
2712
2713        s_t = cb->args[0];
2714        s_e = cb->args[1];
2715
2716        ipmr_for_each_table(mrt, net) {
2717                struct nlattr *vifs, *af;
2718                struct ifinfomsg *hdr;
2719                u32 i;
2720
2721                if (t < s_t)
2722                        goto skip_table;
2723                nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid,
2724                                cb->nlh->nlmsg_seq, RTM_NEWLINK,
2725                                sizeof(*hdr), NLM_F_MULTI);
2726                if (!nlh)
2727                        break;
2728
2729                hdr = nlmsg_data(nlh);
2730                memset(hdr, 0, sizeof(*hdr));
2731                hdr->ifi_family = RTNL_FAMILY_IPMR;
2732
2733                af = nla_nest_start(skb, IFLA_AF_SPEC);
2734                if (!af) {
2735                        nlmsg_cancel(skb, nlh);
2736                        goto out;
2737                }
2738
2739                if (!ipmr_fill_table(mrt, skb)) {
2740                        nlmsg_cancel(skb, nlh);
2741                        goto out;
2742                }
2743
2744                vifs = nla_nest_start(skb, IPMRA_TABLE_VIFS);
2745                if (!vifs) {
2746                        nla_nest_end(skb, af);
2747                        nlmsg_end(skb, nlh);
2748                        goto out;
2749                }
2750                for (i = 0; i < mrt->maxvif; i++) {
2751                        if (e < s_e)
2752                                goto skip_entry;
2753                        if (!ipmr_fill_vif(mrt, i, skb)) {
2754                                nla_nest_end(skb, vifs);
2755                                nla_nest_end(skb, af);
2756                                nlmsg_end(skb, nlh);
2757                                goto out;
2758                        }
2759skip_entry:
2760                        e++;
2761                }
2762                s_e = 0;
2763                e = 0;
2764                nla_nest_end(skb, vifs);
2765                nla_nest_end(skb, af);
2766                nlmsg_end(skb, nlh);
2767skip_table:
2768                t++;
2769        }
2770
2771out:
2772        cb->args[1] = e;
2773        cb->args[0] = t;
2774
2775        return skb->len;
2776}
2777
2778#ifdef CONFIG_PROC_FS
2779/* The /proc interfaces to multicast routing:
2780 * /proc/net/ip_mr_cache & /proc/net/ip_mr_vif
2781 */
2782
2783static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos)
2784        __acquires(mrt_lock)
2785{
2786        struct mr_vif_iter *iter = seq->private;
2787        struct net *net = seq_file_net(seq);
2788        struct mr_table *mrt;
2789
2790        mrt = ipmr_get_table(net, RT_TABLE_DEFAULT);
2791        if (!mrt)
2792                return ERR_PTR(-ENOENT);
2793
2794        iter->mrt = mrt;
2795
2796        read_lock(&mrt_lock);
2797        return mr_vif_seq_start(seq, pos);
2798}
2799
2800static void ipmr_vif_seq_stop(struct seq_file *seq, void *v)
2801        __releases(mrt_lock)
2802{
2803        read_unlock(&mrt_lock);
2804}
2805
2806static int ipmr_vif_seq_show(struct seq_file *seq, void *v)
2807{
2808        struct mr_vif_iter *iter = seq->private;
2809        struct mr_table *mrt = iter->mrt;
2810
2811        if (v == SEQ_START_TOKEN) {
2812                seq_puts(seq,
2813                         "Interface      BytesIn  PktsIn  BytesOut PktsOut Flags Local    Remote\n");
2814        } else {
2815                const struct vif_device *vif = v;
2816                const char *name =  vif->dev ?
2817                                    vif->dev->name : "none";
2818
2819                seq_printf(seq,
2820                           "%2td %-10s %8ld %7ld  %8ld %7ld %05X %08X %08X\n",
2821                           vif - mrt->vif_table,
2822                           name, vif->bytes_in, vif->pkt_in,
2823                           vif->bytes_out, vif->pkt_out,
2824                           vif->flags, vif->local, vif->remote);
2825        }
2826        return 0;
2827}
2828
2829static const struct seq_operations ipmr_vif_seq_ops = {
2830        .start = ipmr_vif_seq_start,
2831        .next  = mr_vif_seq_next,
2832        .stop  = ipmr_vif_seq_stop,
2833        .show  = ipmr_vif_seq_show,
2834};
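/* Sample /proc/net/ip_mr_vif output (values illustrative; addresses are
 * printed as raw hex words by the format string above):
 *
 *      Interface      BytesIn  PktsIn  BytesOut PktsOut Flags Local    Remote
 *       0 eth0          123456     789         0       0 00000 C0000201 00000000
 */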
2835
2836static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos)
2837{
2838        struct net *net = seq_file_net(seq);
2839        struct mr_table *mrt;
2840
2841        mrt = ipmr_get_table(net, RT_TABLE_DEFAULT);
2842        if (!mrt)
2843                return ERR_PTR(-ENOENT);
2844
2845        return mr_mfc_seq_start(seq, pos, mrt, &mfc_unres_lock);
2846}
2847
2848static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
2849{
2850        int n;
2851
2852        if (v == SEQ_START_TOKEN) {
2853                seq_puts(seq,
2854                 "Group    Origin   Iif     Pkts    Bytes    Wrong Oifs\n");
2855        } else {
2856                const struct mfc_cache *mfc = v;
2857                const struct mr_mfc_iter *it = seq->private;
2858                const struct mr_table *mrt = it->mrt;
2859
2860                seq_printf(seq, "%08X %08X %-3hd",
2861                           (__force u32) mfc->mfc_mcastgrp,
2862                           (__force u32) mfc->mfc_origin,
2863                           mfc->_c.mfc_parent);
2864
2865                if (it->cache != &mrt->mfc_unres_queue) {
2866                        seq_printf(seq, " %8lu %8lu %8lu",
2867                                   mfc->_c.mfc_un.res.pkt,
2868                                   mfc->_c.mfc_un.res.bytes,
2869                                   mfc->_c.mfc_un.res.wrong_if);
2870                        for (n = mfc->_c.mfc_un.res.minvif;
2871                             n < mfc->_c.mfc_un.res.maxvif; n++) {
2872                                if (VIF_EXISTS(mrt, n) &&
2873                                    mfc->_c.mfc_un.res.ttls[n] < 255)
2874                                        seq_printf(seq,
2875                                           " %2d:%-3d",
2876                                           n, mfc->_c.mfc_un.res.ttls[n]);
2877                        }
2878                } else {
2879                        /* unresolved mfc_caches don't contain
2880                         * pkt, bytes and wrong_if values
2881                         */
2882                        seq_printf(seq, " %8lu %8lu %8lu", 0ul, 0ul, 0ul);
2883                }
2884                seq_putc(seq, '\n');
2885        }
2886        return 0;
2887}
2888
2889static const struct seq_operations ipmr_mfc_seq_ops = {
2890        .start = ipmr_mfc_seq_start,
2891        .next  = mr_mfc_seq_next,
2892        .stop  = mr_mfc_seq_stop,
2893        .show  = ipmr_mfc_seq_show,
2894};
2895#endif
2896
2897#ifdef CONFIG_IP_PIMSM_V2
2898static const struct net_protocol pim_protocol = {
2899        .handler        =       pim_rcv,
2900        .netns_ok       =       1,
2901};
2902#endif
2903
2904static unsigned int ipmr_seq_read(struct net *net)
2905{
2906        ASSERT_RTNL();
2907
2908        return net->ipv4.ipmr_seq + ipmr_rules_seq_read(net);
2909}
2910
2911static int ipmr_dump(struct net *net, struct notifier_block *nb)
2912{
2913        return mr_dump(net, nb, RTNL_FAMILY_IPMR, ipmr_rules_dump,
2914                       ipmr_mr_table_iter, &mrt_lock);
2915}
2916
2917static const struct fib_notifier_ops ipmr_notifier_ops_template = {
2918        .family         = RTNL_FAMILY_IPMR,
2919        .fib_seq_read   = ipmr_seq_read,
2920        .fib_dump       = ipmr_dump,
2921        .owner          = THIS_MODULE,
2922};
2923
2924static int __net_init ipmr_notifier_init(struct net *net)
2925{
2926        struct fib_notifier_ops *ops;
2927
2928        net->ipv4.ipmr_seq = 0;
2929
2930        ops = fib_notifier_ops_register(&ipmr_notifier_ops_template, net);
2931        if (IS_ERR(ops))
2932                return PTR_ERR(ops);
2933        net->ipv4.ipmr_notifier_ops = ops;
2934
2935        return 0;
2936}
2937
2938static void __net_exit ipmr_notifier_exit(struct net *net)
2939{
2940        fib_notifier_ops_unregister(net->ipv4.ipmr_notifier_ops);
2941        net->ipv4.ipmr_notifier_ops = NULL;
2942}
2943
2944/* Setup for IP multicast routing */
2945static int __net_init ipmr_net_init(struct net *net)
2946{
2947        int err;
2948
2949        err = ipmr_notifier_init(net);
2950        if (err)
2951                goto ipmr_notifier_fail;
2952
2953        err = ipmr_rules_init(net);
2954        if (err < 0)
2955                goto ipmr_rules_fail;
2956
2957#ifdef CONFIG_PROC_FS
2958        err = -ENOMEM;
2959        if (!proc_create_net("ip_mr_vif", 0, net->proc_net, &ipmr_vif_seq_ops,
2960                        sizeof(struct mr_vif_iter)))
2961                goto proc_vif_fail;
2962        if (!proc_create_net("ip_mr_cache", 0, net->proc_net, &ipmr_mfc_seq_ops,
2963                        sizeof(struct mr_mfc_iter)))
2964                goto proc_cache_fail;
2965#endif
2966        return 0;
2967
2968#ifdef CONFIG_PROC_FS
2969proc_cache_fail:
2970        remove_proc_entry("ip_mr_vif", net->proc_net);
2971proc_vif_fail:
2972        ipmr_rules_exit(net);
2973#endif
2974ipmr_rules_fail:
2975        ipmr_notifier_exit(net);
2976ipmr_notifier_fail:
2977        return err;
2978}
2979
2980static void __net_exit ipmr_net_exit(struct net *net)
2981{
2982#ifdef CONFIG_PROC_FS
2983        remove_proc_entry("ip_mr_cache", net->proc_net);
2984        remove_proc_entry("ip_mr_vif", net->proc_net);
2985#endif
2986        ipmr_notifier_exit(net);
2987        ipmr_rules_exit(net);
2988}
2989
2990static struct pernet_operations ipmr_net_ops = {
2991        .init = ipmr_net_init,
2992        .exit = ipmr_net_exit,
2993};
2994
2995int __init ip_mr_init(void)
2996{
2997        int err;
2998
2999        mrt_cachep = kmem_cache_create("ip_mrt_cache",
3000                                       sizeof(struct mfc_cache),
3001                                       0, SLAB_HWCACHE_ALIGN | SLAB_PANIC,
3002                                       NULL);
3003
3004        err = register_pernet_subsys(&ipmr_net_ops);
3005        if (err)
3006                goto reg_pernet_fail;
3007
3008        err = register_netdevice_notifier(&ip_mr_notifier);
3009        if (err)
3010                goto reg_notif_fail;
3011#ifdef CONFIG_IP_PIMSM_V2
3012        if (inet_add_protocol(&pim_protocol, IPPROTO_PIM) < 0) {
3013                pr_err("%s: can't add PIM protocol\n", __func__);
3014                err = -EAGAIN;
3015                goto add_proto_fail;
3016        }
3017#endif
3018        rtnl_register(RTNL_FAMILY_IPMR, RTM_GETROUTE,
3019                      ipmr_rtm_getroute, ipmr_rtm_dumproute, 0);
3020        rtnl_register(RTNL_FAMILY_IPMR, RTM_NEWROUTE,
3021                      ipmr_rtm_route, NULL, 0);
3022        rtnl_register(RTNL_FAMILY_IPMR, RTM_DELROUTE,
3023                      ipmr_rtm_route, NULL, 0);
3024
3025        rtnl_register(RTNL_FAMILY_IPMR, RTM_GETLINK,
3026                      NULL, ipmr_rtm_dumplink, 0);
3027        return 0;
3028
3029#ifdef CONFIG_IP_PIMSM_V2
3030add_proto_fail:
3031        unregister_netdevice_notifier(&ip_mr_notifier);
3032#endif
3033reg_notif_fail:
3034        unregister_pernet_subsys(&ipmr_net_ops);
3035reg_pernet_fail:
3036        kmem_cache_destroy(mrt_cachep);
3037        return err;
3038}
3039