linux/net/openvswitch/datapath.c
/*
 * Copyright (c) 2007-2014 Nicira, Inc.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 * 02110-1301, USA
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/init.h>
#include <linux/module.h>
#include <linux/if_arp.h>
#include <linux/if_vlan.h>
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/jhash.h>
#include <linux/delay.h>
#include <linux/time.h>
#include <linux/etherdevice.h>
#include <linux/genetlink.h>
#include <linux/kernel.h>
#include <linux/kthread.h>
#include <linux/mutex.h>
#include <linux/percpu.h>
#include <linux/rcupdate.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/ethtool.h>
#include <linux/wait.h>
#include <asm/div64.h>
#include <linux/highmem.h>
#include <linux/netfilter_bridge.h>
#include <linux/netfilter_ipv4.h>
#include <linux/inetdevice.h>
#include <linux/list.h>
#include <linux/openvswitch.h>
#include <linux/rculist.h>
#include <linux/dmi.h>
#include <net/genetlink.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>

#include "datapath.h"
#include "flow.h"
#include "flow_table.h"
#include "flow_netlink.h"
#include "vport-internal_dev.h"
#include "vport-netdev.h"

int ovs_net_id __read_mostly;
EXPORT_SYMBOL_GPL(ovs_net_id);

static struct genl_family dp_packet_genl_family;
static struct genl_family dp_flow_genl_family;
static struct genl_family dp_datapath_genl_family;

static const struct nla_policy flow_policy[];

static const struct genl_multicast_group ovs_dp_flow_multicast_group = {
        .name = OVS_FLOW_MCGROUP,
};

static const struct genl_multicast_group ovs_dp_datapath_multicast_group = {
        .name = OVS_DATAPATH_MCGROUP,
};

static const struct genl_multicast_group ovs_dp_vport_multicast_group = {
        .name = OVS_VPORT_MCGROUP,
};

/* Check if we need to build a reply message.
 * OVS userspace sets the NLM_F_ECHO flag if it needs the reply. */
static bool ovs_must_notify(struct genl_family *family, struct genl_info *info,
                            unsigned int group)
{
        return info->nlhdr->nlmsg_flags & NLM_F_ECHO ||
               genl_has_listeners(family, genl_info_net(info), group);
}

static void ovs_notify(struct genl_family *family,
                       struct sk_buff *skb, struct genl_info *info)
{
        genl_notify(family, skb, info, 0, GFP_KERNEL);
}

/**
 * DOC: Locking:
 *
 * All writes to device state (add/remove datapath or port, set operations
 * on vports, etc.) and to other state (flow table modifications, setting
 * miscellaneous datapath parameters, etc.) are protected by ovs_lock.
 *
 * Reads are protected by RCU.
 *
 * There are a few special cases (mostly stats) that have their own
 * synchronization but they nest under all of the above and don't interact
 * with each other.
 *
 * The RTNL lock nests inside ovs_mutex.
 */

static DEFINE_MUTEX(ovs_mutex);

void ovs_lock(void)
{
        mutex_lock(&ovs_mutex);
}

void ovs_unlock(void)
{
        mutex_unlock(&ovs_mutex);
}

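/* A sketch of the discipline this buys us (illustrative only, not code
 * from this file): a writer wraps state changes in the mutex,
 *
 *        ovs_lock();
 *        ... add/remove ports, update flow tables ...
 *        ovs_unlock();
 *
 * while a reader relies on RCU instead:
 *
 *        rcu_read_lock();
 *        dp = get_dp_rcu(net, dp_ifindex);
 *        ... use dp ...
 *        rcu_read_unlock();
 */
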
#ifdef CONFIG_LOCKDEP
int lockdep_ovsl_is_held(void)
{
        if (debug_locks)
                return lockdep_is_held(&ovs_mutex);
        else
                return 1;
}
EXPORT_SYMBOL_GPL(lockdep_ovsl_is_held);
#endif

static struct vport *new_vport(const struct vport_parms *);
static int queue_gso_packets(struct datapath *dp, struct sk_buff *,
                             const struct sw_flow_key *,
                             const struct dp_upcall_info *,
                             uint32_t cutlen);
static int queue_userspace_packet(struct datapath *dp, struct sk_buff *,
                                  const struct sw_flow_key *,
                                  const struct dp_upcall_info *,
                                  uint32_t cutlen);

/* Must be called with rcu_read_lock. */
static struct datapath *get_dp_rcu(struct net *net, int dp_ifindex)
{
        struct net_device *dev = dev_get_by_index_rcu(net, dp_ifindex);

        if (dev) {
                struct vport *vport = ovs_internal_dev_get_vport(dev);
                if (vport)
                        return vport->dp;
        }

        return NULL;
}

/* The caller must hold either ovs_mutex or rcu_read_lock to keep the
 * returned dp pointer valid.
 */
static inline struct datapath *get_dp(struct net *net, int dp_ifindex)
{
        struct datapath *dp;

        WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_ovsl_is_held());
        rcu_read_lock();
        dp = get_dp_rcu(net, dp_ifindex);
        rcu_read_unlock();

        return dp;
}

/* Must be called with rcu_read_lock or ovs_mutex. */
const char *ovs_dp_name(const struct datapath *dp)
{
        struct vport *vport = ovs_vport_ovsl_rcu(dp, OVSP_LOCAL);
        return ovs_vport_name(vport);
}

static int get_dpifindex(const struct datapath *dp)
{
        struct vport *local;
        int ifindex;

        rcu_read_lock();

        local = ovs_vport_rcu(dp, OVSP_LOCAL);
        if (local)
                ifindex = local->dev->ifindex;
        else
                ifindex = 0;

        rcu_read_unlock();

        return ifindex;
}

static void destroy_dp_rcu(struct rcu_head *rcu)
{
        struct datapath *dp = container_of(rcu, struct datapath, rcu);

        ovs_flow_tbl_destroy(&dp->table);
        free_percpu(dp->stats_percpu);
        kfree(dp->ports);
        kfree(dp);
}

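/* Map a port number to its bucket in dp->ports.  The mask below relies on
 * DP_VPORT_HASH_BUCKETS being a power of two, which makes it equivalent to
 * port_no % DP_VPORT_HASH_BUCKETS.
 */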
static struct hlist_head *vport_hash_bucket(const struct datapath *dp,
                                            u16 port_no)
{
        return &dp->ports[port_no & (DP_VPORT_HASH_BUCKETS - 1)];
}

/* Called with ovs_mutex or RCU read lock. */
struct vport *ovs_lookup_vport(const struct datapath *dp, u16 port_no)
{
        struct vport *vport;
        struct hlist_head *head;

        head = vport_hash_bucket(dp, port_no);
        hlist_for_each_entry_rcu(vport, head, dp_hash_node) {
                if (vport->port_no == port_no)
                        return vport;
        }
        return NULL;
}

/* Called with ovs_mutex. */
static struct vport *new_vport(const struct vport_parms *parms)
{
        struct vport *vport;

        vport = ovs_vport_add(parms);
        if (!IS_ERR(vport)) {
                struct datapath *dp = parms->dp;
                struct hlist_head *head = vport_hash_bucket(dp, vport->port_no);

                hlist_add_head_rcu(&vport->dp_hash_node, head);
        }
        return vport;
}

void ovs_dp_detach_port(struct vport *p)
{
        ASSERT_OVSL();

        /* First drop references to device. */
        hlist_del_rcu(&p->dp_hash_node);

        /* Then destroy it. */
        ovs_vport_del(p);
}

/* Must be called with rcu_read_lock. */
void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key)
{
        const struct vport *p = OVS_CB(skb)->input_vport;
        struct datapath *dp = p->dp;
        struct sw_flow *flow;
        struct sw_flow_actions *sf_acts;
        struct dp_stats_percpu *stats;
        u64 *stats_counter;
        u32 n_mask_hit;

        stats = this_cpu_ptr(dp->stats_percpu);

        /* Look up flow. */
        flow = ovs_flow_tbl_lookup_stats(&dp->table, key, &n_mask_hit);
        if (unlikely(!flow)) {
                struct dp_upcall_info upcall;
                int error;

                memset(&upcall, 0, sizeof(upcall));
                upcall.cmd = OVS_PACKET_CMD_MISS;
                upcall.portid = ovs_vport_find_upcall_portid(p, skb);
                upcall.mru = OVS_CB(skb)->mru;
                error = ovs_dp_upcall(dp, skb, key, &upcall, 0);
                if (unlikely(error))
                        kfree_skb(skb);
                else
                        consume_skb(skb);
                stats_counter = &stats->n_missed;
                goto out;
        }

        ovs_flow_stats_update(flow, key->tp.flags, skb);
        sf_acts = rcu_dereference(flow->sf_acts);
        ovs_execute_actions(dp, skb, sf_acts, key);

        stats_counter = &stats->n_hit;

out:
        /* Update datapath statistics. */
        u64_stats_update_begin(&stats->syncp);
        (*stats_counter)++;
        stats->n_mask_hit += n_mask_hit;
        u64_stats_update_end(&stats->syncp);
}

int ovs_dp_upcall(struct datapath *dp, struct sk_buff *skb,
                  const struct sw_flow_key *key,
                  const struct dp_upcall_info *upcall_info,
                  uint32_t cutlen)
{
        struct dp_stats_percpu *stats;
        int err;

        if (upcall_info->portid == 0) {
                err = -ENOTCONN;
                goto err;
        }

        if (!skb_is_gso(skb))
                err = queue_userspace_packet(dp, skb, key, upcall_info, cutlen);
        else
                err = queue_gso_packets(dp, skb, key, upcall_info, cutlen);
        if (err)
                goto err;

        return 0;

err:
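        /* The upcall never reached userspace; account the packet as lost. */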
        stats = this_cpu_ptr(dp->stats_percpu);

        u64_stats_update_begin(&stats->syncp);
        stats->n_lost++;
        u64_stats_update_end(&stats->syncp);

        return err;
}

static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb,
                             const struct sw_flow_key *key,
                             const struct dp_upcall_info *upcall_info,
                             uint32_t cutlen)
{
        unsigned short gso_type = skb_shinfo(skb)->gso_type;
        struct sw_flow_key later_key;
        struct sk_buff *segs, *nskb;
        int err;

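        /* The GSO code keeps its own state in skb->cb; the build-time check
         * below makes sure OVS_CB() still fits in front of that area (see
         * SKB_SGO_CB_OFFSET) before we segment.
         */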
        BUILD_BUG_ON(sizeof(*OVS_CB(skb)) > SKB_SGO_CB_OFFSET);
        segs = __skb_gso_segment(skb, NETIF_F_SG, false);
        if (IS_ERR(segs))
                return PTR_ERR(segs);
        if (segs == NULL)
                return -EINVAL;

        if (gso_type & SKB_GSO_UDP) {
                /* The initial flow key extracted by ovs_flow_key_extract()
                 * in this case is for a first fragment, so we need to
                 * properly mark later fragments.
                 */
                later_key = *key;
                later_key.ip.frag = OVS_FRAG_TYPE_LATER;
        }

        /* Queue all of the segments. */
        skb = segs;
        do {
                if (gso_type & SKB_GSO_UDP && skb != segs)
                        key = &later_key;

                err = queue_userspace_packet(dp, skb, key, upcall_info, cutlen);
                if (err)
                        break;

        } while ((skb = skb->next));

        /* Free all of the segments. */
        skb = segs;
        do {
                nskb = skb->next;
                if (err)
                        kfree_skb(skb);
                else
                        consume_skb(skb);
        } while ((skb = nskb));
        return err;
}

static size_t upcall_msg_size(const struct dp_upcall_info *upcall_info,
                              unsigned int hdrlen)
{
        size_t size = NLMSG_ALIGN(sizeof(struct ovs_header))
                + nla_total_size(hdrlen) /* OVS_PACKET_ATTR_PACKET */
                + nla_total_size(ovs_key_attr_size()) /* OVS_PACKET_ATTR_KEY */
                + nla_total_size(sizeof(unsigned int)); /* OVS_PACKET_ATTR_LEN */

        /* OVS_PACKET_ATTR_USERDATA */
        if (upcall_info->userdata)
                size += NLA_ALIGN(upcall_info->userdata->nla_len);

        /* OVS_PACKET_ATTR_EGRESS_TUN_KEY */
        if (upcall_info->egress_tun_info)
                size += nla_total_size(ovs_tun_key_attr_size());

        /* OVS_PACKET_ATTR_ACTIONS */
        if (upcall_info->actions_len)
                size += nla_total_size(upcall_info->actions_len);

        /* OVS_PACKET_ATTR_MRU */
        if (upcall_info->mru)
                size += nla_total_size(sizeof(upcall_info->mru));

        return size;
}

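/* Zero-fill the message up to the next NLA_ALIGNTO boundary unless the
 * peer accepts unaligned data.  For example (illustrative): a 61-byte
 * message has NLA_ALIGN(61) == 64, so three zero bytes are appended; an
 * already aligned length leaves plen == 0 and the skb untouched.
 */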
static void pad_packet(struct datapath *dp, struct sk_buff *skb)
{
        if (!(dp->user_features & OVS_DP_F_UNALIGNED)) {
                size_t plen = NLA_ALIGN(skb->len) - skb->len;

                if (plen > 0)
                        memset(skb_put(skb, plen), 0, plen);
        }
}

static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb,
                                  const struct sw_flow_key *key,
                                  const struct dp_upcall_info *upcall_info,
                                  uint32_t cutlen)
{
        struct ovs_header *upcall;
        struct sk_buff *nskb = NULL;
        struct sk_buff *user_skb = NULL; /* to be queued to userspace */
        struct nlattr *nla;
        size_t len;
        unsigned int hlen;
        int err, dp_ifindex;

        dp_ifindex = get_dpifindex(dp);
        if (!dp_ifindex)
                return -ENODEV;

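        /* Userspace expects any VLAN tag inline in the packet data, so push
         * a hardware-accelerated tag back into a private clone first.
         */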
        if (skb_vlan_tag_present(skb)) {
                nskb = skb_clone(skb, GFP_ATOMIC);
                if (!nskb)
                        return -ENOMEM;

                nskb = __vlan_hwaccel_push_inside(nskb);
                if (!nskb)
                        return -ENOMEM;

                skb = nskb;
        }

        if (nla_attr_size(skb->len) > USHRT_MAX) {
                err = -EFBIG;
                goto out;
        }

        /* Complete checksum if needed */
        if (skb->ip_summed == CHECKSUM_PARTIAL &&
            (err = skb_checksum_help(skb)))
                goto out;

        /* Older versions of OVS user space enforce alignment of the last
         * Netlink attribute to NLA_ALIGNTO which would require extensive
         * padding logic. Only perform zerocopy if padding is not required.
         */
        if (dp->user_features & OVS_DP_F_UNALIGNED)
                hlen = skb_zerocopy_headlen(skb);
        else
                hlen = skb->len;

        len = upcall_msg_size(upcall_info, hlen - cutlen);
        user_skb = genlmsg_new(len, GFP_ATOMIC);
        if (!user_skb) {
                err = -ENOMEM;
                goto out;
        }

        upcall = genlmsg_put(user_skb, 0, 0, &dp_packet_genl_family,
                             0, upcall_info->cmd);
        upcall->dp_ifindex = dp_ifindex;

        err = ovs_nla_put_key(key, key, OVS_PACKET_ATTR_KEY, false, user_skb);
        BUG_ON(err);

        if (upcall_info->userdata)
                __nla_put(user_skb, OVS_PACKET_ATTR_USERDATA,
                          nla_len(upcall_info->userdata),
                          nla_data(upcall_info->userdata));

        if (upcall_info->egress_tun_info) {
                nla = nla_nest_start(user_skb, OVS_PACKET_ATTR_EGRESS_TUN_KEY);
                err = ovs_nla_put_tunnel_info(user_skb,
                                              upcall_info->egress_tun_info);
                BUG_ON(err);
                nla_nest_end(user_skb, nla);
        }

        if (upcall_info->actions_len) {
                nla = nla_nest_start(user_skb, OVS_PACKET_ATTR_ACTIONS);
                err = ovs_nla_put_actions(upcall_info->actions,
                                          upcall_info->actions_len,
                                          user_skb);
                if (!err)
                        nla_nest_end(user_skb, nla);
                else
                        nla_nest_cancel(user_skb, nla);
        }

        /* Add OVS_PACKET_ATTR_MRU */
        if (upcall_info->mru) {
                if (nla_put_u16(user_skb, OVS_PACKET_ATTR_MRU,
                                upcall_info->mru)) {
                        err = -ENOBUFS;
                        goto out;
                }
                pad_packet(dp, user_skb);
        }

        /* Add OVS_PACKET_ATTR_LEN when packet is truncated */
        if (cutlen > 0) {
                if (nla_put_u32(user_skb, OVS_PACKET_ATTR_LEN,
                                skb->len)) {
                        err = -ENOBUFS;
                        goto out;
                }
                pad_packet(dp, user_skb);
        }

        /* Only reserve room for attribute header, packet data is added
         * in skb_zerocopy() */
        if (!(nla = nla_reserve(user_skb, OVS_PACKET_ATTR_PACKET, 0))) {
                err = -ENOBUFS;
                goto out;
        }
        nla->nla_len = nla_attr_size(skb->len - cutlen);

        err = skb_zerocopy(user_skb, skb, skb->len - cutlen, hlen);
        if (err)
                goto out;

        /* Pad OVS_PACKET_ATTR_PACKET if linear copy was performed */
        pad_packet(dp, user_skb);

        ((struct nlmsghdr *) user_skb->data)->nlmsg_len = user_skb->len;

        err = genlmsg_unicast(ovs_dp_get_net(dp), user_skb, upcall_info->portid);
        user_skb = NULL;
out:
        if (err)
                skb_tx_error(skb);
        kfree_skb(user_skb);
        kfree_skb(nskb);
        return err;
}

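/* OVS_PACKET_CMD_EXECUTE handler: userspace supplies a packet, a flow key
 * and an action list; we rebuild an skb, attach a one-off flow to it and
 * run the actions as if the packet had arrived on 'input_vport'.
 */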
static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
{
        struct ovs_header *ovs_header = info->userhdr;
        struct net *net = sock_net(skb->sk);
        struct nlattr **a = info->attrs;
        struct sw_flow_actions *acts;
        struct sk_buff *packet;
        struct sw_flow *flow;
        struct sw_flow_actions *sf_acts;
        struct datapath *dp;
        struct ethhdr *eth;
        struct vport *input_vport;
        u16 mru = 0;
        int len;
        int err;
        bool log = !a[OVS_PACKET_ATTR_PROBE];

        err = -EINVAL;
        if (!a[OVS_PACKET_ATTR_PACKET] || !a[OVS_PACKET_ATTR_KEY] ||
            !a[OVS_PACKET_ATTR_ACTIONS])
                goto err;

        len = nla_len(a[OVS_PACKET_ATTR_PACKET]);
        packet = __dev_alloc_skb(NET_IP_ALIGN + len, GFP_KERNEL);
        err = -ENOMEM;
        if (!packet)
                goto err;
        skb_reserve(packet, NET_IP_ALIGN);

        nla_memcpy(__skb_put(packet, len), a[OVS_PACKET_ATTR_PACKET], len);

        skb_reset_mac_header(packet);
        eth = eth_hdr(packet);

        /* Normally, setting the skb 'protocol' field would be handled by a
         * call to eth_type_trans(), but it assumes there's a sending
         * device, which we may not have. */
        if (eth_proto_is_802_3(eth->h_proto))
                packet->protocol = eth->h_proto;
        else
                packet->protocol = htons(ETH_P_802_2);

        /* Set packet's mru */
        if (a[OVS_PACKET_ATTR_MRU]) {
                mru = nla_get_u16(a[OVS_PACKET_ATTR_MRU]);
                packet->ignore_df = 1;
        }
        OVS_CB(packet)->mru = mru;

        /* Build an sw_flow for sending this packet. */
        flow = ovs_flow_alloc();
        err = PTR_ERR(flow);
        if (IS_ERR(flow))
                goto err_kfree_skb;

        err = ovs_flow_key_extract_userspace(net, a[OVS_PACKET_ATTR_KEY],
                                             packet, &flow->key, log);
        if (err)
                goto err_flow_free;

        err = ovs_nla_copy_actions(net, a[OVS_PACKET_ATTR_ACTIONS],
                                   &flow->key, &acts, log);
        if (err)
                goto err_flow_free;

        rcu_assign_pointer(flow->sf_acts, acts);
        packet->priority = flow->key.phy.priority;
        packet->mark = flow->key.phy.skb_mark;

        rcu_read_lock();
        dp = get_dp_rcu(net, ovs_header->dp_ifindex);
        err = -ENODEV;
        if (!dp)
                goto err_unlock;

        input_vport = ovs_vport_rcu(dp, flow->key.phy.in_port);
        if (!input_vport)
                input_vport = ovs_vport_rcu(dp, OVSP_LOCAL);

        if (!input_vport)
                goto err_unlock;

        packet->dev = input_vport->dev;
        OVS_CB(packet)->input_vport = input_vport;
        sf_acts = rcu_dereference(flow->sf_acts);

        local_bh_disable();
        err = ovs_execute_actions(dp, packet, sf_acts, &flow->key);
        local_bh_enable();
        rcu_read_unlock();

        ovs_flow_free(flow, false);
        return err;

err_unlock:
        rcu_read_unlock();
err_flow_free:
        ovs_flow_free(flow, false);
err_kfree_skb:
        kfree_skb(packet);
err:
        return err;
}

static const struct nla_policy packet_policy[OVS_PACKET_ATTR_MAX + 1] = {
        [OVS_PACKET_ATTR_PACKET] = { .len = ETH_HLEN },
        [OVS_PACKET_ATTR_KEY] = { .type = NLA_NESTED },
        [OVS_PACKET_ATTR_ACTIONS] = { .type = NLA_NESTED },
        [OVS_PACKET_ATTR_PROBE] = { .type = NLA_FLAG },
        [OVS_PACKET_ATTR_MRU] = { .type = NLA_U16 },
};

static const struct genl_ops dp_packet_genl_ops[] = {
        { .cmd = OVS_PACKET_CMD_EXECUTE,
          .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
          .policy = packet_policy,
          .doit = ovs_packet_cmd_execute
        }
};

static struct genl_family dp_packet_genl_family = {
        .id = GENL_ID_GENERATE,
        .hdrsize = sizeof(struct ovs_header),
        .name = OVS_PACKET_FAMILY,
        .version = OVS_PACKET_VERSION,
        .maxattr = OVS_PACKET_ATTR_MAX,
        .netnsok = true,
        .parallel_ops = true,
        .ops = dp_packet_genl_ops,
        .n_ops = ARRAY_SIZE(dp_packet_genl_ops),
};

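/* Fold the per-CPU counters into 'stats' and 'mega_stats'.  Each CPU's
 * counters are sampled under its u64_stats seqcount and re-read if a
 * writer updated them mid-read, so the 64-bit values stay consistent even
 * on 32-bit machines.
 */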
static void get_dp_stats(const struct datapath *dp, struct ovs_dp_stats *stats,
                         struct ovs_dp_megaflow_stats *mega_stats)
{
        int i;

        memset(mega_stats, 0, sizeof(*mega_stats));

        stats->n_flows = ovs_flow_tbl_count(&dp->table);
        mega_stats->n_masks = ovs_flow_tbl_num_masks(&dp->table);

        stats->n_hit = stats->n_missed = stats->n_lost = 0;

        for_each_possible_cpu(i) {
                const struct dp_stats_percpu *percpu_stats;
                struct dp_stats_percpu local_stats;
                unsigned int start;

                percpu_stats = per_cpu_ptr(dp->stats_percpu, i);

                do {
                        start = u64_stats_fetch_begin_irq(&percpu_stats->syncp);
                        local_stats = *percpu_stats;
                } while (u64_stats_fetch_retry_irq(&percpu_stats->syncp, start));

                stats->n_hit += local_stats.n_hit;
                stats->n_missed += local_stats.n_missed;
                stats->n_lost += local_stats.n_lost;
                mega_stats->n_mask_hit += local_stats.n_mask_hit;
        }
}

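/* The OVS_UFID_F_OMIT_* flags let userspace trim replies: when a flow is
 * identified by a unique flow ID (UFID), the key, mask and actions can
 * each be left out of the Netlink message to keep flow dumps small.
 */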
static bool should_fill_key(const struct sw_flow_id *sfid, uint32_t ufid_flags)
{
        return ovs_identifier_is_ufid(sfid) &&
               !(ufid_flags & OVS_UFID_F_OMIT_KEY);
}

static bool should_fill_mask(uint32_t ufid_flags)
{
        return !(ufid_flags & OVS_UFID_F_OMIT_MASK);
}

static bool should_fill_actions(uint32_t ufid_flags)
{
        return !(ufid_flags & OVS_UFID_F_OMIT_ACTIONS);
}

static size_t ovs_flow_cmd_msg_size(const struct sw_flow_actions *acts,
                                    const struct sw_flow_id *sfid,
                                    uint32_t ufid_flags)
{
        size_t len = NLMSG_ALIGN(sizeof(struct ovs_header));

        /* OVS_FLOW_ATTR_UFID */
        if (sfid && ovs_identifier_is_ufid(sfid))
                len += nla_total_size(sfid->ufid_len);

        /* OVS_FLOW_ATTR_KEY */
        if (!sfid || should_fill_key(sfid, ufid_flags))
                len += nla_total_size(ovs_key_attr_size());

        /* OVS_FLOW_ATTR_MASK */
        if (should_fill_mask(ufid_flags))
                len += nla_total_size(ovs_key_attr_size());

        /* OVS_FLOW_ATTR_ACTIONS */
        if (should_fill_actions(ufid_flags))
                len += nla_total_size(acts->orig_len);

        return len
                + nla_total_size_64bit(sizeof(struct ovs_flow_stats)) /* OVS_FLOW_ATTR_STATS */
                + nla_total_size(1) /* OVS_FLOW_ATTR_TCP_FLAGS */
                + nla_total_size_64bit(8); /* OVS_FLOW_ATTR_USED */
}

/* Called with ovs_mutex or RCU read lock. */
static int ovs_flow_cmd_fill_stats(const struct sw_flow *flow,
                                   struct sk_buff *skb)
{
        struct ovs_flow_stats stats;
        __be16 tcp_flags;
        unsigned long used;

        ovs_flow_stats_get(flow, &stats, &used, &tcp_flags);

        if (used &&
            nla_put_u64_64bit(skb, OVS_FLOW_ATTR_USED, ovs_flow_used_time(used),
                              OVS_FLOW_ATTR_PAD))
                return -EMSGSIZE;

        if (stats.n_packets &&
            nla_put_64bit(skb, OVS_FLOW_ATTR_STATS,
                          sizeof(struct ovs_flow_stats), &stats,
                          OVS_FLOW_ATTR_PAD))
                return -EMSGSIZE;

        if ((u8)ntohs(tcp_flags) &&
             nla_put_u8(skb, OVS_FLOW_ATTR_TCP_FLAGS, (u8)ntohs(tcp_flags)))
                return -EMSGSIZE;

        return 0;
}

/* Called with ovs_mutex or RCU read lock. */
static int ovs_flow_cmd_fill_actions(const struct sw_flow *flow,
                                     struct sk_buff *skb, int skb_orig_len)
{
        struct nlattr *start;
        int err;

        /* If OVS_FLOW_ATTR_ACTIONS doesn't fit, skip dumping the actions if
         * this is the first flow to be dumped into 'skb'.  This is unusual for
         * Netlink but individual action lists can be longer than
         * NLMSG_GOODSIZE and thus entirely undumpable if we didn't do this.
         * The userspace caller can always fetch the actions separately if it
         * really wants them.  (Most userspace callers in fact don't care.)
         *
         * This can only fail for dump operations because the skb is always
         * properly sized for single flows.
         */
        start = nla_nest_start(skb, OVS_FLOW_ATTR_ACTIONS);
        if (start) {
                const struct sw_flow_actions *sf_acts;

                sf_acts = rcu_dereference_ovsl(flow->sf_acts);
                err = ovs_nla_put_actions(sf_acts->actions,
                                          sf_acts->actions_len, skb);

                if (!err)
                        nla_nest_end(skb, start);
                else {
                        if (skb_orig_len)
                                return err;

                        nla_nest_cancel(skb, start);
                }
        } else if (skb_orig_len) {
                return -EMSGSIZE;
        }

        return 0;
}

/* Called with ovs_mutex or RCU read lock. */
static int ovs_flow_cmd_fill_info(const struct sw_flow *flow, int dp_ifindex,
                                  struct sk_buff *skb, u32 portid,
                                  u32 seq, u32 flags, u8 cmd, u32 ufid_flags)
{
        const int skb_orig_len = skb->len;
        struct ovs_header *ovs_header;
        int err;

        ovs_header = genlmsg_put(skb, portid, seq, &dp_flow_genl_family,
                                 flags, cmd);
        if (!ovs_header)
                return -EMSGSIZE;

        ovs_header->dp_ifindex = dp_ifindex;

        err = ovs_nla_put_identifier(flow, skb);
        if (err)
                goto error;

        if (should_fill_key(&flow->id, ufid_flags)) {
                err = ovs_nla_put_masked_key(flow, skb);
                if (err)
                        goto error;
        }

        if (should_fill_mask(ufid_flags)) {
                err = ovs_nla_put_mask(flow, skb);
                if (err)
                        goto error;
        }

        err = ovs_flow_cmd_fill_stats(flow, skb);
        if (err)
                goto error;

        if (should_fill_actions(ufid_flags)) {
                err = ovs_flow_cmd_fill_actions(flow, skb, skb_orig_len);
                if (err)
                        goto error;
        }

        genlmsg_end(skb, ovs_header);
        return 0;

error:
        genlmsg_cancel(skb, ovs_header);
        return err;
}

/* May not be called with RCU read lock: genlmsg_new() below allocates with
 * GFP_KERNEL and may sleep.
 */
static struct sk_buff *ovs_flow_cmd_alloc_info(const struct sw_flow_actions *acts,
                                               const struct sw_flow_id *sfid,
                                               struct genl_info *info,
                                               bool always,
                                               uint32_t ufid_flags)
{
        struct sk_buff *skb;
        size_t len;

        if (!always && !ovs_must_notify(&dp_flow_genl_family, info, 0))
                return NULL;

        len = ovs_flow_cmd_msg_size(acts, sfid, ufid_flags);
        skb = genlmsg_new(len, GFP_KERNEL);
        if (!skb)
                return ERR_PTR(-ENOMEM);

        return skb;
}

/* Called with ovs_mutex. */
static struct sk_buff *ovs_flow_cmd_build_info(const struct sw_flow *flow,
                                               int dp_ifindex,
                                               struct genl_info *info, u8 cmd,
                                               bool always, u32 ufid_flags)
{
        struct sk_buff *skb;
        int retval;

        skb = ovs_flow_cmd_alloc_info(ovsl_dereference(flow->sf_acts),
                                      &flow->id, info, always, ufid_flags);
        if (IS_ERR_OR_NULL(skb))
                return skb;

        retval = ovs_flow_cmd_fill_info(flow, dp_ifindex, skb,
                                        info->snd_portid, info->snd_seq, 0,
                                        cmd, ufid_flags);
        BUG_ON(retval < 0);
        return skb;
}

static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info)
{
        struct net *net = sock_net(skb->sk);
        struct nlattr **a = info->attrs;
        struct ovs_header *ovs_header = info->userhdr;
        struct sw_flow *flow = NULL, *new_flow;
        struct sw_flow_mask mask;
        struct sk_buff *reply;
        struct datapath *dp;
        struct sw_flow_actions *acts;
        struct sw_flow_match match;
        u32 ufid_flags = ovs_nla_get_ufid_flags(a[OVS_FLOW_ATTR_UFID_FLAGS]);
        int error;
        bool log = !a[OVS_FLOW_ATTR_PROBE];

        /* Must have key and actions. */
        error = -EINVAL;
        if (!a[OVS_FLOW_ATTR_KEY]) {
                OVS_NLERR(log, "Flow key attr not present in new flow.");
                goto error;
        }
        if (!a[OVS_FLOW_ATTR_ACTIONS]) {
                OVS_NLERR(log, "Flow actions attr not present in new flow.");
                goto error;
        }

        /* Most of the time we need to allocate a new flow, so do it before
         * taking the lock.
         */
        new_flow = ovs_flow_alloc();
        if (IS_ERR(new_flow)) {
                error = PTR_ERR(new_flow);
                goto error;
        }

        /* Extract key. */
        ovs_match_init(&match, &new_flow->key, false, &mask);
        error = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY],
                                  a[OVS_FLOW_ATTR_MASK], log);
        if (error)
                goto err_kfree_flow;

        /* Extract flow identifier. */
        error = ovs_nla_get_identifier(&new_flow->id, a[OVS_FLOW_ATTR_UFID],
                                       &new_flow->key, log);
        if (error)
                goto err_kfree_flow;

        /* unmasked key is needed to match when ufid is not used. */
        if (ovs_identifier_is_key(&new_flow->id))
                match.key = new_flow->id.unmasked_key;

        ovs_flow_mask_key(&new_flow->key, &new_flow->key, true, &mask);

        /* Validate actions. */
        error = ovs_nla_copy_actions(net, a[OVS_FLOW_ATTR_ACTIONS],
                                     &new_flow->key, &acts, log);
        if (error) {
                OVS_NLERR(log, "Flow actions may not be safe on all matching packets.");
                goto err_kfree_flow;
        }

        reply = ovs_flow_cmd_alloc_info(acts, &new_flow->id, info, false,
                                        ufid_flags);
        if (IS_ERR(reply)) {
                error = PTR_ERR(reply);
                goto err_kfree_acts;
        }

        ovs_lock();
        dp = get_dp(net, ovs_header->dp_ifindex);
        if (unlikely(!dp)) {
                error = -ENODEV;
                goto err_unlock_ovs;
        }

        /* Check if this is a duplicate flow */
        if (ovs_identifier_is_ufid(&new_flow->id))
                flow = ovs_flow_tbl_lookup_ufid(&dp->table, &new_flow->id);
        if (!flow)
                flow = ovs_flow_tbl_lookup(&dp->table, &new_flow->key);
        if (likely(!flow)) {
                rcu_assign_pointer(new_flow->sf_acts, acts);

                /* Put flow in bucket. */
                error = ovs_flow_tbl_insert(&dp->table, new_flow, &mask);
                if (unlikely(error)) {
                        acts = NULL;
                        goto err_unlock_ovs;
                }

                if (unlikely(reply)) {
                        error = ovs_flow_cmd_fill_info(new_flow,
                                                       ovs_header->dp_ifindex,
                                                       reply, info->snd_portid,
                                                       info->snd_seq, 0,
                                                       OVS_FLOW_CMD_NEW,
                                                       ufid_flags);
                        BUG_ON(error < 0);
                }
                ovs_unlock();
        } else {
                struct sw_flow_actions *old_acts;

                /* Bail out if we're not allowed to modify an existing flow.
                 * We accept NLM_F_CREATE in place of the intended NLM_F_EXCL
                 * because Generic Netlink treats the latter as a dump
                 * request.  We also accept NLM_F_EXCL in case that bug ever
                 * gets fixed.
                 */
                if (unlikely(info->nlhdr->nlmsg_flags & (NLM_F_CREATE
                                                         | NLM_F_EXCL))) {
                        error = -EEXIST;
                        goto err_unlock_ovs;
                }
                /* The flow identifier has to be the same for flow updates.
                 * Look for any overlapping flow.
                 */
                if (unlikely(!ovs_flow_cmp(flow, &match))) {
                        if (ovs_identifier_is_key(&flow->id))
                                flow = ovs_flow_tbl_lookup_exact(&dp->table,
                                                                 &match);
                        else /* UFID matches but key is different */
                                flow = NULL;
                        if (!flow) {
                                error = -ENOENT;
                                goto err_unlock_ovs;
                        }
                }
                /* Update actions. */
                old_acts = ovsl_dereference(flow->sf_acts);
                rcu_assign_pointer(flow->sf_acts, acts);

                if (unlikely(reply)) {
                        error = ovs_flow_cmd_fill_info(flow,
                                                       ovs_header->dp_ifindex,
                                                       reply, info->snd_portid,
                                                       info->snd_seq, 0,
                                                       OVS_FLOW_CMD_NEW,
                                                       ufid_flags);
                        BUG_ON(error < 0);
                }
                ovs_unlock();

                ovs_nla_free_flow_actions_rcu(old_acts);
                ovs_flow_free(new_flow, false);
        }

        if (reply)
                ovs_notify(&dp_flow_genl_family, reply, info);
        return 0;

err_unlock_ovs:
        ovs_unlock();
        kfree_skb(reply);
err_kfree_acts:
        ovs_nla_free_flow_actions(acts);
err_kfree_flow:
        ovs_flow_free(new_flow, false);
error:
        return error;
}

/* Factor out action copy to avoid "Wframe-larger-than=1024" warning. */
static struct sw_flow_actions *get_flow_actions(struct net *net,
                                                const struct nlattr *a,
                                                const struct sw_flow_key *key,
                                                const struct sw_flow_mask *mask,
                                                bool log)
{
        struct sw_flow_actions *acts;
        struct sw_flow_key masked_key;
        int error;

        ovs_flow_mask_key(&masked_key, key, true, mask);
        error = ovs_nla_copy_actions(net, a, &masked_key, &acts, log);
        if (error) {
                OVS_NLERR(log,
                          "Actions may not be safe on all matching packets");
                return ERR_PTR(error);
        }

        return acts;
}

static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info)
{
        struct net *net = sock_net(skb->sk);
        struct nlattr **a = info->attrs;
        struct ovs_header *ovs_header = info->userhdr;
        struct sw_flow_key key;
        struct sw_flow *flow;
        struct sw_flow_mask mask;
        struct sk_buff *reply = NULL;
        struct datapath *dp;
        struct sw_flow_actions *old_acts = NULL, *acts = NULL;
        struct sw_flow_match match;
        struct sw_flow_id sfid;
        u32 ufid_flags = ovs_nla_get_ufid_flags(a[OVS_FLOW_ATTR_UFID_FLAGS]);
        int error = 0;
        bool log = !a[OVS_FLOW_ATTR_PROBE];
        bool ufid_present;

        ufid_present = ovs_nla_get_ufid(&sfid, a[OVS_FLOW_ATTR_UFID], log);
        if (a[OVS_FLOW_ATTR_KEY]) {
                ovs_match_init(&match, &key, true, &mask);
                error = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY],
                                          a[OVS_FLOW_ATTR_MASK], log);
        } else if (!ufid_present) {
                OVS_NLERR(log,
                          "Flow set message rejected, Key attribute missing.");
                error = -EINVAL;
        }
        if (error)
                goto error;

        /* Validate actions. */
        if (a[OVS_FLOW_ATTR_ACTIONS]) {
                if (!a[OVS_FLOW_ATTR_KEY]) {
                        OVS_NLERR(log,
                                  "Flow key attribute not present in set flow.");
                        error = -EINVAL;
                        goto error;
                }

                acts = get_flow_actions(net, a[OVS_FLOW_ATTR_ACTIONS], &key,
                                        &mask, log);
                if (IS_ERR(acts)) {
                        error = PTR_ERR(acts);
                        goto error;
                }

                /* Can allocate before locking if have acts. */
                reply = ovs_flow_cmd_alloc_info(acts, &sfid, info, false,
                                                ufid_flags);
                if (IS_ERR(reply)) {
                        error = PTR_ERR(reply);
                        goto err_kfree_acts;
                }
        }

        ovs_lock();
        dp = get_dp(net, ovs_header->dp_ifindex);
        if (unlikely(!dp)) {
                error = -ENODEV;
                goto err_unlock_ovs;
        }
        /* Check that the flow exists. */
        if (ufid_present)
                flow = ovs_flow_tbl_lookup_ufid(&dp->table, &sfid);
        else
                flow = ovs_flow_tbl_lookup_exact(&dp->table, &match);
        if (unlikely(!flow)) {
                error = -ENOENT;
                goto err_unlock_ovs;
        }

        /* Update actions, if present. */
        if (likely(acts)) {
                old_acts = ovsl_dereference(flow->sf_acts);
                rcu_assign_pointer(flow->sf_acts, acts);

                if (unlikely(reply)) {
                        error = ovs_flow_cmd_fill_info(flow,
                                                       ovs_header->dp_ifindex,
                                                       reply, info->snd_portid,
                                                       info->snd_seq, 0,
                                                       OVS_FLOW_CMD_NEW,
                                                       ufid_flags);
                        BUG_ON(error < 0);
                }
        } else {
                /* Could not alloc without acts before locking. */
                reply = ovs_flow_cmd_build_info(flow, ovs_header->dp_ifindex,
                                                info, OVS_FLOW_CMD_NEW, false,
                                                ufid_flags);

                if (IS_ERR(reply)) {
                        error = PTR_ERR(reply);
                        goto err_unlock_ovs;
                }
        }

        /* Clear stats. */
        if (a[OVS_FLOW_ATTR_CLEAR])
                ovs_flow_stats_clear(flow);
        ovs_unlock();

        if (reply)
                ovs_notify(&dp_flow_genl_family, reply, info);
        if (old_acts)
                ovs_nla_free_flow_actions_rcu(old_acts);

        return 0;

err_unlock_ovs:
        ovs_unlock();
        kfree_skb(reply);
err_kfree_acts:
        ovs_nla_free_flow_actions(acts);
error:
        return error;
}

static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info)
{
        struct nlattr **a = info->attrs;
        struct ovs_header *ovs_header = info->userhdr;
        struct net *net = sock_net(skb->sk);
        struct sw_flow_key key;
        struct sk_buff *reply;
        struct sw_flow *flow;
        struct datapath *dp;
        struct sw_flow_match match;
        struct sw_flow_id ufid;
        u32 ufid_flags = ovs_nla_get_ufid_flags(a[OVS_FLOW_ATTR_UFID_FLAGS]);
        int err = 0;
        bool log = !a[OVS_FLOW_ATTR_PROBE];
        bool ufid_present;

        ufid_present = ovs_nla_get_ufid(&ufid, a[OVS_FLOW_ATTR_UFID], log);
        if (a[OVS_FLOW_ATTR_KEY]) {
                ovs_match_init(&match, &key, true, NULL);
                err = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY], NULL,
                                        log);
        } else if (!ufid_present) {
                OVS_NLERR(log,
                          "Flow get message rejected, Key attribute missing.");
                err = -EINVAL;
        }
        if (err)
                return err;

        ovs_lock();
        dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex);
        if (!dp) {
                err = -ENODEV;
                goto unlock;
        }

        if (ufid_present)
                flow = ovs_flow_tbl_lookup_ufid(&dp->table, &ufid);
        else
                flow = ovs_flow_tbl_lookup_exact(&dp->table, &match);
        if (!flow) {
                err = -ENOENT;
                goto unlock;
        }

        reply = ovs_flow_cmd_build_info(flow, ovs_header->dp_ifindex, info,
                                        OVS_FLOW_CMD_NEW, true, ufid_flags);
        if (IS_ERR(reply)) {
                err = PTR_ERR(reply);
                goto unlock;
        }

        ovs_unlock();
        return genlmsg_reply(reply, info);
unlock:
        ovs_unlock();
        return err;
}

static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info)
{
        struct nlattr **a = info->attrs;
        struct ovs_header *ovs_header = info->userhdr;
        struct net *net = sock_net(skb->sk);
        struct sw_flow_key key;
        struct sk_buff *reply;
        struct sw_flow *flow = NULL;
        struct datapath *dp;
        struct sw_flow_match match;
        struct sw_flow_id ufid;
        u32 ufid_flags = ovs_nla_get_ufid_flags(a[OVS_FLOW_ATTR_UFID_FLAGS]);
        int err;
        bool log = !a[OVS_FLOW_ATTR_PROBE];
        bool ufid_present;

        ufid_present = ovs_nla_get_ufid(&ufid, a[OVS_FLOW_ATTR_UFID], log);
        if (a[OVS_FLOW_ATTR_KEY]) {
                ovs_match_init(&match, &key, true, NULL);
                err = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY],
                                        NULL, log);
                if (unlikely(err))
                        return err;
        }

        ovs_lock();
        dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex);
        if (unlikely(!dp)) {
                err = -ENODEV;
                goto unlock;
        }

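        /* With neither a key nor a UFID, the request means "delete every
         * flow in this datapath", i.e. a table flush.
         */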
1317        if (unlikely(!a[OVS_FLOW_ATTR_KEY] && !ufid_present)) {
1318                err = ovs_flow_tbl_flush(&dp->table);
1319                goto unlock;
1320        }
1321
1322        if (ufid_present)
1323                flow = ovs_flow_tbl_lookup_ufid(&dp->table, &ufid);
1324        else
1325                flow = ovs_flow_tbl_lookup_exact(&dp->table, &match);
1326        if (unlikely(!flow)) {
1327                err = -ENOENT;
1328                goto unlock;
1329        }
1330
1331        ovs_flow_tbl_remove(&dp->table, flow);
1332        ovs_unlock();
1333
1334        reply = ovs_flow_cmd_alloc_info((const struct sw_flow_actions __force *) flow->sf_acts,
1335                                        &flow->id, info, false, ufid_flags);
1336        if (likely(reply)) {
1337                if (likely(!IS_ERR(reply))) {
1338                        rcu_read_lock();        /*To keep RCU checker happy. */
1339                        err = ovs_flow_cmd_fill_info(flow, ovs_header->dp_ifindex,
1340                                                     reply, info->snd_portid,
1341                                                     info->snd_seq, 0,
1342                                                     OVS_FLOW_CMD_DEL,
1343                                                     ufid_flags);
1344                        rcu_read_unlock();
1345                        BUG_ON(err < 0);
1346
1347                        ovs_notify(&dp_flow_genl_family, reply, info);
1348                } else {
1349                        netlink_set_err(sock_net(skb->sk)->genl_sock, 0, 0, PTR_ERR(reply));
1350                }
1351        }
1352
1353        ovs_flow_free(flow, true);
1354        return 0;
1355unlock:
1356        ovs_unlock();
1357        return err;
1358}
1359
1360static int ovs_flow_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb)
1361{
1362        struct nlattr *a[__OVS_FLOW_ATTR_MAX];
1363        struct ovs_header *ovs_header = genlmsg_data(nlmsg_data(cb->nlh));
1364        struct table_instance *ti;
1365        struct datapath *dp;
1366        u32 ufid_flags;
1367        int err;
1368
1369        err = genlmsg_parse(cb->nlh, &dp_flow_genl_family, a,
1370                            OVS_FLOW_ATTR_MAX, flow_policy);
1371        if (err)
1372                return err;
1373        ufid_flags = ovs_nla_get_ufid_flags(a[OVS_FLOW_ATTR_UFID_FLAGS]);
1374
1375        rcu_read_lock();
1376        dp = get_dp_rcu(sock_net(skb->sk), ovs_header->dp_ifindex);
1377        if (!dp) {
1378                rcu_read_unlock();
1379                return -ENODEV;
1380        }
1381
1382        ti = rcu_dereference(dp->table.ti);
1383        for (;;) {
1384                struct sw_flow *flow;
1385                u32 bucket, obj;
1386
1387                bucket = cb->args[0];
1388                obj = cb->args[1];
1389                flow = ovs_flow_tbl_dump_next(ti, &bucket, &obj);
1390                if (!flow)
1391                        break;
1392
1393                if (ovs_flow_cmd_fill_info(flow, ovs_header->dp_ifindex, skb,
1394                                           NETLINK_CB(cb->skb).portid,
1395                                           cb->nlh->nlmsg_seq, NLM_F_MULTI,
1396                                           OVS_FLOW_CMD_NEW, ufid_flags) < 0)
1397                        break;
1398
1399                cb->args[0] = bucket;
1400                cb->args[1] = obj;
1401        }
1402        rcu_read_unlock();
1403        return skb->len;
1404}
1405
1406static const struct nla_policy flow_policy[OVS_FLOW_ATTR_MAX + 1] = {
1407        [OVS_FLOW_ATTR_KEY] = { .type = NLA_NESTED },
1408        [OVS_FLOW_ATTR_MASK] = { .type = NLA_NESTED },
1409        [OVS_FLOW_ATTR_ACTIONS] = { .type = NLA_NESTED },
1410        [OVS_FLOW_ATTR_CLEAR] = { .type = NLA_FLAG },
1411        [OVS_FLOW_ATTR_PROBE] = { .type = NLA_FLAG },
1412        [OVS_FLOW_ATTR_UFID] = { .type = NLA_UNSPEC, .len = 1 },
1413        [OVS_FLOW_ATTR_UFID_FLAGS] = { .type = NLA_U32 },
1414};
1415
1416static const struct genl_ops dp_flow_genl_ops[] = {
1417        { .cmd = OVS_FLOW_CMD_NEW,
1418          .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
1419          .policy = flow_policy,
1420          .doit = ovs_flow_cmd_new
1421        },
1422        { .cmd = OVS_FLOW_CMD_DEL,
1423          .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
1424          .policy = flow_policy,
1425          .doit = ovs_flow_cmd_del
1426        },
1427        { .cmd = OVS_FLOW_CMD_GET,
1428          .flags = 0,               /* OK for unprivileged users. */
1429          .policy = flow_policy,
1430          .doit = ovs_flow_cmd_get,
1431          .dumpit = ovs_flow_cmd_dump
1432        },
1433        { .cmd = OVS_FLOW_CMD_SET,
1434          .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
1435          .policy = flow_policy,
1436          .doit = ovs_flow_cmd_set,
1437        },
1438};
1439
1440static struct genl_family dp_flow_genl_family = {
1441        .id = GENL_ID_GENERATE,
1442        .hdrsize = sizeof(struct ovs_header),
1443        .name = OVS_FLOW_FAMILY,
1444        .version = OVS_FLOW_VERSION,
1445        .maxattr = OVS_FLOW_ATTR_MAX,
1446        .netnsok = true,
1447        .parallel_ops = true,
1448        .ops = dp_flow_genl_ops,
1449        .n_ops = ARRAY_SIZE(dp_flow_genl_ops),
1450        .mcgrps = &ovs_dp_flow_multicast_group,
1451        .n_mcgrps = 1,
1452};
1453
1454static size_t ovs_dp_cmd_msg_size(void)
1455{
1456        size_t msgsize = NLMSG_ALIGN(sizeof(struct ovs_header));
1457
1458        msgsize += nla_total_size(IFNAMSIZ);
1459        msgsize += nla_total_size_64bit(sizeof(struct ovs_dp_stats));
1460        msgsize += nla_total_size_64bit(sizeof(struct ovs_dp_megaflow_stats));
1461        msgsize += nla_total_size(sizeof(u32)); /* OVS_DP_ATTR_USER_FEATURES */
1462
1463        return msgsize;
1464}
1465
1466/* Called with ovs_mutex. */
1467static int ovs_dp_cmd_fill_info(struct datapath *dp, struct sk_buff *skb,
1468                                u32 portid, u32 seq, u32 flags, u8 cmd)
1469{
1470        struct ovs_header *ovs_header;
1471        struct ovs_dp_stats dp_stats;
1472        struct ovs_dp_megaflow_stats dp_megaflow_stats;
1473        int err;
1474
1475        ovs_header = genlmsg_put(skb, portid, seq, &dp_datapath_genl_family,
1476                                   flags, cmd);
1477        if (!ovs_header)
1478                goto error;
1479
1480        ovs_header->dp_ifindex = get_dpifindex(dp);
1481
1482        err = nla_put_string(skb, OVS_DP_ATTR_NAME, ovs_dp_name(dp));
1483        if (err)
1484                goto nla_put_failure;
1485
1486        get_dp_stats(dp, &dp_stats, &dp_megaflow_stats);
1487        if (nla_put_64bit(skb, OVS_DP_ATTR_STATS, sizeof(struct ovs_dp_stats),
1488                          &dp_stats, OVS_DP_ATTR_PAD))
1489                goto nla_put_failure;
1490
1491        if (nla_put_64bit(skb, OVS_DP_ATTR_MEGAFLOW_STATS,
1492                          sizeof(struct ovs_dp_megaflow_stats),
1493                          &dp_megaflow_stats, OVS_DP_ATTR_PAD))
1494                goto nla_put_failure;
1495
1496        if (nla_put_u32(skb, OVS_DP_ATTR_USER_FEATURES, dp->user_features))
1497                goto nla_put_failure;
1498
1499        genlmsg_end(skb, ovs_header);
1500        return 0;
1501
1502nla_put_failure:
1503        genlmsg_cancel(skb, ovs_header);
1504error:
1505        return -EMSGSIZE;
1506}
1507
1508static struct sk_buff *ovs_dp_cmd_alloc_info(void)
1509{
1510        return genlmsg_new(ovs_dp_cmd_msg_size(), GFP_KERNEL);
1511}
1512
1513/* Called with rcu_read_lock or ovs_mutex.  Looks the datapath up by
 * OVS_DP_ATTR_NAME if present (the name must be that of the datapath's
 * local internal port), otherwise by the dp_ifindex in the ovs_header.
 */
1514static struct datapath *lookup_datapath(struct net *net,
1515                                        const struct ovs_header *ovs_header,
1516                                        struct nlattr *a[OVS_DP_ATTR_MAX + 1])
1517{
1518        struct datapath *dp;
1519
1520        if (!a[OVS_DP_ATTR_NAME])
1521                dp = get_dp(net, ovs_header->dp_ifindex);
1522        else {
1523                struct vport *vport;
1524
1525                vport = ovs_vport_locate(net, nla_data(a[OVS_DP_ATTR_NAME]));
1526                dp = vport && vport->port_no == OVSP_LOCAL ? vport->dp : NULL;
1527        }
1528        return dp ? dp : ERR_PTR(-ENODEV);
1529}
1530
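/* Clear the user_features of the datapath named in a failed OVS_DP_CMD_NEW
 * request issued by an outdated userspace (see ovs_dp_cmd_new()).
 */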
1531static void ovs_dp_reset_user_features(struct sk_buff *skb, struct genl_info *info)
1532{
1533        struct datapath *dp;
1534
1535        dp = lookup_datapath(sock_net(skb->sk), info->userhdr, info->attrs);
1536        if (IS_ERR(dp))
1537                return;
1538
1539        WARN(dp->user_features, "Dropping previously announced user features\n");
1540        dp->user_features = 0;
1541}
1542
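/* Apply a userspace-supplied OVS_DP_ATTR_USER_FEATURES attribute, if any.
 * Called either with ovs_mutex held or on a datapath that is not yet
 * visible to other threads (see ovs_dp_cmd_new()).
 */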
1543static void ovs_dp_change(struct datapath *dp, struct nlattr *a[])
1544{
1545        if (a[OVS_DP_ATTR_USER_FEATURES])
1546                dp->user_features = nla_get_u32(a[OVS_DP_ATTR_USER_FEATURES]);
1547}
1548
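/* OVS_DP_CMD_NEW: create a datapath together with its OVSP_LOCAL internal
 * port.  OVS_DP_ATTR_NAME and OVS_DP_ATTR_UPCALL_PID are mandatory;
 * OVS_DP_ATTR_USER_FEATURES is optional.  On success the reply is
 * broadcast to the datapath multicast group.
 */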
1549static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info)
1550{
1551        struct nlattr **a = info->attrs;
1552        struct vport_parms parms;
1553        struct sk_buff *reply;
1554        struct datapath *dp;
1555        struct vport *vport;
1556        struct ovs_net *ovs_net;
1557        int err, i;
1558
1559        err = -EINVAL;
1560        if (!a[OVS_DP_ATTR_NAME] || !a[OVS_DP_ATTR_UPCALL_PID])
1561                goto err;
1562
1563        reply = ovs_dp_cmd_alloc_info();
1564        if (!reply)
1565                return -ENOMEM;
1566
1567        err = -ENOMEM;
1568        dp = kzalloc(sizeof(*dp), GFP_KERNEL);
1569        if (!dp)
1570                goto err_free_reply;
1571
1572        ovs_dp_set_net(dp, sock_net(skb->sk));
1573
1574        /* Allocate table. */
1575        err = ovs_flow_tbl_init(&dp->table);
1576        if (err)
1577                goto err_free_dp;
1578
1579        dp->stats_percpu = netdev_alloc_pcpu_stats(struct dp_stats_percpu);
1580        if (!dp->stats_percpu) {
1581                err = -ENOMEM;
1582                goto err_destroy_table;
1583        }
1584
1585        dp->ports = kmalloc_array(DP_VPORT_HASH_BUCKETS,
1586                                   sizeof(struct hlist_head), GFP_KERNEL);
1587        if (!dp->ports) {
1588                err = -ENOMEM;
1589                goto err_destroy_percpu;
1590        }
1591
1592        for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++)
1593                INIT_HLIST_HEAD(&dp->ports[i]);
1594
1595        /* Set up our datapath device. */
1596        parms.name = nla_data(a[OVS_DP_ATTR_NAME]);
1597        parms.type = OVS_VPORT_TYPE_INTERNAL;
1598        parms.options = NULL;
1599        parms.dp = dp;
1600        parms.port_no = OVSP_LOCAL;
1601        parms.upcall_portids = a[OVS_DP_ATTR_UPCALL_PID];
1602
1603        ovs_dp_change(dp, a);
1604
1605        /* So far only local changes have been made; now we need the lock. */
1606        ovs_lock();
1607
1608        vport = new_vport(&parms);
1609        if (IS_ERR(vport)) {
1610                err = PTR_ERR(vport);
1611                if (err == -EBUSY)
1612                        err = -EEXIST;
1613
1614                if (err == -EEXIST) {
1615                        /* An outdated userspace instance that does not understand
1616                         * the concept of user_features has attempted to create a new
1617                         * datapath and will likely reuse the existing one.  Drop all
1618                         * previously announced user features. */
1619                        if (info->genlhdr->version < OVS_DP_VER_FEATURES)
1620                                ovs_dp_reset_user_features(skb, info);
1621                }
1622
1623                goto err_destroy_ports_array;
1624        }
1625
1626        err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid,
1627                                   info->snd_seq, 0, OVS_DP_CMD_NEW);
1628        BUG_ON(err < 0);
1629
1630        ovs_net = net_generic(ovs_dp_get_net(dp), ovs_net_id);
1631        list_add_tail_rcu(&dp->list_node, &ovs_net->dps);
1632
1633        ovs_unlock();
1634
1635        ovs_notify(&dp_datapath_genl_family, reply, info);
1636        return 0;
1637
1638err_destroy_ports_array:
1639        ovs_unlock();
1640        kfree(dp->ports);
1641err_destroy_percpu:
1642        free_percpu(dp->stats_percpu);
1643err_destroy_table:
1644        ovs_flow_tbl_destroy(&dp->table);
1645err_free_dp:
1646        kfree(dp);
1647err_free_reply:
1648        kfree_skb(reply);
1649err:
1650        return err;
1651}
1652
1653/* Called with ovs_mutex. */
1654static void __dp_destroy(struct datapath *dp)
1655{
1656        int i;
1657
1658        for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) {
1659                struct vport *vport;
1660                struct hlist_node *n;
1661
1662                hlist_for_each_entry_safe(vport, n, &dp->ports[i], dp_hash_node)
1663                        if (vport->port_no != OVSP_LOCAL)
1664                                ovs_dp_detach_port(vport);
1665        }
1666
1667        list_del_rcu(&dp->list_node);
1668
1669        /* OVSP_LOCAL is the datapath's internal port.  All other ports must be
1670         * destroyed first, so detach it last, just before freeing the datapath.
1671         */
1672        ovs_dp_detach_port(ovs_vport_ovsl(dp, OVSP_LOCAL));
1673
1674        /* Free the datapath, including its flow table, after an RCU grace period. */
1675        call_rcu(&dp->rcu, destroy_dp_rcu);
1676}
1677
1678static int ovs_dp_cmd_del(struct sk_buff *skb, struct genl_info *info)
1679{
1680        struct sk_buff *reply;
1681        struct datapath *dp;
1682        int err;
1683
1684        reply = ovs_dp_cmd_alloc_info();
1685        if (!reply)
1686                return -ENOMEM;
1687
1688        ovs_lock();
1689        dp = lookup_datapath(sock_net(skb->sk), info->userhdr, info->attrs);
1690        err = PTR_ERR(dp);
1691        if (IS_ERR(dp))
1692                goto err_unlock_free;
1693
1694        err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid,
1695                                   info->snd_seq, 0, OVS_DP_CMD_DEL);
1696        BUG_ON(err < 0);
1697
1698        __dp_destroy(dp);
1699        ovs_unlock();
1700
1701        ovs_notify(&dp_datapath_genl_family, reply, info);
1702
1703        return 0;
1704
1705err_unlock_free:
1706        ovs_unlock();
1707        kfree_skb(reply);
1708        return err;
1709}
1710
1711static int ovs_dp_cmd_set(struct sk_buff *skb, struct genl_info *info)
1712{
1713        struct sk_buff *reply;
1714        struct datapath *dp;
1715        int err;
1716
1717        reply = ovs_dp_cmd_alloc_info();
1718        if (!reply)
1719                return -ENOMEM;
1720
1721        ovs_lock();
1722        dp = lookup_datapath(sock_net(skb->sk), info->userhdr, info->attrs);
1723        err = PTR_ERR(dp);
1724        if (IS_ERR(dp))
1725                goto err_unlock_free;
1726
1727        ovs_dp_change(dp, info->attrs);
1728
1729        err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid,
1730                                   info->snd_seq, 0, OVS_DP_CMD_NEW);
1731        BUG_ON(err < 0);
1732
1733        ovs_unlock();
1734        ovs_notify(&dp_datapath_genl_family, reply, info);
1735
1736        return 0;
1737
1738err_unlock_free:
1739        ovs_unlock();
1740        kfree_skb(reply);
1741        return err;
1742}
1743
1744static int ovs_dp_cmd_get(struct sk_buff *skb, struct genl_info *info)
1745{
1746        struct sk_buff *reply;
1747        struct datapath *dp;
1748        int err;
1749
1750        reply = ovs_dp_cmd_alloc_info();
1751        if (!reply)
1752                return -ENOMEM;
1753
1754        ovs_lock();
1755        dp = lookup_datapath(sock_net(skb->sk), info->userhdr, info->attrs);
1756        if (IS_ERR(dp)) {
1757                err = PTR_ERR(dp);
1758                goto err_unlock_free;
1759        }
1760        err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid,
1761                                   info->snd_seq, 0, OVS_DP_CMD_NEW);
1762        BUG_ON(err < 0);
1763        ovs_unlock();
1764
1765        return genlmsg_reply(reply, info);
1766
1767err_unlock_free:
1768        ovs_unlock();
1769        kfree_skb(reply);
1770        return err;
1771}
1772
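/* Dump all datapaths in the caller's net namespace.  cb->args[0] holds the
 * number of entries already dumped so the walk can resume across successive
 * dump calls.
 */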
1773static int ovs_dp_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb)
1774{
1775        struct ovs_net *ovs_net = net_generic(sock_net(skb->sk), ovs_net_id);
1776        struct datapath *dp;
1777        int skip = cb->args[0];
1778        int i = 0;
1779
1780        ovs_lock();
1781        list_for_each_entry(dp, &ovs_net->dps, list_node) {
1782                if (i >= skip &&
1783                    ovs_dp_cmd_fill_info(dp, skb, NETLINK_CB(cb->skb).portid,
1784                                         cb->nlh->nlmsg_seq, NLM_F_MULTI,
1785                                         OVS_DP_CMD_NEW) < 0)
1786                        break;
1787                i++;
1788        }
1789        ovs_unlock();
1790
1791        cb->args[0] = i;
1792
1793        return skb->len;
1794}
1795
1796static const struct nla_policy datapath_policy[OVS_DP_ATTR_MAX + 1] = {
1797        [OVS_DP_ATTR_NAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 },
1798        [OVS_DP_ATTR_UPCALL_PID] = { .type = NLA_U32 },
1799        [OVS_DP_ATTR_USER_FEATURES] = { .type = NLA_U32 },
1800};
1801
1802static const struct genl_ops dp_datapath_genl_ops[] = {
1803        { .cmd = OVS_DP_CMD_NEW,
1804          .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
1805          .policy = datapath_policy,
1806          .doit = ovs_dp_cmd_new
1807        },
1808        { .cmd = OVS_DP_CMD_DEL,
1809          .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
1810          .policy = datapath_policy,
1811          .doit = ovs_dp_cmd_del
1812        },
1813        { .cmd = OVS_DP_CMD_GET,
1814          .flags = 0,               /* OK for unprivileged users. */
1815          .policy = datapath_policy,
1816          .doit = ovs_dp_cmd_get,
1817          .dumpit = ovs_dp_cmd_dump
1818        },
1819        { .cmd = OVS_DP_CMD_SET,
1820          .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
1821          .policy = datapath_policy,
1822          .doit = ovs_dp_cmd_set,
1823        },
1824};
1825
1826static struct genl_family dp_datapath_genl_family = {
1827        .id = GENL_ID_GENERATE,
1828        .hdrsize = sizeof(struct ovs_header),
1829        .name = OVS_DATAPATH_FAMILY,
1830        .version = OVS_DATAPATH_VERSION,
1831        .maxattr = OVS_DP_ATTR_MAX,
1832        .netnsok = true,
1833        .parallel_ops = true,
1834        .ops = dp_datapath_genl_ops,
1835        .n_ops = ARRAY_SIZE(dp_datapath_genl_ops),
1836        .mcgrps = &ovs_dp_datapath_multicast_group,
1837        .n_mcgrps = 1,
1838};
1839
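/* For illustration, a minimal userspace sketch of creating a datapath
 * through this family.  This assumes libnl-3 (not part of this file); the
 * datapath name "dp0" is arbitrary and error handling is omitted:
 *
 *	struct nl_sock *sk = nl_socket_alloc();
 *	struct nl_msg *msg;
 *	struct ovs_header *hdr;
 *	int fam;
 *
 *	genl_connect(sk);
 *	fam = genl_ctrl_resolve(sk, OVS_DATAPATH_FAMILY);
 *	msg = nlmsg_alloc();
 *	hdr = genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ, fam,
 *			  sizeof(*hdr), NLM_F_REQUEST | NLM_F_ACK,
 *			  OVS_DP_CMD_NEW, OVS_DATAPATH_VERSION);
 *	hdr->dp_ifindex = 0;
 *	nla_put_string(msg, OVS_DP_ATTR_NAME, "dp0");
 *	nla_put_u32(msg, OVS_DP_ATTR_UPCALL_PID,
 *		    nl_socket_get_local_port(sk));
 *	nl_send_auto(sk, msg);
 *	nl_wait_for_ack(sk);
 */
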
1840/* Called with ovs_mutex or RCU read lock. */
1841static int ovs_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb,
1842                                   u32 portid, u32 seq, u32 flags, u8 cmd)
1843{
1844        struct ovs_header *ovs_header;
1845        struct ovs_vport_stats vport_stats;
1846        int err;
1847
1848        ovs_header = genlmsg_put(skb, portid, seq, &dp_vport_genl_family,
1849                                 flags, cmd);
1850        if (!ovs_header)
1851                return -EMSGSIZE;
1852
1853        ovs_header->dp_ifindex = get_dpifindex(vport->dp);
1854
1855        if (nla_put_u32(skb, OVS_VPORT_ATTR_PORT_NO, vport->port_no) ||
1856            nla_put_u32(skb, OVS_VPORT_ATTR_TYPE, vport->ops->type) ||
1857            nla_put_string(skb, OVS_VPORT_ATTR_NAME,
1858                           ovs_vport_name(vport)))
1859                goto nla_put_failure;
1860
1861        ovs_vport_get_stats(vport, &vport_stats);
1862        if (nla_put_64bit(skb, OVS_VPORT_ATTR_STATS,
1863                          sizeof(struct ovs_vport_stats), &vport_stats,
1864                          OVS_VPORT_ATTR_PAD))
1865                goto nla_put_failure;
1866
1867        if (ovs_vport_get_upcall_portids(vport, skb))
1868                goto nla_put_failure;
1869
1870        err = ovs_vport_get_options(vport, skb);
1871        if (err == -EMSGSIZE)
1872                goto error;
1873
1874        genlmsg_end(skb, ovs_header);
1875        return 0;
1876
1877nla_put_failure:
1878        err = -EMSGSIZE;
1879error:
1880        genlmsg_cancel(skb, ovs_header);
1881        return err;
1882}
1883
1884static struct sk_buff *ovs_vport_cmd_alloc_info(void)
1885{
1886        return nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
1887}
1888
1889/* Called with ovs_mutex, only via ovs_dp_notify_wq(). */
1890struct sk_buff *ovs_vport_cmd_build_info(struct vport *vport, u32 portid,
1891                                         u32 seq, u8 cmd)
1892{
1893        struct sk_buff *skb;
1894        int retval;
1895
1896        skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
1897        if (!skb)
1898                return ERR_PTR(-ENOMEM);
1899
1900        retval = ovs_vport_cmd_fill_info(vport, skb, portid, seq, 0, cmd);
1901        BUG_ON(retval < 0);
1902
1903        return skb;
1904}
1905
1906/* Called with ovs_mutex or RCU read lock. */
1907static struct vport *lookup_vport(struct net *net,
1908                                  const struct ovs_header *ovs_header,
1909                                  struct nlattr *a[OVS_VPORT_ATTR_MAX + 1])
1910{
1911        struct datapath *dp;
1912        struct vport *vport;
1913
1914        if (a[OVS_VPORT_ATTR_NAME]) {
1915                vport = ovs_vport_locate(net, nla_data(a[OVS_VPORT_ATTR_NAME]));
1916                if (!vport)
1917                        return ERR_PTR(-ENODEV);
1918                if (ovs_header->dp_ifindex &&
1919                    ovs_header->dp_ifindex != get_dpifindex(vport->dp))
1920                        return ERR_PTR(-ENODEV);
1921                return vport;
1922        } else if (a[OVS_VPORT_ATTR_PORT_NO]) {
1923                u32 port_no = nla_get_u32(a[OVS_VPORT_ATTR_PORT_NO]);
1924
1925                if (port_no >= DP_MAX_PORTS)
1926                        return ERR_PTR(-EFBIG);
1927
1928                dp = get_dp(net, ovs_header->dp_ifindex);
1929                if (!dp)
1930                        return ERR_PTR(-ENODEV);
1931
1932                vport = ovs_vport_ovsl_rcu(dp, port_no);
1933                if (!vport)
1934                        return ERR_PTR(-ENODEV);
1935                return vport;
1936        } else {
1937                return ERR_PTR(-EINVAL);
        }
1938}
1939
1940/* Recompute the maximum forwarding headroom over all vport devices and
 * propagate it as the RX headroom of every vport device.  Called with
 * ovs_mutex.
 */
1941static void update_headroom(struct datapath *dp)
1942{
1943        unsigned int dev_headroom, max_headroom = 0;
1944        struct net_device *dev;
1945        struct vport *vport;
1946        int i;
1947
1948        for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) {
1949                hlist_for_each_entry_rcu(vport, &dp->ports[i], dp_hash_node) {
1950                        dev = vport->dev;
1951                        dev_headroom = netdev_get_fwd_headroom(dev);
1952                        if (dev_headroom > max_headroom)
1953                                max_headroom = dev_headroom;
1954                }
1955        }
1956
1957        dp->max_headroom = max_headroom;
1958        for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++)
1959                hlist_for_each_entry_rcu(vport, &dp->ports[i], dp_hash_node)
1960                        netdev_set_rx_headroom(vport->dev, max_headroom);
1961}
1962
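/* OVS_VPORT_CMD_NEW: attach a new vport to a datapath.  If the request
 * carries OVS_VPORT_ATTR_PORT_NO, that number is used as-is (-EBUSY if it
 * is already taken); otherwise the first free number starting from 1 is
 * picked.  new_vport() returning -EAGAIN means the matching vport module
 * was just loaded, so the whole lookup is restarted.
 */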
1963static int ovs_vport_cmd_new(struct sk_buff *skb, struct genl_info *info)
1964{
1965        struct nlattr **a = info->attrs;
1966        struct ovs_header *ovs_header = info->userhdr;
1967        struct vport_parms parms;
1968        struct sk_buff *reply;
1969        struct vport *vport;
1970        struct datapath *dp;
1971        u32 port_no;
1972        int err;
1973
1974        if (!a[OVS_VPORT_ATTR_NAME] || !a[OVS_VPORT_ATTR_TYPE] ||
1975            !a[OVS_VPORT_ATTR_UPCALL_PID])
1976                return -EINVAL;
1977
1978        port_no = a[OVS_VPORT_ATTR_PORT_NO]
1979                ? nla_get_u32(a[OVS_VPORT_ATTR_PORT_NO]) : 0;
1980        if (port_no >= DP_MAX_PORTS)
1981                return -EFBIG;
1982
1983        reply = ovs_vport_cmd_alloc_info();
1984        if (!reply)
1985                return -ENOMEM;
1986
1987        ovs_lock();
1988restart:
1989        dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex);
1990        err = -ENODEV;
1991        if (!dp)
1992                goto exit_unlock_free;
1993
1994        if (port_no) {
1995                vport = ovs_vport_ovsl(dp, port_no);
1996                err = -EBUSY;
1997                if (vport)
1998                        goto exit_unlock_free;
1999        } else {
2000                for (port_no = 1; ; port_no++) {
2001                        if (port_no >= DP_MAX_PORTS) {
2002                                err = -EFBIG;
2003                                goto exit_unlock_free;
2004                        }
2005                        vport = ovs_vport_ovsl(dp, port_no);
2006                        if (!vport)
2007                                break;
2008                }
2009        }
2010
2011        parms.name = nla_data(a[OVS_VPORT_ATTR_NAME]);
2012        parms.type = nla_get_u32(a[OVS_VPORT_ATTR_TYPE]);
2013        parms.options = a[OVS_VPORT_ATTR_OPTIONS];
2014        parms.dp = dp;
2015        parms.port_no = port_no;
2016        parms.upcall_portids = a[OVS_VPORT_ATTR_UPCALL_PID];
2017
2018        vport = new_vport(&parms);
2019        err = PTR_ERR(vport);
2020        if (IS_ERR(vport)) {
2021                if (err == -EAGAIN)
2022                        goto restart;
2023                goto exit_unlock_free;
2024        }
2025
2026        err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid,
2027                                      info->snd_seq, 0, OVS_VPORT_CMD_NEW);
2028
2029        if (netdev_get_fwd_headroom(vport->dev) > dp->max_headroom)
2030                update_headroom(dp);
2031        else
2032                netdev_set_rx_headroom(vport->dev, dp->max_headroom);
2033
2034        BUG_ON(err < 0);
2035        ovs_unlock();
2036
2037        ovs_notify(&dp_vport_genl_family, reply, info);
2038        return 0;
2039
2040exit_unlock_free:
2041        ovs_unlock();
2042        kfree_skb(reply);
2043        return err;
2044}
2045
2046static int ovs_vport_cmd_set(struct sk_buff *skb, struct genl_info *info)
2047{
2048        struct nlattr **a = info->attrs;
2049        struct sk_buff *reply;
2050        struct vport *vport;
2051        int err;
2052
2053        reply = ovs_vport_cmd_alloc_info();
2054        if (!reply)
2055                return -ENOMEM;
2056
2057        ovs_lock();
2058        vport = lookup_vport(sock_net(skb->sk), info->userhdr, a);
2059        err = PTR_ERR(vport);
2060        if (IS_ERR(vport))
2061                goto exit_unlock_free;
2062
2063        if (a[OVS_VPORT_ATTR_TYPE] &&
2064            nla_get_u32(a[OVS_VPORT_ATTR_TYPE]) != vport->ops->type) {
2065                err = -EINVAL;
2066                goto exit_unlock_free;
2067        }
2068
2069        if (a[OVS_VPORT_ATTR_OPTIONS]) {
2070                err = ovs_vport_set_options(vport, a[OVS_VPORT_ATTR_OPTIONS]);
2071                if (err)
2072                        goto exit_unlock_free;
2073        }
2074
2076        if (a[OVS_VPORT_ATTR_UPCALL_PID]) {
2077                struct nlattr *ids = a[OVS_VPORT_ATTR_UPCALL_PID];
2078
2079                err = ovs_vport_set_upcall_portids(vport, ids);
2080                if (err)
2081                        goto exit_unlock_free;
2082        }
2083
2084        err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid,
2085                                      info->snd_seq, 0, OVS_VPORT_CMD_NEW);
2086        BUG_ON(err < 0);
2087
2088        ovs_unlock();
2089        ovs_notify(&dp_vport_genl_family, reply, info);
2090        return 0;
2091
2092exit_unlock_free:
2093        ovs_unlock();
2094        kfree_skb(reply);
2095        return err;
2096}
2097
2098static int ovs_vport_cmd_del(struct sk_buff *skb, struct genl_info *info)
2099{
2100        bool must_update_headroom = false;
2101        struct nlattr **a = info->attrs;
2102        struct sk_buff *reply;
2103        struct datapath *dp;
2104        struct vport *vport;
2105        int err;
2106
2107        reply = ovs_vport_cmd_alloc_info();
2108        if (!reply)
2109                return -ENOMEM;
2110
2111        ovs_lock();
2112        vport = lookup_vport(sock_net(skb->sk), info->userhdr, a);
2113        err = PTR_ERR(vport);
2114        if (IS_ERR(vport))
2115                goto exit_unlock_free;
2116
2117        if (vport->port_no == OVSP_LOCAL) {
2118                err = -EINVAL;
2119                goto exit_unlock_free;
2120        }
2121
2122        err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid,
2123                                      info->snd_seq, 0, OVS_VPORT_CMD_DEL);
2124        BUG_ON(err < 0);
2125
2126        /* The vport deletion may require a datapath headroom update. */
2127        dp = vport->dp;
2128        if (netdev_get_fwd_headroom(vport->dev) == dp->max_headroom)
2129                must_update_headroom = true;
2130        netdev_reset_rx_headroom(vport->dev);
2131        ovs_dp_detach_port(vport);
2132
2133        if (must_update_headroom)
2134                update_headroom(dp);
2135        ovs_unlock();
2136
2137        ovs_notify(&dp_vport_genl_family, reply, info);
2138        return 0;
2139
2140exit_unlock_free:
2141        ovs_unlock();
2142        kfree_skb(reply);
2143        return err;
2144}
2145
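/* Read-only vport lookup; runs under rcu_read_lock() instead of taking
 * ovs_mutex.
 */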
2146static int ovs_vport_cmd_get(struct sk_buff *skb, struct genl_info *info)
2147{
2148        struct nlattr **a = info->attrs;
2149        struct ovs_header *ovs_header = info->userhdr;
2150        struct sk_buff *reply;
2151        struct vport *vport;
2152        int err;
2153
2154        reply = ovs_vport_cmd_alloc_info();
2155        if (!reply)
2156                return -ENOMEM;
2157
2158        rcu_read_lock();
2159        vport = lookup_vport(sock_net(skb->sk), ovs_header, a);
2160        err = PTR_ERR(vport);
2161        if (IS_ERR(vport))
2162                goto exit_unlock_free;
2163        err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid,
2164                                      info->snd_seq, 0, OVS_VPORT_CMD_NEW);
2165        BUG_ON(err < 0);
2166        rcu_read_unlock();
2167
2168        return genlmsg_reply(reply, info);
2169
2170exit_unlock_free:
2171        rcu_read_unlock();
2172        kfree_skb(reply);
2173        return err;
2174}
2175
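/* Dump every vport of one datapath.  The resume state spans two values:
 * cb->args[0] is the hash bucket and cb->args[1] the offset within it.
 */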
2176static int ovs_vport_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb)
2177{
2178        struct ovs_header *ovs_header = genlmsg_data(nlmsg_data(cb->nlh));
2179        struct datapath *dp;
2180        int bucket = cb->args[0], skip = cb->args[1];
2181        int i, j = 0;
2182
2183        rcu_read_lock();
2184        dp = get_dp_rcu(sock_net(skb->sk), ovs_header->dp_ifindex);
2185        if (!dp) {
2186                rcu_read_unlock();
2187                return -ENODEV;
2188        }
2189        for (i = bucket; i < DP_VPORT_HASH_BUCKETS; i++) {
2190                struct vport *vport;
2191
2192                j = 0;
2193                hlist_for_each_entry_rcu(vport, &dp->ports[i], dp_hash_node) {
2194                        if (j >= skip &&
2195                            ovs_vport_cmd_fill_info(vport, skb,
2196                                                    NETLINK_CB(cb->skb).portid,
2197                                                    cb->nlh->nlmsg_seq,
2198                                                    NLM_F_MULTI,
2199                                                    OVS_VPORT_CMD_NEW) < 0)
2200                                goto out;
2201
2202                        j++;
2203                }
2204                skip = 0;
2205        }
2206out:
2207        rcu_read_unlock();
2208
2209        cb->args[0] = i;
2210        cb->args[1] = j;
2211
2212        return skb->len;
2213}
2214
2215static const struct nla_policy vport_policy[OVS_VPORT_ATTR_MAX + 1] = {
2216        [OVS_VPORT_ATTR_NAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 },
2217        [OVS_VPORT_ATTR_STATS] = { .len = sizeof(struct ovs_vport_stats) },
2218        [OVS_VPORT_ATTR_PORT_NO] = { .type = NLA_U32 },
2219        [OVS_VPORT_ATTR_TYPE] = { .type = NLA_U32 },
2220        [OVS_VPORT_ATTR_UPCALL_PID] = { .type = NLA_U32 },
2221        [OVS_VPORT_ATTR_OPTIONS] = { .type = NLA_NESTED },
2222};
2223
2224static const struct genl_ops dp_vport_genl_ops[] = {
2225        { .cmd = OVS_VPORT_CMD_NEW,
2226          .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
2227          .policy = vport_policy,
2228          .doit = ovs_vport_cmd_new
2229        },
2230        { .cmd = OVS_VPORT_CMD_DEL,
2231          .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
2232          .policy = vport_policy,
2233          .doit = ovs_vport_cmd_del
2234        },
2235        { .cmd = OVS_VPORT_CMD_GET,
2236          .flags = 0,               /* OK for unprivileged users. */
2237          .policy = vport_policy,
2238          .doit = ovs_vport_cmd_get,
2239          .dumpit = ovs_vport_cmd_dump
2240        },
2241        { .cmd = OVS_VPORT_CMD_SET,
2242          .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
2243          .policy = vport_policy,
2244          .doit = ovs_vport_cmd_set,
2245        },
2246};
2247
2248struct genl_family dp_vport_genl_family = {
2249        .id = GENL_ID_GENERATE,
2250        .hdrsize = sizeof(struct ovs_header),
2251        .name = OVS_VPORT_FAMILY,
2252        .version = OVS_VPORT_VERSION,
2253        .maxattr = OVS_VPORT_ATTR_MAX,
2254        .netnsok = true,
2255        .parallel_ops = true,
2256        .ops = dp_vport_genl_ops,
2257        .n_ops = ARRAY_SIZE(dp_vport_genl_ops),
2258        .mcgrps = &ovs_dp_vport_multicast_group,
2259        .n_mcgrps = 1,
2260};
2261
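/* For illustration, the attributes a userspace request would carry to add a
 * network device to a datapath through this family (a sketch; the device
 * name is arbitrary):
 *
 *	Generic Netlink header:    cmd = OVS_VPORT_CMD_NEW
 *	struct ovs_header:         dp_ifindex of the target datapath
 *	OVS_VPORT_ATTR_NAME:       "eth0"
 *	OVS_VPORT_ATTR_TYPE:       OVS_VPORT_TYPE_NETDEV
 *	OVS_VPORT_ATTR_UPCALL_PID: Netlink port ID(s) for upcalls
 *	OVS_VPORT_ATTR_PORT_NO:    optional; omit to auto-allocate
 */
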
2262static struct genl_family * const dp_genl_families[] = {
2263        &dp_datapath_genl_family,
2264        &dp_vport_genl_family,
2265        &dp_flow_genl_family,
2266        &dp_packet_genl_family,
2267};
2268
2269static void dp_unregister_genl(int n_families)
2270{
2271        int i;
2272
2273        for (i = 0; i < n_families; i++)
2274                genl_unregister_family(dp_genl_families[i]);
2275}
2276
2277static int dp_register_genl(void)
2278{
2279        int err;
2280        int i;
2281
2282        for (i = 0; i < ARRAY_SIZE(dp_genl_families); i++) {
2284                err = genl_register_family(dp_genl_families[i]);
2285                if (err)
2286                        goto error;
2287        }
2288
2289        return 0;
2290
2291error:
2292        dp_unregister_genl(i);
2293        return err;
2294}
2295
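/* Per-net-namespace init: set up the namespace's datapath list, the
 * deferred device-notification work and the conntrack state.
 */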
2296static int __net_init ovs_init_net(struct net *net)
2297{
2298        struct ovs_net *ovs_net = net_generic(net, ovs_net_id);
2299
2300        INIT_LIST_HEAD(&ovs_net->dps);
2301        INIT_WORK(&ovs_net->dp_notify_work, ovs_dp_notify_wq);
2302        ovs_ct_init(net);
2303        return 0;
2304}
2305
2306static void __net_exit list_vports_from_net(struct net *net, struct net *dnet,
2307                                            struct list_head *head)
2308{
2309        struct ovs_net *ovs_net = net_generic(net, ovs_net_id);
2310        struct datapath *dp;
2311
2312        list_for_each_entry(dp, &ovs_net->dps, list_node) {
2313                int i;
2314
2315                for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) {
2316                        struct vport *vport;
2317
2318                        hlist_for_each_entry(vport, &dp->ports[i], dp_hash_node) {
2319                                if (vport->ops->type != OVS_VPORT_TYPE_INTERNAL)
2320                                        continue;
2321
2322                                if (dev_net(vport->dev) == dnet)
2323                                        list_add(&vport->detach_list, head);
2324                        }
2325                }
2326        }
2327}
2328
2329static void __net_exit ovs_exit_net(struct net *dnet)
2330{
2331        struct datapath *dp, *dp_next;
2332        struct ovs_net *ovs_net = net_generic(dnet, ovs_net_id);
2333        struct vport *vport, *vport_next;
2334        struct net *net;
2335        LIST_HEAD(head);
2336
2337        ovs_ct_exit(dnet);
2338        ovs_lock();
2339        list_for_each_entry_safe(dp, dp_next, &ovs_net->dps, list_node)
2340                __dp_destroy(dp);
2341
2342        rtnl_lock();
2343        for_each_net(net)
2344                list_vports_from_net(net, dnet, &head);
2345        rtnl_unlock();
2346
2347        /* Detach all internal vports whose device lives in the exiting namespace. */
2348        list_for_each_entry_safe(vport, vport_next, &head, detach_list) {
2349                list_del(&vport->detach_list);
2350                ovs_dp_detach_port(vport);
2351        }
2352
2353        ovs_unlock();
2354
2355        cancel_work_sync(&ovs_net->dp_notify_work);
2356}
2357
2358static struct pernet_operations ovs_net_ops = {
2359        .init = ovs_init_net,
2360        .exit = ovs_exit_net,
2361        .id   = &ovs_net_id,
2362        .size = sizeof(struct ovs_net),
2363};
2364
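/* Module init.  Subsystems are registered in dependency order; the error
 * ladder below unwinds them in reverse.
 */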
2365static int __init dp_init(void)
2366{
2367        int err;
2368
2369        BUILD_BUG_ON(sizeof(struct ovs_skb_cb) > FIELD_SIZEOF(struct sk_buff, cb));
2370
2371        pr_info("Open vSwitch switching datapath\n");
2372
2373        err = action_fifos_init();
2374        if (err)
2375                goto error;
2376
2377        err = ovs_internal_dev_rtnl_link_register();
2378        if (err)
2379                goto error_action_fifos_exit;
2380
2381        err = ovs_flow_init();
2382        if (err)
2383                goto error_unreg_rtnl_link;
2384
2385        err = ovs_vport_init();
2386        if (err)
2387                goto error_flow_exit;
2388
2389        err = register_pernet_device(&ovs_net_ops);
2390        if (err)
2391                goto error_vport_exit;
2392
2393        err = register_netdevice_notifier(&ovs_dp_device_notifier);
2394        if (err)
2395                goto error_netns_exit;
2396
2397        err = ovs_netdev_init();
2398        if (err)
2399                goto error_unreg_notifier;
2400
2401        err = dp_register_genl();
2402        if (err < 0)
2403                goto error_unreg_netdev;
2404
2405        return 0;
2406
2407error_unreg_netdev:
2408        ovs_netdev_exit();
2409error_unreg_notifier:
2410        unregister_netdevice_notifier(&ovs_dp_device_notifier);
2411error_netns_exit:
2412        unregister_pernet_device(&ovs_net_ops);
2413error_vport_exit:
2414        ovs_vport_exit();
2415error_flow_exit:
2416        ovs_flow_exit();
2417error_unreg_rtnl_link:
2418        ovs_internal_dev_rtnl_link_unregister();
2419error_action_fifos_exit:
2420        action_fifos_exit();
2421error:
2422        return err;
2423}
2424
2425static void dp_cleanup(void)
2426{
2427        dp_unregister_genl(ARRAY_SIZE(dp_genl_families));
2428        ovs_netdev_exit();
2429        unregister_netdevice_notifier(&ovs_dp_device_notifier);
2430        unregister_pernet_device(&ovs_net_ops);
2431        rcu_barrier();
2432        ovs_vport_exit();
2433        ovs_flow_exit();
2434        ovs_internal_dev_rtnl_link_unregister();
2435        action_fifos_exit();
2436}
2437
2438module_init(dp_init);
2439module_exit(dp_cleanup);
2440
2441MODULE_DESCRIPTION("Open vSwitch switching datapath");
2442MODULE_LICENSE("GPL");
2443MODULE_ALIAS_GENL_FAMILY(OVS_DATAPATH_FAMILY);
2444MODULE_ALIAS_GENL_FAMILY(OVS_VPORT_FAMILY);
2445MODULE_ALIAS_GENL_FAMILY(OVS_FLOW_FAMILY);
2446MODULE_ALIAS_GENL_FAMILY(OVS_PACKET_FAMILY);
2447