linux/net/openvswitch/datapath.c
/*
 * Copyright (c) 2007-2014 Nicira, Inc.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 * 02110-1301, USA
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/init.h>
#include <linux/module.h>
#include <linux/if_arp.h>
#include <linux/if_vlan.h>
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/jhash.h>
#include <linux/delay.h>
#include <linux/time.h>
#include <linux/etherdevice.h>
#include <linux/genetlink.h>
#include <linux/kernel.h>
#include <linux/kthread.h>
#include <linux/mutex.h>
#include <linux/percpu.h>
#include <linux/rcupdate.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/ethtool.h>
#include <linux/wait.h>
#include <asm/div64.h>
#include <linux/highmem.h>
#include <linux/netfilter_bridge.h>
#include <linux/netfilter_ipv4.h>
#include <linux/inetdevice.h>
#include <linux/list.h>
#include <linux/openvswitch.h>
#include <linux/rculist.h>
#include <linux/dmi.h>
#include <net/genetlink.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>

#include "datapath.h"
#include "flow.h"
#include "flow_table.h"
#include "flow_netlink.h"
#include "vport-internal_dev.h"
#include "vport-netdev.h"

int ovs_net_id __read_mostly;
EXPORT_SYMBOL_GPL(ovs_net_id);

static struct genl_family dp_packet_genl_family;
static struct genl_family dp_flow_genl_family;
static struct genl_family dp_datapath_genl_family;

static const struct nla_policy flow_policy[];

static const struct genl_multicast_group ovs_dp_flow_multicast_group = {
        .name = OVS_FLOW_MCGROUP,
};

static const struct genl_multicast_group ovs_dp_datapath_multicast_group = {
        .name = OVS_DATAPATH_MCGROUP,
};

static const struct genl_multicast_group ovs_dp_vport_multicast_group = {
        .name = OVS_VPORT_MCGROUP,
};

/* Check if we need to build a reply message.
 * OVS userspace sets the NLM_F_ECHO flag if it needs the reply. */
static bool ovs_must_notify(struct genl_family *family, struct genl_info *info,
                            unsigned int group)
{
        return info->nlhdr->nlmsg_flags & NLM_F_ECHO ||
               genl_has_listeners(family, genl_info_net(info), group);
}

static void ovs_notify(struct genl_family *family,
                       struct sk_buff *skb, struct genl_info *info)
{
        genl_notify(family, skb, info, 0, GFP_KERNEL);
}

/**
 * DOC: Locking:
 *
 * Writes to device state (add/remove datapath, port, set operations on
 * vports, etc.) and writes to other state (flow table modifications, set
 * miscellaneous datapath parameters, etc.) are protected by ovs_lock.
 *
 * Reads are protected by RCU.
 *
 * There are a few special cases (mostly stats) that have their own
 * synchronization but they nest under all of the above and don't interact
 * with each other.
 *
 * The RTNL lock nests inside ovs_mutex.
 */
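
/* A minimal sketch of the rules above (illustrative only, not a real call
 * site):
 *
 *	Writer:				Reader:
 *	ovs_lock();			rcu_read_lock();
 *	dp = get_dp(net, ifindex);	dp = get_dp_rcu(net, ifindex);
 *	...modify datapath state...	...read-only use of dp...
 *	ovs_unlock();			rcu_read_unlock();
 */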

static DEFINE_MUTEX(ovs_mutex);

void ovs_lock(void)
{
        mutex_lock(&ovs_mutex);
}

void ovs_unlock(void)
{
        mutex_unlock(&ovs_mutex);
}

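/* With lockdep compiled in but disabled at runtime (!debug_locks), report
 * the lock as held so that ASSERT_OVSL() and other lockdep_ovsl_is_held()
 * based annotations do not produce false positives.
 */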
#ifdef CONFIG_LOCKDEP
int lockdep_ovsl_is_held(void)
{
        if (debug_locks)
                return lockdep_is_held(&ovs_mutex);
        else
                return 1;
}
EXPORT_SYMBOL_GPL(lockdep_ovsl_is_held);
#endif

static struct vport *new_vport(const struct vport_parms *);
static int queue_gso_packets(struct datapath *dp, struct sk_buff *,
                             const struct sw_flow_key *,
                             const struct dp_upcall_info *);
static int queue_userspace_packet(struct datapath *dp, struct sk_buff *,
                                  const struct sw_flow_key *,
                                  const struct dp_upcall_info *);

/* Must be called with rcu_read_lock. */
static struct datapath *get_dp_rcu(struct net *net, int dp_ifindex)
{
        struct net_device *dev = dev_get_by_index_rcu(net, dp_ifindex);

        if (dev) {
                struct vport *vport = ovs_internal_dev_get_vport(dev);

                if (vport)
                        return vport->dp;
        }

        return NULL;
}

/* The caller must hold either ovs_mutex or rcu_read_lock to keep the
 * returned dp pointer valid.
 */
static inline struct datapath *get_dp(struct net *net, int dp_ifindex)
{
        struct datapath *dp;

        WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_ovsl_is_held());
        rcu_read_lock();
        dp = get_dp_rcu(net, dp_ifindex);
        rcu_read_unlock();

        return dp;
}

/* Must be called with rcu_read_lock or ovs_mutex. */
const char *ovs_dp_name(const struct datapath *dp)
{
        struct vport *vport = ovs_vport_ovsl_rcu(dp, OVSP_LOCAL);

        return ovs_vport_name(vport);
}

static int get_dpifindex(const struct datapath *dp)
{
        struct vport *local;
        int ifindex;

        rcu_read_lock();

        local = ovs_vport_rcu(dp, OVSP_LOCAL);
        if (local)
                ifindex = local->dev->ifindex;
        else
                ifindex = 0;

        rcu_read_unlock();

        return ifindex;
}

static void destroy_dp_rcu(struct rcu_head *rcu)
{
        struct datapath *dp = container_of(rcu, struct datapath, rcu);

        ovs_flow_tbl_destroy(&dp->table);
        free_percpu(dp->stats_percpu);
        kfree(dp->ports);
        kfree(dp);
}

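/* Map a port number to its hash chain.  DP_VPORT_HASH_BUCKETS is assumed
 * to be a power of two, so the mask below is a cheap modulo.
 */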
static struct hlist_head *vport_hash_bucket(const struct datapath *dp,
                                            u16 port_no)
{
        return &dp->ports[port_no & (DP_VPORT_HASH_BUCKETS - 1)];
}

/* Called with ovs_mutex or RCU read lock. */
struct vport *ovs_lookup_vport(const struct datapath *dp, u16 port_no)
{
        struct vport *vport;
        struct hlist_head *head;

        head = vport_hash_bucket(dp, port_no);
        hlist_for_each_entry_rcu(vport, head, dp_hash_node) {
                if (vport->port_no == port_no)
                        return vport;
        }
        return NULL;
}

/* Called with ovs_mutex. */
static struct vport *new_vport(const struct vport_parms *parms)
{
        struct vport *vport;

        vport = ovs_vport_add(parms);
        if (!IS_ERR(vport)) {
                struct datapath *dp = parms->dp;
                struct hlist_head *head = vport_hash_bucket(dp, vport->port_no);

                hlist_add_head_rcu(&vport->dp_hash_node, head);
        }
        return vport;
}

void ovs_dp_detach_port(struct vport *p)
{
        ASSERT_OVSL();

        /* First drop references to device. */
        hlist_del_rcu(&p->dp_hash_node);

        /* Then destroy it. */
        ovs_vport_del(p);
}

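/* Fast path for received packets: look the flow up in the datapath's flow
 * table, send a miss upcall to userspace if there is no match, otherwise
 * execute the flow's actions.  Per-CPU hit/miss counters are updated under
 * the u64_stats seqcount so readers see consistent 64-bit values.
 */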
/* Must be called with rcu_read_lock. */
void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key)
{
        const struct vport *p = OVS_CB(skb)->input_vport;
        struct datapath *dp = p->dp;
        struct sw_flow *flow;
        struct sw_flow_actions *sf_acts;
        struct dp_stats_percpu *stats;
        u64 *stats_counter;
        u32 n_mask_hit;

        stats = this_cpu_ptr(dp->stats_percpu);

        /* Look up flow. */
        flow = ovs_flow_tbl_lookup_stats(&dp->table, key, &n_mask_hit);
        if (unlikely(!flow)) {
                struct dp_upcall_info upcall;
                int error;

                memset(&upcall, 0, sizeof(upcall));
                upcall.cmd = OVS_PACKET_CMD_MISS;
                upcall.portid = ovs_vport_find_upcall_portid(p, skb);
                upcall.mru = OVS_CB(skb)->mru;
                error = ovs_dp_upcall(dp, skb, key, &upcall);
                if (unlikely(error))
                        kfree_skb(skb);
                else
                        consume_skb(skb);
                stats_counter = &stats->n_missed;
                goto out;
        }

        ovs_flow_stats_update(flow, key->tp.flags, skb);
        sf_acts = rcu_dereference(flow->sf_acts);
        ovs_execute_actions(dp, skb, sf_acts, key);

        stats_counter = &stats->n_hit;

out:
        /* Update datapath statistics. */
        u64_stats_update_begin(&stats->syncp);
        (*stats_counter)++;
        stats->n_mask_hit += n_mask_hit;
        u64_stats_update_end(&stats->syncp);
}

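/* Hand a packet to userspace over the Netlink socket identified by
 * upcall_info->portid.  A portid of 0 means no userspace listener is
 * attached, so the packet is dropped and accounted in n_lost.
 */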
int ovs_dp_upcall(struct datapath *dp, struct sk_buff *skb,
                  const struct sw_flow_key *key,
                  const struct dp_upcall_info *upcall_info)
{
        struct dp_stats_percpu *stats;
        int err;

        if (upcall_info->portid == 0) {
                err = -ENOTCONN;
                goto err;
        }

        if (!skb_is_gso(skb))
                err = queue_userspace_packet(dp, skb, key, upcall_info);
        else
                err = queue_gso_packets(dp, skb, key, upcall_info);
        if (err)
                goto err;

        return 0;

err:
        stats = this_cpu_ptr(dp->stats_percpu);

        u64_stats_update_begin(&stats->syncp);
        stats->n_lost++;
        u64_stats_update_end(&stats->syncp);

        return err;
}

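/* Segment a GSO packet and queue each resulting segment to userspace as a
 * separate upcall.  For UDP fragments the key extracted from the original
 * packet describes the first fragment, so later segments are re-marked
 * with OVS_FRAG_TYPE_LATER before queueing.
 */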
static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb,
                             const struct sw_flow_key *key,
                             const struct dp_upcall_info *upcall_info)
{
        unsigned short gso_type = skb_shinfo(skb)->gso_type;
        struct sw_flow_key later_key;
        struct sk_buff *segs, *nskb;
        int err;

        BUILD_BUG_ON(sizeof(*OVS_CB(skb)) > SKB_SGO_CB_OFFSET);
        segs = __skb_gso_segment(skb, NETIF_F_SG, false);
        if (IS_ERR(segs))
                return PTR_ERR(segs);
        if (segs == NULL)
                return -EINVAL;

        if (gso_type & SKB_GSO_UDP) {
                /* The initial flow key extracted by ovs_flow_key_extract()
                 * in this case is for the first fragment, so we need to
                 * properly mark later fragments.
                 */
                later_key = *key;
                later_key.ip.frag = OVS_FRAG_TYPE_LATER;
        }

        /* Queue all of the segments. */
        skb = segs;
        do {
                if (gso_type & SKB_GSO_UDP && skb != segs)
                        key = &later_key;

                err = queue_userspace_packet(dp, skb, key, upcall_info);
                if (err)
                        break;

        } while ((skb = skb->next));

        /* Free all of the segments. */
        skb = segs;
        do {
                nskb = skb->next;
                if (err)
                        kfree_skb(skb);
                else
                        consume_skb(skb);
        } while ((skb = nskb));
        return err;
}

static size_t upcall_msg_size(const struct dp_upcall_info *upcall_info,
                              unsigned int hdrlen)
{
        size_t size = NLMSG_ALIGN(sizeof(struct ovs_header))
                + nla_total_size(hdrlen) /* OVS_PACKET_ATTR_PACKET */
                + nla_total_size(ovs_key_attr_size()); /* OVS_PACKET_ATTR_KEY */

        /* OVS_PACKET_ATTR_USERDATA */
        if (upcall_info->userdata)
                size += NLA_ALIGN(upcall_info->userdata->nla_len);

        /* OVS_PACKET_ATTR_EGRESS_TUN_KEY */
        if (upcall_info->egress_tun_info)
                size += nla_total_size(ovs_tun_key_attr_size());

        /* OVS_PACKET_ATTR_ACTIONS */
        if (upcall_info->actions_len)
                size += nla_total_size(upcall_info->actions_len);

        /* OVS_PACKET_ATTR_MRU */
        if (upcall_info->mru)
                size += nla_total_size(sizeof(upcall_info->mru));

        return size;
}

static void pad_packet(struct datapath *dp, struct sk_buff *skb)
{
        if (!(dp->user_features & OVS_DP_F_UNALIGNED)) {
                size_t plen = NLA_ALIGN(skb->len) - skb->len;

                if (plen > 0)
                        memset(skb_put(skb, plen), 0, plen);
        }
}

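/* Build and unicast one OVS_PACKET_CMD_* message to userspace.  Roughly:
 * push any hardware-accelerated VLAN tag back into the frame, complete a
 * pending partial checksum, size the message (zerocopy of the packet data
 * is only attempted when userspace advertised OVS_DP_F_UNALIGNED), then
 * emit the key, userdata, tunnel info, actions, MRU and finally the packet
 * attribute itself.
 */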
static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb,
                                  const struct sw_flow_key *key,
                                  const struct dp_upcall_info *upcall_info)
{
        struct ovs_header *upcall;
        struct sk_buff *nskb = NULL;
        struct sk_buff *user_skb = NULL; /* to be queued to userspace */
        struct nlattr *nla;
        size_t len;
        unsigned int hlen;
        int err, dp_ifindex;

        dp_ifindex = get_dpifindex(dp);
        if (!dp_ifindex)
                return -ENODEV;

        if (skb_vlan_tag_present(skb)) {
                nskb = skb_clone(skb, GFP_ATOMIC);
                if (!nskb)
                        return -ENOMEM;

                nskb = __vlan_hwaccel_push_inside(nskb);
                if (!nskb)
                        return -ENOMEM;

                skb = nskb;
        }

        if (nla_attr_size(skb->len) > USHRT_MAX) {
                err = -EFBIG;
                goto out;
        }

        /* Complete checksum if needed */
        if (skb->ip_summed == CHECKSUM_PARTIAL &&
            (err = skb_checksum_help(skb)))
                goto out;

        /* Older versions of OVS user space enforce alignment of the last
         * Netlink attribute to NLA_ALIGNTO which would require extensive
         * padding logic. Only perform zerocopy if padding is not required.
         */
        if (dp->user_features & OVS_DP_F_UNALIGNED)
                hlen = skb_zerocopy_headlen(skb);
        else
                hlen = skb->len;

        len = upcall_msg_size(upcall_info, hlen);
        user_skb = genlmsg_new(len, GFP_ATOMIC);
        if (!user_skb) {
                err = -ENOMEM;
                goto out;
        }

        upcall = genlmsg_put(user_skb, 0, 0, &dp_packet_genl_family,
                             0, upcall_info->cmd);
        upcall->dp_ifindex = dp_ifindex;

        err = ovs_nla_put_key(key, key, OVS_PACKET_ATTR_KEY, false, user_skb);
        BUG_ON(err);

        if (upcall_info->userdata)
                __nla_put(user_skb, OVS_PACKET_ATTR_USERDATA,
                          nla_len(upcall_info->userdata),
                          nla_data(upcall_info->userdata));

        if (upcall_info->egress_tun_info) {
                nla = nla_nest_start(user_skb, OVS_PACKET_ATTR_EGRESS_TUN_KEY);
                err = ovs_nla_put_tunnel_info(user_skb,
                                              upcall_info->egress_tun_info);
                BUG_ON(err);
                nla_nest_end(user_skb, nla);
        }

        if (upcall_info->actions_len) {
                nla = nla_nest_start(user_skb, OVS_PACKET_ATTR_ACTIONS);
                err = ovs_nla_put_actions(upcall_info->actions,
                                          upcall_info->actions_len,
                                          user_skb);
                if (!err)
                        nla_nest_end(user_skb, nla);
                else
                        nla_nest_cancel(user_skb, nla);
        }

        /* Add OVS_PACKET_ATTR_MRU */
        if (upcall_info->mru) {
                if (nla_put_u16(user_skb, OVS_PACKET_ATTR_MRU,
                                upcall_info->mru)) {
                        err = -ENOBUFS;
                        goto out;
                }
                pad_packet(dp, user_skb);
        }

        /* Only reserve room for the attribute header; the packet data is
         * added in skb_zerocopy().
         */
        if (!(nla = nla_reserve(user_skb, OVS_PACKET_ATTR_PACKET, 0))) {
                err = -ENOBUFS;
                goto out;
        }
        nla->nla_len = nla_attr_size(skb->len);

        err = skb_zerocopy(user_skb, skb, skb->len, hlen);
        if (err)
                goto out;

        /* Pad OVS_PACKET_ATTR_PACKET if linear copy was performed */
        pad_packet(dp, user_skb);

        ((struct nlmsghdr *) user_skb->data)->nlmsg_len = user_skb->len;

        err = genlmsg_unicast(ovs_dp_get_net(dp), user_skb, upcall_info->portid);
        user_skb = NULL;
out:
        if (err)
                skb_tx_error(skb);
        kfree_skb(user_skb);
        kfree_skb(nskb);
        return err;
}

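/* OVS_PACKET_CMD_EXECUTE handler: userspace supplies a packet, a flow key
 * and a list of actions; the packet is reconstructed as an skb, a
 * temporary sw_flow is built around the key and actions, and the packet is
 * then injected through the requested input vport as if it had been
 * received there.
 */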
static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
{
        struct ovs_header *ovs_header = info->userhdr;
        struct net *net = sock_net(skb->sk);
        struct nlattr **a = info->attrs;
        struct sw_flow_actions *acts;
        struct sk_buff *packet;
        struct sw_flow *flow;
        struct sw_flow_actions *sf_acts;
        struct datapath *dp;
        struct ethhdr *eth;
        struct vport *input_vport;
        u16 mru = 0;
        int len;
        int err;
        bool log = !a[OVS_PACKET_ATTR_PROBE];

        err = -EINVAL;
        if (!a[OVS_PACKET_ATTR_PACKET] || !a[OVS_PACKET_ATTR_KEY] ||
            !a[OVS_PACKET_ATTR_ACTIONS])
                goto err;

        len = nla_len(a[OVS_PACKET_ATTR_PACKET]);
        packet = __dev_alloc_skb(NET_IP_ALIGN + len, GFP_KERNEL);
        err = -ENOMEM;
        if (!packet)
                goto err;
        skb_reserve(packet, NET_IP_ALIGN);

        nla_memcpy(__skb_put(packet, len), a[OVS_PACKET_ATTR_PACKET], len);

        skb_reset_mac_header(packet);
        eth = eth_hdr(packet);

        /* Normally, setting the skb 'protocol' field would be handled by a
         * call to eth_type_trans(), but it assumes there's a sending
         * device, which we may not have. */
        if (eth_proto_is_802_3(eth->h_proto))
                packet->protocol = eth->h_proto;
        else
                packet->protocol = htons(ETH_P_802_2);

        /* Set packet's mru */
        if (a[OVS_PACKET_ATTR_MRU]) {
                mru = nla_get_u16(a[OVS_PACKET_ATTR_MRU]);
                packet->ignore_df = 1;
        }
        OVS_CB(packet)->mru = mru;

        /* Build an sw_flow for sending this packet. */
        flow = ovs_flow_alloc();
        err = PTR_ERR(flow);
        if (IS_ERR(flow))
                goto err_kfree_skb;

        err = ovs_flow_key_extract_userspace(net, a[OVS_PACKET_ATTR_KEY],
                                             packet, &flow->key, log);
        if (err)
                goto err_flow_free;

        err = ovs_nla_copy_actions(net, a[OVS_PACKET_ATTR_ACTIONS],
                                   &flow->key, &acts, log);
        if (err)
                goto err_flow_free;

        rcu_assign_pointer(flow->sf_acts, acts);
        packet->priority = flow->key.phy.priority;
        packet->mark = flow->key.phy.skb_mark;

        rcu_read_lock();
        dp = get_dp_rcu(net, ovs_header->dp_ifindex);
        err = -ENODEV;
        if (!dp)
                goto err_unlock;

        input_vport = ovs_vport_rcu(dp, flow->key.phy.in_port);
        if (!input_vport)
                input_vport = ovs_vport_rcu(dp, OVSP_LOCAL);

        if (!input_vport)
                goto err_unlock;

        packet->dev = input_vport->dev;
        OVS_CB(packet)->input_vport = input_vport;
        sf_acts = rcu_dereference(flow->sf_acts);

        local_bh_disable();
        err = ovs_execute_actions(dp, packet, sf_acts, &flow->key);
        local_bh_enable();
        rcu_read_unlock();

        ovs_flow_free(flow, false);
        return err;

err_unlock:
        rcu_read_unlock();
err_flow_free:
        ovs_flow_free(flow, false);
err_kfree_skb:
        kfree_skb(packet);
err:
        return err;
}

static const struct nla_policy packet_policy[OVS_PACKET_ATTR_MAX + 1] = {
        [OVS_PACKET_ATTR_PACKET] = { .len = ETH_HLEN },
        [OVS_PACKET_ATTR_KEY] = { .type = NLA_NESTED },
        [OVS_PACKET_ATTR_ACTIONS] = { .type = NLA_NESTED },
        [OVS_PACKET_ATTR_PROBE] = { .type = NLA_FLAG },
        [OVS_PACKET_ATTR_MRU] = { .type = NLA_U16 },
};

static const struct genl_ops dp_packet_genl_ops[] = {
        { .cmd = OVS_PACKET_CMD_EXECUTE,
          .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
          .policy = packet_policy,
          .doit = ovs_packet_cmd_execute
        }
};

static struct genl_family dp_packet_genl_family = {
        .id = GENL_ID_GENERATE,
        .hdrsize = sizeof(struct ovs_header),
        .name = OVS_PACKET_FAMILY,
        .version = OVS_PACKET_VERSION,
        .maxattr = OVS_PACKET_ATTR_MAX,
        .netnsok = true,
        .parallel_ops = true,
        .ops = dp_packet_genl_ops,
        .n_ops = ARRAY_SIZE(dp_packet_genl_ops),
};

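/* Sum the per-CPU datapath counters into one snapshot.  Each CPU's block
 * is copied under its u64_stats seqcount and retried on a concurrent
 * writer, which keeps 64-bit counters consistent on 32-bit hosts.
 */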
static void get_dp_stats(const struct datapath *dp, struct ovs_dp_stats *stats,
                         struct ovs_dp_megaflow_stats *mega_stats)
{
        int i;

        memset(mega_stats, 0, sizeof(*mega_stats));

        stats->n_flows = ovs_flow_tbl_count(&dp->table);
        mega_stats->n_masks = ovs_flow_tbl_num_masks(&dp->table);

        stats->n_hit = stats->n_missed = stats->n_lost = 0;

        for_each_possible_cpu(i) {
                const struct dp_stats_percpu *percpu_stats;
                struct dp_stats_percpu local_stats;
                unsigned int start;

                percpu_stats = per_cpu_ptr(dp->stats_percpu, i);

                do {
                        start = u64_stats_fetch_begin_irq(&percpu_stats->syncp);
                        local_stats = *percpu_stats;
                } while (u64_stats_fetch_retry_irq(&percpu_stats->syncp, start));

                stats->n_hit += local_stats.n_hit;
                stats->n_missed += local_stats.n_missed;
                stats->n_lost += local_stats.n_lost;
                mega_stats->n_mask_hit += local_stats.n_mask_hit;
        }
}

static bool should_fill_key(const struct sw_flow_id *sfid, uint32_t ufid_flags)
{
        return ovs_identifier_is_ufid(sfid) &&
               !(ufid_flags & OVS_UFID_F_OMIT_KEY);
}

static bool should_fill_mask(uint32_t ufid_flags)
{
        return !(ufid_flags & OVS_UFID_F_OMIT_MASK);
}

static bool should_fill_actions(uint32_t ufid_flags)
{
        return !(ufid_flags & OVS_UFID_F_OMIT_ACTIONS);
}

static size_t ovs_flow_cmd_msg_size(const struct sw_flow_actions *acts,
                                    const struct sw_flow_id *sfid,
                                    uint32_t ufid_flags)
{
        size_t len = NLMSG_ALIGN(sizeof(struct ovs_header));

        /* OVS_FLOW_ATTR_UFID */
        if (sfid && ovs_identifier_is_ufid(sfid))
                len += nla_total_size(sfid->ufid_len);

        /* OVS_FLOW_ATTR_KEY */
        if (!sfid || should_fill_key(sfid, ufid_flags))
                len += nla_total_size(ovs_key_attr_size());

        /* OVS_FLOW_ATTR_MASK */
        if (should_fill_mask(ufid_flags))
                len += nla_total_size(ovs_key_attr_size());

        /* OVS_FLOW_ATTR_ACTIONS */
        if (should_fill_actions(ufid_flags))
                len += nla_total_size(acts->orig_len);

        return len
                + nla_total_size(sizeof(struct ovs_flow_stats)) /* OVS_FLOW_ATTR_STATS */
                + nla_total_size(1) /* OVS_FLOW_ATTR_TCP_FLAGS */
                + nla_total_size(8); /* OVS_FLOW_ATTR_USED */
}

/* Called with ovs_mutex or RCU read lock. */
static int ovs_flow_cmd_fill_stats(const struct sw_flow *flow,
                                   struct sk_buff *skb)
{
        struct ovs_flow_stats stats;
        __be16 tcp_flags;
        unsigned long used;

        ovs_flow_stats_get(flow, &stats, &used, &tcp_flags);

        if (used &&
            nla_put_u64(skb, OVS_FLOW_ATTR_USED, ovs_flow_used_time(used)))
                return -EMSGSIZE;

        if (stats.n_packets &&
            nla_put(skb, OVS_FLOW_ATTR_STATS, sizeof(struct ovs_flow_stats), &stats))
                return -EMSGSIZE;

        if ((u8)ntohs(tcp_flags) &&
            nla_put_u8(skb, OVS_FLOW_ATTR_TCP_FLAGS, (u8)ntohs(tcp_flags)))
                return -EMSGSIZE;

        return 0;
}

/* Called with ovs_mutex or RCU read lock. */
static int ovs_flow_cmd_fill_actions(const struct sw_flow *flow,
                                     struct sk_buff *skb, int skb_orig_len)
{
        struct nlattr *start;
        int err;

        /* If OVS_FLOW_ATTR_ACTIONS doesn't fit, skip dumping the actions if
         * this is the first flow to be dumped into 'skb'.  This is unusual for
         * Netlink but individual action lists can be longer than
         * NLMSG_GOODSIZE and thus entirely undumpable if we didn't do this.
         * The userspace caller can always fetch the actions separately if it
         * really wants them.  (Most userspace callers in fact don't care.)
         *
         * This can only fail for dump operations because the skb is always
         * properly sized for single flows.
         */
        start = nla_nest_start(skb, OVS_FLOW_ATTR_ACTIONS);
        if (start) {
                const struct sw_flow_actions *sf_acts;

                sf_acts = rcu_dereference_ovsl(flow->sf_acts);
                err = ovs_nla_put_actions(sf_acts->actions,
                                          sf_acts->actions_len, skb);

                if (!err)
                        nla_nest_end(skb, start);
                else {
                        if (skb_orig_len)
                                return err;

                        nla_nest_cancel(skb, start);
                }
        } else if (skb_orig_len) {
                return -EMSGSIZE;
        }

        return 0;
}

/* Called with ovs_mutex or RCU read lock. */
static int ovs_flow_cmd_fill_info(const struct sw_flow *flow, int dp_ifindex,
                                  struct sk_buff *skb, u32 portid,
                                  u32 seq, u32 flags, u8 cmd, u32 ufid_flags)
{
        const int skb_orig_len = skb->len;
        struct ovs_header *ovs_header;
        int err;

        ovs_header = genlmsg_put(skb, portid, seq, &dp_flow_genl_family,
                                 flags, cmd);
        if (!ovs_header)
                return -EMSGSIZE;

        ovs_header->dp_ifindex = dp_ifindex;

        err = ovs_nla_put_identifier(flow, skb);
        if (err)
                goto error;

        if (should_fill_key(&flow->id, ufid_flags)) {
                err = ovs_nla_put_masked_key(flow, skb);
                if (err)
                        goto error;
        }

        if (should_fill_mask(ufid_flags)) {
                err = ovs_nla_put_mask(flow, skb);
                if (err)
                        goto error;
        }

        err = ovs_flow_cmd_fill_stats(flow, skb);
        if (err)
                goto error;

        if (should_fill_actions(ufid_flags)) {
                err = ovs_flow_cmd_fill_actions(flow, skb, skb_orig_len);
                if (err)
                        goto error;
        }

        genlmsg_end(skb, ovs_header);
        return 0;

error:
        genlmsg_cancel(skb, ovs_header);
        return err;
}

/* May not be called with RCU read lock. */
static struct sk_buff *ovs_flow_cmd_alloc_info(const struct sw_flow_actions *acts,
                                               const struct sw_flow_id *sfid,
                                               struct genl_info *info,
                                               bool always,
                                               uint32_t ufid_flags)
{
        struct sk_buff *skb;
        size_t len;

        if (!always && !ovs_must_notify(&dp_flow_genl_family, info, 0))
                return NULL;

        len = ovs_flow_cmd_msg_size(acts, sfid, ufid_flags);
        skb = genlmsg_new(len, GFP_KERNEL);
        if (!skb)
                return ERR_PTR(-ENOMEM);

        return skb;
}

/* Called with ovs_mutex. */
static struct sk_buff *ovs_flow_cmd_build_info(const struct sw_flow *flow,
                                               int dp_ifindex,
                                               struct genl_info *info, u8 cmd,
                                               bool always, u32 ufid_flags)
{
        struct sk_buff *skb;
        int retval;

        skb = ovs_flow_cmd_alloc_info(ovsl_dereference(flow->sf_acts),
                                      &flow->id, info, always, ufid_flags);
        if (IS_ERR_OR_NULL(skb))
                return skb;

        retval = ovs_flow_cmd_fill_info(flow, dp_ifindex, skb,
                                        info->snd_portid, info->snd_seq, 0,
                                        cmd, ufid_flags);
        BUG_ON(retval < 0);
        return skb;
}

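/* OVS_FLOW_CMD_NEW handler.  The flow, its actions and (optionally) the
 * reply skb are all allocated before taking ovs_lock so the locked section
 * stays short.  If an equivalent flow already exists, this degrades into a
 * set-actions operation unless NLM_F_CREATE or NLM_F_EXCL was given (see
 * the comment below on why NLM_F_CREATE is treated like NLM_F_EXCL).
 */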
static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info)
{
        struct net *net = sock_net(skb->sk);
        struct nlattr **a = info->attrs;
        struct ovs_header *ovs_header = info->userhdr;
        struct sw_flow *flow = NULL, *new_flow;
        struct sw_flow_mask mask;
        struct sk_buff *reply;
        struct datapath *dp;
        struct sw_flow_key key;
        struct sw_flow_actions *acts;
        struct sw_flow_match match;
        u32 ufid_flags = ovs_nla_get_ufid_flags(a[OVS_FLOW_ATTR_UFID_FLAGS]);
        int error;
        bool log = !a[OVS_FLOW_ATTR_PROBE];

        /* Must have key and actions. */
        error = -EINVAL;
        if (!a[OVS_FLOW_ATTR_KEY]) {
                OVS_NLERR(log, "Flow key attr not present in new flow.");
                goto error;
        }
        if (!a[OVS_FLOW_ATTR_ACTIONS]) {
                OVS_NLERR(log, "Flow actions attr not present in new flow.");
                goto error;
        }

        /* Most of the time we need to allocate a new flow, so do it before
         * locking.
         */
        new_flow = ovs_flow_alloc();
        if (IS_ERR(new_flow)) {
                error = PTR_ERR(new_flow);
                goto error;
        }

        /* Extract key. */
        ovs_match_init(&match, &key, &mask);
        error = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY],
                                  a[OVS_FLOW_ATTR_MASK], log);
        if (error)
                goto err_kfree_flow;

        ovs_flow_mask_key(&new_flow->key, &key, true, &mask);

        /* Extract flow identifier. */
        error = ovs_nla_get_identifier(&new_flow->id, a[OVS_FLOW_ATTR_UFID],
                                       &key, log);
        if (error)
                goto err_kfree_flow;

        /* Validate actions. */
        error = ovs_nla_copy_actions(net, a[OVS_FLOW_ATTR_ACTIONS],
                                     &new_flow->key, &acts, log);
        if (error) {
                OVS_NLERR(log, "Flow actions may not be safe on all matching packets.");
                goto err_kfree_flow;
        }

        reply = ovs_flow_cmd_alloc_info(acts, &new_flow->id, info, false,
                                        ufid_flags);
        if (IS_ERR(reply)) {
                error = PTR_ERR(reply);
                goto err_kfree_acts;
        }

        ovs_lock();
        dp = get_dp(net, ovs_header->dp_ifindex);
        if (unlikely(!dp)) {
                error = -ENODEV;
                goto err_unlock_ovs;
        }

        /* Check if this is a duplicate flow */
        if (ovs_identifier_is_ufid(&new_flow->id))
                flow = ovs_flow_tbl_lookup_ufid(&dp->table, &new_flow->id);
        if (!flow)
                flow = ovs_flow_tbl_lookup(&dp->table, &key);
        if (likely(!flow)) {
                rcu_assign_pointer(new_flow->sf_acts, acts);

                /* Put flow in bucket. */
                error = ovs_flow_tbl_insert(&dp->table, new_flow, &mask);
                if (unlikely(error)) {
                        acts = NULL;
                        goto err_unlock_ovs;
                }

                if (unlikely(reply)) {
                        error = ovs_flow_cmd_fill_info(new_flow,
                                                       ovs_header->dp_ifindex,
                                                       reply, info->snd_portid,
                                                       info->snd_seq, 0,
                                                       OVS_FLOW_CMD_NEW,
                                                       ufid_flags);
                        BUG_ON(error < 0);
                }
                ovs_unlock();
        } else {
                struct sw_flow_actions *old_acts;

                /* Bail out if we're not allowed to modify an existing flow.
                 * We accept NLM_F_CREATE in place of the intended NLM_F_EXCL
                 * because Generic Netlink treats the latter as a dump
                 * request.  We also accept NLM_F_EXCL in case that bug ever
                 * gets fixed.
                 */
                if (unlikely(info->nlhdr->nlmsg_flags & (NLM_F_CREATE
                                                         | NLM_F_EXCL))) {
                        error = -EEXIST;
                        goto err_unlock_ovs;
                }
                /* The flow identifier has to be the same for flow updates.
                 * Look for any overlapping flow.
                 */
                if (unlikely(!ovs_flow_cmp(flow, &match))) {
                        if (ovs_identifier_is_key(&flow->id))
                                flow = ovs_flow_tbl_lookup_exact(&dp->table,
                                                                 &match);
                        else /* UFID matches but key is different */
                                flow = NULL;
                        if (!flow) {
                                error = -ENOENT;
                                goto err_unlock_ovs;
                        }
                }
                /* Update actions. */
                old_acts = ovsl_dereference(flow->sf_acts);
                rcu_assign_pointer(flow->sf_acts, acts);

                if (unlikely(reply)) {
                        error = ovs_flow_cmd_fill_info(flow,
                                                       ovs_header->dp_ifindex,
                                                       reply, info->snd_portid,
                                                       info->snd_seq, 0,
                                                       OVS_FLOW_CMD_NEW,
                                                       ufid_flags);
                        BUG_ON(error < 0);
                }
                ovs_unlock();

                ovs_nla_free_flow_actions_rcu(old_acts);
                ovs_flow_free(new_flow, false);
        }

        if (reply)
                ovs_notify(&dp_flow_genl_family, reply, info);
        return 0;

err_unlock_ovs:
        ovs_unlock();
        kfree_skb(reply);
err_kfree_acts:
        ovs_nla_free_flow_actions(acts);
err_kfree_flow:
        ovs_flow_free(new_flow, false);
error:
        return error;
}

/* Factor out action copy to avoid "Wframe-larger-than=1024" warning. */
static struct sw_flow_actions *get_flow_actions(struct net *net,
                                                const struct nlattr *a,
                                                const struct sw_flow_key *key,
                                                const struct sw_flow_mask *mask,
                                                bool log)
{
        struct sw_flow_actions *acts;
        struct sw_flow_key masked_key;
        int error;

        ovs_flow_mask_key(&masked_key, key, true, mask);
        error = ovs_nla_copy_actions(net, a, &masked_key, &acts, log);
        if (error) {
                OVS_NLERR(log,
                          "Actions may not be safe on all matching packets");
                return ERR_PTR(error);
        }

        return acts;
}

static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info)
{
        struct net *net = sock_net(skb->sk);
        struct nlattr **a = info->attrs;
        struct ovs_header *ovs_header = info->userhdr;
        struct sw_flow_key key;
        struct sw_flow *flow;
        struct sw_flow_mask mask;
        struct sk_buff *reply = NULL;
        struct datapath *dp;
        struct sw_flow_actions *old_acts = NULL, *acts = NULL;
        struct sw_flow_match match;
        struct sw_flow_id sfid;
        u32 ufid_flags = ovs_nla_get_ufid_flags(a[OVS_FLOW_ATTR_UFID_FLAGS]);
        int error = 0;
        bool log = !a[OVS_FLOW_ATTR_PROBE];
        bool ufid_present;

        ufid_present = ovs_nla_get_ufid(&sfid, a[OVS_FLOW_ATTR_UFID], log);
        if (a[OVS_FLOW_ATTR_KEY]) {
                ovs_match_init(&match, &key, &mask);
                error = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY],
                                          a[OVS_FLOW_ATTR_MASK], log);
        } else if (!ufid_present) {
                OVS_NLERR(log,
                          "Flow set message rejected, Key attribute missing.");
                error = -EINVAL;
        }
        if (error)
                goto error;

        /* Validate actions. */
        if (a[OVS_FLOW_ATTR_ACTIONS]) {
                if (!a[OVS_FLOW_ATTR_KEY]) {
                        OVS_NLERR(log,
                                  "Flow key attribute not present in set flow.");
                        error = -EINVAL;
                        goto error;
                }

                acts = get_flow_actions(net, a[OVS_FLOW_ATTR_ACTIONS], &key,
                                        &mask, log);
                if (IS_ERR(acts)) {
                        error = PTR_ERR(acts);
                        goto error;
                }

                /* Can allocate before locking if we have acts. */
                reply = ovs_flow_cmd_alloc_info(acts, &sfid, info, false,
                                                ufid_flags);
                if (IS_ERR(reply)) {
                        error = PTR_ERR(reply);
                        goto err_kfree_acts;
                }
        }

        ovs_lock();
        dp = get_dp(net, ovs_header->dp_ifindex);
        if (unlikely(!dp)) {
                error = -ENODEV;
                goto err_unlock_ovs;
        }
        /* Check that the flow exists. */
        if (ufid_present)
                flow = ovs_flow_tbl_lookup_ufid(&dp->table, &sfid);
        else
                flow = ovs_flow_tbl_lookup_exact(&dp->table, &match);
        if (unlikely(!flow)) {
                error = -ENOENT;
                goto err_unlock_ovs;
        }

        /* Update actions, if present. */
        if (likely(acts)) {
                old_acts = ovsl_dereference(flow->sf_acts);
                rcu_assign_pointer(flow->sf_acts, acts);

                if (unlikely(reply)) {
                        error = ovs_flow_cmd_fill_info(flow,
                                                       ovs_header->dp_ifindex,
                                                       reply, info->snd_portid,
                                                       info->snd_seq, 0,
                                                       OVS_FLOW_CMD_NEW,
                                                       ufid_flags);
                        BUG_ON(error < 0);
                }
        } else {
                /* Could not allocate a reply without acts before locking. */
                reply = ovs_flow_cmd_build_info(flow, ovs_header->dp_ifindex,
                                                info, OVS_FLOW_CMD_NEW, false,
                                                ufid_flags);

                if (IS_ERR(reply)) {
                        error = PTR_ERR(reply);
                        goto err_unlock_ovs;
                }
        }

        /* Clear stats. */
        if (a[OVS_FLOW_ATTR_CLEAR])
                ovs_flow_stats_clear(flow);
        ovs_unlock();

        if (reply)
                ovs_notify(&dp_flow_genl_family, reply, info);
        if (old_acts)
                ovs_nla_free_flow_actions_rcu(old_acts);

        return 0;

err_unlock_ovs:
        ovs_unlock();
        kfree_skb(reply);
err_kfree_acts:
        ovs_nla_free_flow_actions(acts);
error:
        return error;
}

static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info)
{
        struct nlattr **a = info->attrs;
        struct ovs_header *ovs_header = info->userhdr;
        struct net *net = sock_net(skb->sk);
        struct sw_flow_key key;
        struct sk_buff *reply;
        struct sw_flow *flow;
        struct datapath *dp;
        struct sw_flow_match match;
        struct sw_flow_id ufid;
        u32 ufid_flags = ovs_nla_get_ufid_flags(a[OVS_FLOW_ATTR_UFID_FLAGS]);
        int err = 0;
        bool log = !a[OVS_FLOW_ATTR_PROBE];
        bool ufid_present;

        ufid_present = ovs_nla_get_ufid(&ufid, a[OVS_FLOW_ATTR_UFID], log);
        if (a[OVS_FLOW_ATTR_KEY]) {
                ovs_match_init(&match, &key, NULL);
                err = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY], NULL,
                                        log);
        } else if (!ufid_present) {
                OVS_NLERR(log,
                          "Flow get message rejected, Key attribute missing.");
                err = -EINVAL;
        }
        if (err)
                return err;

        ovs_lock();
        dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex);
        if (!dp) {
                err = -ENODEV;
                goto unlock;
        }

        if (ufid_present)
                flow = ovs_flow_tbl_lookup_ufid(&dp->table, &ufid);
        else
                flow = ovs_flow_tbl_lookup_exact(&dp->table, &match);
        if (!flow) {
                err = -ENOENT;
                goto unlock;
        }

        reply = ovs_flow_cmd_build_info(flow, ovs_header->dp_ifindex, info,
                                        OVS_FLOW_CMD_NEW, true, ufid_flags);
        if (IS_ERR(reply)) {
                err = PTR_ERR(reply);
                goto unlock;
        }

        ovs_unlock();
        return genlmsg_reply(reply, info);
unlock:
        ovs_unlock();
        return err;
}

static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info)
{
        struct nlattr **a = info->attrs;
        struct ovs_header *ovs_header = info->userhdr;
        struct net *net = sock_net(skb->sk);
        struct sw_flow_key key;
        struct sk_buff *reply;
        struct sw_flow *flow = NULL;
        struct datapath *dp;
        struct sw_flow_match match;
        struct sw_flow_id ufid;
        u32 ufid_flags = ovs_nla_get_ufid_flags(a[OVS_FLOW_ATTR_UFID_FLAGS]);
        int err;
        bool log = !a[OVS_FLOW_ATTR_PROBE];
        bool ufid_present;

        ufid_present = ovs_nla_get_ufid(&ufid, a[OVS_FLOW_ATTR_UFID], log);
        if (a[OVS_FLOW_ATTR_KEY]) {
                ovs_match_init(&match, &key, NULL);
                err = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY],
                                        NULL, log);
                if (unlikely(err))
                        return err;
        }

        ovs_lock();
        dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex);
        if (unlikely(!dp)) {
                err = -ENODEV;
                goto unlock;
        }

        if (unlikely(!a[OVS_FLOW_ATTR_KEY] && !ufid_present)) {
                err = ovs_flow_tbl_flush(&dp->table);
                goto unlock;
        }

        if (ufid_present)
                flow = ovs_flow_tbl_lookup_ufid(&dp->table, &ufid);
        else
                flow = ovs_flow_tbl_lookup_exact(&dp->table, &match);
        if (unlikely(!flow)) {
                err = -ENOENT;
                goto unlock;
        }

        ovs_flow_tbl_remove(&dp->table, flow);
        ovs_unlock();

        reply = ovs_flow_cmd_alloc_info((const struct sw_flow_actions __force *)flow->sf_acts,
                                        &flow->id, info, false, ufid_flags);
        if (likely(reply)) {
                if (likely(!IS_ERR(reply))) {
                        rcu_read_lock();        /* To keep RCU checker happy. */
                        err = ovs_flow_cmd_fill_info(flow, ovs_header->dp_ifindex,
                                                     reply, info->snd_portid,
                                                     info->snd_seq, 0,
                                                     OVS_FLOW_CMD_DEL,
                                                     ufid_flags);
                        rcu_read_unlock();
                        BUG_ON(err < 0);

                        ovs_notify(&dp_flow_genl_family, reply, info);
                } else {
                        netlink_set_err(sock_net(skb->sk)->genl_sock, 0, 0,
                                        PTR_ERR(reply));
                }
        }

        ovs_flow_free(flow, true);
        return 0;
unlock:
        ovs_unlock();
        return err;
}

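/* Flow dump callback.  The iteration cursor (hash bucket and offset within
 * it) is kept in cb->args[0] and cb->args[1] between invocations, so a
 * multi-message dump resumes where the previous skb filled up.
 */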
1338static int ovs_flow_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb)
1339{
1340        struct nlattr *a[__OVS_FLOW_ATTR_MAX];
1341        struct ovs_header *ovs_header = genlmsg_data(nlmsg_data(cb->nlh));
1342        struct table_instance *ti;
1343        struct datapath *dp;
1344        u32 ufid_flags;
1345        int err;
1346
1347        err = genlmsg_parse(cb->nlh, &dp_flow_genl_family, a,
1348                            OVS_FLOW_ATTR_MAX, flow_policy);
1349        if (err)
1350                return err;
1351        ufid_flags = ovs_nla_get_ufid_flags(a[OVS_FLOW_ATTR_UFID_FLAGS]);
1352
1353        rcu_read_lock();
1354        dp = get_dp_rcu(sock_net(skb->sk), ovs_header->dp_ifindex);
1355        if (!dp) {
1356                rcu_read_unlock();
1357                return -ENODEV;
1358        }
1359
1360        ti = rcu_dereference(dp->table.ti);
1361        for (;;) {
1362                struct sw_flow *flow;
1363                u32 bucket, obj;
1364
1365                bucket = cb->args[0];
1366                obj = cb->args[1];
1367                flow = ovs_flow_tbl_dump_next(ti, &bucket, &obj);
1368                if (!flow)
1369                        break;
1370
1371                if (ovs_flow_cmd_fill_info(flow, ovs_header->dp_ifindex, skb,
1372                                           NETLINK_CB(cb->skb).portid,
1373                                           cb->nlh->nlmsg_seq, NLM_F_MULTI,
1374                                           OVS_FLOW_CMD_NEW, ufid_flags) < 0)
1375                        break;
1376
1377                cb->args[0] = bucket;
1378                cb->args[1] = obj;
1379        }
1380        rcu_read_unlock();
1381        return skb->len;
1382}
1383
1384static const struct nla_policy flow_policy[OVS_FLOW_ATTR_MAX + 1] = {
1385        [OVS_FLOW_ATTR_KEY] = { .type = NLA_NESTED },
1386        [OVS_FLOW_ATTR_MASK] = { .type = NLA_NESTED },
1387        [OVS_FLOW_ATTR_ACTIONS] = { .type = NLA_NESTED },
1388        [OVS_FLOW_ATTR_CLEAR] = { .type = NLA_FLAG },
1389        [OVS_FLOW_ATTR_PROBE] = { .type = NLA_FLAG },
1390        [OVS_FLOW_ATTR_UFID] = { .type = NLA_UNSPEC, .len = 1 },
1391        [OVS_FLOW_ATTR_UFID_FLAGS] = { .type = NLA_U32 },
1392};
1393
1394static const struct genl_ops dp_flow_genl_ops[] = {
1395        { .cmd = OVS_FLOW_CMD_NEW,
1396          .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
1397          .policy = flow_policy,
1398          .doit = ovs_flow_cmd_new
1399        },
1400        { .cmd = OVS_FLOW_CMD_DEL,
1401          .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
1402          .policy = flow_policy,
1403          .doit = ovs_flow_cmd_del
1404        },
1405        { .cmd = OVS_FLOW_CMD_GET,
1406          .flags = 0,               /* OK for unprivileged users. */
1407          .policy = flow_policy,
1408          .doit = ovs_flow_cmd_get,
1409          .dumpit = ovs_flow_cmd_dump
1410        },
1411        { .cmd = OVS_FLOW_CMD_SET,
1412          .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
1413          .policy = flow_policy,
1414          .doit = ovs_flow_cmd_set,
1415        },
1416};
1417
1418static struct genl_family dp_flow_genl_family = {
1419        .id = GENL_ID_GENERATE,
1420        .hdrsize = sizeof(struct ovs_header),
1421        .name = OVS_FLOW_FAMILY,
1422        .version = OVS_FLOW_VERSION,
1423        .maxattr = OVS_FLOW_ATTR_MAX,
1424        .netnsok = true,
1425        .parallel_ops = true,
1426        .ops = dp_flow_genl_ops,
1427        .n_ops = ARRAY_SIZE(dp_flow_genl_ops),
1428        .mcgrps = &ovs_dp_flow_multicast_group,
1429        .n_mcgrps = 1,
1430};
1431
1432static size_t ovs_dp_cmd_msg_size(void)
1433{
1434        size_t msgsize = NLMSG_ALIGN(sizeof(struct ovs_header));
1435
1436        msgsize += nla_total_size(IFNAMSIZ);
1437        msgsize += nla_total_size(sizeof(struct ovs_dp_stats));
1438        msgsize += nla_total_size(sizeof(struct ovs_dp_megaflow_stats));
1439        msgsize += nla_total_size(sizeof(u32)); /* OVS_DP_ATTR_USER_FEATURES */
1440
1441        return msgsize;
1442}
1443
1444/* Called with ovs_mutex. */
1445static int ovs_dp_cmd_fill_info(struct datapath *dp, struct sk_buff *skb,
1446                                u32 portid, u32 seq, u32 flags, u8 cmd)
1447{
1448        struct ovs_header *ovs_header;
1449        struct ovs_dp_stats dp_stats;
1450        struct ovs_dp_megaflow_stats dp_megaflow_stats;
1451        int err;
1452
1453        ovs_header = genlmsg_put(skb, portid, seq, &dp_datapath_genl_family,
1454                                   flags, cmd);
1455        if (!ovs_header)
1456                goto error;
1457
1458        ovs_header->dp_ifindex = get_dpifindex(dp);
1459
1460        err = nla_put_string(skb, OVS_DP_ATTR_NAME, ovs_dp_name(dp));
1461        if (err)
1462                goto nla_put_failure;
1463
1464        get_dp_stats(dp, &dp_stats, &dp_megaflow_stats);
1465        if (nla_put(skb, OVS_DP_ATTR_STATS, sizeof(struct ovs_dp_stats),
1466                        &dp_stats))
1467                goto nla_put_failure;
1468
1469        if (nla_put(skb, OVS_DP_ATTR_MEGAFLOW_STATS,
1470                        sizeof(struct ovs_dp_megaflow_stats),
1471                        &dp_megaflow_stats))
1472                goto nla_put_failure;
1473
1474        if (nla_put_u32(skb, OVS_DP_ATTR_USER_FEATURES, dp->user_features))
1475                goto nla_put_failure;
1476
1477        genlmsg_end(skb, ovs_header);
1478        return 0;
1479
1480nla_put_failure:
1481        genlmsg_cancel(skb, ovs_header);
1482error:
1483        return -EMSGSIZE;
1484}
1485
1486static struct sk_buff *ovs_dp_cmd_alloc_info(void)
1487{
1488        return genlmsg_new(ovs_dp_cmd_msg_size(), GFP_KERNEL);
1489}
1490
1491/* Called with rcu_read_lock or ovs_mutex. */
1492static struct datapath *lookup_datapath(struct net *net,
1493                                        const struct ovs_header *ovs_header,
1494                                        struct nlattr *a[OVS_DP_ATTR_MAX + 1])
1495{
1496        struct datapath *dp;
1497
1498        if (!a[OVS_DP_ATTR_NAME])
1499                dp = get_dp(net, ovs_header->dp_ifindex);
1500        else {
1501                struct vport *vport;
1502
1503                vport = ovs_vport_locate(net, nla_data(a[OVS_DP_ATTR_NAME]));
1504                dp = vport && vport->port_no == OVSP_LOCAL ? vport->dp : NULL;
1505        }
1506        return dp ? dp : ERR_PTR(-ENODEV);
1507}
1508
1509static void ovs_dp_reset_user_features(struct sk_buff *skb, struct genl_info *info)
1510{
1511        struct datapath *dp;
1512
1513        dp = lookup_datapath(sock_net(skb->sk), info->userhdr, info->attrs);
1514        if (IS_ERR(dp))
1515                return;
1516
1517        WARN(dp->user_features, "Dropping previously announced user features\n");
1518        dp->user_features = 0;
1519}
1520
1521static void ovs_dp_change(struct datapath *dp, struct nlattr *a[])
1522{
1523        if (a[OVS_DP_ATTR_USER_FEATURES])
1524                dp->user_features = nla_get_u32(a[OVS_DP_ATTR_USER_FEATURES]);
1525}
1526
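    /* OVS_DP_CMD_NEW requires OVS_DP_ATTR_NAME and OVS_DP_ATTR_UPCALL_PID.
     * A minimal user-space sketch of the request, assuming the libnl-3
     * generic netlink helpers (the datapath name "dp0" is illustrative):
     *
     *     struct nl_sock *sk = nl_socket_alloc();
     *     struct nl_msg *msg = nlmsg_alloc();
     *     struct ovs_header *oh;
     *     int fam;
     *
     *     genl_connect(sk);
     *     fam = genl_ctrl_resolve(sk, OVS_DATAPATH_FAMILY);
     *     oh = genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ, fam,
     *                      sizeof(*oh), NLM_F_REQUEST | NLM_F_ECHO,
     *                      OVS_DP_CMD_NEW, OVS_DATAPATH_VERSION);
     *     oh->dp_ifindex = 0;      // ignored by CMD_NEW
     *     nla_put_string(msg, OVS_DP_ATTR_NAME, "dp0");
     *     nla_put_u32(msg, OVS_DP_ATTR_UPCALL_PID, 0);  // 0: drop upcalls
     *     nl_send_auto(sk, msg);
     */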
1527static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info)
1528{
1529        struct nlattr **a = info->attrs;
1530        struct vport_parms parms;
1531        struct sk_buff *reply;
1532        struct datapath *dp;
1533        struct vport *vport;
1534        struct ovs_net *ovs_net;
1535        int err, i;
1536
1537        err = -EINVAL;
1538        if (!a[OVS_DP_ATTR_NAME] || !a[OVS_DP_ATTR_UPCALL_PID])
1539                goto err;
1540
1541        reply = ovs_dp_cmd_alloc_info();
1542        if (!reply)
1543                return -ENOMEM;
1544
1545        err = -ENOMEM;
1546        dp = kzalloc(sizeof(*dp), GFP_KERNEL);
1547        if (dp == NULL)
1548                goto err_free_reply;
1549
1550        ovs_dp_set_net(dp, sock_net(skb->sk));
1551
1552        /* Allocate table. */
1553        err = ovs_flow_tbl_init(&dp->table);
1554        if (err)
1555                goto err_free_dp;
1556
1557        dp->stats_percpu = netdev_alloc_pcpu_stats(struct dp_stats_percpu);
1558        if (!dp->stats_percpu) {
1559                err = -ENOMEM;
1560                goto err_destroy_table;
1561        }
1562
1563        dp->ports = kmalloc(DP_VPORT_HASH_BUCKETS * sizeof(struct hlist_head),
1564                            GFP_KERNEL);
1565        if (!dp->ports) {
1566                err = -ENOMEM;
1567                goto err_destroy_percpu;
1568        }
1569
1570        for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++)
1571                INIT_HLIST_HEAD(&dp->ports[i]);
1572
1573        /* Set up our datapath device. */
1574        parms.name = nla_data(a[OVS_DP_ATTR_NAME]);
1575        parms.type = OVS_VPORT_TYPE_INTERNAL;
1576        parms.options = NULL;
1577        parms.dp = dp;
1578        parms.port_no = OVSP_LOCAL;
1579        parms.upcall_portids = a[OVS_DP_ATTR_UPCALL_PID];
1580
1581        ovs_dp_change(dp, a);
1582
1583        /* So far only local changes have been made; now we need the lock. */
1584        ovs_lock();
1585
1586        vport = new_vport(&parms);
1587        if (IS_ERR(vport)) {
1588                err = PTR_ERR(vport);
1589                if (err == -EBUSY)
1590                        err = -EEXIST;
1591
1592                if (err == -EEXIST) {
1593                        /* An outdated user space instance that does not understand
1594                         * the concept of user_features has attempted to create a new
1595                         * datapath and is likely to reuse it. Drop all user features.
1596                         */
1597                        if (info->genlhdr->version < OVS_DP_VER_FEATURES)
1598                                ovs_dp_reset_user_features(skb, info);
1599                }
1600
1601                goto err_destroy_ports_array;
1602        }
1603
1604        err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid,
1605                                   info->snd_seq, 0, OVS_DP_CMD_NEW);
1606        BUG_ON(err < 0);
1607
1608        ovs_net = net_generic(ovs_dp_get_net(dp), ovs_net_id);
1609        list_add_tail_rcu(&dp->list_node, &ovs_net->dps);
1610
1611        ovs_unlock();
1612
1613        ovs_notify(&dp_datapath_genl_family, reply, info);
1614        return 0;
1615
1616err_destroy_ports_array:
1617        ovs_unlock();
1618        kfree(dp->ports);
1619err_destroy_percpu:
1620        free_percpu(dp->stats_percpu);
1621err_destroy_table:
1622        ovs_flow_tbl_destroy(&dp->table);
1623err_free_dp:
1624        kfree(dp);
1625err_free_reply:
1626        kfree_skb(reply);
1627err:
1628        return err;
1629}
1630
1631/* Called with ovs_mutex. */
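    /* Detaches every non-local port, then the local port, and finally
     * frees the flow table and the datapath itself after an RCU grace
     * period, so concurrent readers never see a half-destroyed datapath.
     */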
1632static void __dp_destroy(struct datapath *dp)
1633{
1634        int i;
1635
1636        for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) {
1637                struct vport *vport;
1638                struct hlist_node *n;
1639
1640                hlist_for_each_entry_safe(vport, n, &dp->ports[i], dp_hash_node)
1641                        if (vport->port_no != OVSP_LOCAL)
1642                                ovs_dp_detach_port(vport);
1643        }
1644
1645        list_del_rcu(&dp->list_node);
1646
1647        /* OVSP_LOCAL is the datapath's internal port.  It must be
1648         * detached last, after all other ports and before the datapath is freed.
1649         */
1650        ovs_dp_detach_port(ovs_vport_ovsl(dp, OVSP_LOCAL));
1651
1652        /* Destroy the flow table after an RCU grace period. */
1653        call_rcu(&dp->rcu, destroy_dp_rcu);
1654}
1655
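    /* Note the pattern shared by the datapath doit handlers: the reply skb
     * is allocated before ovs_lock() so that nothing can fail after the
     * operation has taken effect, and ovs_dp_cmd_fill_info() cannot
     * overflow a buffer sized by ovs_dp_cmd_msg_size(), hence the BUG_ON().
     */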
1656static int ovs_dp_cmd_del(struct sk_buff *skb, struct genl_info *info)
1657{
1658        struct sk_buff *reply;
1659        struct datapath *dp;
1660        int err;
1661
1662        reply = ovs_dp_cmd_alloc_info();
1663        if (!reply)
1664                return -ENOMEM;
1665
1666        ovs_lock();
1667        dp = lookup_datapath(sock_net(skb->sk), info->userhdr, info->attrs);
1668        err = PTR_ERR(dp);
1669        if (IS_ERR(dp))
1670                goto err_unlock_free;
1671
1672        err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid,
1673                                   info->snd_seq, 0, OVS_DP_CMD_DEL);
1674        BUG_ON(err < 0);
1675
1676        __dp_destroy(dp);
1677        ovs_unlock();
1678
1679        ovs_notify(&dp_datapath_genl_family, reply, info);
1680
1681        return 0;
1682
1683err_unlock_free:
1684        ovs_unlock();
1685        kfree_skb(reply);
1686        return err;
1687}
1688
1689static int ovs_dp_cmd_set(struct sk_buff *skb, struct genl_info *info)
1690{
1691        struct sk_buff *reply;
1692        struct datapath *dp;
1693        int err;
1694
1695        reply = ovs_dp_cmd_alloc_info();
1696        if (!reply)
1697                return -ENOMEM;
1698
1699        ovs_lock();
1700        dp = lookup_datapath(sock_net(skb->sk), info->userhdr, info->attrs);
1701        err = PTR_ERR(dp);
1702        if (IS_ERR(dp))
1703                goto err_unlock_free;
1704
1705        ovs_dp_change(dp, info->attrs);
1706
1707        err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid,
1708                                   info->snd_seq, 0, OVS_DP_CMD_NEW);
1709        BUG_ON(err < 0);
1710
1711        ovs_unlock();
1712        ovs_notify(&dp_datapath_genl_family, reply, info);
1713
1714        return 0;
1715
1716err_unlock_free:
1717        ovs_unlock();
1718        kfree_skb(reply);
1719        return err;
1720}
1721
1722static int ovs_dp_cmd_get(struct sk_buff *skb, struct genl_info *info)
1723{
1724        struct sk_buff *reply;
1725        struct datapath *dp;
1726        int err;
1727
1728        reply = ovs_dp_cmd_alloc_info();
1729        if (!reply)
1730                return -ENOMEM;
1731
1732        ovs_lock();
1733        dp = lookup_datapath(sock_net(skb->sk), info->userhdr, info->attrs);
1734        if (IS_ERR(dp)) {
1735                err = PTR_ERR(dp);
1736                goto err_unlock_free;
1737        }
1738        err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid,
1739                                   info->snd_seq, 0, OVS_DP_CMD_NEW);
1740        BUG_ON(err < 0);
1741        ovs_unlock();
1742
1743        return genlmsg_reply(reply, info);
1744
1745err_unlock_free:
1746        ovs_unlock();
1747        kfree_skb(reply);
1748        return err;
1749}
1750
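    /* Dump callback: cb->args[0] persists across invocations and counts
     * the datapaths already emitted, so the walk resumes where the
     * previous skb filled up.
     */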
1751static int ovs_dp_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb)
1752{
1753        struct ovs_net *ovs_net = net_generic(sock_net(skb->sk), ovs_net_id);
1754        struct datapath *dp;
1755        int skip = cb->args[0];
1756        int i = 0;
1757
1758        ovs_lock();
1759        list_for_each_entry(dp, &ovs_net->dps, list_node) {
1760                if (i >= skip &&
1761                    ovs_dp_cmd_fill_info(dp, skb, NETLINK_CB(cb->skb).portid,
1762                                         cb->nlh->nlmsg_seq, NLM_F_MULTI,
1763                                         OVS_DP_CMD_NEW) < 0)
1764                        break;
1765                i++;
1766        }
1767        ovs_unlock();
1768
1769        cb->args[0] = i;
1770
1771        return skb->len;
1772}
1773
1774static const struct nla_policy datapath_policy[OVS_DP_ATTR_MAX + 1] = {
1775        [OVS_DP_ATTR_NAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 },
1776        [OVS_DP_ATTR_UPCALL_PID] = { .type = NLA_U32 },
1777        [OVS_DP_ATTR_USER_FEATURES] = { .type = NLA_U32 },
1778};
1779
1780static const struct genl_ops dp_datapath_genl_ops[] = {
1781        { .cmd = OVS_DP_CMD_NEW,
1782          .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
1783          .policy = datapath_policy,
1784          .doit = ovs_dp_cmd_new
1785        },
1786        { .cmd = OVS_DP_CMD_DEL,
1787          .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
1788          .policy = datapath_policy,
1789          .doit = ovs_dp_cmd_del
1790        },
1791        { .cmd = OVS_DP_CMD_GET,
1792          .flags = 0,               /* OK for unprivileged users. */
1793          .policy = datapath_policy,
1794          .doit = ovs_dp_cmd_get,
1795          .dumpit = ovs_dp_cmd_dump
1796        },
1797        { .cmd = OVS_DP_CMD_SET,
1798          .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
1799          .policy = datapath_policy,
1800          .doit = ovs_dp_cmd_set,
1801        },
1802};
1803
1804static struct genl_family dp_datapath_genl_family = {
1805        .id = GENL_ID_GENERATE,
1806        .hdrsize = sizeof(struct ovs_header),
1807        .name = OVS_DATAPATH_FAMILY,
1808        .version = OVS_DATAPATH_VERSION,
1809        .maxattr = OVS_DP_ATTR_MAX,
1810        .netnsok = true,
1811        .parallel_ops = true,
1812        .ops = dp_datapath_genl_ops,
1813        .n_ops = ARRAY_SIZE(dp_datapath_genl_ops),
1814        .mcgrps = &ovs_dp_datapath_multicast_group,
1815        .n_mcgrps = 1,
1816};
1817
1818/* Called with ovs_mutex or RCU read lock. */
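    /* Unlike the datapath replies, a vport reply carries variable-length
     * parts (upcall portids, vport options), so callers allocate an
     * NLMSG_DEFAULT_SIZE skb instead of an exact precomputed size.  The
     * doit handlers still BUG_ON() a failure on the assumption that this
     * is always large enough; dump uses -EMSGSIZE to stop filling the
     * current skb.
     */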
1819static int ovs_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb,
1820                                   u32 portid, u32 seq, u32 flags, u8 cmd)
1821{
1822        struct ovs_header *ovs_header;
1823        struct ovs_vport_stats vport_stats;
1824        int err;
1825
1826        ovs_header = genlmsg_put(skb, portid, seq, &dp_vport_genl_family,
1827                                 flags, cmd);
1828        if (!ovs_header)
1829                return -EMSGSIZE;
1830
1831        ovs_header->dp_ifindex = get_dpifindex(vport->dp);
1832
1833        if (nla_put_u32(skb, OVS_VPORT_ATTR_PORT_NO, vport->port_no) ||
1834            nla_put_u32(skb, OVS_VPORT_ATTR_TYPE, vport->ops->type) ||
1835            nla_put_string(skb, OVS_VPORT_ATTR_NAME,
1836                           ovs_vport_name(vport)))
1837                goto nla_put_failure;
1838
1839        ovs_vport_get_stats(vport, &vport_stats);
1840        if (nla_put(skb, OVS_VPORT_ATTR_STATS, sizeof(struct ovs_vport_stats),
1841                    &vport_stats))
1842                goto nla_put_failure;
1843
1844        if (ovs_vport_get_upcall_portids(vport, skb))
1845                goto nla_put_failure;
1846
1847        err = ovs_vport_get_options(vport, skb);
1848        if (err == -EMSGSIZE)
1849                goto error;
1850
1851        genlmsg_end(skb, ovs_header);
1852        return 0;
1853
1854nla_put_failure:
1855        err = -EMSGSIZE;
1856error:
1857        genlmsg_cancel(skb, ovs_header);
1858        return err;
1859}
1860
1861static struct sk_buff *ovs_vport_cmd_alloc_info(void)
1862{
1863        return nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
1864}
1865
1866/* Called with ovs_mutex, only via ovs_dp_notify_wq(). */
1867struct sk_buff *ovs_vport_cmd_build_info(struct vport *vport, u32 portid,
1868                                         u32 seq, u8 cmd)
1869{
1870        struct sk_buff *skb;
1871        int retval;
1872
1873        skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
1874        if (!skb)
1875                return ERR_PTR(-ENOMEM);
1876
1877        retval = ovs_vport_cmd_fill_info(vport, skb, portid, seq, 0, cmd);
1878        BUG_ON(retval < 0);
1879
1880        return skb;
1881}
1882
1883/* Called with ovs_mutex or RCU read lock. */
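    /* A vport is addressed by OVS_VPORT_ATTR_NAME or, failing that, by
     * OVS_VPORT_ATTR_PORT_NO within the datapath named by dp_ifindex.
     * Names are unique per namespace, so a supplied dp_ifindex is only
     * cross-checked against the located vport's datapath.
     */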
1884static struct vport *lookup_vport(struct net *net,
1885                                  const struct ovs_header *ovs_header,
1886                                  struct nlattr *a[OVS_VPORT_ATTR_MAX + 1])
1887{
1888        struct datapath *dp;
1889        struct vport *vport;
1890
1891        if (a[OVS_VPORT_ATTR_NAME]) {
1892                vport = ovs_vport_locate(net, nla_data(a[OVS_VPORT_ATTR_NAME]));
1893                if (!vport)
1894                        return ERR_PTR(-ENODEV);
1895                if (ovs_header->dp_ifindex &&
1896                    ovs_header->dp_ifindex != get_dpifindex(vport->dp))
1897                        return ERR_PTR(-ENODEV);
1898                return vport;
1899        } else if (a[OVS_VPORT_ATTR_PORT_NO]) {
1900                u32 port_no = nla_get_u32(a[OVS_VPORT_ATTR_PORT_NO]);
1901
1902                if (port_no >= DP_MAX_PORTS)
1903                        return ERR_PTR(-EFBIG);
1904
1905                dp = get_dp(net, ovs_header->dp_ifindex);
1906                if (!dp)
1907                        return ERR_PTR(-ENODEV);
1908
1909                vport = ovs_vport_ovsl_rcu(dp, port_no);
1910                if (!vport)
1911                        return ERR_PTR(-ENODEV);
1912                return vport;
1913        } else
1914                return ERR_PTR(-EINVAL);
1915}
1916
1917/* Called with ovs_mutex. */
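    /* Recomputes the maximum forwarding headroom across all vports and
     * propagates it to every port, so packets entering on any port can be
     * forwarded out any other without reallocating skb headroom.
     */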
1918static void update_headroom(struct datapath *dp)
1919{
1920        unsigned int dev_headroom, max_headroom = 0;
1921        struct net_device *dev;
1922        struct vport *vport;
1923        int i;
1924
1925        for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) {
1926                hlist_for_each_entry_rcu(vport, &dp->ports[i], dp_hash_node) {
1927                        dev = vport->dev;
1928                        dev_headroom = netdev_get_fwd_headroom(dev);
1929                        if (dev_headroom > max_headroom)
1930                                max_headroom = dev_headroom;
1931                }
1932        }
1933
1934        dp->max_headroom = max_headroom;
1935        for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++)
1936                hlist_for_each_entry_rcu(vport, &dp->ports[i], dp_hash_node)
1937                        netdev_set_rx_headroom(vport->dev, max_headroom);
1938}
1939
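    /* If OVS_VPORT_ATTR_PORT_NO is absent, the lowest free port number
     * from 1 upwards is allocated (0 is OVSP_LOCAL).  new_vport() may
     * return -EAGAIN, e.g. when a vport module had to be requested, in
     * which case the datapath lookup is retried from "restart".
     */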
1940static int ovs_vport_cmd_new(struct sk_buff *skb, struct genl_info *info)
1941{
1942        struct nlattr **a = info->attrs;
1943        struct ovs_header *ovs_header = info->userhdr;
1944        struct vport_parms parms;
1945        struct sk_buff *reply;
1946        struct vport *vport;
1947        struct datapath *dp;
1948        u32 port_no;
1949        int err;
1950
1951        if (!a[OVS_VPORT_ATTR_NAME] || !a[OVS_VPORT_ATTR_TYPE] ||
1952            !a[OVS_VPORT_ATTR_UPCALL_PID])
1953                return -EINVAL;
1954
1955        port_no = a[OVS_VPORT_ATTR_PORT_NO]
1956                ? nla_get_u32(a[OVS_VPORT_ATTR_PORT_NO]) : 0;
1957        if (port_no >= DP_MAX_PORTS)
1958                return -EFBIG;
1959
1960        reply = ovs_vport_cmd_alloc_info();
1961        if (!reply)
1962                return -ENOMEM;
1963
1964        ovs_lock();
1965restart:
1966        dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex);
1967        err = -ENODEV;
1968        if (!dp)
1969                goto exit_unlock_free;
1970
1971        if (port_no) {
1972                vport = ovs_vport_ovsl(dp, port_no);
1973                err = -EBUSY;
1974                if (vport)
1975                        goto exit_unlock_free;
1976        } else {
1977                for (port_no = 1; ; port_no++) {
1978                        if (port_no >= DP_MAX_PORTS) {
1979                                err = -EFBIG;
1980                                goto exit_unlock_free;
1981                        }
1982                        vport = ovs_vport_ovsl(dp, port_no);
1983                        if (!vport)
1984                                break;
1985                }
1986        }
1987
1988        parms.name = nla_data(a[OVS_VPORT_ATTR_NAME]);
1989        parms.type = nla_get_u32(a[OVS_VPORT_ATTR_TYPE]);
1990        parms.options = a[OVS_VPORT_ATTR_OPTIONS];
1991        parms.dp = dp;
1992        parms.port_no = port_no;
1993        parms.upcall_portids = a[OVS_VPORT_ATTR_UPCALL_PID];
1994
1995        vport = new_vport(&parms);
1996        err = PTR_ERR(vport);
1997        if (IS_ERR(vport)) {
1998                if (err == -EAGAIN)
1999                        goto restart;
2000                goto exit_unlock_free;
2001        }
2002
2003        err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid,
2004                                      info->snd_seq, 0, OVS_VPORT_CMD_NEW);
2005
2006        if (netdev_get_fwd_headroom(vport->dev) > dp->max_headroom)
2007                update_headroom(dp);
2008        else
2009                netdev_set_rx_headroom(vport->dev, dp->max_headroom);
2010
2011        BUG_ON(err < 0);
2012        ovs_unlock();
2013
2014        ovs_notify(&dp_vport_genl_family, reply, info);
2015        return 0;
2016
2017exit_unlock_free:
2018        ovs_unlock();
2019        kfree_skb(reply);
2020        return err;
2021}
2022
2023static int ovs_vport_cmd_set(struct sk_buff *skb, struct genl_info *info)
2024{
2025        struct nlattr **a = info->attrs;
2026        struct sk_buff *reply;
2027        struct vport *vport;
2028        int err;
2029
2030        reply = ovs_vport_cmd_alloc_info();
2031        if (!reply)
2032                return -ENOMEM;
2033
2034        ovs_lock();
2035        vport = lookup_vport(sock_net(skb->sk), info->userhdr, a);
2036        err = PTR_ERR(vport);
2037        if (IS_ERR(vport))
2038                goto exit_unlock_free;
2039
2040        if (a[OVS_VPORT_ATTR_TYPE] &&
2041            nla_get_u32(a[OVS_VPORT_ATTR_TYPE]) != vport->ops->type) {
2042                err = -EINVAL;
2043                goto exit_unlock_free;
2044        }
2045
2046        if (a[OVS_VPORT_ATTR_OPTIONS]) {
2047                err = ovs_vport_set_options(vport, a[OVS_VPORT_ATTR_OPTIONS]);
2048                if (err)
2049                        goto exit_unlock_free;
2050        }
2051
2053        if (a[OVS_VPORT_ATTR_UPCALL_PID]) {
2054                struct nlattr *ids = a[OVS_VPORT_ATTR_UPCALL_PID];
2055
2056                err = ovs_vport_set_upcall_portids(vport, ids);
2057                if (err)
2058                        goto exit_unlock_free;
2059        }
2060
2061        err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid,
2062                                      info->snd_seq, 0, OVS_VPORT_CMD_NEW);
2063        BUG_ON(err < 0);
2064
2065        ovs_unlock();
2066        ovs_notify(&dp_vport_genl_family, reply, info);
2067        return 0;
2068
2069exit_unlock_free:
2070        ovs_unlock();
2071        kfree_skb(reply);
2072        return err;
2073}
2074
2075static int ovs_vport_cmd_del(struct sk_buff *skb, struct genl_info *info)
2076{
2077        bool must_update_headroom = false;
2078        struct nlattr **a = info->attrs;
2079        struct sk_buff *reply;
2080        struct datapath *dp;
2081        struct vport *vport;
2082        int err;
2083
2084        reply = ovs_vport_cmd_alloc_info();
2085        if (!reply)
2086                return -ENOMEM;
2087
2088        ovs_lock();
2089        vport = lookup_vport(sock_net(skb->sk), info->userhdr, a);
2090        err = PTR_ERR(vport);
2091        if (IS_ERR(vport))
2092                goto exit_unlock_free;
2093
2094        if (vport->port_no == OVSP_LOCAL) {
2095                err = -EINVAL;
2096                goto exit_unlock_free;
2097        }
2098
2099        err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid,
2100                                      info->snd_seq, 0, OVS_VPORT_CMD_DEL);
2101        BUG_ON(err < 0);
2102
2103        /* Deleting this vport may require recomputing dp->max_headroom. */
2104        dp = vport->dp;
2105        if (netdev_get_fwd_headroom(vport->dev) == dp->max_headroom)
2106                must_update_headroom = true;
2107        netdev_reset_rx_headroom(vport->dev);
2108        ovs_dp_detach_port(vport);
2109
2110        if (must_update_headroom)
2111                update_headroom(dp);
2112        ovs_unlock();
2113
2114        ovs_notify(&dp_vport_genl_family, reply, info);
2115        return 0;
2116
2117exit_unlock_free:
2118        ovs_unlock();
2119        kfree_skb(reply);
2120        return err;
2121}
2122
2123static int ovs_vport_cmd_get(struct sk_buff *skb, struct genl_info *info)
2124{
2125        struct nlattr **a = info->attrs;
2126        struct ovs_header *ovs_header = info->userhdr;
2127        struct sk_buff *reply;
2128        struct vport *vport;
2129        int err;
2130
2131        reply = ovs_vport_cmd_alloc_info();
2132        if (!reply)
2133                return -ENOMEM;
2134
2135        rcu_read_lock();
2136        vport = lookup_vport(sock_net(skb->sk), ovs_header, a);
2137        err = PTR_ERR(vport);
2138        if (IS_ERR(vport))
2139                goto exit_unlock_free;
2140        err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid,
2141                                      info->snd_seq, 0, OVS_VPORT_CMD_NEW);
2142        BUG_ON(err < 0);
2143        rcu_read_unlock();
2144
2145        return genlmsg_reply(reply, info);
2146
2147exit_unlock_free:
2148        rcu_read_unlock();
2149        kfree_skb(reply);
2150        return err;
2151}
2152
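    /* Two-level resume cursor: cb->args[0] is the hash bucket and
     * cb->args[1] the number of vports already dumped from that bucket.
     */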
2153static int ovs_vport_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb)
2154{
2155        struct ovs_header *ovs_header = genlmsg_data(nlmsg_data(cb->nlh));
2156        struct datapath *dp;
2157        int bucket = cb->args[0], skip = cb->args[1];
2158        int i, j = 0;
2159
2160        rcu_read_lock();
2161        dp = get_dp_rcu(sock_net(skb->sk), ovs_header->dp_ifindex);
2162        if (!dp) {
2163                rcu_read_unlock();
2164                return -ENODEV;
2165        }
2166        for (i = bucket; i < DP_VPORT_HASH_BUCKETS; i++) {
2167                struct vport *vport;
2168
2169                j = 0;
2170                hlist_for_each_entry_rcu(vport, &dp->ports[i], dp_hash_node) {
2171                        if (j >= skip &&
2172                            ovs_vport_cmd_fill_info(vport, skb,
2173                                                    NETLINK_CB(cb->skb).portid,
2174                                                    cb->nlh->nlmsg_seq,
2175                                                    NLM_F_MULTI,
2176                                                    OVS_VPORT_CMD_NEW) < 0)
2177                                goto out;
2178
2179                        j++;
2180                }
2181                skip = 0;
2182        }
2183out:
2184        rcu_read_unlock();
2185
2186        cb->args[0] = i;
2187        cb->args[1] = j;
2188
2189        return skb->len;
2190}
2191
2192static const struct nla_policy vport_policy[OVS_VPORT_ATTR_MAX + 1] = {
2193        [OVS_VPORT_ATTR_NAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 },
2194        [OVS_VPORT_ATTR_STATS] = { .len = sizeof(struct ovs_vport_stats) },
2195        [OVS_VPORT_ATTR_PORT_NO] = { .type = NLA_U32 },
2196        [OVS_VPORT_ATTR_TYPE] = { .type = NLA_U32 },
2197        [OVS_VPORT_ATTR_UPCALL_PID] = { .type = NLA_U32 },
2198        [OVS_VPORT_ATTR_OPTIONS] = { .type = NLA_NESTED },
2199};
2200
2201static const struct genl_ops dp_vport_genl_ops[] = {
2202        { .cmd = OVS_VPORT_CMD_NEW,
2203          .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
2204          .policy = vport_policy,
2205          .doit = ovs_vport_cmd_new
2206        },
2207        { .cmd = OVS_VPORT_CMD_DEL,
2208          .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
2209          .policy = vport_policy,
2210          .doit = ovs_vport_cmd_del
2211        },
2212        { .cmd = OVS_VPORT_CMD_GET,
2213          .flags = 0,               /* OK for unprivileged users. */
2214          .policy = vport_policy,
2215          .doit = ovs_vport_cmd_get,
2216          .dumpit = ovs_vport_cmd_dump
2217        },
2218        { .cmd = OVS_VPORT_CMD_SET,
2219          .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
2220          .policy = vport_policy,
2221          .doit = ovs_vport_cmd_set,
2222        },
2223};
2224
2225struct genl_family dp_vport_genl_family = {
2226        .id = GENL_ID_GENERATE,
2227        .hdrsize = sizeof(struct ovs_header),
2228        .name = OVS_VPORT_FAMILY,
2229        .version = OVS_VPORT_VERSION,
2230        .maxattr = OVS_VPORT_ATTR_MAX,
2231        .netnsok = true,
2232        .parallel_ops = true,
2233        .ops = dp_vport_genl_ops,
2234        .n_ops = ARRAY_SIZE(dp_vport_genl_ops),
2235        .mcgrps = &ovs_dp_vport_multicast_group,
2236        .n_mcgrps = 1,
2237};
2238
2239static struct genl_family * const dp_genl_families[] = {
2240        &dp_datapath_genl_family,
2241        &dp_vport_genl_family,
2242        &dp_flow_genl_family,
2243        &dp_packet_genl_family,
2244};
2245
2246static void dp_unregister_genl(int n_families)
2247{
2248        int i;
2249
2250        for (i = 0; i < n_families; i++)
2251                genl_unregister_family(dp_genl_families[i]);
2252}
2253
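    /* Registers every family in dp_genl_families[]; on failure, only the
     * i families registered so far are unregistered again.
     */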
2254static int dp_register_genl(void)
2255{
2256        int err;
2257        int i;
2258
2259        for (i = 0; i < ARRAY_SIZE(dp_genl_families); i++) {
2260
2261                err = genl_register_family(dp_genl_families[i]);
2262                if (err)
2263                        goto error;
2264        }
2265
2266        return 0;
2267
2268error:
2269        dp_unregister_genl(i);
2270        return err;
2271}
2272
2273static int __net_init ovs_init_net(struct net *net)
2274{
2275        struct ovs_net *ovs_net = net_generic(net, ovs_net_id);
2276
2277        INIT_LIST_HEAD(&ovs_net->dps);
2278        INIT_WORK(&ovs_net->dp_notify_work, ovs_dp_notify_wq);
2279        ovs_ct_init(net);
2280        return 0;
2281}
2282
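    /* Collects, from the datapaths of @net, those internal-device vports
     * whose backing netdev lives in the dying namespace @dnet, so that
     * ovs_exit_net() can detach them before @dnet disappears.
     */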
2283static void __net_exit list_vports_from_net(struct net *net, struct net *dnet,
2284                                            struct list_head *head)
2285{
2286        struct ovs_net *ovs_net = net_generic(net, ovs_net_id);
2287        struct datapath *dp;
2288
2289        list_for_each_entry(dp, &ovs_net->dps, list_node) {
2290                int i;
2291
2292                for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) {
2293                        struct vport *vport;
2294
2295                        hlist_for_each_entry(vport, &dp->ports[i], dp_hash_node) {
2296                                if (vport->ops->type != OVS_VPORT_TYPE_INTERNAL)
2297                                        continue;
2298
2299                                if (dev_net(vport->dev) == dnet)
2300                                        list_add(&vport->detach_list, head);
2301                        }
2302                }
2303        }
2304}
2305
2306static void __net_exit ovs_exit_net(struct net *dnet)
2307{
2308        struct datapath *dp, *dp_next;
2309        struct ovs_net *ovs_net = net_generic(dnet, ovs_net_id);
2310        struct vport *vport, *vport_next;
2311        struct net *net;
2312        LIST_HEAD(head);
2313
2314        ovs_ct_exit(dnet);
2315        ovs_lock();
2316        list_for_each_entry_safe(dp, dp_next, &ovs_net->dps, list_node)
2317                __dp_destroy(dp);
2318
2319        rtnl_lock();
2320        for_each_net(net)
2321                list_vports_from_net(net, dnet, &head);
2322        rtnl_unlock();
2323
2324        /* Detach all collected vports from the dying namespace. */
2325        list_for_each_entry_safe(vport, vport_next, &head, detach_list) {
2326                list_del(&vport->detach_list);
2327                ovs_dp_detach_port(vport);
2328        }
2329
2330        ovs_unlock();
2331
2332        cancel_work_sync(&ovs_net->dp_notify_work);
2333}
2334
2335static struct pernet_operations ovs_net_ops = {
2336        .init = ovs_init_net,
2337        .exit = ovs_exit_net,
2338        .id   = &ovs_net_id,
2339        .size = sizeof(struct ovs_net),
2340};
2341
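    /* Module init: each registration step is unwound in reverse order on
     * failure, mirroring dp_cleanup() below.
     */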
2342static int __init dp_init(void)
2343{
2344        int err;
2345
2346        BUILD_BUG_ON(sizeof(struct ovs_skb_cb) > FIELD_SIZEOF(struct sk_buff, cb));
2347
2348        pr_info("Open vSwitch switching datapath\n");
2349
2350        err = action_fifos_init();
2351        if (err)
2352                goto error;
2353
2354        err = ovs_internal_dev_rtnl_link_register();
2355        if (err)
2356                goto error_action_fifos_exit;
2357
2358        err = ovs_flow_init();
2359        if (err)
2360                goto error_unreg_rtnl_link;
2361
2362        err = ovs_vport_init();
2363        if (err)
2364                goto error_flow_exit;
2365
2366        err = register_pernet_device(&ovs_net_ops);
2367        if (err)
2368                goto error_vport_exit;
2369
2370        err = register_netdevice_notifier(&ovs_dp_device_notifier);
2371        if (err)
2372                goto error_netns_exit;
2373
2374        err = ovs_netdev_init();
2375        if (err)
2376                goto error_unreg_notifier;
2377
2378        err = dp_register_genl();
2379        if (err < 0)
2380                goto error_unreg_netdev;
2381
2382        return 0;
2383
2384error_unreg_netdev:
2385        ovs_netdev_exit();
2386error_unreg_notifier:
2387        unregister_netdevice_notifier(&ovs_dp_device_notifier);
2388error_netns_exit:
2389        unregister_pernet_device(&ovs_net_ops);
2390error_vport_exit:
2391        ovs_vport_exit();
2392error_flow_exit:
2393        ovs_flow_exit();
2394error_unreg_rtnl_link:
2395        ovs_internal_dev_rtnl_link_unregister();
2396error_action_fifos_exit:
2397        action_fifos_exit();
2398error:
2399        return err;
2400}
2401
2402static void dp_cleanup(void)
2403{
2404        dp_unregister_genl(ARRAY_SIZE(dp_genl_families));
2405        ovs_netdev_exit();
2406        unregister_netdevice_notifier(&ovs_dp_device_notifier);
2407        unregister_pernet_device(&ovs_net_ops);
2408        rcu_barrier();
2409        ovs_vport_exit();
2410        ovs_flow_exit();
2411        ovs_internal_dev_rtnl_link_unregister();
2412        action_fifos_exit();
2413}
2414
2415module_init(dp_init);
2416module_exit(dp_cleanup);
2417
2418MODULE_DESCRIPTION("Open vSwitch switching datapath");
2419MODULE_LICENSE("GPL");
2420