linux/net/openvswitch/actions.c
<<
>>
Prefs
   1/*
   2 * Copyright (c) 2007-2014 Nicira, Inc.
   3 *
   4 * This program is free software; you can redistribute it and/or
   5 * modify it under the terms of version 2 of the GNU General Public
   6 * License as published by the Free Software Foundation.
   7 *
   8 * This program is distributed in the hope that it will be useful, but
   9 * WITHOUT ANY WARRANTY; without even the implied warranty of
  10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  11 * General Public License for more details.
  12 *
  13 * You should have received a copy of the GNU General Public License
  14 * along with this program; if not, write to the Free Software
  15 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  16 * 02110-1301, USA
  17 */
  18
  19#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  20
  21#include <linux/skbuff.h>
  22#include <linux/in.h>
  23#include <linux/ip.h>
  24#include <linux/openvswitch.h>
  25#include <linux/netfilter_ipv6.h>
  26#include <linux/sctp.h>
  27#include <linux/tcp.h>
  28#include <linux/udp.h>
  29#include <linux/in6.h>
  30#include <linux/if_arp.h>
  31#include <linux/if_vlan.h>
  32
  33#include <net/dst.h>
  34#include <net/ip.h>
  35#include <net/ipv6.h>
  36#include <net/ip6_fib.h>
  37#include <net/checksum.h>
  38#include <net/dsfield.h>
  39#include <net/mpls.h>
  40#include <net/sctp/checksum.h>
  41
  42#include "datapath.h"
  43#include "flow.h"
  44#include "conntrack.h"
  45#include "vport.h"
  46
  47static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
  48                              struct sw_flow_key *key,
  49                              const struct nlattr *attr, int len);
  50
  51struct deferred_action {
  52        struct sk_buff *skb;
  53        const struct nlattr *actions;
  54
  55        /* Store pkt_key clone when creating deferred action. */
  56        struct sw_flow_key pkt_key;
  57};
  58
  59#define MAX_L2_LEN      (VLAN_ETH_HLEN + 3 * MPLS_HLEN)
  60struct ovs_frag_data {
  61        unsigned long dst;
  62        struct vport *vport;
  63        struct ovs_skb_cb cb;
  64        __be16 inner_protocol;
  65        __u16 vlan_tci;
  66        __be16 vlan_proto;
  67        unsigned int l2_len;
  68        u8 l2_data[MAX_L2_LEN];
  69};
  70
  71static DEFINE_PER_CPU(struct ovs_frag_data, ovs_frag_data_storage);
  72
  73#define DEFERRED_ACTION_FIFO_SIZE 10
  74struct action_fifo {
  75        int head;
  76        int tail;
  77        /* Deferred action fifo queue storage. */
  78        struct deferred_action fifo[DEFERRED_ACTION_FIFO_SIZE];
  79};
  80
  81static struct action_fifo __percpu *action_fifos;
  82static DEFINE_PER_CPU(int, exec_actions_level);
  83
  84static void action_fifo_init(struct action_fifo *fifo)
  85{
  86        fifo->head = 0;
  87        fifo->tail = 0;
  88}
  89
  90static bool action_fifo_is_empty(const struct action_fifo *fifo)
  91{
  92        return (fifo->head == fifo->tail);
  93}
  94
  95static struct deferred_action *action_fifo_get(struct action_fifo *fifo)
  96{
  97        if (action_fifo_is_empty(fifo))
  98                return NULL;
  99
 100        return &fifo->fifo[fifo->tail++];
 101}
 102
 103static struct deferred_action *action_fifo_put(struct action_fifo *fifo)
 104{
 105        if (fifo->head >= DEFERRED_ACTION_FIFO_SIZE - 1)
 106                return NULL;
 107
 108        return &fifo->fifo[fifo->head++];
 109}
 110
 111/* Return true if fifo is not full */
 112static struct deferred_action *add_deferred_actions(struct sk_buff *skb,
 113                                                    const struct sw_flow_key *key,
 114                                                    const struct nlattr *attr)
 115{
 116        struct action_fifo *fifo;
 117        struct deferred_action *da;
 118
 119        fifo = this_cpu_ptr(action_fifos);
 120        da = action_fifo_put(fifo);
 121        if (da) {
 122                da->skb = skb;
 123                da->actions = attr;
 124                da->pkt_key = *key;
 125        }
 126
 127        return da;
 128}
 129
 130static void invalidate_flow_key(struct sw_flow_key *key)
 131{
 132        key->eth.type = htons(0);
 133}
 134
 135static bool is_flow_key_valid(const struct sw_flow_key *key)
 136{
 137        return !!key->eth.type;
 138}
 139
 140static int push_mpls(struct sk_buff *skb, struct sw_flow_key *key,
 141                     const struct ovs_action_push_mpls *mpls)
 142{
 143        __be32 *new_mpls_lse;
 144        struct ethhdr *hdr;
 145
 146        /* Networking stack do not allow simultaneous Tunnel and MPLS GSO. */
 147        if (skb->encapsulation)
 148                return -ENOTSUPP;
 149
 150        if (skb_cow_head(skb, MPLS_HLEN) < 0)
 151                return -ENOMEM;
 152
 153        skb_push(skb, MPLS_HLEN);
 154        memmove(skb_mac_header(skb) - MPLS_HLEN, skb_mac_header(skb),
 155                skb->mac_len);
 156        skb_reset_mac_header(skb);
 157
 158        new_mpls_lse = (__be32 *)skb_mpls_header(skb);
 159        *new_mpls_lse = mpls->mpls_lse;
 160
 161        skb_postpush_rcsum(skb, new_mpls_lse, MPLS_HLEN);
 162
 163        hdr = eth_hdr(skb);
 164        hdr->h_proto = mpls->mpls_ethertype;
 165
 166        if (!skb->inner_protocol)
 167                skb_set_inner_protocol(skb, skb->protocol);
 168        skb->protocol = mpls->mpls_ethertype;
 169
 170        invalidate_flow_key(key);
 171        return 0;
 172}
 173
 174static int pop_mpls(struct sk_buff *skb, struct sw_flow_key *key,
 175                    const __be16 ethertype)
 176{
 177        struct ethhdr *hdr;
 178        int err;
 179
 180        err = skb_ensure_writable(skb, skb->mac_len + MPLS_HLEN);
 181        if (unlikely(err))
 182                return err;
 183
 184        skb_postpull_rcsum(skb, skb_mpls_header(skb), MPLS_HLEN);
 185
 186        memmove(skb_mac_header(skb) + MPLS_HLEN, skb_mac_header(skb),
 187                skb->mac_len);
 188
 189        __skb_pull(skb, MPLS_HLEN);
 190        skb_reset_mac_header(skb);
 191
 192        /* skb_mpls_header() is used to locate the ethertype
 193         * field correctly in the presence of VLAN tags.
 194         */
 195        hdr = (struct ethhdr *)(skb_mpls_header(skb) - ETH_HLEN);
 196        hdr->h_proto = ethertype;
 197        if (eth_p_mpls(skb->protocol))
 198                skb->protocol = ethertype;
 199
 200        invalidate_flow_key(key);
 201        return 0;
 202}
 203
 204static int set_mpls(struct sk_buff *skb, struct sw_flow_key *flow_key,
 205                    const __be32 *mpls_lse, const __be32 *mask)
 206{
 207        __be32 *stack;
 208        __be32 lse;
 209        int err;
 210
 211        err = skb_ensure_writable(skb, skb->mac_len + MPLS_HLEN);
 212        if (unlikely(err))
 213                return err;
 214
 215        stack = (__be32 *)skb_mpls_header(skb);
 216        lse = OVS_MASKED(*stack, *mpls_lse, *mask);
 217        if (skb->ip_summed == CHECKSUM_COMPLETE) {
 218                __be32 diff[] = { ~(*stack), lse };
 219
 220                skb->csum = ~csum_partial((char *)diff, sizeof(diff),
 221                                          ~skb->csum);
 222        }
 223
 224        *stack = lse;
 225        flow_key->mpls.top_lse = lse;
 226        return 0;
 227}
 228
 229static int pop_vlan(struct sk_buff *skb, struct sw_flow_key *key)
 230{
 231        int err;
 232
 233        err = skb_vlan_pop(skb);
 234        if (skb_vlan_tag_present(skb))
 235                invalidate_flow_key(key);
 236        else
 237                key->eth.tci = 0;
 238        return err;
 239}
 240
 241static int push_vlan(struct sk_buff *skb, struct sw_flow_key *key,
 242                     const struct ovs_action_push_vlan *vlan)
 243{
 244        if (skb_vlan_tag_present(skb))
 245                invalidate_flow_key(key);
 246        else
 247                key->eth.tci = vlan->vlan_tci;
 248        return skb_vlan_push(skb, vlan->vlan_tpid,
 249                             ntohs(vlan->vlan_tci) & ~VLAN_TAG_PRESENT);
 250}
 251
 252/* 'src' is already properly masked. */
 253static void ether_addr_copy_masked(u8 *dst_, const u8 *src_, const u8 *mask_)
 254{
 255        u16 *dst = (u16 *)dst_;
 256        const u16 *src = (const u16 *)src_;
 257        const u16 *mask = (const u16 *)mask_;
 258
 259        OVS_SET_MASKED(dst[0], src[0], mask[0]);
 260        OVS_SET_MASKED(dst[1], src[1], mask[1]);
 261        OVS_SET_MASKED(dst[2], src[2], mask[2]);
 262}
 263
 264static int set_eth_addr(struct sk_buff *skb, struct sw_flow_key *flow_key,
 265                        const struct ovs_key_ethernet *key,
 266                        const struct ovs_key_ethernet *mask)
 267{
 268        int err;
 269
 270        err = skb_ensure_writable(skb, ETH_HLEN);
 271        if (unlikely(err))
 272                return err;
 273
 274        skb_postpull_rcsum(skb, eth_hdr(skb), ETH_ALEN * 2);
 275
 276        ether_addr_copy_masked(eth_hdr(skb)->h_source, key->eth_src,
 277                               mask->eth_src);
 278        ether_addr_copy_masked(eth_hdr(skb)->h_dest, key->eth_dst,
 279                               mask->eth_dst);
 280
 281        skb_postpush_rcsum(skb, eth_hdr(skb), ETH_ALEN * 2);
 282
 283        ether_addr_copy(flow_key->eth.src, eth_hdr(skb)->h_source);
 284        ether_addr_copy(flow_key->eth.dst, eth_hdr(skb)->h_dest);
 285        return 0;
 286}
 287
 288static void update_ip_l4_checksum(struct sk_buff *skb, struct iphdr *nh,
 289                                  __be32 addr, __be32 new_addr)
 290{
 291        int transport_len = skb->len - skb_transport_offset(skb);
 292
 293        if (nh->frag_off & htons(IP_OFFSET))
 294                return;
 295
 296        if (nh->protocol == IPPROTO_TCP) {
 297                if (likely(transport_len >= sizeof(struct tcphdr)))
 298                        inet_proto_csum_replace4(&tcp_hdr(skb)->check, skb,
 299                                                 addr, new_addr, true);
 300        } else if (nh->protocol == IPPROTO_UDP) {
 301                if (likely(transport_len >= sizeof(struct udphdr))) {
 302                        struct udphdr *uh = udp_hdr(skb);
 303
 304                        if (uh->check || skb->ip_summed == CHECKSUM_PARTIAL) {
 305                                inet_proto_csum_replace4(&uh->check, skb,
 306                                                         addr, new_addr, true);
 307                                if (!uh->check)
 308                                        uh->check = CSUM_MANGLED_0;
 309                        }
 310                }
 311        }
 312}
 313
 314static void set_ip_addr(struct sk_buff *skb, struct iphdr *nh,
 315                        __be32 *addr, __be32 new_addr)
 316{
 317        update_ip_l4_checksum(skb, nh, *addr, new_addr);
 318        csum_replace4(&nh->check, *addr, new_addr);
 319        skb_clear_hash(skb);
 320        *addr = new_addr;
 321}
 322
 323static void update_ipv6_checksum(struct sk_buff *skb, u8 l4_proto,
 324                                 __be32 addr[4], const __be32 new_addr[4])
 325{
 326        int transport_len = skb->len - skb_transport_offset(skb);
 327
 328        if (l4_proto == NEXTHDR_TCP) {
 329                if (likely(transport_len >= sizeof(struct tcphdr)))
 330                        inet_proto_csum_replace16(&tcp_hdr(skb)->check, skb,
 331                                                  addr, new_addr, true);
 332        } else if (l4_proto == NEXTHDR_UDP) {
 333                if (likely(transport_len >= sizeof(struct udphdr))) {
 334                        struct udphdr *uh = udp_hdr(skb);
 335
 336                        if (uh->check || skb->ip_summed == CHECKSUM_PARTIAL) {
 337                                inet_proto_csum_replace16(&uh->check, skb,
 338                                                          addr, new_addr, true);
 339                                if (!uh->check)
 340                                        uh->check = CSUM_MANGLED_0;
 341                        }
 342                }
 343        } else if (l4_proto == NEXTHDR_ICMP) {
 344                if (likely(transport_len >= sizeof(struct icmp6hdr)))
 345                        inet_proto_csum_replace16(&icmp6_hdr(skb)->icmp6_cksum,
 346                                                  skb, addr, new_addr, true);
 347        }
 348}
 349
 350static void mask_ipv6_addr(const __be32 old[4], const __be32 addr[4],
 351                           const __be32 mask[4], __be32 masked[4])
 352{
 353        masked[0] = OVS_MASKED(old[0], addr[0], mask[0]);
 354        masked[1] = OVS_MASKED(old[1], addr[1], mask[1]);
 355        masked[2] = OVS_MASKED(old[2], addr[2], mask[2]);
 356        masked[3] = OVS_MASKED(old[3], addr[3], mask[3]);
 357}
 358
 359static void set_ipv6_addr(struct sk_buff *skb, u8 l4_proto,
 360                          __be32 addr[4], const __be32 new_addr[4],
 361                          bool recalculate_csum)
 362{
 363        if (recalculate_csum)
 364                update_ipv6_checksum(skb, l4_proto, addr, new_addr);
 365
 366        skb_clear_hash(skb);
 367        memcpy(addr, new_addr, sizeof(__be32[4]));
 368}
 369
 370static void set_ipv6_fl(struct ipv6hdr *nh, u32 fl, u32 mask)
 371{
 372        /* Bits 21-24 are always unmasked, so this retains their values. */
 373        OVS_SET_MASKED(nh->flow_lbl[0], (u8)(fl >> 16), (u8)(mask >> 16));
 374        OVS_SET_MASKED(nh->flow_lbl[1], (u8)(fl >> 8), (u8)(mask >> 8));
 375        OVS_SET_MASKED(nh->flow_lbl[2], (u8)fl, (u8)mask);
 376}
 377
 378static void set_ip_ttl(struct sk_buff *skb, struct iphdr *nh, u8 new_ttl,
 379                       u8 mask)
 380{
 381        new_ttl = OVS_MASKED(nh->ttl, new_ttl, mask);
 382
 383        csum_replace2(&nh->check, htons(nh->ttl << 8), htons(new_ttl << 8));
 384        nh->ttl = new_ttl;
 385}
 386
 387static int set_ipv4(struct sk_buff *skb, struct sw_flow_key *flow_key,
 388                    const struct ovs_key_ipv4 *key,
 389                    const struct ovs_key_ipv4 *mask)
 390{
 391        struct iphdr *nh;
 392        __be32 new_addr;
 393        int err;
 394
 395        err = skb_ensure_writable(skb, skb_network_offset(skb) +
 396                                  sizeof(struct iphdr));
 397        if (unlikely(err))
 398                return err;
 399
 400        nh = ip_hdr(skb);
 401
 402        /* Setting an IP addresses is typically only a side effect of
 403         * matching on them in the current userspace implementation, so it
 404         * makes sense to check if the value actually changed.
 405         */
 406        if (mask->ipv4_src) {
 407                new_addr = OVS_MASKED(nh->saddr, key->ipv4_src, mask->ipv4_src);
 408
 409                if (unlikely(new_addr != nh->saddr)) {
 410                        set_ip_addr(skb, nh, &nh->saddr, new_addr);
 411                        flow_key->ipv4.addr.src = new_addr;
 412                }
 413        }
 414        if (mask->ipv4_dst) {
 415                new_addr = OVS_MASKED(nh->daddr, key->ipv4_dst, mask->ipv4_dst);
 416
 417                if (unlikely(new_addr != nh->daddr)) {
 418                        set_ip_addr(skb, nh, &nh->daddr, new_addr);
 419                        flow_key->ipv4.addr.dst = new_addr;
 420                }
 421        }
 422        if (mask->ipv4_tos) {
 423                ipv4_change_dsfield(nh, ~mask->ipv4_tos, key->ipv4_tos);
 424                flow_key->ip.tos = nh->tos;
 425        }
 426        if (mask->ipv4_ttl) {
 427                set_ip_ttl(skb, nh, key->ipv4_ttl, mask->ipv4_ttl);
 428                flow_key->ip.ttl = nh->ttl;
 429        }
 430
 431        return 0;
 432}
 433
 434static bool is_ipv6_mask_nonzero(const __be32 addr[4])
 435{
 436        return !!(addr[0] | addr[1] | addr[2] | addr[3]);
 437}
 438
 439static int set_ipv6(struct sk_buff *skb, struct sw_flow_key *flow_key,
 440                    const struct ovs_key_ipv6 *key,
 441                    const struct ovs_key_ipv6 *mask)
 442{
 443        struct ipv6hdr *nh;
 444        int err;
 445
 446        err = skb_ensure_writable(skb, skb_network_offset(skb) +
 447                                  sizeof(struct ipv6hdr));
 448        if (unlikely(err))
 449                return err;
 450
 451        nh = ipv6_hdr(skb);
 452
 453        /* Setting an IP addresses is typically only a side effect of
 454         * matching on them in the current userspace implementation, so it
 455         * makes sense to check if the value actually changed.
 456         */
 457        if (is_ipv6_mask_nonzero(mask->ipv6_src)) {
 458                __be32 *saddr = (__be32 *)&nh->saddr;
 459                __be32 masked[4];
 460
 461                mask_ipv6_addr(saddr, key->ipv6_src, mask->ipv6_src, masked);
 462
 463                if (unlikely(memcmp(saddr, masked, sizeof(masked)))) {
 464                        set_ipv6_addr(skb, flow_key->ip.proto, saddr, masked,
 465                                      true);
 466                        memcpy(&flow_key->ipv6.addr.src, masked,
 467                               sizeof(flow_key->ipv6.addr.src));
 468                }
 469        }
 470        if (is_ipv6_mask_nonzero(mask->ipv6_dst)) {
 471                unsigned int offset = 0;
 472                int flags = IP6_FH_F_SKIP_RH;
 473                bool recalc_csum = true;
 474                __be32 *daddr = (__be32 *)&nh->daddr;
 475                __be32 masked[4];
 476
 477                mask_ipv6_addr(daddr, key->ipv6_dst, mask->ipv6_dst, masked);
 478
 479                if (unlikely(memcmp(daddr, masked, sizeof(masked)))) {
 480                        if (ipv6_ext_hdr(nh->nexthdr))
 481                                recalc_csum = (ipv6_find_hdr(skb, &offset,
 482                                                             NEXTHDR_ROUTING,
 483                                                             NULL, &flags)
 484                                               != NEXTHDR_ROUTING);
 485
 486                        set_ipv6_addr(skb, flow_key->ip.proto, daddr, masked,
 487                                      recalc_csum);
 488                        memcpy(&flow_key->ipv6.addr.dst, masked,
 489                               sizeof(flow_key->ipv6.addr.dst));
 490                }
 491        }
 492        if (mask->ipv6_tclass) {
 493                ipv6_change_dsfield(nh, ~mask->ipv6_tclass, key->ipv6_tclass);
 494                flow_key->ip.tos = ipv6_get_dsfield(nh);
 495        }
 496        if (mask->ipv6_label) {
 497                set_ipv6_fl(nh, ntohl(key->ipv6_label),
 498                            ntohl(mask->ipv6_label));
 499                flow_key->ipv6.label =
 500                    *(__be32 *)nh & htonl(IPV6_FLOWINFO_FLOWLABEL);
 501        }
 502        if (mask->ipv6_hlimit) {
 503                OVS_SET_MASKED(nh->hop_limit, key->ipv6_hlimit,
 504                               mask->ipv6_hlimit);
 505                flow_key->ip.ttl = nh->hop_limit;
 506        }
 507        return 0;
 508}
 509
 510/* Must follow skb_ensure_writable() since that can move the skb data. */
 511static void set_tp_port(struct sk_buff *skb, __be16 *port,
 512                        __be16 new_port, __sum16 *check)
 513{
 514        inet_proto_csum_replace2(check, skb, *port, new_port, false);
 515        *port = new_port;
 516}
 517
 518static int set_udp(struct sk_buff *skb, struct sw_flow_key *flow_key,
 519                   const struct ovs_key_udp *key,
 520                   const struct ovs_key_udp *mask)
 521{
 522        struct udphdr *uh;
 523        __be16 src, dst;
 524        int err;
 525
 526        err = skb_ensure_writable(skb, skb_transport_offset(skb) +
 527                                  sizeof(struct udphdr));
 528        if (unlikely(err))
 529                return err;
 530
 531        uh = udp_hdr(skb);
 532        /* Either of the masks is non-zero, so do not bother checking them. */
 533        src = OVS_MASKED(uh->source, key->udp_src, mask->udp_src);
 534        dst = OVS_MASKED(uh->dest, key->udp_dst, mask->udp_dst);
 535
 536        if (uh->check && skb->ip_summed != CHECKSUM_PARTIAL) {
 537                if (likely(src != uh->source)) {
 538                        set_tp_port(skb, &uh->source, src, &uh->check);
 539                        flow_key->tp.src = src;
 540                }
 541                if (likely(dst != uh->dest)) {
 542                        set_tp_port(skb, &uh->dest, dst, &uh->check);
 543                        flow_key->tp.dst = dst;
 544                }
 545
 546                if (unlikely(!uh->check))
 547                        uh->check = CSUM_MANGLED_0;
 548        } else {
 549                uh->source = src;
 550                uh->dest = dst;
 551                flow_key->tp.src = src;
 552                flow_key->tp.dst = dst;
 553        }
 554
 555        skb_clear_hash(skb);
 556
 557        return 0;
 558}
 559
 560static int set_tcp(struct sk_buff *skb, struct sw_flow_key *flow_key,
 561                   const struct ovs_key_tcp *key,
 562                   const struct ovs_key_tcp *mask)
 563{
 564        struct tcphdr *th;
 565        __be16 src, dst;
 566        int err;
 567
 568        err = skb_ensure_writable(skb, skb_transport_offset(skb) +
 569                                  sizeof(struct tcphdr));
 570        if (unlikely(err))
 571                return err;
 572
 573        th = tcp_hdr(skb);
 574        src = OVS_MASKED(th->source, key->tcp_src, mask->tcp_src);
 575        if (likely(src != th->source)) {
 576                set_tp_port(skb, &th->source, src, &th->check);
 577                flow_key->tp.src = src;
 578        }
 579        dst = OVS_MASKED(th->dest, key->tcp_dst, mask->tcp_dst);
 580        if (likely(dst != th->dest)) {
 581                set_tp_port(skb, &th->dest, dst, &th->check);
 582                flow_key->tp.dst = dst;
 583        }
 584        skb_clear_hash(skb);
 585
 586        return 0;
 587}
 588
 589static int set_sctp(struct sk_buff *skb, struct sw_flow_key *flow_key,
 590                    const struct ovs_key_sctp *key,
 591                    const struct ovs_key_sctp *mask)
 592{
 593        unsigned int sctphoff = skb_transport_offset(skb);
 594        struct sctphdr *sh;
 595        __le32 old_correct_csum, new_csum, old_csum;
 596        int err;
 597
 598        err = skb_ensure_writable(skb, sctphoff + sizeof(struct sctphdr));
 599        if (unlikely(err))
 600                return err;
 601
 602        sh = sctp_hdr(skb);
 603        old_csum = sh->checksum;
 604        old_correct_csum = sctp_compute_cksum(skb, sctphoff);
 605
 606        sh->source = OVS_MASKED(sh->source, key->sctp_src, mask->sctp_src);
 607        sh->dest = OVS_MASKED(sh->dest, key->sctp_dst, mask->sctp_dst);
 608
 609        new_csum = sctp_compute_cksum(skb, sctphoff);
 610
 611        /* Carry any checksum errors through. */
 612        sh->checksum = old_csum ^ old_correct_csum ^ new_csum;
 613
 614        skb_clear_hash(skb);
 615        flow_key->tp.src = sh->source;
 616        flow_key->tp.dst = sh->dest;
 617
 618        return 0;
 619}
 620
 621static int ovs_vport_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 622{
 623        struct ovs_frag_data *data = this_cpu_ptr(&ovs_frag_data_storage);
 624        struct vport *vport = data->vport;
 625
 626        if (skb_cow_head(skb, data->l2_len) < 0) {
 627                kfree_skb(skb);
 628                return -ENOMEM;
 629        }
 630
 631        __skb_dst_copy(skb, data->dst);
 632        *OVS_CB(skb) = data->cb;
 633        skb->inner_protocol = data->inner_protocol;
 634        skb->vlan_tci = data->vlan_tci;
 635        skb->vlan_proto = data->vlan_proto;
 636
 637        /* Reconstruct the MAC header.  */
 638        skb_push(skb, data->l2_len);
 639        memcpy(skb->data, &data->l2_data, data->l2_len);
 640        skb_postpush_rcsum(skb, skb->data, data->l2_len);
 641        skb_reset_mac_header(skb);
 642
 643        ovs_vport_send(vport, skb);
 644        return 0;
 645}
 646
 647static unsigned int
 648ovs_dst_get_mtu(const struct dst_entry *dst)
 649{
 650        return dst->dev->mtu;
 651}
 652
 653static struct dst_ops ovs_dst_ops = {
 654        .family = AF_UNSPEC,
 655        .mtu = ovs_dst_get_mtu,
 656};
 657
 658/* prepare_frag() is called once per (larger-than-MTU) frame; its inverse is
 659 * ovs_vport_output(), which is called once per fragmented packet.
 660 */
 661static void prepare_frag(struct vport *vport, struct sk_buff *skb)
 662{
 663        unsigned int hlen = skb_network_offset(skb);
 664        struct ovs_frag_data *data;
 665
 666        data = this_cpu_ptr(&ovs_frag_data_storage);
 667        data->dst = skb->_skb_refdst;
 668        data->vport = vport;
 669        data->cb = *OVS_CB(skb);
 670        data->inner_protocol = skb->inner_protocol;
 671        data->vlan_tci = skb->vlan_tci;
 672        data->vlan_proto = skb->vlan_proto;
 673        data->l2_len = hlen;
 674        memcpy(&data->l2_data, skb->data, hlen);
 675
 676        memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
 677        skb_pull(skb, hlen);
 678}
 679
 680static void ovs_fragment(struct net *net, struct vport *vport,
 681                         struct sk_buff *skb, u16 mru, __be16 ethertype)
 682{
 683        if (skb_network_offset(skb) > MAX_L2_LEN) {
 684                OVS_NLERR(1, "L2 header too long to fragment");
 685                goto err;
 686        }
 687
 688        if (ethertype == htons(ETH_P_IP)) {
 689                struct dst_entry ovs_dst;
 690                unsigned long orig_dst;
 691
 692                prepare_frag(vport, skb);
 693                dst_init(&ovs_dst, &ovs_dst_ops, NULL, 1,
 694                         DST_OBSOLETE_NONE, DST_NOCOUNT);
 695                ovs_dst.dev = vport->dev;
 696
 697                orig_dst = skb->_skb_refdst;
 698                skb_dst_set_noref(skb, &ovs_dst);
 699                IPCB(skb)->frag_max_size = mru;
 700
 701                ip_do_fragment(net, skb->sk, skb, ovs_vport_output);
 702                refdst_drop(orig_dst);
 703        } else if (ethertype == htons(ETH_P_IPV6)) {
 704                const struct nf_ipv6_ops *v6ops = nf_get_ipv6_ops();
 705                unsigned long orig_dst;
 706                struct rt6_info ovs_rt;
 707
 708                if (!v6ops) {
 709                        goto err;
 710                }
 711
 712                prepare_frag(vport, skb);
 713                memset(&ovs_rt, 0, sizeof(ovs_rt));
 714                dst_init(&ovs_rt.dst, &ovs_dst_ops, NULL, 1,
 715                         DST_OBSOLETE_NONE, DST_NOCOUNT);
 716                ovs_rt.dst.dev = vport->dev;
 717
 718                orig_dst = skb->_skb_refdst;
 719                skb_dst_set_noref(skb, &ovs_rt.dst);
 720                IP6CB(skb)->frag_max_size = mru;
 721
 722                v6ops->fragment(net, skb->sk, skb, ovs_vport_output);
 723                refdst_drop(orig_dst);
 724        } else {
 725                WARN_ONCE(1, "Failed fragment ->%s: eth=%04x, MRU=%d, MTU=%d.",
 726                          ovs_vport_name(vport), ntohs(ethertype), mru,
 727                          vport->dev->mtu);
 728                goto err;
 729        }
 730
 731        return;
 732err:
 733        kfree_skb(skb);
 734}
 735
 736static void do_output(struct datapath *dp, struct sk_buff *skb, int out_port,
 737                      struct sw_flow_key *key)
 738{
 739        struct vport *vport = ovs_vport_rcu(dp, out_port);
 740
 741        if (likely(vport)) {
 742                u16 mru = OVS_CB(skb)->mru;
 743
 744                if (likely(!mru || (skb->len <= mru + ETH_HLEN))) {
 745                        ovs_vport_send(vport, skb);
 746                } else if (mru <= vport->dev->mtu) {
 747                        struct net *net = read_pnet(&dp->net);
 748                        __be16 ethertype = key->eth.type;
 749
 750                        if (!is_flow_key_valid(key)) {
 751                                if (eth_p_mpls(skb->protocol))
 752                                        ethertype = skb->inner_protocol;
 753                                else
 754                                        ethertype = vlan_get_protocol(skb);
 755                        }
 756
 757                        ovs_fragment(net, vport, skb, mru, ethertype);
 758                } else {
 759                        kfree_skb(skb);
 760                }
 761        } else {
 762                kfree_skb(skb);
 763        }
 764}
 765
 766static int output_userspace(struct datapath *dp, struct sk_buff *skb,
 767                            struct sw_flow_key *key, const struct nlattr *attr,
 768                            const struct nlattr *actions, int actions_len)
 769{
 770        struct dp_upcall_info upcall;
 771        const struct nlattr *a;
 772        int rem;
 773
 774        memset(&upcall, 0, sizeof(upcall));
 775        upcall.cmd = OVS_PACKET_CMD_ACTION;
 776        upcall.mru = OVS_CB(skb)->mru;
 777
 778        for (a = nla_data(attr), rem = nla_len(attr); rem > 0;
 779                 a = nla_next(a, &rem)) {
 780                switch (nla_type(a)) {
 781                case OVS_USERSPACE_ATTR_USERDATA:
 782                        upcall.userdata = a;
 783                        break;
 784
 785                case OVS_USERSPACE_ATTR_PID:
 786                        upcall.portid = nla_get_u32(a);
 787                        break;
 788
 789                case OVS_USERSPACE_ATTR_EGRESS_TUN_PORT: {
 790                        /* Get out tunnel info. */
 791                        struct vport *vport;
 792
 793                        vport = ovs_vport_rcu(dp, nla_get_u32(a));
 794                        if (vport) {
 795                                int err;
 796
 797                                err = dev_fill_metadata_dst(vport->dev, skb);
 798                                if (!err)
 799                                        upcall.egress_tun_info = skb_tunnel_info(skb);
 800                        }
 801
 802                        break;
 803                }
 804
 805                case OVS_USERSPACE_ATTR_ACTIONS: {
 806                        /* Include actions. */
 807                        upcall.actions = actions;
 808                        upcall.actions_len = actions_len;
 809                        break;
 810                }
 811
 812                } /* End of switch. */
 813        }
 814
 815        return ovs_dp_upcall(dp, skb, key, &upcall);
 816}
 817
 818static int sample(struct datapath *dp, struct sk_buff *skb,
 819                  struct sw_flow_key *key, const struct nlattr *attr,
 820                  const struct nlattr *actions, int actions_len)
 821{
 822        const struct nlattr *acts_list = NULL;
 823        const struct nlattr *a;
 824        int rem;
 825
 826        for (a = nla_data(attr), rem = nla_len(attr); rem > 0;
 827                 a = nla_next(a, &rem)) {
 828                u32 probability;
 829
 830                switch (nla_type(a)) {
 831                case OVS_SAMPLE_ATTR_PROBABILITY:
 832                        probability = nla_get_u32(a);
 833                        if (!probability || prandom_u32() > probability)
 834                                return 0;
 835                        break;
 836
 837                case OVS_SAMPLE_ATTR_ACTIONS:
 838                        acts_list = a;
 839                        break;
 840                }
 841        }
 842
 843        rem = nla_len(acts_list);
 844        a = nla_data(acts_list);
 845
 846        /* Actions list is empty, do nothing */
 847        if (unlikely(!rem))
 848                return 0;
 849
 850        /* The only known usage of sample action is having a single user-space
 851         * action. Treat this usage as a special case.
 852         * The output_userspace() should clone the skb to be sent to the
 853         * user space. This skb will be consumed by its caller.
 854         */
 855        if (likely(nla_type(a) == OVS_ACTION_ATTR_USERSPACE &&
 856                   nla_is_last(a, rem)))
 857                return output_userspace(dp, skb, key, a, actions, actions_len);
 858
 859        skb = skb_clone(skb, GFP_ATOMIC);
 860        if (!skb)
 861                /* Skip the sample action when out of memory. */
 862                return 0;
 863
 864        if (!add_deferred_actions(skb, key, a)) {
 865                if (net_ratelimit())
 866                        pr_warn("%s: deferred actions limit reached, dropping sample action\n",
 867                                ovs_dp_name(dp));
 868
 869                kfree_skb(skb);
 870        }
 871        return 0;
 872}
 873
 874static void execute_hash(struct sk_buff *skb, struct sw_flow_key *key,
 875                         const struct nlattr *attr)
 876{
 877        struct ovs_action_hash *hash_act = nla_data(attr);
 878        u32 hash = 0;
 879
 880        /* OVS_HASH_ALG_L4 is the only possible hash algorithm.  */
 881        hash = skb_get_hash(skb);
 882        hash = jhash_1word(hash, hash_act->hash_basis);
 883        if (!hash)
 884                hash = 0x1;
 885
 886        key->ovs_flow_hash = hash;
 887}
 888
 889static int execute_set_action(struct sk_buff *skb,
 890                              struct sw_flow_key *flow_key,
 891                              const struct nlattr *a)
 892{
 893        /* Only tunnel set execution is supported without a mask. */
 894        if (nla_type(a) == OVS_KEY_ATTR_TUNNEL_INFO) {
 895                struct ovs_tunnel_info *tun = nla_data(a);
 896
 897                skb_dst_drop(skb);
 898                dst_hold((struct dst_entry *)tun->tun_dst);
 899                skb_dst_set(skb, (struct dst_entry *)tun->tun_dst);
 900                return 0;
 901        }
 902
 903        return -EINVAL;
 904}
 905
 906/* Mask is at the midpoint of the data. */
 907#define get_mask(a, type) ((const type)nla_data(a) + 1)
 908
 909static int execute_masked_set_action(struct sk_buff *skb,
 910                                     struct sw_flow_key *flow_key,
 911                                     const struct nlattr *a)
 912{
 913        int err = 0;
 914
 915        switch (nla_type(a)) {
 916        case OVS_KEY_ATTR_PRIORITY:
 917                OVS_SET_MASKED(skb->priority, nla_get_u32(a),
 918                               *get_mask(a, u32 *));
 919                flow_key->phy.priority = skb->priority;
 920                break;
 921
 922        case OVS_KEY_ATTR_SKB_MARK:
 923                OVS_SET_MASKED(skb->mark, nla_get_u32(a), *get_mask(a, u32 *));
 924                flow_key->phy.skb_mark = skb->mark;
 925                break;
 926
 927        case OVS_KEY_ATTR_TUNNEL_INFO:
 928                /* Masked data not supported for tunnel. */
 929                err = -EINVAL;
 930                break;
 931
 932        case OVS_KEY_ATTR_ETHERNET:
 933                err = set_eth_addr(skb, flow_key, nla_data(a),
 934                                   get_mask(a, struct ovs_key_ethernet *));
 935                break;
 936
 937        case OVS_KEY_ATTR_IPV4:
 938                err = set_ipv4(skb, flow_key, nla_data(a),
 939                               get_mask(a, struct ovs_key_ipv4 *));
 940                break;
 941
 942        case OVS_KEY_ATTR_IPV6:
 943                err = set_ipv6(skb, flow_key, nla_data(a),
 944                               get_mask(a, struct ovs_key_ipv6 *));
 945                break;
 946
 947        case OVS_KEY_ATTR_TCP:
 948                err = set_tcp(skb, flow_key, nla_data(a),
 949                              get_mask(a, struct ovs_key_tcp *));
 950                break;
 951
 952        case OVS_KEY_ATTR_UDP:
 953                err = set_udp(skb, flow_key, nla_data(a),
 954                              get_mask(a, struct ovs_key_udp *));
 955                break;
 956
 957        case OVS_KEY_ATTR_SCTP:
 958                err = set_sctp(skb, flow_key, nla_data(a),
 959                               get_mask(a, struct ovs_key_sctp *));
 960                break;
 961
 962        case OVS_KEY_ATTR_MPLS:
 963                err = set_mpls(skb, flow_key, nla_data(a), get_mask(a,
 964                                                                    __be32 *));
 965                break;
 966
 967        case OVS_KEY_ATTR_CT_STATE:
 968        case OVS_KEY_ATTR_CT_ZONE:
 969        case OVS_KEY_ATTR_CT_MARK:
 970        case OVS_KEY_ATTR_CT_LABELS:
 971                err = -EINVAL;
 972                break;
 973        }
 974
 975        return err;
 976}
 977
 978static int execute_recirc(struct datapath *dp, struct sk_buff *skb,
 979                          struct sw_flow_key *key,
 980                          const struct nlattr *a, int rem)
 981{
 982        struct deferred_action *da;
 983
 984        if (!is_flow_key_valid(key)) {
 985                int err;
 986
 987                err = ovs_flow_key_update(skb, key);
 988                if (err)
 989                        return err;
 990        }
 991        BUG_ON(!is_flow_key_valid(key));
 992
 993        if (!nla_is_last(a, rem)) {
 994                /* Recirc action is the not the last action
 995                 * of the action list, need to clone the skb.
 996                 */
 997                skb = skb_clone(skb, GFP_ATOMIC);
 998
 999                /* Skip the recirc action when out of memory, but
1000                 * continue on with the rest of the action list.
1001                 */
1002                if (!skb)
1003                        return 0;
1004        }
1005
1006        da = add_deferred_actions(skb, key, NULL);
1007        if (da) {
1008                da->pkt_key.recirc_id = nla_get_u32(a);
1009        } else {
1010                kfree_skb(skb);
1011
1012                if (net_ratelimit())
1013                        pr_warn("%s: deferred action limit reached, drop recirc action\n",
1014                                ovs_dp_name(dp));
1015        }
1016
1017        return 0;
1018}
1019
1020/* Execute a list of actions against 'skb'. */
1021static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
1022                              struct sw_flow_key *key,
1023                              const struct nlattr *attr, int len)
1024{
1025        /* Every output action needs a separate clone of 'skb', but the common
1026         * case is just a single output action, so that doing a clone and
1027         * then freeing the original skbuff is wasteful.  So the following code
1028         * is slightly obscure just to avoid that.
1029         */
1030        int prev_port = -1;
1031        const struct nlattr *a;
1032        int rem;
1033
1034        for (a = attr, rem = len; rem > 0;
1035             a = nla_next(a, &rem)) {
1036                int err = 0;
1037
1038                if (unlikely(prev_port != -1)) {
1039                        struct sk_buff *out_skb = skb_clone(skb, GFP_ATOMIC);
1040
1041                        if (out_skb)
1042                                do_output(dp, out_skb, prev_port, key);
1043
1044                        prev_port = -1;
1045                }
1046
1047                switch (nla_type(a)) {
1048                case OVS_ACTION_ATTR_OUTPUT:
1049                        prev_port = nla_get_u32(a);
1050                        break;
1051
1052                case OVS_ACTION_ATTR_USERSPACE:
1053                        output_userspace(dp, skb, key, a, attr, len);
1054                        break;
1055
1056                case OVS_ACTION_ATTR_HASH:
1057                        execute_hash(skb, key, a);
1058                        break;
1059
1060                case OVS_ACTION_ATTR_PUSH_MPLS:
1061                        err = push_mpls(skb, key, nla_data(a));
1062                        break;
1063
1064                case OVS_ACTION_ATTR_POP_MPLS:
1065                        err = pop_mpls(skb, key, nla_get_be16(a));
1066                        break;
1067
1068                case OVS_ACTION_ATTR_PUSH_VLAN:
1069                        err = push_vlan(skb, key, nla_data(a));
1070                        break;
1071
1072                case OVS_ACTION_ATTR_POP_VLAN:
1073                        err = pop_vlan(skb, key);
1074                        break;
1075
1076                case OVS_ACTION_ATTR_RECIRC:
1077                        err = execute_recirc(dp, skb, key, a, rem);
1078                        if (nla_is_last(a, rem)) {
1079                                /* If this is the last action, the skb has
1080                                 * been consumed or freed.
1081                                 * Return immediately.
1082                                 */
1083                                return err;
1084                        }
1085                        break;
1086
1087                case OVS_ACTION_ATTR_SET:
1088                        err = execute_set_action(skb, key, nla_data(a));
1089                        break;
1090
1091                case OVS_ACTION_ATTR_SET_MASKED:
1092                case OVS_ACTION_ATTR_SET_TO_MASKED:
1093                        err = execute_masked_set_action(skb, key, nla_data(a));
1094                        break;
1095
1096                case OVS_ACTION_ATTR_SAMPLE:
1097                        err = sample(dp, skb, key, a, attr, len);
1098                        break;
1099
1100                case OVS_ACTION_ATTR_CT:
1101                        if (!is_flow_key_valid(key)) {
1102                                err = ovs_flow_key_update(skb, key);
1103                                if (err)
1104                                        return err;
1105                        }
1106
1107                        err = ovs_ct_execute(ovs_dp_get_net(dp), skb, key,
1108                                             nla_data(a));
1109
1110                        /* Hide stolen IP fragments from user space. */
1111                        if (err)
1112                                return err == -EINPROGRESS ? 0 : err;
1113                        break;
1114                }
1115
1116                if (unlikely(err)) {
1117                        kfree_skb(skb);
1118                        return err;
1119                }
1120        }
1121
1122        if (prev_port != -1)
1123                do_output(dp, skb, prev_port, key);
1124        else
1125                consume_skb(skb);
1126
1127        return 0;
1128}
1129
1130static void process_deferred_actions(struct datapath *dp)
1131{
1132        struct action_fifo *fifo = this_cpu_ptr(action_fifos);
1133
1134        /* Do not touch the FIFO in case there is no deferred actions. */
1135        if (action_fifo_is_empty(fifo))
1136                return;
1137
1138        /* Finishing executing all deferred actions. */
1139        do {
1140                struct deferred_action *da = action_fifo_get(fifo);
1141                struct sk_buff *skb = da->skb;
1142                struct sw_flow_key *key = &da->pkt_key;
1143                const struct nlattr *actions = da->actions;
1144
1145                if (actions)
1146                        do_execute_actions(dp, skb, key, actions,
1147                                           nla_len(actions));
1148                else
1149                        ovs_dp_process_packet(skb, key);
1150        } while (!action_fifo_is_empty(fifo));
1151
1152        /* Reset FIFO for the next packet.  */
1153        action_fifo_init(fifo);
1154}
1155
1156/* Execute a list of actions against 'skb'. */
1157int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb,
1158                        const struct sw_flow_actions *acts,
1159                        struct sw_flow_key *key)
1160{
1161        static const int ovs_recursion_limit = 5;
1162        int err, level;
1163
1164        level = __this_cpu_inc_return(exec_actions_level);
1165        if (unlikely(level > ovs_recursion_limit)) {
1166                net_crit_ratelimited("ovs: recursion limit reached on datapath %s, probable configuration error\n",
1167                                     ovs_dp_name(dp));
1168                kfree_skb(skb);
1169                err = -ENETDOWN;
1170                goto out;
1171        }
1172
1173        err = do_execute_actions(dp, skb, key,
1174                                 acts->actions, acts->actions_len);
1175
1176        if (level == 1)
1177                process_deferred_actions(dp);
1178
1179out:
1180        __this_cpu_dec(exec_actions_level);
1181        return err;
1182}
1183
1184int action_fifos_init(void)
1185{
1186        action_fifos = alloc_percpu(struct action_fifo);
1187        if (!action_fifos)
1188                return -ENOMEM;
1189
1190        return 0;
1191}
1192
1193void action_fifos_exit(void)
1194{
1195        free_percpu(action_fifos);
1196}
1197