linux/net/core/flow_dissector.c
#include <linux/skbuff.h>
#include <linux/export.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/if_vlan.h>
#include <net/ip.h>
#include <net/ipv6.h>
#include <linux/igmp.h>
#include <linux/icmp.h>
#include <linux/sctp.h>
#include <linux/dccp.h>
#include <linux/if_tunnel.h>
#include <linux/if_pppox.h>
#include <linux/ppp_defs.h>
#include <net/flow_keys.h>
#include <scsi/fc/fc_fcoe.h>

/* copy saddr & daddr, possibly using 64bit load/store
 * Equivalent to:       flow->src = iph->saddr;
 *                      flow->dst = iph->daddr;
 */
static void iph_to_flow_copy_addrs(struct flow_keys *flow, const struct iphdr *iph)
{
        BUILD_BUG_ON(offsetof(typeof(*flow), dst) !=
                     offsetof(typeof(*flow), src) + sizeof(flow->src));
        memcpy(&flow->src, &iph->saddr, sizeof(flow->src) + sizeof(flow->dst));
}
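
/* Sketch of why a single 8-byte memcpy is safe here: struct iphdr also
 * lays out saddr immediately followed by daddr, so both sides of the
 * copy are contiguous pairs of __be32:
 *
 *      struct iphdr {
 *              ...
 *              __be32  saddr;
 *              __be32  daddr;  // directly follows saddr
 *      };
 *
 * The BUILD_BUG_ON() above enforces the same adjacency on the flow_keys
 * side; on 64bit arches the memcpy can then become one load/store pair.
 */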

/**
 * __skb_flow_get_ports - extract the upper layer ports and return them
 * @skb: sk_buff to extract the ports from
 * @thoff: transport header offset
 * @ip_proto: protocol for which to get port offset
 * @data: raw buffer pointer to the packet, if NULL use skb->data
 * @hlen: packet header length, if @data is NULL use skb_headlen(skb)
 *
 * The function will try to retrieve the ports at offset thoff + poff where poff
 * is the protocol port offset returned from proto_ports_offset
 */
__be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto,
                            void *data, int hlen)
{
        int poff = proto_ports_offset(ip_proto);

        if (!data) {
                data = skb->data;
                hlen = skb_headlen(skb);
        }

        if (poff >= 0) {
                __be32 *ports, _ports;

                ports = __skb_header_pointer(skb, thoff + poff,
                                             sizeof(_ports), data, hlen, &_ports);
                if (ports)
                        return *ports;
        }

        return 0;
}
EXPORT_SYMBOL(__skb_flow_get_ports);
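
/* Hypothetical usage sketch (not a caller in this file): read the port
 * pair of a linear TCP skb whose transport header has been set.  The
 * returned __be32 packs source and destination port back to back, which
 * is why struct flow_keys aliases it as port16[0]/port16[1]:
 *
 *      union {
 *              __be32 v32;
 *              __be16 v16[2];
 *      } p;
 *
 *      p.v32 = __skb_flow_get_ports(skb, skb_transport_offset(skb),
 *                                   IPPROTO_TCP, NULL, 0);
 *      pr_debug("sport=%u dport=%u\n", ntohs(p.v16[0]), ntohs(p.v16[1]));
 */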

/**
 * __skb_flow_dissect - extract the flow_keys struct and return it
 * @skb: sk_buff to extract the flow from, can be NULL if the remaining
 *       arguments are specified
 * @data: raw buffer pointer to the packet, if NULL use skb->data
 * @proto: protocol for which to get the flow, if @data is NULL use skb->protocol
 * @nhoff: network header offset, if @data is NULL use skb_network_offset(skb)
 * @hlen: packet header length, if @data is NULL use skb_headlen(skb)
 *
 * The function will try to retrieve the struct flow_keys from either the skbuff
 * or a raw buffer specified by the remaining parameters
 */
bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow,
                        void *data, __be16 proto, int nhoff, int hlen)
{
        u8 ip_proto;

        if (!data) {
                data = skb->data;
                proto = skb->protocol;
                nhoff = skb_network_offset(skb);
                hlen = skb_headlen(skb);
        }

        memset(flow, 0, sizeof(*flow));

again:
        switch (proto) {
        case htons(ETH_P_IP): {
                const struct iphdr *iph;
                struct iphdr _iph;
ip:
                iph = __skb_header_pointer(skb, nhoff, sizeof(_iph), data, hlen, &_iph);
                if (!iph || iph->ihl < 5)
                        return false;
                nhoff += iph->ihl * 4;

                ip_proto = iph->protocol;
                /* All fragments of a datagram must hash identically, and
                 * only the first fragment carries the transport header,
                 * so dissect any fragment as an L3-only flow.
                 */
                if (ip_is_fragment(iph))
                        ip_proto = 0;

                /* skip the address processing if skb is NULL.  The assumption
                 * here is that if there is no skb we are not looking for flow
                 * info but lengths and protocols.
                 */
                if (!skb)
                        break;

                iph_to_flow_copy_addrs(flow, iph);
                break;
        }
        case htons(ETH_P_IPV6): {
                const struct ipv6hdr *iph;
                struct ipv6hdr _iph;
                __be32 flow_label;

ipv6:
                iph = __skb_header_pointer(skb, nhoff, sizeof(_iph), data, hlen, &_iph);
                if (!iph)
                        return false;

                ip_proto = iph->nexthdr;
                nhoff += sizeof(struct ipv6hdr);

                /* see comment above in IPv4 section */
                if (!skb)
                        break;

                flow->src = (__force __be32)ipv6_addr_hash(&iph->saddr);
                flow->dst = (__force __be32)ipv6_addr_hash(&iph->daddr);

                flow_label = ip6_flowlabel(iph);
                if (flow_label) {
                        /* Awesome, IPv6 packet has a flow label so we can
                         * use that to represent the ports without any
                         * further dissection.
                         */
                        flow->n_proto = proto;
                        flow->ip_proto = ip_proto;
                        flow->ports = flow_label;
                        flow->thoff = (u16)nhoff;

                        return true;
                }

                break;
        }
        case htons(ETH_P_8021AD):
        case htons(ETH_P_8021Q): {
                const struct vlan_hdr *vlan;
                struct vlan_hdr _vlan;

                vlan = __skb_header_pointer(skb, nhoff, sizeof(_vlan), data, hlen, &_vlan);
                if (!vlan)
                        return false;

                proto = vlan->h_vlan_encapsulated_proto;
                nhoff += sizeof(*vlan);
                goto again;
        }
        case htons(ETH_P_PPP_SES): {
                struct {
                        struct pppoe_hdr hdr;
                        __be16 proto;
                } *hdr, _hdr;
                hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr);
                if (!hdr)
                        return false;
                proto = hdr->proto;
                nhoff += PPPOE_SES_HLEN;
                switch (proto) {
                case htons(PPP_IP):
                        goto ip;
                case htons(PPP_IPV6):
                        goto ipv6;
                default:
                        return false;
                }
        }
        case htons(ETH_P_TIPC): {
                struct {
                        __be32 pre[3];
                        __be32 srcnode;
                } *hdr, _hdr;
                hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr);
                if (!hdr)
                        return false;
                flow->src = hdr->srcnode;
                flow->dst = 0;
                flow->n_proto = proto;
                flow->thoff = (u16)nhoff;
                return true;
        }
        case htons(ETH_P_FCOE):
                flow->thoff = (u16)(nhoff + FCOE_HEADER_LEN);
                /* fall through */
        default:
                return false;
        }

        switch (ip_proto) {
        case IPPROTO_GRE: {
                struct gre_hdr {
                        __be16 flags;
                        __be16 proto;
                } *hdr, _hdr;

                hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr);
                if (!hdr)
                        return false;
                /*
                 * Only look inside GRE if version zero and no
                 * routing
                 */
                if (!(hdr->flags & (GRE_VERSION|GRE_ROUTING))) {
                        proto = hdr->proto;
                        nhoff += 4;     /* mandatory flags + protocol words */
                        if (hdr->flags & GRE_CSUM)
                                nhoff += 4;
                        if (hdr->flags & GRE_KEY)
                                nhoff += 4;
                        if (hdr->flags & GRE_SEQ)
                                nhoff += 4;
                        if (proto == htons(ETH_P_TEB)) {
                                const struct ethhdr *eth;
                                struct ethhdr _eth;

                                eth = __skb_header_pointer(skb, nhoff,
                                                           sizeof(_eth),
                                                           data, hlen, &_eth);
                                if (!eth)
                                        return false;
                                proto = eth->h_proto;
                                nhoff += sizeof(*eth);
                        }
                        goto again;
                }
                break;
        }
        case IPPROTO_IPIP:
                proto = htons(ETH_P_IP);
                goto ip;
        case IPPROTO_IPV6:
                proto = htons(ETH_P_IPV6);
                goto ipv6;
        default:
                break;
        }

        flow->n_proto = proto;
        flow->ip_proto = ip_proto;
        flow->thoff = (u16) nhoff;

        /* unless skb is set we don't need to record port info */
        if (skb)
                flow->ports = __skb_flow_get_ports(skb, nhoff, ip_proto,
                                                   data, hlen);

        return true;
}
EXPORT_SYMBOL(__skb_flow_dissect);
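
/* Usage sketch: the common entry point is the skb_flow_dissect() wrapper
 * from <net/flow_keys.h>, which passes NULL/0 for the raw-buffer
 * parameters so everything is derived from the skb itself, e.g.:
 *
 *      struct flow_keys keys;
 *
 *      if (skb_flow_dissect(skb, &keys))
 *              pr_debug("proto %u thoff %u\n", keys.ip_proto, keys.thoff);
 *
 * Passing a NULL skb with an explicit data/proto/nhoff/hlen tuple is the
 * "lengths and protocols only" mode described in the IPv4 case above.
 */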

static u32 hashrnd __read_mostly;
static __always_inline void __flow_hash_secret_init(void)
{
        net_get_random_once(&hashrnd, sizeof(hashrnd));
}

static __always_inline u32 __flow_hash_3words(u32 a, u32 b, u32 c)
{
        __flow_hash_secret_init();
        return jhash_3words(a, b, c, hashrnd);
}
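
/* Note: hashrnd is seeded lazily, on the first hash request, via
 * net_get_random_once().  Keying jhash with a boot-random secret keeps
 * the flow-to-hash mapping unpredictable to remote senders, which makes
 * deliberate hash-collision (flooding) attacks impractical.
 */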

static inline u32 __flow_hash_from_keys(struct flow_keys *keys)
{
        u32 hash;

        /* get a consistent hash (same value on both flow directions) */
        if (((__force u32)keys->dst < (__force u32)keys->src) ||
            (((__force u32)keys->dst == (__force u32)keys->src) &&
             ((__force u16)keys->port16[1] < (__force u16)keys->port16[0]))) {
                swap(keys->dst, keys->src);
                swap(keys->port16[0], keys->port16[1]);
        }

        hash = __flow_hash_3words((__force u32)keys->dst,
                                  (__force u32)keys->src,
                                  (__force u32)keys->ports);
        /* zero is reserved to mean "no valid hash", see __skb_get_hash() */
        if (!hash)
                hash = 1;

        return hash;
}

u32 flow_hash_from_keys(struct flow_keys *keys)
{
        return __flow_hash_from_keys(keys);
}
EXPORT_SYMBOL(flow_hash_from_keys);
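
/* Illustration of the symmetry property above (a sketch, not kernel
 * code): because the (src, dst) and (sport, dport) pairs are canonically
 * ordered before hashing, both directions of one connection collapse to
 * the same keys and therefore the same hash:
 *
 *      A:p1 -> B:p2   and   B:p2 -> A:p1
 *
 * both hash as (min, max) of the addresses with the ports swapped in
 * step, so a flow and its reply traffic land on the same value.
 */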

/*
 * __skb_get_hash: calculate a flow hash based on src/dst addresses
 * and src/dst port numbers.  Sets hash in skb to non-zero hash value
 * on success, zero indicates no valid hash.  Also, sets l4_hash in skb
 * if hash is a canonical 4-tuple hash over transport ports.
 */
void __skb_get_hash(struct sk_buff *skb)
{
        struct flow_keys keys;

        if (!skb_flow_dissect(skb, &keys))
                return;

        if (keys.ports)
                skb->l4_hash = 1;

        skb->sw_hash = 1;

        skb->hash = __flow_hash_from_keys(&keys);
}
EXPORT_SYMBOL(__skb_get_hash);
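
/* Callers normally reach this through skb_get_hash() from
 * <linux/skbuff.h>, which only falls back to software dissection when
 * no valid (hardware or previously computed) hash is present.  A
 * simplified sketch of that inline helper, for orientation only:
 *
 *      static inline __u32 skb_get_hash(struct sk_buff *skb)
 *      {
 *              if (!skb->l4_hash && !skb->sw_hash)
 *                      __skb_get_hash(skb);
 *              return skb->hash;
 *      }
 */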

/*
 * Returns a Tx hash based on the given packet descriptor and the number
 * of Tx queues to use as a distribution range.
 */
u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
                  unsigned int num_tx_queues)
{
        u32 hash;
        u16 qoffset = 0;
        u16 qcount = num_tx_queues;

        if (skb_rx_queue_recorded(skb)) {
                hash = skb_get_rx_queue(skb);
                while (unlikely(hash >= num_tx_queues))
                        hash -= num_tx_queues;
                return hash;
        }

        if (dev->num_tc) {
                u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
                qoffset = dev->tc_to_txq[tc].offset;
                qcount = dev->tc_to_txq[tc].count;
        }

        return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
}
EXPORT_SYMBOL(__skb_tx_hash);
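
/* reciprocal_scale(val, ep_ro) maps a u32 val into [0, ep_ro) without a
 * division: it computes (u32)(((u64)val * ep_ro) >> 32).  A worked
 * example with qcount = 8:
 *
 *      hash = 0x80000000  ->  (0x80000000ULL * 8) >> 32 = 4
 *
 * i.e. the hash space is split into qcount equal slices, which is why a
 * multiply and shift can replace "hash % qcount" here.
 */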

u32 __skb_get_poff(const struct sk_buff *skb, void *data,
                   const struct flow_keys *keys, int hlen)
{
        u32 poff = keys->thoff;

        switch (keys->ip_proto) {
        case IPPROTO_TCP: {
                /* access doff as a u8 to avoid unaligned access */
                const u8 *doff;
                u8 _doff;

                doff = __skb_header_pointer(skb, poff + 12, sizeof(_doff),
                                            data, hlen, &_doff);
                if (!doff)
                        return poff;

                /* doff sits in the high nibble, in units of 32bit words;
                 * shifting right by 2 (rather than 4) converts it
                 * straight to bytes.
                 */
                poff += max_t(u32, sizeof(struct tcphdr), (*doff & 0xF0) >> 2);
                break;
        }
        case IPPROTO_UDP:
        case IPPROTO_UDPLITE:
                poff += sizeof(struct udphdr);
                break;
        /* For the rest, we do not really care about header
         * extensions at this point for now.
         */
        case IPPROTO_ICMP:
                poff += sizeof(struct icmphdr);
                break;
        case IPPROTO_ICMPV6:
                poff += sizeof(struct icmp6hdr);
                break;
        case IPPROTO_IGMP:
                poff += sizeof(struct igmphdr);
                break;
        case IPPROTO_DCCP:
                poff += sizeof(struct dccp_hdr);
                break;
        case IPPROTO_SCTP:
                poff += sizeof(struct sctphdr);
                break;
        }

        return poff;
}

/* skb_get_poff() returns the offset to the payload as far as it could
 * be dissected.  The main user is currently BPF, so that packets can be
 * truncated on the fly without pushing the actual payload to user
 * space, letting it analyze headers only.
 */
u32 skb_get_poff(const struct sk_buff *skb)
{
        struct flow_keys keys;

        if (!skb_flow_dissect(skb, &keys))
                return 0;

        return __skb_get_poff(skb, skb->data, &keys, skb_headlen(skb));
}
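
/* Hypothetical sketch of the truncation use case mentioned above: cap a
 * captured skb to its dissectable headers before handing it on.
 * pskb_trim() is a real helper; the surrounding logic is illustrative
 * only.
 *
 *      u32 poff = skb_get_poff(skb);
 *
 *      if (poff && skb->len > poff)
 *              pskb_trim(skb, poff);
 */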

/* Select a Tx queue from the device's XPS map, keyed by the CPU that
 * first queued the skb (sender_cpu is that CPU number plus one, see
 * netdev_pick_tx() below).  Returns -1 when XPS is compiled out, not
 * configured, or yields no usable queue.
 */
static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_XPS
        struct xps_dev_maps *dev_maps;
        struct xps_map *map;
        int queue_index = -1;

        rcu_read_lock();
        dev_maps = rcu_dereference(dev->xps_maps);
        if (dev_maps) {
                map = rcu_dereference(
                    dev_maps->cpu_map[skb->sender_cpu - 1]);
                if (map) {
                        if (map->len == 1)
                                queue_index = map->queues[0];
                        else
                                queue_index = map->queues[reciprocal_scale(skb_get_hash(skb),
                                                                           map->len)];
                        if (unlikely(queue_index >= dev->real_num_tx_queues))
                                queue_index = -1;
                }
        }
        rcu_read_unlock();

        return queue_index;
#else
        return -1;
#endif
}

/* Default queue selection: reuse the socket's cached queue when it is
 * still valid, otherwise consult XPS and fall back to the flow hash.
 * skb->ooo_okay signals that the flow can change queues without risking
 * reordering, so it forces a re-pick; the result is cached only on
 * connected sockets (those holding a dst entry).
 */
static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
{
        struct sock *sk = skb->sk;
        int queue_index = sk_tx_queue_get(sk);

        if (queue_index < 0 || skb->ooo_okay ||
            queue_index >= dev->real_num_tx_queues) {
                int new_index = get_xps_queue(dev, skb);
                if (new_index < 0)
                        new_index = skb_tx_hash(dev, skb);

                if (queue_index != new_index && sk &&
                    rcu_access_pointer(sk->sk_dst_cache))
                        sk_tx_queue_set(sk, new_index);

                queue_index = new_index;
        }

        return queue_index;
}

struct netdev_queue *netdev_pick_tx(struct net_device *dev,
                                    struct sk_buff *skb,
                                    void *accel_priv)
{
        int queue_index = 0;

#ifdef CONFIG_XPS
        /* record the queueing CPU as cpu + 1 so that zero can keep
         * meaning "not yet recorded"
         */
        if (skb->sender_cpu == 0)
                skb->sender_cpu = raw_smp_processor_id() + 1;
#endif

        if (dev->real_num_tx_queues != 1) {
                const struct net_device_ops *ops = dev->netdev_ops;
                if (ops->ndo_select_queue)
                        queue_index = ops->ndo_select_queue(dev, skb, accel_priv,
                                                            __netdev_pick_tx);
                else
                        queue_index = __netdev_pick_tx(dev, skb);

                if (!accel_priv)
                        queue_index = netdev_cap_txqueue(dev, queue_index);
        }

        skb_set_queue_mapping(skb, queue_index);
        return netdev_get_tx_queue(dev, queue_index);
}
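
/* For orientation: netdev_pick_tx() is invoked from the core transmit
 * path (__dev_queue_xmit() in net/core/dev.c) to resolve the struct
 * netdev_queue an skb will be enqueued on; drivers only override the
 * queue choice via the ndo_select_queue hook seen above.
 */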