linux/net/openvswitch/flow_table.c
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * Copyright (c) 2007-2014 Nicira, Inc.
   4 */
   5
   6#include "flow.h"
   7#include "datapath.h"
   8#include "flow_netlink.h"
   9#include <linux/uaccess.h>
  10#include <linux/netdevice.h>
  11#include <linux/etherdevice.h>
  12#include <linux/if_ether.h>
  13#include <linux/if_vlan.h>
  14#include <net/llc_pdu.h>
  15#include <linux/kernel.h>
  16#include <linux/jhash.h>
  17#include <linux/jiffies.h>
  18#include <linux/llc.h>
  19#include <linux/module.h>
  20#include <linux/in.h>
  21#include <linux/rcupdate.h>
  22#include <linux/cpumask.h>
  23#include <linux/if_arp.h>
  24#include <linux/ip.h>
  25#include <linux/ipv6.h>
  26#include <linux/sctp.h>
  27#include <linux/tcp.h>
  28#include <linux/udp.h>
  29#include <linux/icmp.h>
  30#include <linux/icmpv6.h>
  31#include <linux/rculist.h>
  32#include <linux/sort.h>
  33#include <net/ip.h>
  34#include <net/ipv6.h>
  35#include <net/ndisc.h>
  36
  37#define TBL_MIN_BUCKETS         1024
  38#define MASK_ARRAY_SIZE_MIN     16
  39#define REHASH_INTERVAL         (10 * 60 * HZ)
  40
  41#define MC_DEFAULT_HASH_ENTRIES 256
  42#define MC_HASH_SHIFT           8
  43#define MC_HASH_SEGS            ((sizeof(uint32_t) * 8) / MC_HASH_SHIFT)
  44
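/* Illustrative sketch, not part of the upstream file (the helper name is
 * hypothetical): how the per-CPU mask cache consumes 'skb_hash'.  With
 * MC_HASH_SHIFT == 8 a 32-bit hash yields MC_HASH_SEGS == 4 probe positions,
 * one per 8-bit slice, and 'cache_size' must be a power of two for the
 * '& (cache_size - 1)' reduction to be a valid modulo.
 */
static inline u32 mc_hash_seg_index(u32 skb_hash, int seg, u32 cache_size)
{
        return (skb_hash >> (seg * MC_HASH_SHIFT)) & (cache_size - 1);
}
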
  45static struct kmem_cache *flow_cache;
  46struct kmem_cache *flow_stats_cache __read_mostly;
  47
  48static u16 range_n_bytes(const struct sw_flow_key_range *range)
  49{
  50        return range->end - range->start;
  51}
  52
  53void ovs_flow_mask_key(struct sw_flow_key *dst, const struct sw_flow_key *src,
  54                       bool full, const struct sw_flow_mask *mask)
  55{
  56        int start = full ? 0 : mask->range.start;
  57        int len = full ? sizeof *dst : range_n_bytes(&mask->range);
  58        const long *m = (const long *)((const u8 *)&mask->key + start);
  59        const long *s = (const long *)((const u8 *)src + start);
  60        long *d = (long *)((u8 *)dst + start);
  61        int i;
  62
   63        /* If 'full' is true then all of 'dst' is fully initialized.
   64         * Otherwise, the memory outside of the 'mask->range' is left
  65         * uninitialized. This can be used as an optimization when further
  66         * operations on 'dst' only use contents within 'mask->range'.
  67         */
  68        for (i = 0; i < len; i += sizeof(long))
  69                *d++ = *s++ & *m++;
  70}
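
/* Illustrative sketch, not part of the upstream file (the helper name is
 * hypothetical): the long-at-a-time loop above computes the same result as
 * this byte-wise AND over [start, start + len), given that mask ranges are
 * rounded to sizeof(long) boundaries when they are built.
 */
static inline void mask_key_bytewise(u8 *dst, const u8 *src, const u8 *mask,
                                     int start, int len)
{
        int i;

        for (i = 0; i < len; i++)
                dst[start + i] = src[start + i] & mask[start + i];
}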
  71
  72struct sw_flow *ovs_flow_alloc(void)
  73{
  74        struct sw_flow *flow;
  75        struct sw_flow_stats *stats;
  76
  77        flow = kmem_cache_zalloc(flow_cache, GFP_KERNEL);
  78        if (!flow)
  79                return ERR_PTR(-ENOMEM);
  80
  81        flow->stats_last_writer = -1;
  82
  83        /* Initialize the default stat node. */
  84        stats = kmem_cache_alloc_node(flow_stats_cache,
  85                                      GFP_KERNEL | __GFP_ZERO,
  86                                      node_online(0) ? 0 : NUMA_NO_NODE);
  87        if (!stats)
  88                goto err;
  89
  90        spin_lock_init(&stats->lock);
  91
  92        RCU_INIT_POINTER(flow->stats[0], stats);
  93
  94        cpumask_set_cpu(0, &flow->cpu_used_mask);
  95
  96        return flow;
  97err:
  98        kmem_cache_free(flow_cache, flow);
  99        return ERR_PTR(-ENOMEM);
 100}
 101
 102int ovs_flow_tbl_count(const struct flow_table *table)
 103{
 104        return table->count;
 105}
 106
 107static void flow_free(struct sw_flow *flow)
 108{
 109        int cpu;
 110
 111        if (ovs_identifier_is_key(&flow->id))
 112                kfree(flow->id.unmasked_key);
 113        if (flow->sf_acts)
 114                ovs_nla_free_flow_actions((struct sw_flow_actions __force *)
 115                                          flow->sf_acts);
 116        /* We open code this to make sure cpu 0 is always considered */
 117        for (cpu = 0; cpu < nr_cpu_ids;
 118             cpu = cpumask_next(cpu, &flow->cpu_used_mask)) {
 119                if (flow->stats[cpu])
 120                        kmem_cache_free(flow_stats_cache,
 121                                        (struct sw_flow_stats __force *)flow->stats[cpu]);
 122        }
 123
 124        kmem_cache_free(flow_cache, flow);
 125}
 126
 127static void rcu_free_flow_callback(struct rcu_head *rcu)
 128{
 129        struct sw_flow *flow = container_of(rcu, struct sw_flow, rcu);
 130
 131        flow_free(flow);
 132}
 133
 134void ovs_flow_free(struct sw_flow *flow, bool deferred)
 135{
 136        if (!flow)
 137                return;
 138
 139        if (deferred)
 140                call_rcu(&flow->rcu, rcu_free_flow_callback);
 141        else
 142                flow_free(flow);
 143}
 144
 145static void __table_instance_destroy(struct table_instance *ti)
 146{
 147        kvfree(ti->buckets);
 148        kfree(ti);
 149}
 150
 151static struct table_instance *table_instance_alloc(int new_size)
 152{
 153        struct table_instance *ti = kmalloc(sizeof(*ti), GFP_KERNEL);
 154        int i;
 155
 156        if (!ti)
 157                return NULL;
 158
 159        ti->buckets = kvmalloc_array(new_size, sizeof(struct hlist_head),
 160                                     GFP_KERNEL);
 161        if (!ti->buckets) {
 162                kfree(ti);
 163                return NULL;
 164        }
 165
 166        for (i = 0; i < new_size; i++)
 167                INIT_HLIST_HEAD(&ti->buckets[i]);
 168
 169        ti->n_buckets = new_size;
 170        ti->node_ver = 0;
 171        get_random_bytes(&ti->hash_seed, sizeof(u32));
 172
 173        return ti;
 174}
 175
 176static void __mask_array_destroy(struct mask_array *ma)
 177{
 178        free_percpu(ma->masks_usage_stats);
 179        kfree(ma);
 180}
 181
 182static void mask_array_rcu_cb(struct rcu_head *rcu)
 183{
 184        struct mask_array *ma = container_of(rcu, struct mask_array, rcu);
 185
 186        __mask_array_destroy(ma);
 187}
 188
 189static void tbl_mask_array_reset_counters(struct mask_array *ma)
 190{
 191        int i, cpu;
 192
  193        /* As the per-CPU counters are not atomic we cannot reset them from
  194         * another CPU. To still have an approximately zero-based counter we
  195         * store the value at reset time and subtract it later when
  196         * processing.
 197         */
 198        for (i = 0; i < ma->max; i++) {
 199                ma->masks_usage_zero_cntr[i] = 0;
 200
 201                for_each_possible_cpu(cpu) {
 202                        struct mask_array_stats *stats;
 203                        unsigned int start;
 204                        u64 counter;
 205
 206                        stats = per_cpu_ptr(ma->masks_usage_stats, cpu);
 207                        do {
 208                                start = u64_stats_fetch_begin_irq(&stats->syncp);
 209                                counter = stats->usage_cntrs[i];
 210                        } while (u64_stats_fetch_retry_irq(&stats->syncp, start));
 211
 212                        ma->masks_usage_zero_cntr[i] += counter;
 213                }
 214        }
 215}
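
/* Illustrative sketch, not part of the upstream file (the type and helpers
 * are hypothetical): the "snapshot instead of reset" pattern described above,
 * reduced to a single counter.  The running total is only ever written by its
 * owner; a reset just records the current value and readers report the
 * difference.
 */
struct approx_counter {
        u64 total;      /* incremented only by the owning CPU */
        u64 zero;       /* snapshot taken at the last reset */
};

static inline void approx_counter_reset(struct approx_counter *c)
{
        c->zero = c->total;
}

static inline u64 approx_counter_read(const struct approx_counter *c)
{
        return c->total - c->zero;
}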
 216
 217static struct mask_array *tbl_mask_array_alloc(int size)
 218{
 219        struct mask_array *new;
 220
 221        size = max(MASK_ARRAY_SIZE_MIN, size);
 222        new = kzalloc(sizeof(struct mask_array) +
 223                      sizeof(struct sw_flow_mask *) * size +
 224                      sizeof(u64) * size, GFP_KERNEL);
 225        if (!new)
 226                return NULL;
 227
 228        new->masks_usage_zero_cntr = (u64 *)((u8 *)new +
 229                                             sizeof(struct mask_array) +
 230                                             sizeof(struct sw_flow_mask *) *
 231                                             size);
 232
 233        new->masks_usage_stats = __alloc_percpu(sizeof(struct mask_array_stats) +
 234                                                sizeof(u64) * size,
 235                                                __alignof__(u64));
 236        if (!new->masks_usage_stats) {
 237                kfree(new);
 238                return NULL;
 239        }
 240
 241        new->count = 0;
 242        new->max = size;
 243
 244        return new;
 245}
 246
 247static int tbl_mask_array_realloc(struct flow_table *tbl, int size)
 248{
 249        struct mask_array *old;
 250        struct mask_array *new;
 251
 252        new = tbl_mask_array_alloc(size);
 253        if (!new)
 254                return -ENOMEM;
 255
 256        old = ovsl_dereference(tbl->mask_array);
 257        if (old) {
 258                int i;
 259
 260                for (i = 0; i < old->max; i++) {
 261                        if (ovsl_dereference(old->masks[i]))
 262                                new->masks[new->count++] = old->masks[i];
 263                }
 264                call_rcu(&old->rcu, mask_array_rcu_cb);
 265        }
 266
 267        rcu_assign_pointer(tbl->mask_array, new);
 268
 269        return 0;
 270}
 271
 272static int tbl_mask_array_add_mask(struct flow_table *tbl,
 273                                   struct sw_flow_mask *new)
 274{
 275        struct mask_array *ma = ovsl_dereference(tbl->mask_array);
 276        int err, ma_count = READ_ONCE(ma->count);
 277
 278        if (ma_count >= ma->max) {
 279                err = tbl_mask_array_realloc(tbl, ma->max +
 280                                                  MASK_ARRAY_SIZE_MIN);
 281                if (err)
 282                        return err;
 283
 284                ma = ovsl_dereference(tbl->mask_array);
 285        } else {
 286                /* On every add or delete we need to reset the counters so
 287                 * every new mask gets a fair chance of being prioritized.
 288                 */
 289                tbl_mask_array_reset_counters(ma);
 290        }
 291
 292        BUG_ON(ovsl_dereference(ma->masks[ma_count]));
 293
 294        rcu_assign_pointer(ma->masks[ma_count], new);
 295        WRITE_ONCE(ma->count, ma_count + 1);
 296
 297        return 0;
 298}
 299
 300static void tbl_mask_array_del_mask(struct flow_table *tbl,
 301                                    struct sw_flow_mask *mask)
 302{
 303        struct mask_array *ma = ovsl_dereference(tbl->mask_array);
 304        int i, ma_count = READ_ONCE(ma->count);
 305
 306        /* Remove the deleted mask pointers from the array */
 307        for (i = 0; i < ma_count; i++) {
 308                if (mask == ovsl_dereference(ma->masks[i]))
 309                        goto found;
 310        }
 311
 312        BUG();
 313        return;
 314
 315found:
 316        WRITE_ONCE(ma->count, ma_count - 1);
 317
 318        rcu_assign_pointer(ma->masks[i], ma->masks[ma_count - 1]);
 319        RCU_INIT_POINTER(ma->masks[ma_count - 1], NULL);
 320
 321        kfree_rcu(mask, rcu);
 322
 323        /* Shrink the mask array if necessary. */
 324        if (ma->max >= (MASK_ARRAY_SIZE_MIN * 2) &&
 325            ma_count <= (ma->max / 3))
 326                tbl_mask_array_realloc(tbl, ma->max / 2);
 327        else
 328                tbl_mask_array_reset_counters(ma);
 329
 330}
 331
 332/* Remove 'mask' from the mask list, if it is not needed any more. */
 333static void flow_mask_remove(struct flow_table *tbl, struct sw_flow_mask *mask)
 334{
 335        if (mask) {
 336                /* ovs-lock is required to protect mask-refcount and
 337                 * mask list.
 338                 */
 339                ASSERT_OVSL();
 340                BUG_ON(!mask->ref_count);
 341                mask->ref_count--;
 342
 343                if (!mask->ref_count)
 344                        tbl_mask_array_del_mask(tbl, mask);
 345        }
 346}
 347
 348static void __mask_cache_destroy(struct mask_cache *mc)
 349{
 350        free_percpu(mc->mask_cache);
 351        kfree(mc);
 352}
 353
 354static void mask_cache_rcu_cb(struct rcu_head *rcu)
 355{
 356        struct mask_cache *mc = container_of(rcu, struct mask_cache, rcu);
 357
 358        __mask_cache_destroy(mc);
 359}
 360
 361static struct mask_cache *tbl_mask_cache_alloc(u32 size)
 362{
 363        struct mask_cache_entry __percpu *cache = NULL;
 364        struct mask_cache *new;
 365
  366        /* Only allow the size to be 0 or a power of 2, and do not let it
  367         * exceed the per-CPU allocation size.
 368         */
 369        if ((!is_power_of_2(size) && size != 0) ||
 370            (size * sizeof(struct mask_cache_entry)) > PCPU_MIN_UNIT_SIZE)
 371                return NULL;
 372
 373        new = kzalloc(sizeof(*new), GFP_KERNEL);
 374        if (!new)
 375                return NULL;
 376
 377        new->cache_size = size;
 378        if (new->cache_size > 0) {
 379                cache = __alloc_percpu(array_size(sizeof(struct mask_cache_entry),
 380                                                  new->cache_size),
 381                                       __alignof__(struct mask_cache_entry));
 382                if (!cache) {
 383                        kfree(new);
 384                        return NULL;
 385                }
 386        }
 387
 388        new->mask_cache = cache;
 389        return new;
 390}
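
/* Illustrative sketch, not part of the upstream file (the helper name is
 * hypothetical): the size rule shared by tbl_mask_cache_alloc() and
 * ovs_flow_tbl_masks_cache_resize() expressed as a single predicate.  The
 * cache may be disabled (size 0) or sized to any power of two whose per-CPU
 * footprint does not exceed PCPU_MIN_UNIT_SIZE.
 */
static inline bool mc_cache_size_is_valid(u32 size)
{
        return (size == 0 || is_power_of_2(size)) &&
               size * sizeof(struct mask_cache_entry) <= PCPU_MIN_UNIT_SIZE;
}
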
 391int ovs_flow_tbl_masks_cache_resize(struct flow_table *table, u32 size)
 392{
 393        struct mask_cache *mc = rcu_dereference_ovsl(table->mask_cache);
 394        struct mask_cache *new;
 395
 396        if (size == mc->cache_size)
 397                return 0;
 398
 399        if ((!is_power_of_2(size) && size != 0) ||
 400            (size * sizeof(struct mask_cache_entry)) > PCPU_MIN_UNIT_SIZE)
 401                return -EINVAL;
 402
 403        new = tbl_mask_cache_alloc(size);
 404        if (!new)
 405                return -ENOMEM;
 406
 407        rcu_assign_pointer(table->mask_cache, new);
 408        call_rcu(&mc->rcu, mask_cache_rcu_cb);
 409
 410        return 0;
 411}
 412
 413int ovs_flow_tbl_init(struct flow_table *table)
 414{
 415        struct table_instance *ti, *ufid_ti;
 416        struct mask_cache *mc;
 417        struct mask_array *ma;
 418
 419        mc = tbl_mask_cache_alloc(MC_DEFAULT_HASH_ENTRIES);
 420        if (!mc)
 421                return -ENOMEM;
 422
 423        ma = tbl_mask_array_alloc(MASK_ARRAY_SIZE_MIN);
 424        if (!ma)
 425                goto free_mask_cache;
 426
 427        ti = table_instance_alloc(TBL_MIN_BUCKETS);
 428        if (!ti)
 429                goto free_mask_array;
 430
 431        ufid_ti = table_instance_alloc(TBL_MIN_BUCKETS);
 432        if (!ufid_ti)
 433                goto free_ti;
 434
 435        rcu_assign_pointer(table->ti, ti);
 436        rcu_assign_pointer(table->ufid_ti, ufid_ti);
 437        rcu_assign_pointer(table->mask_array, ma);
 438        rcu_assign_pointer(table->mask_cache, mc);
 439        table->last_rehash = jiffies;
 440        table->count = 0;
 441        table->ufid_count = 0;
 442        return 0;
 443
 444free_ti:
 445        __table_instance_destroy(ti);
 446free_mask_array:
 447        __mask_array_destroy(ma);
 448free_mask_cache:
 449        __mask_cache_destroy(mc);
 450        return -ENOMEM;
 451}
 452
 453static void flow_tbl_destroy_rcu_cb(struct rcu_head *rcu)
 454{
 455        struct table_instance *ti;
 456
 457        ti = container_of(rcu, struct table_instance, rcu);
 458        __table_instance_destroy(ti);
 459}
 460
 461static void table_instance_flow_free(struct flow_table *table,
 462                                     struct table_instance *ti,
 463                                     struct table_instance *ufid_ti,
 464                                     struct sw_flow *flow)
 465{
 466        hlist_del_rcu(&flow->flow_table.node[ti->node_ver]);
 467        table->count--;
 468
 469        if (ovs_identifier_is_ufid(&flow->id)) {
 470                hlist_del_rcu(&flow->ufid_table.node[ufid_ti->node_ver]);
 471                table->ufid_count--;
 472        }
 473
 474        flow_mask_remove(table, flow->mask);
 475}
 476
 477/* Must be called with OVS mutex held. */
 478void table_instance_flow_flush(struct flow_table *table,
 479                               struct table_instance *ti,
 480                               struct table_instance *ufid_ti)
 481{
 482        int i;
 483
 484        for (i = 0; i < ti->n_buckets; i++) {
 485                struct hlist_head *head = &ti->buckets[i];
 486                struct hlist_node *n;
 487                struct sw_flow *flow;
 488
 489                hlist_for_each_entry_safe(flow, n, head,
 490                                          flow_table.node[ti->node_ver]) {
 491
 492                        table_instance_flow_free(table, ti, ufid_ti,
 493                                                 flow);
 494                        ovs_flow_free(flow, true);
 495                }
 496        }
 497
 498        if (WARN_ON(table->count != 0 ||
 499                    table->ufid_count != 0)) {
 500                table->count = 0;
 501                table->ufid_count = 0;
 502        }
 503}
 504
 505static void table_instance_destroy(struct table_instance *ti,
 506                                   struct table_instance *ufid_ti)
 507{
 508        call_rcu(&ti->rcu, flow_tbl_destroy_rcu_cb);
 509        call_rcu(&ufid_ti->rcu, flow_tbl_destroy_rcu_cb);
 510}
 511
  512/* No need for locking; this function is called from an RCU callback or
  513 * the error path.
 514 */
 515void ovs_flow_tbl_destroy(struct flow_table *table)
 516{
 517        struct table_instance *ti = rcu_dereference_raw(table->ti);
 518        struct table_instance *ufid_ti = rcu_dereference_raw(table->ufid_ti);
 519        struct mask_cache *mc = rcu_dereference_raw(table->mask_cache);
 520        struct mask_array *ma = rcu_dereference_raw(table->mask_array);
 521
 522        call_rcu(&mc->rcu, mask_cache_rcu_cb);
 523        call_rcu(&ma->rcu, mask_array_rcu_cb);
 524        table_instance_destroy(ti, ufid_ti);
 525}
 526
 527struct sw_flow *ovs_flow_tbl_dump_next(struct table_instance *ti,
 528                                       u32 *bucket, u32 *last)
 529{
 530        struct sw_flow *flow;
 531        struct hlist_head *head;
 532        int ver;
 533        int i;
 534
 535        ver = ti->node_ver;
 536        while (*bucket < ti->n_buckets) {
 537                i = 0;
 538                head = &ti->buckets[*bucket];
 539                hlist_for_each_entry_rcu(flow, head, flow_table.node[ver]) {
 540                        if (i < *last) {
 541                                i++;
 542                                continue;
 543                        }
 544                        *last = i + 1;
 545                        return flow;
 546                }
 547                (*bucket)++;
 548                *last = 0;
 549        }
 550
 551        return NULL;
 552}
 553
 554static struct hlist_head *find_bucket(struct table_instance *ti, u32 hash)
 555{
 556        hash = jhash_1word(hash, ti->hash_seed);
 557        return &ti->buckets[hash & (ti->n_buckets - 1)];
 558}
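
/* Illustrative sketch, not part of the upstream file (the helper name is
 * hypothetical): find_bucket() can reduce the seeded hash with a mask only
 * because every table instance has a power-of-two bucket count
 * (TBL_MIN_BUCKETS initially, doubled on expansion).
 */
static inline u32 bucket_index(u32 hash, u32 n_buckets)
{
        return hash & (n_buckets - 1);  /* equivalent to hash % n_buckets */
}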
 559
 560static void table_instance_insert(struct table_instance *ti,
 561                                  struct sw_flow *flow)
 562{
 563        struct hlist_head *head;
 564
 565        head = find_bucket(ti, flow->flow_table.hash);
 566        hlist_add_head_rcu(&flow->flow_table.node[ti->node_ver], head);
 567}
 568
 569static void ufid_table_instance_insert(struct table_instance *ti,
 570                                       struct sw_flow *flow)
 571{
 572        struct hlist_head *head;
 573
 574        head = find_bucket(ti, flow->ufid_table.hash);
 575        hlist_add_head_rcu(&flow->ufid_table.node[ti->node_ver], head);
 576}
 577
 578static void flow_table_copy_flows(struct table_instance *old,
 579                                  struct table_instance *new, bool ufid)
 580{
 581        int old_ver;
 582        int i;
 583
 584        old_ver = old->node_ver;
 585        new->node_ver = !old_ver;
 586
 587        /* Insert in new table. */
 588        for (i = 0; i < old->n_buckets; i++) {
 589                struct sw_flow *flow;
 590                struct hlist_head *head = &old->buckets[i];
 591
 592                if (ufid)
 593                        hlist_for_each_entry_rcu(flow, head,
 594                                                 ufid_table.node[old_ver],
 595                                                 lockdep_ovsl_is_held())
 596                                ufid_table_instance_insert(new, flow);
 597                else
 598                        hlist_for_each_entry_rcu(flow, head,
 599                                                 flow_table.node[old_ver],
 600                                                 lockdep_ovsl_is_held())
 601                                table_instance_insert(new, flow);
 602        }
 603}
 604
 605static struct table_instance *table_instance_rehash(struct table_instance *ti,
 606                                                    int n_buckets, bool ufid)
 607{
 608        struct table_instance *new_ti;
 609
 610        new_ti = table_instance_alloc(n_buckets);
 611        if (!new_ti)
 612                return NULL;
 613
 614        flow_table_copy_flows(ti, new_ti, ufid);
 615
 616        return new_ti;
 617}
 618
 619int ovs_flow_tbl_flush(struct flow_table *flow_table)
 620{
 621        struct table_instance *old_ti, *new_ti;
 622        struct table_instance *old_ufid_ti, *new_ufid_ti;
 623
 624        new_ti = table_instance_alloc(TBL_MIN_BUCKETS);
 625        if (!new_ti)
 626                return -ENOMEM;
 627        new_ufid_ti = table_instance_alloc(TBL_MIN_BUCKETS);
 628        if (!new_ufid_ti)
 629                goto err_free_ti;
 630
 631        old_ti = ovsl_dereference(flow_table->ti);
 632        old_ufid_ti = ovsl_dereference(flow_table->ufid_ti);
 633
 634        rcu_assign_pointer(flow_table->ti, new_ti);
 635        rcu_assign_pointer(flow_table->ufid_ti, new_ufid_ti);
 636        flow_table->last_rehash = jiffies;
 637
 638        table_instance_flow_flush(flow_table, old_ti, old_ufid_ti);
 639        table_instance_destroy(old_ti, old_ufid_ti);
 640        return 0;
 641
 642err_free_ti:
 643        __table_instance_destroy(new_ti);
 644        return -ENOMEM;
 645}
 646
 647static u32 flow_hash(const struct sw_flow_key *key,
 648                     const struct sw_flow_key_range *range)
 649{
 650        const u32 *hash_key = (const u32 *)((const u8 *)key + range->start);
 651
  652        /* Make sure the number of hash bytes is a multiple of sizeof(u32). */
 653        int hash_u32s = range_n_bytes(range) >> 2;
 654
 655        return jhash2(hash_key, hash_u32s, 0);
 656}
 657
 658static int flow_key_start(const struct sw_flow_key *key)
 659{
 660        if (key->tun_proto)
 661                return 0;
 662        else
 663                return rounddown(offsetof(struct sw_flow_key, phy),
 664                                 sizeof(long));
 665}
 666
 667static bool cmp_key(const struct sw_flow_key *key1,
 668                    const struct sw_flow_key *key2,
 669                    int key_start, int key_end)
 670{
 671        const long *cp1 = (const long *)((const u8 *)key1 + key_start);
 672        const long *cp2 = (const long *)((const u8 *)key2 + key_start);
 673        int i;
 674
 675        for (i = key_start; i < key_end; i += sizeof(long))
 676                if (*cp1++ ^ *cp2++)
 677                        return false;
 678
 679        return true;
 680}
 681
 682static bool flow_cmp_masked_key(const struct sw_flow *flow,
 683                                const struct sw_flow_key *key,
 684                                const struct sw_flow_key_range *range)
 685{
 686        return cmp_key(&flow->key, key, range->start, range->end);
 687}
 688
 689static bool ovs_flow_cmp_unmasked_key(const struct sw_flow *flow,
 690                                      const struct sw_flow_match *match)
 691{
 692        struct sw_flow_key *key = match->key;
 693        int key_start = flow_key_start(key);
 694        int key_end = match->range.end;
 695
 696        BUG_ON(ovs_identifier_is_ufid(&flow->id));
 697        return cmp_key(flow->id.unmasked_key, key, key_start, key_end);
 698}
 699
 700static struct sw_flow *masked_flow_lookup(struct table_instance *ti,
 701                                          const struct sw_flow_key *unmasked,
 702                                          const struct sw_flow_mask *mask,
 703                                          u32 *n_mask_hit)
 704{
 705        struct sw_flow *flow;
 706        struct hlist_head *head;
 707        u32 hash;
 708        struct sw_flow_key masked_key;
 709
 710        ovs_flow_mask_key(&masked_key, unmasked, false, mask);
 711        hash = flow_hash(&masked_key, &mask->range);
 712        head = find_bucket(ti, hash);
 713        (*n_mask_hit)++;
 714
 715        hlist_for_each_entry_rcu(flow, head, flow_table.node[ti->node_ver],
 716                                 lockdep_ovsl_is_held()) {
 717                if (flow->mask == mask && flow->flow_table.hash == hash &&
 718                    flow_cmp_masked_key(flow, &masked_key, &mask->range))
 719                        return flow;
 720        }
 721        return NULL;
 722}
 723
  724/* Flow lookup does a full lookup on the flow table. It starts with
  725 * the mask at the index passed in via '*index'.
  726 * This function MUST be called with BH disabled due to the use
  727 * of per-CPU variables.
 728 */
 729static struct sw_flow *flow_lookup(struct flow_table *tbl,
 730                                   struct table_instance *ti,
 731                                   struct mask_array *ma,
 732                                   const struct sw_flow_key *key,
 733                                   u32 *n_mask_hit,
 734                                   u32 *n_cache_hit,
 735                                   u32 *index)
 736{
 737        struct mask_array_stats *stats = this_cpu_ptr(ma->masks_usage_stats);
 738        struct sw_flow *flow;
 739        struct sw_flow_mask *mask;
 740        int i;
 741
 742        if (likely(*index < ma->max)) {
 743                mask = rcu_dereference_ovsl(ma->masks[*index]);
 744                if (mask) {
 745                        flow = masked_flow_lookup(ti, key, mask, n_mask_hit);
 746                        if (flow) {
 747                                u64_stats_update_begin(&stats->syncp);
 748                                stats->usage_cntrs[*index]++;
 749                                u64_stats_update_end(&stats->syncp);
 750                                (*n_cache_hit)++;
 751                                return flow;
 752                        }
 753                }
 754        }
 755
 756        for (i = 0; i < ma->max; i++)  {
 757
 758                if (i == *index)
 759                        continue;
 760
 761                mask = rcu_dereference_ovsl(ma->masks[i]);
 762                if (unlikely(!mask))
 763                        break;
 764
 765                flow = masked_flow_lookup(ti, key, mask, n_mask_hit);
 766                if (flow) { /* Found */
 767                        *index = i;
 768                        u64_stats_update_begin(&stats->syncp);
 769                        stats->usage_cntrs[*index]++;
 770                        u64_stats_update_end(&stats->syncp);
 771                        return flow;
 772                }
 773        }
 774
 775        return NULL;
 776}
 777
  778/*
  779 * mask_cache maps a flow to its probable mask. The cache is not tightly
  780 * coupled to the mask list, so updates to the mask list can leave stale
  781 * entries in the mask cache.
  782 * The cache is per-CPU and is divided into MC_HASH_SEGS segments.
  783 * In case of a hash collision the entry is hashed into the next segment.
  784 */
 785struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *tbl,
 786                                          const struct sw_flow_key *key,
 787                                          u32 skb_hash,
 788                                          u32 *n_mask_hit,
 789                                          u32 *n_cache_hit)
 790{
 791        struct mask_cache *mc = rcu_dereference(tbl->mask_cache);
 792        struct mask_array *ma = rcu_dereference(tbl->mask_array);
 793        struct table_instance *ti = rcu_dereference(tbl->ti);
 794        struct mask_cache_entry *entries, *ce;
 795        struct sw_flow *flow;
 796        u32 hash;
 797        int seg;
 798
 799        *n_mask_hit = 0;
 800        *n_cache_hit = 0;
 801        if (unlikely(!skb_hash || mc->cache_size == 0)) {
 802                u32 mask_index = 0;
 803                u32 cache = 0;
 804
 805                return flow_lookup(tbl, ti, ma, key, n_mask_hit, &cache,
 806                                   &mask_index);
 807        }
 808
  809        /* Pre and post recirculation flows usually have the same skb_hash
 810         * value. To avoid hash collisions, rehash the 'skb_hash' with
 811         * 'recirc_id'.  */
 812        if (key->recirc_id)
 813                skb_hash = jhash_1word(skb_hash, key->recirc_id);
 814
 815        ce = NULL;
 816        hash = skb_hash;
 817        entries = this_cpu_ptr(mc->mask_cache);
 818
 819        /* Find the cache entry 'ce' to operate on. */
 820        for (seg = 0; seg < MC_HASH_SEGS; seg++) {
 821                int index = hash & (mc->cache_size - 1);
 822                struct mask_cache_entry *e;
 823
 824                e = &entries[index];
 825                if (e->skb_hash == skb_hash) {
 826                        flow = flow_lookup(tbl, ti, ma, key, n_mask_hit,
 827                                           n_cache_hit, &e->mask_index);
 828                        if (!flow)
 829                                e->skb_hash = 0;
 830                        return flow;
 831                }
 832
 833                if (!ce || e->skb_hash < ce->skb_hash)
 834                        ce = e;  /* A better replacement cache candidate. */
 835
 836                hash >>= MC_HASH_SHIFT;
 837        }
 838
 839        /* Cache miss, do full lookup. */
 840        flow = flow_lookup(tbl, ti, ma, key, n_mask_hit, n_cache_hit,
 841                           &ce->mask_index);
 842        if (flow)
 843                ce->skb_hash = skb_hash;
 844
 845        *n_cache_hit = 0;
 846        return flow;
 847}
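
/* Illustrative sketch, not part of the upstream file (the helper name is
 * hypothetical): the replacement policy used on a cache miss above.  Among
 * the MC_HASH_SEGS probed entries, the one with the smallest skb_hash is
 * overwritten.  Invalidated slots carry skb_hash == 0 and therefore always
 * win, so empty entries are reused before any live entry is evicted; among
 * live entries this is a cheap deterministic tie-break, not an LRU.
 */
static inline struct mask_cache_entry *
mc_pick_victim(struct mask_cache_entry **candidates, int n)
{
        struct mask_cache_entry *victim = NULL;
        int i;

        for (i = 0; i < n; i++)
                if (!victim || candidates[i]->skb_hash < victim->skb_hash)
                        victim = candidates[i];

        return victim;
}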
 848
 849struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *tbl,
 850                                    const struct sw_flow_key *key)
 851{
 852        struct table_instance *ti = rcu_dereference_ovsl(tbl->ti);
 853        struct mask_array *ma = rcu_dereference_ovsl(tbl->mask_array);
 854        u32 __always_unused n_mask_hit;
 855        u32 __always_unused n_cache_hit;
 856        struct sw_flow *flow;
 857        u32 index = 0;
 858
  859        /* This function gets called through the netlink interface and therefore
 860         * is preemptible. However, flow_lookup() function needs to be called
 861         * with BH disabled due to CPU specific variables.
 862         */
 863        local_bh_disable();
 864        flow = flow_lookup(tbl, ti, ma, key, &n_mask_hit, &n_cache_hit, &index);
 865        local_bh_enable();
 866        return flow;
 867}
 868
 869struct sw_flow *ovs_flow_tbl_lookup_exact(struct flow_table *tbl,
 870                                          const struct sw_flow_match *match)
 871{
 872        struct mask_array *ma = ovsl_dereference(tbl->mask_array);
 873        int i;
 874
 875        /* Always called under ovs-mutex. */
 876        for (i = 0; i < ma->max; i++) {
 877                struct table_instance *ti = rcu_dereference_ovsl(tbl->ti);
 878                u32 __always_unused n_mask_hit;
 879                struct sw_flow_mask *mask;
 880                struct sw_flow *flow;
 881
 882                mask = ovsl_dereference(ma->masks[i]);
 883                if (!mask)
 884                        continue;
 885
 886                flow = masked_flow_lookup(ti, match->key, mask, &n_mask_hit);
 887                if (flow && ovs_identifier_is_key(&flow->id) &&
 888                    ovs_flow_cmp_unmasked_key(flow, match)) {
 889                        return flow;
 890                }
 891        }
 892
 893        return NULL;
 894}
 895
 896static u32 ufid_hash(const struct sw_flow_id *sfid)
 897{
 898        return jhash(sfid->ufid, sfid->ufid_len, 0);
 899}
 900
 901static bool ovs_flow_cmp_ufid(const struct sw_flow *flow,
 902                              const struct sw_flow_id *sfid)
 903{
 904        if (flow->id.ufid_len != sfid->ufid_len)
 905                return false;
 906
 907        return !memcmp(flow->id.ufid, sfid->ufid, sfid->ufid_len);
 908}
 909
 910bool ovs_flow_cmp(const struct sw_flow *flow,
 911                  const struct sw_flow_match *match)
 912{
 913        if (ovs_identifier_is_ufid(&flow->id))
 914                return flow_cmp_masked_key(flow, match->key, &match->range);
 915
 916        return ovs_flow_cmp_unmasked_key(flow, match);
 917}
 918
 919struct sw_flow *ovs_flow_tbl_lookup_ufid(struct flow_table *tbl,
 920                                         const struct sw_flow_id *ufid)
 921{
 922        struct table_instance *ti = rcu_dereference_ovsl(tbl->ufid_ti);
 923        struct sw_flow *flow;
 924        struct hlist_head *head;
 925        u32 hash;
 926
 927        hash = ufid_hash(ufid);
 928        head = find_bucket(ti, hash);
 929        hlist_for_each_entry_rcu(flow, head, ufid_table.node[ti->node_ver],
 930                                 lockdep_ovsl_is_held()) {
 931                if (flow->ufid_table.hash == hash &&
 932                    ovs_flow_cmp_ufid(flow, ufid))
 933                        return flow;
 934        }
 935        return NULL;
 936}
 937
 938int ovs_flow_tbl_num_masks(const struct flow_table *table)
 939{
 940        struct mask_array *ma = rcu_dereference_ovsl(table->mask_array);
 941        return READ_ONCE(ma->count);
 942}
 943
 944u32 ovs_flow_tbl_masks_cache_size(const struct flow_table *table)
 945{
 946        struct mask_cache *mc = rcu_dereference_ovsl(table->mask_cache);
 947
 948        return READ_ONCE(mc->cache_size);
 949}
 950
 951static struct table_instance *table_instance_expand(struct table_instance *ti,
 952                                                    bool ufid)
 953{
 954        return table_instance_rehash(ti, ti->n_buckets * 2, ufid);
 955}
 956
 957/* Must be called with OVS mutex held. */
 958void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow)
 959{
 960        struct table_instance *ti = ovsl_dereference(table->ti);
 961        struct table_instance *ufid_ti = ovsl_dereference(table->ufid_ti);
 962
 963        BUG_ON(table->count == 0);
 964        table_instance_flow_free(table, ti, ufid_ti, flow);
 965}
 966
 967static struct sw_flow_mask *mask_alloc(void)
 968{
 969        struct sw_flow_mask *mask;
 970
 971        mask = kmalloc(sizeof(*mask), GFP_KERNEL);
 972        if (mask)
 973                mask->ref_count = 1;
 974
 975        return mask;
 976}
 977
 978static bool mask_equal(const struct sw_flow_mask *a,
 979                       const struct sw_flow_mask *b)
 980{
 981        const u8 *a_ = (const u8 *)&a->key + a->range.start;
 982        const u8 *b_ = (const u8 *)&b->key + b->range.start;
 983
 984        return  (a->range.end == b->range.end)
 985                && (a->range.start == b->range.start)
 986                && (memcmp(a_, b_, range_n_bytes(&a->range)) == 0);
 987}
 988
 989static struct sw_flow_mask *flow_mask_find(const struct flow_table *tbl,
 990                                           const struct sw_flow_mask *mask)
 991{
 992        struct mask_array *ma;
 993        int i;
 994
 995        ma = ovsl_dereference(tbl->mask_array);
 996        for (i = 0; i < ma->max; i++) {
 997                struct sw_flow_mask *t;
 998                t = ovsl_dereference(ma->masks[i]);
 999
1000                if (t && mask_equal(mask, t))
1001                        return t;
1002        }
1003
1004        return NULL;
1005}
1006
1007/* Add 'mask' into the mask list, if it is not already there. */
1008static int flow_mask_insert(struct flow_table *tbl, struct sw_flow *flow,
1009                            const struct sw_flow_mask *new)
1010{
1011        struct sw_flow_mask *mask;
1012
1013        mask = flow_mask_find(tbl, new);
1014        if (!mask) {
 1015                /* Allocate a new mask if none exists. */
1016                mask = mask_alloc();
1017                if (!mask)
1018                        return -ENOMEM;
1019                mask->key = new->key;
1020                mask->range = new->range;
1021
1022                /* Add mask to mask-list. */
1023                if (tbl_mask_array_add_mask(tbl, mask)) {
1024                        kfree(mask);
1025                        return -ENOMEM;
1026                }
1027        } else {
1028                BUG_ON(!mask->ref_count);
1029                mask->ref_count++;
1030        }
1031
1032        flow->mask = mask;
1033        return 0;
1034}
1035
1036/* Must be called with OVS mutex held. */
1037static void flow_key_insert(struct flow_table *table, struct sw_flow *flow)
1038{
1039        struct table_instance *new_ti = NULL;
1040        struct table_instance *ti;
1041
1042        flow->flow_table.hash = flow_hash(&flow->key, &flow->mask->range);
1043        ti = ovsl_dereference(table->ti);
1044        table_instance_insert(ti, flow);
1045        table->count++;
1046
1047        /* Expand table, if necessary, to make room. */
1048        if (table->count > ti->n_buckets)
1049                new_ti = table_instance_expand(ti, false);
1050        else if (time_after(jiffies, table->last_rehash + REHASH_INTERVAL))
1051                new_ti = table_instance_rehash(ti, ti->n_buckets, false);
1052
1053        if (new_ti) {
1054                rcu_assign_pointer(table->ti, new_ti);
1055                call_rcu(&ti->rcu, flow_tbl_destroy_rcu_cb);
1056                table->last_rehash = jiffies;
1057        }
1058}
1059
1060/* Must be called with OVS mutex held. */
1061static void flow_ufid_insert(struct flow_table *table, struct sw_flow *flow)
1062{
1063        struct table_instance *ti;
1064
1065        flow->ufid_table.hash = ufid_hash(&flow->id);
1066        ti = ovsl_dereference(table->ufid_ti);
1067        ufid_table_instance_insert(ti, flow);
1068        table->ufid_count++;
1069
1070        /* Expand table, if necessary, to make room. */
1071        if (table->ufid_count > ti->n_buckets) {
1072                struct table_instance *new_ti;
1073
1074                new_ti = table_instance_expand(ti, true);
1075                if (new_ti) {
1076                        rcu_assign_pointer(table->ufid_ti, new_ti);
1077                        call_rcu(&ti->rcu, flow_tbl_destroy_rcu_cb);
1078                }
1079        }
1080}
1081
1082/* Must be called with OVS mutex held. */
1083int ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow,
1084                        const struct sw_flow_mask *mask)
1085{
1086        int err;
1087
1088        err = flow_mask_insert(table, flow, mask);
1089        if (err)
1090                return err;
1091        flow_key_insert(table, flow);
1092        if (ovs_identifier_is_ufid(&flow->id))
1093                flow_ufid_insert(table, flow);
1094
1095        return 0;
1096}
1097
1098static int compare_mask_and_count(const void *a, const void *b)
1099{
1100        const struct mask_count *mc_a = a;
1101        const struct mask_count *mc_b = b;
1102
1103        return (s64)mc_b->counter - (s64)mc_a->counter;
1104}
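
/* Illustrative sketch, not part of the upstream file (the helper name is
 * hypothetical): the comparator returns a positive value when 'b' has the
 * larger counter, so sort() orders the array by descending usage and the
 * hottest masks end up first, where flow_lookup() probes them earliest.
 */
static inline void mask_count_sort_desc(struct mask_count *masks, int n)
{
        sort(masks, n, sizeof(*masks), compare_mask_and_count, NULL);
}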
1105
1106/* Must be called with OVS mutex held. */
1107void ovs_flow_masks_rebalance(struct flow_table *table)
1108{
1109        struct mask_array *ma = rcu_dereference_ovsl(table->mask_array);
1110        struct mask_count *masks_and_count;
1111        struct mask_array *new;
1112        int masks_entries = 0;
1113        int i;
1114
1115        /* Build array of all current entries with use counters. */
1116        masks_and_count = kmalloc_array(ma->max, sizeof(*masks_and_count),
1117                                        GFP_KERNEL);
1118        if (!masks_and_count)
1119                return;
1120
1121        for (i = 0; i < ma->max; i++) {
1122                struct sw_flow_mask *mask;
1123                int cpu;
1124
1125                mask = rcu_dereference_ovsl(ma->masks[i]);
1126                if (unlikely(!mask))
1127                        break;
1128
1129                masks_and_count[i].index = i;
1130                masks_and_count[i].counter = 0;
1131
1132                for_each_possible_cpu(cpu) {
1133                        struct mask_array_stats *stats;
1134                        unsigned int start;
1135                        u64 counter;
1136
1137                        stats = per_cpu_ptr(ma->masks_usage_stats, cpu);
1138                        do {
1139                                start = u64_stats_fetch_begin_irq(&stats->syncp);
1140                                counter = stats->usage_cntrs[i];
1141                        } while (u64_stats_fetch_retry_irq(&stats->syncp,
1142                                                           start));
1143
1144                        masks_and_count[i].counter += counter;
1145                }
1146
1147                /* Subtract the zero count value. */
1148                masks_and_count[i].counter -= ma->masks_usage_zero_cntr[i];
1149
1150                /* Rather than calling tbl_mask_array_reset_counters()
1151                 * below when no change is needed, do it inline here.
1152                 */
1153                ma->masks_usage_zero_cntr[i] += masks_and_count[i].counter;
1154        }
1155
1156        if (i == 0)
1157                goto free_mask_entries;
1158
1159        /* Sort the entries */
1160        masks_entries = i;
1161        sort(masks_and_count, masks_entries, sizeof(*masks_and_count),
1162             compare_mask_and_count, NULL);
1163
1164        /* If the order is the same, nothing to do... */
1165        for (i = 0; i < masks_entries; i++) {
1166                if (i != masks_and_count[i].index)
1167                        break;
1168        }
1169        if (i == masks_entries)
1170                goto free_mask_entries;
1171
 1172        /* Rebuild the new list in order of usage. */
1173        new = tbl_mask_array_alloc(ma->max);
1174        if (!new)
1175                goto free_mask_entries;
1176
1177        for (i = 0; i < masks_entries; i++) {
1178                int index = masks_and_count[i].index;
1179
1180                if (ovsl_dereference(ma->masks[index]))
1181                        new->masks[new->count++] = ma->masks[index];
1182        }
1183
1184        rcu_assign_pointer(table->mask_array, new);
1185        call_rcu(&ma->rcu, mask_array_rcu_cb);
1186
1187free_mask_entries:
1188        kfree(masks_and_count);
1189}
1190
1191/* Initializes the flow module.
1192 * Returns zero if successful or a negative error code. */
1193int ovs_flow_init(void)
1194{
1195        BUILD_BUG_ON(__alignof__(struct sw_flow_key) % __alignof__(long));
1196        BUILD_BUG_ON(sizeof(struct sw_flow_key) % sizeof(long));
1197
1198        flow_cache = kmem_cache_create("sw_flow", sizeof(struct sw_flow)
1199                                       + (nr_cpu_ids
1200                                          * sizeof(struct sw_flow_stats *)),
1201                                       0, 0, NULL);
1202        if (flow_cache == NULL)
1203                return -ENOMEM;
1204
1205        flow_stats_cache
1206                = kmem_cache_create("sw_flow_stats", sizeof(struct sw_flow_stats),
1207                                    0, SLAB_HWCACHE_ALIGN, NULL);
1208        if (flow_stats_cache == NULL) {
1209                kmem_cache_destroy(flow_cache);
1210                flow_cache = NULL;
1211                return -ENOMEM;
1212        }
1213
1214        return 0;
1215}
1216
1217/* Uninitializes the flow module. */
1218void ovs_flow_exit(void)
1219{
1220        kmem_cache_destroy(flow_stats_cache);
1221        kmem_cache_destroy(flow_cache);
1222}
1223