linux/net/xfrm/xfrm_policy.c
<<
>>
Prefs
   1/*
   2 * xfrm_policy.c
   3 *
   4 * Changes:
   5 *      Mitsuru KANDA @USAGI
   6 *      Kazunori MIYAZAWA @USAGI
   7 *      Kunihiro Ishiguro <kunihiro@ipinfusion.com>
   8 *              IPv6 support
   9 *      Kazunori MIYAZAWA @USAGI
  10 *      YOSHIFUJI Hideaki
  11 *              Split up af-specific portion
  12 *      Derek Atkins <derek@ihtfp.com>          Add the post_input processor
  13 *
  14 */
  15
  16#include <linux/err.h>
  17#include <linux/slab.h>
  18#include <linux/kmod.h>
  19#include <linux/list.h>
  20#include <linux/spinlock.h>
  21#include <linux/workqueue.h>
  22#include <linux/notifier.h>
  23#include <linux/netdevice.h>
  24#include <linux/netfilter.h>
  25#include <linux/module.h>
  26#include <linux/cache.h>
  27#include <linux/audit.h>
  28#include <net/dst.h>
  29#include <net/flow.h>
  30#include <net/xfrm.h>
  31#include <net/ip.h>
  32#ifdef CONFIG_XFRM_STATISTICS
  33#include <net/snmp.h>
  34#endif
  35
  36#include "xfrm_hash.h"
  37
  38#define XFRM_QUEUE_TMO_MIN ((unsigned)(HZ/10))
  39#define XFRM_QUEUE_TMO_MAX ((unsigned)(60*HZ))
  40#define XFRM_MAX_QUEUE_LEN      100
  41
  42DEFINE_MUTEX(xfrm_cfg_mutex);
  43EXPORT_SYMBOL(xfrm_cfg_mutex);
  44
  45static DEFINE_SPINLOCK(xfrm_policy_sk_bundle_lock);
  46static struct dst_entry *xfrm_policy_sk_bundles;
  47static DEFINE_RWLOCK(xfrm_policy_lock);
  48
  49static DEFINE_SPINLOCK(xfrm_policy_afinfo_lock);
  50static struct xfrm_policy_afinfo __rcu *xfrm_policy_afinfo[NPROTO]
  51                                                __read_mostly;
  52
  53static struct kmem_cache *xfrm_dst_cache __read_mostly;
  54
  55static void xfrm_init_pmtu(struct dst_entry *dst);
  56static int stale_bundle(struct dst_entry *dst);
  57static int xfrm_bundle_ok(struct xfrm_dst *xdst);
  58static void xfrm_policy_queue_process(unsigned long arg);
  59
  60static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol,
  61                                                int dir);
  62
  63static inline bool
  64__xfrm4_selector_match(const struct xfrm_selector *sel, const struct flowi *fl)
  65{
  66        const struct flowi4 *fl4 = &fl->u.ip4;
  67
  68        return  addr4_match(fl4->daddr, sel->daddr.a4, sel->prefixlen_d) &&
  69                addr4_match(fl4->saddr, sel->saddr.a4, sel->prefixlen_s) &&
  70                !((xfrm_flowi_dport(fl, &fl4->uli) ^ sel->dport) & sel->dport_mask) &&
  71                !((xfrm_flowi_sport(fl, &fl4->uli) ^ sel->sport) & sel->sport_mask) &&
  72                (fl4->flowi4_proto == sel->proto || !sel->proto) &&
  73                (fl4->flowi4_oif == sel->ifindex || !sel->ifindex);
  74}
  75
  76static inline bool
  77__xfrm6_selector_match(const struct xfrm_selector *sel, const struct flowi *fl)
  78{
  79        const struct flowi6 *fl6 = &fl->u.ip6;
  80
  81        return  addr_match(&fl6->daddr, &sel->daddr, sel->prefixlen_d) &&
  82                addr_match(&fl6->saddr, &sel->saddr, sel->prefixlen_s) &&
  83                !((xfrm_flowi_dport(fl, &fl6->uli) ^ sel->dport) & sel->dport_mask) &&
  84                !((xfrm_flowi_sport(fl, &fl6->uli) ^ sel->sport) & sel->sport_mask) &&
  85                (fl6->flowi6_proto == sel->proto || !sel->proto) &&
  86                (fl6->flowi6_oif == sel->ifindex || !sel->ifindex);
  87}
  88
  89bool xfrm_selector_match(const struct xfrm_selector *sel, const struct flowi *fl,
  90                         unsigned short family)
  91{
  92        switch (family) {
  93        case AF_INET:
  94                return __xfrm4_selector_match(sel, fl);
  95        case AF_INET6:
  96                return __xfrm6_selector_match(sel, fl);
  97        }
  98        return false;
  99}
 100
 101static struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family)
 102{
 103        struct xfrm_policy_afinfo *afinfo;
 104
 105        if (unlikely(family >= NPROTO))
 106                return NULL;
 107        rcu_read_lock();
 108        afinfo = rcu_dereference(xfrm_policy_afinfo[family]);
 109        if (unlikely(!afinfo))
 110                rcu_read_unlock();
 111        return afinfo;
 112}
 113
 114static void xfrm_policy_put_afinfo(struct xfrm_policy_afinfo *afinfo)
 115{
 116        rcu_read_unlock();
 117}
 118
 119static inline struct dst_entry *__xfrm_dst_lookup(struct net *net, int tos,
 120                                                  const xfrm_address_t *saddr,
 121                                                  const xfrm_address_t *daddr,
 122                                                  int family)
 123{
 124        struct xfrm_policy_afinfo *afinfo;
 125        struct dst_entry *dst;
 126
 127        afinfo = xfrm_policy_get_afinfo(family);
 128        if (unlikely(afinfo == NULL))
 129                return ERR_PTR(-EAFNOSUPPORT);
 130
 131        dst = afinfo->dst_lookup(net, tos, saddr, daddr);
 132
 133        xfrm_policy_put_afinfo(afinfo);
 134
 135        return dst;
 136}
 137
 138static inline struct dst_entry *xfrm_dst_lookup(struct xfrm_state *x, int tos,
 139                                                xfrm_address_t *prev_saddr,
 140                                                xfrm_address_t *prev_daddr,
 141                                                int family)
 142{
 143        struct net *net = xs_net(x);
 144        xfrm_address_t *saddr = &x->props.saddr;
 145        xfrm_address_t *daddr = &x->id.daddr;
 146        struct dst_entry *dst;
 147
 148        if (x->type->flags & XFRM_TYPE_LOCAL_COADDR) {
 149                saddr = x->coaddr;
 150                daddr = prev_daddr;
 151        }
 152        if (x->type->flags & XFRM_TYPE_REMOTE_COADDR) {
 153                saddr = prev_saddr;
 154                daddr = x->coaddr;
 155        }
 156
 157        dst = __xfrm_dst_lookup(net, tos, saddr, daddr, family);
 158
 159        if (!IS_ERR(dst)) {
 160                if (prev_saddr != saddr)
 161                        memcpy(prev_saddr, saddr,  sizeof(*prev_saddr));
 162                if (prev_daddr != daddr)
 163                        memcpy(prev_daddr, daddr,  sizeof(*prev_daddr));
 164        }
 165
 166        return dst;
 167}
 168
 169static inline unsigned long make_jiffies(long secs)
 170{
 171        if (secs >= (MAX_SCHEDULE_TIMEOUT-1)/HZ)
 172                return MAX_SCHEDULE_TIMEOUT-1;
 173        else
 174                return secs*HZ;
 175}
 176
 177static void xfrm_policy_timer(unsigned long data)
 178{
 179        struct xfrm_policy *xp = (struct xfrm_policy*)data;
 180        unsigned long now = get_seconds();
 181        long next = LONG_MAX;
 182        int warn = 0;
 183        int dir;
 184
 185        read_lock(&xp->lock);
 186
 187        if (unlikely(xp->walk.dead))
 188                goto out;
 189
 190        dir = xfrm_policy_id2dir(xp->index);
 191
 192        if (xp->lft.hard_add_expires_seconds) {
 193                long tmo = xp->lft.hard_add_expires_seconds +
 194                        xp->curlft.add_time - now;
 195                if (tmo <= 0)
 196                        goto expired;
 197                if (tmo < next)
 198                        next = tmo;
 199        }
 200        if (xp->lft.hard_use_expires_seconds) {
 201                long tmo = xp->lft.hard_use_expires_seconds +
 202                        (xp->curlft.use_time ? : xp->curlft.add_time) - now;
 203                if (tmo <= 0)
 204                        goto expired;
 205                if (tmo < next)
 206                        next = tmo;
 207        }
 208        if (xp->lft.soft_add_expires_seconds) {
 209                long tmo = xp->lft.soft_add_expires_seconds +
 210                        xp->curlft.add_time - now;
 211                if (tmo <= 0) {
 212                        warn = 1;
 213                        tmo = XFRM_KM_TIMEOUT;
 214                }
 215                if (tmo < next)
 216                        next = tmo;
 217        }
 218        if (xp->lft.soft_use_expires_seconds) {
 219                long tmo = xp->lft.soft_use_expires_seconds +
 220                        (xp->curlft.use_time ? : xp->curlft.add_time) - now;
 221                if (tmo <= 0) {
 222                        warn = 1;
 223                        tmo = XFRM_KM_TIMEOUT;
 224                }
 225                if (tmo < next)
 226                        next = tmo;
 227        }
 228
 229        if (warn)
 230                km_policy_expired(xp, dir, 0, 0);
 231        if (next != LONG_MAX &&
 232            !mod_timer(&xp->timer, jiffies + make_jiffies(next)))
 233                xfrm_pol_hold(xp);
 234
 235out:
 236        read_unlock(&xp->lock);
 237        xfrm_pol_put(xp);
 238        return;
 239
 240expired:
 241        read_unlock(&xp->lock);
 242        if (!xfrm_policy_delete(xp, dir))
 243                km_policy_expired(xp, dir, 1, 0);
 244        xfrm_pol_put(xp);
 245}
 246
 247static struct flow_cache_object *xfrm_policy_flo_get(struct flow_cache_object *flo)
 248{
 249        struct xfrm_policy *pol = container_of(flo, struct xfrm_policy, flo);
 250
 251        if (unlikely(pol->walk.dead))
 252                flo = NULL;
 253        else
 254                xfrm_pol_hold(pol);
 255
 256        return flo;
 257}
 258
 259static int xfrm_policy_flo_check(struct flow_cache_object *flo)
 260{
 261        struct xfrm_policy *pol = container_of(flo, struct xfrm_policy, flo);
 262
 263        return !pol->walk.dead;
 264}
 265
 266static void xfrm_policy_flo_delete(struct flow_cache_object *flo)
 267{
 268        xfrm_pol_put(container_of(flo, struct xfrm_policy, flo));
 269}
 270
 271static const struct flow_cache_ops xfrm_policy_fc_ops = {
 272        .get = xfrm_policy_flo_get,
 273        .check = xfrm_policy_flo_check,
 274        .delete = xfrm_policy_flo_delete,
 275};
 276
 277/* Allocate xfrm_policy. Not used here, it is supposed to be used by pfkeyv2
 278 * SPD calls.
 279 */
 280
 281struct xfrm_policy *xfrm_policy_alloc(struct net *net, gfp_t gfp)
 282{
 283        struct xfrm_policy *policy;
 284
 285        policy = kzalloc(sizeof(struct xfrm_policy), gfp);
 286
 287        if (policy) {
 288                write_pnet(&policy->xp_net, net);
 289                INIT_LIST_HEAD(&policy->walk.all);
 290                INIT_HLIST_NODE(&policy->bydst);
 291                INIT_HLIST_NODE(&policy->byidx);
 292                rwlock_init(&policy->lock);
 293                atomic_set(&policy->refcnt, 1);
 294                skb_queue_head_init(&policy->polq.hold_queue);
 295                setup_timer(&policy->timer, xfrm_policy_timer,
 296                                (unsigned long)policy);
 297                setup_timer(&policy->polq.hold_timer, xfrm_policy_queue_process,
 298                            (unsigned long)policy);
 299                policy->flo.ops = &xfrm_policy_fc_ops;
 300        }
 301        return policy;
 302}
 303EXPORT_SYMBOL(xfrm_policy_alloc);
 304
 305/* Destroy xfrm_policy: descendant resources must be released to this moment. */
 306
 307void xfrm_policy_destroy(struct xfrm_policy *policy)
 308{
 309        BUG_ON(!policy->walk.dead);
 310
 311        if (del_timer(&policy->timer))
 312                BUG();
 313
 314        security_xfrm_policy_free(policy->security);
 315        kfree(policy);
 316}
 317EXPORT_SYMBOL(xfrm_policy_destroy);
 318
 319static void xfrm_queue_purge(struct sk_buff_head *list)
 320{
 321        struct sk_buff *skb;
 322
 323        while ((skb = skb_dequeue(list)) != NULL) {
 324                dev_put(skb->dev);
 325                kfree_skb(skb);
 326        }
 327}
 328
 329/* Rule must be locked. Release descentant resources, announce
 330 * entry dead. The rule must be unlinked from lists to the moment.
 331 */
 332
 333static void xfrm_policy_kill(struct xfrm_policy *policy)
 334{
 335        policy->walk.dead = 1;
 336
 337        atomic_inc(&policy->genid);
 338
 339        del_timer(&policy->polq.hold_timer);
 340        xfrm_queue_purge(&policy->polq.hold_queue);
 341
 342        if (del_timer(&policy->timer))
 343                xfrm_pol_put(policy);
 344
 345        xfrm_pol_put(policy);
 346}
 347
 348static unsigned int xfrm_policy_hashmax __read_mostly = 1 * 1024 * 1024;
 349
 350static inline unsigned int idx_hash(struct net *net, u32 index)
 351{
 352        return __idx_hash(index, net->xfrm.policy_idx_hmask);
 353}
 354
 355static struct hlist_head *policy_hash_bysel(struct net *net,
 356                                            const struct xfrm_selector *sel,
 357                                            unsigned short family, int dir)
 358{
 359        unsigned int hmask = net->xfrm.policy_bydst[dir].hmask;
 360        unsigned int hash = __sel_hash(sel, family, hmask);
 361
 362        return (hash == hmask + 1 ?
 363                &net->xfrm.policy_inexact[dir] :
 364                net->xfrm.policy_bydst[dir].table + hash);
 365}
 366
 367static struct hlist_head *policy_hash_direct(struct net *net,
 368                                             const xfrm_address_t *daddr,
 369                                             const xfrm_address_t *saddr,
 370                                             unsigned short family, int dir)
 371{
 372        unsigned int hmask = net->xfrm.policy_bydst[dir].hmask;
 373        unsigned int hash = __addr_hash(daddr, saddr, family, hmask);
 374
 375        return net->xfrm.policy_bydst[dir].table + hash;
 376}
 377
 378static void xfrm_dst_hash_transfer(struct hlist_head *list,
 379                                   struct hlist_head *ndsttable,
 380                                   unsigned int nhashmask)
 381{
 382        struct hlist_node *tmp, *entry0 = NULL;
 383        struct xfrm_policy *pol;
 384        unsigned int h0 = 0;
 385
 386redo:
 387        hlist_for_each_entry_safe(pol, tmp, list, bydst) {
 388                unsigned int h;
 389
 390                h = __addr_hash(&pol->selector.daddr, &pol->selector.saddr,
 391                                pol->family, nhashmask);
 392                if (!entry0) {
 393                        hlist_del(&pol->bydst);
 394                        hlist_add_head(&pol->bydst, ndsttable+h);
 395                        h0 = h;
 396                } else {
 397                        if (h != h0)
 398                                continue;
 399                        hlist_del(&pol->bydst);
 400                        hlist_add_after(entry0, &pol->bydst);
 401                }
 402                entry0 = &pol->bydst;
 403        }
 404        if (!hlist_empty(list)) {
 405                entry0 = NULL;
 406                goto redo;
 407        }
 408}
 409
 410static void xfrm_idx_hash_transfer(struct hlist_head *list,
 411                                   struct hlist_head *nidxtable,
 412                                   unsigned int nhashmask)
 413{
 414        struct hlist_node *tmp;
 415        struct xfrm_policy *pol;
 416
 417        hlist_for_each_entry_safe(pol, tmp, list, byidx) {
 418                unsigned int h;
 419
 420                h = __idx_hash(pol->index, nhashmask);
 421                hlist_add_head(&pol->byidx, nidxtable+h);
 422        }
 423}
 424
 425static unsigned long xfrm_new_hash_mask(unsigned int old_hmask)
 426{
 427        return ((old_hmask + 1) << 1) - 1;
 428}
 429
 430static void xfrm_bydst_resize(struct net *net, int dir)
 431{
 432        unsigned int hmask = net->xfrm.policy_bydst[dir].hmask;
 433        unsigned int nhashmask = xfrm_new_hash_mask(hmask);
 434        unsigned int nsize = (nhashmask + 1) * sizeof(struct hlist_head);
 435        struct hlist_head *odst = net->xfrm.policy_bydst[dir].table;
 436        struct hlist_head *ndst = xfrm_hash_alloc(nsize);
 437        int i;
 438
 439        if (!ndst)
 440                return;
 441
 442        write_lock_bh(&xfrm_policy_lock);
 443
 444        for (i = hmask; i >= 0; i--)
 445                xfrm_dst_hash_transfer(odst + i, ndst, nhashmask);
 446
 447        net->xfrm.policy_bydst[dir].table = ndst;
 448        net->xfrm.policy_bydst[dir].hmask = nhashmask;
 449
 450        write_unlock_bh(&xfrm_policy_lock);
 451
 452        xfrm_hash_free(odst, (hmask + 1) * sizeof(struct hlist_head));
 453}
 454
 455static void xfrm_byidx_resize(struct net *net, int total)
 456{
 457        unsigned int hmask = net->xfrm.policy_idx_hmask;
 458        unsigned int nhashmask = xfrm_new_hash_mask(hmask);
 459        unsigned int nsize = (nhashmask + 1) * sizeof(struct hlist_head);
 460        struct hlist_head *oidx = net->xfrm.policy_byidx;
 461        struct hlist_head *nidx = xfrm_hash_alloc(nsize);
 462        int i;
 463
 464        if (!nidx)
 465                return;
 466
 467        write_lock_bh(&xfrm_policy_lock);
 468
 469        for (i = hmask; i >= 0; i--)
 470                xfrm_idx_hash_transfer(oidx + i, nidx, nhashmask);
 471
 472        net->xfrm.policy_byidx = nidx;
 473        net->xfrm.policy_idx_hmask = nhashmask;
 474
 475        write_unlock_bh(&xfrm_policy_lock);
 476
 477        xfrm_hash_free(oidx, (hmask + 1) * sizeof(struct hlist_head));
 478}
 479
 480static inline int xfrm_bydst_should_resize(struct net *net, int dir, int *total)
 481{
 482        unsigned int cnt = net->xfrm.policy_count[dir];
 483        unsigned int hmask = net->xfrm.policy_bydst[dir].hmask;
 484
 485        if (total)
 486                *total += cnt;
 487
 488        if ((hmask + 1) < xfrm_policy_hashmax &&
 489            cnt > hmask)
 490                return 1;
 491
 492        return 0;
 493}
 494
 495static inline int xfrm_byidx_should_resize(struct net *net, int total)
 496{
 497        unsigned int hmask = net->xfrm.policy_idx_hmask;
 498
 499        if ((hmask + 1) < xfrm_policy_hashmax &&
 500            total > hmask)
 501                return 1;
 502
 503        return 0;
 504}
 505
 506void xfrm_spd_getinfo(struct net *net, struct xfrmk_spdinfo *si)
 507{
 508        read_lock_bh(&xfrm_policy_lock);
 509        si->incnt = net->xfrm.policy_count[XFRM_POLICY_IN];
 510        si->outcnt = net->xfrm.policy_count[XFRM_POLICY_OUT];
 511        si->fwdcnt = net->xfrm.policy_count[XFRM_POLICY_FWD];
 512        si->inscnt = net->xfrm.policy_count[XFRM_POLICY_IN+XFRM_POLICY_MAX];
 513        si->outscnt = net->xfrm.policy_count[XFRM_POLICY_OUT+XFRM_POLICY_MAX];
 514        si->fwdscnt = net->xfrm.policy_count[XFRM_POLICY_FWD+XFRM_POLICY_MAX];
 515        si->spdhcnt = net->xfrm.policy_idx_hmask;
 516        si->spdhmcnt = xfrm_policy_hashmax;
 517        read_unlock_bh(&xfrm_policy_lock);
 518}
 519EXPORT_SYMBOL(xfrm_spd_getinfo);
 520
 521static DEFINE_MUTEX(hash_resize_mutex);
 522static void xfrm_hash_resize(struct work_struct *work)
 523{
 524        struct net *net = container_of(work, struct net, xfrm.policy_hash_work);
 525        int dir, total;
 526
 527        mutex_lock(&hash_resize_mutex);
 528
 529        total = 0;
 530        for (dir = 0; dir < XFRM_POLICY_MAX * 2; dir++) {
 531                if (xfrm_bydst_should_resize(net, dir, &total))
 532                        xfrm_bydst_resize(net, dir);
 533        }
 534        if (xfrm_byidx_should_resize(net, total))
 535                xfrm_byidx_resize(net, total);
 536
 537        mutex_unlock(&hash_resize_mutex);
 538}
 539
 540/* Generate new index... KAME seems to generate them ordered by cost
 541 * of an absolute inpredictability of ordering of rules. This will not pass. */
 542static u32 xfrm_gen_index(struct net *net, int dir)
 543{
 544        static u32 idx_generator;
 545
 546        for (;;) {
 547                struct hlist_head *list;
 548                struct xfrm_policy *p;
 549                u32 idx;
 550                int found;
 551
 552                idx = (idx_generator | dir);
 553                idx_generator += 8;
 554                if (idx == 0)
 555                        idx = 8;
 556                list = net->xfrm.policy_byidx + idx_hash(net, idx);
 557                found = 0;
 558                hlist_for_each_entry(p, list, byidx) {
 559                        if (p->index == idx) {
 560                                found = 1;
 561                                break;
 562                        }
 563                }
 564                if (!found)
 565                        return idx;
 566        }
 567}
 568
 569static inline int selector_cmp(struct xfrm_selector *s1, struct xfrm_selector *s2)
 570{
 571        u32 *p1 = (u32 *) s1;
 572        u32 *p2 = (u32 *) s2;
 573        int len = sizeof(struct xfrm_selector) / sizeof(u32);
 574        int i;
 575
 576        for (i = 0; i < len; i++) {
 577                if (p1[i] != p2[i])
 578                        return 1;
 579        }
 580
 581        return 0;
 582}
 583
 584static void xfrm_policy_requeue(struct xfrm_policy *old,
 585                                struct xfrm_policy *new)
 586{
 587        struct xfrm_policy_queue *pq = &old->polq;
 588        struct sk_buff_head list;
 589
 590        __skb_queue_head_init(&list);
 591
 592        spin_lock_bh(&pq->hold_queue.lock);
 593        skb_queue_splice_init(&pq->hold_queue, &list);
 594        del_timer(&pq->hold_timer);
 595        spin_unlock_bh(&pq->hold_queue.lock);
 596
 597        if (skb_queue_empty(&list))
 598                return;
 599
 600        pq = &new->polq;
 601
 602        spin_lock_bh(&pq->hold_queue.lock);
 603        skb_queue_splice(&list, &pq->hold_queue);
 604        pq->timeout = XFRM_QUEUE_TMO_MIN;
 605        mod_timer(&pq->hold_timer, jiffies);
 606        spin_unlock_bh(&pq->hold_queue.lock);
 607}
 608
 609static bool xfrm_policy_mark_match(struct xfrm_policy *policy,
 610                                   struct xfrm_policy *pol)
 611{
 612        u32 mark = policy->mark.v & policy->mark.m;
 613
 614        if (policy->mark.v == pol->mark.v && policy->mark.m == pol->mark.m)
 615                return true;
 616
 617        if ((mark & pol->mark.m) == pol->mark.v &&
 618            policy->priority == pol->priority)
 619                return true;
 620
 621        return false;
 622}
 623
 624int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
 625{
 626        struct net *net = xp_net(policy);
 627        struct xfrm_policy *pol;
 628        struct xfrm_policy *delpol;
 629        struct hlist_head *chain;
 630        struct hlist_node *newpos;
 631
 632        write_lock_bh(&xfrm_policy_lock);
 633        chain = policy_hash_bysel(net, &policy->selector, policy->family, dir);
 634        delpol = NULL;
 635        newpos = NULL;
 636        hlist_for_each_entry(pol, chain, bydst) {
 637                if (pol->type == policy->type &&
 638                    !selector_cmp(&pol->selector, &policy->selector) &&
 639                    xfrm_policy_mark_match(policy, pol) &&
 640                    xfrm_sec_ctx_match(pol->security, policy->security) &&
 641                    !WARN_ON(delpol)) {
 642                        if (excl) {
 643                                write_unlock_bh(&xfrm_policy_lock);
 644                                return -EEXIST;
 645                        }
 646                        delpol = pol;
 647                        if (policy->priority > pol->priority)
 648                                continue;
 649                } else if (policy->priority >= pol->priority) {
 650                        newpos = &pol->bydst;
 651                        continue;
 652                }
 653                if (delpol)
 654                        break;
 655        }
 656        if (newpos)
 657                hlist_add_after(newpos, &policy->bydst);
 658        else
 659                hlist_add_head(&policy->bydst, chain);
 660        xfrm_pol_hold(policy);
 661        net->xfrm.policy_count[dir]++;
 662        atomic_inc(&flow_cache_genid);
 663        rt_genid_bump(net);
 664        if (delpol) {
 665                xfrm_policy_requeue(delpol, policy);
 666                __xfrm_policy_unlink(delpol, dir);
 667        }
 668        policy->index = delpol ? delpol->index : xfrm_gen_index(net, dir);
 669        hlist_add_head(&policy->byidx, net->xfrm.policy_byidx+idx_hash(net, policy->index));
 670        policy->curlft.add_time = get_seconds();
 671        policy->curlft.use_time = 0;
 672        if (!mod_timer(&policy->timer, jiffies + HZ))
 673                xfrm_pol_hold(policy);
 674        list_add(&policy->walk.all, &net->xfrm.policy_all);
 675        write_unlock_bh(&xfrm_policy_lock);
 676
 677        if (delpol)
 678                xfrm_policy_kill(delpol);
 679        else if (xfrm_bydst_should_resize(net, dir, NULL))
 680                schedule_work(&net->xfrm.policy_hash_work);
 681
 682        return 0;
 683}
 684EXPORT_SYMBOL(xfrm_policy_insert);
 685
 686struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u8 type,
 687                                          int dir, struct xfrm_selector *sel,
 688                                          struct xfrm_sec_ctx *ctx, int delete,
 689                                          int *err)
 690{
 691        struct xfrm_policy *pol, *ret;
 692        struct hlist_head *chain;
 693
 694        *err = 0;
 695        write_lock_bh(&xfrm_policy_lock);
 696        chain = policy_hash_bysel(net, sel, sel->family, dir);
 697        ret = NULL;
 698        hlist_for_each_entry(pol, chain, bydst) {
 699                if (pol->type == type &&
 700                    (mark & pol->mark.m) == pol->mark.v &&
 701                    !selector_cmp(sel, &pol->selector) &&
 702                    xfrm_sec_ctx_match(ctx, pol->security)) {
 703                        xfrm_pol_hold(pol);
 704                        if (delete) {
 705                                *err = security_xfrm_policy_delete(
 706                                                                pol->security);
 707                                if (*err) {
 708                                        write_unlock_bh(&xfrm_policy_lock);
 709                                        return pol;
 710                                }
 711                                __xfrm_policy_unlink(pol, dir);
 712                        }
 713                        ret = pol;
 714                        break;
 715                }
 716        }
 717        write_unlock_bh(&xfrm_policy_lock);
 718
 719        if (ret && delete)
 720                xfrm_policy_kill(ret);
 721        return ret;
 722}
 723EXPORT_SYMBOL(xfrm_policy_bysel_ctx);
 724
 725struct xfrm_policy *xfrm_policy_byid(struct net *net, u32 mark, u8 type,
 726                                     int dir, u32 id, int delete, int *err)
 727{
 728        struct xfrm_policy *pol, *ret;
 729        struct hlist_head *chain;
 730
 731        *err = -ENOENT;
 732        if (xfrm_policy_id2dir(id) != dir)
 733                return NULL;
 734
 735        *err = 0;
 736        write_lock_bh(&xfrm_policy_lock);
 737        chain = net->xfrm.policy_byidx + idx_hash(net, id);
 738        ret = NULL;
 739        hlist_for_each_entry(pol, chain, byidx) {
 740                if (pol->type == type && pol->index == id &&
 741                    (mark & pol->mark.m) == pol->mark.v) {
 742                        xfrm_pol_hold(pol);
 743                        if (delete) {
 744                                *err = security_xfrm_policy_delete(
 745                                                                pol->security);
 746                                if (*err) {
 747                                        write_unlock_bh(&xfrm_policy_lock);
 748                                        return pol;
 749                                }
 750                                __xfrm_policy_unlink(pol, dir);
 751                        }
 752                        ret = pol;
 753                        break;
 754                }
 755        }
 756        write_unlock_bh(&xfrm_policy_lock);
 757
 758        if (ret && delete)
 759                xfrm_policy_kill(ret);
 760        return ret;
 761}
 762EXPORT_SYMBOL(xfrm_policy_byid);
 763
 764#ifdef CONFIG_SECURITY_NETWORK_XFRM
 765static inline int
 766xfrm_policy_flush_secctx_check(struct net *net, u8 type, struct xfrm_audit *audit_info)
 767{
 768        int dir, err = 0;
 769
 770        for (dir = 0; dir < XFRM_POLICY_MAX; dir++) {
 771                struct xfrm_policy *pol;
 772                int i;
 773
 774                hlist_for_each_entry(pol,
 775                                     &net->xfrm.policy_inexact[dir], bydst) {
 776                        if (pol->type != type)
 777                                continue;
 778                        err = security_xfrm_policy_delete(pol->security);
 779                        if (err) {
 780                                xfrm_audit_policy_delete(pol, 0,
 781                                                         audit_info->loginuid,
 782                                                         audit_info->sessionid,
 783                                                         audit_info->secid);
 784                                return err;
 785                        }
 786                }
 787                for (i = net->xfrm.policy_bydst[dir].hmask; i >= 0; i--) {
 788                        hlist_for_each_entry(pol,
 789                                             net->xfrm.policy_bydst[dir].table + i,
 790                                             bydst) {
 791                                if (pol->type != type)
 792                                        continue;
 793                                err = security_xfrm_policy_delete(
 794                                                                pol->security);
 795                                if (err) {
 796                                        xfrm_audit_policy_delete(pol, 0,
 797                                                        audit_info->loginuid,
 798                                                        audit_info->sessionid,
 799                                                        audit_info->secid);
 800                                        return err;
 801                                }
 802                        }
 803                }
 804        }
 805        return err;
 806}
 807#else
 808static inline int
 809xfrm_policy_flush_secctx_check(struct net *net, u8 type, struct xfrm_audit *audit_info)
 810{
 811        return 0;
 812}
 813#endif
 814
 815int xfrm_policy_flush(struct net *net, u8 type, struct xfrm_audit *audit_info)
 816{
 817        int dir, err = 0, cnt = 0;
 818
 819        write_lock_bh(&xfrm_policy_lock);
 820
 821        err = xfrm_policy_flush_secctx_check(net, type, audit_info);
 822        if (err)
 823                goto out;
 824
 825        for (dir = 0; dir < XFRM_POLICY_MAX; dir++) {
 826                struct xfrm_policy *pol;
 827                int i;
 828
 829        again1:
 830                hlist_for_each_entry(pol,
 831                                     &net->xfrm.policy_inexact[dir], bydst) {
 832                        if (pol->type != type)
 833                                continue;
 834                        __xfrm_policy_unlink(pol, dir);
 835                        write_unlock_bh(&xfrm_policy_lock);
 836                        cnt++;
 837
 838                        xfrm_audit_policy_delete(pol, 1, audit_info->loginuid,
 839                                                 audit_info->sessionid,
 840                                                 audit_info->secid);
 841
 842                        xfrm_policy_kill(pol);
 843
 844                        write_lock_bh(&xfrm_policy_lock);
 845                        goto again1;
 846                }
 847
 848                for (i = net->xfrm.policy_bydst[dir].hmask; i >= 0; i--) {
 849        again2:
 850                        hlist_for_each_entry(pol,
 851                                             net->xfrm.policy_bydst[dir].table + i,
 852                                             bydst) {
 853                                if (pol->type != type)
 854                                        continue;
 855                                __xfrm_policy_unlink(pol, dir);
 856                                write_unlock_bh(&xfrm_policy_lock);
 857                                cnt++;
 858
 859                                xfrm_audit_policy_delete(pol, 1,
 860                                                         audit_info->loginuid,
 861                                                         audit_info->sessionid,
 862                                                         audit_info->secid);
 863                                xfrm_policy_kill(pol);
 864
 865                                write_lock_bh(&xfrm_policy_lock);
 866                                goto again2;
 867                        }
 868                }
 869
 870        }
 871        if (!cnt)
 872                err = -ESRCH;
 873out:
 874        write_unlock_bh(&xfrm_policy_lock);
 875        return err;
 876}
 877EXPORT_SYMBOL(xfrm_policy_flush);
 878
 879int xfrm_policy_walk(struct net *net, struct xfrm_policy_walk *walk,
 880                     int (*func)(struct xfrm_policy *, int, int, void*),
 881                     void *data)
 882{
 883        struct xfrm_policy *pol;
 884        struct xfrm_policy_walk_entry *x;
 885        int error = 0;
 886
 887        if (walk->type >= XFRM_POLICY_TYPE_MAX &&
 888            walk->type != XFRM_POLICY_TYPE_ANY)
 889                return -EINVAL;
 890
 891        if (list_empty(&walk->walk.all) && walk->seq != 0)
 892                return 0;
 893
 894        write_lock_bh(&xfrm_policy_lock);
 895        if (list_empty(&walk->walk.all))
 896                x = list_first_entry(&net->xfrm.policy_all, struct xfrm_policy_walk_entry, all);
 897        else
 898                x = list_entry(&walk->walk.all, struct xfrm_policy_walk_entry, all);
 899        list_for_each_entry_from(x, &net->xfrm.policy_all, all) {
 900                if (x->dead)
 901                        continue;
 902                pol = container_of(x, struct xfrm_policy, walk);
 903                if (walk->type != XFRM_POLICY_TYPE_ANY &&
 904                    walk->type != pol->type)
 905                        continue;
 906                error = func(pol, xfrm_policy_id2dir(pol->index),
 907                             walk->seq, data);
 908                if (error) {
 909                        list_move_tail(&walk->walk.all, &x->all);
 910                        goto out;
 911                }
 912                walk->seq++;
 913        }
 914        if (walk->seq == 0) {
 915                error = -ENOENT;
 916                goto out;
 917        }
 918        list_del_init(&walk->walk.all);
 919out:
 920        write_unlock_bh(&xfrm_policy_lock);
 921        return error;
 922}
 923EXPORT_SYMBOL(xfrm_policy_walk);
 924
 925void xfrm_policy_walk_init(struct xfrm_policy_walk *walk, u8 type)
 926{
 927        INIT_LIST_HEAD(&walk->walk.all);
 928        walk->walk.dead = 1;
 929        walk->type = type;
 930        walk->seq = 0;
 931}
 932EXPORT_SYMBOL(xfrm_policy_walk_init);
 933
 934void xfrm_policy_walk_done(struct xfrm_policy_walk *walk)
 935{
 936        if (list_empty(&walk->walk.all))
 937                return;
 938
 939        write_lock_bh(&xfrm_policy_lock);
 940        list_del(&walk->walk.all);
 941        write_unlock_bh(&xfrm_policy_lock);
 942}
 943EXPORT_SYMBOL(xfrm_policy_walk_done);
 944
 945/*
 946 * Find policy to apply to this flow.
 947 *
 948 * Returns 0 if policy found, else an -errno.
 949 */
 950static int xfrm_policy_match(const struct xfrm_policy *pol,
 951                             const struct flowi *fl,
 952                             u8 type, u16 family, int dir)
 953{
 954        const struct xfrm_selector *sel = &pol->selector;
 955        int ret = -ESRCH;
 956        bool match;
 957
 958        if (pol->family != family ||
 959            (fl->flowi_mark & pol->mark.m) != pol->mark.v ||
 960            pol->type != type)
 961                return ret;
 962
 963        match = xfrm_selector_match(sel, fl, family);
 964        if (match)
 965                ret = security_xfrm_policy_lookup(pol->security, fl->flowi_secid,
 966                                                  dir);
 967
 968        return ret;
 969}
 970
 971static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type,
 972                                                     const struct flowi *fl,
 973                                                     u16 family, u8 dir)
 974{
 975        int err;
 976        struct xfrm_policy *pol, *ret;
 977        const xfrm_address_t *daddr, *saddr;
 978        struct hlist_head *chain;
 979        u32 priority = ~0U;
 980
 981        daddr = xfrm_flowi_daddr(fl, family);
 982        saddr = xfrm_flowi_saddr(fl, family);
 983        if (unlikely(!daddr || !saddr))
 984                return NULL;
 985
 986        read_lock_bh(&xfrm_policy_lock);
 987        chain = policy_hash_direct(net, daddr, saddr, family, dir);
 988        ret = NULL;
 989        hlist_for_each_entry(pol, chain, bydst) {
 990                err = xfrm_policy_match(pol, fl, type, family, dir);
 991                if (err) {
 992                        if (err == -ESRCH)
 993                                continue;
 994                        else {
 995                                ret = ERR_PTR(err);
 996                                goto fail;
 997                        }
 998                } else {
 999                        ret = pol;
1000                        priority = ret->priority;
1001                        break;
1002                }
1003        }
1004        chain = &net->xfrm.policy_inexact[dir];
1005        hlist_for_each_entry(pol, chain, bydst) {
1006                err = xfrm_policy_match(pol, fl, type, family, dir);
1007                if (err) {
1008                        if (err == -ESRCH)
1009                                continue;
1010                        else {
1011                                ret = ERR_PTR(err);
1012                                goto fail;
1013                        }
1014                } else if (pol->priority < priority) {
1015                        ret = pol;
1016                        break;
1017                }
1018        }
1019        if (ret)
1020                xfrm_pol_hold(ret);
1021fail:
1022        read_unlock_bh(&xfrm_policy_lock);
1023
1024        return ret;
1025}
1026
1027static struct xfrm_policy *
1028__xfrm_policy_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir)
1029{
1030#ifdef CONFIG_XFRM_SUB_POLICY
1031        struct xfrm_policy *pol;
1032
1033        pol = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_SUB, fl, family, dir);
1034        if (pol != NULL)
1035                return pol;
1036#endif
1037        return xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_MAIN, fl, family, dir);
1038}
1039
1040static int flow_to_policy_dir(int dir)
1041{
1042        if (XFRM_POLICY_IN == FLOW_DIR_IN &&
1043            XFRM_POLICY_OUT == FLOW_DIR_OUT &&
1044            XFRM_POLICY_FWD == FLOW_DIR_FWD)
1045                return dir;
1046
1047        switch (dir) {
1048        default:
1049        case FLOW_DIR_IN:
1050                return XFRM_POLICY_IN;
1051        case FLOW_DIR_OUT:
1052                return XFRM_POLICY_OUT;
1053        case FLOW_DIR_FWD:
1054                return XFRM_POLICY_FWD;
1055        }
1056}
1057
1058static struct flow_cache_object *
1059xfrm_policy_lookup(struct net *net, const struct flowi *fl, u16 family,
1060                   u8 dir, struct flow_cache_object *old_obj, void *ctx)
1061{
1062        struct xfrm_policy *pol;
1063
1064        if (old_obj)
1065                xfrm_pol_put(container_of(old_obj, struct xfrm_policy, flo));
1066
1067        pol = __xfrm_policy_lookup(net, fl, family, flow_to_policy_dir(dir));
1068        if (IS_ERR_OR_NULL(pol))
1069                return ERR_CAST(pol);
1070
1071        /* Resolver returns two references:
1072         * one for cache and one for caller of flow_cache_lookup() */
1073        xfrm_pol_hold(pol);
1074
1075        return &pol->flo;
1076}
1077
1078static inline int policy_to_flow_dir(int dir)
1079{
1080        if (XFRM_POLICY_IN == FLOW_DIR_IN &&
1081            XFRM_POLICY_OUT == FLOW_DIR_OUT &&
1082            XFRM_POLICY_FWD == FLOW_DIR_FWD)
1083                return dir;
1084        switch (dir) {
1085        default:
1086        case XFRM_POLICY_IN:
1087                return FLOW_DIR_IN;
1088        case XFRM_POLICY_OUT:
1089                return FLOW_DIR_OUT;
1090        case XFRM_POLICY_FWD:
1091                return FLOW_DIR_FWD;
1092        }
1093}
1094
1095static struct xfrm_policy *xfrm_sk_policy_lookup(struct sock *sk, int dir,
1096                                                 const struct flowi *fl)
1097{
1098        struct xfrm_policy *pol;
1099
1100        read_lock_bh(&xfrm_policy_lock);
1101        if ((pol = sk->sk_policy[dir]) != NULL) {
1102                bool match = xfrm_selector_match(&pol->selector, fl,
1103                                                 sk->sk_family);
1104                int err = 0;
1105
1106                if (match) {
1107                        if ((sk->sk_mark & pol->mark.m) != pol->mark.v) {
1108                                pol = NULL;
1109                                goto out;
1110                        }
1111                        err = security_xfrm_policy_lookup(pol->security,
1112                                                      fl->flowi_secid,
1113                                                      policy_to_flow_dir(dir));
1114                        if (!err)
1115                                xfrm_pol_hold(pol);
1116                        else if (err == -ESRCH)
1117                                pol = NULL;
1118                        else
1119                                pol = ERR_PTR(err);
1120                } else
1121                        pol = NULL;
1122        }
1123out:
1124        read_unlock_bh(&xfrm_policy_lock);
1125        return pol;
1126}
1127
1128static void __xfrm_policy_link(struct xfrm_policy *pol, int dir)
1129{
1130        struct net *net = xp_net(pol);
1131        struct hlist_head *chain = policy_hash_bysel(net, &pol->selector,
1132                                                     pol->family, dir);
1133
1134        list_add(&pol->walk.all, &net->xfrm.policy_all);
1135        hlist_add_head(&pol->bydst, chain);
1136        hlist_add_head(&pol->byidx, net->xfrm.policy_byidx+idx_hash(net, pol->index));
1137        net->xfrm.policy_count[dir]++;
1138        xfrm_pol_hold(pol);
1139
1140        if (xfrm_bydst_should_resize(net, dir, NULL))
1141                schedule_work(&net->xfrm.policy_hash_work);
1142}
1143
1144static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol,
1145                                                int dir)
1146{
1147        struct net *net = xp_net(pol);
1148
1149        if (hlist_unhashed(&pol->bydst))
1150                return NULL;
1151
1152        hlist_del(&pol->bydst);
1153        hlist_del(&pol->byidx);
1154        list_del(&pol->walk.all);
1155        net->xfrm.policy_count[dir]--;
1156
1157        return pol;
1158}
1159
1160int xfrm_policy_delete(struct xfrm_policy *pol, int dir)
1161{
1162        write_lock_bh(&xfrm_policy_lock);
1163        pol = __xfrm_policy_unlink(pol, dir);
1164        write_unlock_bh(&xfrm_policy_lock);
1165        if (pol) {
1166                xfrm_policy_kill(pol);
1167                return 0;
1168        }
1169        return -ENOENT;
1170}
1171EXPORT_SYMBOL(xfrm_policy_delete);
1172
1173int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol)
1174{
1175        struct net *net = xp_net(pol);
1176        struct xfrm_policy *old_pol;
1177
1178#ifdef CONFIG_XFRM_SUB_POLICY
1179        if (pol && pol->type != XFRM_POLICY_TYPE_MAIN)
1180                return -EINVAL;
1181#endif
1182
1183        write_lock_bh(&xfrm_policy_lock);
1184        old_pol = sk->sk_policy[dir];
1185        sk->sk_policy[dir] = pol;
1186        if (pol) {
1187                pol->curlft.add_time = get_seconds();
1188                pol->index = xfrm_gen_index(net, XFRM_POLICY_MAX+dir);
1189                __xfrm_policy_link(pol, XFRM_POLICY_MAX+dir);
1190        }
1191        if (old_pol) {
1192                if (pol)
1193                        xfrm_policy_requeue(old_pol, pol);
1194
1195                /* Unlinking succeeds always. This is the only function
1196                 * allowed to delete or replace socket policy.
1197                 */
1198                __xfrm_policy_unlink(old_pol, XFRM_POLICY_MAX+dir);
1199        }
1200        write_unlock_bh(&xfrm_policy_lock);
1201
1202        if (old_pol) {
1203                xfrm_policy_kill(old_pol);
1204        }
1205        return 0;
1206}
1207
1208static struct xfrm_policy *clone_policy(const struct xfrm_policy *old, int dir)
1209{
1210        struct xfrm_policy *newp = xfrm_policy_alloc(xp_net(old), GFP_ATOMIC);
1211
1212        if (newp) {
1213                newp->selector = old->selector;
1214                if (security_xfrm_policy_clone(old->security,
1215                                               &newp->security)) {
1216                        kfree(newp);
1217                        return NULL;  /* ENOMEM */
1218                }
1219                newp->lft = old->lft;
1220                newp->curlft = old->curlft;
1221                newp->mark = old->mark;
1222                newp->action = old->action;
1223                newp->flags = old->flags;
1224                newp->xfrm_nr = old->xfrm_nr;
1225                newp->index = old->index;
1226                newp->type = old->type;
1227                memcpy(newp->xfrm_vec, old->xfrm_vec,
1228                       newp->xfrm_nr*sizeof(struct xfrm_tmpl));
1229                write_lock_bh(&xfrm_policy_lock);
1230                __xfrm_policy_link(newp, XFRM_POLICY_MAX+dir);
1231                write_unlock_bh(&xfrm_policy_lock);
1232                xfrm_pol_put(newp);
1233        }
1234        return newp;
1235}
1236
1237int __xfrm_sk_clone_policy(struct sock *sk)
1238{
1239        struct xfrm_policy *p0 = sk->sk_policy[0],
1240                           *p1 = sk->sk_policy[1];
1241
1242        sk->sk_policy[0] = sk->sk_policy[1] = NULL;
1243        if (p0 && (sk->sk_policy[0] = clone_policy(p0, 0)) == NULL)
1244                return -ENOMEM;
1245        if (p1 && (sk->sk_policy[1] = clone_policy(p1, 1)) == NULL)
1246                return -ENOMEM;
1247        return 0;
1248}
1249
1250static int
1251xfrm_get_saddr(struct net *net, xfrm_address_t *local, xfrm_address_t *remote,
1252               unsigned short family)
1253{
1254        int err;
1255        struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
1256
1257        if (unlikely(afinfo == NULL))
1258                return -EINVAL;
1259        err = afinfo->get_saddr(net, local, remote);
1260        xfrm_policy_put_afinfo(afinfo);
1261        return err;
1262}
1263
1264/* Resolve list of templates for the flow, given policy. */
1265
1266static int
1267xfrm_tmpl_resolve_one(struct xfrm_policy *policy, const struct flowi *fl,
1268                      struct xfrm_state **xfrm, unsigned short family)
1269{
1270        struct net *net = xp_net(policy);
1271        int nx;
1272        int i, error;
1273        xfrm_address_t *daddr = xfrm_flowi_daddr(fl, family);
1274        xfrm_address_t *saddr = xfrm_flowi_saddr(fl, family);
1275        xfrm_address_t tmp;
1276
1277        for (nx=0, i = 0; i < policy->xfrm_nr; i++) {
1278                struct xfrm_state *x;
1279                xfrm_address_t *remote = daddr;
1280                xfrm_address_t *local  = saddr;
1281                struct xfrm_tmpl *tmpl = &policy->xfrm_vec[i];
1282
1283                if (tmpl->mode == XFRM_MODE_TUNNEL ||
1284                    tmpl->mode == XFRM_MODE_BEET) {
1285                        remote = &tmpl->id.daddr;
1286                        local = &tmpl->saddr;
1287                        if (xfrm_addr_any(local, tmpl->encap_family)) {
1288                                error = xfrm_get_saddr(net, &tmp, remote, tmpl->encap_family);
1289                                if (error)
1290                                        goto fail;
1291                                local = &tmp;
1292                        }
1293                }
1294
1295                x = xfrm_state_find(remote, local, fl, tmpl, policy, &error, family);
1296
1297                if (x && x->km.state == XFRM_STATE_VALID) {
1298                        xfrm[nx++] = x;
1299                        daddr = remote;
1300                        saddr = local;
1301                        continue;
1302                }
1303                if (x) {
1304                        error = (x->km.state == XFRM_STATE_ERROR ?
1305                                 -EINVAL : -EAGAIN);
1306                        xfrm_state_put(x);
1307                }
1308                else if (error == -ESRCH)
1309                        error = -EAGAIN;
1310
1311                if (!tmpl->optional)
1312                        goto fail;
1313        }
1314        return nx;
1315
1316fail:
1317        for (nx--; nx>=0; nx--)
1318                xfrm_state_put(xfrm[nx]);
1319        return error;
1320}
1321
1322static int
1323xfrm_tmpl_resolve(struct xfrm_policy **pols, int npols, const struct flowi *fl,
1324                  struct xfrm_state **xfrm, unsigned short family)
1325{
1326        struct xfrm_state *tp[XFRM_MAX_DEPTH];
1327        struct xfrm_state **tpp = (npols > 1) ? tp : xfrm;
1328        int cnx = 0;
1329        int error;
1330        int ret;
1331        int i;
1332
1333        for (i = 0; i < npols; i++) {
1334                if (cnx + pols[i]->xfrm_nr >= XFRM_MAX_DEPTH) {
1335                        error = -ENOBUFS;
1336                        goto fail;
1337                }
1338
1339                ret = xfrm_tmpl_resolve_one(pols[i], fl, &tpp[cnx], family);
1340                if (ret < 0) {
1341                        error = ret;
1342                        goto fail;
1343                } else
1344                        cnx += ret;
1345        }
1346
1347        /* found states are sorted for outbound processing */
1348        if (npols > 1)
1349                xfrm_state_sort(xfrm, tpp, cnx, family);
1350
1351        return cnx;
1352
1353 fail:
1354        for (cnx--; cnx>=0; cnx--)
1355                xfrm_state_put(tpp[cnx]);
1356        return error;
1357
1358}
1359
1360/* Check that the bundle accepts the flow and its components are
1361 * still valid.
1362 */
1363
1364static inline int xfrm_get_tos(const struct flowi *fl, int family)
1365{
1366        struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
1367        int tos;
1368
1369        if (!afinfo)
1370                return -EINVAL;
1371
1372        tos = afinfo->get_tos(fl);
1373
1374        xfrm_policy_put_afinfo(afinfo);
1375
1376        return tos;
1377}
1378
1379static struct flow_cache_object *xfrm_bundle_flo_get(struct flow_cache_object *flo)
1380{
1381        struct xfrm_dst *xdst = container_of(flo, struct xfrm_dst, flo);
1382        struct dst_entry *dst = &xdst->u.dst;
1383
1384        if (xdst->route == NULL) {
1385                /* Dummy bundle - if it has xfrms we were not
1386                 * able to build bundle as template resolution failed.
1387                 * It means we need to try again resolving. */
1388                if (xdst->num_xfrms > 0)
1389                        return NULL;
1390        } else if (dst->flags & DST_XFRM_QUEUE) {
1391                return NULL;
1392        } else {
1393                /* Real bundle */
1394                if (stale_bundle(dst))
1395                        return NULL;
1396        }
1397
1398        dst_hold(dst);
1399        return flo;
1400}
1401
1402static int xfrm_bundle_flo_check(struct flow_cache_object *flo)
1403{
1404        struct xfrm_dst *xdst = container_of(flo, struct xfrm_dst, flo);
1405        struct dst_entry *dst = &xdst->u.dst;
1406
1407        if (!xdst->route)
1408                return 0;
1409        if (stale_bundle(dst))
1410                return 0;
1411
1412        return 1;
1413}
1414
1415static void xfrm_bundle_flo_delete(struct flow_cache_object *flo)
1416{
1417        struct xfrm_dst *xdst = container_of(flo, struct xfrm_dst, flo);
1418        struct dst_entry *dst = &xdst->u.dst;
1419
1420        dst_free(dst);
1421}
1422
1423static const struct flow_cache_ops xfrm_bundle_fc_ops = {
1424        .get = xfrm_bundle_flo_get,
1425        .check = xfrm_bundle_flo_check,
1426        .delete = xfrm_bundle_flo_delete,
1427};
1428
1429static inline struct xfrm_dst *xfrm_alloc_dst(struct net *net, int family)
1430{
1431        struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
1432        struct dst_ops *dst_ops;
1433        struct xfrm_dst *xdst;
1434
1435        if (!afinfo)
1436                return ERR_PTR(-EINVAL);
1437
1438        switch (family) {
1439        case AF_INET:
1440                dst_ops = &net->xfrm.xfrm4_dst_ops;
1441                break;
1442#if IS_ENABLED(CONFIG_IPV6)
1443        case AF_INET6:
1444                dst_ops = &net->xfrm.xfrm6_dst_ops;
1445                break;
1446#endif
1447        default:
1448                BUG();
1449        }
1450        xdst = dst_alloc(dst_ops, NULL, 0, DST_OBSOLETE_NONE, 0);
1451
1452        if (likely(xdst)) {
1453                struct dst_entry *dst = &xdst->u.dst;
1454
1455                memset(dst + 1, 0, sizeof(*xdst) - sizeof(*dst));
1456                xdst->flo.ops = &xfrm_bundle_fc_ops;
1457                if (afinfo->init_dst)
1458                        afinfo->init_dst(net, xdst);
1459        } else
1460                xdst = ERR_PTR(-ENOBUFS);
1461
1462        xfrm_policy_put_afinfo(afinfo);
1463
1464        return xdst;
1465}
1466
1467static inline int xfrm_init_path(struct xfrm_dst *path, struct dst_entry *dst,
1468                                 int nfheader_len)
1469{
1470        struct xfrm_policy_afinfo *afinfo =
1471                xfrm_policy_get_afinfo(dst->ops->family);
1472        int err;
1473
1474        if (!afinfo)
1475                return -EINVAL;
1476
1477        err = afinfo->init_path(path, dst, nfheader_len);
1478
1479        xfrm_policy_put_afinfo(afinfo);
1480
1481        return err;
1482}
1483
1484static inline int xfrm_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
1485                                const struct flowi *fl)
1486{
1487        struct xfrm_policy_afinfo *afinfo =
1488                xfrm_policy_get_afinfo(xdst->u.dst.ops->family);
1489        int err;
1490
1491        if (!afinfo)
1492                return -EINVAL;
1493
1494        err = afinfo->fill_dst(xdst, dev, fl);
1495
1496        xfrm_policy_put_afinfo(afinfo);
1497
1498        return err;
1499}
1500
1501
1502/* Allocate chain of dst_entry's, attach known xfrm's, calculate
1503 * all the metrics... Shortly, bundle a bundle.
1504 */
1505
1506static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy,
1507                                            struct xfrm_state **xfrm, int nx,
1508                                            const struct flowi *fl,
1509                                            struct dst_entry *dst)
1510{
1511        struct net *net = xp_net(policy);
1512        unsigned long now = jiffies;
1513        struct net_device *dev;
1514        struct xfrm_mode *inner_mode;
1515        struct dst_entry *dst_prev = NULL;
1516        struct dst_entry *dst0 = NULL;
1517        int i = 0;
1518        int err;
1519        int header_len = 0;
1520        int nfheader_len = 0;
1521        int trailer_len = 0;
1522        int tos;
1523        int family = policy->selector.family;
1524        xfrm_address_t saddr, daddr;
1525
1526        xfrm_flowi_addr_get(fl, &saddr, &daddr, family);
1527
1528        tos = xfrm_get_tos(fl, family);
1529        err = tos;
1530        if (tos < 0)
1531                goto put_states;
1532
1533        dst_hold(dst);
1534
1535        for (; i < nx; i++) {
1536                struct xfrm_dst *xdst = xfrm_alloc_dst(net, family);
1537                struct dst_entry *dst1 = &xdst->u.dst;
1538
1539                err = PTR_ERR(xdst);
1540                if (IS_ERR(xdst)) {
1541                        dst_release(dst);
1542                        goto put_states;
1543                }
1544
1545                if (xfrm[i]->sel.family == AF_UNSPEC) {
1546                        inner_mode = xfrm_ip2inner_mode(xfrm[i],
1547                                                        xfrm_af2proto(family));
1548                        if (!inner_mode) {
1549                                err = -EAFNOSUPPORT;
1550                                dst_release(dst);
1551                                goto put_states;
1552                        }
1553                } else
1554                        inner_mode = xfrm[i]->inner_mode;
1555
1556                if (!dst_prev)
1557                        dst0 = dst1;
1558                else {
1559                        dst_prev->child = dst_clone(dst1);
1560                        dst1->flags |= DST_NOHASH;
1561                }
1562
1563                xdst->route = dst;
1564                dst_copy_metrics(dst1, dst);
1565
1566                if (xfrm[i]->props.mode != XFRM_MODE_TRANSPORT) {
1567                        family = xfrm[i]->props.family;
1568                        dst = xfrm_dst_lookup(xfrm[i], tos, &saddr, &daddr,
1569                                              family);
1570                        err = PTR_ERR(dst);
1571                        if (IS_ERR(dst))
1572                                goto put_states;
1573                } else
1574                        dst_hold(dst);
1575
1576                dst1->xfrm = xfrm[i];
1577                xdst->xfrm_genid = xfrm[i]->genid;
1578
1579                dst1->obsolete = DST_OBSOLETE_FORCE_CHK;
1580                dst1->flags |= DST_HOST;
1581                dst1->lastuse = now;
1582
1583                dst1->input = dst_discard;
1584                dst1->output = inner_mode->afinfo->output;
1585
1586                dst1->next = dst_prev;
1587                dst_prev = dst1;
1588
1589                header_len += xfrm[i]->props.header_len;
1590                if (xfrm[i]->type->flags & XFRM_TYPE_NON_FRAGMENT)
1591                        nfheader_len += xfrm[i]->props.header_len;
1592                trailer_len += xfrm[i]->props.trailer_len;
1593        }
1594
1595        dst_prev->child = dst;
1596        dst0->path = dst;
1597
1598        err = -ENODEV;
1599        dev = dst->dev;
1600        if (!dev)
1601                goto free_dst;
1602
1603        xfrm_init_path((struct xfrm_dst *)dst0, dst, nfheader_len);
1604        xfrm_init_pmtu(dst_prev);
1605
1606        for (dst_prev = dst0; dst_prev != dst; dst_prev = dst_prev->child) {
1607                struct xfrm_dst *xdst = (struct xfrm_dst *)dst_prev;
1608
1609                err = xfrm_fill_dst(xdst, dev, fl);
1610                if (err)
1611                        goto free_dst;
1612
1613                dst_prev->header_len = header_len;
1614                dst_prev->trailer_len = trailer_len;
1615                header_len -= xdst->u.dst.xfrm->props.header_len;
1616                trailer_len -= xdst->u.dst.xfrm->props.trailer_len;
1617        }
1618
1619out:
1620        return dst0;
1621
1622put_states:
1623        for (; i < nx; i++)
1624                xfrm_state_put(xfrm[i]);
1625free_dst:
1626        if (dst0)
1627                dst_free(dst0);
1628        dst0 = ERR_PTR(err);
1629        goto out;
1630}
1631
1632static int inline
1633xfrm_dst_alloc_copy(void **target, const void *src, int size)
1634{
1635        if (!*target) {
1636                *target = kmalloc(size, GFP_ATOMIC);
1637                if (!*target)
1638                        return -ENOMEM;
1639        }
1640        memcpy(*target, src, size);
1641        return 0;
1642}
1643
/*
 * With CONFIG_XFRM_SUB_POLICY: store a copy of selector @sel in the
 * bundle's ->partner buffer (allocated on first use). Without it this is
 * a no-op.
 *
 * Returns 0 on success, or the error from xfrm_dst_alloc_copy().
 */
static inline int
xfrm_dst_update_parent(struct dst_entry *dst, const struct xfrm_selector *sel)
{
#ifdef CONFIG_XFRM_SUB_POLICY
	struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
	return xfrm_dst_alloc_copy((void **)&(xdst->partner),
				   sel, sizeof(*sel));
#else
	return 0;
#endif
}
1655
/*
 * With CONFIG_XFRM_SUB_POLICY: store a copy of flow @fl in the bundle's
 * ->origin buffer (allocated on first use). Without it this is a no-op.
 *
 * Returns 0 on success, or the error from xfrm_dst_alloc_copy().
 */
static inline int
xfrm_dst_update_origin(struct dst_entry *dst, const struct flowi *fl)
{
#ifdef CONFIG_XFRM_SUB_POLICY
	struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
	return xfrm_dst_alloc_copy((void **)&(xdst->origin), fl, sizeof(*fl));
#else
	return 0;
#endif
}
1666
1667static int xfrm_expand_policies(const struct flowi *fl, u16 family,
1668                                struct xfrm_policy **pols,
1669                                int *num_pols, int *num_xfrms)
1670{
1671        int i;
1672
1673        if (*num_pols == 0 || !pols[0]) {
1674                *num_pols = 0;
1675                *num_xfrms = 0;
1676                return 0;
1677        }
1678        if (IS_ERR(pols[0]))
1679                return PTR_ERR(pols[0]);
1680
1681        *num_xfrms = pols[0]->xfrm_nr;
1682
1683#ifdef CONFIG_XFRM_SUB_POLICY
1684        if (pols[0] && pols[0]->action == XFRM_POLICY_ALLOW &&
1685            pols[0]->type != XFRM_POLICY_TYPE_MAIN) {
1686                pols[1] = xfrm_policy_lookup_bytype(xp_net(pols[0]),
1687                                                    XFRM_POLICY_TYPE_MAIN,
1688                                                    fl, family,
1689                                                    XFRM_POLICY_OUT);
1690                if (pols[1]) {
1691                        if (IS_ERR(pols[1])) {
1692                                xfrm_pols_put(pols, *num_pols);
1693                                return PTR_ERR(pols[1]);
1694                        }
1695                        (*num_pols) ++;
1696                        (*num_xfrms) += pols[1]->xfrm_nr;
1697                }
1698        }
1699#endif
1700        for (i = 0; i < *num_pols; i++) {
1701                if (pols[i]->action != XFRM_POLICY_ALLOW) {
1702                        *num_xfrms = -1;
1703                        break;
1704                }
1705        }
1706
1707        return 0;
1708
1709}
1710
1711static struct xfrm_dst *
1712xfrm_resolve_and_create_bundle(struct xfrm_policy **pols, int num_pols,
1713                               const struct flowi *fl, u16 family,
1714                               struct dst_entry *dst_orig)
1715{
1716        struct net *net = xp_net(pols[0]);
1717        struct xfrm_state *xfrm[XFRM_MAX_DEPTH];
1718        struct dst_entry *dst;
1719        struct xfrm_dst *xdst;
1720        int err;
1721
1722        /* Try to instantiate a bundle */
1723        err = xfrm_tmpl_resolve(pols, num_pols, fl, xfrm, family);
1724        if (err <= 0) {
1725                if (err != 0 && err != -EAGAIN)
1726                        XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR);
1727                return ERR_PTR(err);
1728        }
1729
1730        dst = xfrm_bundle_create(pols[0], xfrm, err, fl, dst_orig);
1731        if (IS_ERR(dst)) {
1732                XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTBUNDLEGENERROR);
1733                return ERR_CAST(dst);
1734        }
1735
1736        xdst = (struct xfrm_dst *)dst;
1737        xdst->num_xfrms = err;
1738        if (num_pols > 1)
1739                err = xfrm_dst_update_parent(dst, &pols[1]->selector);
1740        else
1741                err = xfrm_dst_update_origin(dst, fl);
1742        if (unlikely(err)) {
1743                dst_free(dst);
1744                XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTBUNDLECHECKERROR);
1745                return ERR_PTR(err);
1746        }
1747
1748        xdst->num_pols = num_pols;
1749        memcpy(xdst->pols, pols, sizeof(struct xfrm_policy*) * num_pols);
1750        xdst->policy_genid = atomic_read(&pols[0]->genid);
1751
1752        return xdst;
1753}
1754
1755static void xfrm_policy_queue_process(unsigned long arg)
1756{
1757        int err = 0;
1758        struct sk_buff *skb;
1759        struct sock *sk;
1760        struct dst_entry *dst;
1761        struct net_device *dev;
1762        struct xfrm_policy *pol = (struct xfrm_policy *)arg;
1763        struct xfrm_policy_queue *pq = &pol->polq;
1764        struct flowi fl;
1765        struct sk_buff_head list;
1766
1767        spin_lock(&pq->hold_queue.lock);
1768        skb = skb_peek(&pq->hold_queue);
1769        dst = skb_dst(skb);
1770        sk = skb->sk;
1771        xfrm_decode_session(skb, &fl, dst->ops->family);
1772        spin_unlock(&pq->hold_queue.lock);
1773
1774        dst_hold(dst->path);
1775        dst = xfrm_lookup(xp_net(pol), dst->path, &fl,
1776                          sk, 0);
1777        if (IS_ERR(dst))
1778                goto purge_queue;
1779
1780        if (dst->flags & DST_XFRM_QUEUE) {
1781                dst_release(dst);
1782
1783                if (pq->timeout >= XFRM_QUEUE_TMO_MAX)
1784                        goto purge_queue;
1785
1786                pq->timeout = pq->timeout << 1;
1787                mod_timer(&pq->hold_timer, jiffies + pq->timeout);
1788                return;
1789        }
1790
1791        dst_release(dst);
1792
1793        __skb_queue_head_init(&list);
1794
1795        spin_lock(&pq->hold_queue.lock);
1796        pq->timeout = 0;
1797        skb_queue_splice_init(&pq->hold_queue, &list);
1798        spin_unlock(&pq->hold_queue.lock);
1799
1800        while (!skb_queue_empty(&list)) {
1801                skb = __skb_dequeue(&list);
1802
1803                xfrm_decode_session(skb, &fl, skb_dst(skb)->ops->family);
1804                dst_hold(skb_dst(skb)->path);
1805                dst = xfrm_lookup(xp_net(pol), skb_dst(skb)->path,
1806                                  &fl, skb->sk, 0);
1807                if (IS_ERR(dst)) {
1808                        dev_put(skb->dev);
1809                        kfree_skb(skb);
1810                        continue;
1811                }
1812
1813                nf_reset(skb);
1814                skb_dst_drop(skb);
1815                skb_dst_set(skb, dst);
1816
1817                dev = skb->dev;
1818                err = dst_output(skb);
1819                dev_put(dev);
1820        }
1821
1822        return;
1823
1824purge_queue:
1825        pq->timeout = 0;
1826        xfrm_queue_purge(&pq->hold_queue);
1827}
1828
1829static int xdst_queue_output(struct sk_buff *skb)
1830{
1831        unsigned long sched_next;
1832        struct dst_entry *dst = skb_dst(skb);
1833        struct xfrm_dst *xdst = (struct xfrm_dst *) dst;
1834        struct xfrm_policy_queue *pq = &xdst->pols[0]->polq;
1835
1836        if (pq->hold_queue.qlen > XFRM_MAX_QUEUE_LEN) {
1837                kfree_skb(skb);
1838                return -EAGAIN;
1839        }
1840
1841        skb_dst_force(skb);
1842        dev_hold(skb->dev);
1843
1844        spin_lock_bh(&pq->hold_queue.lock);
1845
1846        if (!pq->timeout)
1847                pq->timeout = XFRM_QUEUE_TMO_MIN;
1848
1849        sched_next = jiffies + pq->timeout;
1850
1851        if (del_timer(&pq->hold_timer)) {
1852                if (time_before(pq->hold_timer.expires, sched_next))
1853                        sched_next = pq->hold_timer.expires;
1854        }
1855
1856        __skb_queue_tail(&pq->hold_queue, skb);
1857        mod_timer(&pq->hold_timer, sched_next);
1858
1859        spin_unlock_bh(&pq->hold_queue.lock);
1860
1861        return 0;
1862}
1863
1864static struct xfrm_dst *xfrm_create_dummy_bundle(struct net *net,
1865                                                 struct dst_entry *dst,
1866                                                 const struct flowi *fl,
1867                                                 int num_xfrms,
1868                                                 u16 family)
1869{
1870        int err;
1871        struct net_device *dev;
1872        struct dst_entry *dst1;
1873        struct xfrm_dst *xdst;
1874
1875        xdst = xfrm_alloc_dst(net, family);
1876        if (IS_ERR(xdst))
1877                return xdst;
1878
1879        if (net->xfrm.sysctl_larval_drop || num_xfrms <= 0 ||
1880            (fl->flowi_flags & FLOWI_FLAG_CAN_SLEEP))
1881                return xdst;
1882
1883        dst1 = &xdst->u.dst;
1884        dst_hold(dst);
1885        xdst->route = dst;
1886
1887        dst_copy_metrics(dst1, dst);
1888
1889        dst1->obsolete = DST_OBSOLETE_FORCE_CHK;
1890        dst1->flags |= DST_HOST | DST_XFRM_QUEUE;
1891        dst1->lastuse = jiffies;
1892
1893        dst1->input = dst_discard;
1894        dst1->output = xdst_queue_output;
1895
1896        dst_hold(dst);
1897        dst1->child = dst;
1898        dst1->path = dst;
1899
1900        xfrm_init_path((struct xfrm_dst *)dst1, dst, 0);
1901
1902        err = -ENODEV;
1903        dev = dst->dev;
1904        if (!dev)
1905                goto free_dst;
1906
1907        err = xfrm_fill_dst(xdst, dev, fl);
1908        if (err)
1909                goto free_dst;
1910
1911out:
1912        return xdst;
1913
1914free_dst:
1915        dst_release(dst1);
1916        xdst = ERR_PTR(err);
1917        goto out;
1918}
1919
1920static struct flow_cache_object *
1921xfrm_bundle_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir,
1922                   struct flow_cache_object *oldflo, void *ctx)
1923{
1924        struct dst_entry *dst_orig = (struct dst_entry *)ctx;
1925        struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX];
1926        struct xfrm_dst *xdst, *new_xdst;
1927        int num_pols = 0, num_xfrms = 0, i, err, pol_dead;
1928
1929        /* Check if the policies from old bundle are usable */
1930        xdst = NULL;
1931        if (oldflo) {
1932                xdst = container_of(oldflo, struct xfrm_dst, flo);
1933                num_pols = xdst->num_pols;
1934                num_xfrms = xdst->num_xfrms;
1935                pol_dead = 0;
1936                for (i = 0; i < num_pols; i++) {
1937                        pols[i] = xdst->pols[i];
1938                        pol_dead |= pols[i]->walk.dead;
1939                }
1940                if (pol_dead) {
1941                        dst_free(&xdst->u.dst);
1942                        xdst = NULL;
1943                        num_pols = 0;
1944                        num_xfrms = 0;
1945                        oldflo = NULL;
1946                }
1947        }
1948
1949        /* Resolve policies to use if we couldn't get them from
1950         * previous cache entry */
1951        if (xdst == NULL) {
1952                num_pols = 1;
1953                pols[0] = __xfrm_policy_lookup(net, fl, family,
1954                                               flow_to_policy_dir(dir));
1955                err = xfrm_expand_policies(fl, family, pols,
1956                                           &num_pols, &num_xfrms);
1957                if (err < 0)
1958                        goto inc_error;
1959                if (num_pols == 0)
1960                        return NULL;
1961                if (num_xfrms <= 0)
1962                        goto make_dummy_bundle;
1963        }
1964
1965        new_xdst = xfrm_resolve_and_create_bundle(pols, num_pols, fl, family, dst_orig);
1966        if (IS_ERR(new_xdst)) {
1967                err = PTR_ERR(new_xdst);
1968                if (err != -EAGAIN)
1969                        goto error;
1970                if (oldflo == NULL)
1971                        goto make_dummy_bundle;
1972                dst_hold(&xdst->u.dst);
1973                return oldflo;
1974        } else if (new_xdst == NULL) {
1975                num_xfrms = 0;
1976                if (oldflo == NULL)
1977                        goto make_dummy_bundle;
1978                xdst->num_xfrms = 0;
1979                dst_hold(&xdst->u.dst);
1980                return oldflo;
1981        }
1982
1983        /* Kill the previous bundle */
1984        if (xdst) {
1985                /* The policies were stolen for newly generated bundle */
1986                xdst->num_pols = 0;
1987                dst_free(&xdst->u.dst);
1988        }
1989
1990        /* Flow cache does not have reference, it dst_free()'s,
1991         * but we do need to return one reference for original caller */
1992        dst_hold(&new_xdst->u.dst);
1993        return &new_xdst->flo;
1994
1995make_dummy_bundle:
1996        /* We found policies, but there's no bundles to instantiate:
1997         * either because the policy blocks, has no transformations or
1998         * we could not build template (no xfrm_states).*/
1999        xdst = xfrm_create_dummy_bundle(net, dst_orig, fl, num_xfrms, family);
2000        if (IS_ERR(xdst)) {
2001                xfrm_pols_put(pols, num_pols);
2002                return ERR_CAST(xdst);
2003        }
2004        xdst->num_pols = num_pols;
2005        xdst->num_xfrms = num_xfrms;
2006        memcpy(xdst->pols, pols, sizeof(struct xfrm_policy*) * num_pols);
2007
2008        dst_hold(&xdst->u.dst);
2009        return &xdst->flo;
2010
2011inc_error:
2012        XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR);
2013error:
2014        if (xdst != NULL)
2015                dst_free(&xdst->u.dst);
2016        else
2017                xfrm_pols_put(pols, num_pols);
2018        return ERR_PTR(err);
2019}
2020
2021static struct dst_entry *make_blackhole(struct net *net, u16 family,
2022                                        struct dst_entry *dst_orig)
2023{
2024        struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
2025        struct dst_entry *ret;
2026
2027        if (!afinfo) {
2028                dst_release(dst_orig);
2029                return ERR_PTR(-EINVAL);
2030        } else {
2031                ret = afinfo->blackhole_route(net, dst_orig);
2032        }
2033        xfrm_policy_put_afinfo(afinfo);
2034
2035        return ret;
2036}
2037
2038/* Main function: finds/creates a bundle for given flow.
2039 *
2040 * At the moment we eat a raw IP route. Mostly to speed up lookups
2041 * on interfaces with disabled IPsec.
     *
     * @dst_orig is the plain route of the flow; the caller's reference on it
     * is consumed: it is either handed back as the return value (no policy,
     * or the flow passes untransformed), released when a bundle is returned
     * instead, passed on to make_blackhole(), or released on error.
     *
     * Returns the dst to use, or an ERR_PTR():
     *   -EPERM     the policy prohibits the flow (num_xfrms < 0),
     *   -ENOENT    XFRM_LOOKUP_ICMP was requested and there is no policy, or
     *              the first policy lacks XFRM_POLICY_ICMP,
     *   -EAGAIN    states are missing and the flow may not sleep,
     *   -ERESTART  the wait for states was interrupted by a signal,
     *   or whatever policy expansion / bundle creation / the flow cache
     *   resolver reported.
2042 */
2043struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig,
2044                              const struct flowi *fl,
2045                              struct sock *sk, int flags)
2046{
2047        struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX];
2048        struct flow_cache_object *flo;
2049        struct xfrm_dst *xdst;
2050        struct dst_entry *dst, *route;
2051        u16 family = dst_orig->ops->family;
2052        u8 dir = policy_to_flow_dir(XFRM_POLICY_OUT);
            /* drop_pols: number of entries of pols[] this function has to
             * put itself; only ever set on the socket-policy path below. */
2053        int i, err, num_pols, num_xfrms = 0, drop_pols = 0;
2054
2055restart:
2056        dst = NULL;
2057        xdst = NULL;
2058        route = NULL;
2059
            /* Try the socket's own outbound policy first; the flow cache is
             * consulted further down only if this did not yield a bundle. */
2060        if (sk && sk->sk_policy[XFRM_POLICY_OUT]) {
2061                num_pols = 1;
2062                pols[0] = xfrm_sk_policy_lookup(sk, XFRM_POLICY_OUT, fl);
2063                err = xfrm_expand_policies(fl, family, pols,
2064                                           &num_pols, &num_xfrms);
2065                if (err < 0)
2066                        goto dropdst;
2067
2068                if (num_pols) {
2069                        if (num_xfrms <= 0) {
2070                                drop_pols = num_pols;
2071                                goto no_transform;
2072                        }
2073
2074                        xdst = xfrm_resolve_and_create_bundle(
2075                                        pols, num_pols, fl,
2076                                        family, dst_orig);
2077                        if (IS_ERR(xdst)) {
2078                                xfrm_pols_put(pols, num_pols);
2079                                err = PTR_ERR(xdst);
2080                                goto dropdst;
2081                        } else if (xdst == NULL) {
2082                                num_xfrms = 0;
2083                                drop_pols = num_pols;
2084                                goto no_transform;
2085                        }
2086
2087                        dst_hold(&xdst->u.dst);
2088
                            /* Chain the bundle on the global list of socket
                             * bundles; __xfrm_garbage_collect() dst_free()s
                             * everything found on that list. */
2089                        spin_lock_bh(&xfrm_policy_sk_bundle_lock);
2090                        xdst->u.dst.next = xfrm_policy_sk_bundles;
2091                        xfrm_policy_sk_bundles = &xdst->u.dst;
2092                        spin_unlock_bh(&xfrm_policy_sk_bundle_lock);
2093
2094                        route = xdst->route;
2095                }
2096        }
2097
2098        if (xdst == NULL) {
2099                /* To accelerate a bit...  */
2100                if ((dst_orig->flags & DST_NOXFRM) ||
2101                    !net->xfrm.policy_count[XFRM_POLICY_OUT])
2102                        goto nopol;
2103
                    /* xfrm_bundle_lookup() is the resolver; it returns NULL
                     * when no policy matches the flow. */
2104                flo = flow_cache_lookup(net, fl, family, dir,
2105                                        xfrm_bundle_lookup, dst_orig);
2106                if (flo == NULL)
2107                        goto nopol;
2108                if (IS_ERR(flo)) {
2109                        err = PTR_ERR(flo);
2110                        goto dropdst;
2111                }
2112                xdst = container_of(flo, struct xfrm_dst, flo);
2113
2114                num_pols = xdst->num_pols;
2115                num_xfrms = xdst->num_xfrms;
2116                memcpy(pols, xdst->pols, sizeof(struct xfrm_policy*) * num_pols);
2117                route = xdst->route;
2118        }
2119
2120        dst = &xdst->u.dst;
2121        if (route == NULL && num_xfrms > 0) {
2122                /* The only case when xfrm_bundle_lookup() returns a
2123                 * bundle with null route, is when the template could
2124                 * not be resolved. It means policies are there, but
2125                 * bundle could not be created, since we don't yet
2126                 * have the xfrm_state's. We need to wait for KM to
2127                 * negotiate new SA's or bail out with error.*/
2128                if (net->xfrm.sysctl_larval_drop) {
2129                        /* EREMOTE tells the caller to generate
2130                         * a one-shot blackhole route. */
2131                        dst_release(dst);
2132                        xfrm_pols_put(pols, drop_pols);
2133                        XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTNOSTATES);
2134
2135                        return make_blackhole(net, family, dst_orig);
2136                }
2137                if (fl->flowi_flags & FLOWI_FLAG_CAN_SLEEP) {
2138                        DECLARE_WAITQUEUE(wait, current);
2139
                            /* Sleep interruptibly on km_waitq, then retry
                             * the whole lookup unless a signal is pending. */
2140                        add_wait_queue(&net->xfrm.km_waitq, &wait);
2141                        set_current_state(TASK_INTERRUPTIBLE);
2142                        schedule();
2143                        set_current_state(TASK_RUNNING);
2144                        remove_wait_queue(&net->xfrm.km_waitq, &wait);
2145
2146                        if (!signal_pending(current)) {
2147                                dst_release(dst);
2148                                goto restart;
2149                        }
2150
2151                        err = -ERESTART;
2152                } else
2153                        err = -EAGAIN;
2154
2155                XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTNOSTATES);
2156                goto error;
2157        }
2158
2159no_transform:
2160        if (num_pols == 0)
2161                goto nopol;
2162
2163        if ((flags & XFRM_LOOKUP_ICMP) &&
2164            !(pols[0]->flags & XFRM_POLICY_ICMP)) {
2165                err = -ENOENT;
2166                goto error;
2167        }
2168
            /* Stamp every policy involved as used now. */
2169        for (i = 0; i < num_pols; i++)
2170                pols[i]->curlft.use_time = get_seconds();
2171
2172        if (num_xfrms < 0) {
2173                /* Prohibit the flow */
2174                XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLBLOCK);
2175                err = -EPERM;
2176                goto error;
2177        } else if (num_xfrms > 0) {
2178                /* Flow transformed */
2179                dst_release(dst_orig);
2180        } else {
2181                /* Flow passes untransformed */
2182                dst_release(dst);
2183                dst = dst_orig;
2184        }
2185ok:
2186        xfrm_pols_put(pols, drop_pols);
            /* Tag the returned dst when its state is in tunnel mode. */
2187        if (dst && dst->xfrm &&
2188            dst->xfrm->props.mode == XFRM_MODE_TUNNEL)
2189                dst->flags |= DST_XFRM_TUNNEL;
2190        return dst;
2191
2192nopol:
            /* No policy: the original route is used as is, unless the
             * caller asked for XFRM_LOOKUP_ICMP, which then gets -ENOENT. */
2193        if (!(flags & XFRM_LOOKUP_ICMP)) {
2194                dst = dst_orig;
2195                goto ok;
2196        }
2197        err = -ENOENT;
2198error:
            /* "error" drops the bundle reference and falls through to
             * "dropdst", which drops the original route and the policies. */
2199        dst_release(dst);
2200dropdst:
2201        dst_release(dst_orig);
2202        xfrm_pols_put(pols, drop_pols);
2203        return ERR_PTR(err);
2204}
2205EXPORT_SYMBOL(xfrm_lookup);
2206
2207static inline int
2208xfrm_secpath_reject(int idx, struct sk_buff *skb, const struct flowi *fl)
2209{
2210        struct xfrm_state *x;
2211
2212        if (!skb->sp || idx < 0 || idx >= skb->sp->len)
2213                return 0;
2214        x = skb->sp->xvec[idx];
2215        if (!x->type->reject)
2216                return 0;
2217        return x->type->reject(x, skb, fl);
2218}
2219
2220/* When skb is transformed back to its "native" form, we have to
2221 * check policy restrictions. At the moment we make this in maximally
2222 * stupid way. Shame on me. :-) Of course, connected sockets must
2223 * have policy cached at them.
2224 */
2225
2226static inline int
2227xfrm_state_ok(const struct xfrm_tmpl *tmpl, const struct xfrm_state *x,
2228              unsigned short family)
2229{
2230        if (xfrm_state_kern(x))
2231                return tmpl->optional && !xfrm_state_addr_cmp(tmpl, x, tmpl->encap_family);
2232        return  x->id.proto == tmpl->id.proto &&
2233                (x->id.spi == tmpl->id.spi || !tmpl->id.spi) &&
2234                (x->props.reqid == tmpl->reqid || !tmpl->reqid) &&
2235                x->props.mode == tmpl->mode &&
2236                (tmpl->allalgs || (tmpl->aalgos & (1<<x->props.aalgo)) ||
2237                 !(xfrm_id_proto_match(tmpl->id.proto, IPSEC_PROTO_ANY))) &&
2238                !(x->props.mode != XFRM_MODE_TRANSPORT &&
2239                  xfrm_state_addr_cmp(tmpl, x, family));
2240}
2241
2242/*
2243 * 0 or more than 0 is returned when validation is succeeded (either bypass
2244 * because of optional transport mode, or next index of the mathced secpath
2245 * state with the template.
2246 * -1 is returned when no matching template is found.
2247 * Otherwise "-2 - errored_index" is returned.
2248 */
2249static inline int
2250xfrm_policy_ok(const struct xfrm_tmpl *tmpl, const struct sec_path *sp, int start,
2251               unsigned short family)
2252{
2253        int idx = start;
2254
2255        if (tmpl->optional) {
2256                if (tmpl->mode == XFRM_MODE_TRANSPORT)
2257                        return start;
2258        } else
2259                start = -1;
2260        for (; idx < sp->len; idx++) {
2261                if (xfrm_state_ok(tmpl, sp->xvec[idx], family))
2262                        return ++idx;
2263                if (sp->xvec[idx]->props.mode != XFRM_MODE_TRANSPORT) {
2264                        if (start == -1)
2265                                start = -2-idx;
2266                        break;
2267                }
2268        }
2269        return start;
2270}
2271
2272int __xfrm_decode_session(struct sk_buff *skb, struct flowi *fl,
2273                          unsigned int family, int reverse)
2274{
2275        struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
2276        int err;
2277
2278        if (unlikely(afinfo == NULL))
2279                return -EAFNOSUPPORT;
2280
2281        afinfo->decode_session(skb, fl, reverse);
2282        err = security_xfrm_decode_session(skb, &fl->flowi_secid);
2283        xfrm_policy_put_afinfo(afinfo);
2284        return err;
2285}
2286EXPORT_SYMBOL(__xfrm_decode_session);
2287
2288static inline int secpath_has_nontransport(const struct sec_path *sp, int k, int *idxp)
2289{
2290        for (; k < sp->len; k++) {
2291                if (sp->xvec[k]->props.mode != XFRM_MODE_TRANSPORT) {
2292                        *idxp = k;
2293                        return 1;
2294                }
2295        }
2296
2297        return 0;
2298}
2299
2300int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
2301                        unsigned short family)
2302{
2303        struct net *net = dev_net(skb->dev);
2304        struct xfrm_policy *pol;
2305        struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX];
2306        int npols = 0;
2307        int xfrm_nr;
2308        int pi;
2309        int reverse;
2310        struct flowi fl;
2311        u8 fl_dir;
2312        int xerr_idx = -1;
2313
2314        reverse = dir & ~XFRM_POLICY_MASK;
2315        dir &= XFRM_POLICY_MASK;
2316        fl_dir = policy_to_flow_dir(dir);
2317
2318        if (__xfrm_decode_session(skb, &fl, family, reverse) < 0) {
2319                XFRM_INC_STATS(net, LINUX_MIB_XFRMINHDRERROR);
2320                return 0;
2321        }
2322
2323        nf_nat_decode_session(skb, &fl, family);
2324
2325        /* First, check used SA against their selectors. */
2326        if (skb->sp) {
2327                int i;
2328
2329                for (i=skb->sp->len-1; i>=0; i--) {
2330                        struct xfrm_state *x = skb->sp->xvec[i];
2331                        if (!xfrm_selector_match(&x->sel, &fl, family)) {
2332                                XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEMISMATCH);
2333                                return 0;
2334                        }
2335                }
2336        }
2337
2338        pol = NULL;
2339        if (sk && sk->sk_policy[dir]) {
2340                pol = xfrm_sk_policy_lookup(sk, dir, &fl);
2341                if (IS_ERR(pol)) {
2342                        XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR);
2343                        return 0;
2344                }
2345        }
2346
2347        if (!pol) {
2348                struct flow_cache_object *flo;
2349
2350                flo = flow_cache_lookup(net, &fl, family, fl_dir,
2351                                        xfrm_policy_lookup, NULL);
2352                if (IS_ERR_OR_NULL(flo))
2353                        pol = ERR_CAST(flo);
2354                else
2355                        pol = container_of(flo, struct xfrm_policy, flo);
2356        }
2357
2358        if (IS_ERR(pol)) {
2359                XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR);
2360                return 0;
2361        }
2362
2363        if (!pol) {
2364                if (skb->sp && secpath_has_nontransport(skb->sp, 0, &xerr_idx)) {
2365                        xfrm_secpath_reject(xerr_idx, skb, &fl);
2366                        XFRM_INC_STATS(net, LINUX_MIB_XFRMINNOPOLS);
2367                        return 0;
2368                }
2369                return 1;
2370        }
2371
2372        pol->curlft.use_time = get_seconds();
2373
2374        pols[0] = pol;
2375        npols ++;
2376#ifdef CONFIG_XFRM_SUB_POLICY
2377        if (pols[0]->type != XFRM_POLICY_TYPE_MAIN) {
2378                pols[1] = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_MAIN,
2379                                                    &fl, family,
2380                                                    XFRM_POLICY_IN);
2381                if (pols[1]) {
2382                        if (IS_ERR(pols[1])) {
2383                                XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR);
2384                                return 0;
2385                        }
2386                        pols[1]->curlft.use_time = get_seconds();
2387                        npols ++;
2388                }
2389        }
2390#endif
2391
2392        if (pol->action == XFRM_POLICY_ALLOW) {
2393                struct sec_path *sp;
2394                static struct sec_path dummy;
2395                struct xfrm_tmpl *tp[XFRM_MAX_DEPTH];
2396                struct xfrm_tmpl *stp[XFRM_MAX_DEPTH];
2397                struct xfrm_tmpl **tpp = tp;
2398                int ti = 0;
2399                int i, k;
2400
2401                if ((sp = skb->sp) == NULL)
2402                        sp = &dummy;
2403
2404                for (pi = 0; pi < npols; pi++) {
2405                        if (pols[pi] != pol &&
2406                            pols[pi]->action != XFRM_POLICY_ALLOW) {
2407                                XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLBLOCK);
2408                                goto reject;
2409                        }
2410                        if (ti + pols[pi]->xfrm_nr >= XFRM_MAX_DEPTH) {
2411                                XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR);
2412                                goto reject_error;
2413                        }
2414                        for (i = 0; i < pols[pi]->xfrm_nr; i++)
2415                                tpp[ti++] = &pols[pi]->xfrm_vec[i];
2416                }
2417                xfrm_nr = ti;
2418                if (npols > 1) {
2419                        xfrm_tmpl_sort(stp, tpp, xfrm_nr, family);
2420                        tpp = stp;
2421                }
2422
2423                /* For each tunnel xfrm, find the first matching tmpl.
2424                 * For each tmpl before that, find corresponding xfrm.
2425                 * Order is _important_. Later we will implement
2426                 * some barriers, but at the moment barriers
2427                 * are implied between each two transformations.
2428                 */
2429                for (i = xfrm_nr-1, k = 0; i >= 0; i--) {
2430                        k = xfrm_policy_ok(tpp[i], sp, k, family);
2431                        if (k < 0) {
2432                                if (k < -1)
2433                                        /* "-2 - errored_index" returned */
2434                                        xerr_idx = -(2+k);
2435                                XFRM_INC_STATS(net, LINUX_MIB_XFRMINTMPLMISMATCH);
2436                                goto reject;
2437                        }
2438                }
2439
2440                if (secpath_has_nontransport(sp, k, &xerr_idx)) {
2441                        XFRM_INC_STATS(net, LINUX_MIB_XFRMINTMPLMISMATCH);
2442                        goto reject;
2443                }
2444
2445                xfrm_pols_put(pols, npols);
2446                return 1;
2447        }
2448        XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLBLOCK);
2449
2450reject:
2451        xfrm_secpath_reject(xerr_idx, skb, &fl);
2452reject_error:
2453        xfrm_pols_put(pols, npols);
2454        return 0;
2455}
2456EXPORT_SYMBOL(__xfrm_policy_check);
2457
2458int __xfrm_route_forward(struct sk_buff *skb, unsigned short family)
2459{
2460        struct net *net = dev_net(skb->dev);
2461        struct flowi fl;
2462        struct dst_entry *dst;
2463        int res = 1;
2464
2465        if (xfrm_decode_session(skb, &fl, family) < 0) {
2466                XFRM_INC_STATS(net, LINUX_MIB_XFRMFWDHDRERROR);
2467                return 0;
2468        }
2469
2470        skb_dst_force(skb);
2471
2472        dst = xfrm_lookup(net, skb_dst(skb), &fl, NULL, 0);
2473        if (IS_ERR(dst)) {
2474                res = 0;
2475                dst = NULL;
2476        }
2477        skb_dst_set(skb, dst);
2478        return res;
2479}
2480EXPORT_SYMBOL(__xfrm_route_forward);
2481
2482/* Optimize later using cookies and generation ids. */
2483
2484static struct dst_entry *xfrm_dst_check(struct dst_entry *dst, u32 cookie)
2485{
2486        /* Code (such as __xfrm4_bundle_create()) sets dst->obsolete
2487         * to DST_OBSOLETE_FORCE_CHK to force all XFRM destinations to
2488         * get validated by dst_ops->check on every use.  We do this
2489         * because when a normal route referenced by an XFRM dst is
2490         * obsoleted we do not go looking around for all parent
2491         * referencing XFRM dsts so that we can invalidate them.  It
2492         * is just too much work.  Instead we make the checks here on
2493         * every use.  For example:
2494         *
2495         *      XFRM dst A --> IPv4 dst X
2496         *
2497         * X is the "xdst->route" of A (X is also the "dst->path" of A
2498         * in this example).  If X is marked obsolete, "A" will not
2499         * notice.  That's what we are validating here via the
2500         * stale_bundle() check.
2501         *
2502         * When a policy's bundle is pruned, we dst_free() the XFRM
2503         * dst which causes it's ->obsolete field to be set to
2504         * DST_OBSOLETE_DEAD.  If an XFRM dst has been pruned like
2505         * this, we want to force a new route lookup.
2506         */
2507        if (dst->obsolete < 0 && !stale_bundle(dst))
2508                return dst;
2509
2510        return NULL;
2511}
2512
/* Returns non-zero when xfrm_bundle_ok() no longer accepts the bundle. */
static int stale_bundle(struct dst_entry *dst)
{
	struct xfrm_dst *xdst = (struct xfrm_dst *)dst;

	return !xfrm_bundle_ok(xdst);
}
2517
2518void xfrm_dst_ifdown(struct dst_entry *dst, struct net_device *dev)
2519{
2520        while ((dst = dst->child) && dst->xfrm && dst->dev == dev) {
2521                dst->dev = dev_net(dev)->loopback_dev;
2522                dev_hold(dst->dev);
2523                dev_put(dev);
2524        }
2525}
2526EXPORT_SYMBOL(xfrm_dst_ifdown);
2527
/* Link-failure handler for XFRM dsts: intentionally does nothing. */
static void xfrm_link_failure(struct sk_buff *skb)
{
	/* Should never be reached: an XFRM dst has to be popped before the
	 * packet gets to the point where the failure is detected. */
}
2532
2533static struct dst_entry *xfrm_negative_advice(struct dst_entry *dst)
2534{
2535        if (dst) {
2536                if (dst->obsolete) {
2537                        dst_release(dst);
2538                        dst = NULL;
2539                }
2540        }
2541        return dst;
2542}
2543
2544static void __xfrm_garbage_collect(struct net *net)
2545{
2546        struct dst_entry *head, *next;
2547
2548        spin_lock_bh(&xfrm_policy_sk_bundle_lock);
2549        head = xfrm_policy_sk_bundles;
2550        xfrm_policy_sk_bundles = NULL;
2551        spin_unlock_bh(&xfrm_policy_sk_bundle_lock);
2552
2553        while (head) {
2554                next = head->next;
2555                dst_free(head);
2556                head = next;
2557        }
2558}
2559
/*
 * Flush the flow cache with flow_cache_flush(), then free the per-socket
 * bundles collected on xfrm_policy_sk_bundles.
 */
void xfrm_garbage_collect(struct net *net)
{
	flow_cache_flush();
	__xfrm_garbage_collect(net);
}
EXPORT_SYMBOL(xfrm_garbage_collect);
2566
/*
 * Same as xfrm_garbage_collect(), except that the flow cache is flushed
 * through flow_cache_flush_deferred().
 */
static void xfrm_garbage_collect_deferred(struct net *net)
{
	flow_cache_flush_deferred();
	__xfrm_garbage_collect(net);
}
2572
2573static void xfrm_init_pmtu(struct dst_entry *dst)
2574{
2575        do {
2576                struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
2577                u32 pmtu, route_mtu_cached;
2578
2579                pmtu = dst_mtu(dst->child);
2580                xdst->child_mtu_cached = pmtu;
2581
2582                pmtu = xfrm_state_mtu(dst->xfrm, pmtu);
2583
2584                route_mtu_cached = dst_mtu(xdst->route);
2585                xdst->route_mtu_cached = route_mtu_cached;
2586
2587                if (pmtu > route_mtu_cached)
2588                        pmtu = route_mtu_cached;
2589
2590                dst_metric_set(dst, RTAX_MTU, pmtu);
2591        } while ((dst = dst->next));
2592}
2593
2594/* Check that the bundle accepts the flow and its components are
2595 * still valid.
2596 */
2597
2598static int xfrm_bundle_ok(struct xfrm_dst *first)
2599{
2600        struct dst_entry *dst = &first->u.dst;
2601        struct xfrm_dst *last;
2602        u32 mtu;
2603
2604        if (!dst_check(dst->path, ((struct xfrm_dst *)dst)->path_cookie) ||
2605            (dst->dev && !netif_running(dst->dev)))
2606                return 0;
2607
2608        if (dst->flags & DST_XFRM_QUEUE)
2609                return 1;
2610
2611        last = NULL;
2612
2613        do {
2614                struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
2615
2616                if (dst->xfrm->km.state != XFRM_STATE_VALID)
2617                        return 0;
2618                if (xdst->xfrm_genid != dst->xfrm->genid)
2619                        return 0;
2620                if (xdst->num_pols > 0 &&
2621                    xdst->policy_genid != atomic_read(&xdst->pols[0]->genid))
2622                        return 0;
2623
2624                mtu = dst_mtu(dst->child);
2625                if (xdst->child_mtu_cached != mtu) {
2626                        last = xdst;
2627                        xdst->child_mtu_cached = mtu;
2628                }
2629
2630                if (!dst_check(xdst->route, xdst->route_cookie))
2631                        return 0;
2632                mtu = dst_mtu(xdst->route);
2633                if (xdst->route_mtu_cached != mtu) {
2634                        last = xdst;
2635                        xdst->route_mtu_cached = mtu;
2636                }
2637
2638                dst = dst->child;
2639        } while (dst->xfrm);
2640
2641        if (likely(!last))
2642                return 1;
2643
2644        mtu = last->child_mtu_cached;
2645        for (;;) {
2646                dst = &last->u.dst;
2647
2648                mtu = xfrm_state_mtu(dst->xfrm, mtu);
2649                if (mtu > last->route_mtu_cached)
2650                        mtu = last->route_mtu_cached;
2651                dst_metric_set(dst, RTAX_MTU, mtu);
2652
2653                if (last == first)
2654                        break;
2655
2656                last = (struct xfrm_dst *)last->u.dst.next;
2657                last->child_mtu_cached = mtu;
2658        }
2659
2660        return 1;
2661}
2662
2663static unsigned int xfrm_default_advmss(const struct dst_entry *dst)
2664{
2665        return dst_metric_advmss(dst->path);
2666}
2667
2668static unsigned int xfrm_mtu(const struct dst_entry *dst)
2669{
2670        unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2671
2672        return mtu ? : dst_mtu(dst->path);
2673}
2674
2675static struct neighbour *xfrm_neigh_lookup(const struct dst_entry *dst,
2676                                           struct sk_buff *skb,
2677                                           const void *daddr)
2678{
2679        return dst->path->ops->neigh_lookup(dst, skb, daddr);
2680}
2681
2682int xfrm_policy_register_afinfo(struct xfrm_policy_afinfo *afinfo)
2683{
2684        struct net *net;
2685        int err = 0;
2686        if (unlikely(afinfo == NULL))
2687                return -EINVAL;
2688        if (unlikely(afinfo->family >= NPROTO))
2689                return -EAFNOSUPPORT;
2690        spin_lock(&xfrm_policy_afinfo_lock);
2691        if (unlikely(xfrm_policy_afinfo[afinfo->family] != NULL))
2692                err = -ENOBUFS;
2693        else {
2694                struct dst_ops *dst_ops = afinfo->dst_ops;
2695                if (likely(dst_ops->kmem_cachep == NULL))
2696                        dst_ops->kmem_cachep = xfrm_dst_cache;
2697                if (likely(dst_ops->check == NULL))
2698                        dst_ops->check = xfrm_dst_check;
2699                if (likely(dst_ops->default_advmss == NULL))
2700                        dst_ops->default_advmss = xfrm_default_advmss;
2701                if (likely(dst_ops->mtu == NULL))
2702                        dst_ops->mtu = xfrm_mtu;
2703                if (likely(dst_ops->negative_advice == NULL))
2704                        dst_ops->negative_advice = xfrm_negative_advice;
2705                if (likely(dst_ops->link_failure == NULL))
2706                        dst_ops->link_failure = xfrm_link_failure;
2707                if (likely(dst_ops->neigh_lookup == NULL))
2708                        dst_ops->neigh_lookup = xfrm_neigh_lookup;
2709                if (likely(afinfo->garbage_collect == NULL))
2710                        afinfo->garbage_collect = xfrm_garbage_collect_deferred;
2711                rcu_assign_pointer(xfrm_policy_afinfo[afinfo->family], afinfo);
2712        }
2713        spin_unlock(&xfrm_policy_afinfo_lock);
2714
2715        rtnl_lock();
2716        for_each_net(net) {
2717                struct dst_ops *xfrm_dst_ops;
2718
2719                switch (afinfo->family) {
2720                case AF_INET:
2721                        xfrm_dst_ops = &net->xfrm.xfrm4_dst_ops;
2722                        break;
2723#if IS_ENABLED(CONFIG_IPV6)
2724                case AF_INET6:
2725                        xfrm_dst_ops = &net->xfrm.xfrm6_dst_ops;
2726                        break;
2727#endif
2728                default:
2729                        BUG();
2730                }
2731                *xfrm_dst_ops = *afinfo->dst_ops;
2732        }
2733        rtnl_unlock();
2734
2735        return err;
2736}
2737EXPORT_SYMBOL(xfrm_policy_register_afinfo);
2738
2739int xfrm_policy_unregister_afinfo(struct xfrm_policy_afinfo *afinfo)
2740{
2741        int err = 0;
2742        if (unlikely(afinfo == NULL))
2743                return -EINVAL;
2744        if (unlikely(afinfo->family >= NPROTO))
2745                return -EAFNOSUPPORT;
2746        spin_lock(&xfrm_policy_afinfo_lock);
2747        if (likely(xfrm_policy_afinfo[afinfo->family] != NULL)) {
2748                if (unlikely(xfrm_policy_afinfo[afinfo->family] != afinfo))
2749                        err = -EINVAL;
2750                else
2751                        RCU_INIT_POINTER(xfrm_policy_afinfo[afinfo->family],
2752                                         NULL);
2753        }
2754        spin_unlock(&xfrm_policy_afinfo_lock);
2755        if (!err) {
2756                struct dst_ops *dst_ops = afinfo->dst_ops;
2757
2758                synchronize_rcu();
2759
2760                dst_ops->kmem_cachep = NULL;
2761                dst_ops->check = NULL;
2762                dst_ops->negative_advice = NULL;
2763                dst_ops->link_failure = NULL;
2764                afinfo->garbage_collect = NULL;
2765        }
2766        return err;
2767}
2768EXPORT_SYMBOL(xfrm_policy_unregister_afinfo);
2769
2770static void __net_init xfrm_dst_ops_init(struct net *net)
2771{
2772        struct xfrm_policy_afinfo *afinfo;
2773
2774        rcu_read_lock();
2775        afinfo = rcu_dereference(xfrm_policy_afinfo[AF_INET]);
2776        if (afinfo)
2777                net->xfrm.xfrm4_dst_ops = *afinfo->dst_ops;
2778#if IS_ENABLED(CONFIG_IPV6)
2779        afinfo = rcu_dereference(xfrm_policy_afinfo[AF_INET6]);
2780        if (afinfo)
2781                net->xfrm.xfrm6_dst_ops = *afinfo->dst_ops;
2782#endif
2783        rcu_read_unlock();
2784}
2785
2786static int xfrm_dev_event(struct notifier_block *this, unsigned long event, void *ptr)
2787{
2788        struct net_device *dev = ptr;
2789
2790        switch (event) {
2791        case NETDEV_DOWN:
2792                xfrm_garbage_collect(dev_net(dev));
2793        }
2794        return NOTIFY_DONE;
2795}
2796
2797static struct notifier_block xfrm_dev_notifier = {
2798        .notifier_call  = xfrm_dev_event,
2799};
2800
2801#ifdef CONFIG_XFRM_STATISTICS
2802static int __net_init xfrm_statistics_init(struct net *net)
2803{
2804        int rv;
2805
2806        if (snmp_mib_init((void __percpu **)net->mib.xfrm_statistics,
2807                          sizeof(struct linux_xfrm_mib),
2808                          __alignof__(struct linux_xfrm_mib)) < 0)
2809                return -ENOMEM;
2810        rv = xfrm_proc_init(net);
2811        if (rv < 0)
2812                snmp_mib_free((void __percpu **)net->mib.xfrm_statistics);
2813        return rv;
2814}
2815
2816static void xfrm_statistics_fini(struct net *net)
2817{
2818        xfrm_proc_fini(net);
2819        snmp_mib_free((void __percpu **)net->mib.xfrm_statistics);
2820}
2821#else
2822static int __net_init xfrm_statistics_init(struct net *net)
2823{
2824        return 0;
2825}
2826
/* Without CONFIG_XFRM_STATISTICS there is nothing to tear down. */
static void xfrm_statistics_fini(struct net *net)
{
}
2830#endif
2831
2832static int __net_init xfrm_policy_init(struct net *net)
2833{
2834        unsigned int hmask, sz;
2835        int dir;
2836
2837        if (net_eq(net, &init_net))
2838                xfrm_dst_cache = kmem_cache_create("xfrm_dst_cache",
2839                                           sizeof(struct xfrm_dst),
2840                                           0, SLAB_HWCACHE_ALIGN|SLAB_PANIC,
2841                                           NULL);
2842
2843        hmask = 8 - 1;
2844        sz = (hmask+1) * sizeof(struct hlist_head);
2845
2846        net->xfrm.policy_byidx = xfrm_hash_alloc(sz);
2847        if (!net->xfrm.policy_byidx)
2848                goto out_byidx;
2849        net->xfrm.policy_idx_hmask = hmask;
2850
2851        for (dir = 0; dir < XFRM_POLICY_MAX * 2; dir++) {
2852                struct xfrm_policy_hash *htab;
2853
2854                net->xfrm.policy_count[dir] = 0;
2855                INIT_HLIST_HEAD(&net->xfrm.policy_inexact[dir]);
2856
2857                htab = &net->xfrm.policy_bydst[dir];
2858                htab->table = xfrm_hash_alloc(sz);
2859                if (!htab->table)
2860                        goto out_bydst;
2861                htab->hmask = hmask;
2862        }
2863
2864        INIT_LIST_HEAD(&net->xfrm.policy_all);
2865        INIT_WORK(&net->xfrm.policy_hash_work, xfrm_hash_resize);
2866        if (net_eq(net, &init_net))
2867                register_netdevice_notifier(&xfrm_dev_notifier);
2868        return 0;
2869
2870out_bydst:
2871        for (dir--; dir >= 0; dir--) {
2872                struct xfrm_policy_hash *htab;
2873
2874                htab = &net->xfrm.policy_bydst[dir];
2875                xfrm_hash_free(htab->table, sz);
2876        }
2877        xfrm_hash_free(net->xfrm.policy_byidx, sz);
2878out_byidx:
2879        return -ENOMEM;
2880}
2881
2882static void xfrm_policy_fini(struct net *net)
2883{
2884        struct xfrm_audit audit_info;
2885        unsigned int sz;
2886        int dir;
2887
2888        flush_work(&net->xfrm.policy_hash_work);
2889#ifdef CONFIG_XFRM_SUB_POLICY
2890        audit_info.loginuid = INVALID_UID;
2891        audit_info.sessionid = -1;
2892        audit_info.secid = 0;
2893        xfrm_policy_flush(net, XFRM_POLICY_TYPE_SUB, &audit_info);
2894#endif
2895        audit_info.loginuid = INVALID_UID;
2896        audit_info.sessionid = -1;
2897        audit_info.secid = 0;
2898        xfrm_policy_flush(net, XFRM_POLICY_TYPE_MAIN, &audit_info);
2899
2900        WARN_ON(!list_empty(&net->xfrm.policy_all));
2901
2902        for (dir = 0; dir < XFRM_POLICY_MAX * 2; dir++) {
2903                struct xfrm_policy_hash *htab;
2904
2905                WARN_ON(!hlist_empty(&net->xfrm.policy_inexact[dir]));
2906
2907                htab = &net->xfrm.policy_bydst[dir];
2908                sz = (htab->hmask + 1) * sizeof(struct hlist_head);
2909                WARN_ON(!hlist_empty(htab->table));
2910                xfrm_hash_free(htab->table, sz);
2911        }
2912
2913        sz = (net->xfrm.policy_idx_hmask + 1) * sizeof(struct hlist_head);
2914        WARN_ON(!hlist_empty(net->xfrm.policy_byidx));
2915        xfrm_hash_free(net->xfrm.policy_byidx, sz);
2916}
2917
2918static int __net_init xfrm_net_init(struct net *net)
2919{
2920        int rv;
2921
2922        rv = xfrm_statistics_init(net);
2923        if (rv < 0)
2924                goto out_statistics;
2925        rv = xfrm_state_init(net);
2926        if (rv < 0)
2927                goto out_state;
2928        rv = xfrm_policy_init(net);
2929        if (rv < 0)
2930                goto out_policy;
2931        xfrm_dst_ops_init(net);
2932        rv = xfrm_sysctl_init(net);
2933        if (rv < 0)
2934                goto out_sysctl;
2935        return 0;
2936
2937out_sysctl:
2938        xfrm_policy_fini(net);
2939out_policy:
2940        xfrm_state_fini(net);
2941out_state:
2942        xfrm_statistics_fini(net);
2943out_statistics:
2944        return rv;
2945}
2946
2947static void __net_exit xfrm_net_exit(struct net *net)
2948{
2949        xfrm_sysctl_fini(net);
2950        xfrm_policy_fini(net);
2951        xfrm_state_fini(net);
2952        xfrm_statistics_fini(net);
2953}
2954
2955static struct pernet_operations __net_initdata xfrm_net_ops = {
2956        .init = xfrm_net_init,
2957        .exit = xfrm_net_exit,
2958};
2959
2960void __init xfrm_init(void)
2961{
2962        register_pernet_subsys(&xfrm_net_ops);
2963        xfrm_input_init();
2964}
2965
2966#ifdef CONFIG_AUDITSYSCALL
2967static void xfrm_audit_common_policyinfo(struct xfrm_policy *xp,
2968                                         struct audit_buffer *audit_buf)
2969{
2970        struct xfrm_sec_ctx *ctx = xp->security;
2971        struct xfrm_selector *sel = &xp->selector;
2972
2973        if (ctx)
2974                audit_log_format(audit_buf, " sec_alg=%u sec_doi=%u sec_obj=%s",
2975                                 ctx->ctx_alg, ctx->ctx_doi, ctx->ctx_str);
2976
2977        switch(sel->family) {
2978        case AF_INET:
2979                audit_log_format(audit_buf, " src=%pI4", &sel->saddr.a4);
2980                if (sel->prefixlen_s != 32)
2981                        audit_log_format(audit_buf, " src_prefixlen=%d",
2982                                         sel->prefixlen_s);
2983                audit_log_format(audit_buf, " dst=%pI4", &sel->daddr.a4);
2984                if (sel->prefixlen_d != 32)
2985                        audit_log_format(audit_buf, " dst_prefixlen=%d",
2986                                         sel->prefixlen_d);
2987                break;
2988        case AF_INET6:
2989                audit_log_format(audit_buf, " src=%pI6", sel->saddr.a6);
2990                if (sel->prefixlen_s != 128)
2991                        audit_log_format(audit_buf, " src_prefixlen=%d",
2992                                         sel->prefixlen_s);
2993                audit_log_format(audit_buf, " dst=%pI6", sel->daddr.a6);
2994                if (sel->prefixlen_d != 128)
2995                        audit_log_format(audit_buf, " dst_prefixlen=%d",
2996                                         sel->prefixlen_d);
2997                break;
2998        }
2999}
3000
3001void xfrm_audit_policy_add(struct xfrm_policy *xp, int result,
3002                           kuid_t auid, u32 sessionid, u32 secid)
3003{
3004        struct audit_buffer *audit_buf;
3005
3006        audit_buf = xfrm_audit_start("SPD-add");
3007        if (audit_buf == NULL)
3008                return;
3009        xfrm_audit_helper_usrinfo(auid, sessionid, secid, audit_buf);
3010        audit_log_format(audit_buf, " res=%u", result);
3011        xfrm_audit_common_policyinfo(xp, audit_buf);
3012        audit_log_end(audit_buf);
3013}
3014EXPORT_SYMBOL_GPL(xfrm_audit_policy_add);
3015
3016void xfrm_audit_policy_delete(struct xfrm_policy *xp, int result,
3017                              kuid_t auid, u32 sessionid, u32 secid)
3018{
3019        struct audit_buffer *audit_buf;
3020
3021        audit_buf = xfrm_audit_start("SPD-delete");
3022        if (audit_buf == NULL)
3023                return;
3024        xfrm_audit_helper_usrinfo(auid, sessionid, secid, audit_buf);
3025        audit_log_format(audit_buf, " res=%u", result);
3026        xfrm_audit_common_policyinfo(xp, audit_buf);
3027        audit_log_end(audit_buf);
3028}
3029EXPORT_SYMBOL_GPL(xfrm_audit_policy_delete);
3030#endif
3031
3032#ifdef CONFIG_XFRM_MIGRATE
3033static bool xfrm_migrate_selector_match(const struct xfrm_selector *sel_cmp,
3034                                        const struct xfrm_selector *sel_tgt)
3035{
3036        if (sel_cmp->proto == IPSEC_ULPROTO_ANY) {
3037                if (sel_tgt->family == sel_cmp->family &&
3038                    xfrm_addr_equal(&sel_tgt->daddr, &sel_cmp->daddr,
3039                                    sel_cmp->family) &&
3040                    xfrm_addr_equal(&sel_tgt->saddr, &sel_cmp->saddr,
3041                                    sel_cmp->family) &&
3042                    sel_tgt->prefixlen_d == sel_cmp->prefixlen_d &&
3043                    sel_tgt->prefixlen_s == sel_cmp->prefixlen_s) {
3044                        return true;
3045                }
3046        } else {
3047                if (memcmp(sel_tgt, sel_cmp, sizeof(*sel_tgt)) == 0) {
3048                        return true;
3049                }
3050        }
3051        return false;
3052}
3053
3054static struct xfrm_policy * xfrm_migrate_policy_find(const struct xfrm_selector *sel,
3055                                                     u8 dir, u8 type)
3056{
3057        struct xfrm_policy *pol, *ret = NULL;
3058        struct hlist_head *chain;
3059        u32 priority = ~0U;
3060
3061        read_lock_bh(&xfrm_policy_lock);
3062        chain = policy_hash_direct(&init_net, &sel->daddr, &sel->saddr, sel->family, dir);
3063        hlist_for_each_entry(pol, chain, bydst) {
3064                if (xfrm_migrate_selector_match(sel, &pol->selector) &&
3065                    pol->type == type) {
3066                        ret = pol;
3067                        priority = ret->priority;
3068                        break;
3069                }
3070        }
3071        chain = &init_net.xfrm.policy_inexact[dir];
3072        hlist_for_each_entry(pol, chain, bydst) {
3073                if (xfrm_migrate_selector_match(sel, &pol->selector) &&
3074                    pol->type == type &&
3075                    pol->priority < priority) {
3076                        ret = pol;
3077                        break;
3078                }
3079        }
3080
3081        if (ret)
3082                xfrm_pol_hold(ret);
3083
3084        read_unlock_bh(&xfrm_policy_lock);
3085
3086        return ret;
3087}
3088
3089static int migrate_tmpl_match(const struct xfrm_migrate *m, const struct xfrm_tmpl *t)
3090{
3091        int match = 0;
3092
3093        if (t->mode == m->mode && t->id.proto == m->proto &&
3094            (m->reqid == 0 || t->reqid == m->reqid)) {
3095                switch (t->mode) {
3096                case XFRM_MODE_TUNNEL:
3097                case XFRM_MODE_BEET:
3098                        if (xfrm_addr_equal(&t->id.daddr, &m->old_daddr,
3099                                            m->old_family) &&
3100                            xfrm_addr_equal(&t->saddr, &m->old_saddr,
3101                                            m->old_family)) {
3102                                match = 1;
3103                        }
3104                        break;
3105                case XFRM_MODE_TRANSPORT:
3106                        /* in case of transport mode, template does not store
3107                           any IP addresses, hence we just compare mode and
3108                           protocol */
3109                        match = 1;
3110                        break;
3111                default:
3112                        break;
3113                }
3114        }
3115        return match;
3116}
3117
3118/* update endpoint address(es) of template(s) */
3119static int xfrm_policy_migrate(struct xfrm_policy *pol,
3120                               struct xfrm_migrate *m, int num_migrate)
3121{
3122        struct xfrm_migrate *mp;
3123        int i, j, n = 0;
3124
3125        write_lock_bh(&pol->lock);
3126        if (unlikely(pol->walk.dead)) {
3127                /* target policy has been deleted */
3128                write_unlock_bh(&pol->lock);
3129                return -ENOENT;
3130        }
3131
3132        for (i = 0; i < pol->xfrm_nr; i++) {
3133                for (j = 0, mp = m; j < num_migrate; j++, mp++) {
3134                        if (!migrate_tmpl_match(mp, &pol->xfrm_vec[i]))
3135                                continue;
3136                        n++;
3137                        if (pol->xfrm_vec[i].mode != XFRM_MODE_TUNNEL &&
3138                            pol->xfrm_vec[i].mode != XFRM_MODE_BEET)
3139                                continue;
3140                        /* update endpoints */
3141                        memcpy(&pol->xfrm_vec[i].id.daddr, &mp->new_daddr,
3142                               sizeof(pol->xfrm_vec[i].id.daddr));
3143                        memcpy(&pol->xfrm_vec[i].saddr, &mp->new_saddr,
3144                               sizeof(pol->xfrm_vec[i].saddr));
3145                        pol->xfrm_vec[i].encap_family = mp->new_family;
3146                        /* flush bundles */
3147                        atomic_inc(&pol->genid);
3148                }
3149        }
3150
3151        write_unlock_bh(&pol->lock);
3152
3153        if (!n)
3154                return -ENODATA;
3155
3156        return 0;
3157}
3158
3159static int xfrm_migrate_check(const struct xfrm_migrate *m, int num_migrate)
3160{
3161        int i, j;
3162
3163        if (num_migrate < 1 || num_migrate > XFRM_MAX_DEPTH)
3164                return -EINVAL;
3165
3166        for (i = 0; i < num_migrate; i++) {
3167                if (xfrm_addr_equal(&m[i].old_daddr, &m[i].new_daddr,
3168                                    m[i].old_family) &&
3169                    xfrm_addr_equal(&m[i].old_saddr, &m[i].new_saddr,
3170                                    m[i].old_family))
3171                        return -EINVAL;
3172                if (xfrm_addr_any(&m[i].new_daddr, m[i].new_family) ||
3173                    xfrm_addr_any(&m[i].new_saddr, m[i].new_family))
3174                        return -EINVAL;
3175
3176                /* check if there is any duplicated entry */
3177                for (j = i + 1; j < num_migrate; j++) {
3178                        if (!memcmp(&m[i].old_daddr, &m[j].old_daddr,
3179                                    sizeof(m[i].old_daddr)) &&
3180                            !memcmp(&m[i].old_saddr, &m[j].old_saddr,
3181                                    sizeof(m[i].old_saddr)) &&
3182                            m[i].proto == m[j].proto &&
3183                            m[i].mode == m[j].mode &&
3184                            m[i].reqid == m[j].reqid &&
3185                            m[i].old_family == m[j].old_family)
3186                                return -EINVAL;
3187                }
3188        }
3189
3190        return 0;
3191}
3192
3193int xfrm_migrate(const struct xfrm_selector *sel, u8 dir, u8 type,
3194                 struct xfrm_migrate *m, int num_migrate,
3195                 struct xfrm_kmaddress *k)
3196{
3197        int i, err, nx_cur = 0, nx_new = 0;
3198        struct xfrm_policy *pol = NULL;
3199        struct xfrm_state *x, *xc;
3200        struct xfrm_state *x_cur[XFRM_MAX_DEPTH];
3201        struct xfrm_state *x_new[XFRM_MAX_DEPTH];
3202        struct xfrm_migrate *mp;
3203
3204        if ((err = xfrm_migrate_check(m, num_migrate)) < 0)
3205                goto out;
3206
3207        /* Stage 1 - find policy */
3208        if ((pol = xfrm_migrate_policy_find(sel, dir, type)) == NULL) {
3209                err = -ENOENT;
3210                goto out;
3211        }
3212
3213        /* Stage 2 - find and update state(s) */
3214        for (i = 0, mp = m; i < num_migrate; i++, mp++) {
3215                if ((x = xfrm_migrate_state_find(mp))) {
3216                        x_cur[nx_cur] = x;
3217                        nx_cur++;
3218                        if ((xc = xfrm_state_migrate(x, mp))) {
3219                                x_new[nx_new] = xc;
3220                                nx_new++;
3221                        } else {
3222                                err = -ENODATA;
3223                                goto restore_state;
3224                        }
3225                }
3226        }
3227
3228        /* Stage 3 - update policy */
3229        if ((err = xfrm_policy_migrate(pol, m, num_migrate)) < 0)
3230                goto restore_state;
3231
3232        /* Stage 4 - delete old state(s) */
3233        if (nx_cur) {
3234                xfrm_states_put(x_cur, nx_cur);
3235                xfrm_states_delete(x_cur, nx_cur);
3236        }
3237
3238        /* Stage 5 - announce */
3239        km_migrate(sel, dir, type, m, num_migrate, k);
3240
3241        xfrm_pol_put(pol);
3242
3243        return 0;
3244out:
3245        return err;
3246
3247restore_state:
3248        if (pol)
3249                xfrm_pol_put(pol);
3250        if (nx_cur)
3251                xfrm_states_put(x_cur, nx_cur);
3252        if (nx_new)
3253                xfrm_states_delete(x_new, nx_new);
3254
3255        return err;
3256}
3257EXPORT_SYMBOL(xfrm_migrate);
3258#endif
3259