linux/net/core/net_namespace.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0-only
   2#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
   3
   4#include <linux/workqueue.h>
   5#include <linux/rtnetlink.h>
   6#include <linux/cache.h>
   7#include <linux/slab.h>
   8#include <linux/list.h>
   9#include <linux/delay.h>
  10#include <linux/sched.h>
  11#include <linux/idr.h>
  12#include <linux/rculist.h>
  13#include <linux/nsproxy.h>
  14#include <linux/fs.h>
  15#include <linux/proc_ns.h>
  16#include <linux/file.h>
  17#include <linux/export.h>
  18#include <linux/user_namespace.h>
  19#include <linux/net_namespace.h>
  20#include <linux/sched/task.h>
  21#include <linux/uidgid.h>
  22
  23#include <net/sock.h>
  24#include <net/netlink.h>
  25#include <net/net_namespace.h>
  26#include <net/netns/generic.h>
  27
  28/*
  29 *      Our network namespace constructor/destructor lists
  30 */
  31
  32static LIST_HEAD(pernet_list);
  33static struct list_head *first_device = &pernet_list;
  34
  35LIST_HEAD(net_namespace_list);
  36EXPORT_SYMBOL_GPL(net_namespace_list);
  37
  38/* Protects net_namespace_list. Nests iside rtnl_lock() */
  39DECLARE_RWSEM(net_rwsem);
  40EXPORT_SYMBOL_GPL(net_rwsem);
  41
  42#ifdef CONFIG_KEYS
  43static struct key_tag init_net_key_domain = { .usage = REFCOUNT_INIT(1) };
  44#endif
  45
  46struct net init_net = {
  47        .count          = REFCOUNT_INIT(1),
  48        .dev_base_head  = LIST_HEAD_INIT(init_net.dev_base_head),
  49#ifdef CONFIG_KEYS
  50        .key_domain     = &init_net_key_domain,
  51#endif
  52};
  53EXPORT_SYMBOL(init_net);
  54
  55static bool init_net_initialized;
  56/*
  57 * pernet_ops_rwsem: protects: pernet_list, net_generic_ids,
  58 * init_net_initialized and first_device pointer.
  59 * This is internal net namespace object. Please, don't use it
  60 * outside.
  61 */
  62DECLARE_RWSEM(pernet_ops_rwsem);
  63EXPORT_SYMBOL_GPL(pernet_ops_rwsem);
  64
  65#define MIN_PERNET_OPS_ID       \
  66        ((sizeof(struct net_generic) + sizeof(void *) - 1) / sizeof(void *))
  67
  68#define INITIAL_NET_GEN_PTRS    13 /* +1 for len +2 for rcu_head */
  69
  70static unsigned int max_gen_ptrs = INITIAL_NET_GEN_PTRS;
  71
  72static struct net_generic *net_alloc_generic(void)
  73{
  74        struct net_generic *ng;
  75        unsigned int generic_size = offsetof(struct net_generic, ptr[max_gen_ptrs]);
  76
  77        ng = kzalloc(generic_size, GFP_KERNEL);
  78        if (ng)
  79                ng->s.len = max_gen_ptrs;
  80
  81        return ng;
  82}
  83
  84static int net_assign_generic(struct net *net, unsigned int id, void *data)
  85{
  86        struct net_generic *ng, *old_ng;
  87
  88        BUG_ON(id < MIN_PERNET_OPS_ID);
  89
  90        old_ng = rcu_dereference_protected(net->gen,
  91                                           lockdep_is_held(&pernet_ops_rwsem));
  92        if (old_ng->s.len > id) {
  93                old_ng->ptr[id] = data;
  94                return 0;
  95        }
  96
  97        ng = net_alloc_generic();
  98        if (ng == NULL)
  99                return -ENOMEM;
 100
 101        /*
 102         * Some synchronisation notes:
 103         *
 104         * The net_generic explores the net->gen array inside rcu
 105         * read section. Besides once set the net->gen->ptr[x]
 106         * pointer never changes (see rules in netns/generic.h).
 107         *
 108         * That said, we simply duplicate this array and schedule
 109         * the old copy for kfree after a grace period.
 110         */
 111
 112        memcpy(&ng->ptr[MIN_PERNET_OPS_ID], &old_ng->ptr[MIN_PERNET_OPS_ID],
 113               (old_ng->s.len - MIN_PERNET_OPS_ID) * sizeof(void *));
 114        ng->ptr[id] = data;
 115
 116        rcu_assign_pointer(net->gen, ng);
 117        kfree_rcu(old_ng, s.rcu);
 118        return 0;
 119}
 120
 121static int ops_init(const struct pernet_operations *ops, struct net *net)
 122{
 123        int err = -ENOMEM;
 124        void *data = NULL;
 125
 126        if (ops->id && ops->size) {
 127                data = kzalloc(ops->size, GFP_KERNEL);
 128                if (!data)
 129                        goto out;
 130
 131                err = net_assign_generic(net, *ops->id, data);
 132                if (err)
 133                        goto cleanup;
 134        }
 135        err = 0;
 136        if (ops->init)
 137                err = ops->init(net);
 138        if (!err)
 139                return 0;
 140
 141cleanup:
 142        kfree(data);
 143
 144out:
 145        return err;
 146}
 147
 148static void ops_free(const struct pernet_operations *ops, struct net *net)
 149{
 150        if (ops->id && ops->size) {
 151                kfree(net_generic(net, *ops->id));
 152        }
 153}
 154
 155static void ops_pre_exit_list(const struct pernet_operations *ops,
 156                              struct list_head *net_exit_list)
 157{
 158        struct net *net;
 159
 160        if (ops->pre_exit) {
 161                list_for_each_entry(net, net_exit_list, exit_list)
 162                        ops->pre_exit(net);
 163        }
 164}
 165
 166static void ops_exit_list(const struct pernet_operations *ops,
 167                          struct list_head *net_exit_list)
 168{
 169        struct net *net;
 170        if (ops->exit) {
 171                list_for_each_entry(net, net_exit_list, exit_list)
 172                        ops->exit(net);
 173        }
 174        if (ops->exit_batch)
 175                ops->exit_batch(net_exit_list);
 176}
 177
 178static void ops_free_list(const struct pernet_operations *ops,
 179                          struct list_head *net_exit_list)
 180{
 181        struct net *net;
 182        if (ops->size && ops->id) {
 183                list_for_each_entry(net, net_exit_list, exit_list)
 184                        ops_free(ops, net);
 185        }
 186}
 187
 188/* should be called with nsid_lock held */
 189static int alloc_netid(struct net *net, struct net *peer, int reqid)
 190{
 191        int min = 0, max = 0;
 192
 193        if (reqid >= 0) {
 194                min = reqid;
 195                max = reqid + 1;
 196        }
 197
 198        return idr_alloc(&net->netns_ids, peer, min, max, GFP_ATOMIC);
 199}
 200
 201/* This function is used by idr_for_each(). If net is equal to peer, the
 202 * function returns the id so that idr_for_each() stops. Because we cannot
 203 * returns the id 0 (idr_for_each() will not stop), we return the magic value
 204 * NET_ID_ZERO (-1) for it.
 205 */
 206#define NET_ID_ZERO -1
 207static int net_eq_idr(int id, void *net, void *peer)
 208{
 209        if (net_eq(net, peer))
 210                return id ? : NET_ID_ZERO;
 211        return 0;
 212}
 213
 214/* Should be called with nsid_lock held. If a new id is assigned, the bool alloc
 215 * is set to true, thus the caller knows that the new id must be notified via
 216 * rtnl.
 217 */
 218static int __peernet2id_alloc(struct net *net, struct net *peer, bool *alloc)
 219{
 220        int id = idr_for_each(&net->netns_ids, net_eq_idr, peer);
 221        bool alloc_it = *alloc;
 222
 223        *alloc = false;
 224
 225        /* Magic value for id 0. */
 226        if (id == NET_ID_ZERO)
 227                return 0;
 228        if (id > 0)
 229                return id;
 230
 231        if (alloc_it) {
 232                id = alloc_netid(net, peer, -1);
 233                *alloc = true;
 234                return id >= 0 ? id : NETNSA_NSID_NOT_ASSIGNED;
 235        }
 236
 237        return NETNSA_NSID_NOT_ASSIGNED;
 238}
 239
 240/* should be called with nsid_lock held */
 241static int __peernet2id(struct net *net, struct net *peer)
 242{
 243        bool no = false;
 244
 245        return __peernet2id_alloc(net, peer, &no);
 246}
 247
 248static void rtnl_net_notifyid(struct net *net, int cmd, int id, u32 portid,
 249                              struct nlmsghdr *nlh, gfp_t gfp);
 250/* This function returns the id of a peer netns. If no id is assigned, one will
 251 * be allocated and returned.
 252 */
 253int peernet2id_alloc(struct net *net, struct net *peer, gfp_t gfp)
 254{
 255        bool alloc = false, alive = false;
 256        int id;
 257
 258        if (refcount_read(&net->count) == 0)
 259                return NETNSA_NSID_NOT_ASSIGNED;
 260        spin_lock_bh(&net->nsid_lock);
 261        /*
 262         * When peer is obtained from RCU lists, we may race with
 263         * its cleanup. Check whether it's alive, and this guarantees
 264         * we never hash a peer back to net->netns_ids, after it has
 265         * just been idr_remove()'d from there in cleanup_net().
 266         */
 267        if (maybe_get_net(peer))
 268                alive = alloc = true;
 269        id = __peernet2id_alloc(net, peer, &alloc);
 270        spin_unlock_bh(&net->nsid_lock);
 271        if (alloc && id >= 0)
 272                rtnl_net_notifyid(net, RTM_NEWNSID, id, 0, NULL, gfp);
 273        if (alive)
 274                put_net(peer);
 275        return id;
 276}
 277EXPORT_SYMBOL_GPL(peernet2id_alloc);
 278
 279/* This function returns, if assigned, the id of a peer netns. */
 280int peernet2id(struct net *net, struct net *peer)
 281{
 282        int id;
 283
 284        spin_lock_bh(&net->nsid_lock);
 285        id = __peernet2id(net, peer);
 286        spin_unlock_bh(&net->nsid_lock);
 287        return id;
 288}
 289EXPORT_SYMBOL(peernet2id);
 290
 291/* This function returns true is the peer netns has an id assigned into the
 292 * current netns.
 293 */
 294bool peernet_has_id(struct net *net, struct net *peer)
 295{
 296        return peernet2id(net, peer) >= 0;
 297}
 298
 299struct net *get_net_ns_by_id(struct net *net, int id)
 300{
 301        struct net *peer;
 302
 303        if (id < 0)
 304                return NULL;
 305
 306        rcu_read_lock();
 307        peer = idr_find(&net->netns_ids, id);
 308        if (peer)
 309                peer = maybe_get_net(peer);
 310        rcu_read_unlock();
 311
 312        return peer;
 313}
 314
 315/*
 316 * setup_net runs the initializers for the network namespace object.
 317 */
 318static __net_init int setup_net(struct net *net, struct user_namespace *user_ns)
 319{
 320        /* Must be called with pernet_ops_rwsem held */
 321        const struct pernet_operations *ops, *saved_ops;
 322        int error = 0;
 323        LIST_HEAD(net_exit_list);
 324
 325        refcount_set(&net->count, 1);
 326        refcount_set(&net->passive, 1);
 327        get_random_bytes(&net->hash_mix, sizeof(u32));
 328        net->dev_base_seq = 1;
 329        net->user_ns = user_ns;
 330        idr_init(&net->netns_ids);
 331        spin_lock_init(&net->nsid_lock);
 332        mutex_init(&net->ipv4.ra_mutex);
 333
 334        list_for_each_entry(ops, &pernet_list, list) {
 335                error = ops_init(ops, net);
 336                if (error < 0)
 337                        goto out_undo;
 338        }
 339        down_write(&net_rwsem);
 340        list_add_tail_rcu(&net->list, &net_namespace_list);
 341        up_write(&net_rwsem);
 342out:
 343        return error;
 344
 345out_undo:
 346        /* Walk through the list backwards calling the exit functions
 347         * for the pernet modules whose init functions did not fail.
 348         */
 349        list_add(&net->exit_list, &net_exit_list);
 350        saved_ops = ops;
 351        list_for_each_entry_continue_reverse(ops, &pernet_list, list)
 352                ops_pre_exit_list(ops, &net_exit_list);
 353
 354        synchronize_rcu();
 355
 356        ops = saved_ops;
 357        list_for_each_entry_continue_reverse(ops, &pernet_list, list)
 358                ops_exit_list(ops, &net_exit_list);
 359
 360        ops = saved_ops;
 361        list_for_each_entry_continue_reverse(ops, &pernet_list, list)
 362                ops_free_list(ops, &net_exit_list);
 363
 364        rcu_barrier();
 365        goto out;
 366}
 367
 368static int __net_init net_defaults_init_net(struct net *net)
 369{
 370        net->core.sysctl_somaxconn = SOMAXCONN;
 371        return 0;
 372}
 373
 374static struct pernet_operations net_defaults_ops = {
 375        .init = net_defaults_init_net,
 376};
 377
 378static __init int net_defaults_init(void)
 379{
 380        if (register_pernet_subsys(&net_defaults_ops))
 381                panic("Cannot initialize net default settings");
 382
 383        return 0;
 384}
 385
 386core_initcall(net_defaults_init);
 387
 388#ifdef CONFIG_NET_NS
 389static struct ucounts *inc_net_namespaces(struct user_namespace *ns)
 390{
 391        return inc_ucount(ns, current_euid(), UCOUNT_NET_NAMESPACES);
 392}
 393
 394static void dec_net_namespaces(struct ucounts *ucounts)
 395{
 396        dec_ucount(ucounts, UCOUNT_NET_NAMESPACES);
 397}
 398
 399static struct kmem_cache *net_cachep __ro_after_init;
 400static struct workqueue_struct *netns_wq;
 401
 402static struct net *net_alloc(void)
 403{
 404        struct net *net = NULL;
 405        struct net_generic *ng;
 406
 407        ng = net_alloc_generic();
 408        if (!ng)
 409                goto out;
 410
 411        net = kmem_cache_zalloc(net_cachep, GFP_KERNEL);
 412        if (!net)
 413                goto out_free;
 414
 415#ifdef CONFIG_KEYS
 416        net->key_domain = kzalloc(sizeof(struct key_tag), GFP_KERNEL);
 417        if (!net->key_domain)
 418                goto out_free_2;
 419        refcount_set(&net->key_domain->usage, 1);
 420#endif
 421
 422        rcu_assign_pointer(net->gen, ng);
 423out:
 424        return net;
 425
 426#ifdef CONFIG_KEYS
 427out_free_2:
 428        kmem_cache_free(net_cachep, net);
 429        net = NULL;
 430#endif
 431out_free:
 432        kfree(ng);
 433        goto out;
 434}
 435
 436static void net_free(struct net *net)
 437{
 438        kfree(rcu_access_pointer(net->gen));
 439        kmem_cache_free(net_cachep, net);
 440}
 441
 442void net_drop_ns(void *p)
 443{
 444        struct net *ns = p;
 445        if (ns && refcount_dec_and_test(&ns->passive))
 446                net_free(ns);
 447}
 448
 449struct net *copy_net_ns(unsigned long flags,
 450                        struct user_namespace *user_ns, struct net *old_net)
 451{
 452        struct ucounts *ucounts;
 453        struct net *net;
 454        int rv;
 455
 456        if (!(flags & CLONE_NEWNET))
 457                return get_net(old_net);
 458
 459        ucounts = inc_net_namespaces(user_ns);
 460        if (!ucounts)
 461                return ERR_PTR(-ENOSPC);
 462
 463        net = net_alloc();
 464        if (!net) {
 465                rv = -ENOMEM;
 466                goto dec_ucounts;
 467        }
 468        refcount_set(&net->passive, 1);
 469        net->ucounts = ucounts;
 470        get_user_ns(user_ns);
 471
 472        rv = down_read_killable(&pernet_ops_rwsem);
 473        if (rv < 0)
 474                goto put_userns;
 475
 476        rv = setup_net(net, user_ns);
 477
 478        up_read(&pernet_ops_rwsem);
 479
 480        if (rv < 0) {
 481put_userns:
 482                key_remove_domain(net->key_domain);
 483                put_user_ns(user_ns);
 484                net_drop_ns(net);
 485dec_ucounts:
 486                dec_net_namespaces(ucounts);
 487                return ERR_PTR(rv);
 488        }
 489        return net;
 490}
 491
 492/**
 493 * net_ns_get_ownership - get sysfs ownership data for @net
 494 * @net: network namespace in question (can be NULL)
 495 * @uid: kernel user ID for sysfs objects
 496 * @gid: kernel group ID for sysfs objects
 497 *
 498 * Returns the uid/gid pair of root in the user namespace associated with the
 499 * given network namespace.
 500 */
 501void net_ns_get_ownership(const struct net *net, kuid_t *uid, kgid_t *gid)
 502{
 503        if (net) {
 504                kuid_t ns_root_uid = make_kuid(net->user_ns, 0);
 505                kgid_t ns_root_gid = make_kgid(net->user_ns, 0);
 506
 507                if (uid_valid(ns_root_uid))
 508                        *uid = ns_root_uid;
 509
 510                if (gid_valid(ns_root_gid))
 511                        *gid = ns_root_gid;
 512        } else {
 513                *uid = GLOBAL_ROOT_UID;
 514                *gid = GLOBAL_ROOT_GID;
 515        }
 516}
 517EXPORT_SYMBOL_GPL(net_ns_get_ownership);
 518
 519static void unhash_nsid(struct net *net, struct net *last)
 520{
 521        struct net *tmp;
 522        /* This function is only called from cleanup_net() work,
 523         * and this work is the only process, that may delete
 524         * a net from net_namespace_list. So, when the below
 525         * is executing, the list may only grow. Thus, we do not
 526         * use for_each_net_rcu() or net_rwsem.
 527         */
 528        for_each_net(tmp) {
 529                int id;
 530
 531                spin_lock_bh(&tmp->nsid_lock);
 532                id = __peernet2id(tmp, net);
 533                if (id >= 0)
 534                        idr_remove(&tmp->netns_ids, id);
 535                spin_unlock_bh(&tmp->nsid_lock);
 536                if (id >= 0)
 537                        rtnl_net_notifyid(tmp, RTM_DELNSID, id, 0, NULL,
 538                                          GFP_KERNEL);
 539                if (tmp == last)
 540                        break;
 541        }
 542        spin_lock_bh(&net->nsid_lock);
 543        idr_destroy(&net->netns_ids);
 544        spin_unlock_bh(&net->nsid_lock);
 545}
 546
 547static LLIST_HEAD(cleanup_list);
 548
 549static void cleanup_net(struct work_struct *work)
 550{
 551        const struct pernet_operations *ops;
 552        struct net *net, *tmp, *last;
 553        struct llist_node *net_kill_list;
 554        LIST_HEAD(net_exit_list);
 555
 556        /* Atomically snapshot the list of namespaces to cleanup */
 557        net_kill_list = llist_del_all(&cleanup_list);
 558
 559        down_read(&pernet_ops_rwsem);
 560
 561        /* Don't let anyone else find us. */
 562        down_write(&net_rwsem);
 563        llist_for_each_entry(net, net_kill_list, cleanup_list)
 564                list_del_rcu(&net->list);
 565        /* Cache last net. After we unlock rtnl, no one new net
 566         * added to net_namespace_list can assign nsid pointer
 567         * to a net from net_kill_list (see peernet2id_alloc()).
 568         * So, we skip them in unhash_nsid().
 569         *
 570         * Note, that unhash_nsid() does not delete nsid links
 571         * between net_kill_list's nets, as they've already
 572         * deleted from net_namespace_list. But, this would be
 573         * useless anyway, as netns_ids are destroyed there.
 574         */
 575        last = list_last_entry(&net_namespace_list, struct net, list);
 576        up_write(&net_rwsem);
 577
 578        llist_for_each_entry(net, net_kill_list, cleanup_list) {
 579                unhash_nsid(net, last);
 580                list_add_tail(&net->exit_list, &net_exit_list);
 581        }
 582
 583        /* Run all of the network namespace pre_exit methods */
 584        list_for_each_entry_reverse(ops, &pernet_list, list)
 585                ops_pre_exit_list(ops, &net_exit_list);
 586
 587        /*
 588         * Another CPU might be rcu-iterating the list, wait for it.
 589         * This needs to be before calling the exit() notifiers, so
 590         * the rcu_barrier() below isn't sufficient alone.
 591         * Also the pre_exit() and exit() methods need this barrier.
 592         */
 593        synchronize_rcu();
 594
 595        /* Run all of the network namespace exit methods */
 596        list_for_each_entry_reverse(ops, &pernet_list, list)
 597                ops_exit_list(ops, &net_exit_list);
 598
 599        /* Free the net generic variables */
 600        list_for_each_entry_reverse(ops, &pernet_list, list)
 601                ops_free_list(ops, &net_exit_list);
 602
 603        up_read(&pernet_ops_rwsem);
 604
 605        /* Ensure there are no outstanding rcu callbacks using this
 606         * network namespace.
 607         */
 608        rcu_barrier();
 609
 610        /* Finally it is safe to free my network namespace structure */
 611        list_for_each_entry_safe(net, tmp, &net_exit_list, exit_list) {
 612                list_del_init(&net->exit_list);
 613                dec_net_namespaces(net->ucounts);
 614                key_remove_domain(net->key_domain);
 615                put_user_ns(net->user_ns);
 616                net_drop_ns(net);
 617        }
 618}
 619
 620/**
 621 * net_ns_barrier - wait until concurrent net_cleanup_work is done
 622 *
 623 * cleanup_net runs from work queue and will first remove namespaces
 624 * from the global list, then run net exit functions.
 625 *
 626 * Call this in module exit path to make sure that all netns
 627 * ->exit ops have been invoked before the function is removed.
 628 */
 629void net_ns_barrier(void)
 630{
 631        down_write(&pernet_ops_rwsem);
 632        up_write(&pernet_ops_rwsem);
 633}
 634EXPORT_SYMBOL(net_ns_barrier);
 635
 636static DECLARE_WORK(net_cleanup_work, cleanup_net);
 637
 638void __put_net(struct net *net)
 639{
 640        /* Cleanup the network namespace in process context */
 641        if (llist_add(&net->cleanup_list, &cleanup_list))
 642                queue_work(netns_wq, &net_cleanup_work);
 643}
 644EXPORT_SYMBOL_GPL(__put_net);
 645
 646struct net *get_net_ns_by_fd(int fd)
 647{
 648        struct file *file;
 649        struct ns_common *ns;
 650        struct net *net;
 651
 652        file = proc_ns_fget(fd);
 653        if (IS_ERR(file))
 654                return ERR_CAST(file);
 655
 656        ns = get_proc_ns(file_inode(file));
 657        if (ns->ops == &netns_operations)
 658                net = get_net(container_of(ns, struct net, ns));
 659        else
 660                net = ERR_PTR(-EINVAL);
 661
 662        fput(file);
 663        return net;
 664}
 665
 666#else
 667struct net *get_net_ns_by_fd(int fd)
 668{
 669        return ERR_PTR(-EINVAL);
 670}
 671#endif
 672EXPORT_SYMBOL_GPL(get_net_ns_by_fd);
 673
 674struct net *get_net_ns_by_pid(pid_t pid)
 675{
 676        struct task_struct *tsk;
 677        struct net *net;
 678
 679        /* Lookup the network namespace */
 680        net = ERR_PTR(-ESRCH);
 681        rcu_read_lock();
 682        tsk = find_task_by_vpid(pid);
 683        if (tsk) {
 684                struct nsproxy *nsproxy;
 685                task_lock(tsk);
 686                nsproxy = tsk->nsproxy;
 687                if (nsproxy)
 688                        net = get_net(nsproxy->net_ns);
 689                task_unlock(tsk);
 690        }
 691        rcu_read_unlock();
 692        return net;
 693}
 694EXPORT_SYMBOL_GPL(get_net_ns_by_pid);
 695
 696static __net_init int net_ns_net_init(struct net *net)
 697{
 698#ifdef CONFIG_NET_NS
 699        net->ns.ops = &netns_operations;
 700#endif
 701        return ns_alloc_inum(&net->ns);
 702}
 703
 704static __net_exit void net_ns_net_exit(struct net *net)
 705{
 706        ns_free_inum(&net->ns);
 707}
 708
 709static struct pernet_operations __net_initdata net_ns_ops = {
 710        .init = net_ns_net_init,
 711        .exit = net_ns_net_exit,
 712};
 713
 714static const struct nla_policy rtnl_net_policy[NETNSA_MAX + 1] = {
 715        [NETNSA_NONE]           = { .type = NLA_UNSPEC },
 716        [NETNSA_NSID]           = { .type = NLA_S32 },
 717        [NETNSA_PID]            = { .type = NLA_U32 },
 718        [NETNSA_FD]             = { .type = NLA_U32 },
 719        [NETNSA_TARGET_NSID]    = { .type = NLA_S32 },
 720};
 721
 722static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh,
 723                          struct netlink_ext_ack *extack)
 724{
 725        struct net *net = sock_net(skb->sk);
 726        struct nlattr *tb[NETNSA_MAX + 1];
 727        struct nlattr *nla;
 728        struct net *peer;
 729        int nsid, err;
 730
 731        err = nlmsg_parse_deprecated(nlh, sizeof(struct rtgenmsg), tb,
 732                                     NETNSA_MAX, rtnl_net_policy, extack);
 733        if (err < 0)
 734                return err;
 735        if (!tb[NETNSA_NSID]) {
 736                NL_SET_ERR_MSG(extack, "nsid is missing");
 737                return -EINVAL;
 738        }
 739        nsid = nla_get_s32(tb[NETNSA_NSID]);
 740
 741        if (tb[NETNSA_PID]) {
 742                peer = get_net_ns_by_pid(nla_get_u32(tb[NETNSA_PID]));
 743                nla = tb[NETNSA_PID];
 744        } else if (tb[NETNSA_FD]) {
 745                peer = get_net_ns_by_fd(nla_get_u32(tb[NETNSA_FD]));
 746                nla = tb[NETNSA_FD];
 747        } else {
 748                NL_SET_ERR_MSG(extack, "Peer netns reference is missing");
 749                return -EINVAL;
 750        }
 751        if (IS_ERR(peer)) {
 752                NL_SET_BAD_ATTR(extack, nla);
 753                NL_SET_ERR_MSG(extack, "Peer netns reference is invalid");
 754                return PTR_ERR(peer);
 755        }
 756
 757        spin_lock_bh(&net->nsid_lock);
 758        if (__peernet2id(net, peer) >= 0) {
 759                spin_unlock_bh(&net->nsid_lock);
 760                err = -EEXIST;
 761                NL_SET_BAD_ATTR(extack, nla);
 762                NL_SET_ERR_MSG(extack,
 763                               "Peer netns already has a nsid assigned");
 764                goto out;
 765        }
 766
 767        err = alloc_netid(net, peer, nsid);
 768        spin_unlock_bh(&net->nsid_lock);
 769        if (err >= 0) {
 770                rtnl_net_notifyid(net, RTM_NEWNSID, err, NETLINK_CB(skb).portid,
 771                                  nlh, GFP_KERNEL);
 772                err = 0;
 773        } else if (err == -ENOSPC && nsid >= 0) {
 774                err = -EEXIST;
 775                NL_SET_BAD_ATTR(extack, tb[NETNSA_NSID]);
 776                NL_SET_ERR_MSG(extack, "The specified nsid is already used");
 777        }
 778out:
 779        put_net(peer);
 780        return err;
 781}
 782
 783static int rtnl_net_get_size(void)
 784{
 785        return NLMSG_ALIGN(sizeof(struct rtgenmsg))
 786               + nla_total_size(sizeof(s32)) /* NETNSA_NSID */
 787               + nla_total_size(sizeof(s32)) /* NETNSA_CURRENT_NSID */
 788               ;
 789}
 790
 791struct net_fill_args {
 792        u32 portid;
 793        u32 seq;
 794        int flags;
 795        int cmd;
 796        int nsid;
 797        bool add_ref;
 798        int ref_nsid;
 799};
 800
 801static int rtnl_net_fill(struct sk_buff *skb, struct net_fill_args *args)
 802{
 803        struct nlmsghdr *nlh;
 804        struct rtgenmsg *rth;
 805
 806        nlh = nlmsg_put(skb, args->portid, args->seq, args->cmd, sizeof(*rth),
 807                        args->flags);
 808        if (!nlh)
 809                return -EMSGSIZE;
 810
 811        rth = nlmsg_data(nlh);
 812        rth->rtgen_family = AF_UNSPEC;
 813
 814        if (nla_put_s32(skb, NETNSA_NSID, args->nsid))
 815                goto nla_put_failure;
 816
 817        if (args->add_ref &&
 818            nla_put_s32(skb, NETNSA_CURRENT_NSID, args->ref_nsid))
 819                goto nla_put_failure;
 820
 821        nlmsg_end(skb, nlh);
 822        return 0;
 823
 824nla_put_failure:
 825        nlmsg_cancel(skb, nlh);
 826        return -EMSGSIZE;
 827}
 828
 829static int rtnl_net_valid_getid_req(struct sk_buff *skb,
 830                                    const struct nlmsghdr *nlh,
 831                                    struct nlattr **tb,
 832                                    struct netlink_ext_ack *extack)
 833{
 834        int i, err;
 835
 836        if (!netlink_strict_get_check(skb))
 837                return nlmsg_parse_deprecated(nlh, sizeof(struct rtgenmsg),
 838                                              tb, NETNSA_MAX, rtnl_net_policy,
 839                                              extack);
 840
 841        err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct rtgenmsg), tb,
 842                                            NETNSA_MAX, rtnl_net_policy,
 843                                            extack);
 844        if (err)
 845                return err;
 846
 847        for (i = 0; i <= NETNSA_MAX; i++) {
 848                if (!tb[i])
 849                        continue;
 850
 851                switch (i) {
 852                case NETNSA_PID:
 853                case NETNSA_FD:
 854                case NETNSA_NSID:
 855                case NETNSA_TARGET_NSID:
 856                        break;
 857                default:
 858                        NL_SET_ERR_MSG(extack, "Unsupported attribute in peer netns getid request");
 859                        return -EINVAL;
 860                }
 861        }
 862
 863        return 0;
 864}
 865
 866static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh,
 867                          struct netlink_ext_ack *extack)
 868{
 869        struct net *net = sock_net(skb->sk);
 870        struct nlattr *tb[NETNSA_MAX + 1];
 871        struct net_fill_args fillargs = {
 872                .portid = NETLINK_CB(skb).portid,
 873                .seq = nlh->nlmsg_seq,
 874                .cmd = RTM_NEWNSID,
 875        };
 876        struct net *peer, *target = net;
 877        struct nlattr *nla;
 878        struct sk_buff *msg;
 879        int err;
 880
 881        err = rtnl_net_valid_getid_req(skb, nlh, tb, extack);
 882        if (err < 0)
 883                return err;
 884        if (tb[NETNSA_PID]) {
 885                peer = get_net_ns_by_pid(nla_get_u32(tb[NETNSA_PID]));
 886                nla = tb[NETNSA_PID];
 887        } else if (tb[NETNSA_FD]) {
 888                peer = get_net_ns_by_fd(nla_get_u32(tb[NETNSA_FD]));
 889                nla = tb[NETNSA_FD];
 890        } else if (tb[NETNSA_NSID]) {
 891                peer = get_net_ns_by_id(net, nla_get_s32(tb[NETNSA_NSID]));
 892                if (!peer)
 893                        peer = ERR_PTR(-ENOENT);
 894                nla = tb[NETNSA_NSID];
 895        } else {
 896                NL_SET_ERR_MSG(extack, "Peer netns reference is missing");
 897                return -EINVAL;
 898        }
 899
 900        if (IS_ERR(peer)) {
 901                NL_SET_BAD_ATTR(extack, nla);
 902                NL_SET_ERR_MSG(extack, "Peer netns reference is invalid");
 903                return PTR_ERR(peer);
 904        }
 905
 906        if (tb[NETNSA_TARGET_NSID]) {
 907                int id = nla_get_s32(tb[NETNSA_TARGET_NSID]);
 908
 909                target = rtnl_get_net_ns_capable(NETLINK_CB(skb).sk, id);
 910                if (IS_ERR(target)) {
 911                        NL_SET_BAD_ATTR(extack, tb[NETNSA_TARGET_NSID]);
 912                        NL_SET_ERR_MSG(extack,
 913                                       "Target netns reference is invalid");
 914                        err = PTR_ERR(target);
 915                        goto out;
 916                }
 917                fillargs.add_ref = true;
 918                fillargs.ref_nsid = peernet2id(net, peer);
 919        }
 920
 921        msg = nlmsg_new(rtnl_net_get_size(), GFP_KERNEL);
 922        if (!msg) {
 923                err = -ENOMEM;
 924                goto out;
 925        }
 926
 927        fillargs.nsid = peernet2id(target, peer);
 928        err = rtnl_net_fill(msg, &fillargs);
 929        if (err < 0)
 930                goto err_out;
 931
 932        err = rtnl_unicast(msg, net, NETLINK_CB(skb).portid);
 933        goto out;
 934
 935err_out:
 936        nlmsg_free(msg);
 937out:
 938        if (fillargs.add_ref)
 939                put_net(target);
 940        put_net(peer);
 941        return err;
 942}
 943
 944struct rtnl_net_dump_cb {
 945        struct net *tgt_net;
 946        struct net *ref_net;
 947        struct sk_buff *skb;
 948        struct net_fill_args fillargs;
 949        int idx;
 950        int s_idx;
 951};
 952
 953static int rtnl_net_dumpid_one(int id, void *peer, void *data)
 954{
 955        struct rtnl_net_dump_cb *net_cb = (struct rtnl_net_dump_cb *)data;
 956        int ret;
 957
 958        if (net_cb->idx < net_cb->s_idx)
 959                goto cont;
 960
 961        net_cb->fillargs.nsid = id;
 962        if (net_cb->fillargs.add_ref)
 963                net_cb->fillargs.ref_nsid = __peernet2id(net_cb->ref_net, peer);
 964        ret = rtnl_net_fill(net_cb->skb, &net_cb->fillargs);
 965        if (ret < 0)
 966                return ret;
 967
 968cont:
 969        net_cb->idx++;
 970        return 0;
 971}
 972
 973static int rtnl_valid_dump_net_req(const struct nlmsghdr *nlh, struct sock *sk,
 974                                   struct rtnl_net_dump_cb *net_cb,
 975                                   struct netlink_callback *cb)
 976{
 977        struct netlink_ext_ack *extack = cb->extack;
 978        struct nlattr *tb[NETNSA_MAX + 1];
 979        int err, i;
 980
 981        err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct rtgenmsg), tb,
 982                                            NETNSA_MAX, rtnl_net_policy,
 983                                            extack);
 984        if (err < 0)
 985                return err;
 986
 987        for (i = 0; i <= NETNSA_MAX; i++) {
 988                if (!tb[i])
 989                        continue;
 990
 991                if (i == NETNSA_TARGET_NSID) {
 992                        struct net *net;
 993
 994                        net = rtnl_get_net_ns_capable(sk, nla_get_s32(tb[i]));
 995                        if (IS_ERR(net)) {
 996                                NL_SET_BAD_ATTR(extack, tb[i]);
 997                                NL_SET_ERR_MSG(extack,
 998                                               "Invalid target network namespace id");
 999                                return PTR_ERR(net);
1000                        }
1001                        net_cb->fillargs.add_ref = true;
1002                        net_cb->ref_net = net_cb->tgt_net;
1003                        net_cb->tgt_net = net;
1004                } else {
1005                        NL_SET_BAD_ATTR(extack, tb[i]);
1006                        NL_SET_ERR_MSG(extack,
1007                                       "Unsupported attribute in dump request");
1008                        return -EINVAL;
1009                }
1010        }
1011
1012        return 0;
1013}
1014
1015static int rtnl_net_dumpid(struct sk_buff *skb, struct netlink_callback *cb)
1016{
1017        struct rtnl_net_dump_cb net_cb = {
1018                .tgt_net = sock_net(skb->sk),
1019                .skb = skb,
1020                .fillargs = {
1021                        .portid = NETLINK_CB(cb->skb).portid,
1022                        .seq = cb->nlh->nlmsg_seq,
1023                        .flags = NLM_F_MULTI,
1024                        .cmd = RTM_NEWNSID,
1025                },
1026                .idx = 0,
1027                .s_idx = cb->args[0],
1028        };
1029        int err = 0;
1030
1031        if (cb->strict_check) {
1032                err = rtnl_valid_dump_net_req(cb->nlh, skb->sk, &net_cb, cb);
1033                if (err < 0)
1034                        goto end;
1035        }
1036
1037        spin_lock_bh(&net_cb.tgt_net->nsid_lock);
1038        if (net_cb.fillargs.add_ref &&
1039            !net_eq(net_cb.ref_net, net_cb.tgt_net) &&
1040            !spin_trylock_bh(&net_cb.ref_net->nsid_lock)) {
1041                spin_unlock_bh(&net_cb.tgt_net->nsid_lock);
1042                err = -EAGAIN;
1043                goto end;
1044        }
1045        idr_for_each(&net_cb.tgt_net->netns_ids, rtnl_net_dumpid_one, &net_cb);
1046        if (net_cb.fillargs.add_ref &&
1047            !net_eq(net_cb.ref_net, net_cb.tgt_net))
1048                spin_unlock_bh(&net_cb.ref_net->nsid_lock);
1049        spin_unlock_bh(&net_cb.tgt_net->nsid_lock);
1050
1051        cb->args[0] = net_cb.idx;
1052end:
1053        if (net_cb.fillargs.add_ref)
1054                put_net(net_cb.tgt_net);
1055        return err < 0 ? err : skb->len;
1056}
1057
1058static void rtnl_net_notifyid(struct net *net, int cmd, int id, u32 portid,
1059                              struct nlmsghdr *nlh, gfp_t gfp)
1060{
1061        struct net_fill_args fillargs = {
1062                .portid = portid,
1063                .seq = nlh ? nlh->nlmsg_seq : 0,
1064                .cmd = cmd,
1065                .nsid = id,
1066        };
1067        struct sk_buff *msg;
1068        int err = -ENOMEM;
1069
1070        msg = nlmsg_new(rtnl_net_get_size(), gfp);
1071        if (!msg)
1072                goto out;
1073
1074        err = rtnl_net_fill(msg, &fillargs);
1075        if (err < 0)
1076                goto err_out;
1077
1078        rtnl_notify(msg, net, portid, RTNLGRP_NSID, nlh, gfp);
1079        return;
1080
1081err_out:
1082        nlmsg_free(msg);
1083out:
1084        rtnl_set_sk_err(net, RTNLGRP_NSID, err);
1085}
1086
1087static int __init net_ns_init(void)
1088{
1089        struct net_generic *ng;
1090
1091#ifdef CONFIG_NET_NS
1092        net_cachep = kmem_cache_create("net_namespace", sizeof(struct net),
1093                                        SMP_CACHE_BYTES,
1094                                        SLAB_PANIC|SLAB_ACCOUNT, NULL);
1095
1096        /* Create workqueue for cleanup */
1097        netns_wq = create_singlethread_workqueue("netns");
1098        if (!netns_wq)
1099                panic("Could not create netns workq");
1100#endif
1101
1102        ng = net_alloc_generic();
1103        if (!ng)
1104                panic("Could not allocate generic netns");
1105
1106        rcu_assign_pointer(init_net.gen, ng);
1107
1108        down_write(&pernet_ops_rwsem);
1109        if (setup_net(&init_net, &init_user_ns))
1110                panic("Could not setup the initial network namespace");
1111
1112        init_net_initialized = true;
1113        up_write(&pernet_ops_rwsem);
1114
1115        if (register_pernet_subsys(&net_ns_ops))
1116                panic("Could not register network namespace subsystems");
1117
1118        rtnl_register(PF_UNSPEC, RTM_NEWNSID, rtnl_net_newid, NULL,
1119                      RTNL_FLAG_DOIT_UNLOCKED);
1120        rtnl_register(PF_UNSPEC, RTM_GETNSID, rtnl_net_getid, rtnl_net_dumpid,
1121                      RTNL_FLAG_DOIT_UNLOCKED);
1122
1123        return 0;
1124}
1125
1126pure_initcall(net_ns_init);
1127
1128#ifdef CONFIG_NET_NS
1129static int __register_pernet_operations(struct list_head *list,
1130                                        struct pernet_operations *ops)
1131{
1132        struct net *net;
1133        int error;
1134        LIST_HEAD(net_exit_list);
1135
1136        list_add_tail(&ops->list, list);
1137        if (ops->init || (ops->id && ops->size)) {
1138                /* We held write locked pernet_ops_rwsem, and parallel
1139                 * setup_net() and cleanup_net() are not possible.
1140                 */
1141                for_each_net(net) {
1142                        error = ops_init(ops, net);
1143                        if (error)
1144                                goto out_undo;
1145                        list_add_tail(&net->exit_list, &net_exit_list);
1146                }
1147        }
1148        return 0;
1149
1150out_undo:
1151        /* If I have an error cleanup all namespaces I initialized */
1152        list_del(&ops->list);
1153        ops_pre_exit_list(ops, &net_exit_list);
1154        synchronize_rcu();
1155        ops_exit_list(ops, &net_exit_list);
1156        ops_free_list(ops, &net_exit_list);
1157        return error;
1158}
1159
1160static void __unregister_pernet_operations(struct pernet_operations *ops)
1161{
1162        struct net *net;
1163        LIST_HEAD(net_exit_list);
1164
1165        list_del(&ops->list);
1166        /* See comment in __register_pernet_operations() */
1167        for_each_net(net)
1168                list_add_tail(&net->exit_list, &net_exit_list);
1169        ops_pre_exit_list(ops, &net_exit_list);
1170        synchronize_rcu();
1171        ops_exit_list(ops, &net_exit_list);
1172        ops_free_list(ops, &net_exit_list);
1173}
1174
1175#else
1176
1177static int __register_pernet_operations(struct list_head *list,
1178                                        struct pernet_operations *ops)
1179{
1180        if (!init_net_initialized) {
1181                list_add_tail(&ops->list, list);
1182                return 0;
1183        }
1184
1185        return ops_init(ops, &init_net);
1186}
1187
1188static void __unregister_pernet_operations(struct pernet_operations *ops)
1189{
1190        if (!init_net_initialized) {
1191                list_del(&ops->list);
1192        } else {
1193                LIST_HEAD(net_exit_list);
1194                list_add(&init_net.exit_list, &net_exit_list);
1195                ops_pre_exit_list(ops, &net_exit_list);
1196                synchronize_rcu();
1197                ops_exit_list(ops, &net_exit_list);
1198                ops_free_list(ops, &net_exit_list);
1199        }
1200}
1201
1202#endif /* CONFIG_NET_NS */
1203
1204static DEFINE_IDA(net_generic_ids);
1205
1206static int register_pernet_operations(struct list_head *list,
1207                                      struct pernet_operations *ops)
1208{
1209        int error;
1210
1211        if (ops->id) {
1212                error = ida_alloc_min(&net_generic_ids, MIN_PERNET_OPS_ID,
1213                                GFP_KERNEL);
1214                if (error < 0)
1215                        return error;
1216                *ops->id = error;
1217                max_gen_ptrs = max(max_gen_ptrs, *ops->id + 1);
1218        }
1219        error = __register_pernet_operations(list, ops);
1220        if (error) {
1221                rcu_barrier();
1222                if (ops->id)
1223                        ida_free(&net_generic_ids, *ops->id);
1224        }
1225
1226        return error;
1227}
1228
1229static void unregister_pernet_operations(struct pernet_operations *ops)
1230{
1231        __unregister_pernet_operations(ops);
1232        rcu_barrier();
1233        if (ops->id)
1234                ida_free(&net_generic_ids, *ops->id);
1235}
1236
1237/**
1238 *      register_pernet_subsys - register a network namespace subsystem
1239 *      @ops:  pernet operations structure for the subsystem
1240 *
1241 *      Register a subsystem which has init and exit functions
1242 *      that are called when network namespaces are created and
1243 *      destroyed respectively.
1244 *
1245 *      When registered all network namespace init functions are
1246 *      called for every existing network namespace.  Allowing kernel
1247 *      modules to have a race free view of the set of network namespaces.
1248 *
1249 *      When a new network namespace is created all of the init
1250 *      methods are called in the order in which they were registered.
1251 *
1252 *      When a network namespace is destroyed all of the exit methods
1253 *      are called in the reverse of the order with which they were
1254 *      registered.
1255 */
1256int register_pernet_subsys(struct pernet_operations *ops)
1257{
1258        int error;
1259        down_write(&pernet_ops_rwsem);
1260        error =  register_pernet_operations(first_device, ops);
1261        up_write(&pernet_ops_rwsem);
1262        return error;
1263}
1264EXPORT_SYMBOL_GPL(register_pernet_subsys);
1265
1266/**
1267 *      unregister_pernet_subsys - unregister a network namespace subsystem
1268 *      @ops: pernet operations structure to manipulate
1269 *
1270 *      Remove the pernet operations structure from the list to be
1271 *      used when network namespaces are created or destroyed.  In
1272 *      addition run the exit method for all existing network
1273 *      namespaces.
1274 */
1275void unregister_pernet_subsys(struct pernet_operations *ops)
1276{
1277        down_write(&pernet_ops_rwsem);
1278        unregister_pernet_operations(ops);
1279        up_write(&pernet_ops_rwsem);
1280}
1281EXPORT_SYMBOL_GPL(unregister_pernet_subsys);
1282
1283/**
1284 *      register_pernet_device - register a network namespace device
1285 *      @ops:  pernet operations structure for the subsystem
1286 *
1287 *      Register a device which has init and exit functions
1288 *      that are called when network namespaces are created and
1289 *      destroyed respectively.
1290 *
1291 *      When registered all network namespace init functions are
1292 *      called for every existing network namespace.  Allowing kernel
1293 *      modules to have a race free view of the set of network namespaces.
1294 *
1295 *      When a new network namespace is created all of the init
1296 *      methods are called in the order in which they were registered.
1297 *
1298 *      When a network namespace is destroyed all of the exit methods
1299 *      are called in the reverse of the order with which they were
1300 *      registered.
1301 */
1302int register_pernet_device(struct pernet_operations *ops)
1303{
1304        int error;
1305        down_write(&pernet_ops_rwsem);
1306        error = register_pernet_operations(&pernet_list, ops);
1307        if (!error && (first_device == &pernet_list))
1308                first_device = &ops->list;
1309        up_write(&pernet_ops_rwsem);
1310        return error;
1311}
1312EXPORT_SYMBOL_GPL(register_pernet_device);
1313
1314/**
1315 *      unregister_pernet_device - unregister a network namespace netdevice
1316 *      @ops: pernet operations structure to manipulate
1317 *
1318 *      Remove the pernet operations structure from the list to be
1319 *      used when network namespaces are created or destroyed.  In
1320 *      addition run the exit method for all existing network
1321 *      namespaces.
1322 */
1323void unregister_pernet_device(struct pernet_operations *ops)
1324{
1325        down_write(&pernet_ops_rwsem);
1326        if (&ops->list == first_device)
1327                first_device = first_device->next;
1328        unregister_pernet_operations(ops);
1329        up_write(&pernet_ops_rwsem);
1330}
1331EXPORT_SYMBOL_GPL(unregister_pernet_device);
1332
1333#ifdef CONFIG_NET_NS
1334static struct ns_common *netns_get(struct task_struct *task)
1335{
1336        struct net *net = NULL;
1337        struct nsproxy *nsproxy;
1338
1339        task_lock(task);
1340        nsproxy = task->nsproxy;
1341        if (nsproxy)
1342                net = get_net(nsproxy->net_ns);
1343        task_unlock(task);
1344
1345        return net ? &net->ns : NULL;
1346}
1347
1348static inline struct net *to_net_ns(struct ns_common *ns)
1349{
1350        return container_of(ns, struct net, ns);
1351}
1352
/* proc_ns_operations .put hook: put_net() on the namespace owning @ns. */
static void netns_put(struct ns_common *ns)
{
	struct net *net = to_net_ns(ns);

	put_net(net);
}
1357
1358static int netns_install(struct nsproxy *nsproxy, struct ns_common *ns)
1359{
1360        struct net *net = to_net_ns(ns);
1361
1362        if (!ns_capable(net->user_ns, CAP_SYS_ADMIN) ||
1363            !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
1364                return -EPERM;
1365
1366        put_net(nsproxy->net_ns);
1367        nsproxy->net_ns = get_net(net);
1368        return 0;
1369}
1370
1371static struct user_namespace *netns_owner(struct ns_common *ns)
1372{
1373        return to_net_ns(ns)->user_ns;
1374}
1375
1376const struct proc_ns_operations netns_operations = {
1377        .name           = "net",
1378        .type           = CLONE_NEWNET,
1379        .get            = netns_get,
1380        .put            = netns_put,
1381        .install        = netns_install,
1382        .owner          = netns_owner,
1383};
1384#endif
1385