Diffstat (limited to 'net/core/net_namespace.c')
-rw-r--r--	net/core/net_namespace.c	353
1 file changed, 201 insertions, 152 deletions
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index f0540c557515..42ee7fce3d95 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -56,7 +56,6 @@ static bool init_net_initialized;
  * outside.
  */
 DECLARE_RWSEM(pernet_ops_rwsem);
-EXPORT_SYMBOL_GPL(pernet_ops_rwsem);
 
 #define MIN_PERNET_OPS_ID	\
 	((sizeof(struct net_generic) + sizeof(void *) - 1) / sizeof(void *))
@@ -69,12 +68,15 @@ DEFINE_COOKIE(net_cookie);
 
 static struct net_generic *net_alloc_generic(void)
 {
+	unsigned int gen_ptrs = READ_ONCE(max_gen_ptrs);
+	unsigned int generic_size;
 	struct net_generic *ng;
-	unsigned int generic_size = offsetof(struct net_generic, ptr[max_gen_ptrs]);
+
+	generic_size = offsetof(struct net_generic, ptr[gen_ptrs]);
 
 	ng = kzalloc(generic_size, GFP_KERNEL);
 	if (ng)
-		ng->s.len = max_gen_ptrs;
+		ng->s.len = gen_ptrs;
 
 	return ng;
 }
@@ -122,7 +124,7 @@ static int ops_init(const struct pernet_operations *ops, struct net *net)
 	int err = -ENOMEM;
 	void *data = NULL;
 
-	if (ops->id && ops->size) {
+	if (ops->id) {
 		data = kzalloc(ops->size, GFP_KERNEL);
 		if (!data)
 			goto out;
@@ -137,7 +139,7 @@ static int ops_init(const struct pernet_operations *ops, struct net *net)
 	if (!err)
 		return 0;
 
-	if (ops->id && ops->size) {
+	if (ops->id) {
 		ng = rcu_dereference_protected(net->gen,
 					       lockdep_is_held(&pernet_ops_rwsem));
 		ng->ptr[*ops->id] = NULL;
@@ -161,16 +163,45 @@ static void ops_pre_exit_list(const struct pernet_operations *ops,
 	}
 }
 
+static void ops_exit_rtnl_list(const struct list_head *ops_list,
+			       const struct pernet_operations *ops,
+			       struct list_head *net_exit_list)
+{
+	const struct pernet_operations *saved_ops = ops;
+	LIST_HEAD(dev_kill_list);
+	struct net *net;
+
+	rtnl_lock();
+
+	list_for_each_entry(net, net_exit_list, exit_list) {
+		__rtnl_net_lock(net);
+
+		ops = saved_ops;
+		list_for_each_entry_continue_reverse(ops, ops_list, list) {
+			if (ops->exit_rtnl)
+				ops->exit_rtnl(net, &dev_kill_list);
+		}
+
+		__rtnl_net_unlock(net);
+	}
+
+	unregister_netdevice_many(&dev_kill_list);
+
+	rtnl_unlock();
+}
+
 static void ops_exit_list(const struct pernet_operations *ops,
 			  struct list_head *net_exit_list)
 {
-	struct net *net;
 	if (ops->exit) {
+		struct net *net;
+
 		list_for_each_entry(net, net_exit_list, exit_list) {
 			ops->exit(net);
 			cond_resched();
 		}
 	}
+
 	if (ops->exit_batch)
 		ops->exit_batch(net_exit_list);
 }
@@ -179,12 +210,63 @@ static void ops_free_list(const struct pernet_operations *ops,
 			  struct list_head *net_exit_list)
 {
 	struct net *net;
-	if (ops->size && ops->id) {
+
+	if (ops->id) {
 		list_for_each_entry(net, net_exit_list, exit_list)
 			kfree(net_generic(net, *ops->id));
 	}
 }
 
+static void ops_undo_list(const struct list_head *ops_list,
+			  const struct pernet_operations *ops,
+			  struct list_head *net_exit_list,
+			  bool expedite_rcu)
+{
+	const struct pernet_operations *saved_ops;
+	bool hold_rtnl = false;
+
+	if (!ops)
+		ops = list_entry(ops_list, typeof(*ops), list);
+
+	saved_ops = ops;
+
+	list_for_each_entry_continue_reverse(ops, ops_list, list) {
+		hold_rtnl |= !!ops->exit_rtnl;
+		ops_pre_exit_list(ops, net_exit_list);
+	}
+
+	/* Another CPU might be rcu-iterating the list, wait for it.
+	 * This needs to be before calling the exit() notifiers, so the
+	 * rcu_barrier() after ops_undo_list() isn't sufficient alone.
+	 * Also the pre_exit() and exit() methods need this barrier.
+	 */
+	if (expedite_rcu)
+		synchronize_rcu_expedited();
+	else
+		synchronize_rcu();
+
+	if (hold_rtnl)
+		ops_exit_rtnl_list(ops_list, saved_ops, net_exit_list);
+
+	ops = saved_ops;
+	list_for_each_entry_continue_reverse(ops, ops_list, list)
+		ops_exit_list(ops, net_exit_list);
+
+	ops = saved_ops;
+	list_for_each_entry_continue_reverse(ops, ops_list, list)
+		ops_free_list(ops, net_exit_list);
+}
+
+static void ops_undo_single(struct pernet_operations *ops,
+			    struct list_head *net_exit_list)
+{
+	LIST_HEAD(ops_list);
+
+	list_add(&ops->list, &ops_list);
+	ops_undo_list(&ops_list, NULL, net_exit_list, false);
+	list_del(&ops->list);
+}
+
 /* should be called with nsid_lock held */
 static int alloc_netid(struct net *net, struct net *peer, int reqid)
 {
@@ -305,36 +387,56 @@ struct net *get_net_ns_by_id(const struct net *net, int id)
 }
 EXPORT_SYMBOL_GPL(get_net_ns_by_id);
 
+static __net_init void preinit_net_sysctl(struct net *net)
+{
+	net->core.sysctl_somaxconn = SOMAXCONN;
+	/* Limits per socket sk_omem_alloc usage.
+	 * TCP zerocopy regular usage needs 128 KB.
+	 */
+	net->core.sysctl_optmem_max = 128 * 1024;
+	net->core.sysctl_txrehash = SOCK_TXREHASH_ENABLED;
+	net->core.sysctl_tstamp_allow_data = 1;
+}
+
 /* init code that must occur even if setup_net() is not called. */
-static __net_init void preinit_net(struct net *net)
+static __net_init void preinit_net(struct net *net, struct user_namespace *user_ns)
 {
+	refcount_set(&net->passive, 1);
+	refcount_set(&net->ns.count, 1);
+	ref_tracker_dir_init(&net->refcnt_tracker, 128, "net refcnt");
 	ref_tracker_dir_init(&net->notrefcnt_tracker, 128, "net notrefcnt");
+
+	get_random_bytes(&net->hash_mix, sizeof(u32));
+	net->dev_base_seq = 1;
+	net->user_ns = user_ns;
+
+	idr_init(&net->netns_ids);
+	spin_lock_init(&net->nsid_lock);
+	mutex_init(&net->ipv4.ra_mutex);
+
+#ifdef CONFIG_DEBUG_NET_SMALL_RTNL
+	mutex_init(&net->rtnl_mutex);
+	lock_set_cmp_fn(&net->rtnl_mutex, rtnl_net_lock_cmp_fn, NULL);
+#endif
+
+	INIT_LIST_HEAD(&net->ptype_all);
+	INIT_LIST_HEAD(&net->ptype_specific);
+	preinit_net_sysctl(net);
 }
 
 /*
  * setup_net runs the initializers for the network namespace object.
  */
-static __net_init int setup_net(struct net *net, struct user_namespace *user_ns)
+static __net_init int setup_net(struct net *net)
 {
 	/* Must be called with pernet_ops_rwsem held */
-	const struct pernet_operations *ops, *saved_ops;
+	const struct pernet_operations *ops;
 	LIST_HEAD(net_exit_list);
-	LIST_HEAD(dev_kill_list);
 	int error = 0;
 
-	refcount_set(&net->ns.count, 1);
-	ref_tracker_dir_init(&net->refcnt_tracker, 128, "net refcnt");
-
-	refcount_set(&net->passive, 1);
-	get_random_bytes(&net->hash_mix, sizeof(u32));
 	preempt_disable();
 	net->net_cookie = gen_cookie_next(&net_cookie);
 	preempt_enable();
-	net->dev_base_seq = 1;
-	net->user_ns = user_ns;
-	idr_init(&net->netns_ids);
-	spin_lock_init(&net->nsid_lock);
-	mutex_init(&net->ipv4.ra_mutex);
 
 	list_for_each_entry(ops, &pernet_list, list) {
 		error = ops_init(ops, net);
@@ -352,59 +454,11 @@ out_undo:
 	 * for the pernet modules whose init functions did not fail.
 	 */
 	list_add(&net->exit_list, &net_exit_list);
-	saved_ops = ops;
-	list_for_each_entry_continue_reverse(ops, &pernet_list, list)
-		ops_pre_exit_list(ops, &net_exit_list);
-
-	synchronize_rcu();
-
-	ops = saved_ops;
-	rtnl_lock();
-	list_for_each_entry_continue_reverse(ops, &pernet_list, list) {
-		if (ops->exit_batch_rtnl)
-			ops->exit_batch_rtnl(&net_exit_list, &dev_kill_list);
-	}
-	unregister_netdevice_many(&dev_kill_list);
-	rtnl_unlock();
-
-	ops = saved_ops;
-	list_for_each_entry_continue_reverse(ops, &pernet_list, list)
-		ops_exit_list(ops, &net_exit_list);
-
-	ops = saved_ops;
-	list_for_each_entry_continue_reverse(ops, &pernet_list, list)
-		ops_free_list(ops, &net_exit_list);
-
+	ops_undo_list(&pernet_list, ops, &net_exit_list, false);
 	rcu_barrier();
 	goto out;
 }
 
-static int __net_init net_defaults_init_net(struct net *net)
-{
-	net->core.sysctl_somaxconn = SOMAXCONN;
-	/* Limits per socket sk_omem_alloc usage.
-	 * TCP zerocopy regular usage needs 128 KB.
-	 */
-	net->core.sysctl_optmem_max = 128 * 1024;
-	net->core.sysctl_txrehash = SOCK_TXREHASH_ENABLED;
-
-	return 0;
-}
-
-static struct pernet_operations net_defaults_ops = {
-	.init = net_defaults_init_net,
-};
-
-static __init int net_defaults_init(void)
-{
-	if (register_pernet_subsys(&net_defaults_ops))
-		panic("Cannot initialize net default settings");
-
-	return 0;
-}
-
-core_initcall(net_defaults_init);
-
 #ifdef CONFIG_NET_NS
 static struct ucounts *inc_net_namespaces(struct user_namespace *ns)
 {
@@ -453,7 +507,22 @@ out_free:
 	goto out;
 }
 
-static void net_free(struct net *net)
+static LLIST_HEAD(defer_free_list);
+
+static void net_complete_free(void)
+{
+	struct llist_node *kill_list;
+	struct net *net, *next;
+
+	/* Get the list of namespaces to free from last round. */
+	kill_list = llist_del_all(&defer_free_list);
+
+	llist_for_each_entry_safe(net, next, kill_list, defer_free_list)
+		kmem_cache_free(net_cachep, net);
+
+}
+
+void net_passive_dec(struct net *net)
 {
 	if (refcount_dec_and_test(&net->passive)) {
 		kfree(rcu_access_pointer(net->gen));
@@ -461,7 +530,8 @@ static void net_free(struct net *net)
 		/* There should not be any trackers left there. */
 		ref_tracker_dir_exit(&net->notrefcnt_tracker);
 
-		kmem_cache_free(net_cachep, net);
+		/* Wait for an extra rcu_barrier() before final free. */
+		llist_add(&net->defer_free_list, &defer_free_list);
 	}
 }
 
@@ -470,7 +540,7 @@ void net_drop_ns(void *p)
 	struct net *net = (struct net *)p;
 
 	if (net)
-		net_free(net);
+		net_passive_dec(net);
 }
 
 struct net *copy_net_ns(unsigned long flags,
@@ -493,8 +563,7 @@ struct net *copy_net_ns(unsigned long flags,
 		goto dec_ucounts;
 	}
 
-	preinit_net(net);
-	refcount_set(&net->passive, 1);
+	preinit_net(net, user_ns);
 	net->ucounts = ucounts;
 	get_user_ns(user_ns);
 
@@ -502,7 +571,7 @@ struct net *copy_net_ns(unsigned long flags,
 	if (rv < 0)
 		goto put_userns;
 
-	rv = setup_net(net, user_ns);
+	rv = setup_net(net);
 
 	up_read(&pernet_ops_rwsem);
 
@@ -512,7 +581,7 @@ put_userns:
 	key_remove_domain(net->key_domain);
 #endif
 	put_user_ns(user_ns);
-	net_free(net);
+	net_passive_dec(net);
 dec_ucounts:
 	dec_net_namespaces(ucounts);
 	return ERR_PTR(rv);
@@ -577,13 +646,15 @@ static void unhash_nsid(struct net *net, struct net *last)
 
 static LLIST_HEAD(cleanup_list);
 
+struct task_struct *cleanup_net_task;
+
 static void cleanup_net(struct work_struct *work)
 {
-	const struct pernet_operations *ops;
-	struct net *net, *tmp, *last;
 	struct llist_node *net_kill_list;
+	struct net *net, *tmp, *last;
 	LIST_HEAD(net_exit_list);
-	LIST_HEAD(dev_kill_list);
+
+	cleanup_net_task = current;
 
 	/* Atomically snapshot the list of namespaces to cleanup */
 	net_kill_list = llist_del_all(&cleanup_list);
@@ -612,33 +683,7 @@ static void cleanup_net(struct work_struct *work)
 		list_add_tail(&net->exit_list, &net_exit_list);
 	}
 
-	/* Run all of the network namespace pre_exit methods */
-	list_for_each_entry_reverse(ops, &pernet_list, list)
-		ops_pre_exit_list(ops, &net_exit_list);
-
-	/*
-	 * Another CPU might be rcu-iterating the list, wait for it.
-	 * This needs to be before calling the exit() notifiers, so
-	 * the rcu_barrier() below isn't sufficient alone.
-	 * Also the pre_exit() and exit() methods need this barrier.
-	 */
-	synchronize_rcu_expedited();
-
-	rtnl_lock();
-	list_for_each_entry_reverse(ops, &pernet_list, list) {
-		if (ops->exit_batch_rtnl)
-			ops->exit_batch_rtnl(&net_exit_list, &dev_kill_list);
-	}
-	unregister_netdevice_many(&dev_kill_list);
-	rtnl_unlock();
-
-	/* Run all of the network namespace exit methods */
-	list_for_each_entry_reverse(ops, &pernet_list, list)
-		ops_exit_list(ops, &net_exit_list);
-
-	/* Free the net generic variables */
-	list_for_each_entry_reverse(ops, &pernet_list, list)
-		ops_free_list(ops, &net_exit_list);
+	ops_undo_list(&pernet_list, NULL, &net_exit_list, true);
 
 	up_read(&pernet_ops_rwsem);
 
@@ -647,6 +692,8 @@ static void cleanup_net(struct work_struct *work)
 	 */
 	rcu_barrier();
 
+	net_complete_free();
+
 	/* Finally it is safe to free my network namespace structure */
 	list_for_each_entry_safe(net, tmp, &net_exit_list, exit_list) {
 		list_del_init(&net->exit_list);
@@ -655,8 +702,9 @@ static void cleanup_net(struct work_struct *work)
 		key_remove_domain(net->key_domain);
 #endif
 		put_user_ns(net->user_ns);
-		net_free(net);
+		net_passive_dec(net);
 	}
+	cleanup_net_task = NULL;
 }
 
 /**
@@ -690,30 +738,33 @@ EXPORT_SYMBOL_GPL(__put_net);
  * get_net_ns - increment the refcount of the network namespace
  * @ns: common namespace (net)
  *
- * Returns the net's common namespace.
+ * Returns the net's common namespace or ERR_PTR() if ref is zero.
  */
 struct ns_common *get_net_ns(struct ns_common *ns)
 {
-	return &get_net(container_of(ns, struct net, ns))->ns;
+	struct net *net;
+
+	net = maybe_get_net(container_of(ns, struct net, ns));
+	if (net)
+		return &net->ns;
+	return ERR_PTR(-EINVAL);
 }
 EXPORT_SYMBOL_GPL(get_net_ns);
 
 struct net *get_net_ns_by_fd(int fd)
 {
-	struct fd f = fdget(fd);
-	struct net *net = ERR_PTR(-EINVAL);
+	CLASS(fd, f)(fd);
 
-	if (!f.file)
+	if (fd_empty(f))
 		return ERR_PTR(-EBADF);
 
-	if (proc_ns_file(f.file)) {
-		struct ns_common *ns = get_proc_ns(file_inode(f.file));
+	if (proc_ns_file(fd_file(f))) {
+		struct ns_common *ns = get_proc_ns(file_inode(fd_file(f)));
 		if (ns->ops == &netns_operations)
-			net = get_net(container_of(ns, struct net, ns));
+			return get_net(container_of(ns, struct net, ns));
 	}
-	fdput(f);
 
-	return net;
+	return ERR_PTR(-EINVAL);
 }
 EXPORT_SYMBOL_GPL(get_net_ns_by_fd);
 #endif
@@ -1090,7 +1141,7 @@ static int rtnl_net_dumpid(struct sk_buff *skb, struct netlink_callback *cb)
 end:
 	if (net_cb.fillargs.add_ref)
 		put_net(net_cb.tgt_net);
-	return err < 0 ? err : skb->len;
+	return err;
 }
 
 static void rtnl_net_notifyid(struct net *net, int cmd, int id, u32 portid,
@@ -1159,13 +1210,23 @@ static void __init netns_ipv4_struct_check(void)
 	CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx,
 				      sysctl_tcp_early_demux);
 	CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx,
+				      sysctl_tcp_l3mdev_accept);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx,
 				      sysctl_tcp_reordering);
 	CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx,
 				      sysctl_tcp_rmem);
-	CACHELINE_ASSERT_GROUP_SIZE(struct netns_ipv4, netns_ipv4_read_rx, 18);
+	CACHELINE_ASSERT_GROUP_SIZE(struct netns_ipv4, netns_ipv4_read_rx, 22);
 }
 #endif
 
+static const struct rtnl_msg_handler net_ns_rtnl_msg_handlers[] __initconst = {
+	{.msgtype = RTM_NEWNSID, .doit = rtnl_net_newid,
+	 .flags = RTNL_FLAG_DOIT_UNLOCKED},
+	{.msgtype = RTM_GETNSID, .doit = rtnl_net_getid,
+	 .dumpit = rtnl_net_dumpid,
+	 .flags = RTNL_FLAG_DOIT_UNLOCKED | RTNL_FLAG_DUMP_UNLOCKED},
+};
+
 void __init net_ns_init(void)
 {
 	struct net_generic *ng;
@@ -1191,9 +1252,10 @@ void __init net_ns_init(void)
 #ifdef CONFIG_KEYS
 	init_net.key_domain = &init_net_key_domain;
 #endif
+	preinit_net(&init_net, &init_user_ns);
+
 	down_write(&pernet_ops_rwsem);
-	preinit_net(&init_net);
-	if (setup_net(&init_net, &init_user_ns))
+	if (setup_net(&init_net))
 		panic("Could not setup the initial network namespace");
 
 	init_net_initialized = true;
@@ -1202,40 +1264,19 @@ void __init net_ns_init(void)
 	if (register_pernet_subsys(&net_ns_ops))
 		panic("Could not register network namespace subsystems");
 
-	rtnl_register(PF_UNSPEC, RTM_NEWNSID, rtnl_net_newid, NULL,
-		      RTNL_FLAG_DOIT_UNLOCKED);
-	rtnl_register(PF_UNSPEC, RTM_GETNSID, rtnl_net_getid, rtnl_net_dumpid,
-		      RTNL_FLAG_DOIT_UNLOCKED);
-}
-
-static void free_exit_list(struct pernet_operations *ops, struct list_head *net_exit_list)
-{
-	ops_pre_exit_list(ops, net_exit_list);
-	synchronize_rcu();
-
-	if (ops->exit_batch_rtnl) {
-		LIST_HEAD(dev_kill_list);
-
-		rtnl_lock();
-		ops->exit_batch_rtnl(net_exit_list, &dev_kill_list);
-		unregister_netdevice_many(&dev_kill_list);
-		rtnl_unlock();
-	}
-	ops_exit_list(ops, net_exit_list);
-
-	ops_free_list(ops, net_exit_list);
+	rtnl_register_many(net_ns_rtnl_msg_handlers);
 }
 
 #ifdef CONFIG_NET_NS
 static int __register_pernet_operations(struct list_head *list,
 					struct pernet_operations *ops)
 {
+	LIST_HEAD(net_exit_list);
 	struct net *net;
 	int error;
-	LIST_HEAD(net_exit_list);
 
 	list_add_tail(&ops->list, list);
-	if (ops->init || (ops->id && ops->size)) {
+	if (ops->init || ops->id) {
 		/* We held write locked pernet_ops_rwsem, and parallel
 		 * setup_net() and cleanup_net() are not possible.
 		 */
@@ -1251,21 +1292,21 @@ out_undo:
 	/* If I have an error cleanup all namespaces I initialized */
 	list_del(&ops->list);
-	free_exit_list(ops, &net_exit_list);
+	ops_undo_single(ops, &net_exit_list);
 	return error;
 }
 
 static void __unregister_pernet_operations(struct pernet_operations *ops)
 {
-	struct net *net;
 	LIST_HEAD(net_exit_list);
+	struct net *net;
 
-	list_del(&ops->list);
 	/* See comment in __register_pernet_operations() */
 	for_each_net(net)
 		list_add_tail(&net->exit_list, &net_exit_list);
-	free_exit_list(ops, &net_exit_list);
+	list_del(&ops->list);
+	ops_undo_single(ops, &net_exit_list);
 }
 
 #else
 
 static int __register_pernet_operations(struct list_head *list,
@@ -1287,8 +1328,9 @@ static void __unregister_pernet_operations(struct pernet_operations *ops)
 		list_del(&ops->list);
 	} else {
 		LIST_HEAD(net_exit_list);
+
 		list_add(&init_net.exit_list, &net_exit_list);
-		free_exit_list(ops, &net_exit_list);
+		ops_undo_single(ops, &net_exit_list);
 	}
 }
 
@@ -1301,13 +1343,20 @@ static int register_pernet_operations(struct list_head *list,
 {
 	int error;
 
+	if (WARN_ON(!!ops->id ^ !!ops->size))
+		return -EINVAL;
+
 	if (ops->id) {
 		error = ida_alloc_min(&net_generic_ids, MIN_PERNET_OPS_ID,
 				      GFP_KERNEL);
 		if (error < 0)
 			return error;
 		*ops->id = error;
-		max_gen_ptrs = max(max_gen_ptrs, *ops->id + 1);
+		/* This does not require READ_ONCE as writers already hold
+		 * pernet_ops_rwsem. But WRITE_ONCE is needed to protect
+		 * net_alloc_generic.
+		 */
+		WRITE_ONCE(max_gen_ptrs, max(max_gen_ptrs, *ops->id + 1));
 	}
 	error = __register_pernet_operations(list, ops);
 	if (error) {
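
Notes on the interface changes above, with hedged examples.

The teardown refactor is consumer-visible in three ways: per-netns cleanup that needs RTNL now goes through the new ->exit_rtnl hook (invoked per namespace by ops_exit_rtnl_list(), replacing the batch-level ->exit_batch_rtnl), register_pernet_operations() now rejects a pernet_operations that sets only one of ->id and ->size, and ops_init()/ops_free_list() key the private-data allocation off ->id alone. A minimal sketch of a subsystem written against these rules; the "foo" names are hypothetical and not part of this diff:

#include <linux/module.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>

/* Hypothetical per-namespace state. */
struct foo_net {
	u32 counter;
};

static unsigned int foo_net_id;

static int __net_init foo_init_net(struct net *net)
{
	struct foo_net *fn = net_generic(net, foo_net_id);

	/* ops_init() already kzalloc'ed ->size bytes under ->id for us. */
	fn->counter = 0;
	return 0;
}

static void __net_exit foo_exit_rtnl(struct net *net,
				     struct list_head *dev_kill_list)
{
	/* Called from ops_exit_rtnl_list() with RTNL (and this netns's
	 * rtnl_mutex, under CONFIG_DEBUG_NET_SMALL_RTNL) already held.
	 * Queue devices on dev_kill_list; the core batches them into a
	 * single unregister_netdevice_many() call.
	 */
}

static struct pernet_operations foo_net_ops = {
	.init = foo_init_net,
	.exit_rtnl = foo_exit_rtnl,
	/* ->id and ->size must be set together or not at all. */
	.id = &foo_net_id,
	.size = sizeof(struct foo_net),
};

static int __init foo_module_init(void)
{
	return register_pernet_subsys(&foo_net_ops);
}
module_init(foo_module_init);

static void __exit foo_module_exit(void)
{
	unregister_pernet_subsys(&foo_net_ops);
}
module_exit(foo_module_exit);

MODULE_LICENSE("GPL");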
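
get_net_ns() switching from get_net() to maybe_get_net() means it no longer resurrects a namespace whose refcount has already dropped to zero; it returns ERR_PTR(-EINVAL) instead, so callers must check the result. A sketch of the adjusted calling convention (the surrounding function is illustrative, not from this diff):

static int foo_use_ns(struct net *net)
{
	struct ns_common *ns = get_net_ns(&net->ns);

	if (IS_ERR(ns))
		return PTR_ERR(ns);	/* the netns was already dying */

	/* ... use the namespace ... */

	put_net(container_of(ns, struct net, ns));
	return 0;
}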
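
The max_gen_ptrs change pairs a WRITE_ONCE() in register_pernet_operations() (writers remain serialized by pernet_ops_rwsem) with a single READ_ONCE() snapshot in net_alloc_generic(), so the kzalloc() size and ng->s.len cannot disagree even if a registration bumps the value concurrently. The same lockless-snapshot pattern in isolation, with illustrative names not taken from this diff:

#include <linux/overflow.h>
#include <linux/slab.h>

struct tbl {
	unsigned int len;
	void *slot[];
};

static unsigned int tbl_max_len;	/* written under a lock, read locklessly */

static struct tbl *tbl_alloc(void)
{
	/* One snapshot: the allocation and ->len use the same value. */
	unsigned int len = READ_ONCE(tbl_max_len);
	struct tbl *t;

	t = kzalloc(struct_size(t, slot, len), GFP_KERNEL);
	if (t)
		t->len = len;
	return t;
}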